Diffstat (limited to 'src/backend/access/gin')
 src/backend/access/gin/Makefile          |   32
 src/backend/access/gin/README            |  562
 src/backend/access/gin/ginarrayproc.c    |  305
 src/backend/access/gin/ginbtree.c        |  795
 src/backend/access/gin/ginbulk.c         |  293
 src/backend/access/gin/gindatapage.c     | 1942
 src/backend/access/gin/ginentrypage.c    |  772
 src/backend/access/gin/ginfast.c         | 1068
 src/backend/access/gin/ginget.c          | 1970
 src/backend/access/gin/gininsert.c       |  541
 src/backend/access/gin/ginlogic.c        |  246
 src/backend/access/gin/ginpostinglist.c  |  434
 src/backend/access/gin/ginscan.c         |  468
 src/backend/access/gin/ginutil.c         |  707
 src/backend/access/gin/ginvacuum.c       |  822
 src/backend/access/gin/ginvalidate.c     |  338
 src/backend/access/gin/ginxlog.c         |  813
17 files changed, 12108 insertions(+), 0 deletions(-)
diff --git a/src/backend/access/gin/Makefile b/src/backend/access/gin/Makefile
new file mode 100644
index 0000000..3fceaee
--- /dev/null
+++ b/src/backend/access/gin/Makefile
@@ -0,0 +1,32 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for access/gin
+#
+# IDENTIFICATION
+# src/backend/access/gin/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/access/gin
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ ginarrayproc.o \
+ ginbtree.o \
+ ginbulk.o \
+ gindatapage.o \
+ ginentrypage.o \
+ ginfast.o \
+ ginget.o \
+ gininsert.o \
+ ginlogic.o \
+ ginpostinglist.o \
+ ginscan.o \
+ ginutil.o \
+ ginvacuum.o \
+ ginvalidate.o \
+ ginxlog.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/gin/README b/src/backend/access/gin/README
new file mode 100644
index 0000000..41d4e1e
--- /dev/null
+++ b/src/backend/access/gin/README
@@ -0,0 +1,562 @@
+src/backend/access/gin/README
+
+Gin for PostgreSQL
+==================
+
+Gin was sponsored by jfg://networks (http://www.jfg-networks.com/)
+
+Gin stands for Generalized Inverted Index and should be considered as a genie,
+not a drink.
+
+Generalized means that the index does not know which operation it accelerates.
+It instead works with custom strategies, defined for specific data types (read
+"Index Method Strategies" in the PostgreSQL documentation). In that sense, Gin
+is similar to GiST and differs from btree indices, which have predefined,
+comparison-based operations.
+
+An inverted index is an index structure storing a set of (key, posting list)
+pairs, where 'posting list' is a set of heap rows in which the key occurs.
+(A text document would usually contain many keys.) The primary goal of
+Gin indices is support for highly scalable, full-text search in PostgreSQL.
+
+A Gin index consists of a B-tree index constructed over key values,
+where each key is an element of some indexed items (element of array, lexeme
+for tsvector) and where each tuple in a leaf page contains either a pointer to
+a B-tree over item pointers (posting tree), or a simple list of item pointers
+(posting list) if the list is small enough.
+
+Note: There is no delete operation in the key (entry) tree. The reason for
+this is that in our experience, the set of distinct words in a large corpus
+changes very slowly. This greatly simplifies the code and concurrency
+algorithms.
+
+Core PostgreSQL includes built-in Gin support for one-dimensional arrays
+(eg. integer[], text[]). The following operations are available:
+
+ * contains: value_array @> query_array
+ * overlaps: value_array && query_array
+ * is contained by: value_array <@ query_array
+
+Synopsis
+--------
+
+=# create index txt_idx on aa using gin(a);
+
+Features
+--------
+
+ * Concurrency
+ * Write-Ahead Logging (WAL). (Recoverability from crashes.)
+ * User-defined opclasses. (The scheme is similar to GiST.)
+ * Optimized index creation (Makes use of maintenance_work_mem to accumulate
+ postings in memory.)
+ * Text search support via an opclass
+ * Soft upper limit on the returned results set using a GUC variable:
+ gin_fuzzy_search_limit
+
+Gin Fuzzy Limit
+---------------
+
+There are often situations when a full-text search returns a very large set of
+results. Since reading tuples from the disk and sorting them could take a
+lot of time, this is unacceptable for production. (Note that the search
+itself is very fast.)
+
+Such queries usually contain very frequent lexemes, so the results are not
+very helpful. To facilitate execution of such queries Gin has a configurable
+soft upper limit on the size of the returned set, determined by the
+'gin_fuzzy_search_limit' GUC variable. This is set to 0 by default (no
+limit).
+
+If a non-zero search limit is set, then the returned set is a subset of the
+whole result set, chosen at random.
+
+"Soft" means that the actual number of returned results could differ
+from the specified limit, depending on the query and the quality of the
+system's random number generator.
+
+From experience, a value of 'gin_fuzzy_search_limit' in the thousands
+(eg. 5000-20000) works well. This means that 'gin_fuzzy_search_limit' will
+have no effect for queries returning a result set with fewer tuples than this
+number.
+
+Index structure
+---------------
+
+The "items" that a GIN index indexes are composite values that contain
+zero or more "keys". For example, an item might be an integer array, and
+then the keys would be the individual integer values. The index actually
+stores and searches for the key values, not the items per se. In the
+pg_opclass entry for a GIN opclass, the opcintype is the data type of the
+items, and the opckeytype is the data type of the keys. GIN is optimized
+for cases where items contain many keys and the same key values appear
+in many different items.
+
+A GIN index contains a metapage, a btree of key entries, and possibly
+"posting tree" pages, which hold the overflow when a key entry acquires
+too many heap tuple pointers to fit in a btree page. Additionally, if the
+fast-update feature is enabled, there can be "list pages" holding "pending"
+key entries that haven't yet been merged into the main btree. The list
+pages have to be scanned linearly when doing a search, so the pending
+entries should be merged into the main btree before there get to be too
+many of them. The advantage of the pending list is that bulk insertion of
+a few thousand entries can be much faster than retail insertion. (The win
+comes mainly from not having to do multiple searches/insertions when the
+same key appears in multiple new heap tuples.)
+
+Key entries are nominally of the same IndexTuple format as used in other
+index types, but since a leaf key entry typically refers to multiple heap
+tuples, there are significant differences. (See GinFormTuple, which works
+by building a "normal" index tuple and then modifying it.) The points to
+know are:
+
+* In a single-column index, a key tuple just contains the key datum, but
+in a multi-column index, a key tuple contains the pair (column number,
+key datum) where the column number is stored as an int2. This is needed
+to support different key data types in different columns. This much of
+the tuple is built by index_form_tuple according to the usual rules.
+The column number (if present) can never be null, but the key datum can
+be, in which case a null bitmap is present as usual. (As usual for index
+tuples, the size of the null bitmap is fixed at INDEX_MAX_KEYS.)
+
+* If the key datum is null (ie, IndexTupleHasNulls() is true), then
+just after the nominal index data (ie, at offset IndexInfoFindDataOffset
+or IndexInfoFindDataOffset + sizeof(int2)) there is a byte indicating
+the "category" of the null entry. These are the possible categories:
+ 1 = ordinary null key value extracted from an indexable item
+ 2 = placeholder for zero-key indexable item
+ 3 = placeholder for null indexable item
+Placeholder null entries are inserted into the index because otherwise
+there would be no index entry at all for an empty or null indexable item,
+which would mean that full index scans couldn't be done and various corner
+cases would give wrong answers. The different categories of null entries
+are treated as distinct keys by the btree, but heap itempointers for the
+same category of null entry are merged into one index entry just as happens
+with ordinary key entries.
+
+* In a key entry at the btree leaf level, at the next SHORTALIGN boundary,
+there is a list of item pointers, in compressed format (see Posting List
+Compression section), pointing to the heap tuples for which the indexable
+items contain this key. This is called the "posting list".
+
+If the list would be too big for the index tuple to fit on an index page, the
+ItemPointers are pushed out to a separate posting page or pages, and none
+appear in the key entry itself. The separate pages are called a "posting
+tree" (see below). Note that in either case, the ItemPointers associated with
+a key can easily be read out in sorted order; this is relied on by the scan
+algorithms.
+
+* The index tuple header fields of a leaf key entry are abused as follows:
+
+1) Posting list case:
+
+* ItemPointerGetBlockNumber(&itup->t_tid) contains the offset from index
+ tuple start to the posting list.
+ Access macros: GinGetPostingOffset(itup) / GinSetPostingOffset(itup,n)
+
+* ItemPointerGetOffsetNumber(&itup->t_tid) contains the number of elements
+ in the posting list (number of heap itempointers).
+ Access macros: GinGetNPosting(itup) / GinSetNPosting(itup,n)
+
+* If IndexTupleHasNulls(itup) is true, the null category byte can be
+ accessed/set with GinGetNullCategory(itup,gs) / GinSetNullCategory(itup,gs,c)
+
+* The posting list can be accessed with GinGetPosting(itup)
+
+* If GinItupIsCompressed(itup), the posting list is stored in compressed
+ format. Otherwise it is just an array of ItemPointers. New tuples are always
+ stored in compressed format; uncompressed lists can still be present if the
+ database was migrated from version 9.3 or earlier.
+
+2) Posting tree case:
+
+* ItemPointerGetBlockNumber(&itup->t_tid) contains the index block number
+ of the root of the posting tree.
+ Access macros: GinGetPostingTree(itup) / GinSetPostingTree(itup, blkno)
+
+* ItemPointerGetOffsetNumber(&itup->t_tid) contains the magic number
+ GIN_TREE_POSTING, which distinguishes this from the posting-list case
+ (it's large enough that that many heap itempointers couldn't possibly
+ fit on an index page). This value is inserted automatically by the
+ GinSetPostingTree macro.
+
+* If IndexTupleHasNulls(itup) is true, the null category byte can be
+ accessed/set with GinGetNullCategory(itup,gs) / GinSetNullCategory(itup,gs,c)
+
+* The posting list is not present and must not be accessed.
+
+Use the macro GinIsPostingTree(itup) to determine which case applies.
+
+In both cases, itup->t_info & INDEX_SIZE_MASK contains actual total size of
+tuple, and the INDEX_VAR_MASK and INDEX_NULL_MASK bits have their normal
+meanings as set by index_form_tuple.
+
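+As a purely illustrative sketch of how the pieces above fit together, code
+reading a leaf key entry might branch on these macros roughly as below
+(assuming the GIN internal headers are included; the function name is made up
+and error handling is omitted):
+
+    static void
+    read_leaf_key_entry(IndexTuple itup)
+    {
+        /* GinGetNullCategory(itup, ginstate) would yield the null category,
+         * if IndexTupleHasNulls(itup) */
+        if (GinIsPostingTree(itup))
+        {
+            /* t_tid holds the block number of the posting tree root */
+            BlockNumber rootblkno = GinGetPostingTree(itup);
+
+            /* ... descend the posting tree starting at rootblkno ... */
+        }
+        else
+        {
+            /* the item pointers are stored in the key entry itself */
+            int         nposting = GinGetNPosting(itup);
+
+            if (GinItupIsCompressed(itup))
+            {
+                /* new format: varbyte-compressed list (see sections below) */
+                /* ... decode nposting items from GinGetPosting(itup) ... */
+            }
+            else
+            {
+                /* pre-9.4 format: plain array of ItemPointerData */
+                ItemPointer ipd = (ItemPointer) GinGetPosting(itup);
+
+                /* ... read nposting entries from ipd ... */
+            }
+        }
+    }
+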
+Index tuples in non-leaf levels of the btree contain the optional column
+number, key datum, and null category byte as above. They do not contain
+a posting list. ItemPointerGetBlockNumber(&itup->t_tid) is the downlink
+to the next lower btree level, and ItemPointerGetOffsetNumber(&itup->t_tid)
+is InvalidOffsetNumber. Use the access macros GinGetDownlink/GinSetDownlink
+to get/set the downlink.
+
+Index entries that appear in "pending list" pages work a tad differently as
+well. The optional column number, key datum, and null category byte are as
+for other GIN index entries. However, there is always exactly one heap
+itempointer associated with a pending entry, and it is stored in the t_tid
+header field just as in non-GIN indexes. There is no posting list.
+Furthermore, the code that searches the pending list assumes that all
+entries for a given heap tuple appear consecutively in the pending list and
+are sorted by the column-number-plus-key-datum. The GIN_LIST_FULLROW page
+flag bit tells whether entries for a given heap tuple are spread across
+multiple pending-list pages. If GIN_LIST_FULLROW is set, the page contains
+all the entries for one or more heap tuples. If GIN_LIST_FULLROW is clear,
+the page contains entries for only one heap tuple, *and* they are not all
+the entries for that tuple. (Thus, a heap tuple whose entries do not all
+fit on one pending-list page must have those pages to itself, even if this
+results in wasting much of the space on the preceding page and the last
+page for the tuple.)
+
+GIN packs downlinks and pivot keys into internal page tuples in a different way
+than nbtree does. Lehman & Yao define an internal page as follows:
+
+P_0, K_1, P_1, K_2, P_2, ... , K_n, P_n, K_{n+1}
+
+Here P_i is a downlink and K_i is a key. K_i splits the key space between
+P_{i-1} and P_i (1 <= i <= n). K_{n+1} is the high key.
+
+In an internal page tuple, a key and a downlink are grouped together. nbtree
+packs keys and downlinks into tuples as follows:
+
+(K_{n+1}, None), (-Inf, P_0), (K_1, P_1), ... , (K_n, P_n)
+
+Here tuples are shown in parentheses. So, the high key is stored separately;
+P_i is grouped with K_i, and P_0 is grouped with the -Inf key.
+
+GIN packs keys and downlinks into tuples in a different way:
+
+(P_0, K_1), (P_1, K_2), ... , (P_n, K_{n+1})
+
+P_i is grouped with K_{i+1}. No -Inf key is needed.
+
+A couple of additional notes regarding the K_{n+1} key:
+1) In the rightmost page of the entry tree, the key coupled with P_n doesn't
+really matter; the high key is assumed to be infinity.
+2) In the posting tree, the key coupled with P_n never matters; the high key
+for non-rightmost pages is stored separately and accessed via
+GinDataPageGetRightBound().
+
+Posting tree
+------------
+
+If a posting list is too large to store in-line in a key entry, a posting tree
+is created. A posting tree is a B-tree structure, where the ItemPointer is
+used as the key.
+
+Internal posting tree pages use the standard PageHeader and the same "opaque"
+struct as other GIN pages, but do not contain regular index tuples. Instead,
+the content of the page is an array of PostingItem structs. Each PostingItem
+consists of the block number of the child page, and the right bound of that
+child page, as an ItemPointer. The right bound of the page is stored right
+after the page header, before the PostingItem array.
+
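+For illustration, a PostingItem as described here corresponds roughly to the
+following struct (the authoritative definition lives in the GIN internal
+headers; this is only a sketch of the layout):
+
+    typedef struct
+    {
+        /* block number of the child page; BlockIdData avoids struct padding */
+        BlockIdData     child_blkno;
+        /* right bound of that child page */
+        ItemPointerData key;
+    } PostingItem;
+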
+Posting tree leaf pages also use the standard PageHeader and opaque struct,
+and the right bound of the page is stored right after the page header, but
+the page content consists of a number of compressed posting lists. The
+compressed posting lists are stored one after another, between the page header
+and pd_lower. The space between pd_lower and pd_upper is unused, which allows
+full-page images of posting tree leaf pages to skip the unused space in the
+middle (buffer_std = true in XLogRecData).
+
+The item pointers are stored in a number of independent compressed posting
+lists (also called segments), instead of one big one, to make random access
+to a given item pointer faster: to find an item in a compressed list, you
+have to read the list from the beginning, but when the items are split into
+multiple lists, you can first skip over to the list containing the item you're
+looking for, and read only that segment. Also, an update only needs to
+re-encode the affected segment.
+
+Posting List Compression
+------------------------
+
+To fit as many item pointers on a page as possible, posting tree leaf pages
+and posting lists stored inline in entry tree leaf tuples use a lightweight
+form of compression. We take advantage of the fact that the item pointers
+are stored in sorted order. Instead of storing the block and offset number of
+each item pointer separately, we store the difference from the previous item.
+That in itself doesn't do much, but it allows us to use so-called varbyte
+encoding to compress them.
+
+Varbyte encoding is a method of encoding integers that lets smaller numbers
+take less space at the cost of larger ones. Each integer is represented by a
+variable number of bytes. The high bit of each byte determines whether the
+next byte is still part of the same number; therefore, to read a single
+varbyte-encoded number, you read bytes until you find one with the high bit
+not set.
+
+When encoding, the block and offset number forming the item pointer are
+combined into a single integer. The offset number is stored in the 11 low
+bits (see MaxHeapTuplesPerPageBits in ginpostinglist.c), and the block number
+is stored in the higher bits. That requires 43 bits in total, which
+conveniently fits in at most 6 bytes.
+
+A compressed posting list is passed around and stored on disk in a
+GinPostingList struct. The first item in the list is stored uncompressed
+as a regular ItemPointerData, followed by the length of the list in bytes,
+followed by the packed items.
+
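+As an illustration of this scheme (simplified; the real encoding and its
+limits live in ginpostinglist.c), the delta between two item pointers could be
+formed and varbyte-encoded roughly like this:
+
+    #define ITEMPTR_OFFSET_BITS 11      /* cf. MaxHeapTuplesPerPageBits */
+
+    /* combine block and offset number into a single integer, as described */
+    static uint64
+    itemptr_to_uint64(BlockNumber blkno, OffsetNumber offnum)
+    {
+        return ((uint64) blkno << ITEMPTR_OFFSET_BITS) | offnum;
+    }
+
+    /*
+     * Append 'val' in varbyte format and return the next free byte.  The
+     * high bit of each output byte means "another byte follows".
+     */
+    static unsigned char *
+    encode_varbyte(uint64 val, unsigned char *ptr)
+    {
+        while (val > 0x7F)
+        {
+            *ptr++ = 0x80 | (val & 0x7F);
+            val >>= 7;
+        }
+        *ptr++ = (unsigned char) val;
+        return ptr;
+    }
+
+    /*
+     * To store an item pointer, encode its difference from the previous one:
+     *     ptr = encode_varbyte(cur - prev, ptr);
+     * where cur and prev are the itemptr_to_uint64() values.
+     */
+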
+Concurrency
+-----------
+
+The entry tree and each posting tree are B-trees, with right-links connecting
+sibling pages at the same level. This is the same structure that is used in
+the regular B-tree indexam (invented by Lehman & Yao), but we don't support
+scanning GIN trees backwards, so we don't need left-links. The entry tree
+leaves don't have dedicated high keys; instead, the greatest leaf tuple serves
+as the high key. That works because tuples are never deleted from the entry
+tree.
+
+The algorithms used to operate on entry and posting trees are described below.
+
+### Locating the leaf page
+
+To perform a read, we locate the leaf page in the GIN btree by descending from
+the root page to the leaf via downlinks, holding a pin and shared lock on only
+one page at a time. That is, we release the pin and shared lock on the previous
+page before acquiring them on the next page.
+
+The picture below shows the tree state after finding the leaf page. Lower-case
+letters depict tree pages; 'S' depicts a shared lock on the page.
+
+                a
+            /   |   \
+          b     c     d
+        / | \   | \   | \
+       eS f g   h i   j k
+
+### Stepping right
+
+Concurrent page splits move the keyspace to right, so after following a
+downlink, the page actually containing the key we're looking for might be
+somewhere to the right of the page we landed on. In that case, we follow the
+right-links until we find the page we're looking for.
+
+While stepping right we take the pin and shared lock on the right sibling
+before releasing them on the current page. This protects against stepping onto
+a deleted page: we step to the right sibling while still holding the lock on
+the page whose rightlink points there, so it's guaranteed that nobody can
+concurrently update that rightlink or delete the right sibling.
+
+The picture below shows two pages locked at once during stepping right.
+
+                a
+            /   |   \
+          b     c     d
+        / | \   | \   | \
+       eS fS g  h i   j k
+
+### Insert
+
+To find the appropriate leaf for an insertion we also descend from the root to
+the leaf, share-locking one page at a time. But during insertion we don't
+release the pins on the root and internal pages. That can save us some lookups
+in the buffer hash table when inserting downlinks, assuming the parents have
+not been changed by concurrent splits. Once we reach the leaf, we re-lock the
+page in exclusive mode.
+
+The picture below shows the leaf page locked in exclusive mode and ready for
+insertion. 'P' and 'E' depict a pin and an exclusive lock, respectively.
+
+
+                aP
+            /   |   \
+          b     cP    d
+        / | \   | \   | \
+       e  f g   hE i  j k
+
+
+If the insert causes a page split, the parent is locked in exclusive mode
+before the left child is unlocked. Thus the insertion algorithm may hold
+exclusive locks on both parent and child pages at once, taking the child's
+lock first.
+
+The picture below shows the tree state after a leaf page split. 'q' is the new
+page produced by the split. Parent 'c' is about to have a downlink inserted.
+
+                aP
+            /   |    \
+          b     cE     d
+        / | \  / | \   | \
+       e  f g hE q i   j k
+
+
+### Page deletion
+
+Vacuum never deletes tuples or pages from the entry tree. It traverses the
+entry tree leaves in logical order by following rightlinks, and removes
+deletable TIDs from posting lists. Posting trees are reached via links from
+entry tree leaves and are vacuumed in two stages. In the first stage, deletable
+TIDs are removed from the leaves. If the first stage detects at least one empty
+page, then in the second stage ginScanToDelete() deletes the empty pages.
+
+ginScanToDelete() traverses the whole tree in a depth-first manner. It starts
+by taking a super-exclusive lock on the tree root, which prevents all
+concurrent insertions into this tree while we're deleting pages. However,
+there might still be some in-progress readers that traversed the root before
+we locked it.
+
+The picture below shows the tree state after the page deletion algorithm has
+traversed to the leftmost leaf of the tree.
+
+                aE
+            /   |   \
+          bE    c     d
+        / | \   | \   | \
+       eE f g   h i   j k
+
+The deletion algorithm keeps exclusive locks on the left siblings of the pages
+comprising the currently investigated path. Thus, if the current page is to be
+removed, all pages required to remove both its downlink and rightlink are
+already locked. That avoids a potential right-to-left page locking order, which
+could deadlock with concurrent stepping right.
+
+A search concurrent to page deletion might already have read a pointer to the
+page to be deleted, and might be just about to follow it. A page can be reached
+via the right-link of its left sibling, or via its downlink in the parent.
+
+To prevent a backend from reaching a deleted page via a right-link, the
+stepping-right algorithm doesn't release the lock on the current page until
+the lock on the right page has been acquired.
+
+The downlink is trickier. A search descending the tree must release the lock
+on the parent page before locking the child, or it could deadlock with a
+concurrent split of the child page; a page split locks the parent while already
+holding a lock on the child page. So, a deleted page cannot be reclaimed
+immediately. Instead, we have to wait for every transaction that might still
+reference this page to finish. The corresponding processes must observe that
+the page is marked deleted and recover accordingly.
+
+The picture below shows the tree state after the page deletion algorithm has
+traversed further down the tree. The currently investigated path is 'a-c-h'.
+The left siblings 'b' and 'g' of 'c' and 'h', respectively, are also
+exclusively locked.
+
+                aE
+            /   |   \
+          bE    cE    d
+        / | \   | \   | \
+       e  f gE  hE i  j k
+
+The next picture shows the tree state after page 'h' has been deleted. It is
+marked with the 'deleted' flag and the newest xid that might still visit it.
+The downlink from 'c' to 'h' is also deleted.
+
+                aE
+            /   |   \
+          bE    cE    d
+        / | \     \   | \
+       e  f gE hD iE  j k
+
+However, it's still possible that a concurrent reader saw the downlink from 'c'
+to 'h' before we deleted it. In that case the reader will step right from 'h'
+until it finds a non-deleted page. The xid marking of page 'h' guarantees that
+the page won't be reused until all such readers are gone. The next leaf page
+under investigation is 'i'. 'g' remains locked as it becomes the left sibling
+of 'i'.
+
+The next picture shows the tree state after 'i' and 'c' have been deleted.
+Internal page 'c' was deleted because it no longer had any downlinks. The path
+under investigation is 'a-d-j'. Pages 'b' and 'g' are locked as the left
+siblings of 'd' and 'j'.
+
+                aE
+            /        \
+          bE    cD     dE
+        / | \          | \
+       e  f gE hD iD   jE k
+
+During replay of page deletion on a standby, the page's left sibling, the
+target page, and its parent are locked in that order. This lock order
+guarantees that there is no deadlock with concurrent reads.
+
+Predicate Locking
+-----------------
+
+GIN supports predicate locking, for serializable snapshot isolation.
+Predicate locks represent that a scan has scanned a range of values. They are
+not concerned with physical pages as such, but with logical key values.
+A predicate lock on a page covers the key range that would belong on that
+page, whether or not there are any matching tuples there currently. In other
+words, a predicate lock on an index page covers the "gaps" between the index
+tuples. To minimize false positives, predicate locks are acquired at the
+finest level possible.
+
+* Like in the B-tree index, it is enough to lock only leaf pages, because all
+ insertions happen at the leaf level.
+
+* In an equality search (i.e. not a partial match search), if a key entry has
+ a posting tree, we lock the posting tree root page, to represent a lock on
+ just that key entry. Otherwise, we lock the entry tree page. We also lock
+ the entry tree page if no match is found, to lock the "gap" where the entry
+ would've been, had there been one.
+
+* In a partial match search, we lock all the entry leaf pages that we scan,
+ in addition to locks on posting tree roots, to represent the "gaps" between
+ values.
+
+* In addition to the locks on entry leaf pages and posting tree roots, all
+ scans grab a lock on the metapage. This is to interlock with insertions to
+ the fast update pending list. An insertion to the pending list can really
+ belong anywhere in the tree, and the lock on the metapage represents that.
+
+The interlock for fastupdate pending lists means that with fastupdate=on,
+we effectively always grab a full-index lock, so you could get a lot of false
+positives.
+
+Compatibility
+-------------
+
+Compression of TIDs was introduced in 9.4. Some GIN indexes could remain in the
+uncompressed format because of pg_upgrade from 9.3 or earlier versions.
+For compatibility, the old uncompressed format is also supported. The following
+rules are used to handle it:
+
+* GIN_ITUP_COMPRESSED flag marks index tuples that contain a posting list.
+This flag is stored in the high bit of ItemPointerGetBlockNumber(&itup->t_tid).
+Use GinItupIsCompressed(itup) to check the flag.
+
+* Posting tree pages in the new format are marked with the GIN_COMPRESSED flag.
+ Macros GinPageIsCompressed(page) and GinPageSetCompressed(page) are used to
+ check and set this flag.
+
+* All scan operations check the format of the posting list and use the
+corresponding code to read its content.
+
+* When updating an index tuple containing an uncompressed posting list, it
+is replaced with a new index tuple containing a compressed list.
+
+* When updating an uncompressed posting tree leaf page, it's compressed.
+
+* If vacuum finds some dead TIDs in uncompressed posting lists, they are
+converted into compressed posting lists. This assumes that the compressed
+posting list fits in the space occupied by the uncompressed list. IOW, we
+assume that the compressed version of the page, with the dead items removed,
+takes less space than the old uncompressed version.
+
+Limitations
+-----------
+
+ * Gin doesn't use scan->kill_prior_tuple & scan->ignore_killed_tuples
+ * Gin searches entries only by equality matching, or simple range
+ matching using the "partial match" feature.
+
+TODO
+----
+
+Nearest future:
+
+ * Opclasses for more types (no programming, just many catalog changes)
+
+Distant future:
+
+ * Replace the B-tree of entries with something like GiST
+
+Authors
+-------
+
+Original work was done by Teodor Sigaev (teodor@sigaev.ru) and Oleg Bartunov
+(oleg@sai.msu.su).
diff --git a/src/backend/access/gin/ginarrayproc.c b/src/backend/access/gin/ginarrayproc.c
new file mode 100644
index 0000000..bf73e32
--- /dev/null
+++ b/src/backend/access/gin/ginarrayproc.c
@@ -0,0 +1,305 @@
+/*-------------------------------------------------------------------------
+ *
+ * ginarrayproc.c
+ * support functions for GIN's indexing of any array
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gin/ginarrayproc.c
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/gin.h"
+#include "access/stratnum.h"
+#include "utils/array.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+
+
+#define GinOverlapStrategy 1
+#define GinContainsStrategy 2
+#define GinContainedStrategy 3
+#define GinEqualStrategy 4
+
+
+/*
+ * extractValue support function
+ */
+Datum
+ginarrayextract(PG_FUNCTION_ARGS)
+{
+ /* Make copy of array input to ensure it doesn't disappear while in use */
+ ArrayType *array = PG_GETARG_ARRAYTYPE_P_COPY(0);
+ int32 *nkeys = (int32 *) PG_GETARG_POINTER(1);
+ bool **nullFlags = (bool **) PG_GETARG_POINTER(2);
+ int16 elmlen;
+ bool elmbyval;
+ char elmalign;
+ Datum *elems;
+ bool *nulls;
+ int nelems;
+
+ get_typlenbyvalalign(ARR_ELEMTYPE(array),
+ &elmlen, &elmbyval, &elmalign);
+
+ deconstruct_array(array,
+ ARR_ELEMTYPE(array),
+ elmlen, elmbyval, elmalign,
+ &elems, &nulls, &nelems);
+
+ *nkeys = nelems;
+ *nullFlags = nulls;
+
+ /* we should not free array, elems[i] points into it */
+ PG_RETURN_POINTER(elems);
+}
+
+/*
+ * Formerly, ginarrayextract had only two arguments. Now it has three,
+ * but we still need a pg_proc entry with two args to support reloading
+ * pre-9.1 contrib/intarray opclass declarations. This compatibility
+ * function should go away eventually.
+ */
+Datum
+ginarrayextract_2args(PG_FUNCTION_ARGS)
+{
+ if (PG_NARGS() < 3) /* should not happen */
+ elog(ERROR, "ginarrayextract requires three arguments");
+ return ginarrayextract(fcinfo);
+}
+
+/*
+ * extractQuery support function
+ */
+Datum
+ginqueryarrayextract(PG_FUNCTION_ARGS)
+{
+ /* Make copy of array input to ensure it doesn't disappear while in use */
+ ArrayType *array = PG_GETARG_ARRAYTYPE_P_COPY(0);
+ int32 *nkeys = (int32 *) PG_GETARG_POINTER(1);
+ StrategyNumber strategy = PG_GETARG_UINT16(2);
+
+ /* bool **pmatch = (bool **) PG_GETARG_POINTER(3); */
+ /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
+ bool **nullFlags = (bool **) PG_GETARG_POINTER(5);
+ int32 *searchMode = (int32 *) PG_GETARG_POINTER(6);
+ int16 elmlen;
+ bool elmbyval;
+ char elmalign;
+ Datum *elems;
+ bool *nulls;
+ int nelems;
+
+ get_typlenbyvalalign(ARR_ELEMTYPE(array),
+ &elmlen, &elmbyval, &elmalign);
+
+ deconstruct_array(array,
+ ARR_ELEMTYPE(array),
+ elmlen, elmbyval, elmalign,
+ &elems, &nulls, &nelems);
+
+ *nkeys = nelems;
+ *nullFlags = nulls;
+
+ switch (strategy)
+ {
+ case GinOverlapStrategy:
+ *searchMode = GIN_SEARCH_MODE_DEFAULT;
+ break;
+ case GinContainsStrategy:
+ if (nelems > 0)
+ *searchMode = GIN_SEARCH_MODE_DEFAULT;
+ else /* everything contains the empty set */
+ *searchMode = GIN_SEARCH_MODE_ALL;
+ break;
+ case GinContainedStrategy:
+ /* empty set is contained in everything */
+ *searchMode = GIN_SEARCH_MODE_INCLUDE_EMPTY;
+ break;
+ case GinEqualStrategy:
+ if (nelems > 0)
+ *searchMode = GIN_SEARCH_MODE_DEFAULT;
+ else
+ *searchMode = GIN_SEARCH_MODE_INCLUDE_EMPTY;
+ break;
+ default:
+ elog(ERROR, "ginqueryarrayextract: unknown strategy number: %d",
+ strategy);
+ }
+
+ /* we should not free array, elems[i] points into it */
+ PG_RETURN_POINTER(elems);
+}
+
+/*
+ * consistent support function
+ */
+Datum
+ginarrayconsistent(PG_FUNCTION_ARGS)
+{
+ bool *check = (bool *) PG_GETARG_POINTER(0);
+ StrategyNumber strategy = PG_GETARG_UINT16(1);
+
+ /* ArrayType *query = PG_GETARG_ARRAYTYPE_P(2); */
+ int32 nkeys = PG_GETARG_INT32(3);
+
+ /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
+ bool *recheck = (bool *) PG_GETARG_POINTER(5);
+
+ /* Datum *queryKeys = (Datum *) PG_GETARG_POINTER(6); */
+ bool *nullFlags = (bool *) PG_GETARG_POINTER(7);
+ bool res;
+ int32 i;
+
+ switch (strategy)
+ {
+ case GinOverlapStrategy:
+ /* result is not lossy */
+ *recheck = false;
+ /* must have a match for at least one non-null element */
+ res = false;
+ for (i = 0; i < nkeys; i++)
+ {
+ if (check[i] && !nullFlags[i])
+ {
+ res = true;
+ break;
+ }
+ }
+ break;
+ case GinContainsStrategy:
+ /* result is not lossy */
+ *recheck = false;
+ /* must have all elements in check[] true, and no nulls */
+ res = true;
+ for (i = 0; i < nkeys; i++)
+ {
+ if (!check[i] || nullFlags[i])
+ {
+ res = false;
+ break;
+ }
+ }
+ break;
+ case GinContainedStrategy:
+ /* we will need recheck */
+ *recheck = true;
+ /* can't do anything else useful here */
+ res = true;
+ break;
+ case GinEqualStrategy:
+ /* we will need recheck */
+ *recheck = true;
+
+ /*
+ * Must have all elements in check[] true; no discrimination
+ * against nulls here. This is because array_contain_compare and
+ * array_eq handle nulls differently ...
+ */
+ res = true;
+ for (i = 0; i < nkeys; i++)
+ {
+ if (!check[i])
+ {
+ res = false;
+ break;
+ }
+ }
+ break;
+ default:
+ elog(ERROR, "ginarrayconsistent: unknown strategy number: %d",
+ strategy);
+ res = false;
+ }
+
+ PG_RETURN_BOOL(res);
+}
+
+/*
+ * triconsistent support function
+ */
+Datum
+ginarraytriconsistent(PG_FUNCTION_ARGS)
+{
+ GinTernaryValue *check = (GinTernaryValue *) PG_GETARG_POINTER(0);
+ StrategyNumber strategy = PG_GETARG_UINT16(1);
+
+ /* ArrayType *query = PG_GETARG_ARRAYTYPE_P(2); */
+ int32 nkeys = PG_GETARG_INT32(3);
+
+ /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
+ /* Datum *queryKeys = (Datum *) PG_GETARG_POINTER(5); */
+ bool *nullFlags = (bool *) PG_GETARG_POINTER(6);
+ GinTernaryValue res;
+ int32 i;
+
+ switch (strategy)
+ {
+ case GinOverlapStrategy:
+ /* must have a match for at least one non-null element */
+ res = GIN_FALSE;
+ for (i = 0; i < nkeys; i++)
+ {
+ if (!nullFlags[i])
+ {
+ if (check[i] == GIN_TRUE)
+ {
+ res = GIN_TRUE;
+ break;
+ }
+ else if (check[i] == GIN_MAYBE && res == GIN_FALSE)
+ {
+ res = GIN_MAYBE;
+ }
+ }
+ }
+ break;
+ case GinContainsStrategy:
+ /* must have all elements in check[] true, and no nulls */
+ res = GIN_TRUE;
+ for (i = 0; i < nkeys; i++)
+ {
+ if (check[i] == GIN_FALSE || nullFlags[i])
+ {
+ res = GIN_FALSE;
+ break;
+ }
+ if (check[i] == GIN_MAYBE)
+ {
+ res = GIN_MAYBE;
+ }
+ }
+ break;
+ case GinContainedStrategy:
+ /* can't do anything else useful here */
+ res = GIN_MAYBE;
+ break;
+ case GinEqualStrategy:
+
+ /*
+ * Must have all elements in check[] true; no discrimination
+ * against nulls here. This is because array_contain_compare and
+ * array_eq handle nulls differently ...
+ */
+ res = GIN_MAYBE;
+ for (i = 0; i < nkeys; i++)
+ {
+ if (check[i] == GIN_FALSE)
+ {
+ res = GIN_FALSE;
+ break;
+ }
+ }
+ break;
+ default:
+ elog(ERROR, "ginarraytriconsistent: unknown strategy number: %d",
+ strategy);
+ res = false;
+ }
+
+ PG_RETURN_GIN_TERNARY_VALUE(res);
+}
diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c
new file mode 100644
index 0000000..482cf10
--- /dev/null
+++ b/src/backend/access/gin/ginbtree.c
@@ -0,0 +1,795 @@
+/*-------------------------------------------------------------------------
+ *
+ * ginbtree.c
+ * page utilities routines for the postgres inverted index access method.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gin/ginbtree.c
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/gin_private.h"
+#include "access/ginxlog.h"
+#include "access/xloginsert.h"
+#include "miscadmin.h"
+#include "storage/predicate.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+static void ginFindParents(GinBtree btree, GinBtreeStack *stack);
+static bool ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
+ void *insertdata, BlockNumber updateblkno,
+ Buffer childbuf, GinStatsData *buildStats);
+static void ginFinishSplit(GinBtree btree, GinBtreeStack *stack,
+ bool freestack, GinStatsData *buildStats);
+
+/*
+ * Lock buffer by needed method for search.
+ */
+int
+ginTraverseLock(Buffer buffer, bool searchMode)
+{
+ Page page;
+ int access = GIN_SHARE;
+
+ LockBuffer(buffer, GIN_SHARE);
+ page = BufferGetPage(buffer);
+ if (GinPageIsLeaf(page))
+ {
+ if (searchMode == false)
+ {
+ /* we should relock our page */
+ LockBuffer(buffer, GIN_UNLOCK);
+ LockBuffer(buffer, GIN_EXCLUSIVE);
+
+ /* But root can become non-leaf during relock */
+ if (!GinPageIsLeaf(page))
+ {
+ /* restore old lock type (very rare) */
+ LockBuffer(buffer, GIN_UNLOCK);
+ LockBuffer(buffer, GIN_SHARE);
+ }
+ else
+ access = GIN_EXCLUSIVE;
+ }
+ }
+
+ return access;
+}
+
+/*
+ * Descend the tree to the leaf page that contains or would contain the key
+ * we're searching for. The key should already be filled in 'btree', in
+ * tree-type specific manner. If btree->fullScan is true, descends to the
+ * leftmost leaf page.
+ *
+ * If 'searchmode' is false, on return stack->buffer is exclusively locked,
+ * and the stack represents the full path to the root. Otherwise stack->buffer
+ * is share-locked, and stack->parent is NULL.
+ *
+ * If 'rootConflictCheck' is true, tree root is checked for serialization
+ * conflict.
+ */
+GinBtreeStack *
+ginFindLeafPage(GinBtree btree, bool searchMode,
+ bool rootConflictCheck, Snapshot snapshot)
+{
+ GinBtreeStack *stack;
+
+ stack = (GinBtreeStack *) palloc(sizeof(GinBtreeStack));
+ stack->blkno = btree->rootBlkno;
+ stack->buffer = ReadBuffer(btree->index, btree->rootBlkno);
+ stack->parent = NULL;
+ stack->predictNumber = 1;
+
+ if (rootConflictCheck)
+ CheckForSerializableConflictIn(btree->index, NULL, btree->rootBlkno);
+
+ for (;;)
+ {
+ Page page;
+ BlockNumber child;
+ int access;
+
+ stack->off = InvalidOffsetNumber;
+
+ page = BufferGetPage(stack->buffer);
+ TestForOldSnapshot(snapshot, btree->index, page);
+
+ access = ginTraverseLock(stack->buffer, searchMode);
+
+ /*
+ * If we're going to modify the tree, finish any incomplete splits we
+ * encounter on the way.
+ */
+ if (!searchMode && GinPageIsIncompleteSplit(page))
+ ginFinishSplit(btree, stack, false, NULL);
+
+ /*
+ * ok, page is correctly locked, we should check to move right ..,
+ * root never has a right link, so small optimization
+ */
+ while (btree->fullScan == false && stack->blkno != btree->rootBlkno &&
+ btree->isMoveRight(btree, page))
+ {
+ BlockNumber rightlink = GinPageGetOpaque(page)->rightlink;
+
+ if (rightlink == InvalidBlockNumber)
+ /* rightmost page */
+ break;
+
+ stack->buffer = ginStepRight(stack->buffer, btree->index, access);
+ stack->blkno = rightlink;
+ page = BufferGetPage(stack->buffer);
+ TestForOldSnapshot(snapshot, btree->index, page);
+
+ if (!searchMode && GinPageIsIncompleteSplit(page))
+ ginFinishSplit(btree, stack, false, NULL);
+ }
+
+ if (GinPageIsLeaf(page)) /* we found, return locked page */
+ return stack;
+
+ /* now we have correct buffer, try to find child */
+ child = btree->findChildPage(btree, stack);
+
+ LockBuffer(stack->buffer, GIN_UNLOCK);
+ Assert(child != InvalidBlockNumber);
+ Assert(stack->blkno != child);
+
+ if (searchMode)
+ {
+ /* in search mode we may forget path to leaf */
+ stack->blkno = child;
+ stack->buffer = ReleaseAndReadBuffer(stack->buffer, btree->index, stack->blkno);
+ }
+ else
+ {
+ GinBtreeStack *ptr = (GinBtreeStack *) palloc(sizeof(GinBtreeStack));
+
+ ptr->parent = stack;
+ stack = ptr;
+ stack->blkno = child;
+ stack->buffer = ReadBuffer(btree->index, stack->blkno);
+ stack->predictNumber = 1;
+ }
+ }
+}
+
+/*
+ * Step right from current page.
+ *
+ * The next page is locked first, before releasing the current page. This is
+ * crucial to protect from concurrent page deletion (see comment in
+ * ginDeletePage).
+ */
+Buffer
+ginStepRight(Buffer buffer, Relation index, int lockmode)
+{
+ Buffer nextbuffer;
+ Page page = BufferGetPage(buffer);
+ bool isLeaf = GinPageIsLeaf(page);
+ bool isData = GinPageIsData(page);
+ BlockNumber blkno = GinPageGetOpaque(page)->rightlink;
+
+ nextbuffer = ReadBuffer(index, blkno);
+ LockBuffer(nextbuffer, lockmode);
+ UnlockReleaseBuffer(buffer);
+
+ /* Sanity check that the page we stepped to is of similar kind. */
+ page = BufferGetPage(nextbuffer);
+ if (isLeaf != GinPageIsLeaf(page) || isData != GinPageIsData(page))
+ elog(ERROR, "right sibling of GIN page is of different type");
+
+ return nextbuffer;
+}
+
+void
+freeGinBtreeStack(GinBtreeStack *stack)
+{
+ while (stack)
+ {
+ GinBtreeStack *tmp = stack->parent;
+
+ if (stack->buffer != InvalidBuffer)
+ ReleaseBuffer(stack->buffer);
+
+ pfree(stack);
+ stack = tmp;
+ }
+}
+
+/*
+ * Try to find parent for current stack position. Returns correct parent and
+ * child's offset in stack->parent. The root page is never released, to
+ * prevent conflict with vacuum process.
+ */
+static void
+ginFindParents(GinBtree btree, GinBtreeStack *stack)
+{
+ Page page;
+ Buffer buffer;
+ BlockNumber blkno,
+ leftmostBlkno;
+ OffsetNumber offset;
+ GinBtreeStack *root;
+ GinBtreeStack *ptr;
+
+ /*
+ * Unwind the stack all the way up to the root, leaving only the root
+ * item.
+ *
+ * Be careful not to release the pin on the root page! The pin on root
+ * page is required to lock out concurrent vacuums on the tree.
+ */
+ root = stack->parent;
+ while (root->parent)
+ {
+ ReleaseBuffer(root->buffer);
+ root = root->parent;
+ }
+
+ Assert(root->blkno == btree->rootBlkno);
+ Assert(BufferGetBlockNumber(root->buffer) == btree->rootBlkno);
+ root->off = InvalidOffsetNumber;
+
+ blkno = root->blkno;
+ buffer = root->buffer;
+
+ ptr = (GinBtreeStack *) palloc(sizeof(GinBtreeStack));
+
+ for (;;)
+ {
+ LockBuffer(buffer, GIN_EXCLUSIVE);
+ page = BufferGetPage(buffer);
+ if (GinPageIsLeaf(page))
+ elog(ERROR, "Lost path");
+
+ if (GinPageIsIncompleteSplit(page))
+ {
+ Assert(blkno != btree->rootBlkno);
+ ptr->blkno = blkno;
+ ptr->buffer = buffer;
+
+ /*
+ * parent may be wrong, but if so, the ginFinishSplit call will
+ * recurse to call ginFindParents again to fix it.
+ */
+ ptr->parent = root;
+ ptr->off = InvalidOffsetNumber;
+
+ ginFinishSplit(btree, ptr, false, NULL);
+ }
+
+ leftmostBlkno = btree->getLeftMostChild(btree, page);
+
+ while ((offset = btree->findChildPtr(btree, page, stack->blkno, InvalidOffsetNumber)) == InvalidOffsetNumber)
+ {
+ blkno = GinPageGetOpaque(page)->rightlink;
+ if (blkno == InvalidBlockNumber)
+ {
+ UnlockReleaseBuffer(buffer);
+ break;
+ }
+ buffer = ginStepRight(buffer, btree->index, GIN_EXCLUSIVE);
+ page = BufferGetPage(buffer);
+
+ /* finish any incomplete splits, as above */
+ if (GinPageIsIncompleteSplit(page))
+ {
+ Assert(blkno != btree->rootBlkno);
+ ptr->blkno = blkno;
+ ptr->buffer = buffer;
+ ptr->parent = root;
+ ptr->off = InvalidOffsetNumber;
+
+ ginFinishSplit(btree, ptr, false, NULL);
+ }
+ }
+
+ if (blkno != InvalidBlockNumber)
+ {
+ ptr->blkno = blkno;
+ ptr->buffer = buffer;
+ ptr->parent = root; /* it may be wrong, but in next call we will
+ * correct */
+ ptr->off = offset;
+ stack->parent = ptr;
+ return;
+ }
+
+ /* Descend down to next level */
+ blkno = leftmostBlkno;
+ buffer = ReadBuffer(btree->index, blkno);
+ }
+}
+
+/*
+ * Insert a new item to a page.
+ *
+ * Returns true if the insertion was finished. On false, the page was split and
+ * the parent needs to be updated. (A root split returns true as it doesn't
+ * need any further action by the caller to complete.)
+ *
+ * When inserting a downlink to an internal page, 'childbuf' contains the
+ * child page that was split. Its GIN_INCOMPLETE_SPLIT flag will be cleared
+ * atomically with the insert. Also, the existing item at offset stack->off
+ * in the target page is updated to point to updateblkno.
+ *
+ * stack->buffer is locked on entry, and is kept locked.
+ * Likewise for childbuf, if given.
+ */
+static bool
+ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
+ void *insertdata, BlockNumber updateblkno,
+ Buffer childbuf, GinStatsData *buildStats)
+{
+ Page page = BufferGetPage(stack->buffer);
+ bool result;
+ GinPlaceToPageRC rc;
+ uint16 xlflags = 0;
+ Page childpage = NULL;
+ Page newlpage = NULL,
+ newrpage = NULL;
+ void *ptp_workspace = NULL;
+ MemoryContext tmpCxt;
+ MemoryContext oldCxt;
+
+ /*
+ * We do all the work of this function and its subfunctions in a temporary
+ * memory context. This avoids leakages and simplifies APIs, since some
+ * subfunctions allocate storage that has to survive until we've finished
+ * the WAL insertion.
+ */
+ tmpCxt = AllocSetContextCreate(CurrentMemoryContext,
+ "ginPlaceToPage temporary context",
+ ALLOCSET_DEFAULT_SIZES);
+ oldCxt = MemoryContextSwitchTo(tmpCxt);
+
+ if (GinPageIsData(page))
+ xlflags |= GIN_INSERT_ISDATA;
+ if (GinPageIsLeaf(page))
+ {
+ xlflags |= GIN_INSERT_ISLEAF;
+ Assert(!BufferIsValid(childbuf));
+ Assert(updateblkno == InvalidBlockNumber);
+ }
+ else
+ {
+ Assert(BufferIsValid(childbuf));
+ Assert(updateblkno != InvalidBlockNumber);
+ childpage = BufferGetPage(childbuf);
+ }
+
+ /*
+ * See if the incoming tuple will fit on the page. beginPlaceToPage will
+ * decide if the page needs to be split, and will compute the split
+ * contents if so. See comments for beginPlaceToPage and execPlaceToPage
+ * functions for more details of the API here.
+ */
+ rc = btree->beginPlaceToPage(btree, stack->buffer, stack,
+ insertdata, updateblkno,
+ &ptp_workspace,
+ &newlpage, &newrpage);
+
+ if (rc == GPTP_NO_WORK)
+ {
+ /* Nothing to do */
+ result = true;
+ }
+ else if (rc == GPTP_INSERT)
+ {
+ /* It will fit, perform the insertion */
+ START_CRIT_SECTION();
+
+ if (RelationNeedsWAL(btree->index) && !btree->isBuild)
+ {
+ XLogBeginInsert();
+ XLogRegisterBuffer(0, stack->buffer, REGBUF_STANDARD);
+ if (BufferIsValid(childbuf))
+ XLogRegisterBuffer(1, childbuf, REGBUF_STANDARD);
+ }
+
+ /* Perform the page update, and register any extra WAL data */
+ btree->execPlaceToPage(btree, stack->buffer, stack,
+ insertdata, updateblkno, ptp_workspace);
+
+ MarkBufferDirty(stack->buffer);
+
+ /* An insert to an internal page finishes the split of the child. */
+ if (BufferIsValid(childbuf))
+ {
+ GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT;
+ MarkBufferDirty(childbuf);
+ }
+
+ if (RelationNeedsWAL(btree->index) && !btree->isBuild)
+ {
+ XLogRecPtr recptr;
+ ginxlogInsert xlrec;
+ BlockIdData childblknos[2];
+
+ xlrec.flags = xlflags;
+
+ XLogRegisterData((char *) &xlrec, sizeof(ginxlogInsert));
+
+ /*
+ * Log information about child if this was an insertion of a
+ * downlink.
+ */
+ if (BufferIsValid(childbuf))
+ {
+ BlockIdSet(&childblknos[0], BufferGetBlockNumber(childbuf));
+ BlockIdSet(&childblknos[1], GinPageGetOpaque(childpage)->rightlink);
+ XLogRegisterData((char *) childblknos,
+ sizeof(BlockIdData) * 2);
+ }
+
+ recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT);
+ PageSetLSN(page, recptr);
+ if (BufferIsValid(childbuf))
+ PageSetLSN(childpage, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /* Insertion is complete. */
+ result = true;
+ }
+ else if (rc == GPTP_SPLIT)
+ {
+ /*
+ * Didn't fit, need to split. The split has been computed in newlpage
+ * and newrpage, which are pointers to palloc'd pages, not associated
+ * with buffers. stack->buffer is not touched yet.
+ */
+ Buffer rbuffer;
+ BlockNumber savedRightLink;
+ ginxlogSplit data;
+ Buffer lbuffer = InvalidBuffer;
+ Page newrootpg = NULL;
+
+ /* Get a new index page to become the right page */
+ rbuffer = GinNewBuffer(btree->index);
+
+ /* During index build, count the new page */
+ if (buildStats)
+ {
+ if (btree->isData)
+ buildStats->nDataPages++;
+ else
+ buildStats->nEntryPages++;
+ }
+
+ savedRightLink = GinPageGetOpaque(page)->rightlink;
+
+ /* Begin setting up WAL record */
+ data.node = btree->index->rd_node;
+ data.flags = xlflags;
+ if (BufferIsValid(childbuf))
+ {
+ data.leftChildBlkno = BufferGetBlockNumber(childbuf);
+ data.rightChildBlkno = GinPageGetOpaque(childpage)->rightlink;
+ }
+ else
+ data.leftChildBlkno = data.rightChildBlkno = InvalidBlockNumber;
+
+ if (stack->parent == NULL)
+ {
+ /*
+ * splitting the root, so we need to allocate new left page and
+ * place pointers to left and right page on root page.
+ */
+ lbuffer = GinNewBuffer(btree->index);
+
+ /* During index build, count the new left page */
+ if (buildStats)
+ {
+ if (btree->isData)
+ buildStats->nDataPages++;
+ else
+ buildStats->nEntryPages++;
+ }
+
+ data.rrlink = InvalidBlockNumber;
+ data.flags |= GIN_SPLIT_ROOT;
+
+ GinPageGetOpaque(newrpage)->rightlink = InvalidBlockNumber;
+ GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer);
+
+ /*
+ * Construct a new root page containing downlinks to the new left
+ * and right pages. (Do this in a temporary copy rather than
+ * overwriting the original page directly, since we're not in the
+ * critical section yet.)
+ */
+ newrootpg = PageGetTempPage(newrpage);
+ GinInitPage(newrootpg, GinPageGetOpaque(newlpage)->flags & ~(GIN_LEAF | GIN_COMPRESSED), BLCKSZ);
+
+ btree->fillRoot(btree, newrootpg,
+ BufferGetBlockNumber(lbuffer), newlpage,
+ BufferGetBlockNumber(rbuffer), newrpage);
+
+ if (GinPageIsLeaf(BufferGetPage(stack->buffer)))
+ {
+
+ PredicateLockPageSplit(btree->index,
+ BufferGetBlockNumber(stack->buffer),
+ BufferGetBlockNumber(lbuffer));
+
+ PredicateLockPageSplit(btree->index,
+ BufferGetBlockNumber(stack->buffer),
+ BufferGetBlockNumber(rbuffer));
+ }
+
+ }
+ else
+ {
+ /* splitting a non-root page */
+ data.rrlink = savedRightLink;
+
+ GinPageGetOpaque(newrpage)->rightlink = savedRightLink;
+ GinPageGetOpaque(newlpage)->flags |= GIN_INCOMPLETE_SPLIT;
+ GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer);
+
+ if (GinPageIsLeaf(BufferGetPage(stack->buffer)))
+ {
+
+ PredicateLockPageSplit(btree->index,
+ BufferGetBlockNumber(stack->buffer),
+ BufferGetBlockNumber(rbuffer));
+ }
+ }
+
+ /*
+ * OK, we have the new contents of the left page in a temporary copy
+ * now (newlpage), and likewise for the new contents of the
+ * newly-allocated right block. The original page is still unchanged.
+ *
+ * If this is a root split, we also have a temporary page containing
+ * the new contents of the root.
+ */
+
+ START_CRIT_SECTION();
+
+ MarkBufferDirty(rbuffer);
+ MarkBufferDirty(stack->buffer);
+
+ /*
+ * Restore the temporary copies over the real buffers.
+ */
+ if (stack->parent == NULL)
+ {
+ /* Splitting the root, three pages to update */
+ MarkBufferDirty(lbuffer);
+ memcpy(page, newrootpg, BLCKSZ);
+ memcpy(BufferGetPage(lbuffer), newlpage, BLCKSZ);
+ memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ);
+ }
+ else
+ {
+ /* Normal split, only two pages to update */
+ memcpy(page, newlpage, BLCKSZ);
+ memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ);
+ }
+
+ /* We also clear childbuf's INCOMPLETE_SPLIT flag, if passed */
+ if (BufferIsValid(childbuf))
+ {
+ GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT;
+ MarkBufferDirty(childbuf);
+ }
+
+ /* write WAL record */
+ if (RelationNeedsWAL(btree->index) && !btree->isBuild)
+ {
+ XLogRecPtr recptr;
+
+ XLogBeginInsert();
+
+ /*
+ * We just take full page images of all the split pages. Splits
+ * are uncommon enough that it's not worth complicating the code
+ * to be more efficient.
+ */
+ if (stack->parent == NULL)
+ {
+ XLogRegisterBuffer(0, lbuffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
+ XLogRegisterBuffer(1, rbuffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
+ XLogRegisterBuffer(2, stack->buffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
+ }
+ else
+ {
+ XLogRegisterBuffer(0, stack->buffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
+ XLogRegisterBuffer(1, rbuffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
+ }
+ if (BufferIsValid(childbuf))
+ XLogRegisterBuffer(3, childbuf, REGBUF_STANDARD);
+
+ XLogRegisterData((char *) &data, sizeof(ginxlogSplit));
+
+ recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT);
+
+ PageSetLSN(page, recptr);
+ PageSetLSN(BufferGetPage(rbuffer), recptr);
+ if (stack->parent == NULL)
+ PageSetLSN(BufferGetPage(lbuffer), recptr);
+ if (BufferIsValid(childbuf))
+ PageSetLSN(childpage, recptr);
+ }
+ END_CRIT_SECTION();
+
+ /*
+ * We can release the locks/pins on the new pages now, but keep
+ * stack->buffer locked. childbuf doesn't get unlocked either.
+ */
+ UnlockReleaseBuffer(rbuffer);
+ if (stack->parent == NULL)
+ UnlockReleaseBuffer(lbuffer);
+
+ /*
+ * If we split the root, we're done. Otherwise the split is not
+ * complete until the downlink for the new page has been inserted to
+ * the parent.
+ */
+ result = (stack->parent == NULL);
+ }
+ else
+ {
+ elog(ERROR, "invalid return code from GIN beginPlaceToPage method: %d", rc);
+ result = false; /* keep compiler quiet */
+ }
+
+ /* Clean up temp context */
+ MemoryContextSwitchTo(oldCxt);
+ MemoryContextDelete(tmpCxt);
+
+ return result;
+}
+
+/*
+ * Finish a split by inserting the downlink for the new page to parent.
+ *
+ * On entry, stack->buffer is exclusively locked.
+ *
+ * If freestack is true, all the buffers are released and unlocked as we
+ * crawl up the tree, and 'stack' is freed. Otherwise stack->buffer is kept
+ * locked, and stack is unmodified, except for possibly moving right to find
+ * the correct parent of page.
+ */
+static void
+ginFinishSplit(GinBtree btree, GinBtreeStack *stack, bool freestack,
+ GinStatsData *buildStats)
+{
+ Page page;
+ bool done;
+ bool first = true;
+
+ /*
+ * freestack == false when we encounter an incompletely split page during
+ * a scan, while freestack == true is used in the normal scenario that a
+ * split is finished right after the initial insert.
+ */
+ if (!freestack)
+ elog(DEBUG1, "finishing incomplete split of block %u in gin index \"%s\"",
+ stack->blkno, RelationGetRelationName(btree->index));
+
+ /* this loop crawls up the stack until the insertion is complete */
+ do
+ {
+ GinBtreeStack *parent = stack->parent;
+ void *insertdata;
+ BlockNumber updateblkno;
+
+ /* search parent to lock */
+ LockBuffer(parent->buffer, GIN_EXCLUSIVE);
+
+ /*
+ * If the parent page was incompletely split, finish that split first,
+ * then continue with the current one.
+ *
+ * Note: we have to finish *all* incomplete splits we encounter, even
+ * if we have to move right. Otherwise we might choose as the target a
+ * page that has no downlink in the parent, and splitting it further
+ * would fail.
+ */
+ if (GinPageIsIncompleteSplit(BufferGetPage(parent->buffer)))
+ ginFinishSplit(btree, parent, false, buildStats);
+
+ /* move right if it's needed */
+ page = BufferGetPage(parent->buffer);
+ while ((parent->off = btree->findChildPtr(btree, page, stack->blkno, parent->off)) == InvalidOffsetNumber)
+ {
+ if (GinPageRightMost(page))
+ {
+ /*
+ * rightmost page, but we don't find parent, we should use
+ * plain search...
+ */
+ LockBuffer(parent->buffer, GIN_UNLOCK);
+ ginFindParents(btree, stack);
+ parent = stack->parent;
+ Assert(parent != NULL);
+ break;
+ }
+
+ parent->buffer = ginStepRight(parent->buffer, btree->index, GIN_EXCLUSIVE);
+ parent->blkno = BufferGetBlockNumber(parent->buffer);
+ page = BufferGetPage(parent->buffer);
+
+ if (GinPageIsIncompleteSplit(BufferGetPage(parent->buffer)))
+ ginFinishSplit(btree, parent, false, buildStats);
+ }
+
+ /* insert the downlink */
+ insertdata = btree->prepareDownlink(btree, stack->buffer);
+ updateblkno = GinPageGetOpaque(BufferGetPage(stack->buffer))->rightlink;
+ done = ginPlaceToPage(btree, parent,
+ insertdata, updateblkno,
+ stack->buffer, buildStats);
+ pfree(insertdata);
+
+ /*
+ * If the caller requested to free the stack, unlock and release the
+ * child buffer now. Otherwise keep it pinned and locked, but if we
+ * have to recurse up the tree, we can unlock the upper pages, only
+ * keeping the page at the bottom of the stack locked.
+ */
+ if (!first || freestack)
+ LockBuffer(stack->buffer, GIN_UNLOCK);
+ if (freestack)
+ {
+ ReleaseBuffer(stack->buffer);
+ pfree(stack);
+ }
+ stack = parent;
+
+ first = false;
+ } while (!done);
+
+ /* unlock the parent */
+ LockBuffer(stack->buffer, GIN_UNLOCK);
+
+ if (freestack)
+ freeGinBtreeStack(stack);
+}
+
+/*
+ * Insert a value to tree described by stack.
+ *
+ * The value to be inserted is given in 'insertdata'. Its format depends
+ * on whether this is an entry or data tree, ginInsertValue just passes it
+ * through to the tree-specific callback function.
+ *
+ * During an index build, buildStats is non-null and the counters it contains
+ * are incremented as needed.
+ *
+ * NB: the passed-in stack is freed, as though by freeGinBtreeStack.
+ */
+void
+ginInsertValue(GinBtree btree, GinBtreeStack *stack, void *insertdata,
+ GinStatsData *buildStats)
+{
+ bool done;
+
+ /* If the leaf page was incompletely split, finish the split first */
+ if (GinPageIsIncompleteSplit(BufferGetPage(stack->buffer)))
+ ginFinishSplit(btree, stack, false, buildStats);
+
+ done = ginPlaceToPage(btree, stack,
+ insertdata, InvalidBlockNumber,
+ InvalidBuffer, buildStats);
+ if (done)
+ {
+ LockBuffer(stack->buffer, GIN_UNLOCK);
+ freeGinBtreeStack(stack);
+ }
+ else
+ ginFinishSplit(btree, stack, true, buildStats);
+}
diff --git a/src/backend/access/gin/ginbulk.c b/src/backend/access/gin/ginbulk.c
new file mode 100644
index 0000000..4c5067c
--- /dev/null
+++ b/src/backend/access/gin/ginbulk.c
@@ -0,0 +1,293 @@
+/*-------------------------------------------------------------------------
+ *
+ * ginbulk.c
+ * routines for fast build of inverted index
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gin/ginbulk.c
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "access/gin_private.h"
+#include "utils/datum.h"
+#include "utils/memutils.h"
+
+
+#define DEF_NENTRY 2048 /* GinEntryAccumulator allocation quantum */
+#define DEF_NPTR 5 /* ItemPointer initial allocation quantum */
+
+
+/* Combiner function for rbtree.c */
+static void
+ginCombineData(RBTNode *existing, const RBTNode *newdata, void *arg)
+{
+ GinEntryAccumulator *eo = (GinEntryAccumulator *) existing;
+ const GinEntryAccumulator *en = (const GinEntryAccumulator *) newdata;
+ BuildAccumulator *accum = (BuildAccumulator *) arg;
+
+ /*
+ * Note this code assumes that newdata contains only one itempointer.
+ */
+ if (eo->count >= eo->maxcount)
+ {
+ if (eo->maxcount > INT_MAX)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("posting list is too long"),
+ errhint("Reduce maintenance_work_mem.")));
+
+ accum->allocatedMemory -= GetMemoryChunkSpace(eo->list);
+ eo->maxcount *= 2;
+ eo->list = (ItemPointerData *)
+ repalloc_huge(eo->list, sizeof(ItemPointerData) * eo->maxcount);
+ accum->allocatedMemory += GetMemoryChunkSpace(eo->list);
+ }
+
+ /* If item pointers are not ordered, they will need to be sorted later */
+ if (eo->shouldSort == false)
+ {
+ int res;
+
+ res = ginCompareItemPointers(eo->list + eo->count - 1, en->list);
+ Assert(res != 0);
+
+ if (res > 0)
+ eo->shouldSort = true;
+ }
+
+ eo->list[eo->count] = en->list[0];
+ eo->count++;
+}
+
+/* Comparator function for rbtree.c */
+static int
+cmpEntryAccumulator(const RBTNode *a, const RBTNode *b, void *arg)
+{
+ const GinEntryAccumulator *ea = (const GinEntryAccumulator *) a;
+ const GinEntryAccumulator *eb = (const GinEntryAccumulator *) b;
+ BuildAccumulator *accum = (BuildAccumulator *) arg;
+
+ return ginCompareAttEntries(accum->ginstate,
+ ea->attnum, ea->key, ea->category,
+ eb->attnum, eb->key, eb->category);
+}
+
+/* Allocator function for rbtree.c */
+static RBTNode *
+ginAllocEntryAccumulator(void *arg)
+{
+ BuildAccumulator *accum = (BuildAccumulator *) arg;
+ GinEntryAccumulator *ea;
+
+ /*
+ * Allocate memory by rather big chunks to decrease overhead. We have no
+ * need to reclaim RBTNodes individually, so this costs nothing.
+ */
+ if (accum->entryallocator == NULL || accum->eas_used >= DEF_NENTRY)
+ {
+ accum->entryallocator = palloc(sizeof(GinEntryAccumulator) * DEF_NENTRY);
+ accum->allocatedMemory += GetMemoryChunkSpace(accum->entryallocator);
+ accum->eas_used = 0;
+ }
+
+ /* Allocate new RBTNode from current chunk */
+ ea = accum->entryallocator + accum->eas_used;
+ accum->eas_used++;
+
+ return (RBTNode *) ea;
+}
+
+void
+ginInitBA(BuildAccumulator *accum)
+{
+ /* accum->ginstate is intentionally not set here */
+ accum->allocatedMemory = 0;
+ accum->entryallocator = NULL;
+ accum->eas_used = 0;
+ accum->tree = rbt_create(sizeof(GinEntryAccumulator),
+ cmpEntryAccumulator,
+ ginCombineData,
+ ginAllocEntryAccumulator,
+ NULL, /* no freefunc needed */
+ (void *) accum);
+}
+
+/*
+ * This is basically the same as datumCopy(), but extended to count
+ * palloc'd space in accum->allocatedMemory.
+ */
+static Datum
+getDatumCopy(BuildAccumulator *accum, OffsetNumber attnum, Datum value)
+{
+ Form_pg_attribute att;
+ Datum res;
+
+ att = TupleDescAttr(accum->ginstate->origTupdesc, attnum - 1);
+ if (att->attbyval)
+ res = value;
+ else
+ {
+ res = datumCopy(value, false, att->attlen);
+ accum->allocatedMemory += GetMemoryChunkSpace(DatumGetPointer(res));
+ }
+ return res;
+}
+
+/*
+ * Find/store one entry from indexed value.
+ */
+static void
+ginInsertBAEntry(BuildAccumulator *accum,
+ ItemPointer heapptr, OffsetNumber attnum,
+ Datum key, GinNullCategory category)
+{
+ GinEntryAccumulator eatmp;
+ GinEntryAccumulator *ea;
+ bool isNew;
+
+ /*
+ * For the moment, fill only the fields of eatmp that will be looked at by
+ * cmpEntryAccumulator or ginCombineData.
+ */
+ eatmp.attnum = attnum;
+ eatmp.key = key;
+ eatmp.category = category;
+ /* temporarily set up single-entry itempointer list */
+ eatmp.list = heapptr;
+
+ ea = (GinEntryAccumulator *) rbt_insert(accum->tree, (RBTNode *) &eatmp,
+ &isNew);
+
+ if (isNew)
+ {
+ /*
+ * Finish initializing new tree entry, including making permanent
+ * copies of the datum (if it's not null) and itempointer.
+ */
+ if (category == GIN_CAT_NORM_KEY)
+ ea->key = getDatumCopy(accum, attnum, key);
+ ea->maxcount = DEF_NPTR;
+ ea->count = 1;
+ ea->shouldSort = false;
+ ea->list =
+ (ItemPointerData *) palloc(sizeof(ItemPointerData) * DEF_NPTR);
+ ea->list[0] = *heapptr;
+ accum->allocatedMemory += GetMemoryChunkSpace(ea->list);
+ }
+ else
+ {
+ /*
+ * ginCombineData did everything needed.
+ */
+ }
+}
+
+/*
+ * Insert the entries for one heap pointer.
+ *
+ * Since the entries are being inserted into a balanced binary tree, you
+ * might think that the order of insertion wouldn't be critical, but it turns
+ * out that inserting the entries in sorted order results in a lot of
+ * rebalancing operations and is slow. To prevent this, we attempt to insert
+ * the nodes in an order that will produce a nearly-balanced tree if the input
+ * is in fact sorted.
+ *
+ * We do this as follows. First, we imagine that we have an array whose size
+ * is the smallest power of two greater than or equal to the actual array
+ * size. Second, we insert the middle entry of our virtual array into the
+ * tree; then, we insert the middles of each half of our virtual array, then
+ * middles of quarters, etc.
+ */
+void
+ginInsertBAEntries(BuildAccumulator *accum,
+ ItemPointer heapptr, OffsetNumber attnum,
+ Datum *entries, GinNullCategory *categories,
+ int32 nentries)
+{
+ uint32 step = nentries;
+
+ if (nentries <= 0)
+ return;
+
+ Assert(ItemPointerIsValid(heapptr) && attnum >= FirstOffsetNumber);
+
+ /*
+ * step will contain the largest power of 2 that is <= nentries
+ */
+ step |= (step >> 1);
+ step |= (step >> 2);
+ step |= (step >> 4);
+ step |= (step >> 8);
+ step |= (step >> 16);
+ step >>= 1;
+ step++;
+
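+ /*
+ * A small worked example: with nentries = 7 the bit tricks above leave
+ * step = 4, and the loop below then visits indexes 3, then 1 and 5,
+ * then 0, 2, 4 and 6: the middle of the virtual 8-element array first,
+ * then the middles of each half, and so on.
+ */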
+ while (step > 0)
+ {
+ int i;
+
+ for (i = step - 1; i < nentries && i >= 0; i += step << 1 /* *2 */ )
+ ginInsertBAEntry(accum, heapptr, attnum,
+ entries[i], categories[i]);
+
+ step >>= 1; /* /2 */
+ }
+}
+
+static int
+qsortCompareItemPointers(const void *a, const void *b)
+{
+ int res = ginCompareItemPointers((ItemPointer) a, (ItemPointer) b);
+
+ /* Assert that there are no equal item pointers being sorted */
+ Assert(res != 0);
+ return res;
+}
+
+/* Prepare to read out the rbtree contents using ginGetBAEntry */
+void
+ginBeginBAScan(BuildAccumulator *accum)
+{
+ rbt_begin_iterate(accum->tree, LeftRightWalk, &accum->tree_walk);
+}
+
+/*
+ * Get the next entry in sequence from the BuildAccumulator's rbtree.
+ * This consists of a single key datum and a list (array) of one or more
+ * heap TIDs in which that key is found. The list is guaranteed sorted.
+ */
+ItemPointerData *
+ginGetBAEntry(BuildAccumulator *accum,
+ OffsetNumber *attnum, Datum *key, GinNullCategory *category,
+ uint32 *n)
+{
+ GinEntryAccumulator *entry;
+ ItemPointerData *list;
+
+ entry = (GinEntryAccumulator *) rbt_iterate(&accum->tree_walk);
+
+ if (entry == NULL)
+ return NULL; /* no more entries */
+
+ *attnum = entry->attnum;
+ *key = entry->key;
+ *category = entry->category;
+ list = entry->list;
+ *n = entry->count;
+
+ Assert(list != NULL && entry->count > 0);
+
+ if (entry->shouldSort && entry->count > 1)
+ qsort(list, entry->count, sizeof(ItemPointerData),
+ qsortCompareItemPointers);
+
+ return list;
+}
diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c
new file mode 100644
index 0000000..06c0586
--- /dev/null
+++ b/src/backend/access/gin/gindatapage.c
@@ -0,0 +1,1942 @@
+/*-------------------------------------------------------------------------
+ *
+ * gindatapage.c
+ * routines for handling GIN posting tree pages.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gin/gindatapage.c
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/gin_private.h"
+#include "access/ginxlog.h"
+#include "access/xloginsert.h"
+#include "lib/ilist.h"
+#include "miscadmin.h"
+#include "storage/predicate.h"
+#include "utils/rel.h"
+
+/*
+ * Min, Max and Target size of posting lists stored on leaf pages, in bytes.
+ *
+ * The code can deal with any size, but random access is more efficient when
+ * a number of smaller lists are stored, rather than one big list. If a
+ * posting list would become larger than Max size as a result of insertions,
+ * it is split into two. If a posting list would be smaller than minimum
+ * size, it is merged with the next posting list.
+ */
+#define GinPostingListSegmentMaxSize 384
+#define GinPostingListSegmentTargetSize 256
+#define GinPostingListSegmentMinSize 128
+
+/*
+ * At least this many items fit in a segment of GinPostingListSegmentMaxSize
+ * bytes. This is used when estimating the minimum space required for N
+ * items.
+ */
+#define MinTuplesPerSegment ((GinPostingListSegmentMaxSize - 2) / 6)
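+/* i.e. (384 - 2) / 6 = 63 items, given the segment sizes defined above */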
+
+/*
+ * A working struct for manipulating a posting tree leaf page.
+ */
+typedef struct
+{
+ dlist_head segments; /* a list of leafSegmentInfos */
+
+ /*
+ * The following fields represent how the segments are split across pages,
+ * if a page split is required. Filled in by leafRepackItems.
+ */
+ dlist_node *lastleft; /* last segment on left page */
+ int lsize; /* total size on left page */
+ int rsize; /* total size on right page */
+
+ bool oldformat; /* page is in pre-9.4 format on disk */
+
+ /*
+ * If we need WAL data representing the reconstructed leaf page, it's
+ * stored here by computeLeafRecompressWALData.
+ */
+ char *walinfo; /* buffer start */
+ int walinfolen; /* and length */
+} disassembledLeaf;
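+
+/*
+ * Typical flow for modifying a leaf page: disassembleLeaf() builds a
+ * disassembledLeaf from the page contents, addItemsToLeaf() merges new
+ * TIDs into the segment list, leafRepackItems() re-encodes the modified
+ * segments (and decides whether a page split is needed), and finally
+ * dataPlaceToPageLeafRecompress() or dataPlaceToPageLeafSplit() writes
+ * the result back out.
+ */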
+
+typedef struct
+{
+ dlist_node node; /* linked list pointers */
+
+ /*-------------
+ * 'action' indicates the status of this in-memory segment, compared to
+ * what's on disk. It is one of the GIN_SEGMENT_* action codes:
+ *
+ * UNMODIFIED no changes
+ * DELETE the segment is to be removed. 'seg' and 'items' are
+ * ignored
+ * INSERT this is a completely new segment
+ * REPLACE this replaces an existing segment with new content
+ * ADDITEMS like REPLACE, but no items have been removed, and we track
+ * in detail what items have been added to this segment, in
+ * 'modifieditems'
+ *-------------
+ */
+ char action;
+
+ ItemPointerData *modifieditems;
+ uint16 nmodifieditems;
+
+ /*
+ * The following fields represent the items in this segment. If 'items' is
+ * not NULL, it contains a palloc'd array of the items in this segment. If
+ * 'seg' is not NULL, it contains the items in an already-compressed
+ * format. It can point to an on-disk page (!modified), or a palloc'd
+ * segment in memory. If both are set, they must represent the same items.
+ */
+ GinPostingList *seg;
+ ItemPointer items;
+ int nitems; /* # of items in 'items', if items != NULL */
+} leafSegmentInfo;
+
+static ItemPointer dataLeafPageGetUncompressed(Page page, int *nitems);
+static void dataSplitPageInternal(GinBtree btree, Buffer origbuf,
+ GinBtreeStack *stack,
+ void *insertdata, BlockNumber updateblkno,
+ Page *newlpage, Page *newrpage);
+
+static disassembledLeaf *disassembleLeaf(Page page);
+static bool leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining);
+static bool addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems,
+ int nNewItems);
+
+static void computeLeafRecompressWALData(disassembledLeaf *leaf);
+static void dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf);
+static void dataPlaceToPageLeafSplit(disassembledLeaf *leaf,
+ ItemPointerData lbound, ItemPointerData rbound,
+ Page lpage, Page rpage);
+
+/*
+ * Read TIDs from leaf data page to single uncompressed array. The TIDs are
+ * returned in ascending order.
+ *
+ * advancePast is a hint, indicating that the caller is only interested in
+ * TIDs > advancePast. To return all items, use ItemPointerSetMin.
+ *
+ * Note: This function can still return items smaller than advancePast that
+ * are in the same posting list as the items of interest, so the caller must
+ * still check all the returned items. But passing it allows this function to
+ * skip whole posting lists.
+ */
+ItemPointer
+GinDataLeafPageGetItems(Page page, int *nitems, ItemPointerData advancePast)
+{
+ ItemPointer result;
+
+ if (GinPageIsCompressed(page))
+ {
+ GinPostingList *seg = GinDataLeafPageGetPostingList(page);
+ Size len = GinDataLeafPageGetPostingListSize(page);
+ Pointer endptr = ((Pointer) seg) + len;
+ GinPostingList *next;
+
+ /* Skip to the segment containing advancePast+1 */
+ if (ItemPointerIsValid(&advancePast))
+ {
+ next = GinNextPostingListSegment(seg);
+ while ((Pointer) next < endptr &&
+ ginCompareItemPointers(&next->first, &advancePast) <= 0)
+ {
+ seg = next;
+ next = GinNextPostingListSegment(seg);
+ }
+ len = endptr - (Pointer) seg;
+ }
+
+ if (len > 0)
+ result = ginPostingListDecodeAllSegments(seg, len, nitems);
+ else
+ {
+ result = NULL;
+ *nitems = 0;
+ }
+ }
+ else
+ {
+ ItemPointer tmp = dataLeafPageGetUncompressed(page, nitems);
+
+ result = palloc((*nitems) * sizeof(ItemPointerData));
+ memcpy(result, tmp, (*nitems) * sizeof(ItemPointerData));
+ }
+
+ return result;
+}
+
+/*
+ * Place all TIDs from a leaf data page into the given TIDBitmap, and
+ * return the number of items added.
+ */
+int
+GinDataLeafPageGetItemsToTbm(Page page, TIDBitmap *tbm)
+{
+ ItemPointer uncompressed;
+ int nitems;
+
+ if (GinPageIsCompressed(page))
+ {
+ GinPostingList *segment = GinDataLeafPageGetPostingList(page);
+ Size len = GinDataLeafPageGetPostingListSize(page);
+
+ nitems = ginPostingListDecodeAllSegmentsToTbm(segment, len, tbm);
+ }
+ else
+ {
+ uncompressed = dataLeafPageGetUncompressed(page, &nitems);
+
+ if (nitems > 0)
+ tbm_add_tuples(tbm, uncompressed, nitems, false);
+ }
+
+ return nitems;
+}
+
+/*
+ * Get pointer to the uncompressed array of items on a pre-9.4 format
+ * uncompressed leaf page. The number of items in the array is returned in
+ * *nitems.
+ */
+static ItemPointer
+dataLeafPageGetUncompressed(Page page, int *nitems)
+{
+ ItemPointer items;
+
+ Assert(!GinPageIsCompressed(page));
+
+ /*
+ * In the old pre-9.4 page format, the whole page content is used for
+ * uncompressed items, and the number of items is stored in 'maxoff'
+ */
+ items = (ItemPointer) GinDataPageGetData(page);
+ *nitems = GinPageGetOpaque(page)->maxoff;
+
+ return items;
+}
+
+/*
+ * Check if we should follow the right link to find the item we're searching
+ * for.
+ *
+ * Compares the item pointer being inserted with the right bound of the
+ * current page.
+ */
+static bool
+dataIsMoveRight(GinBtree btree, Page page)
+{
+ ItemPointer iptr = GinDataPageGetRightBound(page);
+
+ if (GinPageRightMost(page))
+ return false;
+
+ if (GinPageIsDeleted(page))
+ return true;
+
+ return ginCompareItemPointers(&btree->itemptr, iptr) > 0;
+}
+
+/*
+ * Find correct PostingItem in non-leaf page. It is assumed that this is
+ * the correct page, and the searched value SHOULD be on the page.
+ */
+static BlockNumber
+dataLocateItem(GinBtree btree, GinBtreeStack *stack)
+{
+ OffsetNumber low,
+ high,
+ maxoff;
+ PostingItem *pitem = NULL;
+ int result;
+ Page page = BufferGetPage(stack->buffer);
+
+ Assert(!GinPageIsLeaf(page));
+ Assert(GinPageIsData(page));
+
+ if (btree->fullScan)
+ {
+ stack->off = FirstOffsetNumber;
+ stack->predictNumber *= GinPageGetOpaque(page)->maxoff;
+ return btree->getLeftMostChild(btree, page);
+ }
+
+ low = FirstOffsetNumber;
+ maxoff = high = GinPageGetOpaque(page)->maxoff;
+ Assert(high >= low);
+
+ high++;
+
+ while (high > low)
+ {
+ OffsetNumber mid = low + ((high - low) / 2);
+
+ pitem = GinDataPageGetPostingItem(page, mid);
+
+ if (mid == maxoff)
+ {
+ /*
+ * Right infinity: the page was already correctly chosen, with the
+ * help of dataIsMoveRight.
+ */
+ result = -1;
+ }
+ else
+ result = ginCompareItemPointers(&btree->itemptr, &(pitem->key));
+
+ if (result == 0)
+ {
+ stack->off = mid;
+ return PostingItemGetBlockNumber(pitem);
+ }
+ else if (result > 0)
+ low = mid + 1;
+ else
+ high = mid;
+ }
+
+ Assert(high >= FirstOffsetNumber && high <= maxoff);
+
+ stack->off = high;
+ pitem = GinDataPageGetPostingItem(page, high);
+ return PostingItemGetBlockNumber(pitem);
+}
+
+/*
+ * Find the downlink to blkno on a non-leaf page; returns the offset of its
+ * PostingItem, or InvalidOffsetNumber if it's not found.
+ */
+static OffsetNumber
+dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber storedOff)
+{
+ OffsetNumber i,
+ maxoff = GinPageGetOpaque(page)->maxoff;
+ PostingItem *pitem;
+
+ Assert(!GinPageIsLeaf(page));
+ Assert(GinPageIsData(page));
+
+ /* if the page hasn't changed, storedOff is still the right answer */
+ if (storedOff >= FirstOffsetNumber && storedOff <= maxoff)
+ {
+ pitem = GinDataPageGetPostingItem(page, storedOff);
+ if (PostingItemGetBlockNumber(pitem) == blkno)
+ return storedOff;
+
+ /*
+ * We expect the needed pointer to be somewhere to the right of
+ * storedOff; that holds as long as there hasn't been a deletion.
+ */
+ for (i = storedOff + 1; i <= maxoff; i++)
+ {
+ pitem = GinDataPageGetPostingItem(page, i);
+ if (PostingItemGetBlockNumber(pitem) == blkno)
+ return i;
+ }
+
+ maxoff = storedOff - 1;
+ }
+
+ /* last chance: scan the whole page */
+ for (i = FirstOffsetNumber; i <= maxoff; i++)
+ {
+ pitem = GinDataPageGetPostingItem(page, i);
+ if (PostingItemGetBlockNumber(pitem) == blkno)
+ return i;
+ }
+
+ return InvalidOffsetNumber;
+}
+
+/*
+ * Return blkno of leftmost child
+ */
+static BlockNumber
+dataGetLeftMostPage(GinBtree btree, Page page)
+{
+ PostingItem *pitem;
+
+ Assert(!GinPageIsLeaf(page));
+ Assert(GinPageIsData(page));
+ Assert(GinPageGetOpaque(page)->maxoff >= FirstOffsetNumber);
+
+ pitem = GinDataPageGetPostingItem(page, FirstOffsetNumber);
+ return PostingItemGetBlockNumber(pitem);
+}
+
+/*
+ * Add PostingItem to a non-leaf page.
+ */
+void
+GinDataPageAddPostingItem(Page page, PostingItem *data, OffsetNumber offset)
+{
+ OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
+ char *ptr;
+
+ Assert(PostingItemGetBlockNumber(data) != InvalidBlockNumber);
+ Assert(!GinPageIsLeaf(page));
+
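+ /* offset == InvalidOffsetNumber means "append after the existing items" */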
+ if (offset == InvalidOffsetNumber)
+ {
+ ptr = (char *) GinDataPageGetPostingItem(page, maxoff + 1);
+ }
+ else
+ {
+ ptr = (char *) GinDataPageGetPostingItem(page, offset);
+ if (offset != maxoff + 1)
+ memmove(ptr + sizeof(PostingItem),
+ ptr,
+ (maxoff - offset + 1) * sizeof(PostingItem));
+ }
+ memcpy(ptr, data, sizeof(PostingItem));
+
+ maxoff++;
+ GinPageGetOpaque(page)->maxoff = maxoff;
+
+ /*
+ * Also set pd_lower to the end of the posting items, to follow the
+ * "standard" page layout, so that we can squeeze out the unused space
+ * from full-page images.
+ */
+ GinDataPageSetDataSize(page, maxoff * sizeof(PostingItem));
+}
+
+/*
+ * Delete posting item from non-leaf page
+ */
+void
+GinPageDeletePostingItem(Page page, OffsetNumber offset)
+{
+ OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
+
+ Assert(!GinPageIsLeaf(page));
+ Assert(offset >= FirstOffsetNumber && offset <= maxoff);
+
+ if (offset != maxoff)
+ memmove(GinDataPageGetPostingItem(page, offset),
+ GinDataPageGetPostingItem(page, offset + 1),
+ sizeof(PostingItem) * (maxoff - offset));
+
+ maxoff--;
+ GinPageGetOpaque(page)->maxoff = maxoff;
+
+ GinDataPageSetDataSize(page, maxoff * sizeof(PostingItem));
+}
+
+/*
+ * Prepare to insert data on a leaf data page.
+ *
+ * If it will fit, return GPTP_INSERT after doing whatever setup is needed
+ * before we enter the insertion critical section. *ptp_workspace can be
+ * set to pass information along to the execPlaceToPage function.
+ *
+ * If it won't fit, perform a page split and return two temporary page
+ * images into *newlpage and *newrpage, with result GPTP_SPLIT.
+ *
+ * In neither case should the given page buffer be modified here.
+ */
+static GinPlaceToPageRC
+dataBeginPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack,
+ void *insertdata,
+ void **ptp_workspace,
+ Page *newlpage, Page *newrpage)
+{
+ GinBtreeDataLeafInsertData *items = insertdata;
+ ItemPointer newItems = &items->items[items->curitem];
+ int maxitems = items->nitem - items->curitem;
+ Page page = BufferGetPage(buf);
+ int i;
+ ItemPointerData rbound;
+ ItemPointerData lbound;
+ bool needsplit;
+ bool append;
+ int segsize;
+ Size freespace;
+ disassembledLeaf *leaf;
+ leafSegmentInfo *lastleftinfo;
+ ItemPointerData maxOldItem;
+ ItemPointerData remaining;
+
+ rbound = *GinDataPageGetRightBound(page);
+
+ /*
+ * Count how many of the new items belong to this page.
+ */
+ if (!GinPageRightMost(page))
+ {
+ for (i = 0; i < maxitems; i++)
+ {
+ if (ginCompareItemPointers(&newItems[i], &rbound) > 0)
+ {
+ /*
+ * This needs to go to some other location in the tree. (The
+ * caller should've chosen the insert location so that at
+ * least the first item goes here.)
+ */
+ Assert(i > 0);
+ break;
+ }
+ }
+ maxitems = i;
+ }
+
+ /* Disassemble the data on the page */
+ leaf = disassembleLeaf(page);
+
+ /*
+ * Are we appending to the end of the page? IOW, are all the new items
+ * larger than any of the existing items?
+ */
+ if (!dlist_is_empty(&leaf->segments))
+ {
+ lastleftinfo = dlist_container(leafSegmentInfo, node,
+ dlist_tail_node(&leaf->segments));
+ if (!lastleftinfo->items)
+ lastleftinfo->items = ginPostingListDecode(lastleftinfo->seg,
+ &lastleftinfo->nitems);
+ maxOldItem = lastleftinfo->items[lastleftinfo->nitems - 1];
+ if (ginCompareItemPointers(&newItems[0], &maxOldItem) >= 0)
+ append = true;
+ else
+ append = false;
+ }
+ else
+ {
+ ItemPointerSetMin(&maxOldItem);
+ append = true;
+ }
+
+ /*
+ * If we're appending to the end of the page, we will append as many items
+ * as we can fit (after splitting), and stop when the pages become full.
+ * Otherwise we have to limit the number of new items to insert, because
+ * once we start packing we can't just stop when we run out of space,
+ * because we must make sure that all the old items still fit.
+ */
+ if (GinPageIsCompressed(page))
+ freespace = GinDataLeafPageGetFreeSpace(page);
+ else
+ freespace = 0;
+ if (append)
+ {
+ /*
+ * Even when appending, trying to append more items than will fit is
+ * not completely free, because we will merge the new items and old
+ * items into an array below. In the best case, every new item fits in
+ * a single byte, and we can use all the free space on the old page as
+ * well as the new page. For simplicity, ignore segment overhead etc.
+ */
+ maxitems = Min(maxitems, freespace + GinDataPageMaxDataSize);
+ }
+ else
+ {
+ /*
+ * Calculate a conservative estimate of how many new items we can fit
+ * on the two pages after splitting.
+ *
+ * We can use any remaining free space on the old page to store full
+ * segments, as well as the new page. Each full-sized segment can hold
+ * at least MinTuplesPerSegment items
+ */
+ int nnewsegments;
+
+ nnewsegments = freespace / GinPostingListSegmentMaxSize;
+ nnewsegments += GinDataPageMaxDataSize / GinPostingListSegmentMaxSize;
+ maxitems = Min(maxitems, nnewsegments * MinTuplesPerSegment);
+ }
+
+ /* Add the new items to the segment list */
+ if (!addItemsToLeaf(leaf, newItems, maxitems))
+ {
+ /* all items were duplicates; we have nothing to do */
+ items->curitem += maxitems;
+
+ return GPTP_NO_WORK;
+ }
+
+ /*
+ * Pack the items back to compressed segments, ready for writing to disk.
+ */
+ needsplit = leafRepackItems(leaf, &remaining);
+
+ /*
+ * Did all the new items fit?
+ *
+ * If we're appending, it's OK if they didn't. But as a sanity check,
+ * verify that all the old items fit.
+ */
+ if (ItemPointerIsValid(&remaining))
+ {
+ if (!append || ItemPointerCompare(&maxOldItem, &remaining) >= 0)
+ elog(ERROR, "could not split GIN page; all old items didn't fit");
+
+ /* Count how many of the new items did fit. */
+ for (i = 0; i < maxitems; i++)
+ {
+ if (ginCompareItemPointers(&newItems[i], &remaining) >= 0)
+ break;
+ }
+ if (i == 0)
+ elog(ERROR, "could not split GIN page; no new items fit");
+ maxitems = i;
+ }
+
+ if (!needsplit)
+ {
+ /*
+ * Great, all the items fit on a single page. If needed, prepare data
+ * for a WAL record describing the changes we'll make.
+ */
+ if (RelationNeedsWAL(btree->index) && !btree->isBuild)
+ computeLeafRecompressWALData(leaf);
+
+ /*
+ * We're ready to enter the critical section, but
+ * dataExecPlaceToPageLeaf will need access to the "leaf" data.
+ */
+ *ptp_workspace = leaf;
+
+ if (append)
+ elog(DEBUG2, "appended %d new items to block %u; %d bytes (%d to go)",
+ maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize,
+ items->nitem - items->curitem - maxitems);
+ else
+ elog(DEBUG2, "inserted %d new items to block %u; %d bytes (%d to go)",
+ maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize,
+ items->nitem - items->curitem - maxitems);
+ }
+ else
+ {
+ /*
+ * Have to split.
+ *
+ * leafRepackItems already divided the segments between the left and
+ * the right page. It filled the left page as full as possible, and
+ * put the rest to the right page. When building a new index, that's
+ * good, because the table is scanned from beginning to end and there
+ * won't be any more insertions to the left page during the build.
+ * This packs the index as tight as possible. But otherwise, split
+ * 50/50, by moving segments from the left page to the right page
+ * until they're balanced.
+ *
+ * As a further heuristic, when appending items to the end of the
+ * page, try to make the left page 75% full, on the assumption that
+ * subsequent insertions will probably also go to the end. This packs
+ * the index somewhat tighter when appending to a table, which is very
+ * common.
+ */
+ if (!btree->isBuild)
+ {
+ while (dlist_has_prev(&leaf->segments, leaf->lastleft))
+ {
+ lastleftinfo = dlist_container(leafSegmentInfo, node, leaf->lastleft);
+
+ /* ignore deleted segments */
+ if (lastleftinfo->action != GIN_SEGMENT_DELETE)
+ {
+ segsize = SizeOfGinPostingList(lastleftinfo->seg);
+
+ /*
+ * Note that we check that the right page doesn't become
+ * more full than the left page even when appending. It's
+ * possible that we added enough items to make both pages
+ * more than 75% full.
+ */
+ if ((leaf->lsize - segsize) - (leaf->rsize + segsize) < 0)
+ break;
+ if (append)
+ {
+ if ((leaf->lsize - segsize) < (BLCKSZ * 3) / 4)
+ break;
+ }
+
+ leaf->lsize -= segsize;
+ leaf->rsize += segsize;
+ }
+ leaf->lastleft = dlist_prev_node(&leaf->segments, leaf->lastleft);
+ }
+ }
+ Assert(leaf->lsize <= GinDataPageMaxDataSize);
+ Assert(leaf->rsize <= GinDataPageMaxDataSize);
+
+ /*
+ * Fetch the max item in the left page's last segment; it becomes the
+ * right bound of the page.
+ */
+ lastleftinfo = dlist_container(leafSegmentInfo, node, leaf->lastleft);
+ if (!lastleftinfo->items)
+ lastleftinfo->items = ginPostingListDecode(lastleftinfo->seg,
+ &lastleftinfo->nitems);
+ lbound = lastleftinfo->items[lastleftinfo->nitems - 1];
+
+ /*
+ * Now allocate a couple of temporary page images, and fill them.
+ */
+ *newlpage = palloc(BLCKSZ);
+ *newrpage = palloc(BLCKSZ);
+
+ dataPlaceToPageLeafSplit(leaf, lbound, rbound,
+ *newlpage, *newrpage);
+
+ Assert(GinPageRightMost(page) ||
+ ginCompareItemPointers(GinDataPageGetRightBound(*newlpage),
+ GinDataPageGetRightBound(*newrpage)) < 0);
+
+ if (append)
+ elog(DEBUG2, "appended %d items to block %u; split %d/%d (%d to go)",
+ maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, (int) leaf->rsize,
+ items->nitem - items->curitem - maxitems);
+ else
+ elog(DEBUG2, "inserted %d items to block %u; split %d/%d (%d to go)",
+ maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, (int) leaf->rsize,
+ items->nitem - items->curitem - maxitems);
+ }
+
+ items->curitem += maxitems;
+
+ return needsplit ? GPTP_SPLIT : GPTP_INSERT;
+}
+
+/*
+ * Perform data insertion after beginPlaceToPage has decided it will fit.
+ *
+ * This is invoked within a critical section, and XLOG record creation (if
+ * needed) is already started. The target buffer is registered in slot 0.
+ */
+static void
+dataExecPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack,
+ void *insertdata, void *ptp_workspace)
+{
+ disassembledLeaf *leaf = (disassembledLeaf *) ptp_workspace;
+
+ /* Apply changes to page */
+ dataPlaceToPageLeafRecompress(buf, leaf);
+
+ /* If needed, register WAL data built by computeLeafRecompressWALData */
+ if (RelationNeedsWAL(btree->index) && !btree->isBuild)
+ {
+ XLogRegisterBufData(0, leaf->walinfo, leaf->walinfolen);
+ }
+}
+
+/*
+ * Vacuum a posting tree leaf page.
+ */
+void
+ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs)
+{
+ Page page = BufferGetPage(buffer);
+ disassembledLeaf *leaf;
+ bool removedsomething = false;
+ dlist_iter iter;
+
+ leaf = disassembleLeaf(page);
+
+ /* Vacuum each segment. */
+ dlist_foreach(iter, &leaf->segments)
+ {
+ leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur);
+ int oldsegsize;
+ ItemPointer cleaned;
+ int ncleaned;
+
+ if (!seginfo->items)
+ seginfo->items = ginPostingListDecode(seginfo->seg,
+ &seginfo->nitems);
+ if (seginfo->seg)
+ oldsegsize = SizeOfGinPostingList(seginfo->seg);
+ else
+ oldsegsize = GinDataPageMaxDataSize;
+
+ cleaned = ginVacuumItemPointers(gvs,
+ seginfo->items,
+ seginfo->nitems,
+ &ncleaned);
+ pfree(seginfo->items);
+ seginfo->items = NULL;
+ seginfo->nitems = 0;
+ if (cleaned)
+ {
+ if (ncleaned > 0)
+ {
+ int npacked;
+
+ seginfo->seg = ginCompressPostingList(cleaned,
+ ncleaned,
+ oldsegsize,
+ &npacked);
+ /* Removing an item never increases the size of the segment */
+ if (npacked != ncleaned)
+ elog(ERROR, "could not fit vacuumed posting list");
+ seginfo->action = GIN_SEGMENT_REPLACE;
+ }
+ else
+ {
+ seginfo->seg = NULL;
+ seginfo->items = NULL;
+ seginfo->action = GIN_SEGMENT_DELETE;
+ }
+ seginfo->nitems = ncleaned;
+
+ removedsomething = true;
+ }
+ }
+
+ /*
+ * If we removed any items, reconstruct the page from the pieces.
+ *
+ * We don't try to re-encode the segments here, even though some of them
+ * might be really small now that we've removed some items from them. It
+ * seems like a waste of effort, as there isn't really any benefit from
+ * larger segments per se; larger segments only help to pack more items in
+ * the same space. We might as well delay doing that until the next
+ * insertion, which will need to re-encode at least part of the page
+ * anyway.
+ *
+ * Also note that if the page was in uncompressed, pre-9.4 format before, it
+ * is now represented as one huge segment that contains all the items. It
+ * might make sense to split that, to speed up random access, but we don't
+ * bother. You'll have to REINDEX anyway if you want the full gain of the
+ * new tighter index format.
+ */
+ if (removedsomething)
+ {
+ bool modified;
+
+ /*
+ * Make sure we have a palloc'd copy of every segment, from the first
+ * modified one onwards. (dataPlaceToPageLeafRecompress requires this.)
+ */
+ modified = false;
+ dlist_foreach(iter, &leaf->segments)
+ {
+ leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
+ iter.cur);
+
+ if (seginfo->action != GIN_SEGMENT_UNMODIFIED)
+ modified = true;
+ if (modified && seginfo->action != GIN_SEGMENT_DELETE)
+ {
+ int segsize = SizeOfGinPostingList(seginfo->seg);
+ GinPostingList *tmp = (GinPostingList *) palloc(segsize);
+
+ memcpy(tmp, seginfo->seg, segsize);
+ seginfo->seg = tmp;
+ }
+ }
+
+ if (RelationNeedsWAL(indexrel))
+ computeLeafRecompressWALData(leaf);
+
+ /* Apply changes to page */
+ START_CRIT_SECTION();
+
+ dataPlaceToPageLeafRecompress(buffer, leaf);
+
+ MarkBufferDirty(buffer);
+
+ if (RelationNeedsWAL(indexrel))
+ {
+ XLogRecPtr recptr;
+
+ XLogBeginInsert();
+ XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+ XLogRegisterBufData(0, leaf->walinfo, leaf->walinfolen);
+ recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_DATA_LEAF_PAGE);
+ PageSetLSN(page, recptr);
+ }
+
+ END_CRIT_SECTION();
+ }
+}
+
+/*
+ * Construct a ginxlogRecompressDataLeaf record representing the changes
+ * in *leaf. (Because this requires a palloc, we have to do it before
+ * we enter the critical section that actually updates the page.)
+ */
+static void
+computeLeafRecompressWALData(disassembledLeaf *leaf)
+{
+ int nmodified = 0;
+ char *walbufbegin;
+ char *walbufend;
+ dlist_iter iter;
+ int segno;
+ ginxlogRecompressDataLeaf *recompress_xlog;
+
+ /* Count the modified segments */
+ dlist_foreach(iter, &leaf->segments)
+ {
+ leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
+ iter.cur);
+
+ if (seginfo->action != GIN_SEGMENT_UNMODIFIED)
+ nmodified++;
+ }
+
+ walbufbegin =
+ palloc(sizeof(ginxlogRecompressDataLeaf) +
+ BLCKSZ + /* max size needed to hold the segment data */
+ nmodified * 2 /* (segno + action) per action */
+ );
+ walbufend = walbufbegin;
+
+ recompress_xlog = (ginxlogRecompressDataLeaf *) walbufend;
+ walbufend += sizeof(ginxlogRecompressDataLeaf);
+
+ recompress_xlog->nactions = nmodified;
+
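+ /*
+ * Each action is encoded as a one-byte segment number followed by a
+ * one-byte action code, then action-specific data: nothing for DELETE,
+ * a uint16 count plus the added ItemPointers for ADDITEMS, or the
+ * SHORTALIGN'd compressed segment for INSERT and REPLACE.
+ */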
+ segno = 0;
+ dlist_foreach(iter, &leaf->segments)
+ {
+ leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
+ iter.cur);
+ int segsize = 0;
+ int datalen;
+ uint8 action = seginfo->action;
+
+ if (action == GIN_SEGMENT_UNMODIFIED)
+ {
+ segno++;
+ continue;
+ }
+
+ if (action != GIN_SEGMENT_DELETE)
+ segsize = SizeOfGinPostingList(seginfo->seg);
+
+ /*
+ * If storing the uncompressed list of added item pointers would take
+ * more space than storing the compressed segment as is, do that
+ * instead.
+ */
+ if (action == GIN_SEGMENT_ADDITEMS &&
+ seginfo->nmodifieditems * sizeof(ItemPointerData) > segsize)
+ {
+ action = GIN_SEGMENT_REPLACE;
+ }
+
+ *((uint8 *) (walbufend++)) = segno;
+ *(walbufend++) = action;
+
+ switch (action)
+ {
+ case GIN_SEGMENT_DELETE:
+ datalen = 0;
+ break;
+
+ case GIN_SEGMENT_ADDITEMS:
+ datalen = seginfo->nmodifieditems * sizeof(ItemPointerData);
+ memcpy(walbufend, &seginfo->nmodifieditems, sizeof(uint16));
+ memcpy(walbufend + sizeof(uint16), seginfo->modifieditems, datalen);
+ datalen += sizeof(uint16);
+ break;
+
+ case GIN_SEGMENT_INSERT:
+ case GIN_SEGMENT_REPLACE:
+ datalen = SHORTALIGN(segsize);
+ memcpy(walbufend, seginfo->seg, segsize);
+ break;
+
+ default:
+ elog(ERROR, "unexpected GIN leaf action %d", action);
+ }
+ walbufend += datalen;
+
+ if (action != GIN_SEGMENT_INSERT)
+ segno++;
+ }
+
+ /* Pass back the constructed info via *leaf */
+ leaf->walinfo = walbufbegin;
+ leaf->walinfolen = walbufend - walbufbegin;
+}
+
+/*
+ * Assemble a disassembled posting tree leaf page back to a buffer.
+ *
+ * This just updates the target buffer; WAL stuff is caller's responsibility.
+ *
+ * NOTE: The segment pointers must not point directly to the same buffer,
+ * except for segments that have not been modified and whose preceding
+ * segments have not been modified either.
+ */
+static void
+dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf)
+{
+ Page page = BufferGetPage(buf);
+ char *ptr;
+ int newsize;
+ bool modified = false;
+ dlist_iter iter;
+ int segsize;
+
+ /*
+ * If the page was in pre-9.4 format before, convert the header, and force
+ * all segments to be copied to the page whether they were modified or
+ * not.
+ */
+ if (!GinPageIsCompressed(page))
+ {
+ Assert(leaf->oldformat);
+ GinPageSetCompressed(page);
+ GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber;
+ modified = true;
+ }
+
+ ptr = (char *) GinDataLeafPageGetPostingList(page);
+ newsize = 0;
+ dlist_foreach(iter, &leaf->segments)
+ {
+ leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur);
+
+ if (seginfo->action != GIN_SEGMENT_UNMODIFIED)
+ modified = true;
+
+ if (seginfo->action != GIN_SEGMENT_DELETE)
+ {
+ segsize = SizeOfGinPostingList(seginfo->seg);
+
+ if (modified)
+ memcpy(ptr, seginfo->seg, segsize);
+
+ ptr += segsize;
+ newsize += segsize;
+ }
+ }
+
+ Assert(newsize <= GinDataPageMaxDataSize);
+ GinDataPageSetDataSize(page, newsize);
+}
+
+/*
+ * Like dataPlaceToPageLeafRecompress, but writes the disassembled leaf
+ * segments to two pages instead of one.
+ *
+ * This is different from the non-split cases in that this does not modify
+ * the original page directly, but writes to temporary in-memory copies of
+ * the new left and right pages.
+ */
+static void
+dataPlaceToPageLeafSplit(disassembledLeaf *leaf,
+ ItemPointerData lbound, ItemPointerData rbound,
+ Page lpage, Page rpage)
+{
+ char *ptr;
+ int segsize;
+ int lsize;
+ int rsize;
+ dlist_node *node;
+ dlist_node *firstright;
+ leafSegmentInfo *seginfo;
+
+ /* Initialize temporary pages to hold the new left and right pages */
+ GinInitPage(lpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ);
+ GinInitPage(rpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ);
+
+ /*
+ * Copy the segments that go to the left page.
+ *
+ * XXX: We should skip copying the unmodified part of the left page, like
+ * we do when recompressing.
+ */
+ lsize = 0;
+ ptr = (char *) GinDataLeafPageGetPostingList(lpage);
+ firstright = dlist_next_node(&leaf->segments, leaf->lastleft);
+ for (node = dlist_head_node(&leaf->segments);
+ node != firstright;
+ node = dlist_next_node(&leaf->segments, node))
+ {
+ seginfo = dlist_container(leafSegmentInfo, node, node);
+
+ if (seginfo->action != GIN_SEGMENT_DELETE)
+ {
+ segsize = SizeOfGinPostingList(seginfo->seg);
+ memcpy(ptr, seginfo->seg, segsize);
+ ptr += segsize;
+ lsize += segsize;
+ }
+ }
+ Assert(lsize == leaf->lsize);
+ GinDataPageSetDataSize(lpage, lsize);
+ *GinDataPageGetRightBound(lpage) = lbound;
+
+ /* Copy the segments that go to the right page */
+ ptr = (char *) GinDataLeafPageGetPostingList(rpage);
+ rsize = 0;
+ for (node = firstright;
+ ;
+ node = dlist_next_node(&leaf->segments, node))
+ {
+ seginfo = dlist_container(leafSegmentInfo, node, node);
+
+ if (seginfo->action != GIN_SEGMENT_DELETE)
+ {
+ segsize = SizeOfGinPostingList(seginfo->seg);
+ memcpy(ptr, seginfo->seg, segsize);
+ ptr += segsize;
+ rsize += segsize;
+ }
+
+ if (!dlist_has_next(&leaf->segments, node))
+ break;
+ }
+ Assert(rsize == leaf->rsize);
+ GinDataPageSetDataSize(rpage, rsize);
+ *GinDataPageGetRightBound(rpage) = rbound;
+}
+
+/*
+ * Prepare to insert data on an internal data page.
+ *
+ * If it will fit, return GPTP_INSERT after doing whatever setup is needed
+ * before we enter the insertion critical section. *ptp_workspace can be
+ * set to pass information along to the execPlaceToPage function.
+ *
+ * If it won't fit, perform a page split and return two temporary page
+ * images into *newlpage and *newrpage, with result GPTP_SPLIT.
+ *
+ * In neither case should the given page buffer be modified here.
+ *
+ * Note: on insertion to an internal node, in addition to inserting the given
+ * item, the downlink of the existing item at stack->off will be updated to
+ * point to updateblkno.
+ */
+static GinPlaceToPageRC
+dataBeginPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack,
+ void *insertdata, BlockNumber updateblkno,
+ void **ptp_workspace,
+ Page *newlpage, Page *newrpage)
+{
+ Page page = BufferGetPage(buf);
+
+ /* If it doesn't fit, deal with split case */
+ if (GinNonLeafDataPageGetFreeSpace(page) < sizeof(PostingItem))
+ {
+ dataSplitPageInternal(btree, buf, stack, insertdata, updateblkno,
+ newlpage, newrpage);
+ return GPTP_SPLIT;
+ }
+
+ /* Else, we're ready to proceed with insertion */
+ return GPTP_INSERT;
+}
+
+/*
+ * Perform data insertion after beginPlaceToPage has decided it will fit.
+ *
+ * This is invoked within a critical section, and XLOG record creation (if
+ * needed) is already started. The target buffer is registered in slot 0.
+ */
+static void
+dataExecPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack,
+ void *insertdata, BlockNumber updateblkno,
+ void *ptp_workspace)
+{
+ Page page = BufferGetPage(buf);
+ OffsetNumber off = stack->off;
+ PostingItem *pitem;
+
+ /* Update existing downlink to point to next page (on internal page) */
+ pitem = GinDataPageGetPostingItem(page, off);
+ PostingItemSetBlockNumber(pitem, updateblkno);
+
+ /* Add new item */
+ pitem = (PostingItem *) insertdata;
+ GinDataPageAddPostingItem(page, pitem, off);
+
+ if (RelationNeedsWAL(btree->index) && !btree->isBuild)
+ {
+ /*
+ * This must be static, because it has to survive until XLogInsert,
+ * and we can't palloc here. Ugly, but the XLogInsert infrastructure
+ * isn't reentrant anyway.
+ */
+ static ginxlogInsertDataInternal data;
+
+ data.offset = off;
+ data.newitem = *pitem;
+
+ XLogRegisterBufData(0, (char *) &data,
+ sizeof(ginxlogInsertDataInternal));
+ }
+}
+
+/*
+ * Prepare to insert data on a posting-tree data page.
+ *
+ * If it will fit, return GPTP_INSERT after doing whatever setup is needed
+ * before we enter the insertion critical section. *ptp_workspace can be
+ * set to pass information along to the execPlaceToPage function.
+ *
+ * If it won't fit, perform a page split and return two temporary page
+ * images into *newlpage and *newrpage, with result GPTP_SPLIT.
+ *
+ * In neither case should the given page buffer be modified here.
+ *
+ * Note: on insertion to an internal node, in addition to inserting the given
+ * item, the downlink of the existing item at stack->off will be updated to
+ * point to updateblkno.
+ *
+ * Calls the relevant function for an internal or a leaf page, since the
+ * two cases are handled very differently.
+ */
+static GinPlaceToPageRC
+dataBeginPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack,
+ void *insertdata, BlockNumber updateblkno,
+ void **ptp_workspace,
+ Page *newlpage, Page *newrpage)
+{
+ Page page = BufferGetPage(buf);
+
+ Assert(GinPageIsData(page));
+
+ if (GinPageIsLeaf(page))
+ return dataBeginPlaceToPageLeaf(btree, buf, stack, insertdata,
+ ptp_workspace,
+ newlpage, newrpage);
+ else
+ return dataBeginPlaceToPageInternal(btree, buf, stack,
+ insertdata, updateblkno,
+ ptp_workspace,
+ newlpage, newrpage);
+}
+
+/*
+ * Perform data insertion after beginPlaceToPage has decided it will fit.
+ *
+ * This is invoked within a critical section, and XLOG record creation (if
+ * needed) is already started. The target buffer is registered in slot 0.
+ *
+ * Calls the relevant function for an internal or a leaf page, since the
+ * two cases are handled very differently.
+ */
+static void
+dataExecPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack,
+ void *insertdata, BlockNumber updateblkno,
+ void *ptp_workspace)
+{
+ Page page = BufferGetPage(buf);
+
+ if (GinPageIsLeaf(page))
+ dataExecPlaceToPageLeaf(btree, buf, stack, insertdata,
+ ptp_workspace);
+ else
+ dataExecPlaceToPageInternal(btree, buf, stack, insertdata,
+ updateblkno, ptp_workspace);
+}
+
+/*
+ * Split internal page and insert new data.
+ *
+ * Returns new temp pages to *newlpage and *newrpage.
+ * The original buffer is left untouched.
+ */
+static void
+dataSplitPageInternal(GinBtree btree, Buffer origbuf,
+ GinBtreeStack *stack,
+ void *insertdata, BlockNumber updateblkno,
+ Page *newlpage, Page *newrpage)
+{
+ Page oldpage = BufferGetPage(origbuf);
+ OffsetNumber off = stack->off;
+ int nitems = GinPageGetOpaque(oldpage)->maxoff;
+ int nleftitems;
+ int nrightitems;
+ Size pageSize = PageGetPageSize(oldpage);
+ ItemPointerData oldbound = *GinDataPageGetRightBound(oldpage);
+ ItemPointer bound;
+ Page lpage;
+ Page rpage;
+ OffsetNumber separator;
+ PostingItem allitems[(BLCKSZ / sizeof(PostingItem)) + 1];
+
+ lpage = PageGetTempPage(oldpage);
+ rpage = PageGetTempPage(oldpage);
+ GinInitPage(lpage, GinPageGetOpaque(oldpage)->flags, pageSize);
+ GinInitPage(rpage, GinPageGetOpaque(oldpage)->flags, pageSize);
+
+ /*
+ * First construct a new list of PostingItems, which includes all the old
+ * items, and the new item.
+ */
+ memcpy(allitems, GinDataPageGetPostingItem(oldpage, FirstOffsetNumber),
+ (off - 1) * sizeof(PostingItem));
+
+ allitems[off - 1] = *((PostingItem *) insertdata);
+ memcpy(&allitems[off], GinDataPageGetPostingItem(oldpage, off),
+ (nitems - (off - 1)) * sizeof(PostingItem));
+ nitems++;
+
+ /* Update existing downlink to point to next page */
+ PostingItemSetBlockNumber(&allitems[off], updateblkno);
+
+ /*
+ * When creating a new index, fit as many tuples as possible on the left
+ * page, on the assumption that the table is scanned from beginning to
+ * end. This packs the index as tightly as possible.
+ */
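+ /*
+ * (rpage is still empty at this point, so its free space is the whole
+ * data area; dividing by sizeof(PostingItem) therefore gives the
+ * maximum number of PostingItems one page can hold.)
+ */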
+ if (btree->isBuild && GinPageRightMost(oldpage))
+ separator = GinNonLeafDataPageGetFreeSpace(rpage) / sizeof(PostingItem);
+ else
+ separator = nitems / 2;
+ nleftitems = separator;
+ nrightitems = nitems - separator;
+
+ memcpy(GinDataPageGetPostingItem(lpage, FirstOffsetNumber),
+ allitems,
+ nleftitems * sizeof(PostingItem));
+ GinPageGetOpaque(lpage)->maxoff = nleftitems;
+ memcpy(GinDataPageGetPostingItem(rpage, FirstOffsetNumber),
+ &allitems[separator],
+ nrightitems * sizeof(PostingItem));
+ GinPageGetOpaque(rpage)->maxoff = nrightitems;
+
+ /*
+ * Also set pd_lower for both pages, like GinDataPageAddPostingItem does.
+ */
+ GinDataPageSetDataSize(lpage, nleftitems * sizeof(PostingItem));
+ GinDataPageSetDataSize(rpage, nrightitems * sizeof(PostingItem));
+
+ /* set up right bound for left page */
+ bound = GinDataPageGetRightBound(lpage);
+ *bound = GinDataPageGetPostingItem(lpage, nleftitems)->key;
+
+ /* set up right bound for right page */
+ *GinDataPageGetRightBound(rpage) = oldbound;
+
+ /* return temp pages to caller */
+ *newlpage = lpage;
+ *newrpage = rpage;
+}
+
+/*
+ * Construct insertion payload for inserting the downlink for given buffer.
+ */
+static void *
+dataPrepareDownlink(GinBtree btree, Buffer lbuf)
+{
+ PostingItem *pitem = palloc(sizeof(PostingItem));
+ Page lpage = BufferGetPage(lbuf);
+
+ PostingItemSetBlockNumber(pitem, BufferGetBlockNumber(lbuf));
+ pitem->key = *GinDataPageGetRightBound(lpage);
+
+ return pitem;
+}
+
+/*
+ * Fill a new root page with downlinks to the two children, using the
+ * children's right-bound values as keys.
+ *
+ * Also called from ginxlog, so it must not use the btree argument.
+ */
+void
+ginDataFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage)
+{
+ PostingItem li,
+ ri;
+
+ li.key = *GinDataPageGetRightBound(lpage);
+ PostingItemSetBlockNumber(&li, lblkno);
+ GinDataPageAddPostingItem(root, &li, InvalidOffsetNumber);
+
+ ri.key = *GinDataPageGetRightBound(rpage);
+ PostingItemSetBlockNumber(&ri, rblkno);
+ GinDataPageAddPostingItem(root, &ri, InvalidOffsetNumber);
+}
+
+
+/*** Functions to work with disassembled leaf pages ***/
+
+/*
+ * Disassemble page into a disassembledLeaf struct.
+ */
+static disassembledLeaf *
+disassembleLeaf(Page page)
+{
+ disassembledLeaf *leaf;
+ GinPostingList *seg;
+ Pointer segbegin;
+ Pointer segend;
+
+ leaf = palloc0(sizeof(disassembledLeaf));
+ dlist_init(&leaf->segments);
+
+ if (GinPageIsCompressed(page))
+ {
+ /*
+ * Create a leafSegmentInfo entry for each segment.
+ */
+ seg = GinDataLeafPageGetPostingList(page);
+ segbegin = (Pointer) seg;
+ segend = segbegin + GinDataLeafPageGetPostingListSize(page);
+ while ((Pointer) seg < segend)
+ {
+ leafSegmentInfo *seginfo = palloc(sizeof(leafSegmentInfo));
+
+ seginfo->action = GIN_SEGMENT_UNMODIFIED;
+ seginfo->seg = seg;
+ seginfo->items = NULL;
+ seginfo->nitems = 0;
+ dlist_push_tail(&leaf->segments, &seginfo->node);
+
+ seg = GinNextPostingListSegment(seg);
+ }
+ leaf->oldformat = false;
+ }
+ else
+ {
+ /*
+ * A pre-9.4 format uncompressed page is represented by a single
+ * segment, with an array of items. The corner case is an uncompressed
+ * page containing no items, which is represented as no segments at all.
+ */
+ ItemPointer uncompressed;
+ int nuncompressed;
+ leafSegmentInfo *seginfo;
+
+ uncompressed = dataLeafPageGetUncompressed(page, &nuncompressed);
+
+ if (nuncompressed > 0)
+ {
+ seginfo = palloc(sizeof(leafSegmentInfo));
+
+ seginfo->action = GIN_SEGMENT_REPLACE;
+ seginfo->seg = NULL;
+ seginfo->items = palloc(nuncompressed * sizeof(ItemPointerData));
+ memcpy(seginfo->items, uncompressed, nuncompressed * sizeof(ItemPointerData));
+ seginfo->nitems = nuncompressed;
+
+ dlist_push_tail(&leaf->segments, &seginfo->node);
+ }
+
+ leaf->oldformat = true;
+ }
+
+ return leaf;
+}
+
+/*
+ * Distribute newItems to the segments.
+ *
+ * Any segments that acquire new items are decoded, and the new items are
+ * merged with the old items.
+ *
+ * Returns true if any new items were added. False means they were all
+ * duplicates of existing items on the page.
+ */
+static bool
+addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems)
+{
+ dlist_iter iter;
+ ItemPointer nextnew = newItems;
+ int newleft = nNewItems;
+ bool modified = false;
+ leafSegmentInfo *newseg;
+
+ /*
+ * If the page is completely empty, just construct one new segment to hold
+ * all the new items.
+ */
+ if (dlist_is_empty(&leaf->segments))
+ {
+ newseg = palloc(sizeof(leafSegmentInfo));
+ newseg->seg = NULL;
+ newseg->items = newItems;
+ newseg->nitems = nNewItems;
+ newseg->action = GIN_SEGMENT_INSERT;
+ dlist_push_tail(&leaf->segments, &newseg->node);
+ return true;
+ }
+
+ dlist_foreach(iter, &leaf->segments)
+ {
+ leafSegmentInfo *cur = (leafSegmentInfo *) dlist_container(leafSegmentInfo, node, iter.cur);
+ int nthis;
+ ItemPointer tmpitems;
+ int ntmpitems;
+
+ /*
+ * How many of the new items fall into this segment?
+ */
+ if (!dlist_has_next(&leaf->segments, iter.cur))
+ nthis = newleft;
+ else
+ {
+ leafSegmentInfo *next;
+ ItemPointerData next_first;
+
+ next = (leafSegmentInfo *) dlist_container(leafSegmentInfo, node,
+ dlist_next_node(&leaf->segments, iter.cur));
+ if (next->items)
+ next_first = next->items[0];
+ else
+ {
+ Assert(next->seg != NULL);
+ next_first = next->seg->first;
+ }
+
+ nthis = 0;
+ while (nthis < newleft && ginCompareItemPointers(&nextnew[nthis], &next_first) < 0)
+ nthis++;
+ }
+ if (nthis == 0)
+ continue;
+
+ /* Merge the new items with the existing items. */
+ if (!cur->items)
+ cur->items = ginPostingListDecode(cur->seg, &cur->nitems);
+
+ /*
+ * Fast path for the important special case that we're appending to
+ * the end of the page: don't let the last segment on the page grow
+ * larger than the target; instead, create a new segment before that
+ * happens.
+ */
+ if (!dlist_has_next(&leaf->segments, iter.cur) &&
+ ginCompareItemPointers(&cur->items[cur->nitems - 1], &nextnew[0]) < 0 &&
+ cur->seg != NULL &&
+ SizeOfGinPostingList(cur->seg) >= GinPostingListSegmentTargetSize)
+ {
+ newseg = palloc(sizeof(leafSegmentInfo));
+ newseg->seg = NULL;
+ newseg->items = nextnew;
+ newseg->nitems = nthis;
+ newseg->action = GIN_SEGMENT_INSERT;
+ dlist_push_tail(&leaf->segments, &newseg->node);
+ modified = true;
+ break;
+ }
+
+ tmpitems = ginMergeItemPointers(cur->items, cur->nitems,
+ nextnew, nthis,
+ &ntmpitems);
+ if (ntmpitems != cur->nitems)
+ {
+ /*
+ * If there are no duplicates, track the added items so that we
+ * can emit a compact ADDITEMS WAL record later on. (it doesn't
+ * seem worth re-checking which items were duplicates, if there
+ * were any)
+ */
+ if (ntmpitems == nthis + cur->nitems &&
+ cur->action == GIN_SEGMENT_UNMODIFIED)
+ {
+ cur->action = GIN_SEGMENT_ADDITEMS;
+ cur->modifieditems = nextnew;
+ cur->nmodifieditems = nthis;
+ }
+ else
+ cur->action = GIN_SEGMENT_REPLACE;
+
+ cur->items = tmpitems;
+ cur->nitems = ntmpitems;
+ cur->seg = NULL;
+ modified = true;
+ }
+
+ nextnew += nthis;
+ newleft -= nthis;
+ if (newleft == 0)
+ break;
+ }
+
+ return modified;
+}
+
+/*
+ * Recompresses all segments that have been modified.
+ *
+ * If not all the items fit on two pages (i.e. after a split), we store as
+ * many items as fit, and set *remaining to the first item that didn't fit.
+ * If all items fit, *remaining is set to invalid.
+ *
+ * Returns true if the page has to be split.
+ */
+static bool
+leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining)
+{
+ int pgused = 0;
+ bool needsplit = false;
+ dlist_iter iter;
+ int segsize;
+ leafSegmentInfo *nextseg;
+ int npacked;
+ bool modified;
+ dlist_node *cur_node;
+ dlist_node *next_node;
+
+ ItemPointerSetInvalid(remaining);
+
+ /*
+ * Cannot use dlist_foreach_modify here, because we insert new nodes
+ * adjacent to the current one while iterating.
+ */
+ for (cur_node = dlist_head_node(&leaf->segments);
+ cur_node != NULL;
+ cur_node = next_node)
+ {
+ leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
+ cur_node);
+
+ if (dlist_has_next(&leaf->segments, cur_node))
+ next_node = dlist_next_node(&leaf->segments, cur_node);
+ else
+ next_node = NULL;
+
+ /* Compress the posting list, if necessary */
+ if (seginfo->action != GIN_SEGMENT_DELETE)
+ {
+ if (seginfo->seg == NULL)
+ {
+ if (seginfo->nitems > GinPostingListSegmentMaxSize)
+ npacked = 0; /* no chance that it would fit. */
+ else
+ {
+ seginfo->seg = ginCompressPostingList(seginfo->items,
+ seginfo->nitems,
+ GinPostingListSegmentMaxSize,
+ &npacked);
+ }
+ if (npacked != seginfo->nitems)
+ {
+ /*
+ * Too large. Compress again to the target size, and
+ * create a new segment to represent the remaining items.
+ * The new segment is inserted after this one, so it will
+ * be processed in the next iteration of this loop.
+ */
+ if (seginfo->seg)
+ pfree(seginfo->seg);
+ seginfo->seg = ginCompressPostingList(seginfo->items,
+ seginfo->nitems,
+ GinPostingListSegmentTargetSize,
+ &npacked);
+ if (seginfo->action != GIN_SEGMENT_INSERT)
+ seginfo->action = GIN_SEGMENT_REPLACE;
+
+ nextseg = palloc(sizeof(leafSegmentInfo));
+ nextseg->action = GIN_SEGMENT_INSERT;
+ nextseg->seg = NULL;
+ nextseg->items = &seginfo->items[npacked];
+ nextseg->nitems = seginfo->nitems - npacked;
+ next_node = &nextseg->node;
+ dlist_insert_after(cur_node, next_node);
+ }
+ }
+
+ /*
+ * If the segment is very small, merge it with the next segment.
+ */
+ if (SizeOfGinPostingList(seginfo->seg) < GinPostingListSegmentMinSize && next_node)
+ {
+ int nmerged;
+
+ nextseg = dlist_container(leafSegmentInfo, node, next_node);
+
+ if (seginfo->items == NULL)
+ seginfo->items = ginPostingListDecode(seginfo->seg,
+ &seginfo->nitems);
+ if (nextseg->items == NULL)
+ nextseg->items = ginPostingListDecode(nextseg->seg,
+ &nextseg->nitems);
+ nextseg->items =
+ ginMergeItemPointers(seginfo->items, seginfo->nitems,
+ nextseg->items, nextseg->nitems,
+ &nmerged);
+ Assert(nmerged == seginfo->nitems + nextseg->nitems);
+ nextseg->nitems = nmerged;
+ nextseg->seg = NULL;
+
+ nextseg->action = GIN_SEGMENT_REPLACE;
+ nextseg->modifieditems = NULL;
+ nextseg->nmodifieditems = 0;
+
+ if (seginfo->action == GIN_SEGMENT_INSERT)
+ {
+ dlist_delete(cur_node);
+ continue;
+ }
+ else
+ {
+ seginfo->action = GIN_SEGMENT_DELETE;
+ seginfo->seg = NULL;
+ }
+ }
+
+ seginfo->items = NULL;
+ seginfo->nitems = 0;
+ }
+
+ if (seginfo->action == GIN_SEGMENT_DELETE)
+ continue;
+
+ /*
+ * OK, we now have a compressed version of this segment ready for
+ * copying to the page. Did we exceed the size that fits on one page?
+ */
+ segsize = SizeOfGinPostingList(seginfo->seg);
+ if (pgused + segsize > GinDataPageMaxDataSize)
+ {
+ if (!needsplit)
+ {
+ /* switch to right page */
+ Assert(pgused > 0);
+ leaf->lastleft = dlist_prev_node(&leaf->segments, cur_node);
+ needsplit = true;
+ leaf->lsize = pgused;
+ pgused = 0;
+ }
+ else
+ {
+ /*
+ * Filled both pages. The last segment we constructed did not
+ * fit.
+ */
+ *remaining = seginfo->seg->first;
+
+ /*
+ * remove all segments that did not fit from the list.
+ */
+ while (dlist_has_next(&leaf->segments, cur_node))
+ dlist_delete(dlist_next_node(&leaf->segments, cur_node));
+ dlist_delete(cur_node);
+ break;
+ }
+ }
+
+ pgused += segsize;
+ }
+
+ if (!needsplit)
+ {
+ leaf->lsize = pgused;
+ leaf->rsize = 0;
+ }
+ else
+ leaf->rsize = pgused;
+
+ Assert(leaf->lsize <= GinDataPageMaxDataSize);
+ Assert(leaf->rsize <= GinDataPageMaxDataSize);
+
+ /*
+ * Make a palloc'd copy of every segment after the first modified one,
+ * because as we start copying items to the original page, we might
+ * overwrite an existing segment.
+ */
+ modified = false;
+ dlist_foreach(iter, &leaf->segments)
+ {
+ leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
+ iter.cur);
+
+ if (!modified && seginfo->action != GIN_SEGMENT_UNMODIFIED)
+ {
+ modified = true;
+ }
+ else if (modified && seginfo->action == GIN_SEGMENT_UNMODIFIED)
+ {
+ GinPostingList *tmp;
+
+ segsize = SizeOfGinPostingList(seginfo->seg);
+ tmp = palloc(segsize);
+ memcpy(tmp, seginfo->seg, segsize);
+ seginfo->seg = tmp;
+ }
+ }
+
+ return needsplit;
+}
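
The loop above re-packs a leaf's posting-list segments: when a segment's items do not all fit under the maximum encoded size, the segment is re-compressed to a smaller target size and the leftover items spill into a new segment that is processed on the next iteration. The following is a minimal standalone C sketch of that decision, not part of the patch; the size constants and the per-item byte costs are illustrative stand-ins for ginCompressPostingList and the real GIN limits.

#include <stdio.h>

#define SEG_MAX_SIZE    384     /* hypothetical GinPostingListSegmentMaxSize */
#define SEG_TARGET_SIZE 256     /* hypothetical GinPostingListSegmentTargetSize */

/* Pack items[0..nitems) into one segment of at most maxsize bytes;
 * return the number of items packed. */
static int
pack_segment(const int *cost, int nitems, int maxsize)
{
	int			used = 0,
				npacked = 0;

	while (npacked < nitems && used + cost[npacked] <= maxsize)
		used += cost[npacked++];
	return npacked;
}

int
main(void)
{
	int			cost[40];
	int			nitems = 40;
	int			done = 0;

	for (int i = 0; i < nitems; i++)
		cost[i] = 20 + (i % 7);	/* fake per-item encoded sizes */

	while (done < nitems)
	{
		/* First try to fit everything remaining under the max size ... */
		int			npacked = pack_segment(cost + done, nitems - done, SEG_MAX_SIZE);

		/*
		 * ... if some items were left over, re-pack to the smaller target
		 * size instead, leaving slack; the remainder becomes the next
		 * segment, handled on the next loop iteration.
		 */
		if (npacked < nitems - done)
			npacked = pack_segment(cost + done, nitems - done, SEG_TARGET_SIZE);

		printf("segment with %d items\n", npacked);
		done += npacked;
	}
	return 0;
}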
+
+
+/*** Functions that are exported to the rest of the GIN code ***/
+
+/*
+ * Creates new posting tree containing the given TIDs. Returns the page
+ * number of the root of the new posting tree.
+ *
+ * items[] must be in sorted order with no duplicates.
+ */
+BlockNumber
+createPostingTree(Relation index, ItemPointerData *items, uint32 nitems,
+ GinStatsData *buildStats, Buffer entrybuffer)
+{
+ BlockNumber blkno;
+ Buffer buffer;
+ Page tmppage;
+ Page page;
+ Pointer ptr;
+ int nrootitems;
+ int rootsize;
+ bool is_build = (buildStats != NULL);
+
+ /* Construct the new root page in memory first. */
+ tmppage = (Page) palloc(BLCKSZ);
+ GinInitPage(tmppage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ);
+ GinPageGetOpaque(tmppage)->rightlink = InvalidBlockNumber;
+
+ /*
+ * Write as many of the items to the root page as fit. In segments of max
+ * GinPostingListSegmentMaxSize bytes each.
+ */
+ nrootitems = 0;
+ rootsize = 0;
+ ptr = (Pointer) GinDataLeafPageGetPostingList(tmppage);
+ while (nrootitems < nitems)
+ {
+ GinPostingList *segment;
+ int npacked;
+ int segsize;
+
+ segment = ginCompressPostingList(&items[nrootitems],
+ nitems - nrootitems,
+ GinPostingListSegmentMaxSize,
+ &npacked);
+ segsize = SizeOfGinPostingList(segment);
+ if (rootsize + segsize > GinDataPageMaxDataSize)
+ break;
+
+ memcpy(ptr, segment, segsize);
+ ptr += segsize;
+ rootsize += segsize;
+ nrootitems += npacked;
+ pfree(segment);
+ }
+ GinDataPageSetDataSize(tmppage, rootsize);
+
+ /*
+ * All set. Get a new physical page, and copy the in-memory page to it.
+ */
+ buffer = GinNewBuffer(index);
+ page = BufferGetPage(buffer);
+ blkno = BufferGetBlockNumber(buffer);
+
+ /*
+ * Copy any predicate locks from the entry tree leaf (containing posting
+ * list) to the posting tree.
+ */
+ PredicateLockPageSplit(index, BufferGetBlockNumber(entrybuffer), blkno);
+
+ START_CRIT_SECTION();
+
+ PageRestoreTempPage(tmppage, page);
+ MarkBufferDirty(buffer);
+
+ if (RelationNeedsWAL(index) && !is_build)
+ {
+ XLogRecPtr recptr;
+ ginxlogCreatePostingTree data;
+
+ data.size = rootsize;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &data, sizeof(ginxlogCreatePostingTree));
+
+ XLogRegisterData((char *) GinDataLeafPageGetPostingList(page),
+ rootsize);
+ XLogRegisterBuffer(0, buffer, REGBUF_WILL_INIT);
+
+ recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_PTREE);
+ PageSetLSN(page, recptr);
+ }
+
+ UnlockReleaseBuffer(buffer);
+
+ END_CRIT_SECTION();
+
+ /* During index build, count the newly-added data page */
+ if (buildStats)
+ buildStats->nDataPages++;
+
+ elog(DEBUG2, "created GIN posting tree with %d items", nrootitems);
+
+ /*
+ * Add any remaining TIDs to the newly-created posting tree.
+ */
+ if (nitems > nrootitems)
+ {
+ ginInsertItemPointers(index, blkno,
+ items + nrootitems,
+ nitems - nrootitems,
+ buildStats);
+ }
+
+ return blkno;
+}
+
+static void
+ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno)
+{
+ memset(btree, 0, sizeof(GinBtreeData));
+
+ btree->index = index;
+ btree->rootBlkno = rootBlkno;
+
+ btree->findChildPage = dataLocateItem;
+ btree->getLeftMostChild = dataGetLeftMostPage;
+ btree->isMoveRight = dataIsMoveRight;
+ btree->findItem = NULL;
+ btree->findChildPtr = dataFindChildPtr;
+ btree->beginPlaceToPage = dataBeginPlaceToPage;
+ btree->execPlaceToPage = dataExecPlaceToPage;
+ btree->fillRoot = ginDataFillRoot;
+ btree->prepareDownlink = dataPrepareDownlink;
+
+ btree->isData = true;
+ btree->fullScan = false;
+ btree->isBuild = false;
+}
+
+/*
+ * Insert an array of item pointers; may need several tree descents (very rare)
+ */
+void
+ginInsertItemPointers(Relation index, BlockNumber rootBlkno,
+ ItemPointerData *items, uint32 nitem,
+ GinStatsData *buildStats)
+{
+ GinBtreeData btree;
+ GinBtreeDataLeafInsertData insertdata;
+ GinBtreeStack *stack;
+
+ ginPrepareDataScan(&btree, index, rootBlkno);
+ btree.isBuild = (buildStats != NULL);
+ insertdata.items = items;
+ insertdata.nitem = nitem;
+ insertdata.curitem = 0;
+
+ while (insertdata.curitem < insertdata.nitem)
+ {
+ /* search for the leaf page where the first item should go to */
+ btree.itemptr = insertdata.items[insertdata.curitem];
+ stack = ginFindLeafPage(&btree, false, true, NULL);
+
+ ginInsertValue(&btree, stack, &insertdata, buildStats);
+ }
+}
+
+/*
+ * Starts a new scan on a posting tree.
+ */
+GinBtreeStack *
+ginScanBeginPostingTree(GinBtree btree, Relation index, BlockNumber rootBlkno,
+ Snapshot snapshot)
+{
+ GinBtreeStack *stack;
+
+ ginPrepareDataScan(btree, index, rootBlkno);
+
+ btree->fullScan = true;
+
+ stack = ginFindLeafPage(btree, true, false, snapshot);
+
+ return stack;
+}
diff --git a/src/backend/access/gin/ginentrypage.c b/src/backend/access/gin/ginentrypage.c
new file mode 100644
index 0000000..29c36bc
--- /dev/null
+++ b/src/backend/access/gin/ginentrypage.c
@@ -0,0 +1,772 @@
+/*-------------------------------------------------------------------------
+ *
+ * ginentrypage.c
+ * routines for handling GIN entry tree pages.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gin/ginentrypage.c
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/gin_private.h"
+#include "access/ginxlog.h"
+#include "access/xloginsert.h"
+#include "miscadmin.h"
+#include "utils/rel.h"
+
+static void entrySplitPage(GinBtree btree, Buffer origbuf,
+ GinBtreeStack *stack,
+ GinBtreeEntryInsertData *insertData,
+ BlockNumber updateblkno,
+ Page *newlpage, Page *newrpage);
+
+/*
+ * Form a tuple for entry tree.
+ *
+ * If the tuple would be too big to be stored, the function throws a suitable
+ * error if errorTooBig is true, or returns NULL if errorTooBig is false.
+ *
+ * See src/backend/access/gin/README for a description of the index tuple
+ * format that is being built here. We build on the assumption that we
+ * are making a leaf-level key entry containing a posting list of nipd items.
+ * If the caller is actually trying to make a posting-tree entry, non-leaf
+ * entry, or pending-list entry, it should pass dataSize = 0 and then overwrite
+ * the t_tid fields as necessary. In any case, 'data' can be NULL to skip
+ * filling in the posting list; the caller is responsible for filling it
+ * afterwards if data = NULL and nipd > 0.
+ */
+IndexTuple
+GinFormTuple(GinState *ginstate,
+ OffsetNumber attnum, Datum key, GinNullCategory category,
+ Pointer data, Size dataSize, int nipd,
+ bool errorTooBig)
+{
+ Datum datums[2];
+ bool isnull[2];
+ IndexTuple itup;
+ uint32 newsize;
+
+ /* Build the basic tuple: optional column number, plus key datum */
+ if (ginstate->oneCol)
+ {
+ datums[0] = key;
+ isnull[0] = (category != GIN_CAT_NORM_KEY);
+ }
+ else
+ {
+ datums[0] = UInt16GetDatum(attnum);
+ isnull[0] = false;
+ datums[1] = key;
+ isnull[1] = (category != GIN_CAT_NORM_KEY);
+ }
+
+ itup = index_form_tuple(ginstate->tupdesc[attnum - 1], datums, isnull);
+
+ /*
+ * Determine and store offset to the posting list, making sure there is
+ * room for the category byte if needed.
+ *
+ * Note: because index_form_tuple MAXALIGNs the tuple size, there may well
+ * be some wasted pad space. Is it worth recomputing the data length to
+ * prevent that? That would also allow us to Assert that the real data
+ * doesn't overlap the GinNullCategory byte, which this code currently
+ * takes on faith.
+ */
+ newsize = IndexTupleSize(itup);
+
+ if (IndexTupleHasNulls(itup))
+ {
+ uint32 minsize;
+
+ Assert(category != GIN_CAT_NORM_KEY);
+ minsize = GinCategoryOffset(itup, ginstate) + sizeof(GinNullCategory);
+ newsize = Max(newsize, minsize);
+ }
+
+ newsize = SHORTALIGN(newsize);
+
+ GinSetPostingOffset(itup, newsize);
+ GinSetNPosting(itup, nipd);
+
+ /*
+ * Add space needed for posting list, if any. Then check that the tuple
+ * won't be too big to store.
+ */
+ newsize += dataSize;
+
+ newsize = MAXALIGN(newsize);
+
+ if (newsize > GinMaxItemSize)
+ {
+ if (errorTooBig)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
+ (Size) newsize, (Size) GinMaxItemSize,
+ RelationGetRelationName(ginstate->index))));
+ pfree(itup);
+ return NULL;
+ }
+
+ /*
+ * Resize tuple if needed
+ */
+ if (newsize != IndexTupleSize(itup))
+ {
+ itup = repalloc(itup, newsize);
+
+ /*
+ * PostgreSQL 9.3 and earlier did not clear this new space, so we
+ * might find uninitialized padding when reading tuples from disk.
+ */
+ memset((char *) itup + IndexTupleSize(itup),
+ 0, newsize - IndexTupleSize(itup));
+ /* set new size in tuple header */
+ itup->t_info &= ~INDEX_SIZE_MASK;
+ itup->t_info |= newsize;
+ }
+
+ /*
+ * Copy in the posting list, if provided
+ */
+ if (data)
+ {
+ char *ptr = GinGetPosting(itup);
+
+ memcpy(ptr, data, dataSize);
+ }
+
+ /*
+ * Insert category byte, if needed
+ */
+ if (category != GIN_CAT_NORM_KEY)
+ {
+ Assert(IndexTupleHasNulls(itup));
+ GinSetNullCategory(itup, ginstate, category);
+ }
+ return itup;
+}
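
GinFormTuple sizes the tuple by SHORTALIGN'ing the posting-list offset past the key data, adding the posting-list bytes, and MAXALIGN'ing the total before checking it against the per-tuple limit. Below is a small standalone sketch of that arithmetic, not part of the patch; the macros mimic PostgreSQL's alignment macros, the limit is a made-up stand-in for GinMaxItemSize, and the null-category adjustment is ignored.

#include <stdio.h>

#define SHORTALIGN(x)	(((x) + 1) & ~((unsigned) 1))	/* 2-byte alignment */
#define MAXALIGN(x)		(((x) + 7) & ~((unsigned) 7))	/* 8-byte alignment */
#define FAKE_MAX_ITEM_SIZE 2712							/* hypothetical limit */

int
main(void)
{
	unsigned	keysize = 37;	/* size of tuple header + key datum, as built */
	unsigned	datasize = 113; /* size of the compressed posting list */
	unsigned	postingoff,
				total;

	/* The posting list starts at the next 2-byte boundary after the key ... */
	postingoff = SHORTALIGN(keysize);

	/* ... and the whole tuple is rounded up to 8 bytes for page storage. */
	total = MAXALIGN(postingoff + datasize);

	printf("posting offset %u, total tuple size %u, fits: %s\n",
		   postingoff, total,
		   total <= FAKE_MAX_ITEM_SIZE ? "yes" : "no");
	return 0;
}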
+
+/*
+ * Read item pointers from leaf entry tuple.
+ *
+ * Returns a palloc'd array of ItemPointers. The number of items is returned
+ * in *nitems.
+ */
+ItemPointer
+ginReadTuple(GinState *ginstate, OffsetNumber attnum, IndexTuple itup,
+ int *nitems)
+{
+ Pointer ptr = GinGetPosting(itup);
+ int nipd = GinGetNPosting(itup);
+ ItemPointer ipd;
+ int ndecoded;
+
+ if (GinItupIsCompressed(itup))
+ {
+ if (nipd > 0)
+ {
+ ipd = ginPostingListDecode((GinPostingList *) ptr, &ndecoded);
+ if (nipd != ndecoded)
+ elog(ERROR, "number of items mismatch in GIN entry tuple, %d in tuple header, %d decoded",
+ nipd, ndecoded);
+ }
+ else
+ {
+ ipd = palloc(0);
+ }
+ }
+ else
+ {
+ ipd = (ItemPointer) palloc(sizeof(ItemPointerData) * nipd);
+ memcpy(ipd, ptr, sizeof(ItemPointerData) * nipd);
+ }
+ *nitems = nipd;
+ return ipd;
+}
+
+/*
+ * Form a non-leaf entry tuple by copying the key data from the given tuple,
+ * which can be either a leaf or non-leaf entry tuple.
+ *
+ * Any posting list in the source tuple is not copied. The specified child
+ * block number is inserted into t_tid.
+ */
+static IndexTuple
+GinFormInteriorTuple(IndexTuple itup, Page page, BlockNumber childblk)
+{
+ IndexTuple nitup;
+
+ if (GinPageIsLeaf(page) && !GinIsPostingTree(itup))
+ {
+ /* Tuple contains a posting list, just copy stuff before that */
+ uint32 origsize = GinGetPostingOffset(itup);
+
+ origsize = MAXALIGN(origsize);
+ nitup = (IndexTuple) palloc(origsize);
+ memcpy(nitup, itup, origsize);
+ /* ... be sure to fix the size header field ... */
+ nitup->t_info &= ~INDEX_SIZE_MASK;
+ nitup->t_info |= origsize;
+ }
+ else
+ {
+ /* Copy the tuple as-is */
+ nitup = (IndexTuple) palloc(IndexTupleSize(itup));
+ memcpy(nitup, itup, IndexTupleSize(itup));
+ }
+
+ /* Now insert the correct downlink */
+ GinSetDownlink(nitup, childblk);
+
+ return nitup;
+}
+
+/*
+ * The entry tree is append-only: tuples are never deleted from it. So instead
+ * of storing an explicit right bound on each page, we use its rightmost key.
+ */
+static IndexTuple
+getRightMostTuple(Page page)
+{
+ OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
+
+ return (IndexTuple) PageGetItem(page, PageGetItemId(page, maxoff));
+}
+
+static bool
+entryIsMoveRight(GinBtree btree, Page page)
+{
+ IndexTuple itup;
+ OffsetNumber attnum;
+ Datum key;
+ GinNullCategory category;
+
+ if (GinPageRightMost(page))
+ return false;
+
+ itup = getRightMostTuple(page);
+ attnum = gintuple_get_attrnum(btree->ginstate, itup);
+ key = gintuple_get_key(btree->ginstate, itup, &category);
+
+ if (ginCompareAttEntries(btree->ginstate,
+ btree->entryAttnum, btree->entryKey, btree->entryCategory,
+ attnum, key, category) > 0)
+ return true;
+
+ return false;
+}
+
+/*
+ * Find the correct downlink in a non-leaf page. It is assumed that the page
+ * was correctly chosen, so the search value should be covered by this page.
+ */
+static BlockNumber
+entryLocateEntry(GinBtree btree, GinBtreeStack *stack)
+{
+ OffsetNumber low,
+ high,
+ maxoff;
+ IndexTuple itup = NULL;
+ int result;
+ Page page = BufferGetPage(stack->buffer);
+
+ Assert(!GinPageIsLeaf(page));
+ Assert(!GinPageIsData(page));
+
+ if (btree->fullScan)
+ {
+ stack->off = FirstOffsetNumber;
+ stack->predictNumber *= PageGetMaxOffsetNumber(page);
+ return btree->getLeftMostChild(btree, page);
+ }
+
+ low = FirstOffsetNumber;
+ maxoff = high = PageGetMaxOffsetNumber(page);
+ Assert(high >= low);
+
+ high++;
+
+ while (high > low)
+ {
+ OffsetNumber mid = low + ((high - low) / 2);
+
+ if (mid == maxoff && GinPageRightMost(page))
+ {
+ /* Right infinity */
+ result = -1;
+ }
+ else
+ {
+ OffsetNumber attnum;
+ Datum key;
+ GinNullCategory category;
+
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, mid));
+ attnum = gintuple_get_attrnum(btree->ginstate, itup);
+ key = gintuple_get_key(btree->ginstate, itup, &category);
+ result = ginCompareAttEntries(btree->ginstate,
+ btree->entryAttnum,
+ btree->entryKey,
+ btree->entryCategory,
+ attnum, key, category);
+ }
+
+ if (result == 0)
+ {
+ stack->off = mid;
+ Assert(GinGetDownlink(itup) != GIN_ROOT_BLKNO);
+ return GinGetDownlink(itup);
+ }
+ else if (result > 0)
+ low = mid + 1;
+ else
+ high = mid;
+ }
+
+ Assert(high >= FirstOffsetNumber && high <= maxoff);
+
+ stack->off = high;
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, high));
+ Assert(GinGetDownlink(itup) != GIN_ROOT_BLKNO);
+ return GinGetDownlink(itup);
+}
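
entryLocateEntry binary-searches the downlinks on an internal page, treating the last tuple of the rightmost page as "+infinity". The following standalone sketch shows the same search over plain integers; it is not part of the patch, and the keys and slot numbering are illustrative.

#include <stdio.h>
#include <stdbool.h>

/* Return the slot of the first downlink whose key is >= searchkey, treating
 * the last entry of the rightmost page as "+infinity". */
static int
locate_child(const int *keys, int nkeys, bool rightmost, int searchkey)
{
	int			low = 0;
	int			high = nkeys;	/* search interval is [low, high) */

	while (high > low)
	{
		int			mid = low + (high - low) / 2;
		int			cmp;

		if (mid == nkeys - 1 && rightmost)
			cmp = -1;			/* "right infinity": search key is smaller */
		else
			cmp = (searchkey > keys[mid]) - (searchkey < keys[mid]);

		if (cmp == 0)
			return mid;			/* exact boundary key: descend here */
		else if (cmp > 0)
			low = mid + 1;
		else
			high = mid;
	}
	return high;				/* first entry whose key bounds the search key */
}

int
main(void)
{
	int			keys[] = {10, 20, 30, 40, 999 /* ignored on rightmost page */ };

	printf("downlink slot: %d\n", locate_child(keys, 5, true, 35));	/* -> 3 */
	return 0;
}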
+
+/*
+ * Search for the correct position for a value on a leaf page.
+ * The page should be correctly chosen.
+ * Returns true if the value is found on the page.
+ */
+static bool
+entryLocateLeafEntry(GinBtree btree, GinBtreeStack *stack)
+{
+ Page page = BufferGetPage(stack->buffer);
+ OffsetNumber low,
+ high;
+
+ Assert(GinPageIsLeaf(page));
+ Assert(!GinPageIsData(page));
+
+ if (btree->fullScan)
+ {
+ stack->off = FirstOffsetNumber;
+ return true;
+ }
+
+ low = FirstOffsetNumber;
+ high = PageGetMaxOffsetNumber(page);
+
+ if (high < low)
+ {
+ stack->off = FirstOffsetNumber;
+ return false;
+ }
+
+ high++;
+
+ while (high > low)
+ {
+ OffsetNumber mid = low + ((high - low) / 2);
+ IndexTuple itup;
+ OffsetNumber attnum;
+ Datum key;
+ GinNullCategory category;
+ int result;
+
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, mid));
+ attnum = gintuple_get_attrnum(btree->ginstate, itup);
+ key = gintuple_get_key(btree->ginstate, itup, &category);
+ result = ginCompareAttEntries(btree->ginstate,
+ btree->entryAttnum,
+ btree->entryKey,
+ btree->entryCategory,
+ attnum, key, category);
+ if (result == 0)
+ {
+ stack->off = mid;
+ return true;
+ }
+ else if (result > 0)
+ low = mid + 1;
+ else
+ high = mid;
+ }
+
+ stack->off = high;
+ return false;
+}
+
+static OffsetNumber
+entryFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber storedOff)
+{
+ OffsetNumber i,
+ maxoff = PageGetMaxOffsetNumber(page);
+ IndexTuple itup;
+
+ Assert(!GinPageIsLeaf(page));
+ Assert(!GinPageIsData(page));
+
+	/* If the page hasn't changed, we can return storedOff right away */
+ if (storedOff >= FirstOffsetNumber && storedOff <= maxoff)
+ {
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, storedOff));
+ if (GinGetDownlink(itup) == blkno)
+ return storedOff;
+
+ /*
+		 * We expect the needed pointer to be somewhere to the right; that
+		 * holds as long as no deletion has occurred.
+ */
+ for (i = storedOff + 1; i <= maxoff; i++)
+ {
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
+ if (GinGetDownlink(itup) == blkno)
+ return i;
+ }
+ maxoff = storedOff - 1;
+ }
+
+ /* last chance */
+ for (i = FirstOffsetNumber; i <= maxoff; i++)
+ {
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
+ if (GinGetDownlink(itup) == blkno)
+ return i;
+ }
+
+ return InvalidOffsetNumber;
+}
+
+static BlockNumber
+entryGetLeftMostPage(GinBtree btree, Page page)
+{
+ IndexTuple itup;
+
+ Assert(!GinPageIsLeaf(page));
+ Assert(!GinPageIsData(page));
+ Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber);
+
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, FirstOffsetNumber));
+ return GinGetDownlink(itup);
+}
+
+static bool
+entryIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off,
+ GinBtreeEntryInsertData *insertData)
+{
+ Size releasedsz = 0;
+ Size addedsz;
+ Page page = BufferGetPage(buf);
+
+ Assert(insertData->entry);
+ Assert(!GinPageIsData(page));
+
+ if (insertData->isDelete)
+ {
+ IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
+
+ releasedsz = MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
+ }
+
+ addedsz = MAXALIGN(IndexTupleSize(insertData->entry)) + sizeof(ItemIdData);
+
+ if (PageGetFreeSpace(page) + releasedsz >= addedsz)
+ return true;
+
+ return false;
+}
+
+/*
+ * On a leaf page, delete the existing tuple if we are replacing it. On an
+ * internal page, update the old child blkno to point to the new right page
+ * if a child split occurred.
+ */
+static void
+entryPreparePage(GinBtree btree, Page page, OffsetNumber off,
+ GinBtreeEntryInsertData *insertData, BlockNumber updateblkno)
+{
+ Assert(insertData->entry);
+ Assert(!GinPageIsData(page));
+
+ if (insertData->isDelete)
+ {
+ Assert(GinPageIsLeaf(page));
+ PageIndexTupleDelete(page, off);
+ }
+
+ if (!GinPageIsLeaf(page) && updateblkno != InvalidBlockNumber)
+ {
+ IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
+
+ GinSetDownlink(itup, updateblkno);
+ }
+}
+
+/*
+ * Prepare to insert data on an entry page.
+ *
+ * If it will fit, return GPTP_INSERT after doing whatever setup is needed
+ * before we enter the insertion critical section. *ptp_workspace can be
+ * set to pass information along to the execPlaceToPage function.
+ *
+ * If it won't fit, perform a page split and return two temporary page
+ * images into *newlpage and *newrpage, with result GPTP_SPLIT.
+ *
+ * In neither case should the given page buffer be modified here.
+ *
+ * Note: on insertion to an internal node, in addition to inserting the given
+ * item, the downlink of the existing item at stack->off will be updated to
+ * point to updateblkno.
+ */
+static GinPlaceToPageRC
+entryBeginPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack,
+ void *insertPayload, BlockNumber updateblkno,
+ void **ptp_workspace,
+ Page *newlpage, Page *newrpage)
+{
+ GinBtreeEntryInsertData *insertData = insertPayload;
+ OffsetNumber off = stack->off;
+
+ /* If it doesn't fit, deal with split case */
+ if (!entryIsEnoughSpace(btree, buf, off, insertData))
+ {
+ entrySplitPage(btree, buf, stack, insertData, updateblkno,
+ newlpage, newrpage);
+ return GPTP_SPLIT;
+ }
+
+ /* Else, we're ready to proceed with insertion */
+ return GPTP_INSERT;
+}
+
+/*
+ * Perform data insertion after beginPlaceToPage has decided it will fit.
+ *
+ * This is invoked within a critical section, and XLOG record creation (if
+ * needed) is already started. The target buffer is registered in slot 0.
+ */
+static void
+entryExecPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack,
+ void *insertPayload, BlockNumber updateblkno,
+ void *ptp_workspace)
+{
+ GinBtreeEntryInsertData *insertData = insertPayload;
+ Page page = BufferGetPage(buf);
+ OffsetNumber off = stack->off;
+ OffsetNumber placed;
+
+ entryPreparePage(btree, page, off, insertData, updateblkno);
+
+ placed = PageAddItem(page,
+ (Item) insertData->entry,
+ IndexTupleSize(insertData->entry),
+ off, false, false);
+ if (placed != off)
+ elog(ERROR, "failed to add item to index page in \"%s\"",
+ RelationGetRelationName(btree->index));
+
+ if (RelationNeedsWAL(btree->index) && !btree->isBuild)
+ {
+ /*
+ * This must be static, because it has to survive until XLogInsert,
+ * and we can't palloc here. Ugly, but the XLogInsert infrastructure
+ * isn't reentrant anyway.
+ */
+ static ginxlogInsertEntry data;
+
+ data.isDelete = insertData->isDelete;
+ data.offset = off;
+
+ XLogRegisterBufData(0, (char *) &data,
+ offsetof(ginxlogInsertEntry, tuple));
+ XLogRegisterBufData(0, (char *) insertData->entry,
+ IndexTupleSize(insertData->entry));
+ }
+}
+
+/*
+ * Split entry page and insert new data.
+ *
+ * Returns new temp pages to *newlpage and *newrpage.
+ * The original buffer is left untouched.
+ */
+static void
+entrySplitPage(GinBtree btree, Buffer origbuf,
+ GinBtreeStack *stack,
+ GinBtreeEntryInsertData *insertData,
+ BlockNumber updateblkno,
+ Page *newlpage, Page *newrpage)
+{
+ OffsetNumber off = stack->off;
+ OffsetNumber i,
+ maxoff,
+ separator = InvalidOffsetNumber;
+ Size totalsize = 0;
+ Size lsize = 0,
+ size;
+ char *ptr;
+ IndexTuple itup;
+ Page page;
+ Page lpage = PageGetTempPageCopy(BufferGetPage(origbuf));
+ Page rpage = PageGetTempPageCopy(BufferGetPage(origbuf));
+ Size pageSize = PageGetPageSize(lpage);
+ PGAlignedBlock tupstore[2]; /* could need 2 pages' worth of tuples */
+
+ entryPreparePage(btree, lpage, off, insertData, updateblkno);
+
+ /*
+ * First, append all the existing tuples and the new tuple we're inserting
+ * one after another in a temporary workspace.
+ */
+ maxoff = PageGetMaxOffsetNumber(lpage);
+ ptr = tupstore[0].data;
+ for (i = FirstOffsetNumber; i <= maxoff; i++)
+ {
+ if (i == off)
+ {
+ size = MAXALIGN(IndexTupleSize(insertData->entry));
+ memcpy(ptr, insertData->entry, size);
+ ptr += size;
+ totalsize += size + sizeof(ItemIdData);
+ }
+
+ itup = (IndexTuple) PageGetItem(lpage, PageGetItemId(lpage, i));
+ size = MAXALIGN(IndexTupleSize(itup));
+ memcpy(ptr, itup, size);
+ ptr += size;
+ totalsize += size + sizeof(ItemIdData);
+ }
+
+ if (off == maxoff + 1)
+ {
+ size = MAXALIGN(IndexTupleSize(insertData->entry));
+ memcpy(ptr, insertData->entry, size);
+ ptr += size;
+ totalsize += size + sizeof(ItemIdData);
+ }
+
+ /*
+ * Initialize the left and right pages, and copy all the tuples back to
+ * them.
+ */
+ GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize);
+ GinInitPage(lpage, GinPageGetOpaque(rpage)->flags, pageSize);
+
+ ptr = tupstore[0].data;
+ maxoff++;
+ lsize = 0;
+
+ page = lpage;
+ for (i = FirstOffsetNumber; i <= maxoff; i++)
+ {
+ itup = (IndexTuple) ptr;
+
+ /*
+ * Decide where to split. We try to equalize the pages' total data
+ * size, not number of tuples.
+ */
+ if (lsize > totalsize / 2)
+ {
+ if (separator == InvalidOffsetNumber)
+ separator = i - 1;
+ page = rpage;
+ }
+ else
+ {
+ lsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
+ }
+
+ if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add item to index page in \"%s\"",
+ RelationGetRelationName(btree->index));
+ ptr += MAXALIGN(IndexTupleSize(itup));
+ }
+
+ /* return temp pages to caller */
+ *newlpage = lpage;
+ *newrpage = rpage;
+}
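
entrySplitPage picks the split point by accumulated byte size rather than tuple count: tuples stay on the left page until it holds more than half of the total bytes, and everything after that goes to the right page. Below is a standalone sketch of that choice; it is not part of the patch and the per-tuple sizes are made up.

#include <stdio.h>

int
main(void)
{
	/* hypothetical per-tuple sizes, item header included */
	int			sizes[] = {40, 40, 200, 40, 40, 40, 40};
	int			ntuples = 7;
	int			totalsize = 0,
				lsize = 0,
				separator = -1;

	for (int i = 0; i < ntuples; i++)
		totalsize += sizes[i];

	for (int i = 0; i < ntuples; i++)
	{
		if (lsize > totalsize / 2)
		{
			if (separator < 0)
				separator = i - 1;	/* last tuple that stays on the left */
			/* remaining tuples go to the right page */
		}
		else
			lsize += sizes[i];
	}

	printf("left page gets tuples 0..%d (%d of %d bytes)\n",
		   separator, lsize, totalsize);
	return 0;
}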
+
+/*
+ * Construct insertion payload for inserting the downlink for given buffer.
+ */
+static void *
+entryPrepareDownlink(GinBtree btree, Buffer lbuf)
+{
+ GinBtreeEntryInsertData *insertData;
+ Page lpage = BufferGetPage(lbuf);
+ BlockNumber lblkno = BufferGetBlockNumber(lbuf);
+ IndexTuple itup;
+
+ itup = getRightMostTuple(lpage);
+
+ insertData = palloc(sizeof(GinBtreeEntryInsertData));
+ insertData->entry = GinFormInteriorTuple(itup, lpage, lblkno);
+ insertData->isDelete = false;
+
+ return insertData;
+}
+
+/*
+ * Fill a new root page with downlinks built from the rightmost keys of its
+ * two children. Also called from ginxlog, so it must not rely on 'btree'.
+ */
+void
+ginEntryFillRoot(GinBtree btree, Page root,
+ BlockNumber lblkno, Page lpage,
+ BlockNumber rblkno, Page rpage)
+{
+ IndexTuple itup;
+
+ itup = GinFormInteriorTuple(getRightMostTuple(lpage), lpage, lblkno);
+ if (PageAddItem(root, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add item to index root page");
+ pfree(itup);
+
+ itup = GinFormInteriorTuple(getRightMostTuple(rpage), rpage, rblkno);
+ if (PageAddItem(root, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add item to index root page");
+ pfree(itup);
+}
+
+/*
+ * Set up GinBtree for entry page access
+ *
+ * Note: during WAL recovery, there may be no valid data in ginstate
+ * other than a faked-up Relation pointer; the key datum is bogus too.
+ */
+void
+ginPrepareEntryScan(GinBtree btree, OffsetNumber attnum,
+ Datum key, GinNullCategory category,
+ GinState *ginstate)
+{
+ memset(btree, 0, sizeof(GinBtreeData));
+
+ btree->index = ginstate->index;
+ btree->rootBlkno = GIN_ROOT_BLKNO;
+ btree->ginstate = ginstate;
+
+ btree->findChildPage = entryLocateEntry;
+ btree->getLeftMostChild = entryGetLeftMostPage;
+ btree->isMoveRight = entryIsMoveRight;
+ btree->findItem = entryLocateLeafEntry;
+ btree->findChildPtr = entryFindChildPtr;
+ btree->beginPlaceToPage = entryBeginPlaceToPage;
+ btree->execPlaceToPage = entryExecPlaceToPage;
+ btree->fillRoot = ginEntryFillRoot;
+ btree->prepareDownlink = entryPrepareDownlink;
+
+ btree->isData = false;
+ btree->fullScan = false;
+ btree->isBuild = false;
+
+ btree->entryAttnum = attnum;
+ btree->entryKey = key;
+ btree->entryCategory = category;
+}
diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c
new file mode 100644
index 0000000..e0d9940
--- /dev/null
+++ b/src/backend/access/gin/ginfast.c
@@ -0,0 +1,1068 @@
+/*-------------------------------------------------------------------------
+ *
+ * ginfast.c
+ * Fast insert routines for the Postgres inverted index access method.
+ *	  Pending entries are stored in a linear list of pages.  Later on
+ * (typically during VACUUM), ginInsertCleanup() will be invoked to
+ * transfer pending entries into the regular index structure. This
+ * wins because bulk insertion is much more efficient than retail.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gin/ginfast.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/gin_private.h"
+#include "access/ginxlog.h"
+#include "access/xlog.h"
+#include "access/xloginsert.h"
+#include "catalog/pg_am.h"
+#include "commands/vacuum.h"
+#include "miscadmin.h"
+#include "port/pg_bitutils.h"
+#include "postmaster/autovacuum.h"
+#include "storage/indexfsm.h"
+#include "storage/lmgr.h"
+#include "storage/predicate.h"
+#include "utils/acl.h"
+#include "utils/builtins.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+/* GUC parameter */
+int gin_pending_list_limit = 0;
+
+#define GIN_PAGE_FREESIZE \
+ ( BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - MAXALIGN(sizeof(GinPageOpaqueData)) )
+
+typedef struct KeyArray
+{
+ Datum *keys; /* expansible array */
+ GinNullCategory *categories; /* another expansible array */
+ int32 nvalues; /* current number of valid entries */
+ int32 maxvalues; /* allocated size of arrays */
+} KeyArray;
+
+
+/*
+ * Build a pending-list page from the given array of tuples, and write it out.
+ *
+ * Returns amount of free space left on the page.
+ */
+static int32
+writeListPage(Relation index, Buffer buffer,
+ IndexTuple *tuples, int32 ntuples, BlockNumber rightlink)
+{
+ Page page = BufferGetPage(buffer);
+ int32 i,
+ freesize,
+ size = 0;
+ OffsetNumber l,
+ off;
+ PGAlignedBlock workspace;
+ char *ptr;
+
+ START_CRIT_SECTION();
+
+ GinInitBuffer(buffer, GIN_LIST);
+
+ off = FirstOffsetNumber;
+ ptr = workspace.data;
+
+ for (i = 0; i < ntuples; i++)
+ {
+ int this_size = IndexTupleSize(tuples[i]);
+
+ memcpy(ptr, tuples[i], this_size);
+ ptr += this_size;
+ size += this_size;
+
+ l = PageAddItem(page, (Item) tuples[i], this_size, off, false, false);
+
+ if (l == InvalidOffsetNumber)
+ elog(ERROR, "failed to add item to index page in \"%s\"",
+ RelationGetRelationName(index));
+
+ off++;
+ }
+
+ Assert(size <= BLCKSZ); /* else we overran workspace */
+
+ GinPageGetOpaque(page)->rightlink = rightlink;
+
+ /*
+	 * The tail page may contain only whole row(s), or the final part of a row
+	 * started on previous pages (a "row" here means all the index tuples
+	 * generated for one heap tuple).
+ */
+ if (rightlink == InvalidBlockNumber)
+ {
+ GinPageSetFullRow(page);
+ GinPageGetOpaque(page)->maxoff = 1;
+ }
+ else
+ {
+ GinPageGetOpaque(page)->maxoff = 0;
+ }
+
+ MarkBufferDirty(buffer);
+
+ if (RelationNeedsWAL(index))
+ {
+ ginxlogInsertListPage data;
+ XLogRecPtr recptr;
+
+ data.rightlink = rightlink;
+ data.ntuples = ntuples;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &data, sizeof(ginxlogInsertListPage));
+
+ XLogRegisterBuffer(0, buffer, REGBUF_WILL_INIT);
+ XLogRegisterBufData(0, workspace.data, size);
+
+ recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT_LISTPAGE);
+ PageSetLSN(page, recptr);
+ }
+
+ /* get free space before releasing buffer */
+ freesize = PageGetExactFreeSpace(page);
+
+ UnlockReleaseBuffer(buffer);
+
+ END_CRIT_SECTION();
+
+ return freesize;
+}
+
+static void
+makeSublist(Relation index, IndexTuple *tuples, int32 ntuples,
+ GinMetaPageData *res)
+{
+ Buffer curBuffer = InvalidBuffer;
+ Buffer prevBuffer = InvalidBuffer;
+ int i,
+ size = 0,
+ tupsize;
+ int startTuple = 0;
+
+ Assert(ntuples > 0);
+
+ /*
+ * Split tuples into pages
+ */
+ for (i = 0; i < ntuples; i++)
+ {
+ if (curBuffer == InvalidBuffer)
+ {
+ curBuffer = GinNewBuffer(index);
+
+ if (prevBuffer != InvalidBuffer)
+ {
+ res->nPendingPages++;
+ writeListPage(index, prevBuffer,
+ tuples + startTuple,
+ i - startTuple,
+ BufferGetBlockNumber(curBuffer));
+ }
+ else
+ {
+ res->head = BufferGetBlockNumber(curBuffer);
+ }
+
+ prevBuffer = curBuffer;
+ startTuple = i;
+ size = 0;
+ }
+
+ tupsize = MAXALIGN(IndexTupleSize(tuples[i])) + sizeof(ItemIdData);
+
+ if (size + tupsize > GinListPageSize)
+ {
+ /* won't fit, force a new page and reprocess */
+ i--;
+ curBuffer = InvalidBuffer;
+ }
+ else
+ {
+ size += tupsize;
+ }
+ }
+
+ /*
+ * Write last page
+ */
+ res->tail = BufferGetBlockNumber(curBuffer);
+ res->tailFreeSize = writeListPage(index, curBuffer,
+ tuples + startTuple,
+ ntuples - startTuple,
+ InvalidBlockNumber);
+ res->nPendingPages++;
+ /* that was only one heap tuple */
+ res->nPendingHeapTuples = 1;
+}
+
+/*
+ * Write the index tuples contained in *collector into the index's
+ * pending list.
+ *
+ * The function guarantees that all these tuples will be inserted
+ * consecutively, preserving their order.
+ */
+void
+ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
+{
+ Relation index = ginstate->index;
+ Buffer metabuffer;
+ Page metapage;
+ GinMetaPageData *metadata = NULL;
+ Buffer buffer = InvalidBuffer;
+ Page page = NULL;
+ ginxlogUpdateMeta data;
+ bool separateList = false;
+ bool needCleanup = false;
+ int cleanupSize;
+ bool needWal;
+
+ if (collector->ntuples == 0)
+ return;
+
+ needWal = RelationNeedsWAL(index);
+
+ data.node = index->rd_node;
+ data.ntuples = 0;
+ data.newRightlink = data.prevTail = InvalidBlockNumber;
+
+ metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO);
+ metapage = BufferGetPage(metabuffer);
+
+ /*
+ * An insertion to the pending list could logically belong anywhere in the
+ * tree, so it conflicts with all serializable scans. All scans acquire a
+ * predicate lock on the metabuffer to represent that.
+ */
+ CheckForSerializableConflictIn(index, NULL, GIN_METAPAGE_BLKNO);
+
+ if (collector->sumsize + collector->ntuples * sizeof(ItemIdData) > GinListPageSize)
+ {
+ /*
+ * Total size is greater than one page => make sublist
+ */
+ separateList = true;
+ }
+ else
+ {
+ LockBuffer(metabuffer, GIN_EXCLUSIVE);
+ metadata = GinPageGetMeta(metapage);
+
+ if (metadata->head == InvalidBlockNumber ||
+ collector->sumsize + collector->ntuples * sizeof(ItemIdData) > metadata->tailFreeSize)
+ {
+ /*
+			 * The pending list is empty, or the total size exceeds the free
+			 * space on the tail page => make a sublist.
+			 *
+			 * We unlock the metabuffer to keep concurrency high.
+ */
+ separateList = true;
+ LockBuffer(metabuffer, GIN_UNLOCK);
+ }
+ }
+
+ if (separateList)
+ {
+ /*
+		 * Build the sublist separately, then append it to the tail.
+ */
+ GinMetaPageData sublist;
+
+ memset(&sublist, 0, sizeof(GinMetaPageData));
+ makeSublist(index, collector->tuples, collector->ntuples, &sublist);
+
+ if (needWal)
+ XLogBeginInsert();
+
+ /*
+ * metapage was unlocked, see above
+ */
+ LockBuffer(metabuffer, GIN_EXCLUSIVE);
+ metadata = GinPageGetMeta(metapage);
+
+ if (metadata->head == InvalidBlockNumber)
+ {
+ /*
+ * Main list is empty, so just insert sublist as main list
+ */
+ START_CRIT_SECTION();
+
+ metadata->head = sublist.head;
+ metadata->tail = sublist.tail;
+ metadata->tailFreeSize = sublist.tailFreeSize;
+
+ metadata->nPendingPages = sublist.nPendingPages;
+ metadata->nPendingHeapTuples = sublist.nPendingHeapTuples;
+ }
+ else
+ {
+ /*
+ * Merge lists
+ */
+ data.prevTail = metadata->tail;
+ data.newRightlink = sublist.head;
+
+ buffer = ReadBuffer(index, metadata->tail);
+ LockBuffer(buffer, GIN_EXCLUSIVE);
+ page = BufferGetPage(buffer);
+
+ Assert(GinPageGetOpaque(page)->rightlink == InvalidBlockNumber);
+
+ START_CRIT_SECTION();
+
+ GinPageGetOpaque(page)->rightlink = sublist.head;
+
+ MarkBufferDirty(buffer);
+
+ metadata->tail = sublist.tail;
+ metadata->tailFreeSize = sublist.tailFreeSize;
+
+ metadata->nPendingPages += sublist.nPendingPages;
+ metadata->nPendingHeapTuples += sublist.nPendingHeapTuples;
+
+ if (needWal)
+ XLogRegisterBuffer(1, buffer, REGBUF_STANDARD);
+ }
+ }
+ else
+ {
+ /*
+ * Insert into tail page. Metapage is already locked
+ */
+ OffsetNumber l,
+ off;
+ int i,
+ tupsize;
+ char *ptr;
+ char *collectordata;
+
+ buffer = ReadBuffer(index, metadata->tail);
+ LockBuffer(buffer, GIN_EXCLUSIVE);
+ page = BufferGetPage(buffer);
+
+ off = (PageIsEmpty(page)) ? FirstOffsetNumber :
+ OffsetNumberNext(PageGetMaxOffsetNumber(page));
+
+ collectordata = ptr = (char *) palloc(collector->sumsize);
+
+ data.ntuples = collector->ntuples;
+
+ if (needWal)
+ XLogBeginInsert();
+
+ START_CRIT_SECTION();
+
+ /*
+ * Increase counter of heap tuples
+ */
+ Assert(GinPageGetOpaque(page)->maxoff <= metadata->nPendingHeapTuples);
+ GinPageGetOpaque(page)->maxoff++;
+ metadata->nPendingHeapTuples++;
+
+ for (i = 0; i < collector->ntuples; i++)
+ {
+ tupsize = IndexTupleSize(collector->tuples[i]);
+ l = PageAddItem(page, (Item) collector->tuples[i], tupsize, off, false, false);
+
+ if (l == InvalidOffsetNumber)
+ elog(ERROR, "failed to add item to index page in \"%s\"",
+ RelationGetRelationName(index));
+
+ memcpy(ptr, collector->tuples[i], tupsize);
+ ptr += tupsize;
+
+ off++;
+ }
+
+ Assert((ptr - collectordata) <= collector->sumsize);
+ if (needWal)
+ {
+ XLogRegisterBuffer(1, buffer, REGBUF_STANDARD);
+ XLogRegisterBufData(1, collectordata, collector->sumsize);
+ }
+
+ metadata->tailFreeSize = PageGetExactFreeSpace(page);
+
+ MarkBufferDirty(buffer);
+ }
+
+ /*
+ * Set pd_lower just past the end of the metadata. This is essential,
+ * because without doing so, metadata will be lost if xlog.c compresses
+ * the page. (We must do this here because pre-v11 versions of PG did not
+ * set the metapage's pd_lower correctly, so a pg_upgraded index might
+ * contain the wrong value.)
+ */
+ ((PageHeader) metapage)->pd_lower =
+ ((char *) metadata + sizeof(GinMetaPageData)) - (char *) metapage;
+
+ /*
+ * Write metabuffer, make xlog entry
+ */
+ MarkBufferDirty(metabuffer);
+
+ if (needWal)
+ {
+ XLogRecPtr recptr;
+
+ memcpy(&data.metadata, metadata, sizeof(GinMetaPageData));
+
+ XLogRegisterBuffer(0, metabuffer, REGBUF_WILL_INIT | REGBUF_STANDARD);
+ XLogRegisterData((char *) &data, sizeof(ginxlogUpdateMeta));
+
+ recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_UPDATE_META_PAGE);
+ PageSetLSN(metapage, recptr);
+
+ if (buffer != InvalidBuffer)
+ {
+ PageSetLSN(page, recptr);
+ }
+ }
+
+ if (buffer != InvalidBuffer)
+ UnlockReleaseBuffer(buffer);
+
+ /*
+	 * Force pending-list cleanup when the list becomes too long. Also,
+	 * ginInsertCleanup can take a significant amount of time, so we prefer to
+	 * call it when it can do all the work in a single collection cycle. In
+	 * non-vacuum mode, it shouldn't require maintenance_work_mem, so fire it
+	 * while the pending list is still small enough to fit within
+	 * gin_pending_list_limit.
+ *
+ * ginInsertCleanup() should not be called inside our CRIT_SECTION.
+ */
+ cleanupSize = GinGetPendingListCleanupSize(index);
+ if (metadata->nPendingPages * GIN_PAGE_FREESIZE > cleanupSize * 1024L)
+ needCleanup = true;
+
+ UnlockReleaseBuffer(metabuffer);
+
+ END_CRIT_SECTION();
+
+ /*
+	 * Since this could contend with a concurrent cleanup process, clean up
+	 * the pending list non-forcibly (i.e. give up if someone else is at it).
+ */
+ if (needCleanup)
+ ginInsertCleanup(ginstate, false, true, false, NULL);
+}
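
The cleanup trigger above compares the pending list's total page space against the configured pending-list limit, which is expressed in kilobytes. The following standalone sketch of that check is not part of the patch; the per-page free size and the limit are illustrative stand-ins for GIN_PAGE_FREESIZE and gin_pending_list_limit.

#include <stdio.h>
#include <stdbool.h>

#define PAGE_FREESIZE 8120L		/* roughly the usable bytes per pending-list page */

/* The list is "too long" once its pages hold more bytes than limit_kb kilobytes. */
static bool
pending_list_needs_cleanup(long n_pending_pages, long limit_kb)
{
	return n_pending_pages * PAGE_FREESIZE > limit_kb * 1024L;
}

int
main(void)
{
	long		limit_kb = 4096;	/* e.g. a 4MB pending-list limit */

	printf("500 pages: %s\n",
		   pending_list_needs_cleanup(500, limit_kb) ? "cleanup" : "ok");
	printf("520 pages: %s\n",
		   pending_list_needs_cleanup(520, limit_kb) ? "cleanup" : "ok");
	return 0;
}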
+
+/*
+ * Create temporary index tuples for a single indexable item (one index column
+ * for the heap tuple specified by ht_ctid), and append them to the array
+ * in *collector. They will subsequently be written out using
+ * ginHeapTupleFastInsert. Note that to guarantee consistent state, all
+ * temp tuples for a given heap tuple must be written in one call to
+ * ginHeapTupleFastInsert.
+ */
+void
+ginHeapTupleFastCollect(GinState *ginstate,
+ GinTupleCollector *collector,
+ OffsetNumber attnum, Datum value, bool isNull,
+ ItemPointer ht_ctid)
+{
+ Datum *entries;
+ GinNullCategory *categories;
+ int32 i,
+ nentries;
+
+ /*
+ * Extract the key values that need to be inserted in the index
+ */
+ entries = ginExtractEntries(ginstate, attnum, value, isNull,
+ &nentries, &categories);
+
+ /*
+ * Protect against integer overflow in allocation calculations
+ */
+ if (nentries < 0 ||
+ collector->ntuples + nentries > MaxAllocSize / sizeof(IndexTuple))
+ elog(ERROR, "too many entries for GIN index");
+
+ /*
+ * Allocate/reallocate memory for storing collected tuples
+ */
+ if (collector->tuples == NULL)
+ {
+ /*
+ * Determine the number of elements to allocate in the tuples array
+ * initially. Make it a power of 2 to avoid wasting memory when
+ * resizing (since palloc likes powers of 2).
+ */
+ collector->lentuples = pg_nextpower2_32(Max(16, nentries));
+ collector->tuples = (IndexTuple *) palloc(sizeof(IndexTuple) * collector->lentuples);
+ }
+ else if (collector->lentuples < collector->ntuples + nentries)
+ {
+ /*
+ * Advance lentuples to the next suitable power of 2. This won't
+ * overflow, though we could get to a value that exceeds
+ * MaxAllocSize/sizeof(IndexTuple), causing an error in repalloc.
+ */
+ collector->lentuples = pg_nextpower2_32(collector->ntuples + nentries);
+ collector->tuples = (IndexTuple *) repalloc(collector->tuples,
+ sizeof(IndexTuple) * collector->lentuples);
+ }
+
+ /*
+ * Build an index tuple for each key value, and add to array. In pending
+ * tuples we just stick the heap TID into t_tid.
+ */
+ for (i = 0; i < nentries; i++)
+ {
+ IndexTuple itup;
+
+ itup = GinFormTuple(ginstate, attnum, entries[i], categories[i],
+ NULL, 0, 0, true);
+ itup->t_tid = *ht_ctid;
+ collector->tuples[collector->ntuples++] = itup;
+ collector->sumsize += IndexTupleSize(itup);
+ }
+}
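
The collector grows its tuple array by jumping straight to the next power of 2 that covers the requirement, starting from a minimum of 16 entries. Below is a standalone sketch of that growth policy; it is not part of the patch, next_power_of_2 is a stand-in for pg_nextpower2_32, and the batch sizes are made up.

#include <stdio.h>
#include <stdlib.h>

/* Return the smallest power of 2 that is >= n (n must be > 0). */
static unsigned
next_power_of_2(unsigned n)
{
	unsigned	p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

int
main(void)
{
	int		   *tuples = NULL;
	unsigned	len = 0,
				ntuples = 0;

	/* simulate three batches of incoming entries */
	unsigned	batches[] = {5, 40, 100};

	for (int b = 0; b < 3; b++)
	{
		unsigned	nentries = batches[b];

		if (tuples == NULL)
		{
			/* initial allocation: at least 16, rounded up to a power of 2 */
			len = next_power_of_2(nentries < 16 ? 16 : nentries);
			tuples = malloc(sizeof(int) * len);
		}
		else if (len < ntuples + nentries)
		{
			/* grow to the next power of 2 that covers the requirement */
			len = next_power_of_2(ntuples + nentries);
			tuples = realloc(tuples, sizeof(int) * len);
		}

		for (unsigned i = 0; i < nentries; i++)
			tuples[ntuples++] = (int) i;

		printf("after batch %d: %u entries, capacity %u\n", b, ntuples, len);
	}
	free(tuples);
	return 0;
}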
+
+/*
+ * Deletes pending list pages up to (not including) newHead page.
+ * If newHead == InvalidBlockNumber then the function drops the whole list.
+ *
+ * metapage is pinned and exclusive-locked throughout this function.
+ */
+static void
+shiftList(Relation index, Buffer metabuffer, BlockNumber newHead,
+ bool fill_fsm, IndexBulkDeleteResult *stats)
+{
+ Page metapage;
+ GinMetaPageData *metadata;
+ BlockNumber blknoToDelete;
+
+ metapage = BufferGetPage(metabuffer);
+ metadata = GinPageGetMeta(metapage);
+ blknoToDelete = metadata->head;
+
+ do
+ {
+ Page page;
+ int i;
+ int64 nDeletedHeapTuples = 0;
+ ginxlogDeleteListPages data;
+ Buffer buffers[GIN_NDELETE_AT_ONCE];
+ BlockNumber freespace[GIN_NDELETE_AT_ONCE];
+
+ data.ndeleted = 0;
+ while (data.ndeleted < GIN_NDELETE_AT_ONCE && blknoToDelete != newHead)
+ {
+ freespace[data.ndeleted] = blknoToDelete;
+ buffers[data.ndeleted] = ReadBuffer(index, blknoToDelete);
+ LockBuffer(buffers[data.ndeleted], GIN_EXCLUSIVE);
+ page = BufferGetPage(buffers[data.ndeleted]);
+
+ data.ndeleted++;
+
+ Assert(!GinPageIsDeleted(page));
+
+ nDeletedHeapTuples += GinPageGetOpaque(page)->maxoff;
+ blknoToDelete = GinPageGetOpaque(page)->rightlink;
+ }
+
+ if (stats)
+ stats->pages_deleted += data.ndeleted;
+
+ /*
+ * This operation touches an unusually large number of pages, so
+ * prepare the XLogInsert machinery for that before entering the
+ * critical section.
+ */
+ if (RelationNeedsWAL(index))
+ XLogEnsureRecordSpace(data.ndeleted, 0);
+
+ START_CRIT_SECTION();
+
+ metadata->head = blknoToDelete;
+
+ Assert(metadata->nPendingPages >= data.ndeleted);
+ metadata->nPendingPages -= data.ndeleted;
+ Assert(metadata->nPendingHeapTuples >= nDeletedHeapTuples);
+ metadata->nPendingHeapTuples -= nDeletedHeapTuples;
+
+ if (blknoToDelete == InvalidBlockNumber)
+ {
+ metadata->tail = InvalidBlockNumber;
+ metadata->tailFreeSize = 0;
+ metadata->nPendingPages = 0;
+ metadata->nPendingHeapTuples = 0;
+ }
+
+ /*
+ * Set pd_lower just past the end of the metadata. This is essential,
+ * because without doing so, metadata will be lost if xlog.c
+ * compresses the page. (We must do this here because pre-v11
+ * versions of PG did not set the metapage's pd_lower correctly, so a
+ * pg_upgraded index might contain the wrong value.)
+ */
+ ((PageHeader) metapage)->pd_lower =
+ ((char *) metadata + sizeof(GinMetaPageData)) - (char *) metapage;
+
+ MarkBufferDirty(metabuffer);
+
+ for (i = 0; i < data.ndeleted; i++)
+ {
+ page = BufferGetPage(buffers[i]);
+ GinPageGetOpaque(page)->flags = GIN_DELETED;
+ MarkBufferDirty(buffers[i]);
+ }
+
+ if (RelationNeedsWAL(index))
+ {
+ XLogRecPtr recptr;
+
+ XLogBeginInsert();
+ XLogRegisterBuffer(0, metabuffer,
+ REGBUF_WILL_INIT | REGBUF_STANDARD);
+ for (i = 0; i < data.ndeleted; i++)
+ XLogRegisterBuffer(i + 1, buffers[i], REGBUF_WILL_INIT);
+
+ memcpy(&data.metadata, metadata, sizeof(GinMetaPageData));
+
+ XLogRegisterData((char *) &data,
+ sizeof(ginxlogDeleteListPages));
+
+ recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_LISTPAGE);
+ PageSetLSN(metapage, recptr);
+
+ for (i = 0; i < data.ndeleted; i++)
+ {
+ page = BufferGetPage(buffers[i]);
+ PageSetLSN(page, recptr);
+ }
+ }
+
+ for (i = 0; i < data.ndeleted; i++)
+ UnlockReleaseBuffer(buffers[i]);
+
+ END_CRIT_SECTION();
+
+ for (i = 0; fill_fsm && i < data.ndeleted; i++)
+ RecordFreeIndexPage(index, freespace[i]);
+
+ } while (blknoToDelete != newHead);
+}
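
shiftList detaches pending-list pages in batches of at most GIN_NDELETE_AT_ONCE per critical section and WAL record. The following standalone sketch shows the same batching pattern over a plain linked list; it is not part of the patch, and the batch size and block numbers are illustrative.

#include <stdio.h>
#include <stdlib.h>

#define NDELETE_AT_ONCE 4		/* illustrative batch size */

typedef struct node
{
	int			blkno;
	struct node *next;
} node;

int
main(void)
{
	node	   *head = NULL;

	for (int i = 10; i >= 1; i--)	/* build list 1 -> 2 -> ... -> 10 */
	{
		node	   *n = malloc(sizeof(node));

		n->blkno = i;
		n->next = head;
		head = n;
	}

	while (head != NULL)
	{
		int			ndeleted = 0;

		/* delete up to NDELETE_AT_ONCE nodes per "critical section" */
		while (head != NULL && ndeleted < NDELETE_AT_ONCE)
		{
			node	   *victim = head;

			head = head->next;
			printf("deleting page %d\n", victim->blkno);
			free(victim);
			ndeleted++;
		}
		printf("-- batch of %d done --\n", ndeleted);
	}
	return 0;
}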
+
+/* Initialize empty KeyArray */
+static void
+initKeyArray(KeyArray *keys, int32 maxvalues)
+{
+ keys->keys = (Datum *) palloc(sizeof(Datum) * maxvalues);
+ keys->categories = (GinNullCategory *)
+ palloc(sizeof(GinNullCategory) * maxvalues);
+ keys->nvalues = 0;
+ keys->maxvalues = maxvalues;
+}
+
+/* Add datum to KeyArray, resizing if needed */
+static void
+addDatum(KeyArray *keys, Datum datum, GinNullCategory category)
+{
+ if (keys->nvalues >= keys->maxvalues)
+ {
+ keys->maxvalues *= 2;
+ keys->keys = (Datum *)
+ repalloc(keys->keys, sizeof(Datum) * keys->maxvalues);
+ keys->categories = (GinNullCategory *)
+ repalloc(keys->categories, sizeof(GinNullCategory) * keys->maxvalues);
+ }
+
+ keys->keys[keys->nvalues] = datum;
+ keys->categories[keys->nvalues] = category;
+ keys->nvalues++;
+}
+
+/*
+ * Collect data from a pending-list page in preparation for insertion into
+ * the main index.
+ *
+ * Go through all tuples >= startoff on the page and collect their values in accum.
+ *
+ * Note that ka is just workspace --- it does not carry any state across
+ * calls.
+ */
+static void
+processPendingPage(BuildAccumulator *accum, KeyArray *ka,
+ Page page, OffsetNumber startoff)
+{
+ ItemPointerData heapptr;
+ OffsetNumber i,
+ maxoff;
+ OffsetNumber attrnum;
+
+ /* reset *ka to empty */
+ ka->nvalues = 0;
+
+ maxoff = PageGetMaxOffsetNumber(page);
+ Assert(maxoff >= FirstOffsetNumber);
+ ItemPointerSetInvalid(&heapptr);
+ attrnum = 0;
+
+ for (i = startoff; i <= maxoff; i = OffsetNumberNext(i))
+ {
+ IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
+ OffsetNumber curattnum;
+ Datum curkey;
+ GinNullCategory curcategory;
+
+ /* Check for change of heap TID or attnum */
+ curattnum = gintuple_get_attrnum(accum->ginstate, itup);
+
+ if (!ItemPointerIsValid(&heapptr))
+ {
+ heapptr = itup->t_tid;
+ attrnum = curattnum;
+ }
+ else if (!(ItemPointerEquals(&heapptr, &itup->t_tid) &&
+ curattnum == attrnum))
+ {
+ /*
+ * ginInsertBAEntries can insert several datums per call, but only
+ * for one heap tuple and one column. So call it at a boundary,
+ * and reset ka.
+ */
+ ginInsertBAEntries(accum, &heapptr, attrnum,
+ ka->keys, ka->categories, ka->nvalues);
+ ka->nvalues = 0;
+ heapptr = itup->t_tid;
+ attrnum = curattnum;
+ }
+
+ /* Add key to KeyArray */
+ curkey = gintuple_get_key(accum->ginstate, itup, &curcategory);
+ addDatum(ka, curkey, curcategory);
+ }
+
+ /* Dump out all remaining keys */
+ ginInsertBAEntries(accum, &heapptr, attrnum,
+ ka->keys, ka->categories, ka->nvalues);
+}
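
processPendingPage accumulates keys until the heap TID or the attribute number changes, then flushes the whole group in one call. Below is a standalone sketch of that boundary-flush pattern; it is not part of the patch, the entry values are made up, and flush_group is a stand-in for ginInsertBAEntries.

#include <stdio.h>

typedef struct
{
	int			tid;			/* stand-in for the heap ItemPointer */
	int			attnum;
	int			key;
} pending_entry;

static void
flush_group(int tid, int attnum, const int *keys, int nkeys)
{
	printf("flush tid=%d attnum=%d keys:", tid, attnum);
	for (int i = 0; i < nkeys; i++)
		printf(" %d", keys[i]);
	printf("\n");
}

int
main(void)
{
	pending_entry entries[] = {
		{100, 1, 7}, {100, 1, 9}, {100, 2, 3}, {101, 1, 7}, {101, 1, 8},
	};
	int			nentries = 5;
	int			keys[8];
	int			nkeys = 0;
	int			curtid = -1,
				curatt = -1;

	for (int i = 0; i < nentries; i++)
	{
		if (curtid < 0)
		{
			curtid = entries[i].tid;
			curatt = entries[i].attnum;
		}
		else if (entries[i].tid != curtid || entries[i].attnum != curatt)
		{
			flush_group(curtid, curatt, keys, nkeys);	/* boundary reached */
			nkeys = 0;
			curtid = entries[i].tid;
			curatt = entries[i].attnum;
		}
		keys[nkeys++] = entries[i].key;
	}
	flush_group(curtid, curatt, keys, nkeys);	/* flush the remainder */
	return 0;
}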
+
+/*
+ * Move tuples from pending pages into the regular GIN structure.
+ *
+ * At first glance this looks completely non-crash-safe. But if we crash
+ * after posting entries to the main index and before removing them from the
+ * pending list, it's okay: when we redo the posting later on, nothing bad
+ * will happen.
+ *
+ * fill_fsm indicates that ginInsertCleanup should add deleted pages to the
+ * FSM; otherwise the caller is responsible for putting deleted pages into
+ * the FSM.
+ *
+ * If stats isn't null, we count deleted pending pages in it.
+ */
+void
+ginInsertCleanup(GinState *ginstate, bool full_clean,
+ bool fill_fsm, bool forceCleanup,
+ IndexBulkDeleteResult *stats)
+{
+ Relation index = ginstate->index;
+ Buffer metabuffer,
+ buffer;
+ Page metapage,
+ page;
+ GinMetaPageData *metadata;
+ MemoryContext opCtx,
+ oldCtx;
+ BuildAccumulator accum;
+ KeyArray datums;
+ BlockNumber blkno,
+ blknoFinish;
+ bool cleanupFinish = false;
+ bool fsm_vac = false;
+ Size workMemory;
+
+ /*
+	 * We would like to prevent concurrent cleanup processes. For that we lock
+	 * the metapage in exclusive mode using a LockPage() call. Nothing else
+	 * takes that lock on the metapage, so concurrent insertion into the
+	 * pending list remains possible.
+ */
+
+ if (forceCleanup)
+ {
+ /*
+ * We are called from [auto]vacuum/analyze or gin_clean_pending_list()
+		 * and we want to wait for any concurrent cleanup to finish.
+ */
+ LockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock);
+ workMemory =
+ (IsAutoVacuumWorkerProcess() && autovacuum_work_mem != -1) ?
+ autovacuum_work_mem : maintenance_work_mem;
+ }
+ else
+ {
+ /*
+		 * We are called from a regular insert; if we see a concurrent cleanup
+		 * in progress, just exit in the hope that the concurrent process will
+		 * clean up the pending list.
+ */
+ if (!ConditionalLockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock))
+ return;
+ workMemory = work_mem;
+ }
+
+ metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO);
+ LockBuffer(metabuffer, GIN_SHARE);
+ metapage = BufferGetPage(metabuffer);
+ metadata = GinPageGetMeta(metapage);
+
+ if (metadata->head == InvalidBlockNumber)
+ {
+ /* Nothing to do */
+ UnlockReleaseBuffer(metabuffer);
+ UnlockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock);
+ return;
+ }
+
+ /*
+ * Remember a tail page to prevent infinite cleanup if other backends add
+	 * new tuples faster than we can clean up.
+ */
+ blknoFinish = metadata->tail;
+
+ /*
+ * Read and lock head of pending list
+ */
+ blkno = metadata->head;
+ buffer = ReadBuffer(index, blkno);
+ LockBuffer(buffer, GIN_SHARE);
+ page = BufferGetPage(buffer);
+
+ LockBuffer(metabuffer, GIN_UNLOCK);
+
+ /*
+ * Initialize. All temporary space will be in opCtx
+ */
+ opCtx = AllocSetContextCreate(CurrentMemoryContext,
+ "GIN insert cleanup temporary context",
+ ALLOCSET_DEFAULT_SIZES);
+
+ oldCtx = MemoryContextSwitchTo(opCtx);
+
+ initKeyArray(&datums, 128);
+ ginInitBA(&accum);
+ accum.ginstate = ginstate;
+
+ /*
+ * At the top of this loop, we have pin and lock on the current page of
+ * the pending list. However, we'll release that before exiting the loop.
+ * Note we also have pin but not lock on the metapage.
+ */
+ for (;;)
+ {
+ Assert(!GinPageIsDeleted(page));
+
+ /*
+		 * Have we reached the page that was the tail when we started our
+		 * cleanup? If the caller asked us to clean up the whole pending list,
+		 * ignore the old tail; we will keep working until the list becomes
+		 * empty.
+ */
+ if (blkno == blknoFinish && full_clean == false)
+ cleanupFinish = true;
+
+ /*
+ * read page's datums into accum
+ */
+ processPendingPage(&accum, &datums, page, FirstOffsetNumber);
+
+ vacuum_delay_point();
+
+ /*
+ * Is it time to flush memory to disk? Flush if we are at the end of
+ * the pending list, or if we have a full row and memory is getting
+ * full.
+ */
+ if (GinPageGetOpaque(page)->rightlink == InvalidBlockNumber ||
+ (GinPageHasFullRow(page) &&
+ (accum.allocatedMemory >= workMemory * 1024L)))
+ {
+ ItemPointerData *list;
+ uint32 nlist;
+ Datum key;
+ GinNullCategory category;
+ OffsetNumber maxoff,
+ attnum;
+
+ /*
+			 * Unlock the current page to improve concurrency. Changes to the
+			 * page will be detected later by comparing maxoff after the
+			 * memory flush completes.
+ */
+ maxoff = PageGetMaxOffsetNumber(page);
+ LockBuffer(buffer, GIN_UNLOCK);
+
+ /*
+			 * Moving the collected data into the regular structure can take a
+			 * significant amount of time, so run it without holding a lock on
+			 * the pending list.
+ */
+ ginBeginBAScan(&accum);
+ while ((list = ginGetBAEntry(&accum,
+ &attnum, &key, &category, &nlist)) != NULL)
+ {
+ ginEntryInsert(ginstate, attnum, key, category,
+ list, nlist, NULL);
+ vacuum_delay_point();
+ }
+
+ /*
+ * Lock the whole list to remove pages
+ */
+ LockBuffer(metabuffer, GIN_EXCLUSIVE);
+ LockBuffer(buffer, GIN_SHARE);
+
+ Assert(!GinPageIsDeleted(page));
+
+ /*
+ * While we left the page unlocked, more stuff might have gotten
+ * added to it. If so, process those entries immediately. There
+ * shouldn't be very many, so we don't worry about the fact that
+ * we're doing this with exclusive lock. Insertion algorithm
+ * guarantees that inserted row(s) will not continue on next page.
+ * NOTE: intentionally no vacuum_delay_point in this loop.
+ */
+ if (PageGetMaxOffsetNumber(page) != maxoff)
+ {
+ ginInitBA(&accum);
+ processPendingPage(&accum, &datums, page, maxoff + 1);
+
+ ginBeginBAScan(&accum);
+ while ((list = ginGetBAEntry(&accum,
+ &attnum, &key, &category, &nlist)) != NULL)
+ ginEntryInsert(ginstate, attnum, key, category,
+ list, nlist, NULL);
+ }
+
+ /*
+ * Remember next page - it will become the new list head
+ */
+ blkno = GinPageGetOpaque(page)->rightlink;
+ UnlockReleaseBuffer(buffer); /* shiftList will do exclusive
+ * locking */
+
+ /*
+			 * Remove the read pages from the pending list; at this point all
+			 * of their content is in the regular structure.
+ */
+ shiftList(index, metabuffer, blkno, fill_fsm, stats);
+
+ /* At this point, some pending pages have been freed up */
+ fsm_vac = true;
+
+ Assert(blkno == metadata->head);
+ LockBuffer(metabuffer, GIN_UNLOCK);
+
+ /*
+			 * If we removed the whole pending list, or we have cleaned up to
+			 * the tail we remembered at the start of cleanup, just exit.
+ */
+ if (blkno == InvalidBlockNumber || cleanupFinish)
+ break;
+
+ /*
+ * release memory used so far and reinit state
+ */
+ MemoryContextReset(opCtx);
+ initKeyArray(&datums, datums.maxvalues);
+ ginInitBA(&accum);
+ }
+ else
+ {
+ blkno = GinPageGetOpaque(page)->rightlink;
+ UnlockReleaseBuffer(buffer);
+ }
+
+ /*
+ * Read next page in pending list
+ */
+ vacuum_delay_point();
+ buffer = ReadBuffer(index, blkno);
+ LockBuffer(buffer, GIN_SHARE);
+ page = BufferGetPage(buffer);
+ }
+
+ UnlockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock);
+ ReleaseBuffer(metabuffer);
+
+ /*
+ * As pending list pages can have a high churn rate, it is desirable to
+ * recycle them immediately to the FreeSpaceMap when ordinary backends
+ * clean the list.
+ */
+ if (fsm_vac && fill_fsm)
+ IndexFreeSpaceMapVacuum(index);
+
+ /* Clean up temporary space */
+ MemoryContextSwitchTo(oldCtx);
+ MemoryContextDelete(opCtx);
+}
+
+/*
+ * SQL-callable function to clean the insert pending list
+ */
+Datum
+gin_clean_pending_list(PG_FUNCTION_ARGS)
+{
+ Oid indexoid = PG_GETARG_OID(0);
+ Relation indexRel = index_open(indexoid, RowExclusiveLock);
+ IndexBulkDeleteResult stats;
+ GinState ginstate;
+
+ if (RecoveryInProgress())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is in progress"),
+ errhint("GIN pending list cannot be cleaned up during recovery.")));
+
+ /* Must be a GIN index */
+ if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
+ indexRel->rd_rel->relam != GIN_AM_OID)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("\"%s\" is not a GIN index",
+ RelationGetRelationName(indexRel))));
+
+ /*
+ * Reject attempts to read non-local temporary relations; we would be
+ * likely to get wrong data since we have no visibility into the owning
+ * session's local buffers.
+ */
+ if (RELATION_IS_OTHER_TEMP(indexRel))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot access temporary indexes of other sessions")));
+
+ /* User must own the index (comparable to privileges needed for VACUUM) */
+ if (!pg_class_ownercheck(indexoid, GetUserId()))
+ aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,
+ RelationGetRelationName(indexRel));
+
+ memset(&stats, 0, sizeof(stats));
+ initGinState(&ginstate, indexRel);
+ ginInsertCleanup(&ginstate, true, true, true, &stats);
+
+ index_close(indexRel, RowExclusiveLock);
+
+ PG_RETURN_INT64((int64) stats.pages_deleted);
+}
diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c
new file mode 100644
index 0000000..03191e0
--- /dev/null
+++ b/src/backend/access/gin/ginget.c
@@ -0,0 +1,1970 @@
+/*-------------------------------------------------------------------------
+ *
+ * ginget.c
+ * fetch tuples from a GIN scan.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gin/ginget.c
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/gin_private.h"
+#include "access/relscan.h"
+#include "miscadmin.h"
+#include "storage/predicate.h"
+#include "utils/datum.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+/* GUC parameter */
+int GinFuzzySearchLimit = 0;
+
+typedef struct pendingPosition
+{
+ Buffer pendingBuffer;
+ OffsetNumber firstOffset;
+ OffsetNumber lastOffset;
+ ItemPointerData item;
+ bool *hasMatchKey;
+} pendingPosition;
+
+
+/*
+ * Advance to the next page if the current offset is out of bounds.
+ */
+static bool
+moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack, Snapshot snapshot)
+{
+ Page page = BufferGetPage(stack->buffer);
+
+ if (stack->off > PageGetMaxOffsetNumber(page))
+ {
+ /*
+		 * We scanned the whole page, so move to the page to the right.
+ */
+ if (GinPageRightMost(page))
+ return false; /* no more pages */
+
+ stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE);
+ stack->blkno = BufferGetBlockNumber(stack->buffer);
+ stack->off = FirstOffsetNumber;
+ PredicateLockPage(btree->index, stack->blkno, snapshot);
+ }
+
+ return true;
+}
+
+/*
+ * Scan all pages of a posting tree and save all its heap ItemPointers
+ * in scanEntry->matchBitmap
+ */
+static void
+scanPostingTree(Relation index, GinScanEntry scanEntry,
+ BlockNumber rootPostingTree, Snapshot snapshot)
+{
+ GinBtreeData btree;
+ GinBtreeStack *stack;
+ Buffer buffer;
+ Page page;
+
+ /* Descend to the leftmost leaf page */
+ stack = ginScanBeginPostingTree(&btree, index, rootPostingTree, snapshot);
+ buffer = stack->buffer;
+
+ IncrBufferRefCount(buffer); /* prevent unpin in freeGinBtreeStack */
+
+ freeGinBtreeStack(stack);
+
+ /*
+	 * The loop iterates through all leaf pages of the posting tree.
+ */
+ for (;;)
+ {
+ page = BufferGetPage(buffer);
+ if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0)
+ {
+ int n = GinDataLeafPageGetItemsToTbm(page, scanEntry->matchBitmap);
+
+ scanEntry->predictNumberResult += n;
+ }
+
+ if (GinPageRightMost(page))
+ break; /* no more pages */
+
+ buffer = ginStepRight(buffer, index, GIN_SHARE);
+ }
+
+ UnlockReleaseBuffer(buffer);
+}
+
+/*
+ * Collects TIDs into scanEntry->matchBitmap for all heap tuples that
+ * match the search entry. This supports three different match modes:
+ *
+ * 1. Partial-match support: scan from current point until the
+ * comparePartialFn says we're done.
+ * 2. SEARCH_MODE_ALL: scan from current point (which should be first
+ * key for the current attnum) until we hit null items or end of attnum
+ * 3. SEARCH_MODE_EVERYTHING: scan from current point (which should be first
+ * key for the current attnum) until we hit end of attnum
+ *
+ * Returns true if done, false if it's necessary to restart scan from scratch
+ */
+static bool
+collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
+ GinScanEntry scanEntry, Snapshot snapshot)
+{
+ OffsetNumber attnum;
+ Form_pg_attribute attr;
+
+ /* Initialize empty bitmap result */
+ scanEntry->matchBitmap = tbm_create(work_mem * 1024L, NULL);
+
+ /* Null query cannot partial-match anything */
+ if (scanEntry->isPartialMatch &&
+ scanEntry->queryCategory != GIN_CAT_NORM_KEY)
+ return true;
+
+ /* Locate tupdesc entry for key column (for attbyval/attlen data) */
+ attnum = scanEntry->attnum;
+ attr = TupleDescAttr(btree->ginstate->origTupdesc, attnum - 1);
+
+ /*
+	 * Predicate-lock the entry leaf page; the following pages will be locked
+	 * by moveRightIfItNeeded()
+ */
+ PredicateLockPage(btree->index, stack->buffer, snapshot);
+
+ for (;;)
+ {
+ Page page;
+ IndexTuple itup;
+ Datum idatum;
+ GinNullCategory icategory;
+
+ /*
+		 * stack->off points to the entry of interest; the buffer is already locked
+ */
+ if (moveRightIfItNeeded(btree, stack, snapshot) == false)
+ return true;
+
+ page = BufferGetPage(stack->buffer);
+ TestForOldSnapshot(snapshot, btree->index, page);
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
+
+ /*
+		 * If the tuple belongs to another attribute, stop the scan
+ */
+ if (gintuple_get_attrnum(btree->ginstate, itup) != attnum)
+ return true;
+
+ /* Safe to fetch attribute value */
+ idatum = gintuple_get_key(btree->ginstate, itup, &icategory);
+
+ /*
+ * Check for appropriate scan stop conditions
+ */
+ if (scanEntry->isPartialMatch)
+ {
+ int32 cmp;
+
+ /*
+ * In partial match, stop scan at any null (including
+ * placeholders); partial matches never match nulls
+ */
+ if (icategory != GIN_CAT_NORM_KEY)
+ return true;
+
+ /*----------
+			 * Check for a partial match.
+ * case cmp == 0 => match
+ * case cmp > 0 => not match and finish scan
+ * case cmp < 0 => not match and continue scan
+ *----------
+ */
+ cmp = DatumGetInt32(FunctionCall4Coll(&btree->ginstate->comparePartialFn[attnum - 1],
+ btree->ginstate->supportCollation[attnum - 1],
+ scanEntry->queryKey,
+ idatum,
+ UInt16GetDatum(scanEntry->strategy),
+ PointerGetDatum(scanEntry->extra_data)));
+
+ if (cmp > 0)
+ return true;
+ else if (cmp < 0)
+ {
+ stack->off++;
+ continue;
+ }
+ }
+ else if (scanEntry->searchMode == GIN_SEARCH_MODE_ALL)
+ {
+ /*
+ * In ALL mode, we are not interested in null items, so we can
+ * stop if we get to a null-item placeholder (which will be the
+ * last entry for a given attnum). We do want to include NULL_KEY
+ * and EMPTY_ITEM entries, though.
+ */
+ if (icategory == GIN_CAT_NULL_ITEM)
+ return true;
+ }
+
+ /*
+ * OK, we want to return the TIDs listed in this entry.
+ */
+ if (GinIsPostingTree(itup))
+ {
+ BlockNumber rootPostingTree = GinGetPostingTree(itup);
+
+ /*
+			 * We must unlock the current page (but keep it pinned) during the
+			 * tree scan, to prevent deadlock with vacuum processes.
+			 *
+			 * We save the current entry value (idatum) so that we can
+			 * re-find our tuple after re-locking.
+ */
+ if (icategory == GIN_CAT_NORM_KEY)
+ idatum = datumCopy(idatum, attr->attbyval, attr->attlen);
+
+ LockBuffer(stack->buffer, GIN_UNLOCK);
+
+ /*
+ * Acquire predicate lock on the posting tree. We already hold a
+ * lock on the entry page, but insertions to the posting tree
+ * don't check for conflicts on that level.
+ */
+ PredicateLockPage(btree->index, rootPostingTree, snapshot);
+
+ /* Collect all the TIDs in this entry's posting tree */
+ scanPostingTree(btree->index, scanEntry, rootPostingTree,
+ snapshot);
+
+ /*
+			 * Re-lock the entry page.  An insertion might have occurred while
+			 * it was unlocked, so we need to re-find our position.
+ */
+ LockBuffer(stack->buffer, GIN_SHARE);
+ page = BufferGetPage(stack->buffer);
+ if (!GinPageIsLeaf(page))
+ {
+ /*
+				 * The root page became non-leaf while it was unlocked.  We
+				 * will start over; this situation is rare, since the root can
+				 * become non-leaf only once in the life of an index.
+ */
+ return false;
+ }
+
+ /* Search forward to re-find idatum */
+ for (;;)
+ {
+ if (moveRightIfItNeeded(btree, stack, snapshot) == false)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to re-find tuple within index \"%s\"",
+ RelationGetRelationName(btree->index))));
+
+ page = BufferGetPage(stack->buffer);
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
+
+ if (gintuple_get_attrnum(btree->ginstate, itup) == attnum)
+ {
+ Datum newDatum;
+ GinNullCategory newCategory;
+
+ newDatum = gintuple_get_key(btree->ginstate, itup,
+ &newCategory);
+
+ if (ginCompareEntries(btree->ginstate, attnum,
+ newDatum, newCategory,
+ idatum, icategory) == 0)
+ break; /* Found! */
+ }
+
+ stack->off++;
+ }
+
+ if (icategory == GIN_CAT_NORM_KEY && !attr->attbyval)
+ pfree(DatumGetPointer(idatum));
+ }
+ else
+ {
+ ItemPointer ipd;
+ int nipd;
+
+ ipd = ginReadTuple(btree->ginstate, scanEntry->attnum, itup, &nipd);
+ tbm_add_tuples(scanEntry->matchBitmap, ipd, nipd, false);
+ scanEntry->predictNumberResult += GinGetNPosting(itup);
+ pfree(ipd);
+ }
+
+ /*
+ * Done with this entry, go to the next
+ */
+ stack->off++;
+ }
+}
+
+/*
+ * The start* functions set up the initial state of a scan: find the correct
+ * buffer and pin it.
+ */
+static void
+startScanEntry(GinState *ginstate, GinScanEntry entry, Snapshot snapshot)
+{
+ GinBtreeData btreeEntry;
+ GinBtreeStack *stackEntry;
+ Page page;
+ bool needUnlock;
+
+restartScanEntry:
+ entry->buffer = InvalidBuffer;
+ ItemPointerSetMin(&entry->curItem);
+ entry->offset = InvalidOffsetNumber;
+ if (entry->list)
+ pfree(entry->list);
+ entry->list = NULL;
+ entry->nlist = 0;
+ entry->matchBitmap = NULL;
+ entry->matchResult = NULL;
+ entry->reduceResult = false;
+ entry->predictNumberResult = 0;
+
+ /*
+	 * Find the entry, and either begin a scan of its posting tree or just
+	 * load its posting list into memory.
+ */
+ ginPrepareEntryScan(&btreeEntry, entry->attnum,
+ entry->queryKey, entry->queryCategory,
+ ginstate);
+ stackEntry = ginFindLeafPage(&btreeEntry, true, false, snapshot);
+ page = BufferGetPage(stackEntry->buffer);
+
+ /* ginFindLeafPage() will have already checked snapshot age. */
+ needUnlock = true;
+
+ entry->isFinished = true;
+
+ if (entry->isPartialMatch ||
+ entry->queryCategory == GIN_CAT_EMPTY_QUERY)
+ {
+ /*
+ * btreeEntry.findItem locates the first item >= given search key.
+ * (For GIN_CAT_EMPTY_QUERY, it will find the leftmost index item
+ * because of the way the GIN_CAT_EMPTY_QUERY category code is
+ * assigned.) We scan forward from there and collect all TIDs needed
+ * for the entry type.
+ */
+ btreeEntry.findItem(&btreeEntry, stackEntry);
+ if (collectMatchBitmap(&btreeEntry, stackEntry, entry, snapshot)
+ == false)
+ {
+ /*
+			 * The GIN tree was seriously restructured, so clean up all data
+			 * found so far and rescan.  See comments near 'return false' in
+ * collectMatchBitmap()
+ */
+ if (entry->matchBitmap)
+ {
+ if (entry->matchIterator)
+ tbm_end_iterate(entry->matchIterator);
+ entry->matchIterator = NULL;
+ tbm_free(entry->matchBitmap);
+ entry->matchBitmap = NULL;
+ }
+ LockBuffer(stackEntry->buffer, GIN_UNLOCK);
+ freeGinBtreeStack(stackEntry);
+ goto restartScanEntry;
+ }
+
+ if (entry->matchBitmap && !tbm_is_empty(entry->matchBitmap))
+ {
+ entry->matchIterator = tbm_begin_iterate(entry->matchBitmap);
+ entry->isFinished = false;
+ }
+ }
+ else if (btreeEntry.findItem(&btreeEntry, stackEntry))
+ {
+ IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off));
+
+ if (GinIsPostingTree(itup))
+ {
+ BlockNumber rootPostingTree = GinGetPostingTree(itup);
+ GinBtreeStack *stack;
+ Page page;
+ ItemPointerData minItem;
+
+ /*
+ * This is an equality scan, so lock the root of the posting tree.
+ * It represents a lock on the exact key value, and covers all the
+ * items in the posting tree.
+ */
+ PredicateLockPage(ginstate->index, rootPostingTree, snapshot);
+
+ /*
+			 * We must unlock the entry page before touching the posting tree,
+			 * to prevent deadlocks with vacuum processes.  Because the entry
+			 * is never deleted from the page and a posting tree is never
+			 * reduced back to a posting list, we can unlock the page once we
+			 * have the block number of the posting tree's root.
+ */
+ LockBuffer(stackEntry->buffer, GIN_UNLOCK);
+ needUnlock = false;
+
+ stack = ginScanBeginPostingTree(&entry->btree, ginstate->index,
+ rootPostingTree, snapshot);
+ entry->buffer = stack->buffer;
+
+ /*
+			 * We keep the buffer pinned to prevent deletion of the page during
+			 * the scan.  See GIN's vacuum implementation.  The reference count
+			 * is increased to keep the buffer pinned after the
+			 * freeGinBtreeStack() call.
+ */
+ IncrBufferRefCount(entry->buffer);
+
+ page = BufferGetPage(entry->buffer);
+
+ /*
+ * Load the first page into memory.
+ */
+ ItemPointerSetMin(&minItem);
+ entry->list = GinDataLeafPageGetItems(page, &entry->nlist, minItem);
+
+ entry->predictNumberResult = stack->predictNumber * entry->nlist;
+
+ LockBuffer(entry->buffer, GIN_UNLOCK);
+ freeGinBtreeStack(stack);
+ entry->isFinished = false;
+ }
+ else
+ {
+ /*
+ * Lock the entry leaf page. This is more coarse-grained than
+ * necessary, because it will conflict with any insertions that
+ * land on the same leaf page, not only the exact key we searched
+ * for. But locking an individual tuple would require updating
+ * that lock whenever it moves because of insertions or vacuums,
+ * which seems too complicated.
+ */
+ PredicateLockPage(ginstate->index,
+ BufferGetBlockNumber(stackEntry->buffer),
+ snapshot);
+ if (GinGetNPosting(itup) > 0)
+ {
+ entry->list = ginReadTuple(ginstate, entry->attnum, itup,
+ &entry->nlist);
+ entry->predictNumberResult = entry->nlist;
+
+ entry->isFinished = false;
+ }
+ }
+ }
+ else
+ {
+ /*
+ * No entry found. Predicate lock the leaf page, to lock the place
+ * where the entry would've been, had there been one.
+ */
+ PredicateLockPage(ginstate->index,
+ BufferGetBlockNumber(stackEntry->buffer), snapshot);
+ }
+
+ if (needUnlock)
+ LockBuffer(stackEntry->buffer, GIN_UNLOCK);
+ freeGinBtreeStack(stackEntry);
+}
+
+/*
+ * Comparison function for scan entry indexes. Sorts by predictNumberResult,
+ * least frequent items first.
+ */
+static int
+entryIndexByFrequencyCmp(const void *a1, const void *a2, void *arg)
+{
+ const GinScanKey key = (const GinScanKey) arg;
+ int i1 = *(const int *) a1;
+ int i2 = *(const int *) a2;
+ uint32 n1 = key->scanEntry[i1]->predictNumberResult;
+ uint32 n2 = key->scanEntry[i2]->predictNumberResult;
+
+ if (n1 < n2)
+ return -1;
+ else if (n1 == n2)
+ return 0;
+ else
+ return 1;
+}
+
+static void
+startScanKey(GinState *ginstate, GinScanOpaque so, GinScanKey key)
+{
+ MemoryContext oldCtx = CurrentMemoryContext;
+ int i;
+ int j;
+ int *entryIndexes;
+
+ ItemPointerSetMin(&key->curItem);
+ key->curItemMatches = false;
+ key->recheckCurItem = false;
+ key->isFinished = false;
+
+ /*
+ * Divide the entries into two distinct sets: required and additional.
+ * Additional entries are not enough for a match alone, without any items
+ * from the required set, but are needed by the consistent function to
+ * decide if an item matches. When scanning, we can skip over items from
+ * additional entries that have no corresponding matches in any of the
+ * required entries. That speeds up queries like "frequent & rare"
+ * considerably, if the frequent term can be put in the additional set.
+ *
+	 * There can be many legal ways to divide the entries into these two
+ * sets. A conservative division is to just put everything in the required
+ * set, but the more you can put in the additional set, the more you can
+ * skip during the scan. To maximize skipping, we try to put as many
+ * frequent items as possible into additional, and less frequent ones into
+ * required. To do that, sort the entries by frequency
+ * (predictNumberResult), and put entries into the required set in that
+ * order, until the consistent function says that none of the remaining
+ * entries can form a match, without any items from the required set. The
+ * rest go to the additional set.
+ *
+ * Exclude-only scan keys are known to have no required entries.
+ */
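+	/*
+	 * As a purely hypothetical illustration: for a query like
+	 * "rare & frequent" with predicted frequencies of, say, 10 and 100000,
+	 * the sort puts "rare" first.  Passing just "rare" as FALSE and
+	 * "frequent" as MAYBE already makes an AND-style consistent function
+	 * return GIN_FALSE, so "rare" alone forms the required set and
+	 * "frequent" goes to the additional set, letting the scan skip
+	 * "frequent" items that have no "rare" match.
+	 */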
+ if (key->excludeOnly)
+ {
+ MemoryContextSwitchTo(so->keyCtx);
+
+ key->nrequired = 0;
+ key->nadditional = key->nentries;
+ key->additionalEntries = palloc(key->nadditional * sizeof(GinScanEntry));
+ for (i = 0; i < key->nadditional; i++)
+ key->additionalEntries[i] = key->scanEntry[i];
+ }
+ else if (key->nentries > 1)
+ {
+ MemoryContextSwitchTo(so->tempCtx);
+
+ entryIndexes = (int *) palloc(sizeof(int) * key->nentries);
+ for (i = 0; i < key->nentries; i++)
+ entryIndexes[i] = i;
+ qsort_arg(entryIndexes, key->nentries, sizeof(int),
+ entryIndexByFrequencyCmp, key);
+
+ for (i = 0; i < key->nentries - 1; i++)
+ {
+ /* Pass all entries <= i as FALSE, and the rest as MAYBE */
+ for (j = 0; j <= i; j++)
+ key->entryRes[entryIndexes[j]] = GIN_FALSE;
+ for (j = i + 1; j < key->nentries; j++)
+ key->entryRes[entryIndexes[j]] = GIN_MAYBE;
+
+ if (key->triConsistentFn(key) == GIN_FALSE)
+ break;
+ }
+ /* i is now the last required entry. */
+
+ MemoryContextSwitchTo(so->keyCtx);
+
+ key->nrequired = i + 1;
+ key->nadditional = key->nentries - key->nrequired;
+ key->requiredEntries = palloc(key->nrequired * sizeof(GinScanEntry));
+ key->additionalEntries = palloc(key->nadditional * sizeof(GinScanEntry));
+
+ j = 0;
+ for (i = 0; i < key->nrequired; i++)
+ key->requiredEntries[i] = key->scanEntry[entryIndexes[j++]];
+ for (i = 0; i < key->nadditional; i++)
+ key->additionalEntries[i] = key->scanEntry[entryIndexes[j++]];
+
+ /* clean up after consistentFn calls (also frees entryIndexes) */
+ MemoryContextReset(so->tempCtx);
+ }
+ else
+ {
+ MemoryContextSwitchTo(so->keyCtx);
+
+ key->nrequired = 1;
+ key->nadditional = 0;
+ key->requiredEntries = palloc(1 * sizeof(GinScanEntry));
+ key->requiredEntries[0] = key->scanEntry[0];
+ }
+ MemoryContextSwitchTo(oldCtx);
+}
+
+static void
+startScan(IndexScanDesc scan)
+{
+ GinScanOpaque so = (GinScanOpaque) scan->opaque;
+ GinState *ginstate = &so->ginstate;
+ uint32 i;
+
+ for (i = 0; i < so->totalentries; i++)
+ startScanEntry(ginstate, so->entries[i], scan->xs_snapshot);
+
+ if (GinFuzzySearchLimit > 0)
+ {
+ /*
+		 * If every entry predicts more results than the threshold, we will
+		 * try to reduce the result set.  We hope (and it is only a hope; for
+		 * array-intersection operations the supposition need not hold) that
+		 * the total result will not exceed the minimal predictNumberResult.
+ */
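+		/*
+		 * For example (hypothetical numbers): with gin_fuzzy_search_limit =
+		 * 1000 and two entries predicting 50000 and 80000 results, both
+		 * predictions exceed 2 * 1000, so each is divided by two and
+		 * reduceResult is set, making entryGetItem() randomly drop items via
+		 * dropItem().
+		 */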
+ bool reduce = true;
+
+ for (i = 0; i < so->totalentries; i++)
+ {
+ if (so->entries[i]->predictNumberResult <= so->totalentries * GinFuzzySearchLimit)
+ {
+ reduce = false;
+ break;
+ }
+ }
+ if (reduce)
+ {
+ for (i = 0; i < so->totalentries; i++)
+ {
+ so->entries[i]->predictNumberResult /= so->totalentries;
+ so->entries[i]->reduceResult = true;
+ }
+ }
+ }
+
+ /*
+ * Now that we have the estimates for the entry frequencies, finish
+ * initializing the scan keys.
+ */
+ for (i = 0; i < so->nkeys; i++)
+ startScanKey(ginstate, so, so->keys + i);
+}
+
+/*
+ * Load the next batch of item pointers from a posting tree.
+ *
+ * Note that we copy the page into GinScanEntry->list array and unlock it, but
+ * keep it pinned to prevent interference with vacuum.
+ */
+static void
+entryLoadMoreItems(GinState *ginstate, GinScanEntry entry,
+ ItemPointerData advancePast, Snapshot snapshot)
+{
+ Page page;
+ int i;
+ bool stepright;
+
+ if (!BufferIsValid(entry->buffer))
+ {
+ entry->isFinished = true;
+ return;
+ }
+
+ /*
+ * We have two strategies for finding the correct page: step right from
+ * the current page, or descend the tree again from the root. If
+ * advancePast equals the current item, the next matching item should be
+ * on the next page, so we step right. Otherwise, descend from root.
+ */
+ if (ginCompareItemPointers(&entry->curItem, &advancePast) == 0)
+ {
+ stepright = true;
+ LockBuffer(entry->buffer, GIN_SHARE);
+ }
+ else
+ {
+ GinBtreeStack *stack;
+
+ ReleaseBuffer(entry->buffer);
+
+ /*
+ * Set the search key, and find the correct leaf page.
+ */
+ if (ItemPointerIsLossyPage(&advancePast))
+ {
+ ItemPointerSet(&entry->btree.itemptr,
+ GinItemPointerGetBlockNumber(&advancePast) + 1,
+ FirstOffsetNumber);
+ }
+ else
+ {
+ ItemPointerSet(&entry->btree.itemptr,
+ GinItemPointerGetBlockNumber(&advancePast),
+ OffsetNumberNext(GinItemPointerGetOffsetNumber(&advancePast)));
+ }
+ entry->btree.fullScan = false;
+ stack = ginFindLeafPage(&entry->btree, true, false, snapshot);
+
+ /* we don't need the stack, just the buffer. */
+ entry->buffer = stack->buffer;
+ IncrBufferRefCount(entry->buffer);
+ freeGinBtreeStack(stack);
+ stepright = false;
+ }
+
+ elog(DEBUG2, "entryLoadMoreItems, %u/%u, skip: %d",
+ GinItemPointerGetBlockNumber(&advancePast),
+ GinItemPointerGetOffsetNumber(&advancePast),
+ !stepright);
+
+ page = BufferGetPage(entry->buffer);
+ for (;;)
+ {
+ entry->offset = InvalidOffsetNumber;
+ if (entry->list)
+ {
+ pfree(entry->list);
+ entry->list = NULL;
+ entry->nlist = 0;
+ }
+
+ if (stepright)
+ {
+ /*
+ * We've processed all the entries on this page. If it was the
+ * last page in the tree, we're done.
+ */
+ if (GinPageRightMost(page))
+ {
+ UnlockReleaseBuffer(entry->buffer);
+ entry->buffer = InvalidBuffer;
+ entry->isFinished = true;
+ return;
+ }
+
+ /*
+			 * Step to the next page, following the right link; then find the
+			 * first ItemPointer greater than advancePast.
+ */
+ entry->buffer = ginStepRight(entry->buffer,
+ ginstate->index,
+ GIN_SHARE);
+ page = BufferGetPage(entry->buffer);
+ }
+ stepright = true;
+
+ if (GinPageGetOpaque(page)->flags & GIN_DELETED)
+ continue; /* page was deleted by concurrent vacuum */
+
+ /*
+ * The first item > advancePast might not be on this page, but
+ * somewhere to the right, if the page was split, or a non-match from
+ * another key in the query allowed us to skip some items from this
+ * entry. Keep following the right-links until we re-find the correct
+ * page.
+ */
+ if (!GinPageRightMost(page) &&
+ ginCompareItemPointers(&advancePast, GinDataPageGetRightBound(page)) >= 0)
+ {
+ /*
+			 * The item we're looking for is > the right bound of the page,
+			 * so it can't be on this page.
+ */
+ continue;
+ }
+
+ entry->list = GinDataLeafPageGetItems(page, &entry->nlist, advancePast);
+
+ for (i = 0; i < entry->nlist; i++)
+ {
+ if (ginCompareItemPointers(&advancePast, &entry->list[i]) < 0)
+ {
+ entry->offset = i;
+
+ if (GinPageRightMost(page))
+ {
+ /* after processing the copied items, we're done. */
+ UnlockReleaseBuffer(entry->buffer);
+ entry->buffer = InvalidBuffer;
+ }
+ else
+ LockBuffer(entry->buffer, GIN_UNLOCK);
+ return;
+ }
+ }
+ }
+}
+
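+/*
+ * gin_rand() yields a uniform value in [0, 1]; dropItem(e) then discards the
+ * current item with probability 1 - GinFuzzySearchLimit/predictNumberResult
+ * (when that ratio is below one), so that, if the prediction is accurate,
+ * roughly GinFuzzySearchLimit items per entry survive when result reduction
+ * is enabled.
+ */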
+#define gin_rand() (((double) random()) / ((double) MAX_RANDOM_VALUE))
+#define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) )
+
+/*
+ * Sets entry->curItem to next heap item pointer > advancePast, for one entry
+ * of one scan key, or sets entry->isFinished to true if there are no more.
+ *
+ * Item pointers are returned in ascending order.
+ *
+ * Note: this can return a "lossy page" item pointer, indicating that the
+ * entry potentially matches all items on that heap page. However, it is
+ * not allowed to return both a lossy page pointer and exact (regular)
+ * item pointers for the same page. (Doing so would break the key-combination
+ * logic in keyGetItem and scanGetItem; see comment in scanGetItem.) In the
+ * current implementation this is guaranteed by the behavior of tidbitmaps.
+ */
+static void
+entryGetItem(GinState *ginstate, GinScanEntry entry,
+ ItemPointerData advancePast, Snapshot snapshot)
+{
+ Assert(!entry->isFinished);
+
+ Assert(!ItemPointerIsValid(&entry->curItem) ||
+ ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
+
+ if (entry->matchBitmap)
+ {
+ /* A bitmap result */
+ BlockNumber advancePastBlk = GinItemPointerGetBlockNumber(&advancePast);
+ OffsetNumber advancePastOff = GinItemPointerGetOffsetNumber(&advancePast);
+
+ for (;;)
+ {
+ /*
+ * If we've exhausted all items on this block, move to next block
+ * in the bitmap.
+ */
+ while (entry->matchResult == NULL ||
+ (entry->matchResult->ntuples >= 0 &&
+ entry->offset >= entry->matchResult->ntuples) ||
+ entry->matchResult->blockno < advancePastBlk ||
+ (ItemPointerIsLossyPage(&advancePast) &&
+ entry->matchResult->blockno == advancePastBlk))
+ {
+ entry->matchResult = tbm_iterate(entry->matchIterator);
+
+ if (entry->matchResult == NULL)
+ {
+ ItemPointerSetInvalid(&entry->curItem);
+ tbm_end_iterate(entry->matchIterator);
+ entry->matchIterator = NULL;
+ entry->isFinished = true;
+ break;
+ }
+
+ /*
+ * Reset counter to the beginning of entry->matchResult. Note:
+ * entry->offset is still greater than matchResult->ntuples if
+ * matchResult is lossy. So, on next call we will get next
+ * result from TIDBitmap.
+ */
+ entry->offset = 0;
+ }
+ if (entry->isFinished)
+ break;
+
+ /*
+ * We're now on the first page after advancePast which has any
+ * items on it. If it's a lossy result, return that.
+ */
+ if (entry->matchResult->ntuples < 0)
+ {
+ ItemPointerSetLossyPage(&entry->curItem,
+ entry->matchResult->blockno);
+
+ /*
+				 * We might as well fall out of the loop; we cannot estimate
+				 * the number of results on this page, so correct result
+				 * reduction isn't possible here even if it's enabled.
+ */
+ break;
+ }
+
+ /*
+ * Not a lossy page. Skip over any offsets <= advancePast, and
+ * return that.
+ */
+ if (entry->matchResult->blockno == advancePastBlk)
+ {
+ /*
+ * First, do a quick check against the last offset on the
+				 * page.  If that's <= advancePast, so are all the other
+ * offsets, so just go back to the top to get the next page.
+ */
+ if (entry->matchResult->offsets[entry->matchResult->ntuples - 1] <= advancePastOff)
+ {
+ entry->offset = entry->matchResult->ntuples;
+ continue;
+ }
+
+ /* Otherwise scan to find the first item > advancePast */
+ while (entry->matchResult->offsets[entry->offset] <= advancePastOff)
+ entry->offset++;
+ }
+
+ ItemPointerSet(&entry->curItem,
+ entry->matchResult->blockno,
+ entry->matchResult->offsets[entry->offset]);
+ entry->offset++;
+
+ /* Done unless we need to reduce the result */
+ if (!entry->reduceResult || !dropItem(entry))
+ break;
+ }
+ }
+ else if (!BufferIsValid(entry->buffer))
+ {
+ /*
+ * A posting list from an entry tuple, or the last page of a posting
+ * tree.
+ */
+ for (;;)
+ {
+ if (entry->offset >= entry->nlist)
+ {
+ ItemPointerSetInvalid(&entry->curItem);
+ entry->isFinished = true;
+ break;
+ }
+
+ entry->curItem = entry->list[entry->offset++];
+
+ /* If we're not past advancePast, keep scanning */
+ if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
+ continue;
+
+ /* Done unless we need to reduce the result */
+ if (!entry->reduceResult || !dropItem(entry))
+ break;
+ }
+ }
+ else
+ {
+ /* A posting tree */
+ for (;;)
+ {
+ /* If we've processed the current batch, load more items */
+ while (entry->offset >= entry->nlist)
+ {
+ entryLoadMoreItems(ginstate, entry, advancePast, snapshot);
+
+ if (entry->isFinished)
+ {
+ ItemPointerSetInvalid(&entry->curItem);
+ return;
+ }
+ }
+
+ entry->curItem = entry->list[entry->offset++];
+
+ /* If we're not past advancePast, keep scanning */
+ if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
+ continue;
+
+ /* Done unless we need to reduce the result */
+ if (!entry->reduceResult || !dropItem(entry))
+ break;
+
+ /*
+ * Advance advancePast (so that entryLoadMoreItems will load the
+ * right data), and keep scanning
+ */
+ advancePast = entry->curItem;
+ }
+ }
+}
+
+/*
+ * Identify the "current" item among the input entry streams for this scan key
+ * that is greater than advancePast, and test whether it passes the scan key
+ * qual condition.
+ *
+ * The current item is the smallest curItem among the inputs. key->curItem
+ * is set to that value. key->curItemMatches is set to indicate whether that
+ * TID passes the consistentFn test. If so, key->recheckCurItem is set true
+ * iff recheck is needed for this item pointer (including the case where the
+ * item pointer is a lossy page pointer).
+ *
+ * If all entry streams are exhausted, sets key->isFinished to true.
+ *
+ * Item pointers must be returned in ascending order.
+ *
+ * Note: this can return a "lossy page" item pointer, indicating that the
+ * key potentially matches all items on that heap page. However, it is
+ * not allowed to return both a lossy page pointer and exact (regular)
+ * item pointers for the same page. (Doing so would break the key-combination
+ * logic in scanGetItem.)
+ */
+static void
+keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key,
+ ItemPointerData advancePast, Snapshot snapshot)
+{
+ ItemPointerData minItem;
+ ItemPointerData curPageLossy;
+ uint32 i;
+ bool haveLossyEntry;
+ GinScanEntry entry;
+ GinTernaryValue res;
+ MemoryContext oldCtx;
+ bool allFinished;
+
+ Assert(!key->isFinished);
+
+ /*
+ * We might have already tested this item; if so, no need to repeat work.
+ * (Note: the ">" case can happen, if advancePast is exact but we
+ * previously had to set curItem to a lossy-page pointer.)
+ */
+ if (ginCompareItemPointers(&key->curItem, &advancePast) > 0)
+ return;
+
+ /*
+ * Find the minimum item > advancePast among the active entry streams.
+ *
+	 * Note: a lossy-page entry is encoded by an ItemPointer with the maximum
+	 * value for offset (0xffff), so that it will sort after any exact entries
+	 * for the same page.  So we'll prefer to return exact pointers, not lossy
+	 * pointers, which is good.
+ */
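+	/*
+	 * For instance, any exact pointer on some heap block, say (42,1) or
+	 * (42,7), sorts before the lossy pointer (42,0xffff) for that same block.
+	 */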
+ ItemPointerSetMax(&minItem);
+ allFinished = true;
+ for (i = 0; i < key->nrequired; i++)
+ {
+ entry = key->requiredEntries[i];
+
+ if (entry->isFinished)
+ continue;
+
+ /*
+ * Advance this stream if necessary.
+ *
+ * In particular, since entry->curItem was initialized with
+ * ItemPointerSetMin, this ensures we fetch the first item for each
+ * entry on the first call.
+ */
+ if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
+ {
+ entryGetItem(ginstate, entry, advancePast, snapshot);
+ if (entry->isFinished)
+ continue;
+ }
+
+ allFinished = false;
+ if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
+ minItem = entry->curItem;
+ }
+
+ if (allFinished && !key->excludeOnly)
+ {
+ /* all entries are finished */
+ key->isFinished = true;
+ return;
+ }
+
+ if (!key->excludeOnly)
+ {
+ /*
+ * For a normal scan key, we now know there are no matches < minItem.
+ *
+ * If minItem is lossy, it means that there were no exact items on the
+ * page among requiredEntries, because lossy pointers sort after exact
+ * items. However, there might be exact items for the same page among
+ * additionalEntries, so we mustn't advance past them.
+ */
+ if (ItemPointerIsLossyPage(&minItem))
+ {
+ if (GinItemPointerGetBlockNumber(&advancePast) <
+ GinItemPointerGetBlockNumber(&minItem))
+ {
+ ItemPointerSet(&advancePast,
+ GinItemPointerGetBlockNumber(&minItem),
+ InvalidOffsetNumber);
+ }
+ }
+ else
+ {
+ Assert(GinItemPointerGetOffsetNumber(&minItem) > 0);
+ ItemPointerSet(&advancePast,
+ GinItemPointerGetBlockNumber(&minItem),
+ OffsetNumberPrev(GinItemPointerGetOffsetNumber(&minItem)));
+ }
+ }
+ else
+ {
+ /*
+ * excludeOnly scan keys don't have any entries that are necessarily
+ * present in matching items. So, we consider the item just after
+ * advancePast.
+ */
+ Assert(key->nrequired == 0);
+ ItemPointerSet(&minItem,
+ GinItemPointerGetBlockNumber(&advancePast),
+ OffsetNumberNext(GinItemPointerGetOffsetNumber(&advancePast)));
+ }
+
+ /*
+ * We might not have loaded all the entry streams for this TID yet. We
+ * could call the consistent function, passing MAYBE for those entries, to
+ * see if it can decide if this TID matches based on the information we
+ * have. But if the consistent-function is expensive, and cannot in fact
+ * decide with partial information, that could be a big loss. So, load all
+ * the additional entries, before calling the consistent function.
+ */
+ for (i = 0; i < key->nadditional; i++)
+ {
+ entry = key->additionalEntries[i];
+
+ if (entry->isFinished)
+ continue;
+
+ if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
+ {
+ entryGetItem(ginstate, entry, advancePast, snapshot);
+ if (entry->isFinished)
+ continue;
+ }
+
+ /*
+ * Normally, none of the items in additionalEntries can have a curItem
+		 * smaller than minItem.  But if minItem is a lossy page, then there
+ * might be exact items on the same page among additionalEntries.
+ */
+ if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
+ {
+ Assert(ItemPointerIsLossyPage(&minItem));
+ minItem = entry->curItem;
+ }
+ }
+
+ /*
+ * Ok, we've advanced all the entries up to minItem now. Set key->curItem,
+ * and perform consistentFn test.
+ *
+ * Lossy-page entries pose a problem, since we don't know the correct
+ * entryRes state to pass to the consistentFn, and we also don't know what
+ * its combining logic will be (could be AND, OR, or even NOT). If the
+ * logic is OR then the consistentFn might succeed for all items in the
+ * lossy page even when none of the other entries match.
+ *
+ * Our strategy is to call the tri-state consistent function, with the
+ * lossy-page entries set to MAYBE, and all the other entries FALSE. If it
+ * returns FALSE, none of the lossy items alone are enough for a match, so
+ * we don't need to return a lossy-page pointer. Otherwise, return a
+ * lossy-page pointer to indicate that the whole heap page must be
+ * checked. (On subsequent calls, we'll do nothing until minItem is past
+ * the page altogether, thus ensuring that we never return both regular
+ * and lossy pointers for the same page.)
+ *
+ * An exception is that it doesn't matter what we pass for lossy pointers
+ * in "hidden" entries, because the consistentFn's result can't depend on
+ * them. We could pass them as MAYBE as well, but if we're using the
+ * "shim" implementation of a tri-state consistent function (see
+ * ginlogic.c), it's better to pass as few MAYBEs as possible. So pass
+ * them as true.
+ *
+ * Note that only lossy-page entries pointing to the current item's page
+ * should trigger this processing; we might have future lossy pages in the
+ * entry array, but they aren't relevant yet.
+ */
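+	/*
+	 * For example (hypothetical case): with user entries {A, B} where only
+	 * B's stream currently sits on a lossy pointer for this page, we call
+	 * the tri-state function with A = GIN_FALSE and B = GIN_MAYBE; a
+	 * GIN_FALSE result proves that no item on the page can match through B
+	 * alone, so no lossy-page pointer needs to be returned.
+	 */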
+ key->curItem = minItem;
+ ItemPointerSetLossyPage(&curPageLossy,
+ GinItemPointerGetBlockNumber(&key->curItem));
+ haveLossyEntry = false;
+ for (i = 0; i < key->nentries; i++)
+ {
+ entry = key->scanEntry[i];
+ if (entry->isFinished == false &&
+ ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
+ {
+ if (i < key->nuserentries)
+ key->entryRes[i] = GIN_MAYBE;
+ else
+ key->entryRes[i] = GIN_TRUE;
+ haveLossyEntry = true;
+ }
+ else
+ key->entryRes[i] = GIN_FALSE;
+ }
+
+ /* prepare for calling consistentFn in temp context */
+ oldCtx = MemoryContextSwitchTo(tempCtx);
+
+ if (haveLossyEntry)
+ {
+ /* Have lossy-page entries, so see if whole page matches */
+ res = key->triConsistentFn(key);
+
+ if (res == GIN_TRUE || res == GIN_MAYBE)
+ {
+ /* Yes, so clean up ... */
+ MemoryContextSwitchTo(oldCtx);
+ MemoryContextReset(tempCtx);
+
+ /* and return lossy pointer for whole page */
+ key->curItem = curPageLossy;
+ key->curItemMatches = true;
+ key->recheckCurItem = true;
+ return;
+ }
+ }
+
+ /*
+ * At this point we know that we don't need to return a lossy whole-page
+ * pointer, but we might have matches for individual exact item pointers,
+ * possibly in combination with a lossy pointer. Pass lossy pointers as
+ * MAYBE to the ternary consistent function, to let it decide if this
+ * tuple satisfies the overall key, even though we don't know if the lossy
+ * entries match.
+ *
+ * Prepare entryRes array to be passed to consistentFn.
+ */
+ for (i = 0; i < key->nentries; i++)
+ {
+ entry = key->scanEntry[i];
+ if (entry->isFinished)
+ key->entryRes[i] = GIN_FALSE;
+#if 0
+
+ /*
+ * This case can't currently happen, because we loaded all the entries
+ * for this item earlier.
+ */
+ else if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
+ key->entryRes[i] = GIN_MAYBE;
+#endif
+ else if (ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
+ key->entryRes[i] = GIN_MAYBE;
+ else if (ginCompareItemPointers(&entry->curItem, &minItem) == 0)
+ key->entryRes[i] = GIN_TRUE;
+ else
+ key->entryRes[i] = GIN_FALSE;
+ }
+
+ res = key->triConsistentFn(key);
+
+ switch (res)
+ {
+ case GIN_TRUE:
+ key->curItemMatches = true;
+ /* triConsistentFn set recheckCurItem */
+ break;
+
+ case GIN_FALSE:
+ key->curItemMatches = false;
+ break;
+
+ case GIN_MAYBE:
+ key->curItemMatches = true;
+ key->recheckCurItem = true;
+ break;
+
+ default:
+
+ /*
+ * the 'default' case shouldn't happen, but if the consistent
+ * function returns something bogus, this is the safe result
+ */
+ key->curItemMatches = true;
+ key->recheckCurItem = true;
+ break;
+ }
+
+ /*
+ * We have a tuple, and we know if it matches or not. If it's a non-match,
+ * we could continue to find the next matching tuple, but let's break out
+ * and give scanGetItem a chance to advance the other keys. They might be
+ * able to skip past to a much higher TID, allowing us to save work.
+ */
+
+ /* clean up after consistentFn calls */
+ MemoryContextSwitchTo(oldCtx);
+ MemoryContextReset(tempCtx);
+}
+
+/*
+ * Get next heap item pointer (after advancePast) from scan.
+ * Returns true if anything found.
+ * On success, *item and *recheck are set.
+ *
+ * Note: this is very nearly the same logic as in keyGetItem(), except
+ * that we know the keys are to be combined with AND logic, whereas in
+ * keyGetItem() the combination logic is known only to the consistentFn.
+ */
+static bool
+scanGetItem(IndexScanDesc scan, ItemPointerData advancePast,
+ ItemPointerData *item, bool *recheck)
+{
+ GinScanOpaque so = (GinScanOpaque) scan->opaque;
+ uint32 i;
+ bool match;
+
+ /*----------
+ * Advance the scan keys in lock-step, until we find an item that matches
+ * all the keys. If any key reports isFinished, meaning its subset of the
+ * entries is exhausted, we can stop. Otherwise, set *item to the next
+ * matching item.
+ *
+ * This logic works only if a keyGetItem stream can never contain both
+ * exact and lossy pointers for the same page. Else we could have a
+ * case like
+ *
+ * stream 1 stream 2
+ * ... ...
+ * 42/6 42/7
+ * 50/1 42/0xffff
+ * ... ...
+ *
+ * We would conclude that 42/6 is not a match and advance stream 1,
+ * thus never detecting the match to the lossy pointer in stream 2.
+ * (keyGetItem has a similar problem versus entryGetItem.)
+ *----------
+ */
+ do
+ {
+ ItemPointerSetMin(item);
+ match = true;
+ for (i = 0; i < so->nkeys && match; i++)
+ {
+ GinScanKey key = so->keys + i;
+
+ /*
+			 * If we're considering a lossy page, skip excludeOnly keys; they
+			 * can't exclude the whole page anyway.
+ */
+ if (ItemPointerIsLossyPage(item) && key->excludeOnly)
+ {
+ /*
+ * ginNewScanKey() should never mark the first key as
+ * excludeOnly.
+ */
+ Assert(i > 0);
+ continue;
+ }
+
+ /* Fetch the next item for this key that is > advancePast. */
+ keyGetItem(&so->ginstate, so->tempCtx, key, advancePast,
+ scan->xs_snapshot);
+
+ if (key->isFinished)
+ return false;
+
+ /*
+ * If it's not a match, we can immediately conclude that nothing
+ * <= this item matches, without checking the rest of the keys.
+ */
+ if (!key->curItemMatches)
+ {
+ advancePast = key->curItem;
+ match = false;
+ break;
+ }
+
+ /*
+			 * It's a match.  We can conclude that nothing before this item
+			 * matches, so the other key streams can skip ahead to it.
+ *
+ * Beware of lossy pointers, though; from a lossy pointer, we can
+ * only conclude that nothing smaller than this *block* matches.
+ */
+ if (ItemPointerIsLossyPage(&key->curItem))
+ {
+ if (GinItemPointerGetBlockNumber(&advancePast) <
+ GinItemPointerGetBlockNumber(&key->curItem))
+ {
+ ItemPointerSet(&advancePast,
+ GinItemPointerGetBlockNumber(&key->curItem),
+ InvalidOffsetNumber);
+ }
+ }
+ else
+ {
+ Assert(GinItemPointerGetOffsetNumber(&key->curItem) > 0);
+ ItemPointerSet(&advancePast,
+ GinItemPointerGetBlockNumber(&key->curItem),
+ OffsetNumberPrev(GinItemPointerGetOffsetNumber(&key->curItem)));
+ }
+
+ /*
+ * If this is the first key, remember this location as a potential
+ * match, and proceed to check the rest of the keys.
+ *
+ * Otherwise, check if this is the same item that we checked the
+ * previous keys for (or a lossy pointer for the same page). If
+ * not, loop back to check the previous keys for this item (we
+ * will check this key again too, but keyGetItem returns quickly
+ * for that)
+ */
+ if (i == 0)
+ {
+ *item = key->curItem;
+ }
+ else
+ {
+ if (ItemPointerIsLossyPage(&key->curItem) ||
+ ItemPointerIsLossyPage(item))
+ {
+ Assert(GinItemPointerGetBlockNumber(&key->curItem) >= GinItemPointerGetBlockNumber(item));
+ match = (GinItemPointerGetBlockNumber(&key->curItem) ==
+ GinItemPointerGetBlockNumber(item));
+ }
+ else
+ {
+ Assert(ginCompareItemPointers(&key->curItem, item) >= 0);
+ match = (ginCompareItemPointers(&key->curItem, item) == 0);
+ }
+ }
+ }
+ } while (!match);
+
+ Assert(!ItemPointerIsMin(item));
+
+ /*
+ * Now *item contains the first ItemPointer after previous result that
+ * satisfied all the keys for that exact TID, or a lossy reference to the
+ * same page.
+ *
+ * We must return recheck = true if any of the keys are marked recheck.
+ */
+ *recheck = false;
+ for (i = 0; i < so->nkeys; i++)
+ {
+ GinScanKey key = so->keys + i;
+
+ if (key->recheckCurItem)
+ {
+ *recheck = true;
+ break;
+ }
+ }
+
+ return true;
+}
+
+
+/*
+ * Functions for scanning the pending list
+ */
+
+
+/*
+ * Get the ItemPointer of the next heap row to be checked from the pending
+ * list.  Returns false if there are no more.  On pages holding several heap
+ * rows it returns each row separately; on a page holding only part of a heap
+ * row it returns per-page data.  pos->firstOffset and pos->lastOffset are set
+ * to identify the range of pending-list tuples belonging to this heap row.
+ *
+ * The pendingBuffer is presumed pinned and share-locked on entry, and is
+ * pinned and share-locked on success exit. On failure exit it's released.
+ */
+static bool
+scanGetCandidate(IndexScanDesc scan, pendingPosition *pos)
+{
+ OffsetNumber maxoff;
+ Page page;
+ IndexTuple itup;
+
+ ItemPointerSetInvalid(&pos->item);
+ for (;;)
+ {
+ page = BufferGetPage(pos->pendingBuffer);
+ TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
+
+ maxoff = PageGetMaxOffsetNumber(page);
+ if (pos->firstOffset > maxoff)
+ {
+ BlockNumber blkno = GinPageGetOpaque(page)->rightlink;
+
+ if (blkno == InvalidBlockNumber)
+ {
+ UnlockReleaseBuffer(pos->pendingBuffer);
+ pos->pendingBuffer = InvalidBuffer;
+
+ return false;
+ }
+ else
+ {
+ /*
+				 * Here we must prevent deletion of the next page by the
+				 * insert-cleanup process, which may be trying to obtain an
+				 * exclusive lock on the current page.  So, we lock the next
+				 * page before releasing the current one.
+ */
+ Buffer tmpbuf = ReadBuffer(scan->indexRelation, blkno);
+
+ LockBuffer(tmpbuf, GIN_SHARE);
+ UnlockReleaseBuffer(pos->pendingBuffer);
+
+ pos->pendingBuffer = tmpbuf;
+ pos->firstOffset = FirstOffsetNumber;
+ }
+ }
+ else
+ {
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->firstOffset));
+ pos->item = itup->t_tid;
+ if (GinPageHasFullRow(page))
+ {
+ /*
+ * find itempointer to the next row
+ */
+ for (pos->lastOffset = pos->firstOffset + 1; pos->lastOffset <= maxoff; pos->lastOffset++)
+ {
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->lastOffset));
+ if (!ItemPointerEquals(&pos->item, &itup->t_tid))
+ break;
+ }
+ }
+ else
+ {
+ /*
+ * All itempointers are the same on this page
+ */
+ pos->lastOffset = maxoff + 1;
+ }
+
+ /*
+ * Now pos->firstOffset points to the first tuple of current heap
+ * row, pos->lastOffset points to the first tuple of next heap row
+ * (or to the end of page)
+ */
+ break;
+ }
+ }
+
+ return true;
+}
+
+/*
+ * Scan pending-list page from current tuple (off) up till the first of:
+ * - match is found (then returns true)
+ * - no later match is possible
+ * - tuple's attribute number is not equal to entry's attrnum
+ * - reach end of page
+ *
+ * datum[]/category[]/datumExtracted[] arrays are used to cache the results
+ * of gintuple_get_key() on the current page.
+ */
+static bool
+matchPartialInPendingList(GinState *ginstate, Page page,
+ OffsetNumber off, OffsetNumber maxoff,
+ GinScanEntry entry,
+ Datum *datum, GinNullCategory *category,
+ bool *datumExtracted)
+{
+ IndexTuple itup;
+ int32 cmp;
+
+ /* Partial match to a null is not possible */
+ if (entry->queryCategory != GIN_CAT_NORM_KEY)
+ return false;
+
+ while (off < maxoff)
+ {
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
+
+ if (gintuple_get_attrnum(ginstate, itup) != entry->attnum)
+ return false;
+
+ if (datumExtracted[off - 1] == false)
+ {
+ datum[off - 1] = gintuple_get_key(ginstate, itup,
+ &category[off - 1]);
+ datumExtracted[off - 1] = true;
+ }
+
+ /* Once we hit nulls, no further match is possible */
+ if (category[off - 1] != GIN_CAT_NORM_KEY)
+ return false;
+
+ /*----------
+ * Check partial match.
+ * case cmp == 0 => match
+ * case cmp > 0 => not match and end scan (no later match possible)
+ * case cmp < 0 => not match and continue scan
+ *----------
+ */
+ cmp = DatumGetInt32(FunctionCall4Coll(&ginstate->comparePartialFn[entry->attnum - 1],
+ ginstate->supportCollation[entry->attnum - 1],
+ entry->queryKey,
+ datum[off - 1],
+ UInt16GetDatum(entry->strategy),
+ PointerGetDatum(entry->extra_data)));
+ if (cmp == 0)
+ return true;
+ else if (cmp > 0)
+ return false;
+
+ off++;
+ }
+
+ return false;
+}
+
+/*
+ * Set up the entryRes array for each key by looking at
+ * every entry for the current heap row in the pending list.
+ *
+ * Returns true if each non-excludeOnly scan key has at least one entryRes
+ * match.
+ * This corresponds to the situations where the normal index search will
+ * try to apply the key's consistentFn. (A tuple not meeting that requirement
+ * cannot be returned by the normal search since no entry stream will
+ * source its TID.)
+ *
+ * The pendingBuffer is presumed pinned and share-locked on entry.
+ */
+static bool
+collectMatchesForHeapRow(IndexScanDesc scan, pendingPosition *pos)
+{
+ GinScanOpaque so = (GinScanOpaque) scan->opaque;
+ OffsetNumber attrnum;
+ Page page;
+ IndexTuple itup;
+ int i,
+ j;
+
+ /*
+ * Reset all entryRes and hasMatchKey flags
+ */
+ for (i = 0; i < so->nkeys; i++)
+ {
+ GinScanKey key = so->keys + i;
+
+ memset(key->entryRes, GIN_FALSE, key->nentries);
+ }
+ memset(pos->hasMatchKey, false, so->nkeys);
+
+ /*
+ * Outer loop iterates over multiple pending-list pages when a single heap
+ * row has entries spanning those pages.
+ */
+ for (;;)
+ {
+ Datum datum[BLCKSZ / sizeof(IndexTupleData)];
+ GinNullCategory category[BLCKSZ / sizeof(IndexTupleData)];
+ bool datumExtracted[BLCKSZ / sizeof(IndexTupleData)];
+
+ Assert(pos->lastOffset > pos->firstOffset);
+ memset(datumExtracted + pos->firstOffset - 1, 0,
+ sizeof(bool) * (pos->lastOffset - pos->firstOffset));
+
+ page = BufferGetPage(pos->pendingBuffer);
+ TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
+
+ for (i = 0; i < so->nkeys; i++)
+ {
+ GinScanKey key = so->keys + i;
+
+ for (j = 0; j < key->nentries; j++)
+ {
+ GinScanEntry entry = key->scanEntry[j];
+ OffsetNumber StopLow = pos->firstOffset,
+ StopHigh = pos->lastOffset,
+ StopMiddle;
+
+ /* If already matched on earlier page, do no extra work */
+ if (key->entryRes[j])
+ continue;
+
+ /*
+				 * The interesting tuples run from pos->firstOffset to
+				 * pos->lastOffset, and they are ordered by (attnum, Datum)
+				 * just as in the entry tree.  So we can use binary search to
+				 * avoid a linear scan.
+ */
+ while (StopLow < StopHigh)
+ {
+ int res;
+
+ StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
+
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, StopMiddle));
+
+ attrnum = gintuple_get_attrnum(&so->ginstate, itup);
+
+ if (key->attnum < attrnum)
+ {
+ StopHigh = StopMiddle;
+ continue;
+ }
+ if (key->attnum > attrnum)
+ {
+ StopLow = StopMiddle + 1;
+ continue;
+ }
+
+ if (datumExtracted[StopMiddle - 1] == false)
+ {
+ datum[StopMiddle - 1] =
+ gintuple_get_key(&so->ginstate, itup,
+ &category[StopMiddle - 1]);
+ datumExtracted[StopMiddle - 1] = true;
+ }
+
+ if (entry->queryCategory == GIN_CAT_EMPTY_QUERY)
+ {
+ /* special behavior depending on searchMode */
+ if (entry->searchMode == GIN_SEARCH_MODE_ALL)
+ {
+ /* match anything except NULL_ITEM */
+ if (category[StopMiddle - 1] == GIN_CAT_NULL_ITEM)
+ res = -1;
+ else
+ res = 0;
+ }
+ else
+ {
+ /* match everything */
+ res = 0;
+ }
+ }
+ else
+ {
+ res = ginCompareEntries(&so->ginstate,
+ entry->attnum,
+ entry->queryKey,
+ entry->queryCategory,
+ datum[StopMiddle - 1],
+ category[StopMiddle - 1]);
+ }
+
+ if (res == 0)
+ {
+ /*
+ * Found exact match (there can be only one, except in
+ * EMPTY_QUERY mode).
+ *
+ * If doing partial match, scan forward from here to
+ * end of page to check for matches.
+ *
+ * See comment above about tuple's ordering.
+ */
+ if (entry->isPartialMatch)
+ key->entryRes[j] =
+ matchPartialInPendingList(&so->ginstate,
+ page,
+ StopMiddle,
+ pos->lastOffset,
+ entry,
+ datum,
+ category,
+ datumExtracted);
+ else
+ key->entryRes[j] = true;
+
+ /* done with binary search */
+ break;
+ }
+ else if (res < 0)
+ StopHigh = StopMiddle;
+ else
+ StopLow = StopMiddle + 1;
+ }
+
+ if (StopLow >= StopHigh && entry->isPartialMatch)
+ {
+ /*
+ * No exact match on this page. If doing partial match,
+ * scan from the first tuple greater than target value to
+ * end of page. Note that since we don't remember whether
+ * the comparePartialFn told us to stop early on a
+ * previous page, we will uselessly apply comparePartialFn
+ * to the first tuple on each subsequent page.
+ */
+ key->entryRes[j] =
+ matchPartialInPendingList(&so->ginstate,
+ page,
+ StopHigh,
+ pos->lastOffset,
+ entry,
+ datum,
+ category,
+ datumExtracted);
+ }
+
+ pos->hasMatchKey[i] |= key->entryRes[j];
+ }
+ }
+
+ /* Advance firstOffset over the scanned tuples */
+ pos->firstOffset = pos->lastOffset;
+
+ if (GinPageHasFullRow(page))
+ {
+ /*
+ * We have examined all pending entries for the current heap row.
+ * Break out of loop over pages.
+ */
+ break;
+ }
+ else
+ {
+ /*
+ * Advance to next page of pending entries for the current heap
+ * row. Complain if there isn't one.
+ */
+ ItemPointerData item = pos->item;
+
+ if (scanGetCandidate(scan, pos) == false ||
+ !ItemPointerEquals(&pos->item, &item))
+ elog(ERROR, "could not find additional pending pages for same heap tuple");
+ }
+ }
+
+ /*
+ * All scan keys except excludeOnly require at least one entry to match.
+ * excludeOnly keys are an exception, because their implied
+ * GIN_CAT_EMPTY_QUERY scanEntry always matches. So return "true" if all
+ * non-excludeOnly scan keys have at least one match.
+ */
+ for (i = 0; i < so->nkeys; i++)
+ {
+ if (pos->hasMatchKey[i] == false && !so->keys[i].excludeOnly)
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Collect all matched rows from pending list into bitmap.
+ */
+static void
+scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
+{
+ GinScanOpaque so = (GinScanOpaque) scan->opaque;
+ MemoryContext oldCtx;
+ bool recheck,
+ match;
+ int i;
+ pendingPosition pos;
+ Buffer metabuffer = ReadBuffer(scan->indexRelation, GIN_METAPAGE_BLKNO);
+ Page page;
+ BlockNumber blkno;
+
+ *ntids = 0;
+
+ /*
+ * Acquire predicate lock on the metapage, to conflict with any fastupdate
+ * insertions.
+ */
+ PredicateLockPage(scan->indexRelation, GIN_METAPAGE_BLKNO, scan->xs_snapshot);
+
+ LockBuffer(metabuffer, GIN_SHARE);
+ page = BufferGetPage(metabuffer);
+ TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
+ blkno = GinPageGetMeta(page)->head;
+
+ /*
+	 * Fetch the head of the pending list before unlocking the metapage.  The
+	 * head page must be pinned to prevent deletion by the vacuum process.
+ */
+ if (blkno == InvalidBlockNumber)
+ {
+ /* No pending list, so proceed with normal scan */
+ UnlockReleaseBuffer(metabuffer);
+ return;
+ }
+
+ pos.pendingBuffer = ReadBuffer(scan->indexRelation, blkno);
+ LockBuffer(pos.pendingBuffer, GIN_SHARE);
+ pos.firstOffset = FirstOffsetNumber;
+ UnlockReleaseBuffer(metabuffer);
+ pos.hasMatchKey = palloc(sizeof(bool) * so->nkeys);
+
+ /*
+	 * Loop over each heap row.  scanGetCandidate returns either a full row or
+	 * the row's tuples from its first page.
+ */
+ while (scanGetCandidate(scan, &pos))
+ {
+ /*
+ * Check entries in tuple and set up entryRes array.
+ *
+ * If pending tuples belonging to the current heap row are spread
+ * across several pages, collectMatchesForHeapRow will read all of
+ * those pages.
+ */
+ if (!collectMatchesForHeapRow(scan, &pos))
+ continue;
+
+ /*
+		 * We have finished matching the entries of this row, so check the
+		 * row using the consistent functions.
+ */
+ oldCtx = MemoryContextSwitchTo(so->tempCtx);
+ recheck = false;
+ match = true;
+
+ for (i = 0; i < so->nkeys; i++)
+ {
+ GinScanKey key = so->keys + i;
+
+ if (!key->boolConsistentFn(key))
+ {
+ match = false;
+ break;
+ }
+ recheck |= key->recheckCurItem;
+ }
+
+ MemoryContextSwitchTo(oldCtx);
+ MemoryContextReset(so->tempCtx);
+
+ if (match)
+ {
+ tbm_add_tuples(tbm, &pos.item, 1, recheck);
+ (*ntids)++;
+ }
+ }
+
+ pfree(pos.hasMatchKey);
+}
+
+
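+/*
+ * Note that this macro ignores its argument and refers directly to the
+ * enclosing function's "scan" variable.
+ */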
+#define GinIsVoidRes(s) ( ((GinScanOpaque) scan->opaque)->isVoidRes )
+
+int64
+gingetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
+{
+ GinScanOpaque so = (GinScanOpaque) scan->opaque;
+ int64 ntids;
+ ItemPointerData iptr;
+ bool recheck;
+
+ /*
+ * Set up the scan keys, and check for unsatisfiable query.
+ */
+ ginFreeScanKeys(so); /* there should be no keys yet, but just to be
+ * sure */
+ ginNewScanKey(scan);
+
+ if (GinIsVoidRes(scan))
+ return 0;
+
+ ntids = 0;
+
+ /*
+ * First, scan the pending list and collect any matching entries into the
+ * bitmap. After we scan a pending item, some other backend could post it
+ * into the main index, and so we might visit it a second time during the
+ * main scan. This is okay because we'll just re-set the same bit in the
+ * bitmap. (The possibility of duplicate visits is a major reason why GIN
+ * can't support the amgettuple API, however.) Note that it would not do
+ * to scan the main index before the pending list, since concurrent
+ * cleanup could then make us miss entries entirely.
+ */
+ scanPendingInsert(scan, tbm, &ntids);
+
+ /*
+ * Now scan the main index.
+ */
+ startScan(scan);
+
+ ItemPointerSetMin(&iptr);
+
+ for (;;)
+ {
+ CHECK_FOR_INTERRUPTS();
+
+ if (!scanGetItem(scan, iptr, &iptr, &recheck))
+ break;
+
+ if (ItemPointerIsLossyPage(&iptr))
+ tbm_add_page(tbm, ItemPointerGetBlockNumber(&iptr));
+ else
+ tbm_add_tuples(tbm, &iptr, 1, recheck);
+ ntids++;
+ }
+
+ return ntids;
+}
diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c
new file mode 100644
index 0000000..0e8672c
--- /dev/null
+++ b/src/backend/access/gin/gininsert.c
@@ -0,0 +1,541 @@
+/*-------------------------------------------------------------------------
+ *
+ * gininsert.c
+ * insert routines for the postgres inverted index access method.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gin/gininsert.c
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/gin_private.h"
+#include "access/ginxlog.h"
+#include "access/tableam.h"
+#include "access/xloginsert.h"
+#include "catalog/index.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "storage/indexfsm.h"
+#include "storage/predicate.h"
+#include "storage/smgr.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
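+/*
+ * Working state for ginbuild() and its callbacks: ginstate caches index
+ * metadata, indtuples counts the index entries accumulated so far, buildStats
+ * collects statistics for the metapage, tmpCtx and funcCtx are reset-able
+ * memory contexts used for accumulation and for the extractValue calls
+ * respectively, and accum gathers (key, TID) pairs until they are flushed to
+ * the index.
+ */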
+typedef struct
+{
+ GinState ginstate;
+ double indtuples;
+ GinStatsData buildStats;
+ MemoryContext tmpCtx;
+ MemoryContext funcCtx;
+ BuildAccumulator accum;
+} GinBuildState;
+
+
+/*
+ * Adds an array of item pointers to a tuple's posting list, or, if there is
+ * not enough space, creates a posting tree and a tuple pointing to the tree.
+ * The maximum tuple size is defined in GinFormTuple().  Returns a new,
+ * modified index tuple.
+ * items[] must be in sorted order with no duplicates.
+ */
+static IndexTuple
+addItemPointersToLeafTuple(GinState *ginstate,
+ IndexTuple old,
+ ItemPointerData *items, uint32 nitem,
+ GinStatsData *buildStats, Buffer buffer)
+{
+ OffsetNumber attnum;
+ Datum key;
+ GinNullCategory category;
+ IndexTuple res;
+ ItemPointerData *newItems,
+ *oldItems;
+ int oldNPosting,
+ newNPosting;
+ GinPostingList *compressedList;
+
+ Assert(!GinIsPostingTree(old));
+
+ attnum = gintuple_get_attrnum(ginstate, old);
+ key = gintuple_get_key(ginstate, old, &category);
+
+ /* merge the old and new posting lists */
+ oldItems = ginReadTuple(ginstate, attnum, old, &oldNPosting);
+
+ newItems = ginMergeItemPointers(items, nitem,
+ oldItems, oldNPosting,
+ &newNPosting);
+
+	/* Compress the posting list, and try to build a tuple with room for it */
+ res = NULL;
+ compressedList = ginCompressPostingList(newItems, newNPosting, GinMaxItemSize,
+ NULL);
+ pfree(newItems);
+ if (compressedList)
+ {
+ res = GinFormTuple(ginstate, attnum, key, category,
+ (char *) compressedList,
+ SizeOfGinPostingList(compressedList),
+ newNPosting,
+ false);
+ pfree(compressedList);
+ }
+ if (!res)
+ {
+ /* posting list would be too big, convert to posting tree */
+ BlockNumber postingRoot;
+
+ /*
+ * Initialize posting tree with the old tuple's posting list. It's
+ * surely small enough to fit on one posting-tree page, and should
+ * already be in order with no duplicates.
+ */
+ postingRoot = createPostingTree(ginstate->index,
+ oldItems,
+ oldNPosting,
+ buildStats,
+ buffer);
+
+ /* Now insert the TIDs-to-be-added into the posting tree */
+ ginInsertItemPointers(ginstate->index, postingRoot,
+ items, nitem,
+ buildStats);
+
+ /* And build a new posting-tree-only result tuple */
+ res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, 0, true);
+ GinSetPostingTree(res, postingRoot);
+ }
+ pfree(oldItems);
+
+ return res;
+}
+
+/*
+ * Build a fresh leaf tuple, either posting-list or posting-tree format
+ * depending on whether the given items list will fit.
+ * items[] must be in sorted order with no duplicates.
+ *
+ * This is basically the same logic as in addItemPointersToLeafTuple,
+ * but working from slightly different input.
+ */
+static IndexTuple
+buildFreshLeafTuple(GinState *ginstate,
+ OffsetNumber attnum, Datum key, GinNullCategory category,
+ ItemPointerData *items, uint32 nitem,
+ GinStatsData *buildStats, Buffer buffer)
+{
+ IndexTuple res = NULL;
+ GinPostingList *compressedList;
+
+ /* try to build a posting list tuple with all the items */
+ compressedList = ginCompressPostingList(items, nitem, GinMaxItemSize, NULL);
+ if (compressedList)
+ {
+ res = GinFormTuple(ginstate, attnum, key, category,
+ (char *) compressedList,
+ SizeOfGinPostingList(compressedList),
+ nitem, false);
+ pfree(compressedList);
+ }
+ if (!res)
+ {
+ /* posting list would be too big, build posting tree */
+ BlockNumber postingRoot;
+
+ /*
+ * Build posting-tree-only result tuple. We do this first so as to
+ * fail quickly if the key is too big.
+ */
+ res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, 0, true);
+
+ /*
+ * Initialize a new posting tree with the TIDs.
+ */
+ postingRoot = createPostingTree(ginstate->index, items, nitem,
+ buildStats, buffer);
+
+ /* And save the root link in the result tuple */
+ GinSetPostingTree(res, postingRoot);
+ }
+
+ return res;
+}
+
+/*
+ * Insert one or more heap TIDs associated with the given key value.
+ * This will either add a single key entry, or enlarge a pre-existing entry.
+ *
+ * During an index build, buildStats is non-null and the counters
+ * it contains should be incremented as needed.
+ */
+void
+ginEntryInsert(GinState *ginstate,
+ OffsetNumber attnum, Datum key, GinNullCategory category,
+ ItemPointerData *items, uint32 nitem,
+ GinStatsData *buildStats)
+{
+ GinBtreeData btree;
+ GinBtreeEntryInsertData insertdata;
+ GinBtreeStack *stack;
+ IndexTuple itup;
+ Page page;
+
+ insertdata.isDelete = false;
+
+ ginPrepareEntryScan(&btree, attnum, key, category, ginstate);
+ btree.isBuild = (buildStats != NULL);
+
+ stack = ginFindLeafPage(&btree, false, false, NULL);
+ page = BufferGetPage(stack->buffer);
+
+ if (btree.findItem(&btree, stack))
+ {
+ /* found pre-existing entry */
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
+
+ if (GinIsPostingTree(itup))
+ {
+ /* add entries to existing posting tree */
+ BlockNumber rootPostingTree = GinGetPostingTree(itup);
+
+ /* release all stack */
+ LockBuffer(stack->buffer, GIN_UNLOCK);
+ freeGinBtreeStack(stack);
+
+ /* insert into posting tree */
+ ginInsertItemPointers(ginstate->index, rootPostingTree,
+ items, nitem,
+ buildStats);
+ return;
+ }
+
+ CheckForSerializableConflictIn(ginstate->index, NULL,
+ BufferGetBlockNumber(stack->buffer));
+ /* modify an existing leaf entry */
+ itup = addItemPointersToLeafTuple(ginstate, itup,
+ items, nitem, buildStats, stack->buffer);
+
+ insertdata.isDelete = true;
+ }
+ else
+ {
+ CheckForSerializableConflictIn(ginstate->index, NULL,
+ BufferGetBlockNumber(stack->buffer));
+ /* no match, so construct a new leaf entry */
+ itup = buildFreshLeafTuple(ginstate, attnum, key, category,
+ items, nitem, buildStats, stack->buffer);
+
+ /*
+ * nEntries counts leaf tuples, so increment it only when we make a
+ * new one.
+ */
+ if (buildStats)
+ buildStats->nEntries++;
+ }
+
+ /* Insert the new or modified leaf tuple */
+ insertdata.entry = itup;
+ ginInsertValue(&btree, stack, &insertdata, buildStats);
+ pfree(itup);
+}
+
+/*
+ * Extract index entries for a single indexable item, and add them to the
+ * BuildAccumulator's state.
+ *
+ * This function is used only during initial index creation.
+ */
+static void
+ginHeapTupleBulkInsert(GinBuildState *buildstate, OffsetNumber attnum,
+ Datum value, bool isNull,
+ ItemPointer heapptr)
+{
+ Datum *entries;
+ GinNullCategory *categories;
+ int32 nentries;
+ MemoryContext oldCtx;
+
+ oldCtx = MemoryContextSwitchTo(buildstate->funcCtx);
+ entries = ginExtractEntries(buildstate->accum.ginstate, attnum,
+ value, isNull,
+ &nentries, &categories);
+ MemoryContextSwitchTo(oldCtx);
+
+ ginInsertBAEntries(&buildstate->accum, heapptr, attnum,
+ entries, categories, nentries);
+
+ buildstate->indtuples += nentries;
+
+ MemoryContextReset(buildstate->funcCtx);
+}
+
+static void
+ginBuildCallback(Relation index, ItemPointer tid, Datum *values,
+ bool *isnull, bool tupleIsAlive, void *state)
+{
+ GinBuildState *buildstate = (GinBuildState *) state;
+ MemoryContext oldCtx;
+ int i;
+
+ oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);
+
+ for (i = 0; i < buildstate->ginstate.origTupdesc->natts; i++)
+ ginHeapTupleBulkInsert(buildstate, (OffsetNumber) (i + 1),
+ values[i], isnull[i], tid);
+
+ /* If we've maxed out our available memory, dump everything to the index */
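+	/* (maintenance_work_mem is measured in kilobytes, hence the conversion to bytes) */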
+ if (buildstate->accum.allocatedMemory >= (Size) maintenance_work_mem * 1024L)
+ {
+ ItemPointerData *list;
+ Datum key;
+ GinNullCategory category;
+ uint32 nlist;
+ OffsetNumber attnum;
+
+ ginBeginBAScan(&buildstate->accum);
+ while ((list = ginGetBAEntry(&buildstate->accum,
+ &attnum, &key, &category, &nlist)) != NULL)
+ {
+ /* there could be many entries, so be willing to abort here */
+ CHECK_FOR_INTERRUPTS();
+ ginEntryInsert(&buildstate->ginstate, attnum, key, category,
+ list, nlist, &buildstate->buildStats);
+ }
+
+ MemoryContextReset(buildstate->tmpCtx);
+ ginInitBA(&buildstate->accum);
+ }
+
+ MemoryContextSwitchTo(oldCtx);
+}
+
+IndexBuildResult *
+ginbuild(Relation heap, Relation index, IndexInfo *indexInfo)
+{
+ IndexBuildResult *result;
+ double reltuples;
+ GinBuildState buildstate;
+ Buffer RootBuffer,
+ MetaBuffer;
+ ItemPointerData *list;
+ Datum key;
+ GinNullCategory category;
+ uint32 nlist;
+ MemoryContext oldCtx;
+ OffsetNumber attnum;
+
+ if (RelationGetNumberOfBlocks(index) != 0)
+ elog(ERROR, "index \"%s\" already contains data",
+ RelationGetRelationName(index));
+
+ initGinState(&buildstate.ginstate, index);
+ buildstate.indtuples = 0;
+ memset(&buildstate.buildStats, 0, sizeof(GinStatsData));
+
+ /* initialize the meta page */
+ MetaBuffer = GinNewBuffer(index);
+
+ /* initialize the root page */
+ RootBuffer = GinNewBuffer(index);
+
+ START_CRIT_SECTION();
+ GinInitMetabuffer(MetaBuffer);
+ MarkBufferDirty(MetaBuffer);
+ GinInitBuffer(RootBuffer, GIN_LEAF);
+ MarkBufferDirty(RootBuffer);
+
+ UnlockReleaseBuffer(MetaBuffer);
+ UnlockReleaseBuffer(RootBuffer);
+ END_CRIT_SECTION();
+
+ /* count the root as first entry page */
+ buildstate.buildStats.nEntryPages++;
+
+ /*
+ * create a temporary memory context that is used to hold data not yet
+ * dumped out to the index
+ */
+ buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
+ "Gin build temporary context",
+ ALLOCSET_DEFAULT_SIZES);
+
+ /*
+ * create a temporary memory context that is used for calling
+ * ginExtractEntries(), and can be reset after each tuple
+ */
+ buildstate.funcCtx = AllocSetContextCreate(CurrentMemoryContext,
+ "Gin build temporary context for user-defined function",
+ ALLOCSET_DEFAULT_SIZES);
+
+ buildstate.accum.ginstate = &buildstate.ginstate;
+ ginInitBA(&buildstate.accum);
+
+ /*
+ * Do the heap scan. We disallow sync scan here because dataPlaceToPage
+ * prefers to receive tuples in TID order.
+ */
+ reltuples = table_index_build_scan(heap, index, indexInfo, false, true,
+ ginBuildCallback, (void *) &buildstate,
+ NULL);
+
+ /* dump remaining entries to the index */
+ oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx);
+ ginBeginBAScan(&buildstate.accum);
+ while ((list = ginGetBAEntry(&buildstate.accum,
+ &attnum, &key, &category, &nlist)) != NULL)
+ {
+ /* there could be many entries, so be willing to abort here */
+ CHECK_FOR_INTERRUPTS();
+ ginEntryInsert(&buildstate.ginstate, attnum, key, category,
+ list, nlist, &buildstate.buildStats);
+ }
+ MemoryContextSwitchTo(oldCtx);
+
+ MemoryContextDelete(buildstate.funcCtx);
+ MemoryContextDelete(buildstate.tmpCtx);
+
+ /*
+ * Update metapage stats
+ */
+ buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index);
+ ginUpdateStats(index, &buildstate.buildStats, true);
+
+ /*
+ * We didn't write WAL records as we built the index, so if WAL-logging is
+ * required, write all pages to the WAL now.
+ */
+ if (RelationNeedsWAL(index))
+ {
+ log_newpage_range(index, MAIN_FORKNUM,
+ 0, RelationGetNumberOfBlocks(index),
+ true);
+ }
+
+ /*
+ * Return statistics
+ */
+ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
+
+ result->heap_tuples = reltuples;
+ result->index_tuples = buildstate.indtuples;
+
+ return result;
+}
+
+/*
+ * ginbuildempty() -- build an empty gin index in the initialization fork
+ */
+void
+ginbuildempty(Relation index)
+{
+ Buffer RootBuffer,
+ MetaBuffer;
+
+ /* An empty GIN index has two pages. */
+ MetaBuffer =
+ ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
+ LockBuffer(MetaBuffer, BUFFER_LOCK_EXCLUSIVE);
+ RootBuffer =
+ ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
+ LockBuffer(RootBuffer, BUFFER_LOCK_EXCLUSIVE);
+
+ /* Initialize and xlog metabuffer and root buffer. */
+ START_CRIT_SECTION();
+ GinInitMetabuffer(MetaBuffer);
+ MarkBufferDirty(MetaBuffer);
+ log_newpage_buffer(MetaBuffer, true);
+ GinInitBuffer(RootBuffer, GIN_LEAF);
+ MarkBufferDirty(RootBuffer);
+ log_newpage_buffer(RootBuffer, false);
+ END_CRIT_SECTION();
+
+ /* Unlock and release the buffers. */
+ UnlockReleaseBuffer(MetaBuffer);
+ UnlockReleaseBuffer(RootBuffer);
+}
+
+/*
+ * Insert index entries for a single indexable item during "normal"
+ * (non-fast-update) insertion
+ */
+static void
+ginHeapTupleInsert(GinState *ginstate, OffsetNumber attnum,
+ Datum value, bool isNull,
+ ItemPointer item)
+{
+ Datum *entries;
+ GinNullCategory *categories;
+ int32 i,
+ nentries;
+
+ entries = ginExtractEntries(ginstate, attnum, value, isNull,
+ &nentries, &categories);
+
+ for (i = 0; i < nentries; i++)
+ ginEntryInsert(ginstate, attnum, entries[i], categories[i],
+ item, 1, NULL);
+}
+
+bool
+gininsert(Relation index, Datum *values, bool *isnull,
+ ItemPointer ht_ctid, Relation heapRel,
+ IndexUniqueCheck checkUnique,
+ bool indexUnchanged,
+ IndexInfo *indexInfo)
+{
+ GinState *ginstate = (GinState *) indexInfo->ii_AmCache;
+ MemoryContext oldCtx;
+ MemoryContext insertCtx;
+ int i;
+
+ /* Initialize GinState cache if first call in this statement */
+ if (ginstate == NULL)
+ {
+ oldCtx = MemoryContextSwitchTo(indexInfo->ii_Context);
+ ginstate = (GinState *) palloc(sizeof(GinState));
+ initGinState(ginstate, index);
+ indexInfo->ii_AmCache = (void *) ginstate;
+ MemoryContextSwitchTo(oldCtx);
+ }
+
+ insertCtx = AllocSetContextCreate(CurrentMemoryContext,
+ "Gin insert temporary context",
+ ALLOCSET_DEFAULT_SIZES);
+
+ oldCtx = MemoryContextSwitchTo(insertCtx);
+
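+	/*
+	 * With fastupdate enabled, the new entries are merely collected and
+	 * appended to the pending list here; they are merged into the main
+	 * entry tree later, when the pending list is cleaned up (by VACUUM, or
+	 * when it grows past gin_pending_list_limit).
+	 */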
+ if (GinGetUseFastUpdate(index))
+ {
+ GinTupleCollector collector;
+
+ memset(&collector, 0, sizeof(GinTupleCollector));
+
+ for (i = 0; i < ginstate->origTupdesc->natts; i++)
+ ginHeapTupleFastCollect(ginstate, &collector,
+ (OffsetNumber) (i + 1),
+ values[i], isnull[i],
+ ht_ctid);
+
+ ginHeapTupleFastInsert(ginstate, &collector);
+ }
+ else
+ {
+ for (i = 0; i < ginstate->origTupdesc->natts; i++)
+ ginHeapTupleInsert(ginstate, (OffsetNumber) (i + 1),
+ values[i], isnull[i],
+ ht_ctid);
+ }
+
+ MemoryContextSwitchTo(oldCtx);
+ MemoryContextDelete(insertCtx);
+
+ return false;
+}
diff --git a/src/backend/access/gin/ginlogic.c b/src/backend/access/gin/ginlogic.c
new file mode 100644
index 0000000..6bf3288
--- /dev/null
+++ b/src/backend/access/gin/ginlogic.c
@@ -0,0 +1,246 @@
+/*-------------------------------------------------------------------------
+ *
+ * ginlogic.c
+ * routines for performing binary- and ternary-logic consistent checks.
+ *
+ * A GIN operator class can provide a boolean or ternary consistent
+ * function, or both. This file provides both boolean and ternary
+ * interfaces to the rest of the GIN code, even if only one of them is
+ * implemented by the opclass.
+ *
+ * Providing a boolean interface when the opclass implements only the
+ * ternary function is straightforward - just call the ternary function
+ * with the check-array as is, and map the GIN_TRUE, GIN_FALSE, GIN_MAYBE
+ * return codes to TRUE, FALSE and TRUE+recheck, respectively. Providing
+ * a ternary interface when the opclass only implements a boolean function
+ * is implemented by calling the boolean function many times, with all the
+ * MAYBE arguments set to all combinations of TRUE and FALSE (up to a
+ * certain number of MAYBE arguments).
+ *
+ * (A boolean function is enough to determine if an item matches, but a
+ * GIN scan can apply various optimizations if it can determine that an
+ * item matches or doesn't match, even if it doesn't know if some of the
+ * keys are present or not. That's what the ternary consistent function
+ * is used for.)
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gin/ginlogic.c
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/gin_private.h"
+#include "access/reloptions.h"
+#include "catalog/pg_collation.h"
+#include "catalog/pg_type.h"
+#include "miscadmin.h"
+#include "storage/indexfsm.h"
+#include "storage/lmgr.h"
+
+
+/*
+ * Maximum number of MAYBE inputs that shimTriConsistentFn will try to
+ * resolve by calling all combinations.
+ */
+#define MAX_MAYBE_ENTRIES 4
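+/* (i.e. at most 2^4 = 16 calls to the boolean consistent function per check) */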
+
+/*
+ * Dummy consistent functions for an EVERYTHING key. Just claim it matches.
+ */
+static bool
+trueConsistentFn(GinScanKey key)
+{
+ key->recheckCurItem = false;
+ return true;
+}
+static GinTernaryValue
+trueTriConsistentFn(GinScanKey key)
+{
+ return GIN_TRUE;
+}
+
+/*
+ * A helper function for calling a regular, binary logic, consistent function.
+ */
+static bool
+directBoolConsistentFn(GinScanKey key)
+{
+ /*
+ * Initialize recheckCurItem in case the consistentFn doesn't know it
+ * should set it. The safe assumption in that case is to force recheck.
+ */
+ key->recheckCurItem = true;
+
+ return DatumGetBool(FunctionCall8Coll(key->consistentFmgrInfo,
+ key->collation,
+ PointerGetDatum(key->entryRes),
+ UInt16GetDatum(key->strategy),
+ key->query,
+ UInt32GetDatum(key->nuserentries),
+ PointerGetDatum(key->extra_data),
+ PointerGetDatum(&key->recheckCurItem),
+ PointerGetDatum(key->queryValues),
+ PointerGetDatum(key->queryCategories)));
+}
+
+/*
+ * A helper function for calling a native ternary logic consistent function.
+ */
+static GinTernaryValue
+directTriConsistentFn(GinScanKey key)
+{
+ return DatumGetGinTernaryValue(FunctionCall7Coll(key->triConsistentFmgrInfo,
+ key->collation,
+ PointerGetDatum(key->entryRes),
+ UInt16GetDatum(key->strategy),
+ key->query,
+ UInt32GetDatum(key->nuserentries),
+ PointerGetDatum(key->extra_data),
+ PointerGetDatum(key->queryValues),
+ PointerGetDatum(key->queryCategories)));
+}
+
+/*
+ * This function implements a binary logic consistency check, using a ternary
+ * logic consistent function provided by the opclass. GIN_MAYBE return value
+ * is interpreted as true with recheck flag.
+ */
+static bool
+shimBoolConsistentFn(GinScanKey key)
+{
+ GinTernaryValue result;
+
+ result = DatumGetGinTernaryValue(FunctionCall7Coll(key->triConsistentFmgrInfo,
+ key->collation,
+ PointerGetDatum(key->entryRes),
+ UInt16GetDatum(key->strategy),
+ key->query,
+ UInt32GetDatum(key->nuserentries),
+ PointerGetDatum(key->extra_data),
+ PointerGetDatum(key->queryValues),
+ PointerGetDatum(key->queryCategories)));
+ if (result == GIN_MAYBE)
+ {
+ key->recheckCurItem = true;
+ return true;
+ }
+ else
+ {
+ key->recheckCurItem = false;
+ return result;
+ }
+}
+
+/*
+ * This function implements a tri-state consistency check, using a boolean
+ * consistent function provided by the opclass.
+ *
+ * Our strategy is to call consistentFn with MAYBE inputs replaced with every
+ * combination of TRUE/FALSE. If consistentFn returns the same value for every
+ * combination, that's the overall result. Otherwise, return MAYBE. With n
+ * MAYBE inputs there are 2^n combinations to test, so this is only feasible
+ * for a small number of MAYBE inputs.
+ *
+ * NB: This function modifies the key->entryRes array!
+ */
+static GinTernaryValue
+shimTriConsistentFn(GinScanKey key)
+{
+ int nmaybe;
+ int maybeEntries[MAX_MAYBE_ENTRIES];
+ int i;
+ bool boolResult;
+ bool recheck = false;
+ GinTernaryValue curResult;
+
+ /*
+ * Count how many MAYBE inputs there are, and store their indexes in
+ * maybeEntries. If there are too many MAYBE inputs, it's not feasible to
+ * test all combinations, so give up and return MAYBE.
+ */
+ nmaybe = 0;
+ for (i = 0; i < key->nentries; i++)
+ {
+ if (key->entryRes[i] == GIN_MAYBE)
+ {
+ if (nmaybe >= MAX_MAYBE_ENTRIES)
+ return GIN_MAYBE;
+ maybeEntries[nmaybe++] = i;
+ }
+ }
+
+ /*
+	 * If none of the inputs were MAYBE, we can just call the consistent
+	 * function as is.
+ */
+ if (nmaybe == 0)
+ return directBoolConsistentFn(key);
+
+ /* First call consistent function with all the maybe-inputs set FALSE */
+ for (i = 0; i < nmaybe; i++)
+ key->entryRes[maybeEntries[i]] = GIN_FALSE;
+ curResult = directBoolConsistentFn(key);
+
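+	/*
+	 * Walk through the remaining TRUE/FALSE combinations in binary-counter
+	 * order.  With two MAYBE inputs, for example, the calls made are
+	 * (FALSE,FALSE) above, then (TRUE,FALSE), (FALSE,TRUE) and (TRUE,TRUE).
+	 */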
+ for (;;)
+ {
+ /* Twiddle the entries for next combination. */
+ for (i = 0; i < nmaybe; i++)
+ {
+ if (key->entryRes[maybeEntries[i]] == GIN_FALSE)
+ {
+ key->entryRes[maybeEntries[i]] = GIN_TRUE;
+ break;
+ }
+ else
+ key->entryRes[maybeEntries[i]] = GIN_FALSE;
+ }
+ if (i == nmaybe)
+ break;
+
+ boolResult = directBoolConsistentFn(key);
+ recheck |= key->recheckCurItem;
+
+ if (curResult != boolResult)
+ return GIN_MAYBE;
+ }
+
+ /* TRUE with recheck is taken to mean MAYBE */
+ if (curResult == GIN_TRUE && recheck)
+ curResult = GIN_MAYBE;
+
+ return curResult;
+}
+
+/*
+ * Set up the implementation of the consistent functions for a scan key.
+ */
+void
+ginInitConsistentFunction(GinState *ginstate, GinScanKey key)
+{
+ if (key->searchMode == GIN_SEARCH_MODE_EVERYTHING)
+ {
+ key->boolConsistentFn = trueConsistentFn;
+ key->triConsistentFn = trueTriConsistentFn;
+ }
+ else
+ {
+ key->consistentFmgrInfo = &ginstate->consistentFn[key->attnum - 1];
+ key->triConsistentFmgrInfo = &ginstate->triConsistentFn[key->attnum - 1];
+ key->collation = ginstate->supportCollation[key->attnum - 1];
+
+ if (OidIsValid(ginstate->consistentFn[key->attnum - 1].fn_oid))
+ key->boolConsistentFn = directBoolConsistentFn;
+ else
+ key->boolConsistentFn = shimBoolConsistentFn;
+
+ if (OidIsValid(ginstate->triConsistentFn[key->attnum - 1].fn_oid))
+ key->triConsistentFn = directTriConsistentFn;
+ else
+ key->triConsistentFn = shimTriConsistentFn;
+ }
+}
diff --git a/src/backend/access/gin/ginpostinglist.c b/src/backend/access/gin/ginpostinglist.c
new file mode 100644
index 0000000..216b2b9
--- /dev/null
+++ b/src/backend/access/gin/ginpostinglist.c
@@ -0,0 +1,434 @@
+/*-------------------------------------------------------------------------
+ *
+ * ginpostinglist.c
+ * routines for dealing with posting lists.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gin/ginpostinglist.c
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/gin_private.h"
+
+#ifdef USE_ASSERT_CHECKING
+#define CHECK_ENCODING_ROUNDTRIP
+#endif
+
+/*
+ * For encoding purposes, item pointers are represented as 64-bit unsigned
+ * integers. The lowest 11 bits represent the offset number, and the next
+ * lowest 32 bits are the block number. That leaves 21 bits unused, i.e.
+ * only 43 low bits are used.
+ *
+ * 11 bits is enough for the offset number, because MaxHeapTuplesPerPage <
+ * 2^11 on all supported block sizes. We are frugal with the bits, because
+ * smaller integers use fewer bytes in the varbyte encoding, saving disk
+ * space. (If we get a new table AM in the future that wants to use the full
+ * range of possible offset numbers, we'll need to change this.)
+ *
+ * These 43-bit integers are encoded using varbyte encoding. In each byte,
+ * the 7 low bits contain data, while the highest bit is a continuation bit.
+ * When the continuation bit is set, the next byte is part of the same
+ * integer, otherwise this is the last byte of this integer. 43 bits need
+ * at most 7 bytes in this encoding:
+ *
+ * 0XXXXXXX
+ * 1XXXXXXX 0XXXXYYY
+ * 1XXXXXXX 1XXXXYYY 0YYYYYYY
+ * 1XXXXXXX 1XXXXYYY 1YYYYYYY 0YYYYYYY
+ * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
+ * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
+ * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0uuuuuuY
+ *
+ * X = bits used for offset number
+ * Y = bits used for block number
+ * u = unused bit
+ *
+ * The bytes are stored in little-endian order.
+ *
+ * An important property of this encoding is that removing an item from the list
+ * never increases the size of the resulting compressed posting list. Proof:
+ *
+ * Removing a number is really the replacement of two numbers (the deltas on
+ * either side of the removed item) with their sum. We have to prove that the
+ * varbyte encoding of a sum can't be longer than the varbyte encodings of its
+ * summands. The sum of two numbers is at most one bit wider than the larger
+ * of the summands, and widening a number by one bit enlarges its varbyte
+ * encoding by at most one byte. Therefore, the varbyte encoding of the sum is
+ * at most one byte longer than that of the larger summand. The lesser summand
+ * takes at least one byte, so the sum cannot take more space than the
+ * summands. Q.E.D.
+ *
+ * This property greatly simplifies VACUUM, which can assume that posting
+ * lists always fit on the same page after vacuuming. Note that even though
+ * that holds for removing items from a posting list, you must also be
+ * careful not to cause expansion, e.g. when merging uncompressed items on
+ * the page into the compressed lists during vacuum.
+ */
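+
+/*
+ * Worked example (illustrative values): the item pointer (block 4, offset 3)
+ * maps to the 64-bit value (4 << 11) | 3 = 8195.  If the preceding item in
+ * the list is (block 3, offset 10), i.e. 6154, the delta 2041 is stored for
+ * this item, which varbyte-encodes to the two bytes 0xF9 0x0F (the low 7
+ * bits plus the continuation bit, then the remaining 4 bits).
+ */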
+
+/*
+ * How many bits do you need to encode the offset number? OffsetNumber is a 16-bit
+ * integer, but you can't fit that many items on a page. 11 ought to be more
+ * than enough. It's tempting to derive this from MaxHeapTuplesPerPage, and
+ * use the minimum number of bits, but that would require changing the on-disk
+ * format if MaxHeapTuplesPerPage changes. Better to leave some slack.
+ */
+#define MaxHeapTuplesPerPageBits 11
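+
+/*
+ * For reference: with the default 8 kB block size, MaxHeapTuplesPerPage is
+ * 291, well under 2^11 = 2048.
+ */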
+
+/* Max. number of bytes needed to encode the largest supported integer. */
+#define MaxBytesPerInteger 7
+
+static inline uint64
+itemptr_to_uint64(const ItemPointer iptr)
+{
+ uint64 val;
+
+ Assert(ItemPointerIsValid(iptr));
+ Assert(GinItemPointerGetOffsetNumber(iptr) < (1 << MaxHeapTuplesPerPageBits));
+
+ val = GinItemPointerGetBlockNumber(iptr);
+ val <<= MaxHeapTuplesPerPageBits;
+ val |= GinItemPointerGetOffsetNumber(iptr);
+
+ return val;
+}
+
+static inline void
+uint64_to_itemptr(uint64 val, ItemPointer iptr)
+{
+ GinItemPointerSetOffsetNumber(iptr, val & ((1 << MaxHeapTuplesPerPageBits) - 1));
+ val = val >> MaxHeapTuplesPerPageBits;
+ GinItemPointerSetBlockNumber(iptr, val);
+
+ Assert(ItemPointerIsValid(iptr));
+}
+
+/*
+ * Varbyte-encode 'val' into *ptr. *ptr is incremented to next integer.
+ */
+static void
+encode_varbyte(uint64 val, unsigned char **ptr)
+{
+ unsigned char *p = *ptr;
+
+ while (val > 0x7F)
+ {
+ *(p++) = 0x80 | (val & 0x7F);
+ val >>= 7;
+ }
+ *(p++) = (unsigned char) val;
+
+ *ptr = p;
+}
+
+/*
+ * Decode varbyte-encoded integer at *ptr. *ptr is incremented to next integer.
+ */
+static uint64
+decode_varbyte(unsigned char **ptr)
+{
+ uint64 val;
+ unsigned char *p = *ptr;
+ uint64 c;
+
+ /* 1st byte */
+ c = *(p++);
+ val = c & 0x7F;
+ if (c & 0x80)
+ {
+ /* 2nd byte */
+ c = *(p++);
+ val |= (c & 0x7F) << 7;
+ if (c & 0x80)
+ {
+ /* 3rd byte */
+ c = *(p++);
+ val |= (c & 0x7F) << 14;
+ if (c & 0x80)
+ {
+ /* 4th byte */
+ c = *(p++);
+ val |= (c & 0x7F) << 21;
+ if (c & 0x80)
+ {
+ /* 5th byte */
+ c = *(p++);
+ val |= (c & 0x7F) << 28;
+ if (c & 0x80)
+ {
+ /* 6th byte */
+ c = *(p++);
+ val |= (c & 0x7F) << 35;
+ if (c & 0x80)
+ {
+ /* 7th byte, should not have continuation bit */
+ c = *(p++);
+ val |= c << 42;
+ Assert((c & 0x80) == 0);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ *ptr = p;
+
+ return val;
+}
+
+/*
+ * Encode a posting list.
+ *
+ * The encoded list is returned in a palloc'd struct, which will be at most
+ * 'maxsize' bytes in size. The number of items in the returned segment is
+ * returned in *nwritten. If it's not equal to nipd, not all the items fit
+ * in 'maxsize', and only the first *nwritten were encoded.
+ *
+ * The allocated size of the returned struct is short-aligned, and the padding
+ * byte at the end, if any, is zero.
+ */
+GinPostingList *
+ginCompressPostingList(const ItemPointer ipd, int nipd, int maxsize,
+ int *nwritten)
+{
+ uint64 prev;
+ int totalpacked = 0;
+ int maxbytes;
+ GinPostingList *result;
+ unsigned char *ptr;
+ unsigned char *endptr;
+
+ maxsize = SHORTALIGN_DOWN(maxsize);
+
+ result = palloc(maxsize);
+
+ maxbytes = maxsize - offsetof(GinPostingList, bytes);
+ Assert(maxbytes > 0);
+
+ /* Store the first special item */
+ result->first = ipd[0];
+
+ prev = itemptr_to_uint64(&result->first);
+
+ ptr = result->bytes;
+ endptr = result->bytes + maxbytes;
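+
+	/*
+	 * Encode the remaining items as varbyte deltas from the previous item's
+	 * 64-bit representation; the items must therefore be strictly ascending.
+	 */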
+ for (totalpacked = 1; totalpacked < nipd; totalpacked++)
+ {
+ uint64 val = itemptr_to_uint64(&ipd[totalpacked]);
+ uint64 delta = val - prev;
+
+ Assert(val > prev);
+
+ if (endptr - ptr >= MaxBytesPerInteger)
+ encode_varbyte(delta, &ptr);
+ else
+ {
+ /*
+	 * There are fewer than 7 bytes left. We have to check whether the next
+ * item fits in that space before writing it out.
+ */
+ unsigned char buf[MaxBytesPerInteger];
+ unsigned char *p = buf;
+
+ encode_varbyte(delta, &p);
+ if (p - buf > (endptr - ptr))
+ break; /* output is full */
+
+ memcpy(ptr, buf, p - buf);
+ ptr += (p - buf);
+ }
+ prev = val;
+ }
+ result->nbytes = ptr - result->bytes;
+
+ /*
+ * If we wrote an odd number of bytes, zero out the padding byte at the
+ * end.
+ */
+ if (result->nbytes != SHORTALIGN(result->nbytes))
+ result->bytes[result->nbytes] = 0;
+
+ if (nwritten)
+ *nwritten = totalpacked;
+
+ Assert(SizeOfGinPostingList(result) <= maxsize);
+
+ /*
+ * Check that the encoded segment decodes back to the original items.
+ */
+#if defined (CHECK_ENCODING_ROUNDTRIP)
+ {
+ int ndecoded;
+ ItemPointer tmp = ginPostingListDecode(result, &ndecoded);
+
+ Assert(ndecoded == totalpacked);
+ Assert(memcmp(tmp, ipd, ndecoded * sizeof(ItemPointerData)) == 0);
+ pfree(tmp);
+ }
+#endif
+
+ return result;
+}
+
+/*
+ * Decode a compressed posting list into an array of item pointers.
+ * The number of items is returned in *ndecoded.
+ */
+ItemPointer
+ginPostingListDecode(GinPostingList *plist, int *ndecoded)
+{
+ return ginPostingListDecodeAllSegments(plist,
+ SizeOfGinPostingList(plist),
+ ndecoded);
+}
+
+/*
+ * Decode multiple posting list segments into an array of item pointers.
+ * The number of items is returned in *ndecoded_out. The segments are stored
+ * one after another, with a total size of 'len' bytes.
+ */
+ItemPointer
+ginPostingListDecodeAllSegments(GinPostingList *segment, int len, int *ndecoded_out)
+{
+ ItemPointer result;
+ int nallocated;
+ uint64 val;
+ char *endseg = ((char *) segment) + len;
+ int ndecoded;
+ unsigned char *ptr;
+ unsigned char *endptr;
+
+ /*
+ * Guess an initial size of the array.
+ */
+ nallocated = segment->nbytes * 2 + 1;
+ result = palloc(nallocated * sizeof(ItemPointerData));
+
+ ndecoded = 0;
+ while ((char *) segment < endseg)
+ {
+ /* enlarge output array if needed */
+ if (ndecoded >= nallocated)
+ {
+ nallocated *= 2;
+ result = repalloc(result, nallocated * sizeof(ItemPointerData));
+ }
+
+ /* copy the first item */
+ Assert(OffsetNumberIsValid(ItemPointerGetOffsetNumber(&segment->first)));
+ Assert(ndecoded == 0 || ginCompareItemPointers(&segment->first, &result[ndecoded - 1]) > 0);
+ result[ndecoded] = segment->first;
+ ndecoded++;
+
+ val = itemptr_to_uint64(&segment->first);
+ ptr = segment->bytes;
+ endptr = segment->bytes + segment->nbytes;
+ while (ptr < endptr)
+ {
+ /* enlarge output array if needed */
+ if (ndecoded >= nallocated)
+ {
+ nallocated *= 2;
+ result = repalloc(result, nallocated * sizeof(ItemPointerData));
+ }
+
+ val += decode_varbyte(&ptr);
+
+ uint64_to_itemptr(val, &result[ndecoded]);
+ ndecoded++;
+ }
+ segment = GinNextPostingListSegment(segment);
+ }
+
+ if (ndecoded_out)
+ *ndecoded_out = ndecoded;
+ return result;
+}
+
+/*
+ * Add all item pointers from a bunch of posting lists to a TIDBitmap.
+ */
+int
+ginPostingListDecodeAllSegmentsToTbm(GinPostingList *ptr, int len,
+ TIDBitmap *tbm)
+{
+ int ndecoded;
+ ItemPointer items;
+
+ items = ginPostingListDecodeAllSegments(ptr, len, &ndecoded);
+ tbm_add_tuples(tbm, items, ndecoded, false);
+ pfree(items);
+
+ return ndecoded;
+}
+
+/*
+ * Merge two ordered arrays of itempointers, eliminating any duplicates.
+ *
+ * Returns a palloc'd array, and *nmerged is set to the number of items in
+ * the result, after eliminating duplicates.
+ */
+ItemPointer
+ginMergeItemPointers(ItemPointerData *a, uint32 na,
+ ItemPointerData *b, uint32 nb,
+ int *nmerged)
+{
+ ItemPointerData *dst;
+
+ dst = (ItemPointer) palloc((na + nb) * sizeof(ItemPointerData));
+
+ /*
+ * If the argument arrays don't overlap, we can just append them to each
+ * other.
+ */
+ if (na == 0 || nb == 0 || ginCompareItemPointers(&a[na - 1], &b[0]) < 0)
+ {
+ memcpy(dst, a, na * sizeof(ItemPointerData));
+ memcpy(&dst[na], b, nb * sizeof(ItemPointerData));
+ *nmerged = na + nb;
+ }
+ else if (ginCompareItemPointers(&b[nb - 1], &a[0]) < 0)
+ {
+ memcpy(dst, b, nb * sizeof(ItemPointerData));
+ memcpy(&dst[nb], a, na * sizeof(ItemPointerData));
+ *nmerged = na + nb;
+ }
+ else
+ {
+ ItemPointerData *dptr = dst;
+ ItemPointerData *aptr = a;
+ ItemPointerData *bptr = b;
+
+ while (aptr - a < na && bptr - b < nb)
+ {
+ int cmp = ginCompareItemPointers(aptr, bptr);
+
+ if (cmp > 0)
+ *dptr++ = *bptr++;
+ else if (cmp == 0)
+ {
+ /* only keep one copy of the identical items */
+ *dptr++ = *bptr++;
+ aptr++;
+ }
+ else
+ *dptr++ = *aptr++;
+ }
+
+ while (aptr - a < na)
+ *dptr++ = *aptr++;
+
+ while (bptr - b < nb)
+ *dptr++ = *bptr++;
+
+ *nmerged = dptr - dst;
+ }
+
+ return dst;
+}
diff --git a/src/backend/access/gin/ginscan.c b/src/backend/access/gin/ginscan.c
new file mode 100644
index 0000000..55e2d49
--- /dev/null
+++ b/src/backend/access/gin/ginscan.c
@@ -0,0 +1,468 @@
+/*-------------------------------------------------------------------------
+ *
+ * ginscan.c
+ * routines to manage scans of inverted index relations
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gin/ginscan.c
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/gin_private.h"
+#include "access/relscan.h"
+#include "pgstat.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+
+IndexScanDesc
+ginbeginscan(Relation rel, int nkeys, int norderbys)
+{
+ IndexScanDesc scan;
+ GinScanOpaque so;
+
+ /* no order by operators allowed */
+ Assert(norderbys == 0);
+
+ scan = RelationGetIndexScan(rel, nkeys, norderbys);
+
+ /* allocate private workspace */
+ so = (GinScanOpaque) palloc(sizeof(GinScanOpaqueData));
+ so->keys = NULL;
+ so->nkeys = 0;
+ so->tempCtx = AllocSetContextCreate(CurrentMemoryContext,
+ "Gin scan temporary context",
+ ALLOCSET_DEFAULT_SIZES);
+ so->keyCtx = AllocSetContextCreate(CurrentMemoryContext,
+ "Gin scan key context",
+ ALLOCSET_DEFAULT_SIZES);
+ initGinState(&so->ginstate, scan->indexRelation);
+
+ scan->opaque = so;
+
+ return scan;
+}
+
+/*
+ * Create a new GinScanEntry, unless an equivalent one already exists,
+ * in which case just return it
+ */
+static GinScanEntry
+ginFillScanEntry(GinScanOpaque so, OffsetNumber attnum,
+ StrategyNumber strategy, int32 searchMode,
+ Datum queryKey, GinNullCategory queryCategory,
+ bool isPartialMatch, Pointer extra_data)
+{
+ GinState *ginstate = &so->ginstate;
+ GinScanEntry scanEntry;
+ uint32 i;
+
+ /*
+ * Look for an existing equivalent entry.
+ *
+ * Entries with non-null extra_data are never considered identical, since
+ * we can't know exactly what the opclass might be doing with that.
+ */
+ if (extra_data == NULL)
+ {
+ for (i = 0; i < so->totalentries; i++)
+ {
+ GinScanEntry prevEntry = so->entries[i];
+
+ if (prevEntry->extra_data == NULL &&
+ prevEntry->isPartialMatch == isPartialMatch &&
+ prevEntry->strategy == strategy &&
+ prevEntry->searchMode == searchMode &&
+ prevEntry->attnum == attnum &&
+ ginCompareEntries(ginstate, attnum,
+ prevEntry->queryKey,
+ prevEntry->queryCategory,
+ queryKey,
+ queryCategory) == 0)
+ {
+ /* Successful match */
+ return prevEntry;
+ }
+ }
+ }
+
+ /* Nope, create a new entry */
+ scanEntry = (GinScanEntry) palloc(sizeof(GinScanEntryData));
+ scanEntry->queryKey = queryKey;
+ scanEntry->queryCategory = queryCategory;
+ scanEntry->isPartialMatch = isPartialMatch;
+ scanEntry->extra_data = extra_data;
+ scanEntry->strategy = strategy;
+ scanEntry->searchMode = searchMode;
+ scanEntry->attnum = attnum;
+
+ scanEntry->buffer = InvalidBuffer;
+ ItemPointerSetMin(&scanEntry->curItem);
+ scanEntry->matchBitmap = NULL;
+ scanEntry->matchIterator = NULL;
+ scanEntry->matchResult = NULL;
+ scanEntry->list = NULL;
+ scanEntry->nlist = 0;
+ scanEntry->offset = InvalidOffsetNumber;
+ scanEntry->isFinished = false;
+ scanEntry->reduceResult = false;
+
+ /* Add it to so's array */
+ if (so->totalentries >= so->allocentries)
+ {
+ so->allocentries *= 2;
+ so->entries = (GinScanEntry *)
+ repalloc(so->entries, so->allocentries * sizeof(GinScanEntry));
+ }
+ so->entries[so->totalentries++] = scanEntry;
+
+ return scanEntry;
+}
+
+/*
+ * Append hidden scan entry of given category to the scan key.
+ *
+ * NB: this had better be called at most once per scan key, since
+ * ginFillScanKey leaves room for only one hidden entry. Currently,
+ * it seems sufficiently clear that this is true that we don't bother
+ * with any cross-check logic.
+ */
+static void
+ginScanKeyAddHiddenEntry(GinScanOpaque so, GinScanKey key,
+ GinNullCategory queryCategory)
+{
+ int i = key->nentries++;
+
+ /* strategy is of no interest because this is not a partial-match item */
+ key->scanEntry[i] = ginFillScanEntry(so, key->attnum,
+ InvalidStrategy, key->searchMode,
+ (Datum) 0, queryCategory,
+ false, NULL);
+}
+
+/*
+ * Initialize the next GinScanKey using the output from the extractQueryFn
+ */
+static void
+ginFillScanKey(GinScanOpaque so, OffsetNumber attnum,
+ StrategyNumber strategy, int32 searchMode,
+ Datum query, uint32 nQueryValues,
+ Datum *queryValues, GinNullCategory *queryCategories,
+ bool *partial_matches, Pointer *extra_data)
+{
+ GinScanKey key = &(so->keys[so->nkeys++]);
+ GinState *ginstate = &so->ginstate;
+ uint32 i;
+
+ key->nentries = nQueryValues;
+ key->nuserentries = nQueryValues;
+
+ /* Allocate one extra array slot for possible "hidden" entry */
+ key->scanEntry = (GinScanEntry *) palloc(sizeof(GinScanEntry) *
+ (nQueryValues + 1));
+ key->entryRes = (GinTernaryValue *) palloc0(sizeof(GinTernaryValue) *
+ (nQueryValues + 1));
+
+ key->query = query;
+ key->queryValues = queryValues;
+ key->queryCategories = queryCategories;
+ key->extra_data = extra_data;
+ key->strategy = strategy;
+ key->searchMode = searchMode;
+ key->attnum = attnum;
+
+ /*
+ * Initially, scan keys of GIN_SEARCH_MODE_ALL mode are marked
+ * excludeOnly. This might get changed later.
+ */
+ key->excludeOnly = (searchMode == GIN_SEARCH_MODE_ALL);
+
+ ItemPointerSetMin(&key->curItem);
+ key->curItemMatches = false;
+ key->recheckCurItem = false;
+ key->isFinished = false;
+ key->nrequired = 0;
+ key->nadditional = 0;
+ key->requiredEntries = NULL;
+ key->additionalEntries = NULL;
+
+ ginInitConsistentFunction(ginstate, key);
+
+ /* Set up normal scan entries using extractQueryFn's outputs */
+ for (i = 0; i < nQueryValues; i++)
+ {
+ Datum queryKey;
+ GinNullCategory queryCategory;
+ bool isPartialMatch;
+ Pointer this_extra;
+
+ queryKey = queryValues[i];
+ queryCategory = queryCategories[i];
+ isPartialMatch =
+ (ginstate->canPartialMatch[attnum - 1] && partial_matches)
+ ? partial_matches[i] : false;
+ this_extra = (extra_data) ? extra_data[i] : NULL;
+
+ key->scanEntry[i] = ginFillScanEntry(so, attnum,
+ strategy, searchMode,
+ queryKey, queryCategory,
+ isPartialMatch, this_extra);
+ }
+
+ /*
+ * For GIN_SEARCH_MODE_INCLUDE_EMPTY and GIN_SEARCH_MODE_EVERYTHING search
+ * modes, we add the "hidden" entry immediately. GIN_SEARCH_MODE_ALL is
+ * handled later, since we might be able to omit the hidden entry for it.
+ */
+ if (searchMode == GIN_SEARCH_MODE_INCLUDE_EMPTY)
+ ginScanKeyAddHiddenEntry(so, key, GIN_CAT_EMPTY_ITEM);
+ else if (searchMode == GIN_SEARCH_MODE_EVERYTHING)
+ ginScanKeyAddHiddenEntry(so, key, GIN_CAT_EMPTY_QUERY);
+}
+
+/*
+ * Release current scan keys, if any.
+ */
+void
+ginFreeScanKeys(GinScanOpaque so)
+{
+ uint32 i;
+
+ if (so->keys == NULL)
+ return;
+
+ for (i = 0; i < so->totalentries; i++)
+ {
+ GinScanEntry entry = so->entries[i];
+
+ if (entry->buffer != InvalidBuffer)
+ ReleaseBuffer(entry->buffer);
+ if (entry->list)
+ pfree(entry->list);
+ if (entry->matchIterator)
+ tbm_end_iterate(entry->matchIterator);
+ if (entry->matchBitmap)
+ tbm_free(entry->matchBitmap);
+ }
+
+ MemoryContextResetAndDeleteChildren(so->keyCtx);
+
+ so->keys = NULL;
+ so->nkeys = 0;
+ so->entries = NULL;
+ so->totalentries = 0;
+}
+
+void
+ginNewScanKey(IndexScanDesc scan)
+{
+ ScanKey scankey = scan->keyData;
+ GinScanOpaque so = (GinScanOpaque) scan->opaque;
+ int i;
+ bool hasNullQuery = false;
+ bool attrHasNormalScan[INDEX_MAX_KEYS] = {false};
+ MemoryContext oldCtx;
+
+ /*
+ * Allocate all the scan key information in the key context. (If
+ * extractQuery leaks anything there, it won't be reset until the end of
+ * scan or rescan, but that's OK.)
+ */
+ oldCtx = MemoryContextSwitchTo(so->keyCtx);
+
+ /* if no scan keys provided, allocate extra EVERYTHING GinScanKey */
+ so->keys = (GinScanKey)
+ palloc(Max(scan->numberOfKeys, 1) * sizeof(GinScanKeyData));
+ so->nkeys = 0;
+
+ /* initialize expansible array of GinScanEntry pointers */
+ so->totalentries = 0;
+ so->allocentries = 32;
+ so->entries = (GinScanEntry *)
+ palloc(so->allocentries * sizeof(GinScanEntry));
+
+ so->isVoidRes = false;
+
+ for (i = 0; i < scan->numberOfKeys; i++)
+ {
+ ScanKey skey = &scankey[i];
+ Datum *queryValues;
+ int32 nQueryValues = 0;
+ bool *partial_matches = NULL;
+ Pointer *extra_data = NULL;
+ bool *nullFlags = NULL;
+ GinNullCategory *categories;
+ int32 searchMode = GIN_SEARCH_MODE_DEFAULT;
+
+ /*
+ * We assume that GIN-indexable operators are strict, so a null query
+ * argument means an unsatisfiable query.
+ */
+ if (skey->sk_flags & SK_ISNULL)
+ {
+ so->isVoidRes = true;
+ break;
+ }
+
+ /* OK to call the extractQueryFn */
+ queryValues = (Datum *)
+ DatumGetPointer(FunctionCall7Coll(&so->ginstate.extractQueryFn[skey->sk_attno - 1],
+ so->ginstate.supportCollation[skey->sk_attno - 1],
+ skey->sk_argument,
+ PointerGetDatum(&nQueryValues),
+ UInt16GetDatum(skey->sk_strategy),
+ PointerGetDatum(&partial_matches),
+ PointerGetDatum(&extra_data),
+ PointerGetDatum(&nullFlags),
+ PointerGetDatum(&searchMode)));
+
+ /*
+ * If bogus searchMode is returned, treat as GIN_SEARCH_MODE_ALL; note
+ * in particular we don't allow extractQueryFn to select
+ * GIN_SEARCH_MODE_EVERYTHING.
+ */
+ if (searchMode < GIN_SEARCH_MODE_DEFAULT ||
+ searchMode > GIN_SEARCH_MODE_ALL)
+ searchMode = GIN_SEARCH_MODE_ALL;
+
+ /* Non-default modes require the index to have placeholders */
+ if (searchMode != GIN_SEARCH_MODE_DEFAULT)
+ hasNullQuery = true;
+
+ /*
+ * In default mode, no keys means an unsatisfiable query.
+ */
+ if (queryValues == NULL || nQueryValues <= 0)
+ {
+ if (searchMode == GIN_SEARCH_MODE_DEFAULT)
+ {
+ so->isVoidRes = true;
+ break;
+ }
+ nQueryValues = 0; /* ensure sane value */
+ }
+
+ /*
+ * Create GinNullCategory representation. If the extractQueryFn
+ * didn't create a nullFlags array, we assume everything is non-null.
+ * While at it, detect whether any null keys are present.
+ */
+ categories = (GinNullCategory *) palloc0(nQueryValues * sizeof(GinNullCategory));
+ if (nullFlags)
+ {
+ int32 j;
+
+ for (j = 0; j < nQueryValues; j++)
+ {
+ if (nullFlags[j])
+ {
+ categories[j] = GIN_CAT_NULL_KEY;
+ hasNullQuery = true;
+ }
+ }
+ }
+
+ ginFillScanKey(so, skey->sk_attno,
+ skey->sk_strategy, searchMode,
+ skey->sk_argument, nQueryValues,
+ queryValues, categories,
+ partial_matches, extra_data);
+
+ /* Remember if we had any non-excludeOnly keys */
+ if (searchMode != GIN_SEARCH_MODE_ALL)
+ attrHasNormalScan[skey->sk_attno - 1] = true;
+ }
+
+ /*
+ * Processing GIN_SEARCH_MODE_ALL scan keys requires us to make a second
+ * pass over the scan keys. Above we marked each such scan key as
+ * excludeOnly. If the involved column has any normal (not excludeOnly)
+ * scan key as well, then we can leave it like that. Otherwise, one
+ * excludeOnly scan key must receive a GIN_CAT_EMPTY_QUERY hidden entry
+ * and be set to normal (excludeOnly = false).
+ */
+ for (i = 0; i < so->nkeys; i++)
+ {
+ GinScanKey key = &so->keys[i];
+
+ if (key->searchMode != GIN_SEARCH_MODE_ALL)
+ continue;
+
+ if (!attrHasNormalScan[key->attnum - 1])
+ {
+ key->excludeOnly = false;
+ ginScanKeyAddHiddenEntry(so, key, GIN_CAT_EMPTY_QUERY);
+ attrHasNormalScan[key->attnum - 1] = true;
+ }
+ }
+
+ /*
+ * If there are no regular scan keys, generate an EVERYTHING scankey to
+ * drive a full-index scan.
+ */
+ if (so->nkeys == 0 && !so->isVoidRes)
+ {
+ hasNullQuery = true;
+ ginFillScanKey(so, FirstOffsetNumber,
+ InvalidStrategy, GIN_SEARCH_MODE_EVERYTHING,
+ (Datum) 0, 0,
+ NULL, NULL, NULL, NULL);
+ }
+
+ /*
+ * If the index is version 0, it may be missing null and placeholder
+ * entries, which would render searches for nulls and full-index scans
+ * unreliable. Throw an error if so.
+ */
+ if (hasNullQuery && !so->isVoidRes)
+ {
+ GinStatsData ginStats;
+
+ ginGetStats(scan->indexRelation, &ginStats);
+ if (ginStats.ginVersion < 1)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("old GIN indexes do not support whole-index scans nor searches for nulls"),
+ errhint("To fix this, do REINDEX INDEX \"%s\".",
+ RelationGetRelationName(scan->indexRelation))));
+ }
+
+ MemoryContextSwitchTo(oldCtx);
+
+ pgstat_count_index_scan(scan->indexRelation);
+}
+
+void
+ginrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
+ ScanKey orderbys, int norderbys)
+{
+ GinScanOpaque so = (GinScanOpaque) scan->opaque;
+
+ ginFreeScanKeys(so);
+
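+	/*
+	 * The new scan keys are only stashed here; ginNewScanKey() rebuilds the
+	 * GinScanKey/GinScanEntry state from them when the scan is next driven
+	 * (from gingetbitmap).
+	 */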
+ if (scankey && scan->numberOfKeys > 0)
+ {
+ memmove(scan->keyData, scankey,
+ scan->numberOfKeys * sizeof(ScanKeyData));
+ }
+}
+
+
+void
+ginendscan(IndexScanDesc scan)
+{
+ GinScanOpaque so = (GinScanOpaque) scan->opaque;
+
+ ginFreeScanKeys(so);
+
+ MemoryContextDelete(so->tempCtx);
+ MemoryContextDelete(so->keyCtx);
+
+ pfree(so);
+}
diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c
new file mode 100644
index 0000000..cdd626f
--- /dev/null
+++ b/src/backend/access/gin/ginutil.c
@@ -0,0 +1,707 @@
+/*-------------------------------------------------------------------------
+ *
+ * ginutil.c
+ * Utility routines for the Postgres inverted index access method.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gin/ginutil.c
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/gin_private.h"
+#include "access/ginxlog.h"
+#include "access/reloptions.h"
+#include "access/xloginsert.h"
+#include "catalog/pg_collation.h"
+#include "catalog/pg_type.h"
+#include "commands/vacuum.h"
+#include "miscadmin.h"
+#include "storage/indexfsm.h"
+#include "storage/lmgr.h"
+#include "storage/predicate.h"
+#include "utils/builtins.h"
+#include "utils/index_selfuncs.h"
+#include "utils/typcache.h"
+
+
+/*
+ * GIN handler function: return IndexAmRoutine with access method parameters
+ * and callbacks.
+ */
+Datum
+ginhandler(PG_FUNCTION_ARGS)
+{
+ IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
+
+ amroutine->amstrategies = 0;
+ amroutine->amsupport = GINNProcs;
+ amroutine->amoptsprocnum = GIN_OPTIONS_PROC;
+ amroutine->amcanorder = false;
+ amroutine->amcanorderbyop = false;
+ amroutine->amcanbackward = false;
+ amroutine->amcanunique = false;
+ amroutine->amcanmulticol = true;
+ amroutine->amoptionalkey = true;
+ amroutine->amsearcharray = false;
+ amroutine->amsearchnulls = false;
+ amroutine->amstorage = true;
+ amroutine->amclusterable = false;
+ amroutine->ampredlocks = true;
+ amroutine->amcanparallel = false;
+ amroutine->amcaninclude = false;
+ amroutine->amusemaintenanceworkmem = true;
+ amroutine->amparallelvacuumoptions =
+ VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_CLEANUP;
+ amroutine->amkeytype = InvalidOid;
+
+ amroutine->ambuild = ginbuild;
+ amroutine->ambuildempty = ginbuildempty;
+ amroutine->aminsert = gininsert;
+ amroutine->ambulkdelete = ginbulkdelete;
+ amroutine->amvacuumcleanup = ginvacuumcleanup;
+ amroutine->amcanreturn = NULL;
+ amroutine->amcostestimate = gincostestimate;
+ amroutine->amoptions = ginoptions;
+ amroutine->amproperty = NULL;
+ amroutine->ambuildphasename = NULL;
+ amroutine->amvalidate = ginvalidate;
+ amroutine->amadjustmembers = ginadjustmembers;
+ amroutine->ambeginscan = ginbeginscan;
+ amroutine->amrescan = ginrescan;
+ amroutine->amgettuple = NULL;
+ amroutine->amgetbitmap = gingetbitmap;
+ amroutine->amendscan = ginendscan;
+ amroutine->ammarkpos = NULL;
+ amroutine->amrestrpos = NULL;
+ amroutine->amestimateparallelscan = NULL;
+ amroutine->aminitparallelscan = NULL;
+ amroutine->amparallelrescan = NULL;
+
+ PG_RETURN_POINTER(amroutine);
+}
+
+/*
+ * initGinState: fill in an empty GinState struct to describe the index
+ *
+ * Note: assorted subsidiary data is allocated in the CurrentMemoryContext.
+ */
+void
+initGinState(GinState *state, Relation index)
+{
+ TupleDesc origTupdesc = RelationGetDescr(index);
+ int i;
+
+ MemSet(state, 0, sizeof(GinState));
+
+ state->index = index;
+ state->oneCol = (origTupdesc->natts == 1) ? true : false;
+ state->origTupdesc = origTupdesc;
+
+ for (i = 0; i < origTupdesc->natts; i++)
+ {
+ Form_pg_attribute attr = TupleDescAttr(origTupdesc, i);
+
+ if (state->oneCol)
+ state->tupdesc[i] = state->origTupdesc;
+ else
+ {
+ state->tupdesc[i] = CreateTemplateTupleDesc(2);
+
+ TupleDescInitEntry(state->tupdesc[i], (AttrNumber) 1, NULL,
+ INT2OID, -1, 0);
+ TupleDescInitEntry(state->tupdesc[i], (AttrNumber) 2, NULL,
+ attr->atttypid,
+ attr->atttypmod,
+ attr->attndims);
+ TupleDescInitEntryCollation(state->tupdesc[i], (AttrNumber) 2,
+ attr->attcollation);
+ }
+
+ /*
+ * If the compare proc isn't specified in the opclass definition, look
+ * up the index key type's default btree comparator.
+ */
+ if (index_getprocid(index, i + 1, GIN_COMPARE_PROC) != InvalidOid)
+ {
+ fmgr_info_copy(&(state->compareFn[i]),
+ index_getprocinfo(index, i + 1, GIN_COMPARE_PROC),
+ CurrentMemoryContext);
+ }
+ else
+ {
+ TypeCacheEntry *typentry;
+
+ typentry = lookup_type_cache(attr->atttypid,
+ TYPECACHE_CMP_PROC_FINFO);
+ if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_FUNCTION),
+ errmsg("could not identify a comparison function for type %s",
+ format_type_be(attr->atttypid))));
+ fmgr_info_copy(&(state->compareFn[i]),
+ &(typentry->cmp_proc_finfo),
+ CurrentMemoryContext);
+ }
+
+ /* Opclass must always provide extract procs */
+ fmgr_info_copy(&(state->extractValueFn[i]),
+ index_getprocinfo(index, i + 1, GIN_EXTRACTVALUE_PROC),
+ CurrentMemoryContext);
+ fmgr_info_copy(&(state->extractQueryFn[i]),
+ index_getprocinfo(index, i + 1, GIN_EXTRACTQUERY_PROC),
+ CurrentMemoryContext);
+
+ /*
+ * Check opclass capability to do tri-state or binary logic consistent
+ * check.
+ */
+ if (index_getprocid(index, i + 1, GIN_TRICONSISTENT_PROC) != InvalidOid)
+ {
+ fmgr_info_copy(&(state->triConsistentFn[i]),
+ index_getprocinfo(index, i + 1, GIN_TRICONSISTENT_PROC),
+ CurrentMemoryContext);
+ }
+
+ if (index_getprocid(index, i + 1, GIN_CONSISTENT_PROC) != InvalidOid)
+ {
+ fmgr_info_copy(&(state->consistentFn[i]),
+ index_getprocinfo(index, i + 1, GIN_CONSISTENT_PROC),
+ CurrentMemoryContext);
+ }
+
+ if (state->consistentFn[i].fn_oid == InvalidOid &&
+ state->triConsistentFn[i].fn_oid == InvalidOid)
+ {
+ elog(ERROR, "missing GIN support function (%d or %d) for attribute %d of index \"%s\"",
+ GIN_CONSISTENT_PROC, GIN_TRICONSISTENT_PROC,
+ i + 1, RelationGetRelationName(index));
+ }
+
+ /*
+ * Check opclass capability to do partial match.
+ */
+ if (index_getprocid(index, i + 1, GIN_COMPARE_PARTIAL_PROC) != InvalidOid)
+ {
+ fmgr_info_copy(&(state->comparePartialFn[i]),
+ index_getprocinfo(index, i + 1, GIN_COMPARE_PARTIAL_PROC),
+ CurrentMemoryContext);
+ state->canPartialMatch[i] = true;
+ }
+ else
+ {
+ state->canPartialMatch[i] = false;
+ }
+
+ /*
+ * If the index column has a specified collation, we should honor that
+ * while doing comparisons. However, we may have a collatable storage
+ * type for a noncollatable indexed data type (for instance, hstore
+ * uses text index entries). If there's no index collation then
+ * specify default collation in case the support functions need
+ * collation. This is harmless if the support functions don't care
+ * about collation, so we just do it unconditionally. (We could
+ * alternatively call get_typcollation, but that seems like expensive
+ * overkill --- there aren't going to be any cases where a GIN storage
+ * type has a nondefault collation.)
+ */
+ if (OidIsValid(index->rd_indcollation[i]))
+ state->supportCollation[i] = index->rd_indcollation[i];
+ else
+ state->supportCollation[i] = DEFAULT_COLLATION_OID;
+ }
+}
+
+/*
+ * Extract attribute (column) number of stored entry from GIN tuple
+ */
+OffsetNumber
+gintuple_get_attrnum(GinState *ginstate, IndexTuple tuple)
+{
+ OffsetNumber colN;
+
+ if (ginstate->oneCol)
+ {
+ /* column number is not stored explicitly */
+ colN = FirstOffsetNumber;
+ }
+ else
+ {
+ Datum res;
+ bool isnull;
+
+ /*
+	 * The first attribute is always int16, so we can safely use any tuple
+	 * descriptor to obtain the first attribute of the tuple.
+ */
+ res = index_getattr(tuple, FirstOffsetNumber, ginstate->tupdesc[0],
+ &isnull);
+ Assert(!isnull);
+
+ colN = DatumGetUInt16(res);
+ Assert(colN >= FirstOffsetNumber && colN <= ginstate->origTupdesc->natts);
+ }
+
+ return colN;
+}
+
+/*
+ * Extract stored datum (and possible null category) from GIN tuple
+ */
+Datum
+gintuple_get_key(GinState *ginstate, IndexTuple tuple,
+ GinNullCategory *category)
+{
+ Datum res;
+ bool isnull;
+
+ if (ginstate->oneCol)
+ {
+ /*
+ * Single column index doesn't store attribute numbers in tuples
+ */
+ res = index_getattr(tuple, FirstOffsetNumber, ginstate->origTupdesc,
+ &isnull);
+ }
+ else
+ {
+ /*
+ * Since the datum type depends on which index column it's from, we
+ * must be careful to use the right tuple descriptor here.
+ */
+ OffsetNumber colN = gintuple_get_attrnum(ginstate, tuple);
+
+ res = index_getattr(tuple, OffsetNumberNext(FirstOffsetNumber),
+ ginstate->tupdesc[colN - 1],
+ &isnull);
+ }
+
+ if (isnull)
+ *category = GinGetNullCategory(tuple, ginstate);
+ else
+ *category = GIN_CAT_NORM_KEY;
+
+ return res;
+}
+
+/*
+ * Allocate a new page (either by recycling, or by extending the index file)
+ * The returned buffer is already pinned and exclusive-locked
+ * Caller is responsible for initializing the page by calling GinInitBuffer
+ */
+Buffer
+GinNewBuffer(Relation index)
+{
+ Buffer buffer;
+ bool needLock;
+
+ /* First, try to get a page from FSM */
+ for (;;)
+ {
+ BlockNumber blkno = GetFreeIndexPage(index);
+
+ if (blkno == InvalidBlockNumber)
+ break;
+
+ buffer = ReadBuffer(index, blkno);
+
+ /*
+ * We have to guard against the possibility that someone else already
+ * recycled this page; the buffer may be locked if so.
+ */
+ if (ConditionalLockBuffer(buffer))
+ {
+ if (GinPageIsRecyclable(BufferGetPage(buffer)))
+ return buffer; /* OK to use */
+
+ LockBuffer(buffer, GIN_UNLOCK);
+ }
+
+ /* Can't use it, so release buffer and try again */
+ ReleaseBuffer(buffer);
+ }
+
+ /* Must extend the file */
+ needLock = !RELATION_IS_LOCAL(index);
+ if (needLock)
+ LockRelationForExtension(index, ExclusiveLock);
+
+ buffer = ReadBuffer(index, P_NEW);
+ LockBuffer(buffer, GIN_EXCLUSIVE);
+
+ if (needLock)
+ UnlockRelationForExtension(index, ExclusiveLock);
+
+ return buffer;
+}
+
+void
+GinInitPage(Page page, uint32 f, Size pageSize)
+{
+ GinPageOpaque opaque;
+
+ PageInit(page, pageSize, sizeof(GinPageOpaqueData));
+
+ opaque = GinPageGetOpaque(page);
+ opaque->flags = f;
+ opaque->rightlink = InvalidBlockNumber;
+}
+
+void
+GinInitBuffer(Buffer b, uint32 f)
+{
+ GinInitPage(BufferGetPage(b), f, BufferGetPageSize(b));
+}
+
+void
+GinInitMetabuffer(Buffer b)
+{
+ GinMetaPageData *metadata;
+ Page page = BufferGetPage(b);
+
+ GinInitPage(page, GIN_META, BufferGetPageSize(b));
+
+ metadata = GinPageGetMeta(page);
+
+ metadata->head = metadata->tail = InvalidBlockNumber;
+ metadata->tailFreeSize = 0;
+ metadata->nPendingPages = 0;
+ metadata->nPendingHeapTuples = 0;
+ metadata->nTotalPages = 0;
+ metadata->nEntryPages = 0;
+ metadata->nDataPages = 0;
+ metadata->nEntries = 0;
+ metadata->ginVersion = GIN_CURRENT_VERSION;
+
+ /*
+ * Set pd_lower just past the end of the metadata. This is essential,
+ * because without doing so, metadata will be lost if xlog.c compresses
+ * the page.
+ */
+ ((PageHeader) page)->pd_lower =
+ ((char *) metadata + sizeof(GinMetaPageData)) - (char *) page;
+}
+
+/*
+ * Compare two keys of the same index column
+ */
+int
+ginCompareEntries(GinState *ginstate, OffsetNumber attnum,
+ Datum a, GinNullCategory categorya,
+ Datum b, GinNullCategory categoryb)
+{
+ /* if not of same null category, sort by that first */
+ if (categorya != categoryb)
+ return (categorya < categoryb) ? -1 : 1;
+
+ /* all null items in same category are equal */
+ if (categorya != GIN_CAT_NORM_KEY)
+ return 0;
+
+ /* both not null, so safe to call the compareFn */
+ return DatumGetInt32(FunctionCall2Coll(&ginstate->compareFn[attnum - 1],
+ ginstate->supportCollation[attnum - 1],
+ a, b));
+}
+
+/*
+ * Compare two keys of possibly different index columns
+ */
+int
+ginCompareAttEntries(GinState *ginstate,
+ OffsetNumber attnuma, Datum a, GinNullCategory categorya,
+ OffsetNumber attnumb, Datum b, GinNullCategory categoryb)
+{
+ /* attribute number is the first sort key */
+ if (attnuma != attnumb)
+ return (attnuma < attnumb) ? -1 : 1;
+
+ return ginCompareEntries(ginstate, attnuma, a, categorya, b, categoryb);
+}
+
+
+/*
+ * Support for sorting key datums in ginExtractEntries
+ *
+ * Note: we only have to worry about null and not-null keys here;
+ * ginExtractEntries never generates more than one placeholder null,
+ * so it doesn't have to sort those.
+ */
+typedef struct
+{
+ Datum datum;
+ bool isnull;
+} keyEntryData;
+
+typedef struct
+{
+ FmgrInfo *cmpDatumFunc;
+ Oid collation;
+ bool haveDups;
+} cmpEntriesArg;
+
+static int
+cmpEntries(const void *a, const void *b, void *arg)
+{
+ const keyEntryData *aa = (const keyEntryData *) a;
+ const keyEntryData *bb = (const keyEntryData *) b;
+ cmpEntriesArg *data = (cmpEntriesArg *) arg;
+ int res;
+
+ if (aa->isnull)
+ {
+ if (bb->isnull)
+ res = 0; /* NULL "=" NULL */
+ else
+ res = 1; /* NULL ">" not-NULL */
+ }
+ else if (bb->isnull)
+ res = -1; /* not-NULL "<" NULL */
+ else
+ res = DatumGetInt32(FunctionCall2Coll(data->cmpDatumFunc,
+ data->collation,
+ aa->datum, bb->datum));
+
+ /*
+ * Detect if we have any duplicates. If there are equal keys, qsort must
+ * compare them at some point, else it wouldn't know whether one should go
+ * before or after the other.
+ */
+ if (res == 0)
+ data->haveDups = true;
+
+ return res;
+}
+
+
+/*
+ * Extract the index key values from an indexable item
+ *
+ * The resulting key values are sorted, and any duplicates are removed.
+ * This avoids generating redundant index entries.
+ */
+Datum *
+ginExtractEntries(GinState *ginstate, OffsetNumber attnum,
+ Datum value, bool isNull,
+ int32 *nentries, GinNullCategory **categories)
+{
+ Datum *entries;
+ bool *nullFlags;
+ int32 i;
+
+ /*
+ * We don't call the extractValueFn on a null item. Instead generate a
+ * placeholder.
+ */
+ if (isNull)
+ {
+ *nentries = 1;
+ entries = (Datum *) palloc(sizeof(Datum));
+ entries[0] = (Datum) 0;
+ *categories = (GinNullCategory *) palloc(sizeof(GinNullCategory));
+ (*categories)[0] = GIN_CAT_NULL_ITEM;
+ return entries;
+ }
+
+ /* OK, call the opclass's extractValueFn */
+ nullFlags = NULL; /* in case extractValue doesn't set it */
+ entries = (Datum *)
+ DatumGetPointer(FunctionCall3Coll(&ginstate->extractValueFn[attnum - 1],
+ ginstate->supportCollation[attnum - 1],
+ value,
+ PointerGetDatum(nentries),
+ PointerGetDatum(&nullFlags)));
+
+ /*
+ * Generate a placeholder if the item contained no keys.
+ */
+ if (entries == NULL || *nentries <= 0)
+ {
+ *nentries = 1;
+ entries = (Datum *) palloc(sizeof(Datum));
+ entries[0] = (Datum) 0;
+ *categories = (GinNullCategory *) palloc(sizeof(GinNullCategory));
+ (*categories)[0] = GIN_CAT_EMPTY_ITEM;
+ return entries;
+ }
+
+ /*
+ * If the extractValueFn didn't create a nullFlags array, create one,
+ * assuming that everything's non-null.
+ */
+ if (nullFlags == NULL)
+ nullFlags = (bool *) palloc0(*nentries * sizeof(bool));
+
+ /*
+ * If there's more than one key, sort and unique-ify.
+ *
+ * XXX Using qsort here is notationally painful, and the overhead is
+ * pretty bad too. For small numbers of keys it'd likely be better to use
+ * a simple insertion sort.
+ */
+ if (*nentries > 1)
+ {
+ keyEntryData *keydata;
+ cmpEntriesArg arg;
+
+ keydata = (keyEntryData *) palloc(*nentries * sizeof(keyEntryData));
+ for (i = 0; i < *nentries; i++)
+ {
+ keydata[i].datum = entries[i];
+ keydata[i].isnull = nullFlags[i];
+ }
+
+ arg.cmpDatumFunc = &ginstate->compareFn[attnum - 1];
+ arg.collation = ginstate->supportCollation[attnum - 1];
+ arg.haveDups = false;
+ qsort_arg(keydata, *nentries, sizeof(keyEntryData),
+ cmpEntries, (void *) &arg);
+
+ if (arg.haveDups)
+ {
+ /* there are duplicates, must get rid of 'em */
+ int32 j;
+
+ entries[0] = keydata[0].datum;
+ nullFlags[0] = keydata[0].isnull;
+ j = 1;
+ for (i = 1; i < *nentries; i++)
+ {
+ if (cmpEntries(&keydata[i - 1], &keydata[i], &arg) != 0)
+ {
+ entries[j] = keydata[i].datum;
+ nullFlags[j] = keydata[i].isnull;
+ j++;
+ }
+ }
+ *nentries = j;
+ }
+ else
+ {
+ /* easy, no duplicates */
+ for (i = 0; i < *nentries; i++)
+ {
+ entries[i] = keydata[i].datum;
+ nullFlags[i] = keydata[i].isnull;
+ }
+ }
+
+ pfree(keydata);
+ }
+
+ /*
+ * Create GinNullCategory representation from nullFlags.
+ */
+ *categories = (GinNullCategory *) palloc0(*nentries * sizeof(GinNullCategory));
+ for (i = 0; i < *nentries; i++)
+ (*categories)[i] = (nullFlags[i] ? GIN_CAT_NULL_KEY : GIN_CAT_NORM_KEY);
+
+ return entries;
+}
+
+bytea *
+ginoptions(Datum reloptions, bool validate)
+{
+ static const relopt_parse_elt tab[] = {
+ {"fastupdate", RELOPT_TYPE_BOOL, offsetof(GinOptions, useFastUpdate)},
+ {"gin_pending_list_limit", RELOPT_TYPE_INT, offsetof(GinOptions,
+ pendingListCleanupSize)}
+ };
+
+ return (bytea *) build_reloptions(reloptions, validate,
+ RELOPT_KIND_GIN,
+ sizeof(GinOptions),
+ tab, lengthof(tab));
+}
+
+/*
+ * Fetch index's statistical data into *stats
+ *
+ * Note: in the result, nPendingPages can be trusted to be up-to-date,
+ * as can ginVersion; but the other fields are as of the last VACUUM.
+ */
+void
+ginGetStats(Relation index, GinStatsData *stats)
+{
+ Buffer metabuffer;
+ Page metapage;
+ GinMetaPageData *metadata;
+
+ metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO);
+ LockBuffer(metabuffer, GIN_SHARE);
+ metapage = BufferGetPage(metabuffer);
+ metadata = GinPageGetMeta(metapage);
+
+ stats->nPendingPages = metadata->nPendingPages;
+ stats->nTotalPages = metadata->nTotalPages;
+ stats->nEntryPages = metadata->nEntryPages;
+ stats->nDataPages = metadata->nDataPages;
+ stats->nEntries = metadata->nEntries;
+ stats->ginVersion = metadata->ginVersion;
+
+ UnlockReleaseBuffer(metabuffer);
+}
+
+/*
+ * Write the given statistics to the index's metapage
+ *
+ * Note: nPendingPages and ginVersion are *not* copied over
+ */
+void
+ginUpdateStats(Relation index, const GinStatsData *stats, bool is_build)
+{
+ Buffer metabuffer;
+ Page metapage;
+ GinMetaPageData *metadata;
+
+ metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO);
+ LockBuffer(metabuffer, GIN_EXCLUSIVE);
+ metapage = BufferGetPage(metabuffer);
+ metadata = GinPageGetMeta(metapage);
+
+ START_CRIT_SECTION();
+
+ metadata->nTotalPages = stats->nTotalPages;
+ metadata->nEntryPages = stats->nEntryPages;
+ metadata->nDataPages = stats->nDataPages;
+ metadata->nEntries = stats->nEntries;
+
+ /*
+ * Set pd_lower just past the end of the metadata. This is essential,
+ * because without doing so, metadata will be lost if xlog.c compresses
+ * the page. (We must do this here because pre-v11 versions of PG did not
+ * set the metapage's pd_lower correctly, so a pg_upgraded index might
+ * contain the wrong value.)
+ */
+ ((PageHeader) metapage)->pd_lower =
+ ((char *) metadata + sizeof(GinMetaPageData)) - (char *) metapage;
+
+ MarkBufferDirty(metabuffer);
+
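+	/*
+	 * During index build no WAL record is emitted here; the build path is
+	 * responsible for WAL-logging (or syncing) the finished index as a whole.
+	 */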
+ if (RelationNeedsWAL(index) && !is_build)
+ {
+ XLogRecPtr recptr;
+ ginxlogUpdateMeta data;
+
+ data.node = index->rd_node;
+ data.ntuples = 0;
+ data.newRightlink = data.prevTail = InvalidBlockNumber;
+ memcpy(&data.metadata, metadata, sizeof(GinMetaPageData));
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &data, sizeof(ginxlogUpdateMeta));
+ XLogRegisterBuffer(0, metabuffer, REGBUF_WILL_INIT | REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_UPDATE_META_PAGE);
+ PageSetLSN(metapage, recptr);
+ }
+
+ UnlockReleaseBuffer(metabuffer);
+
+ END_CRIT_SECTION();
+}
diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c
new file mode 100644
index 0000000..a276eb0
--- /dev/null
+++ b/src/backend/access/gin/ginvacuum.c
@@ -0,0 +1,822 @@
+/*-------------------------------------------------------------------------
+ *
+ * ginvacuum.c
+ * delete & vacuum routines for the postgres GIN
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gin/ginvacuum.c
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/gin_private.h"
+#include "access/ginxlog.h"
+#include "access/xloginsert.h"
+#include "commands/vacuum.h"
+#include "miscadmin.h"
+#include "postmaster/autovacuum.h"
+#include "storage/indexfsm.h"
+#include "storage/lmgr.h"
+#include "storage/predicate.h"
+#include "utils/memutils.h"
+
+struct GinVacuumState
+{
+ Relation index;
+ IndexBulkDeleteResult *result;
+ IndexBulkDeleteCallback callback;
+ void *callback_state;
+ GinState ginstate;
+ BufferAccessStrategy strategy;
+ MemoryContext tmpCxt;
+};
+
+/*
+ * Vacuums an uncompressed posting list. The size of the list must be
+ * specified in number of items (nitems).
+ *
+ * If none of the items need to be removed, returns NULL. Otherwise returns
+ * a new palloc'd array with the remaining items. The number of remaining
+ * items is returned in *nremaining.
+ */
+ItemPointer
+ginVacuumItemPointers(GinVacuumState *gvs, ItemPointerData *items,
+ int nitem, int *nremaining)
+{
+ int i,
+ remaining = 0;
+ ItemPointer tmpitems = NULL;
+
+ /*
+ * Iterate over TIDs array
+ */
+ for (i = 0; i < nitem; i++)
+ {
+ if (gvs->callback(items + i, gvs->callback_state))
+ {
+ gvs->result->tuples_removed += 1;
+ if (!tmpitems)
+ {
+ /*
+ * First TID to be deleted: allocate memory to hold the
+ * remaining items.
+ */
+ tmpitems = palloc(sizeof(ItemPointerData) * nitem);
+ memcpy(tmpitems, items, sizeof(ItemPointerData) * i);
+ }
+ }
+ else
+ {
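+			/* This TID stays; copy it only if we've already started removing some */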
+ gvs->result->num_index_tuples += 1;
+ if (tmpitems)
+ tmpitems[remaining] = items[i];
+ remaining++;
+ }
+ }
+
+ *nremaining = remaining;
+ return tmpitems;
+}
+
+/*
+ * Create a WAL record for vacuuming entry tree leaf page.
+ */
+static void
+xlogVacuumPage(Relation index, Buffer buffer)
+{
+ Page page = BufferGetPage(buffer);
+ XLogRecPtr recptr;
+
+ /* This is only used for entry tree leaf pages. */
+ Assert(!GinPageIsData(page));
+ Assert(GinPageIsLeaf(page));
+
+ if (!RelationNeedsWAL(index))
+ return;
+
+ /*
+	 * Always create a full image; we don't track changes to the page at any
+	 * finer-grained level. This could obviously be improved...
+ */
+ XLogBeginInsert();
+ XLogRegisterBuffer(0, buffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_PAGE);
+ PageSetLSN(page, recptr);
+}
+
+
+typedef struct DataPageDeleteStack
+{
+ struct DataPageDeleteStack *child;
+ struct DataPageDeleteStack *parent;
+
+ BlockNumber blkno; /* current block number */
+	Buffer		leftBuffer;		/* pinned and locked rightmost non-deleted
+								 * page to the left */
+ bool isRoot;
+} DataPageDeleteStack;
+
+
+/*
+ * Delete a posting tree page.
+ */
+static void
+ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkno,
+ BlockNumber parentBlkno, OffsetNumber myoff, bool isParentRoot)
+{
+ Buffer dBuffer;
+ Buffer lBuffer;
+ Buffer pBuffer;
+ Page page,
+ parentPage;
+ BlockNumber rightlink;
+
+ /*
+	 * This function MUST be called only if one of the parent pages holds an
+	 * exclusive cleanup lock. That guarantees that no insertions are
+	 * currently happening in this subtree. The caller also holds exclusive
+	 * locks on the deletable, parent and left pages.
+ */
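+	/* The caller already holds the locks, so we only acquire pins here. */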
+ lBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, leftBlkno,
+ RBM_NORMAL, gvs->strategy);
+ dBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, deleteBlkno,
+ RBM_NORMAL, gvs->strategy);
+ pBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, parentBlkno,
+ RBM_NORMAL, gvs->strategy);
+
+ page = BufferGetPage(dBuffer);
+ rightlink = GinPageGetOpaque(page)->rightlink;
+
+ /*
+ * Any insert which would have gone on the leaf block will now go to its
+ * right sibling.
+ */
+ PredicateLockPageCombine(gvs->index, deleteBlkno, rightlink);
+
+ START_CRIT_SECTION();
+
+ /* Unlink the page by changing left sibling's rightlink */
+ page = BufferGetPage(lBuffer);
+ GinPageGetOpaque(page)->rightlink = rightlink;
+
+ /* Delete downlink from parent */
+ parentPage = BufferGetPage(pBuffer);
+#ifdef USE_ASSERT_CHECKING
+ do
+ {
+ PostingItem *tod = GinDataPageGetPostingItem(parentPage, myoff);
+
+ Assert(PostingItemGetBlockNumber(tod) == deleteBlkno);
+ } while (0);
+#endif
+ GinPageDeletePostingItem(parentPage, myoff);
+
+ page = BufferGetPage(dBuffer);
+
+ /*
+	 * We mustn't change the deleted page's rightlink, so that any search
+	 * scan currently paused on this page can still step right.
+ */
+
+ /*
+ * Mark page as deleted, and remember last xid which could know its
+ * address.
+ */
+ GinPageSetDeleted(page);
+ GinPageSetDeleteXid(page, ReadNextTransactionId());
+
+ MarkBufferDirty(pBuffer);
+ MarkBufferDirty(lBuffer);
+ MarkBufferDirty(dBuffer);
+
+ if (RelationNeedsWAL(gvs->index))
+ {
+ XLogRecPtr recptr;
+ ginxlogDeletePage data;
+
+ /*
+ * We can't pass REGBUF_STANDARD for the deleted page, because we
+ * didn't set pd_lower on pre-9.4 versions. The page might've been
+ * binary-upgraded from an older version, and hence not have pd_lower
+ * set correctly. Ditto for the left page, but removing the item from
+ * the parent updated its pd_lower, so we know that's OK at this
+ * point.
+ */
+ XLogBeginInsert();
+ XLogRegisterBuffer(0, dBuffer, 0);
+ XLogRegisterBuffer(1, pBuffer, REGBUF_STANDARD);
+ XLogRegisterBuffer(2, lBuffer, 0);
+
+ data.parentOffset = myoff;
+ data.rightLink = GinPageGetOpaque(page)->rightlink;
+ data.deleteXid = GinPageGetDeleteXid(page);
+
+ XLogRegisterData((char *) &data, sizeof(ginxlogDeletePage));
+
+ recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_PAGE);
+ PageSetLSN(page, recptr);
+ PageSetLSN(parentPage, recptr);
+ PageSetLSN(BufferGetPage(lBuffer), recptr);
+ }
+
+ ReleaseBuffer(pBuffer);
+ ReleaseBuffer(lBuffer);
+ ReleaseBuffer(dBuffer);
+
+ END_CRIT_SECTION();
+
+ gvs->result->pages_newly_deleted++;
+ gvs->result->pages_deleted++;
+}
+
+
+/*
+ * Scans the posting tree and deletes empty pages. The caller must hold a
+ * cleanup lock on the root page. During the scan, the path from the root to
+ * the current page is kept exclusively locked. The left sibling is also kept
+ * exclusively locked, because ginDeletePage() needs it; relocking it later
+ * could deadlock with ginStepRight().
+ */
+static bool
+ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
+ DataPageDeleteStack *parent, OffsetNumber myoff)
+{
+ DataPageDeleteStack *me;
+ Buffer buffer;
+ Page page;
+ bool meDelete = false;
+ bool isempty;
+
+ if (isRoot)
+ {
+ me = parent;
+ }
+ else
+ {
+ if (!parent->child)
+ {
+ me = (DataPageDeleteStack *) palloc0(sizeof(DataPageDeleteStack));
+ me->parent = parent;
+ parent->child = me;
+ me->leftBuffer = InvalidBuffer;
+ }
+ else
+ me = parent->child;
+ }
+
+ buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
+ RBM_NORMAL, gvs->strategy);
+
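+	/* The root is already locked for cleanup by ginVacuumPostingTree() */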
+ if (!isRoot)
+ LockBuffer(buffer, GIN_EXCLUSIVE);
+
+ page = BufferGetPage(buffer);
+
+ Assert(GinPageIsData(page));
+
+ if (!GinPageIsLeaf(page))
+ {
+ OffsetNumber i;
+
+ me->blkno = blkno;
+ for (i = FirstOffsetNumber; i <= GinPageGetOpaque(page)->maxoff; i++)
+ {
+ PostingItem *pitem = GinDataPageGetPostingItem(page, i);
+
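+			/*
+			 * If the child page was deleted, its downlink was removed from
+			 * this page, shifting the remaining entries down; revisit the
+			 * same offset.
+			 */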
+ if (ginScanToDelete(gvs, PostingItemGetBlockNumber(pitem), false, me, i))
+ i--;
+ }
+
+ if (GinPageRightMost(page) && BufferIsValid(me->child->leftBuffer))
+ {
+ UnlockReleaseBuffer(me->child->leftBuffer);
+ me->child->leftBuffer = InvalidBuffer;
+ }
+ }
+
+ if (GinPageIsLeaf(page))
+ isempty = GinDataLeafPageIsEmpty(page);
+ else
+ isempty = GinPageGetOpaque(page)->maxoff < FirstOffsetNumber;
+
+ if (isempty)
+ {
+ /* we never delete the left- or rightmost branch */
+ if (BufferIsValid(me->leftBuffer) && !GinPageRightMost(page))
+ {
+ Assert(!isRoot);
+ ginDeletePage(gvs, blkno, BufferGetBlockNumber(me->leftBuffer),
+ me->parent->blkno, myoff, me->parent->isRoot);
+ meDelete = true;
+ }
+ }
+
+ if (!meDelete)
+ {
+ if (BufferIsValid(me->leftBuffer))
+ UnlockReleaseBuffer(me->leftBuffer);
+ me->leftBuffer = buffer;
+ }
+ else
+ {
+ if (!isRoot)
+ LockBuffer(buffer, GIN_UNLOCK);
+
+ ReleaseBuffer(buffer);
+ }
+
+ if (isRoot)
+ ReleaseBuffer(buffer);
+
+ return meDelete;
+}
+
+
+/*
+ * Scan through the posting tree leaves, removing dead item pointers. Returns
+ * true if at least one leaf page is left empty.
+ */
+static bool
+ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno)
+{
+ Buffer buffer;
+ Page page;
+ bool hasVoidPage = false;
+ MemoryContext oldCxt;
+
+ /* Find leftmost leaf page of posting tree and lock it in exclusive mode */
+ while (true)
+ {
+ PostingItem *pitem;
+
+ buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
+ RBM_NORMAL, gvs->strategy);
+ LockBuffer(buffer, GIN_SHARE);
+ page = BufferGetPage(buffer);
+
+ Assert(GinPageIsData(page));
+
+ if (GinPageIsLeaf(page))
+ {
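+			/* Trade the share lock for an exclusive lock on the leftmost leaf */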
+ LockBuffer(buffer, GIN_UNLOCK);
+ LockBuffer(buffer, GIN_EXCLUSIVE);
+ break;
+ }
+
+ Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber);
+
+ pitem = GinDataPageGetPostingItem(page, FirstOffsetNumber);
+ blkno = PostingItemGetBlockNumber(pitem);
+ Assert(blkno != InvalidBlockNumber);
+
+ UnlockReleaseBuffer(buffer);
+ }
+
+ /* Iterate all posting tree leaves using rightlinks and vacuum them */
+ while (true)
+ {
+ oldCxt = MemoryContextSwitchTo(gvs->tmpCxt);
+ ginVacuumPostingTreeLeaf(gvs->index, buffer, gvs);
+ MemoryContextSwitchTo(oldCxt);
+ MemoryContextReset(gvs->tmpCxt);
+
+ if (GinDataLeafPageIsEmpty(page))
+ hasVoidPage = true;
+
+ blkno = GinPageGetOpaque(page)->rightlink;
+
+ UnlockReleaseBuffer(buffer);
+
+ if (blkno == InvalidBlockNumber)
+ break;
+
+ buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
+ RBM_NORMAL, gvs->strategy);
+ LockBuffer(buffer, GIN_EXCLUSIVE);
+ page = BufferGetPage(buffer);
+ }
+
+ return hasVoidPage;
+}
+
+static void
+ginVacuumPostingTree(GinVacuumState *gvs, BlockNumber rootBlkno)
+{
+ if (ginVacuumPostingTreeLeaves(gvs, rootBlkno))
+ {
+ /*
+ * There is at least one empty page. So we have to rescan the tree
+ * deleting empty pages.
+ */
+ Buffer buffer;
+ DataPageDeleteStack root,
+ *ptr,
+ *tmp;
+
+ buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, rootBlkno,
+ RBM_NORMAL, gvs->strategy);
+
+ /*
+ * Lock posting tree root for cleanup to ensure there are no
+ * concurrent inserts.
+ */
+ LockBufferForCleanup(buffer);
+
+ memset(&root, 0, sizeof(DataPageDeleteStack));
+ root.leftBuffer = InvalidBuffer;
+ root.isRoot = true;
+
+ ginScanToDelete(gvs, rootBlkno, true, &root, InvalidOffsetNumber);
+
+ ptr = root.child;
+
+ while (ptr)
+ {
+ tmp = ptr->child;
+ pfree(ptr);
+ ptr = tmp;
+ }
+
+ UnlockReleaseBuffer(buffer);
+ }
+}
+
+/*
+ * Returns the modified page, or NULL if the page wasn't modified.
+ * The function works on the original page until the first change is needed;
+ * at that point the page is copied into a temporary one and modified there.
+ */
+static Page
+ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint32 *nroot)
+{
+ Page origpage = BufferGetPage(buffer),
+ tmppage;
+ OffsetNumber i,
+ maxoff = PageGetMaxOffsetNumber(origpage);
+
+ tmppage = origpage;
+
+ *nroot = 0;
+
+ for (i = FirstOffsetNumber; i <= maxoff; i++)
+ {
+ IndexTuple itup = (IndexTuple) PageGetItem(tmppage, PageGetItemId(tmppage, i));
+
+ if (GinIsPostingTree(itup))
+ {
+ /*
+			 * Store this posting tree's root for further processing; we
+			 * can't vacuum it right now, due to the risk of deadlocks with
+			 * scans/inserts.
+ */
+ roots[*nroot] = GinGetDownlink(itup);
+ (*nroot)++;
+ }
+ else if (GinGetNPosting(itup) > 0)
+ {
+ int nitems;
+ ItemPointer items_orig;
+ bool free_items_orig;
+ ItemPointer items;
+
+ /* Get list of item pointers from the tuple. */
+ if (GinItupIsCompressed(itup))
+ {
+ items_orig = ginPostingListDecode((GinPostingList *) GinGetPosting(itup), &nitems);
+ free_items_orig = true;
+ }
+ else
+ {
+ items_orig = (ItemPointer) GinGetPosting(itup);
+ nitems = GinGetNPosting(itup);
+ free_items_orig = false;
+ }
+
+ /* Remove any items from the list that need to be vacuumed. */
+ items = ginVacuumItemPointers(gvs, items_orig, nitems, &nitems);
+
+ if (free_items_orig)
+ pfree(items_orig);
+
+ /* If any item pointers were removed, recreate the tuple. */
+ if (items)
+ {
+ OffsetNumber attnum;
+ Datum key;
+ GinNullCategory category;
+ GinPostingList *plist;
+ int plistsize;
+
+ if (nitems > 0)
+ {
+ plist = ginCompressPostingList(items, nitems, GinMaxItemSize, NULL);
+ plistsize = SizeOfGinPostingList(plist);
+ }
+ else
+ {
+ plist = NULL;
+ plistsize = 0;
+ }
+
+				if (tmppage == origpage)
+				{
+					/*
+					 * On the first change, switch to working on a temporary
+					 * copy of the page; all further modifications are made
+					 * to that copy.
+					 */
+ tmppage = PageGetTempPageCopy(origpage);
+
+ /* set itup pointer to new page */
+ itup = (IndexTuple) PageGetItem(tmppage, PageGetItemId(tmppage, i));
+ }
+
+ attnum = gintuple_get_attrnum(&gvs->ginstate, itup);
+ key = gintuple_get_key(&gvs->ginstate, itup, &category);
+ itup = GinFormTuple(&gvs->ginstate, attnum, key, category,
+ (char *) plist, plistsize,
+ nitems, true);
+ if (plist)
+ pfree(plist);
+ PageIndexTupleDelete(tmppage, i);
+
+ if (PageAddItem(tmppage, (Item) itup, IndexTupleSize(itup), i, false, false) != i)
+ elog(ERROR, "failed to add item to index page in \"%s\"",
+ RelationGetRelationName(gvs->index));
+
+ pfree(itup);
+ pfree(items);
+ }
+ }
+ }
+
+ return (tmppage == origpage) ? NULL : tmppage;
+}
+
+IndexBulkDeleteResult *
+ginbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+ IndexBulkDeleteCallback callback, void *callback_state)
+{
+ Relation index = info->index;
+ BlockNumber blkno = GIN_ROOT_BLKNO;
+ GinVacuumState gvs;
+ Buffer buffer;
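+	/*
+	 * Enough room for one posting-tree root per index tuple that could fit
+	 * on an entry page (an upper bound on what one page can reference).
+	 */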
+ BlockNumber rootOfPostingTree[BLCKSZ / (sizeof(IndexTupleData) + sizeof(ItemId))];
+ uint32 nRoot;
+
+ gvs.tmpCxt = AllocSetContextCreate(CurrentMemoryContext,
+ "Gin vacuum temporary context",
+ ALLOCSET_DEFAULT_SIZES);
+ gvs.index = index;
+ gvs.callback = callback;
+ gvs.callback_state = callback_state;
+ gvs.strategy = info->strategy;
+ initGinState(&gvs.ginstate, index);
+
+ /* first time through? */
+ if (stats == NULL)
+ {
+ /* Yes, so initialize stats to zeroes */
+ stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+
+ /*
+		 * and clean up any pending inserts
+ */
+ ginInsertCleanup(&gvs.ginstate, !IsAutoVacuumWorkerProcess(),
+ false, true, stats);
+ }
+
+ /* we'll re-count the tuples each time */
+ stats->num_index_tuples = 0;
+ gvs.result = stats;
+
+ buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
+ RBM_NORMAL, info->strategy);
+
+ /* find leaf page */
+ for (;;)
+ {
+ Page page = BufferGetPage(buffer);
+ IndexTuple itup;
+
+ LockBuffer(buffer, GIN_SHARE);
+
+ Assert(!GinPageIsData(page));
+
+ if (GinPageIsLeaf(page))
+ {
+ LockBuffer(buffer, GIN_UNLOCK);
+ LockBuffer(buffer, GIN_EXCLUSIVE);
+
+ if (blkno == GIN_ROOT_BLKNO && !GinPageIsLeaf(page))
+ {
+ LockBuffer(buffer, GIN_UNLOCK);
+				continue;		/* check it once more */
+ }
+ break;
+ }
+
+ Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber);
+
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, FirstOffsetNumber));
+ blkno = GinGetDownlink(itup);
+ Assert(blkno != InvalidBlockNumber);
+
+ UnlockReleaseBuffer(buffer);
+ buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
+ RBM_NORMAL, info->strategy);
+ }
+
+	/* We have now found the leftmost leaf page in the entry B-tree */
+
+ for (;;)
+ {
+ Page page = BufferGetPage(buffer);
+ Page resPage;
+ uint32 i;
+
+ Assert(!GinPageIsData(page));
+
+ resPage = ginVacuumEntryPage(&gvs, buffer, rootOfPostingTree, &nRoot);
+
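+		/* Remember the right sibling while we still hold the page locked */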
+ blkno = GinPageGetOpaque(page)->rightlink;
+
+ if (resPage)
+ {
+ START_CRIT_SECTION();
+ PageRestoreTempPage(resPage, page);
+ MarkBufferDirty(buffer);
+ xlogVacuumPage(gvs.index, buffer);
+ UnlockReleaseBuffer(buffer);
+ END_CRIT_SECTION();
+ }
+ else
+ {
+ UnlockReleaseBuffer(buffer);
+ }
+
+ vacuum_delay_point();
+
+ for (i = 0; i < nRoot; i++)
+ {
+ ginVacuumPostingTree(&gvs, rootOfPostingTree[i]);
+ vacuum_delay_point();
+ }
+
+ if (blkno == InvalidBlockNumber) /* rightmost page */
+ break;
+
+ buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
+ RBM_NORMAL, info->strategy);
+ LockBuffer(buffer, GIN_EXCLUSIVE);
+ }
+
+ MemoryContextDelete(gvs.tmpCxt);
+
+ return gvs.result;
+}
+
+IndexBulkDeleteResult *
+ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
+{
+ Relation index = info->index;
+ bool needLock;
+ BlockNumber npages,
+ blkno;
+ BlockNumber totFreePages;
+ GinState ginstate;
+ GinStatsData idxStat;
+
+ /*
+ * In an autovacuum analyze, we want to clean up pending insertions.
+ * Otherwise, an ANALYZE-only call is a no-op.
+ */
+ if (info->analyze_only)
+ {
+ if (IsAutoVacuumWorkerProcess())
+ {
+ initGinState(&ginstate, index);
+ ginInsertCleanup(&ginstate, false, true, true, stats);
+ }
+ return stats;
+ }
+
+ /*
+	 * Set up all-zero stats and clean up pending inserts if ginbulkdelete
+	 * wasn't called
+ */
+ if (stats == NULL)
+ {
+ stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+ initGinState(&ginstate, index);
+ ginInsertCleanup(&ginstate, !IsAutoVacuumWorkerProcess(),
+ false, true, stats);
+ }
+
+ memset(&idxStat, 0, sizeof(idxStat));
+
+ /*
+ * XXX we always report the heap tuple count as the number of index
+ * entries. This is bogus if the index is partial, but it's real hard to
+ * tell how many distinct heap entries are referenced by a GIN index.
+ */
+ stats->num_index_tuples = Max(info->num_heap_tuples, 0);
+ stats->estimated_count = info->estimated_count;
+
+ /*
+ * Need lock unless it's local to this backend.
+ */
+ needLock = !RELATION_IS_LOCAL(index);
+
+ if (needLock)
+ LockRelationForExtension(index, ExclusiveLock);
+ npages = RelationGetNumberOfBlocks(index);
+ if (needLock)
+ UnlockRelationForExtension(index, ExclusiveLock);
+
+ totFreePages = 0;
+
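+	/*
+	 * Scan every block of the index, tallying page types for the stats and
+	 * reporting recyclable pages to the free space map.
+	 */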
+ for (blkno = GIN_ROOT_BLKNO; blkno < npages; blkno++)
+ {
+ Buffer buffer;
+ Page page;
+
+ vacuum_delay_point();
+
+ buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
+ RBM_NORMAL, info->strategy);
+ LockBuffer(buffer, GIN_SHARE);
+ page = (Page) BufferGetPage(buffer);
+
+ if (GinPageIsRecyclable(page))
+ {
+ Assert(blkno != GIN_ROOT_BLKNO);
+ RecordFreeIndexPage(index, blkno);
+ totFreePages++;
+ }
+ else if (GinPageIsData(page))
+ {
+ idxStat.nDataPages++;
+ }
+ else if (!GinPageIsList(page))
+ {
+ idxStat.nEntryPages++;
+
+ if (GinPageIsLeaf(page))
+ idxStat.nEntries += PageGetMaxOffsetNumber(page);
+ }
+
+ UnlockReleaseBuffer(buffer);
+ }
+
+ /* Update the metapage with accurate page and entry counts */
+ idxStat.nTotalPages = npages;
+ ginUpdateStats(info->index, &idxStat, false);
+
+ /* Finally, vacuum the FSM */
+ IndexFreeSpaceMapVacuum(info->index);
+
+ stats->pages_free = totFreePages;
+
+ if (needLock)
+ LockRelationForExtension(index, ExclusiveLock);
+ stats->num_pages = RelationGetNumberOfBlocks(index);
+ if (needLock)
+ UnlockRelationForExtension(index, ExclusiveLock);
+
+ return stats;
+}
+
+/*
+ * Return whether Page can safely be recycled.
+ */
+bool
+GinPageIsRecyclable(Page page)
+{
+ TransactionId delete_xid;
+
+ if (PageIsNew(page))
+ return true;
+
+ if (!GinPageIsDeleted(page))
+ return false;
+
+ delete_xid = GinPageGetDeleteXid(page);
+
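+	/*
+	 * A deleted page with no recorded delete XID (presumably deleted by an
+	 * older PostgreSQL version) is assumed to be safe to recycle.
+	 */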
+ if (!TransactionIdIsValid(delete_xid))
+ return true;
+
+ /*
+	 * If no backend can still see delete_xid as running, all scans that
+	 * were concurrent with ginDeletePage() must have finished.
+ */
+ return GlobalVisCheckRemovableXid(NULL, delete_xid);
+}
diff --git a/src/backend/access/gin/ginvalidate.c b/src/backend/access/gin/ginvalidate.c
new file mode 100644
index 0000000..d2510da
--- /dev/null
+++ b/src/backend/access/gin/ginvalidate.c
@@ -0,0 +1,338 @@
+/*-------------------------------------------------------------------------
+ *
+ * ginvalidate.c
+ * Opclass validator for GIN.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gin/ginvalidate.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/amvalidate.h"
+#include "access/gin_private.h"
+#include "access/htup_details.h"
+#include "catalog/pg_amop.h"
+#include "catalog/pg_amproc.h"
+#include "catalog/pg_opclass.h"
+#include "catalog/pg_opfamily.h"
+#include "catalog/pg_type.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+#include "utils/regproc.h"
+#include "utils/syscache.h"
+
+/*
+ * Validator for a GIN opclass.
+ */
+bool
+ginvalidate(Oid opclassoid)
+{
+ bool result = true;
+ HeapTuple classtup;
+ Form_pg_opclass classform;
+ Oid opfamilyoid;
+ Oid opcintype;
+ Oid opckeytype;
+ char *opclassname;
+ HeapTuple familytup;
+ Form_pg_opfamily familyform;
+ char *opfamilyname;
+ CatCList *proclist,
+ *oprlist;
+ List *grouplist;
+ OpFamilyOpFuncGroup *opclassgroup;
+ int i;
+ ListCell *lc;
+
+ /* Fetch opclass information */
+ classtup = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclassoid));
+ if (!HeapTupleIsValid(classtup))
+ elog(ERROR, "cache lookup failed for operator class %u", opclassoid);
+ classform = (Form_pg_opclass) GETSTRUCT(classtup);
+
+ opfamilyoid = classform->opcfamily;
+ opcintype = classform->opcintype;
+ opckeytype = classform->opckeytype;
+ if (!OidIsValid(opckeytype))
+ opckeytype = opcintype;
+ opclassname = NameStr(classform->opcname);
+
+ /* Fetch opfamily information */
+ familytup = SearchSysCache1(OPFAMILYOID, ObjectIdGetDatum(opfamilyoid));
+ if (!HeapTupleIsValid(familytup))
+ elog(ERROR, "cache lookup failed for operator family %u", opfamilyoid);
+ familyform = (Form_pg_opfamily) GETSTRUCT(familytup);
+
+ opfamilyname = NameStr(familyform->opfname);
+
+ /* Fetch all operators and support functions of the opfamily */
+ oprlist = SearchSysCacheList1(AMOPSTRATEGY, ObjectIdGetDatum(opfamilyoid));
+ proclist = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(opfamilyoid));
+
+ /* Check individual support functions */
+ for (i = 0; i < proclist->n_members; i++)
+ {
+ HeapTuple proctup = &proclist->members[i]->tuple;
+ Form_pg_amproc procform = (Form_pg_amproc) GETSTRUCT(proctup);
+ bool ok;
+
+ /*
+ * All GIN support functions should be registered with matching
+ * left/right types
+ */
+ if (procform->amproclefttype != procform->amprocrighttype)
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains support function %s with different left and right input types",
+ opfamilyname, "gin",
+ format_procedure(procform->amproc))));
+ result = false;
+ }
+
+ /*
+ * We can't check signatures except within the specific opclass, since
+ * we need to know the associated opckeytype in many cases.
+ */
+ if (procform->amproclefttype != opcintype)
+ continue;
+
+ /* Check procedure numbers and function signatures */
+ switch (procform->amprocnum)
+ {
+ case GIN_COMPARE_PROC:
+ ok = check_amproc_signature(procform->amproc, INT4OID, false,
+ 2, 2, opckeytype, opckeytype);
+ break;
+ case GIN_EXTRACTVALUE_PROC:
+ /* Some opclasses omit nullFlags */
+ ok = check_amproc_signature(procform->amproc, INTERNALOID, false,
+ 2, 3, opcintype, INTERNALOID,
+ INTERNALOID);
+ break;
+ case GIN_EXTRACTQUERY_PROC:
+ /* Some opclasses omit nullFlags and searchMode */
+ ok = check_amproc_signature(procform->amproc, INTERNALOID, false,
+ 5, 7, opcintype, INTERNALOID,
+ INT2OID, INTERNALOID, INTERNALOID,
+ INTERNALOID, INTERNALOID);
+ break;
+ case GIN_CONSISTENT_PROC:
+ /* Some opclasses omit queryKeys and nullFlags */
+ ok = check_amproc_signature(procform->amproc, BOOLOID, false,
+ 6, 8, INTERNALOID, INT2OID,
+ opcintype, INT4OID,
+ INTERNALOID, INTERNALOID,
+ INTERNALOID, INTERNALOID);
+ break;
+ case GIN_COMPARE_PARTIAL_PROC:
+ ok = check_amproc_signature(procform->amproc, INT4OID, false,
+ 4, 4, opckeytype, opckeytype,
+ INT2OID, INTERNALOID);
+ break;
+ case GIN_TRICONSISTENT_PROC:
+ ok = check_amproc_signature(procform->amproc, CHAROID, false,
+ 7, 7, INTERNALOID, INT2OID,
+ opcintype, INT4OID,
+ INTERNALOID, INTERNALOID,
+ INTERNALOID);
+ break;
+ case GIN_OPTIONS_PROC:
+ ok = check_amoptsproc_signature(procform->amproc);
+ break;
+ default:
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains function %s with invalid support number %d",
+ opfamilyname, "gin",
+ format_procedure(procform->amproc),
+ procform->amprocnum)));
+ result = false;
+ continue; /* don't want additional message */
+ }
+
+ if (!ok)
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains function %s with wrong signature for support number %d",
+ opfamilyname, "gin",
+ format_procedure(procform->amproc),
+ procform->amprocnum)));
+ result = false;
+ }
+ }
+
+ /* Check individual operators */
+ for (i = 0; i < oprlist->n_members; i++)
+ {
+ HeapTuple oprtup = &oprlist->members[i]->tuple;
+ Form_pg_amop oprform = (Form_pg_amop) GETSTRUCT(oprtup);
+
+ /* TODO: Check that only allowed strategy numbers exist */
+ if (oprform->amopstrategy < 1 || oprform->amopstrategy > 63)
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains operator %s with invalid strategy number %d",
+ opfamilyname, "gin",
+ format_operator(oprform->amopopr),
+ oprform->amopstrategy)));
+ result = false;
+ }
+
+ /* gin doesn't support ORDER BY operators */
+ if (oprform->amoppurpose != AMOP_SEARCH ||
+ OidIsValid(oprform->amopsortfamily))
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains invalid ORDER BY specification for operator %s",
+ opfamilyname, "gin",
+ format_operator(oprform->amopopr))));
+ result = false;
+ }
+
+ /* Check operator signature --- same for all gin strategies */
+ if (!check_amop_signature(oprform->amopopr, BOOLOID,
+ oprform->amoplefttype,
+ oprform->amoprighttype))
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains operator %s with wrong signature",
+ opfamilyname, "gin",
+ format_operator(oprform->amopopr))));
+ result = false;
+ }
+ }
+
+ /* Now check for inconsistent groups of operators/functions */
+ grouplist = identify_opfamily_groups(oprlist, proclist);
+ opclassgroup = NULL;
+ foreach(lc, grouplist)
+ {
+ OpFamilyOpFuncGroup *thisgroup = (OpFamilyOpFuncGroup *) lfirst(lc);
+
+ /* Remember the group exactly matching the test opclass */
+ if (thisgroup->lefttype == opcintype &&
+ thisgroup->righttype == opcintype)
+ opclassgroup = thisgroup;
+
+ /*
+ * There is not a lot we can do to check the operator sets, since each
+ * GIN opclass is more or less a law unto itself, and some contain
+ * only operators that are binary-compatible with the opclass datatype
+ * (meaning that empty operator sets can be OK). That case also means
+ * that we shouldn't insist on nonempty function sets except for the
+ * opclass's own group.
+ */
+ }
+
+ /* Check that the originally-named opclass is complete */
+ for (i = 1; i <= GINNProcs; i++)
+ {
+ if (opclassgroup &&
+ (opclassgroup->functionset & (((uint64) 1) << i)) != 0)
+ continue; /* got it */
+ if (i == GIN_COMPARE_PROC || i == GIN_COMPARE_PARTIAL_PROC ||
+ i == GIN_OPTIONS_PROC)
+ continue; /* optional method */
+ if (i == GIN_CONSISTENT_PROC || i == GIN_TRICONSISTENT_PROC)
+ continue; /* don't need both, see check below loop */
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator class \"%s\" of access method %s is missing support function %d",
+ opclassname, "gin", i)));
+ result = false;
+ }
+ if (!opclassgroup ||
+ ((opclassgroup->functionset & (1 << GIN_CONSISTENT_PROC)) == 0 &&
+ (opclassgroup->functionset & (1 << GIN_TRICONSISTENT_PROC)) == 0))
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator class \"%s\" of access method %s is missing support function %d or %d",
+ opclassname, "gin",
+ GIN_CONSISTENT_PROC, GIN_TRICONSISTENT_PROC)));
+ result = false;
+ }
+
+
+ ReleaseCatCacheList(proclist);
+ ReleaseCatCacheList(oprlist);
+ ReleaseSysCache(familytup);
+ ReleaseSysCache(classtup);
+
+ return result;
+}
+
+/*
+ * Prechecking function for adding operators/functions to a GIN opfamily.
+ */
+void
+ginadjustmembers(Oid opfamilyoid,
+ Oid opclassoid,
+ List *operators,
+ List *functions)
+{
+ ListCell *lc;
+
+ /*
+ * Operator members of a GIN opfamily should never have hard dependencies,
+ * since their connection to the opfamily depends only on what the support
+ * functions think, and that can be altered. For consistency, we make all
+ * soft dependencies point to the opfamily, though a soft dependency on
+ * the opclass would work as well in the CREATE OPERATOR CLASS case.
+ */
+ foreach(lc, operators)
+ {
+ OpFamilyMember *op = (OpFamilyMember *) lfirst(lc);
+
+ op->ref_is_hard = false;
+ op->ref_is_family = true;
+ op->refobjid = opfamilyoid;
+ }
+
+ /*
+ * Required support functions should have hard dependencies. Preferably
+ * those are just dependencies on the opclass, but if we're in ALTER
+ * OPERATOR FAMILY, we leave the dependency pointing at the whole
+ * opfamily. (Given that GIN opclasses generally don't share opfamilies,
+ * it seems unlikely to be worth working harder.)
+ */
+ foreach(lc, functions)
+ {
+ OpFamilyMember *op = (OpFamilyMember *) lfirst(lc);
+
+ switch (op->number)
+ {
+ case GIN_EXTRACTVALUE_PROC:
+ case GIN_EXTRACTQUERY_PROC:
+ /* Required support function */
+ op->ref_is_hard = true;
+ break;
+ case GIN_COMPARE_PROC:
+ case GIN_CONSISTENT_PROC:
+ case GIN_COMPARE_PARTIAL_PROC:
+ case GIN_TRICONSISTENT_PROC:
+ case GIN_OPTIONS_PROC:
+ /* Optional, so force it to be a soft family dependency */
+ op->ref_is_hard = false;
+ op->ref_is_family = true;
+ op->refobjid = opfamilyoid;
+ break;
+ default:
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("support function number %d is invalid for access method %s",
+ op->number, "gin")));
+ break;
+ }
+ }
+}
diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c
new file mode 100644
index 0000000..09ce4d6
--- /dev/null
+++ b/src/backend/access/gin/ginxlog.c
@@ -0,0 +1,813 @@
+/*-------------------------------------------------------------------------
+ *
+ * ginxlog.c
+ * WAL replay logic for inverted index.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gin/ginxlog.c
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/bufmask.h"
+#include "access/gin_private.h"
+#include "access/ginxlog.h"
+#include "access/xlogutils.h"
+#include "utils/memutils.h"
+
+static MemoryContext opCtx; /* working memory for operations */
+
+static void
+ginRedoClearIncompleteSplit(XLogReaderState *record, uint8 block_id)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ Buffer buffer;
+ Page page;
+
+ if (XLogReadBufferForRedo(record, block_id, &buffer) == BLK_NEEDS_REDO)
+ {
+ page = (Page) BufferGetPage(buffer);
+ GinPageGetOpaque(page)->flags &= ~GIN_INCOMPLETE_SPLIT;
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+}
+
+static void
+ginRedoCreatePTree(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ ginxlogCreatePostingTree *data = (ginxlogCreatePostingTree *) XLogRecGetData(record);
+ char *ptr;
+ Buffer buffer;
+ Page page;
+
+ buffer = XLogInitBufferForRedo(record, 0);
+ page = (Page) BufferGetPage(buffer);
+
+ GinInitBuffer(buffer, GIN_DATA | GIN_LEAF | GIN_COMPRESSED);
+
+ ptr = XLogRecGetData(record) + sizeof(ginxlogCreatePostingTree);
+
+ /* Place page data */
+ memcpy(GinDataLeafPageGetPostingList(page), ptr, data->size);
+
+ GinDataPageSetDataSize(page, data->size);
+
+ PageSetLSN(page, lsn);
+
+ MarkBufferDirty(buffer);
+ UnlockReleaseBuffer(buffer);
+}
+
+static void
+ginRedoInsertEntry(Buffer buffer, bool isLeaf, BlockNumber rightblkno, void *rdata)
+{
+ Page page = BufferGetPage(buffer);
+ ginxlogInsertEntry *data = (ginxlogInsertEntry *) rdata;
+ OffsetNumber offset = data->offset;
+ IndexTuple itup;
+
+ if (rightblkno != InvalidBlockNumber)
+ {
+ /* update link to right page after split */
+ Assert(!GinPageIsLeaf(page));
+ Assert(offset >= FirstOffsetNumber && offset <= PageGetMaxOffsetNumber(page));
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offset));
+ GinSetDownlink(itup, rightblkno);
+ }
+
+ if (data->isDelete)
+ {
+ Assert(GinPageIsLeaf(page));
+ Assert(offset >= FirstOffsetNumber && offset <= PageGetMaxOffsetNumber(page));
+ PageIndexTupleDelete(page, offset);
+ }
+
+ itup = &data->tuple;
+
+ if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), offset, false, false) == InvalidOffsetNumber)
+ {
+ RelFileNode node;
+ ForkNumber forknum;
+ BlockNumber blknum;
+
+ BufferGetTag(buffer, &node, &forknum, &blknum);
+ elog(ERROR, "failed to add item to index page in %u/%u/%u",
+ node.spcNode, node.dbNode, node.relNode);
+ }
+}
+
+/*
+ * Redo recompression of a posting list. Applying all the changes in place is
+ * not always possible, because it might require more space than is available
+ * on the page. Instead, once a modification is required we copy the
+ * unprocessed tail of the page into a separately allocated chunk of memory,
+ * so the original versions of the segments can still be read from it. That
+ * way we never have to shuffle page data around in place.
+ */
+static void
+ginRedoRecompress(Page page, ginxlogRecompressDataLeaf *data)
+{
+ int actionno;
+ int segno;
+ GinPostingList *oldseg;
+ Pointer segmentend;
+ char *walbuf;
+ int totalsize;
+ Pointer tailCopy = NULL;
+ Pointer writePtr;
+ Pointer segptr;
+
+ /*
+ * If the page is in pre-9.4 format, convert to new format first.
+ */
+ if (!GinPageIsCompressed(page))
+ {
+ ItemPointer uncompressed = (ItemPointer) GinDataPageGetData(page);
+ int nuncompressed = GinPageGetOpaque(page)->maxoff;
+ int npacked;
+
+ /*
+ * Empty leaf pages are deleted as part of vacuum, but leftmost and
+		 * rightmost pages are never deleted. So, indexes pg_upgrade'd from
+		 * pre-9.4 instances might contain empty leaf pages, and we need to handle
+ * them correctly.
+ */
+ if (nuncompressed > 0)
+ {
+ GinPostingList *plist;
+
+ plist = ginCompressPostingList(uncompressed, nuncompressed,
+ BLCKSZ, &npacked);
+ totalsize = SizeOfGinPostingList(plist);
+
+ Assert(npacked == nuncompressed);
+
+ memcpy(GinDataLeafPageGetPostingList(page), plist, totalsize);
+ }
+ else
+ {
+ totalsize = 0;
+ }
+
+ GinDataPageSetDataSize(page, totalsize);
+ GinPageSetCompressed(page);
+ GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber;
+ }
+
+ oldseg = GinDataLeafPageGetPostingList(page);
+ writePtr = (Pointer) oldseg;
+ segmentend = (Pointer) oldseg + GinDataLeafPageGetPostingListSize(page);
+ segno = 0;
+
+ walbuf = ((char *) data) + sizeof(ginxlogRecompressDataLeaf);
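+	/*
+	 * Each action in the WAL data is a segment-number byte and an action
+	 * byte, followed for INSERT/REPLACE by a whole posting-list segment and
+	 * for ADDITEMS by a uint16 item count and that many item pointers.
+	 */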
+ for (actionno = 0; actionno < data->nactions; actionno++)
+ {
+ uint8 a_segno = *((uint8 *) (walbuf++));
+ uint8 a_action = *((uint8 *) (walbuf++));
+ GinPostingList *newseg = NULL;
+ int newsegsize = 0;
+ ItemPointerData *items = NULL;
+ uint16 nitems = 0;
+ ItemPointerData *olditems;
+ int nolditems;
+ ItemPointerData *newitems;
+ int nnewitems;
+ int segsize;
+
+ /* Extract all the information we need from the WAL record */
+ if (a_action == GIN_SEGMENT_INSERT ||
+ a_action == GIN_SEGMENT_REPLACE)
+ {
+ newseg = (GinPostingList *) walbuf;
+ newsegsize = SizeOfGinPostingList(newseg);
+ walbuf += SHORTALIGN(newsegsize);
+ }
+
+ if (a_action == GIN_SEGMENT_ADDITEMS)
+ {
+ memcpy(&nitems, walbuf, sizeof(uint16));
+ walbuf += sizeof(uint16);
+ items = (ItemPointerData *) walbuf;
+ walbuf += nitems * sizeof(ItemPointerData);
+ }
+
+ /* Skip to the segment that this action concerns */
+ Assert(segno <= a_segno);
+ while (segno < a_segno)
+ {
+ /*
+			 * Once modification has started and the page tail has been
+			 * copied, we have to copy even the unmodified segments back
+			 * into place.
+ */
+ segsize = SizeOfGinPostingList(oldseg);
+ if (tailCopy)
+ {
+ Assert(writePtr + segsize < PageGetSpecialPointer(page));
+ memcpy(writePtr, (Pointer) oldseg, segsize);
+ }
+ writePtr += segsize;
+ oldseg = GinNextPostingListSegment(oldseg);
+ segno++;
+ }
+
+ /*
+ * ADDITEMS action is handled like REPLACE, but the new segment to
+ * replace the old one is reconstructed using the old segment from
+ * disk and the new items from the WAL record.
+ */
+ if (a_action == GIN_SEGMENT_ADDITEMS)
+ {
+ int npacked;
+
+ olditems = ginPostingListDecode(oldseg, &nolditems);
+
+ newitems = ginMergeItemPointers(items, nitems,
+ olditems, nolditems,
+ &nnewitems);
+ Assert(nnewitems == nolditems + nitems);
+
+ newseg = ginCompressPostingList(newitems, nnewitems,
+ BLCKSZ, &npacked);
+ Assert(npacked == nnewitems);
+
+ newsegsize = SizeOfGinPostingList(newseg);
+ a_action = GIN_SEGMENT_REPLACE;
+ }
+
+ segptr = (Pointer) oldseg;
+ if (segptr != segmentend)
+ segsize = SizeOfGinPostingList(oldseg);
+ else
+ {
+ /*
+ * Positioned after the last existing segment. Only INSERTs
+ * expected here.
+ */
+ Assert(a_action == GIN_SEGMENT_INSERT);
+ segsize = 0;
+ }
+
+ /*
+		 * We're about to start modifying the page. So, copy the tail of the
+		 * page if that hasn't been done already.
+ */
+ if (!tailCopy && segptr != segmentend)
+ {
+ int tailSize = segmentend - segptr;
+
+ tailCopy = (Pointer) palloc(tailSize);
+ memcpy(tailCopy, segptr, tailSize);
+ segptr = tailCopy;
+ oldseg = (GinPostingList *) segptr;
+ segmentend = segptr + tailSize;
+ }
+
+ switch (a_action)
+ {
+ case GIN_SEGMENT_DELETE:
+ segptr += segsize;
+ segno++;
+ break;
+
+ case GIN_SEGMENT_INSERT:
+ /* copy the new segment in place */
+ Assert(writePtr + newsegsize <= PageGetSpecialPointer(page));
+ memcpy(writePtr, newseg, newsegsize);
+ writePtr += newsegsize;
+ break;
+
+ case GIN_SEGMENT_REPLACE:
+ /* copy the new version of segment in place */
+ Assert(writePtr + newsegsize <= PageGetSpecialPointer(page));
+ memcpy(writePtr, newseg, newsegsize);
+ writePtr += newsegsize;
+ segptr += segsize;
+ segno++;
+ break;
+
+ default:
+ elog(ERROR, "unexpected GIN leaf action: %u", a_action);
+ }
+ oldseg = (GinPostingList *) segptr;
+ }
+
+ /* Copy the rest of unmodified segments if any. */
+ segptr = (Pointer) oldseg;
+ if (segptr != segmentend && tailCopy)
+ {
+ int restSize = segmentend - segptr;
+
+ Assert(writePtr + restSize <= PageGetSpecialPointer(page));
+ memcpy(writePtr, segptr, restSize);
+ writePtr += restSize;
+ }
+
+ totalsize = writePtr - (Pointer) GinDataLeafPageGetPostingList(page);
+ GinDataPageSetDataSize(page, totalsize);
+}
+
+static void
+ginRedoInsertData(Buffer buffer, bool isLeaf, BlockNumber rightblkno, void *rdata)
+{
+ Page page = BufferGetPage(buffer);
+
+ if (isLeaf)
+ {
+ ginxlogRecompressDataLeaf *data = (ginxlogRecompressDataLeaf *) rdata;
+
+ Assert(GinPageIsLeaf(page));
+
+ ginRedoRecompress(page, data);
+ }
+ else
+ {
+ ginxlogInsertDataInternal *data = (ginxlogInsertDataInternal *) rdata;
+ PostingItem *oldpitem;
+
+ Assert(!GinPageIsLeaf(page));
+
+ /* update link to right page after split */
+ oldpitem = GinDataPageGetPostingItem(page, data->offset);
+ PostingItemSetBlockNumber(oldpitem, rightblkno);
+
+ GinDataPageAddPostingItem(page, &data->newitem, data->offset);
+ }
+}
+
+static void
+ginRedoInsert(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ ginxlogInsert *data = (ginxlogInsert *) XLogRecGetData(record);
+ Buffer buffer;
+#ifdef NOT_USED
+ BlockNumber leftChildBlkno = InvalidBlockNumber;
+#endif
+ BlockNumber rightChildBlkno = InvalidBlockNumber;
+ bool isLeaf = (data->flags & GIN_INSERT_ISLEAF) != 0;
+
+ /*
+ * First clear incomplete-split flag on child page if this finishes a
+ * split.
+ */
+ if (!isLeaf)
+ {
+ char *payload = XLogRecGetData(record) + sizeof(ginxlogInsert);
+
+#ifdef NOT_USED
+ leftChildBlkno = BlockIdGetBlockNumber((BlockId) payload);
+#endif
+ payload += sizeof(BlockIdData);
+ rightChildBlkno = BlockIdGetBlockNumber((BlockId) payload);
+ payload += sizeof(BlockIdData);
+
+ ginRedoClearIncompleteSplit(record, 1);
+ }
+
+ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+ {
+ Page page = BufferGetPage(buffer);
+ Size len;
+ char *payload = XLogRecGetBlockData(record, 0, &len);
+
+ /* How to insert the payload is tree-type specific */
+ if (data->flags & GIN_INSERT_ISDATA)
+ {
+ Assert(GinPageIsData(page));
+ ginRedoInsertData(buffer, isLeaf, rightChildBlkno, payload);
+ }
+ else
+ {
+ Assert(!GinPageIsData(page));
+ ginRedoInsertEntry(buffer, isLeaf, rightChildBlkno, payload);
+ }
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+}
+
+static void
+ginRedoSplit(XLogReaderState *record)
+{
+ ginxlogSplit *data = (ginxlogSplit *) XLogRecGetData(record);
+ Buffer lbuffer,
+ rbuffer,
+ rootbuf;
+ bool isLeaf = (data->flags & GIN_INSERT_ISLEAF) != 0;
+ bool isRoot = (data->flags & GIN_SPLIT_ROOT) != 0;
+
+ /*
+ * First clear incomplete-split flag on child page if this finishes a
+ * split
+ */
+ if (!isLeaf)
+ ginRedoClearIncompleteSplit(record, 3);
+
+ if (XLogReadBufferForRedo(record, 0, &lbuffer) != BLK_RESTORED)
+ elog(ERROR, "GIN split record did not contain a full-page image of left page");
+
+ if (XLogReadBufferForRedo(record, 1, &rbuffer) != BLK_RESTORED)
+ elog(ERROR, "GIN split record did not contain a full-page image of right page");
+
+ if (isRoot)
+ {
+ if (XLogReadBufferForRedo(record, 2, &rootbuf) != BLK_RESTORED)
+ elog(ERROR, "GIN split record did not contain a full-page image of root page");
+ UnlockReleaseBuffer(rootbuf);
+ }
+
+ UnlockReleaseBuffer(rbuffer);
+ UnlockReleaseBuffer(lbuffer);
+}
+
+/*
+ * VACUUM_PAGE record contains simply a full image of the page, similar to
+ * an XLOG_FPI record.
+ */
+static void
+ginRedoVacuumPage(XLogReaderState *record)
+{
+ Buffer buffer;
+
+ if (XLogReadBufferForRedo(record, 0, &buffer) != BLK_RESTORED)
+ {
+ elog(ERROR, "replay of gin entry tree page vacuum did not restore the page");
+ }
+ UnlockReleaseBuffer(buffer);
+}
+
+static void
+ginRedoVacuumDataLeafPage(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ Buffer buffer;
+
+ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+ {
+ Page page = BufferGetPage(buffer);
+ Size len;
+ ginxlogVacuumDataLeafPage *xlrec;
+
+ xlrec = (ginxlogVacuumDataLeafPage *) XLogRecGetBlockData(record, 0, &len);
+
+ Assert(GinPageIsLeaf(page));
+ Assert(GinPageIsData(page));
+
+ ginRedoRecompress(page, &xlrec->data);
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+}
+
+static void
+ginRedoDeletePage(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ ginxlogDeletePage *data = (ginxlogDeletePage *) XLogRecGetData(record);
+ Buffer dbuffer;
+ Buffer pbuffer;
+ Buffer lbuffer;
+ Page page;
+
+ /*
+ * Lock left page first in order to prevent possible deadlock with
+ * ginStepRight().
+ */
+ if (XLogReadBufferForRedo(record, 2, &lbuffer) == BLK_NEEDS_REDO)
+ {
+ page = BufferGetPage(lbuffer);
+ Assert(GinPageIsData(page));
+ GinPageGetOpaque(page)->rightlink = data->rightLink;
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(lbuffer);
+ }
+
+ if (XLogReadBufferForRedo(record, 0, &dbuffer) == BLK_NEEDS_REDO)
+ {
+ page = BufferGetPage(dbuffer);
+ Assert(GinPageIsData(page));
+ GinPageSetDeleted(page);
+ GinPageSetDeleteXid(page, data->deleteXid);
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(dbuffer);
+ }
+
+ if (XLogReadBufferForRedo(record, 1, &pbuffer) == BLK_NEEDS_REDO)
+ {
+ page = BufferGetPage(pbuffer);
+ Assert(GinPageIsData(page));
+ Assert(!GinPageIsLeaf(page));
+ GinPageDeletePostingItem(page, data->parentOffset);
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(pbuffer);
+ }
+
+ if (BufferIsValid(lbuffer))
+ UnlockReleaseBuffer(lbuffer);
+ if (BufferIsValid(pbuffer))
+ UnlockReleaseBuffer(pbuffer);
+ if (BufferIsValid(dbuffer))
+ UnlockReleaseBuffer(dbuffer);
+}
+
+static void
+ginRedoUpdateMetapage(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ ginxlogUpdateMeta *data = (ginxlogUpdateMeta *) XLogRecGetData(record);
+ Buffer metabuffer;
+ Page metapage;
+ Buffer buffer;
+
+ /*
+ * Restore the metapage. This is essentially the same as a full-page
+ * image, so restore the metapage unconditionally without looking at the
+ * LSN, to avoid torn page hazards.
+ */
+ metabuffer = XLogInitBufferForRedo(record, 0);
+ Assert(BufferGetBlockNumber(metabuffer) == GIN_METAPAGE_BLKNO);
+ metapage = BufferGetPage(metabuffer);
+
+ GinInitMetabuffer(metabuffer);
+ memcpy(GinPageGetMeta(metapage), &data->metadata, sizeof(GinMetaPageData));
+ PageSetLSN(metapage, lsn);
+ MarkBufferDirty(metabuffer);
+
+ if (data->ntuples > 0)
+ {
+ /*
+ * insert into tail page
+ */
+ if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
+ {
+ Page page = BufferGetPage(buffer);
+ OffsetNumber off;
+ int i;
+ Size tupsize;
+ char *payload;
+ IndexTuple tuples;
+ Size totaltupsize;
+
+ payload = XLogRecGetBlockData(record, 1, &totaltupsize);
+ tuples = (IndexTuple) payload;
+
+ if (PageIsEmpty(page))
+ off = FirstOffsetNumber;
+ else
+ off = OffsetNumberNext(PageGetMaxOffsetNumber(page));
+
+ for (i = 0; i < data->ntuples; i++)
+ {
+ tupsize = IndexTupleSize(tuples);
+
+ if (PageAddItem(page, (Item) tuples, tupsize, off,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add item to index page");
+
+ tuples = (IndexTuple) (((char *) tuples) + tupsize);
+
+ off++;
+ }
+ Assert(payload + totaltupsize == (char *) tuples);
+
+ /*
+ * Increase counter of heap tuples
+ */
+ GinPageGetOpaque(page)->maxoff++;
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+ }
+ else if (data->prevTail != InvalidBlockNumber)
+ {
+ /*
+ * New tail
+ */
+ if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
+ {
+ Page page = BufferGetPage(buffer);
+
+ GinPageGetOpaque(page)->rightlink = data->newRightlink;
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+ }
+
+ UnlockReleaseBuffer(metabuffer);
+}
+
+static void
+ginRedoInsertListPage(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ ginxlogInsertListPage *data = (ginxlogInsertListPage *) XLogRecGetData(record);
+ Buffer buffer;
+ Page page;
+ OffsetNumber l,
+ off = FirstOffsetNumber;
+ int i,
+ tupsize;
+ char *payload;
+ IndexTuple tuples;
+ Size totaltupsize;
+
+ /* We always re-initialize the page. */
+ buffer = XLogInitBufferForRedo(record, 0);
+ page = BufferGetPage(buffer);
+
+ GinInitBuffer(buffer, GIN_LIST);
+ GinPageGetOpaque(page)->rightlink = data->rightlink;
+ if (data->rightlink == InvalidBlockNumber)
+ {
+ /* tail of sublist */
+ GinPageSetFullRow(page);
+ GinPageGetOpaque(page)->maxoff = 1;
+ }
+ else
+ {
+ GinPageGetOpaque(page)->maxoff = 0;
+ }
+
+ payload = XLogRecGetBlockData(record, 0, &totaltupsize);
+
+ tuples = (IndexTuple) payload;
+ for (i = 0; i < data->ntuples; i++)
+ {
+ tupsize = IndexTupleSize(tuples);
+
+ l = PageAddItem(page, (Item) tuples, tupsize, off, false, false);
+
+ if (l == InvalidOffsetNumber)
+ elog(ERROR, "failed to add item to index page");
+
+ tuples = (IndexTuple) (((char *) tuples) + tupsize);
+ off++;
+ }
+ Assert((char *) tuples == payload + totaltupsize);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+
+ UnlockReleaseBuffer(buffer);
+}
+
+static void
+ginRedoDeleteListPages(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ ginxlogDeleteListPages *data = (ginxlogDeleteListPages *) XLogRecGetData(record);
+ Buffer metabuffer;
+ Page metapage;
+ int i;
+
+ metabuffer = XLogInitBufferForRedo(record, 0);
+ Assert(BufferGetBlockNumber(metabuffer) == GIN_METAPAGE_BLKNO);
+ metapage = BufferGetPage(metabuffer);
+
+ GinInitMetabuffer(metabuffer);
+
+ memcpy(GinPageGetMeta(metapage), &data->metadata, sizeof(GinMetaPageData));
+ PageSetLSN(metapage, lsn);
+ MarkBufferDirty(metabuffer);
+
+ /*
+ * In normal operation, shiftList() takes exclusive lock on all the
+ * pages-to-be-deleted simultaneously. During replay, however, it should
+ * be all right to lock them one at a time. This is dependent on the fact
+ * that we are deleting pages from the head of the list, and that readers
+ * share-lock the next page before releasing the one they are on. So we
+ * cannot get past a reader that is on, or due to visit, any page we are
+ * going to delete. New incoming readers will block behind our metapage
+ * lock and then see a fully updated page list.
+ *
+ * No full-page images are taken of the deleted pages. Instead, they are
+ * re-initialized as empty, deleted pages. Their right-links don't need to
+ * be preserved, because no new readers can see the pages, as explained
+ * above.
+ */
+ for (i = 0; i < data->ndeleted; i++)
+ {
+ Buffer buffer;
+ Page page;
+
+ buffer = XLogInitBufferForRedo(record, i + 1);
+ page = BufferGetPage(buffer);
+ GinInitBuffer(buffer, GIN_DELETED);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+
+ UnlockReleaseBuffer(buffer);
+ }
+ UnlockReleaseBuffer(metabuffer);
+}
+
+void
+gin_redo(XLogReaderState *record)
+{
+ uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+ MemoryContext oldCtx;
+
+ /*
+ * GIN indexes do not require any conflict processing. NB: If we ever
+ * implement a similar optimization as we have in b-tree, and remove
+ * killed tuples outside VACUUM, we'll need to handle that here.
+ */
+
+ oldCtx = MemoryContextSwitchTo(opCtx);
+ switch (info)
+ {
+ case XLOG_GIN_CREATE_PTREE:
+ ginRedoCreatePTree(record);
+ break;
+ case XLOG_GIN_INSERT:
+ ginRedoInsert(record);
+ break;
+ case XLOG_GIN_SPLIT:
+ ginRedoSplit(record);
+ break;
+ case XLOG_GIN_VACUUM_PAGE:
+ ginRedoVacuumPage(record);
+ break;
+ case XLOG_GIN_VACUUM_DATA_LEAF_PAGE:
+ ginRedoVacuumDataLeafPage(record);
+ break;
+ case XLOG_GIN_DELETE_PAGE:
+ ginRedoDeletePage(record);
+ break;
+ case XLOG_GIN_UPDATE_META_PAGE:
+ ginRedoUpdateMetapage(record);
+ break;
+ case XLOG_GIN_INSERT_LISTPAGE:
+ ginRedoInsertListPage(record);
+ break;
+ case XLOG_GIN_DELETE_LISTPAGE:
+ ginRedoDeleteListPages(record);
+ break;
+ default:
+ elog(PANIC, "gin_redo: unknown op code %u", info);
+ }
+ MemoryContextSwitchTo(oldCtx);
+ MemoryContextReset(opCtx);
+}
+
+void
+gin_xlog_startup(void)
+{
+ opCtx = AllocSetContextCreate(CurrentMemoryContext,
+ "GIN recovery temporary context",
+ ALLOCSET_DEFAULT_SIZES);
+}
+
+void
+gin_xlog_cleanup(void)
+{
+ MemoryContextDelete(opCtx);
+ opCtx = NULL;
+}
+
+/*
+ * Mask a GIN page before running consistency checks on it.
+ */
+void
+gin_mask(char *pagedata, BlockNumber blkno)
+{
+ Page page = (Page) pagedata;
+ PageHeader pagehdr = (PageHeader) page;
+ GinPageOpaque opaque;
+
+ mask_page_lsn_and_checksum(page);
+ opaque = GinPageGetOpaque(page);
+
+ mask_page_hint_bits(page);
+
+ /*
+ * For a GIN_DELETED page, the page is initialized to empty. Hence, mask
+ * the whole page content. For other pages, mask the hole if pd_lower
+ * appears to have been set correctly.
+ */
+ if (opaque->flags & GIN_DELETED)
+ mask_page_content(page);
+ else if (pagehdr->pd_lower > SizeOfPageHeaderData)
+ mask_unused_space(page);
+}