summaryrefslogtreecommitdiffstats
path: root/src/backend/access/gin/ginget.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/access/gin/ginget.c')
-rw-r--r--src/backend/access/gin/ginget.c1973
1 files changed, 1973 insertions, 0 deletions
diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c
new file mode 100644
index 0000000..00fd6a9
--- /dev/null
+++ b/src/backend/access/gin/ginget.c
@@ -0,0 +1,1973 @@
+/*-------------------------------------------------------------------------
+ *
+ * ginget.c
+ * fetch tuples from a GIN scan.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gin/ginget.c
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/gin_private.h"
+#include "access/relscan.h"
+#include "common/pg_prng.h"
+#include "miscadmin.h"
+#include "storage/predicate.h"
+#include "utils/datum.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+/* GUC parameter */
+int GinFuzzySearchLimit = 0;
+
+typedef struct pendingPosition
+{
+ Buffer pendingBuffer;
+ OffsetNumber firstOffset;
+ OffsetNumber lastOffset;
+ ItemPointerData item;
+ bool *hasMatchKey;
+} pendingPosition;
+
+
+/*
+ * Goes to the next page if current offset is outside of bounds
+ */
+static bool
+moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack, Snapshot snapshot)
+{
+ Page page = BufferGetPage(stack->buffer);
+
+ if (stack->off > PageGetMaxOffsetNumber(page))
+ {
+ /*
+ * We scanned the whole page, so we should take right page
+ */
+ if (GinPageRightMost(page))
+ return false; /* no more pages */
+
+ stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE);
+ stack->blkno = BufferGetBlockNumber(stack->buffer);
+ stack->off = FirstOffsetNumber;
+ PredicateLockPage(btree->index, stack->blkno, snapshot);
+ }
+
+ return true;
+}
+
+/*
+ * Scan all pages of a posting tree and save all its heap ItemPointers
+ * in scanEntry->matchBitmap
+ */
+static void
+scanPostingTree(Relation index, GinScanEntry scanEntry,
+ BlockNumber rootPostingTree, Snapshot snapshot)
+{
+ GinBtreeData btree;
+ GinBtreeStack *stack;
+ Buffer buffer;
+ Page page;
+
+ /* Descend to the leftmost leaf page */
+ stack = ginScanBeginPostingTree(&btree, index, rootPostingTree, snapshot);
+ buffer = stack->buffer;
+
+ IncrBufferRefCount(buffer); /* prevent unpin in freeGinBtreeStack */
+
+ freeGinBtreeStack(stack);
+
+ /*
+ * Loop iterates through all leaf pages of posting tree
+ */
+ for (;;)
+ {
+ page = BufferGetPage(buffer);
+ if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0)
+ {
+ int n = GinDataLeafPageGetItemsToTbm(page, scanEntry->matchBitmap);
+
+ scanEntry->predictNumberResult += n;
+ }
+
+ if (GinPageRightMost(page))
+ break; /* no more pages */
+
+ buffer = ginStepRight(buffer, index, GIN_SHARE);
+ }
+
+ UnlockReleaseBuffer(buffer);
+}
+
+/*
+ * Collects TIDs into scanEntry->matchBitmap for all heap tuples that
+ * match the search entry. This supports three different match modes:
+ *
+ * 1. Partial-match support: scan from current point until the
+ * comparePartialFn says we're done.
+ * 2. SEARCH_MODE_ALL: scan from current point (which should be first
+ * key for the current attnum) until we hit null items or end of attnum
+ * 3. SEARCH_MODE_EVERYTHING: scan from current point (which should be first
+ * key for the current attnum) until we hit end of attnum
+ *
+ * Returns true if done, false if it's necessary to restart scan from scratch
+ */
+static bool
+collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
+ GinScanEntry scanEntry, Snapshot snapshot)
+{
+ OffsetNumber attnum;
+ Form_pg_attribute attr;
+
+ /* Initialize empty bitmap result */
+ scanEntry->matchBitmap = tbm_create(work_mem * 1024L, NULL);
+
+ /* Null query cannot partial-match anything */
+ if (scanEntry->isPartialMatch &&
+ scanEntry->queryCategory != GIN_CAT_NORM_KEY)
+ return true;
+
+ /* Locate tupdesc entry for key column (for attbyval/attlen data) */
+ attnum = scanEntry->attnum;
+ attr = TupleDescAttr(btree->ginstate->origTupdesc, attnum - 1);
+
+ /*
+ * Predicate lock entry leaf page, following pages will be locked by
+ * moveRightIfItNeeded()
+ */
+ PredicateLockPage(btree->index,
+ BufferGetBlockNumber(stack->buffer),
+ snapshot);
+
+ for (;;)
+ {
+ Page page;
+ IndexTuple itup;
+ Datum idatum;
+ GinNullCategory icategory;
+
+ /*
+ * stack->off points to the interested entry, buffer is already locked
+ */
+ if (moveRightIfItNeeded(btree, stack, snapshot) == false)
+ return true;
+
+ page = BufferGetPage(stack->buffer);
+ TestForOldSnapshot(snapshot, btree->index, page);
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
+
+ /*
+ * If tuple stores another attribute then stop scan
+ */
+ if (gintuple_get_attrnum(btree->ginstate, itup) != attnum)
+ return true;
+
+ /* Safe to fetch attribute value */
+ idatum = gintuple_get_key(btree->ginstate, itup, &icategory);
+
+ /*
+ * Check for appropriate scan stop conditions
+ */
+ if (scanEntry->isPartialMatch)
+ {
+ int32 cmp;
+
+ /*
+ * In partial match, stop scan at any null (including
+ * placeholders); partial matches never match nulls
+ */
+ if (icategory != GIN_CAT_NORM_KEY)
+ return true;
+
+ /*----------
+ * Check of partial match.
+ * case cmp == 0 => match
+ * case cmp > 0 => not match and finish scan
+ * case cmp < 0 => not match and continue scan
+ *----------
+ */
+ cmp = DatumGetInt32(FunctionCall4Coll(&btree->ginstate->comparePartialFn[attnum - 1],
+ btree->ginstate->supportCollation[attnum - 1],
+ scanEntry->queryKey,
+ idatum,
+ UInt16GetDatum(scanEntry->strategy),
+ PointerGetDatum(scanEntry->extra_data)));
+
+ if (cmp > 0)
+ return true;
+ else if (cmp < 0)
+ {
+ stack->off++;
+ continue;
+ }
+ }
+ else if (scanEntry->searchMode == GIN_SEARCH_MODE_ALL)
+ {
+ /*
+ * In ALL mode, we are not interested in null items, so we can
+ * stop if we get to a null-item placeholder (which will be the
+ * last entry for a given attnum). We do want to include NULL_KEY
+ * and EMPTY_ITEM entries, though.
+ */
+ if (icategory == GIN_CAT_NULL_ITEM)
+ return true;
+ }
+
+ /*
+ * OK, we want to return the TIDs listed in this entry.
+ */
+ if (GinIsPostingTree(itup))
+ {
+ BlockNumber rootPostingTree = GinGetPostingTree(itup);
+
+ /*
+ * We should unlock current page (but not unpin) during tree scan
+ * to prevent deadlock with vacuum processes.
+ *
+ * We save current entry value (idatum) to be able to re-find our
+ * tuple after re-locking
+ */
+ if (icategory == GIN_CAT_NORM_KEY)
+ idatum = datumCopy(idatum, attr->attbyval, attr->attlen);
+
+ LockBuffer(stack->buffer, GIN_UNLOCK);
+
+ /*
+ * Acquire predicate lock on the posting tree. We already hold a
+ * lock on the entry page, but insertions to the posting tree
+ * don't check for conflicts on that level.
+ */
+ PredicateLockPage(btree->index, rootPostingTree, snapshot);
+
+ /* Collect all the TIDs in this entry's posting tree */
+ scanPostingTree(btree->index, scanEntry, rootPostingTree,
+ snapshot);
+
+ /*
+ * We lock again the entry page and while it was unlocked insert
+ * might have occurred, so we need to re-find our position.
+ */
+ LockBuffer(stack->buffer, GIN_SHARE);
+ page = BufferGetPage(stack->buffer);
+ if (!GinPageIsLeaf(page))
+ {
+ /*
+ * Root page becomes non-leaf while we unlock it. We will
+ * start again, this situation doesn't occur often - root can
+ * became a non-leaf only once per life of index.
+ */
+ return false;
+ }
+
+ /* Search forward to re-find idatum */
+ for (;;)
+ {
+ if (moveRightIfItNeeded(btree, stack, snapshot) == false)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to re-find tuple within index \"%s\"",
+ RelationGetRelationName(btree->index))));
+
+ page = BufferGetPage(stack->buffer);
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
+
+ if (gintuple_get_attrnum(btree->ginstate, itup) == attnum)
+ {
+ Datum newDatum;
+ GinNullCategory newCategory;
+
+ newDatum = gintuple_get_key(btree->ginstate, itup,
+ &newCategory);
+
+ if (ginCompareEntries(btree->ginstate, attnum,
+ newDatum, newCategory,
+ idatum, icategory) == 0)
+ break; /* Found! */
+ }
+
+ stack->off++;
+ }
+
+ if (icategory == GIN_CAT_NORM_KEY && !attr->attbyval)
+ pfree(DatumGetPointer(idatum));
+ }
+ else
+ {
+ ItemPointer ipd;
+ int nipd;
+
+ ipd = ginReadTuple(btree->ginstate, scanEntry->attnum, itup, &nipd);
+ tbm_add_tuples(scanEntry->matchBitmap, ipd, nipd, false);
+ scanEntry->predictNumberResult += GinGetNPosting(itup);
+ pfree(ipd);
+ }
+
+ /*
+ * Done with this entry, go to the next
+ */
+ stack->off++;
+ }
+}
+
+/*
+ * Start* functions setup beginning state of searches: finds correct buffer and pins it.
+ */
+static void
+startScanEntry(GinState *ginstate, GinScanEntry entry, Snapshot snapshot)
+{
+ GinBtreeData btreeEntry;
+ GinBtreeStack *stackEntry;
+ Page page;
+ bool needUnlock;
+
+restartScanEntry:
+ entry->buffer = InvalidBuffer;
+ ItemPointerSetMin(&entry->curItem);
+ entry->offset = InvalidOffsetNumber;
+ if (entry->list)
+ pfree(entry->list);
+ entry->list = NULL;
+ entry->nlist = 0;
+ entry->matchBitmap = NULL;
+ entry->matchResult = NULL;
+ entry->reduceResult = false;
+ entry->predictNumberResult = 0;
+
+ /*
+ * we should find entry, and begin scan of posting tree or just store
+ * posting list in memory
+ */
+ ginPrepareEntryScan(&btreeEntry, entry->attnum,
+ entry->queryKey, entry->queryCategory,
+ ginstate);
+ stackEntry = ginFindLeafPage(&btreeEntry, true, false, snapshot);
+ page = BufferGetPage(stackEntry->buffer);
+
+ /* ginFindLeafPage() will have already checked snapshot age. */
+ needUnlock = true;
+
+ entry->isFinished = true;
+
+ if (entry->isPartialMatch ||
+ entry->queryCategory == GIN_CAT_EMPTY_QUERY)
+ {
+ /*
+ * btreeEntry.findItem locates the first item >= given search key.
+ * (For GIN_CAT_EMPTY_QUERY, it will find the leftmost index item
+ * because of the way the GIN_CAT_EMPTY_QUERY category code is
+ * assigned.) We scan forward from there and collect all TIDs needed
+ * for the entry type.
+ */
+ btreeEntry.findItem(&btreeEntry, stackEntry);
+ if (collectMatchBitmap(&btreeEntry, stackEntry, entry, snapshot)
+ == false)
+ {
+ /*
+ * GIN tree was seriously restructured, so we will cleanup all
+ * found data and rescan. See comments near 'return false' in
+ * collectMatchBitmap()
+ */
+ if (entry->matchBitmap)
+ {
+ if (entry->matchIterator)
+ tbm_end_iterate(entry->matchIterator);
+ entry->matchIterator = NULL;
+ tbm_free(entry->matchBitmap);
+ entry->matchBitmap = NULL;
+ }
+ LockBuffer(stackEntry->buffer, GIN_UNLOCK);
+ freeGinBtreeStack(stackEntry);
+ goto restartScanEntry;
+ }
+
+ if (entry->matchBitmap && !tbm_is_empty(entry->matchBitmap))
+ {
+ entry->matchIterator = tbm_begin_iterate(entry->matchBitmap);
+ entry->isFinished = false;
+ }
+ }
+ else if (btreeEntry.findItem(&btreeEntry, stackEntry))
+ {
+ IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off));
+
+ if (GinIsPostingTree(itup))
+ {
+ BlockNumber rootPostingTree = GinGetPostingTree(itup);
+ GinBtreeStack *stack;
+ Page page;
+ ItemPointerData minItem;
+
+ /*
+ * This is an equality scan, so lock the root of the posting tree.
+ * It represents a lock on the exact key value, and covers all the
+ * items in the posting tree.
+ */
+ PredicateLockPage(ginstate->index, rootPostingTree, snapshot);
+
+ /*
+ * We should unlock entry page before touching posting tree to
+ * prevent deadlocks with vacuum processes. Because entry is never
+ * deleted from page and posting tree is never reduced to the
+ * posting list, we can unlock page after getting BlockNumber of
+ * root of posting tree.
+ */
+ LockBuffer(stackEntry->buffer, GIN_UNLOCK);
+ needUnlock = false;
+
+ stack = ginScanBeginPostingTree(&entry->btree, ginstate->index,
+ rootPostingTree, snapshot);
+ entry->buffer = stack->buffer;
+
+ /*
+ * We keep buffer pinned because we need to prevent deletion of
+ * page during scan. See GIN's vacuum implementation. RefCount is
+ * increased to keep buffer pinned after freeGinBtreeStack() call.
+ */
+ IncrBufferRefCount(entry->buffer);
+
+ page = BufferGetPage(entry->buffer);
+
+ /*
+ * Load the first page into memory.
+ */
+ ItemPointerSetMin(&minItem);
+ entry->list = GinDataLeafPageGetItems(page, &entry->nlist, minItem);
+
+ entry->predictNumberResult = stack->predictNumber * entry->nlist;
+
+ LockBuffer(entry->buffer, GIN_UNLOCK);
+ freeGinBtreeStack(stack);
+ entry->isFinished = false;
+ }
+ else
+ {
+ /*
+ * Lock the entry leaf page. This is more coarse-grained than
+ * necessary, because it will conflict with any insertions that
+ * land on the same leaf page, not only the exact key we searched
+ * for. But locking an individual tuple would require updating
+ * that lock whenever it moves because of insertions or vacuums,
+ * which seems too complicated.
+ */
+ PredicateLockPage(ginstate->index,
+ BufferGetBlockNumber(stackEntry->buffer),
+ snapshot);
+ if (GinGetNPosting(itup) > 0)
+ {
+ entry->list = ginReadTuple(ginstate, entry->attnum, itup,
+ &entry->nlist);
+ entry->predictNumberResult = entry->nlist;
+
+ entry->isFinished = false;
+ }
+ }
+ }
+ else
+ {
+ /*
+ * No entry found. Predicate lock the leaf page, to lock the place
+ * where the entry would've been, had there been one.
+ */
+ PredicateLockPage(ginstate->index,
+ BufferGetBlockNumber(stackEntry->buffer), snapshot);
+ }
+
+ if (needUnlock)
+ LockBuffer(stackEntry->buffer, GIN_UNLOCK);
+ freeGinBtreeStack(stackEntry);
+}
+
+/*
+ * Comparison function for scan entry indexes. Sorts by predictNumberResult,
+ * least frequent items first.
+ */
+static int
+entryIndexByFrequencyCmp(const void *a1, const void *a2, void *arg)
+{
+ const GinScanKey key = (const GinScanKey) arg;
+ int i1 = *(const int *) a1;
+ int i2 = *(const int *) a2;
+ uint32 n1 = key->scanEntry[i1]->predictNumberResult;
+ uint32 n2 = key->scanEntry[i2]->predictNumberResult;
+
+ if (n1 < n2)
+ return -1;
+ else if (n1 == n2)
+ return 0;
+ else
+ return 1;
+}
+
+static void
+startScanKey(GinState *ginstate, GinScanOpaque so, GinScanKey key)
+{
+ MemoryContext oldCtx = CurrentMemoryContext;
+ int i;
+ int j;
+ int *entryIndexes;
+
+ ItemPointerSetMin(&key->curItem);
+ key->curItemMatches = false;
+ key->recheckCurItem = false;
+ key->isFinished = false;
+
+ /*
+ * Divide the entries into two distinct sets: required and additional.
+ * Additional entries are not enough for a match alone, without any items
+ * from the required set, but are needed by the consistent function to
+ * decide if an item matches. When scanning, we can skip over items from
+ * additional entries that have no corresponding matches in any of the
+ * required entries. That speeds up queries like "frequent & rare"
+ * considerably, if the frequent term can be put in the additional set.
+ *
+ * There can be many legal ways to divide them entries into these two
+ * sets. A conservative division is to just put everything in the required
+ * set, but the more you can put in the additional set, the more you can
+ * skip during the scan. To maximize skipping, we try to put as many
+ * frequent items as possible into additional, and less frequent ones into
+ * required. To do that, sort the entries by frequency
+ * (predictNumberResult), and put entries into the required set in that
+ * order, until the consistent function says that none of the remaining
+ * entries can form a match, without any items from the required set. The
+ * rest go to the additional set.
+ *
+ * Exclude-only scan keys are known to have no required entries.
+ */
+ if (key->excludeOnly)
+ {
+ MemoryContextSwitchTo(so->keyCtx);
+
+ key->nrequired = 0;
+ key->nadditional = key->nentries;
+ key->additionalEntries = palloc(key->nadditional * sizeof(GinScanEntry));
+ for (i = 0; i < key->nadditional; i++)
+ key->additionalEntries[i] = key->scanEntry[i];
+ }
+ else if (key->nentries > 1)
+ {
+ MemoryContextSwitchTo(so->tempCtx);
+
+ entryIndexes = (int *) palloc(sizeof(int) * key->nentries);
+ for (i = 0; i < key->nentries; i++)
+ entryIndexes[i] = i;
+ qsort_arg(entryIndexes, key->nentries, sizeof(int),
+ entryIndexByFrequencyCmp, key);
+
+ for (i = 0; i < key->nentries - 1; i++)
+ {
+ /* Pass all entries <= i as FALSE, and the rest as MAYBE */
+ for (j = 0; j <= i; j++)
+ key->entryRes[entryIndexes[j]] = GIN_FALSE;
+ for (j = i + 1; j < key->nentries; j++)
+ key->entryRes[entryIndexes[j]] = GIN_MAYBE;
+
+ if (key->triConsistentFn(key) == GIN_FALSE)
+ break;
+ }
+ /* i is now the last required entry. */
+
+ MemoryContextSwitchTo(so->keyCtx);
+
+ key->nrequired = i + 1;
+ key->nadditional = key->nentries - key->nrequired;
+ key->requiredEntries = palloc(key->nrequired * sizeof(GinScanEntry));
+ key->additionalEntries = palloc(key->nadditional * sizeof(GinScanEntry));
+
+ j = 0;
+ for (i = 0; i < key->nrequired; i++)
+ key->requiredEntries[i] = key->scanEntry[entryIndexes[j++]];
+ for (i = 0; i < key->nadditional; i++)
+ key->additionalEntries[i] = key->scanEntry[entryIndexes[j++]];
+
+ /* clean up after consistentFn calls (also frees entryIndexes) */
+ MemoryContextReset(so->tempCtx);
+ }
+ else
+ {
+ MemoryContextSwitchTo(so->keyCtx);
+
+ key->nrequired = 1;
+ key->nadditional = 0;
+ key->requiredEntries = palloc(1 * sizeof(GinScanEntry));
+ key->requiredEntries[0] = key->scanEntry[0];
+ }
+ MemoryContextSwitchTo(oldCtx);
+}
+
+static void
+startScan(IndexScanDesc scan)
+{
+ GinScanOpaque so = (GinScanOpaque) scan->opaque;
+ GinState *ginstate = &so->ginstate;
+ uint32 i;
+
+ for (i = 0; i < so->totalentries; i++)
+ startScanEntry(ginstate, so->entries[i], scan->xs_snapshot);
+
+ if (GinFuzzySearchLimit > 0)
+ {
+ /*
+ * If all of keys more than threshold we will try to reduce result, we
+ * hope (and only hope, for intersection operation of array our
+ * supposition isn't true), that total result will not more than
+ * minimal predictNumberResult.
+ */
+ bool reduce = true;
+
+ for (i = 0; i < so->totalentries; i++)
+ {
+ if (so->entries[i]->predictNumberResult <= so->totalentries * GinFuzzySearchLimit)
+ {
+ reduce = false;
+ break;
+ }
+ }
+ if (reduce)
+ {
+ for (i = 0; i < so->totalentries; i++)
+ {
+ so->entries[i]->predictNumberResult /= so->totalentries;
+ so->entries[i]->reduceResult = true;
+ }
+ }
+ }
+
+ /*
+ * Now that we have the estimates for the entry frequencies, finish
+ * initializing the scan keys.
+ */
+ for (i = 0; i < so->nkeys; i++)
+ startScanKey(ginstate, so, so->keys + i);
+}
+
+/*
+ * Load the next batch of item pointers from a posting tree.
+ *
+ * Note that we copy the page into GinScanEntry->list array and unlock it, but
+ * keep it pinned to prevent interference with vacuum.
+ */
+static void
+entryLoadMoreItems(GinState *ginstate, GinScanEntry entry,
+ ItemPointerData advancePast, Snapshot snapshot)
+{
+ Page page;
+ int i;
+ bool stepright;
+
+ if (!BufferIsValid(entry->buffer))
+ {
+ entry->isFinished = true;
+ return;
+ }
+
+ /*
+ * We have two strategies for finding the correct page: step right from
+ * the current page, or descend the tree again from the root. If
+ * advancePast equals the current item, the next matching item should be
+ * on the next page, so we step right. Otherwise, descend from root.
+ */
+ if (ginCompareItemPointers(&entry->curItem, &advancePast) == 0)
+ {
+ stepright = true;
+ LockBuffer(entry->buffer, GIN_SHARE);
+ }
+ else
+ {
+ GinBtreeStack *stack;
+
+ ReleaseBuffer(entry->buffer);
+
+ /*
+ * Set the search key, and find the correct leaf page.
+ */
+ if (ItemPointerIsLossyPage(&advancePast))
+ {
+ ItemPointerSet(&entry->btree.itemptr,
+ GinItemPointerGetBlockNumber(&advancePast) + 1,
+ FirstOffsetNumber);
+ }
+ else
+ {
+ ItemPointerSet(&entry->btree.itemptr,
+ GinItemPointerGetBlockNumber(&advancePast),
+ OffsetNumberNext(GinItemPointerGetOffsetNumber(&advancePast)));
+ }
+ entry->btree.fullScan = false;
+ stack = ginFindLeafPage(&entry->btree, true, false, snapshot);
+
+ /* we don't need the stack, just the buffer. */
+ entry->buffer = stack->buffer;
+ IncrBufferRefCount(entry->buffer);
+ freeGinBtreeStack(stack);
+ stepright = false;
+ }
+
+ elog(DEBUG2, "entryLoadMoreItems, %u/%u, skip: %d",
+ GinItemPointerGetBlockNumber(&advancePast),
+ GinItemPointerGetOffsetNumber(&advancePast),
+ !stepright);
+
+ page = BufferGetPage(entry->buffer);
+ for (;;)
+ {
+ entry->offset = InvalidOffsetNumber;
+ if (entry->list)
+ {
+ pfree(entry->list);
+ entry->list = NULL;
+ entry->nlist = 0;
+ }
+
+ if (stepright)
+ {
+ /*
+ * We've processed all the entries on this page. If it was the
+ * last page in the tree, we're done.
+ */
+ if (GinPageRightMost(page))
+ {
+ UnlockReleaseBuffer(entry->buffer);
+ entry->buffer = InvalidBuffer;
+ entry->isFinished = true;
+ return;
+ }
+
+ /*
+ * Step to next page, following the right link. then find the
+ * first ItemPointer greater than advancePast.
+ */
+ entry->buffer = ginStepRight(entry->buffer,
+ ginstate->index,
+ GIN_SHARE);
+ page = BufferGetPage(entry->buffer);
+ }
+ stepright = true;
+
+ if (GinPageGetOpaque(page)->flags & GIN_DELETED)
+ continue; /* page was deleted by concurrent vacuum */
+
+ /*
+ * The first item > advancePast might not be on this page, but
+ * somewhere to the right, if the page was split, or a non-match from
+ * another key in the query allowed us to skip some items from this
+ * entry. Keep following the right-links until we re-find the correct
+ * page.
+ */
+ if (!GinPageRightMost(page) &&
+ ginCompareItemPointers(&advancePast, GinDataPageGetRightBound(page)) >= 0)
+ {
+ /*
+ * the item we're looking is > the right bound of the page, so it
+ * can't be on this page.
+ */
+ continue;
+ }
+
+ entry->list = GinDataLeafPageGetItems(page, &entry->nlist, advancePast);
+
+ for (i = 0; i < entry->nlist; i++)
+ {
+ if (ginCompareItemPointers(&advancePast, &entry->list[i]) < 0)
+ {
+ entry->offset = i;
+
+ if (GinPageRightMost(page))
+ {
+ /* after processing the copied items, we're done. */
+ UnlockReleaseBuffer(entry->buffer);
+ entry->buffer = InvalidBuffer;
+ }
+ else
+ LockBuffer(entry->buffer, GIN_UNLOCK);
+ return;
+ }
+ }
+ }
+}
+
+#define gin_rand() pg_prng_double(&pg_global_prng_state)
+#define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) )
+
+/*
+ * Sets entry->curItem to next heap item pointer > advancePast, for one entry
+ * of one scan key, or sets entry->isFinished to true if there are no more.
+ *
+ * Item pointers are returned in ascending order.
+ *
+ * Note: this can return a "lossy page" item pointer, indicating that the
+ * entry potentially matches all items on that heap page. However, it is
+ * not allowed to return both a lossy page pointer and exact (regular)
+ * item pointers for the same page. (Doing so would break the key-combination
+ * logic in keyGetItem and scanGetItem; see comment in scanGetItem.) In the
+ * current implementation this is guaranteed by the behavior of tidbitmaps.
+ */
+static void
+entryGetItem(GinState *ginstate, GinScanEntry entry,
+ ItemPointerData advancePast, Snapshot snapshot)
+{
+ Assert(!entry->isFinished);
+
+ Assert(!ItemPointerIsValid(&entry->curItem) ||
+ ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
+
+ if (entry->matchBitmap)
+ {
+ /* A bitmap result */
+ BlockNumber advancePastBlk = GinItemPointerGetBlockNumber(&advancePast);
+ OffsetNumber advancePastOff = GinItemPointerGetOffsetNumber(&advancePast);
+
+ for (;;)
+ {
+ /*
+ * If we've exhausted all items on this block, move to next block
+ * in the bitmap.
+ */
+ while (entry->matchResult == NULL ||
+ (entry->matchResult->ntuples >= 0 &&
+ entry->offset >= entry->matchResult->ntuples) ||
+ entry->matchResult->blockno < advancePastBlk ||
+ (ItemPointerIsLossyPage(&advancePast) &&
+ entry->matchResult->blockno == advancePastBlk))
+ {
+ entry->matchResult = tbm_iterate(entry->matchIterator);
+
+ if (entry->matchResult == NULL)
+ {
+ ItemPointerSetInvalid(&entry->curItem);
+ tbm_end_iterate(entry->matchIterator);
+ entry->matchIterator = NULL;
+ entry->isFinished = true;
+ break;
+ }
+
+ /*
+ * Reset counter to the beginning of entry->matchResult. Note:
+ * entry->offset is still greater than matchResult->ntuples if
+ * matchResult is lossy. So, on next call we will get next
+ * result from TIDBitmap.
+ */
+ entry->offset = 0;
+ }
+ if (entry->isFinished)
+ break;
+
+ /*
+ * We're now on the first page after advancePast which has any
+ * items on it. If it's a lossy result, return that.
+ */
+ if (entry->matchResult->ntuples < 0)
+ {
+ ItemPointerSetLossyPage(&entry->curItem,
+ entry->matchResult->blockno);
+
+ /*
+ * We might as well fall out of the loop; we could not
+ * estimate number of results on this page to support correct
+ * reducing of result even if it's enabled.
+ */
+ break;
+ }
+
+ /*
+ * Not a lossy page. Skip over any offsets <= advancePast, and
+ * return that.
+ */
+ if (entry->matchResult->blockno == advancePastBlk)
+ {
+ /*
+ * First, do a quick check against the last offset on the
+ * page. If that's > advancePast, so are all the other
+ * offsets, so just go back to the top to get the next page.
+ */
+ if (entry->matchResult->offsets[entry->matchResult->ntuples - 1] <= advancePastOff)
+ {
+ entry->offset = entry->matchResult->ntuples;
+ continue;
+ }
+
+ /* Otherwise scan to find the first item > advancePast */
+ while (entry->matchResult->offsets[entry->offset] <= advancePastOff)
+ entry->offset++;
+ }
+
+ ItemPointerSet(&entry->curItem,
+ entry->matchResult->blockno,
+ entry->matchResult->offsets[entry->offset]);
+ entry->offset++;
+
+ /* Done unless we need to reduce the result */
+ if (!entry->reduceResult || !dropItem(entry))
+ break;
+ }
+ }
+ else if (!BufferIsValid(entry->buffer))
+ {
+ /*
+ * A posting list from an entry tuple, or the last page of a posting
+ * tree.
+ */
+ for (;;)
+ {
+ if (entry->offset >= entry->nlist)
+ {
+ ItemPointerSetInvalid(&entry->curItem);
+ entry->isFinished = true;
+ break;
+ }
+
+ entry->curItem = entry->list[entry->offset++];
+
+ /* If we're not past advancePast, keep scanning */
+ if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
+ continue;
+
+ /* Done unless we need to reduce the result */
+ if (!entry->reduceResult || !dropItem(entry))
+ break;
+ }
+ }
+ else
+ {
+ /* A posting tree */
+ for (;;)
+ {
+ /* If we've processed the current batch, load more items */
+ while (entry->offset >= entry->nlist)
+ {
+ entryLoadMoreItems(ginstate, entry, advancePast, snapshot);
+
+ if (entry->isFinished)
+ {
+ ItemPointerSetInvalid(&entry->curItem);
+ return;
+ }
+ }
+
+ entry->curItem = entry->list[entry->offset++];
+
+ /* If we're not past advancePast, keep scanning */
+ if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
+ continue;
+
+ /* Done unless we need to reduce the result */
+ if (!entry->reduceResult || !dropItem(entry))
+ break;
+
+ /*
+ * Advance advancePast (so that entryLoadMoreItems will load the
+ * right data), and keep scanning
+ */
+ advancePast = entry->curItem;
+ }
+ }
+}
+
+/*
+ * Identify the "current" item among the input entry streams for this scan key
+ * that is greater than advancePast, and test whether it passes the scan key
+ * qual condition.
+ *
+ * The current item is the smallest curItem among the inputs. key->curItem
+ * is set to that value. key->curItemMatches is set to indicate whether that
+ * TID passes the consistentFn test. If so, key->recheckCurItem is set true
+ * iff recheck is needed for this item pointer (including the case where the
+ * item pointer is a lossy page pointer).
+ *
+ * If all entry streams are exhausted, sets key->isFinished to true.
+ *
+ * Item pointers must be returned in ascending order.
+ *
+ * Note: this can return a "lossy page" item pointer, indicating that the
+ * key potentially matches all items on that heap page. However, it is
+ * not allowed to return both a lossy page pointer and exact (regular)
+ * item pointers for the same page. (Doing so would break the key-combination
+ * logic in scanGetItem.)
+ */
+static void
+keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key,
+ ItemPointerData advancePast, Snapshot snapshot)
+{
+ ItemPointerData minItem;
+ ItemPointerData curPageLossy;
+ uint32 i;
+ bool haveLossyEntry;
+ GinScanEntry entry;
+ GinTernaryValue res;
+ MemoryContext oldCtx;
+ bool allFinished;
+
+ Assert(!key->isFinished);
+
+ /*
+ * We might have already tested this item; if so, no need to repeat work.
+ * (Note: the ">" case can happen, if advancePast is exact but we
+ * previously had to set curItem to a lossy-page pointer.)
+ */
+ if (ginCompareItemPointers(&key->curItem, &advancePast) > 0)
+ return;
+
+ /*
+ * Find the minimum item > advancePast among the active entry streams.
+ *
+ * Note: a lossy-page entry is encoded by a ItemPointer with max value for
+ * offset (0xffff), so that it will sort after any exact entries for the
+ * same page. So we'll prefer to return exact pointers not lossy
+ * pointers, which is good.
+ */
+ ItemPointerSetMax(&minItem);
+ allFinished = true;
+ for (i = 0; i < key->nrequired; i++)
+ {
+ entry = key->requiredEntries[i];
+
+ if (entry->isFinished)
+ continue;
+
+ /*
+ * Advance this stream if necessary.
+ *
+ * In particular, since entry->curItem was initialized with
+ * ItemPointerSetMin, this ensures we fetch the first item for each
+ * entry on the first call.
+ */
+ if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
+ {
+ entryGetItem(ginstate, entry, advancePast, snapshot);
+ if (entry->isFinished)
+ continue;
+ }
+
+ allFinished = false;
+ if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
+ minItem = entry->curItem;
+ }
+
+ if (allFinished && !key->excludeOnly)
+ {
+ /* all entries are finished */
+ key->isFinished = true;
+ return;
+ }
+
+ if (!key->excludeOnly)
+ {
+ /*
+ * For a normal scan key, we now know there are no matches < minItem.
+ *
+ * If minItem is lossy, it means that there were no exact items on the
+ * page among requiredEntries, because lossy pointers sort after exact
+ * items. However, there might be exact items for the same page among
+ * additionalEntries, so we mustn't advance past them.
+ */
+ if (ItemPointerIsLossyPage(&minItem))
+ {
+ if (GinItemPointerGetBlockNumber(&advancePast) <
+ GinItemPointerGetBlockNumber(&minItem))
+ {
+ ItemPointerSet(&advancePast,
+ GinItemPointerGetBlockNumber(&minItem),
+ InvalidOffsetNumber);
+ }
+ }
+ else
+ {
+ Assert(GinItemPointerGetOffsetNumber(&minItem) > 0);
+ ItemPointerSet(&advancePast,
+ GinItemPointerGetBlockNumber(&minItem),
+ OffsetNumberPrev(GinItemPointerGetOffsetNumber(&minItem)));
+ }
+ }
+ else
+ {
+ /*
+ * excludeOnly scan keys don't have any entries that are necessarily
+ * present in matching items. So, we consider the item just after
+ * advancePast.
+ */
+ Assert(key->nrequired == 0);
+ ItemPointerSet(&minItem,
+ GinItemPointerGetBlockNumber(&advancePast),
+ OffsetNumberNext(GinItemPointerGetOffsetNumber(&advancePast)));
+ }
+
+ /*
+ * We might not have loaded all the entry streams for this TID yet. We
+ * could call the consistent function, passing MAYBE for those entries, to
+ * see if it can decide if this TID matches based on the information we
+ * have. But if the consistent-function is expensive, and cannot in fact
+ * decide with partial information, that could be a big loss. So, load all
+ * the additional entries, before calling the consistent function.
+ */
+ for (i = 0; i < key->nadditional; i++)
+ {
+ entry = key->additionalEntries[i];
+
+ if (entry->isFinished)
+ continue;
+
+ if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
+ {
+ entryGetItem(ginstate, entry, advancePast, snapshot);
+ if (entry->isFinished)
+ continue;
+ }
+
+ /*
+ * Normally, none of the items in additionalEntries can have a curItem
+ * larger than minItem. But if minItem is a lossy page, then there
+ * might be exact items on the same page among additionalEntries.
+ */
+ if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
+ {
+ Assert(ItemPointerIsLossyPage(&minItem));
+ minItem = entry->curItem;
+ }
+ }
+
+ /*
+ * Ok, we've advanced all the entries up to minItem now. Set key->curItem,
+ * and perform consistentFn test.
+ *
+ * Lossy-page entries pose a problem, since we don't know the correct
+ * entryRes state to pass to the consistentFn, and we also don't know what
+ * its combining logic will be (could be AND, OR, or even NOT). If the
+ * logic is OR then the consistentFn might succeed for all items in the
+ * lossy page even when none of the other entries match.
+ *
+ * Our strategy is to call the tri-state consistent function, with the
+ * lossy-page entries set to MAYBE, and all the other entries FALSE. If it
+ * returns FALSE, none of the lossy items alone are enough for a match, so
+ * we don't need to return a lossy-page pointer. Otherwise, return a
+ * lossy-page pointer to indicate that the whole heap page must be
+ * checked. (On subsequent calls, we'll do nothing until minItem is past
+ * the page altogether, thus ensuring that we never return both regular
+ * and lossy pointers for the same page.)
+ *
+ * An exception is that it doesn't matter what we pass for lossy pointers
+ * in "hidden" entries, because the consistentFn's result can't depend on
+ * them. We could pass them as MAYBE as well, but if we're using the
+ * "shim" implementation of a tri-state consistent function (see
+ * ginlogic.c), it's better to pass as few MAYBEs as possible. So pass
+ * them as true.
+ *
+ * Note that only lossy-page entries pointing to the current item's page
+ * should trigger this processing; we might have future lossy pages in the
+ * entry array, but they aren't relevant yet.
+ */
+ key->curItem = minItem;
+ ItemPointerSetLossyPage(&curPageLossy,
+ GinItemPointerGetBlockNumber(&key->curItem));
+ haveLossyEntry = false;
+ for (i = 0; i < key->nentries; i++)
+ {
+ entry = key->scanEntry[i];
+ if (entry->isFinished == false &&
+ ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
+ {
+ if (i < key->nuserentries)
+ key->entryRes[i] = GIN_MAYBE;
+ else
+ key->entryRes[i] = GIN_TRUE;
+ haveLossyEntry = true;
+ }
+ else
+ key->entryRes[i] = GIN_FALSE;
+ }
+
+ /* prepare for calling consistentFn in temp context */
+ oldCtx = MemoryContextSwitchTo(tempCtx);
+
+ if (haveLossyEntry)
+ {
+ /* Have lossy-page entries, so see if whole page matches */
+ res = key->triConsistentFn(key);
+
+ if (res == GIN_TRUE || res == GIN_MAYBE)
+ {
+ /* Yes, so clean up ... */
+ MemoryContextSwitchTo(oldCtx);
+ MemoryContextReset(tempCtx);
+
+ /* and return lossy pointer for whole page */
+ key->curItem = curPageLossy;
+ key->curItemMatches = true;
+ key->recheckCurItem = true;
+ return;
+ }
+ }
+
+ /*
+ * At this point we know that we don't need to return a lossy whole-page
+ * pointer, but we might have matches for individual exact item pointers,
+ * possibly in combination with a lossy pointer. Pass lossy pointers as
+ * MAYBE to the ternary consistent function, to let it decide if this
+ * tuple satisfies the overall key, even though we don't know if the lossy
+ * entries match.
+ *
+ * Prepare entryRes array to be passed to consistentFn.
+ */
+ for (i = 0; i < key->nentries; i++)
+ {
+ entry = key->scanEntry[i];
+ if (entry->isFinished)
+ key->entryRes[i] = GIN_FALSE;
+#if 0
+
+ /*
+ * This case can't currently happen, because we loaded all the entries
+ * for this item earlier.
+ */
+ else if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
+ key->entryRes[i] = GIN_MAYBE;
+#endif
+ else if (ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
+ key->entryRes[i] = GIN_MAYBE;
+ else if (ginCompareItemPointers(&entry->curItem, &minItem) == 0)
+ key->entryRes[i] = GIN_TRUE;
+ else
+ key->entryRes[i] = GIN_FALSE;
+ }
+
+ res = key->triConsistentFn(key);
+
+ switch (res)
+ {
+ case GIN_TRUE:
+ key->curItemMatches = true;
+ /* triConsistentFn set recheckCurItem */
+ break;
+
+ case GIN_FALSE:
+ key->curItemMatches = false;
+ break;
+
+ case GIN_MAYBE:
+ key->curItemMatches = true;
+ key->recheckCurItem = true;
+ break;
+
+ default:
+
+ /*
+ * the 'default' case shouldn't happen, but if the consistent
+ * function returns something bogus, this is the safe result
+ */
+ key->curItemMatches = true;
+ key->recheckCurItem = true;
+ break;
+ }
+
+ /*
+ * We have a tuple, and we know if it matches or not. If it's a non-match,
+ * we could continue to find the next matching tuple, but let's break out
+ * and give scanGetItem a chance to advance the other keys. They might be
+ * able to skip past to a much higher TID, allowing us to save work.
+ */
+
+ /* clean up after consistentFn calls */
+ MemoryContextSwitchTo(oldCtx);
+ MemoryContextReset(tempCtx);
+}
+
+/*
+ * Get next heap item pointer (after advancePast) from scan.
+ * Returns true if anything found.
+ * On success, *item and *recheck are set.
+ *
+ * Note: this is very nearly the same logic as in keyGetItem(), except
+ * that we know the keys are to be combined with AND logic, whereas in
+ * keyGetItem() the combination logic is known only to the consistentFn.
+ */
+static bool
+scanGetItem(IndexScanDesc scan, ItemPointerData advancePast,
+ ItemPointerData *item, bool *recheck)
+{
+ GinScanOpaque so = (GinScanOpaque) scan->opaque;
+ uint32 i;
+ bool match;
+
+ /*----------
+ * Advance the scan keys in lock-step, until we find an item that matches
+ * all the keys. If any key reports isFinished, meaning its subset of the
+ * entries is exhausted, we can stop. Otherwise, set *item to the next
+ * matching item.
+ *
+ * This logic works only if a keyGetItem stream can never contain both
+ * exact and lossy pointers for the same page. Else we could have a
+ * case like
+ *
+ * stream 1 stream 2
+ * ... ...
+ * 42/6 42/7
+ * 50/1 42/0xffff
+ * ... ...
+ *
+ * We would conclude that 42/6 is not a match and advance stream 1,
+ * thus never detecting the match to the lossy pointer in stream 2.
+ * (keyGetItem has a similar problem versus entryGetItem.)
+ *----------
+ */
+ do
+ {
+ ItemPointerSetMin(item);
+ match = true;
+ for (i = 0; i < so->nkeys && match; i++)
+ {
+ GinScanKey key = so->keys + i;
+
+ /*
+ * If we're considering a lossy page, skip excludeOnly keys. They
+ * can't exclude the whole page anyway.
+ */
+ if (ItemPointerIsLossyPage(item) && key->excludeOnly)
+ {
+ /*
+ * ginNewScanKey() should never mark the first key as
+ * excludeOnly.
+ */
+ Assert(i > 0);
+ continue;
+ }
+
+ /* Fetch the next item for this key that is > advancePast. */
+ keyGetItem(&so->ginstate, so->tempCtx, key, advancePast,
+ scan->xs_snapshot);
+
+ if (key->isFinished)
+ return false;
+
+ /*
+ * If it's not a match, we can immediately conclude that nothing
+ * <= this item matches, without checking the rest of the keys.
+ */
+ if (!key->curItemMatches)
+ {
+ advancePast = key->curItem;
+ match = false;
+ break;
+ }
+
+ /*
+ * It's a match. We can conclude that nothing < matches, so the
+ * other key streams can skip to this item.
+ *
+ * Beware of lossy pointers, though; from a lossy pointer, we can
+ * only conclude that nothing smaller than this *block* matches.
+ */
+ if (ItemPointerIsLossyPage(&key->curItem))
+ {
+ if (GinItemPointerGetBlockNumber(&advancePast) <
+ GinItemPointerGetBlockNumber(&key->curItem))
+ {
+ ItemPointerSet(&advancePast,
+ GinItemPointerGetBlockNumber(&key->curItem),
+ InvalidOffsetNumber);
+ }
+ }
+ else
+ {
+ Assert(GinItemPointerGetOffsetNumber(&key->curItem) > 0);
+ ItemPointerSet(&advancePast,
+ GinItemPointerGetBlockNumber(&key->curItem),
+ OffsetNumberPrev(GinItemPointerGetOffsetNumber(&key->curItem)));
+ }
+
+ /*
+ * If this is the first key, remember this location as a potential
+ * match, and proceed to check the rest of the keys.
+ *
+ * Otherwise, check if this is the same item that we checked the
+ * previous keys for (or a lossy pointer for the same page). If
+ * not, loop back to check the previous keys for this item (we
+ * will check this key again too, but keyGetItem returns quickly
+ * for that)
+ */
+ if (i == 0)
+ {
+ *item = key->curItem;
+ }
+ else
+ {
+ if (ItemPointerIsLossyPage(&key->curItem) ||
+ ItemPointerIsLossyPage(item))
+ {
+ Assert(GinItemPointerGetBlockNumber(&key->curItem) >= GinItemPointerGetBlockNumber(item));
+ match = (GinItemPointerGetBlockNumber(&key->curItem) ==
+ GinItemPointerGetBlockNumber(item));
+ }
+ else
+ {
+ Assert(ginCompareItemPointers(&key->curItem, item) >= 0);
+ match = (ginCompareItemPointers(&key->curItem, item) == 0);
+ }
+ }
+ }
+ } while (!match);
+
+ Assert(!ItemPointerIsMin(item));
+
+ /*
+ * Now *item contains the first ItemPointer after previous result that
+ * satisfied all the keys for that exact TID, or a lossy reference to the
+ * same page.
+ *
+ * We must return recheck = true if any of the keys are marked recheck.
+ */
+ *recheck = false;
+ for (i = 0; i < so->nkeys; i++)
+ {
+ GinScanKey key = so->keys + i;
+
+ if (key->recheckCurItem)
+ {
+ *recheck = true;
+ break;
+ }
+ }
+
+ return true;
+}
+
+
+/*
+ * Functions for scanning the pending list
+ */
+
+
+/*
+ * Get ItemPointer of next heap row to be checked from pending list.
+ * Returns false if there are no more. On pages with several heap rows
+ * it returns each row separately, on page with part of heap row returns
+ * per page data. pos->firstOffset and pos->lastOffset are set to identify
+ * the range of pending-list tuples belonging to this heap row.
+ *
+ * The pendingBuffer is presumed pinned and share-locked on entry, and is
+ * pinned and share-locked on success exit. On failure exit it's released.
+ */
+static bool
+scanGetCandidate(IndexScanDesc scan, pendingPosition *pos)
+{
+ OffsetNumber maxoff;
+ Page page;
+ IndexTuple itup;
+
+ ItemPointerSetInvalid(&pos->item);
+ for (;;)
+ {
+ page = BufferGetPage(pos->pendingBuffer);
+ TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
+
+ maxoff = PageGetMaxOffsetNumber(page);
+ if (pos->firstOffset > maxoff)
+ {
+ BlockNumber blkno = GinPageGetOpaque(page)->rightlink;
+
+ if (blkno == InvalidBlockNumber)
+ {
+ UnlockReleaseBuffer(pos->pendingBuffer);
+ pos->pendingBuffer = InvalidBuffer;
+
+ return false;
+ }
+ else
+ {
+ /*
+ * Here we must prevent deletion of next page by insertcleanup
+ * process, which may be trying to obtain exclusive lock on
+ * current page. So, we lock next page before releasing the
+ * current one
+ */
+ Buffer tmpbuf = ReadBuffer(scan->indexRelation, blkno);
+
+ LockBuffer(tmpbuf, GIN_SHARE);
+ UnlockReleaseBuffer(pos->pendingBuffer);
+
+ pos->pendingBuffer = tmpbuf;
+ pos->firstOffset = FirstOffsetNumber;
+ }
+ }
+ else
+ {
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->firstOffset));
+ pos->item = itup->t_tid;
+ if (GinPageHasFullRow(page))
+ {
+ /*
+ * find itempointer to the next row
+ */
+ for (pos->lastOffset = pos->firstOffset + 1; pos->lastOffset <= maxoff; pos->lastOffset++)
+ {
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->lastOffset));
+ if (!ItemPointerEquals(&pos->item, &itup->t_tid))
+ break;
+ }
+ }
+ else
+ {
+ /*
+ * All itempointers are the same on this page
+ */
+ pos->lastOffset = maxoff + 1;
+ }
+
+ /*
+ * Now pos->firstOffset points to the first tuple of current heap
+ * row, pos->lastOffset points to the first tuple of next heap row
+ * (or to the end of page)
+ */
+ break;
+ }
+ }
+
+ return true;
+}
+
+/*
+ * Scan pending-list page from current tuple (off) up till the first of:
+ * - match is found (then returns true)
+ * - no later match is possible
+ * - tuple's attribute number is not equal to entry's attrnum
+ * - reach end of page
+ *
+ * datum[]/category[]/datumExtracted[] arrays are used to cache the results
+ * of gintuple_get_key() on the current page.
+ */
+static bool
+matchPartialInPendingList(GinState *ginstate, Page page,
+ OffsetNumber off, OffsetNumber maxoff,
+ GinScanEntry entry,
+ Datum *datum, GinNullCategory *category,
+ bool *datumExtracted)
+{
+ IndexTuple itup;
+ int32 cmp;
+
+ /* Partial match to a null is not possible */
+ if (entry->queryCategory != GIN_CAT_NORM_KEY)
+ return false;
+
+ while (off < maxoff)
+ {
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
+
+ if (gintuple_get_attrnum(ginstate, itup) != entry->attnum)
+ return false;
+
+ if (datumExtracted[off - 1] == false)
+ {
+ datum[off - 1] = gintuple_get_key(ginstate, itup,
+ &category[off - 1]);
+ datumExtracted[off - 1] = true;
+ }
+
+ /* Once we hit nulls, no further match is possible */
+ if (category[off - 1] != GIN_CAT_NORM_KEY)
+ return false;
+
+ /*----------
+ * Check partial match.
+ * case cmp == 0 => match
+ * case cmp > 0 => not match and end scan (no later match possible)
+ * case cmp < 0 => not match and continue scan
+ *----------
+ */
+ cmp = DatumGetInt32(FunctionCall4Coll(&ginstate->comparePartialFn[entry->attnum - 1],
+ ginstate->supportCollation[entry->attnum - 1],
+ entry->queryKey,
+ datum[off - 1],
+ UInt16GetDatum(entry->strategy),
+ PointerGetDatum(entry->extra_data)));
+ if (cmp == 0)
+ return true;
+ else if (cmp > 0)
+ return false;
+
+ off++;
+ }
+
+ return false;
+}
+
+/*
+ * Set up the entryRes array for each key by looking at
+ * every entry for current heap row in pending list.
+ *
+ * Returns true if each scan key has at least one entryRes match.
+ * This corresponds to the situations where the normal index search will
+ * try to apply the key's consistentFn. (A tuple not meeting that requirement
+ * cannot be returned by the normal search since no entry stream will
+ * source its TID.)
+ *
+ * The pendingBuffer is presumed pinned and share-locked on entry.
+ */
+static bool
+collectMatchesForHeapRow(IndexScanDesc scan, pendingPosition *pos)
+{
+ GinScanOpaque so = (GinScanOpaque) scan->opaque;
+ OffsetNumber attrnum;
+ Page page;
+ IndexTuple itup;
+ int i,
+ j;
+
+ /*
+ * Reset all entryRes and hasMatchKey flags
+ */
+ for (i = 0; i < so->nkeys; i++)
+ {
+ GinScanKey key = so->keys + i;
+
+ memset(key->entryRes, GIN_FALSE, key->nentries);
+ }
+ memset(pos->hasMatchKey, false, so->nkeys);
+
+ /*
+ * Outer loop iterates over multiple pending-list pages when a single heap
+ * row has entries spanning those pages.
+ */
+ for (;;)
+ {
+ Datum datum[BLCKSZ / sizeof(IndexTupleData)];
+ GinNullCategory category[BLCKSZ / sizeof(IndexTupleData)];
+ bool datumExtracted[BLCKSZ / sizeof(IndexTupleData)];
+
+ Assert(pos->lastOffset > pos->firstOffset);
+ memset(datumExtracted + pos->firstOffset - 1, 0,
+ sizeof(bool) * (pos->lastOffset - pos->firstOffset));
+
+ page = BufferGetPage(pos->pendingBuffer);
+ TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
+
+ for (i = 0; i < so->nkeys; i++)
+ {
+ GinScanKey key = so->keys + i;
+
+ for (j = 0; j < key->nentries; j++)
+ {
+ GinScanEntry entry = key->scanEntry[j];
+ OffsetNumber StopLow = pos->firstOffset,
+ StopHigh = pos->lastOffset,
+ StopMiddle;
+
+ /* If already matched on earlier page, do no extra work */
+ if (key->entryRes[j])
+ continue;
+
+ /*
+ * Interesting tuples are from pos->firstOffset to
+ * pos->lastOffset and they are ordered by (attnum, Datum) as
+ * it's done in entry tree. So we can use binary search to
+ * avoid linear scanning.
+ */
+ while (StopLow < StopHigh)
+ {
+ int res;
+
+ StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
+
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, StopMiddle));
+
+ attrnum = gintuple_get_attrnum(&so->ginstate, itup);
+
+ if (key->attnum < attrnum)
+ {
+ StopHigh = StopMiddle;
+ continue;
+ }
+ if (key->attnum > attrnum)
+ {
+ StopLow = StopMiddle + 1;
+ continue;
+ }
+
+ if (datumExtracted[StopMiddle - 1] == false)
+ {
+ datum[StopMiddle - 1] =
+ gintuple_get_key(&so->ginstate, itup,
+ &category[StopMiddle - 1]);
+ datumExtracted[StopMiddle - 1] = true;
+ }
+
+ if (entry->queryCategory == GIN_CAT_EMPTY_QUERY)
+ {
+ /* special behavior depending on searchMode */
+ if (entry->searchMode == GIN_SEARCH_MODE_ALL)
+ {
+ /* match anything except NULL_ITEM */
+ if (category[StopMiddle - 1] == GIN_CAT_NULL_ITEM)
+ res = -1;
+ else
+ res = 0;
+ }
+ else
+ {
+ /* match everything */
+ res = 0;
+ }
+ }
+ else
+ {
+ res = ginCompareEntries(&so->ginstate,
+ entry->attnum,
+ entry->queryKey,
+ entry->queryCategory,
+ datum[StopMiddle - 1],
+ category[StopMiddle - 1]);
+ }
+
+ if (res == 0)
+ {
+ /*
+ * Found exact match (there can be only one, except in
+ * EMPTY_QUERY mode).
+ *
+ * If doing partial match, scan forward from here to
+ * end of page to check for matches.
+ *
+ * See comment above about tuple's ordering.
+ */
+ if (entry->isPartialMatch)
+ key->entryRes[j] =
+ matchPartialInPendingList(&so->ginstate,
+ page,
+ StopMiddle,
+ pos->lastOffset,
+ entry,
+ datum,
+ category,
+ datumExtracted);
+ else
+ key->entryRes[j] = true;
+
+ /* done with binary search */
+ break;
+ }
+ else if (res < 0)
+ StopHigh = StopMiddle;
+ else
+ StopLow = StopMiddle + 1;
+ }
+
+ if (StopLow >= StopHigh && entry->isPartialMatch)
+ {
+ /*
+ * No exact match on this page. If doing partial match,
+ * scan from the first tuple greater than target value to
+ * end of page. Note that since we don't remember whether
+ * the comparePartialFn told us to stop early on a
+ * previous page, we will uselessly apply comparePartialFn
+ * to the first tuple on each subsequent page.
+ */
+ key->entryRes[j] =
+ matchPartialInPendingList(&so->ginstate,
+ page,
+ StopHigh,
+ pos->lastOffset,
+ entry,
+ datum,
+ category,
+ datumExtracted);
+ }
+
+ pos->hasMatchKey[i] |= key->entryRes[j];
+ }
+ }
+
+ /* Advance firstOffset over the scanned tuples */
+ pos->firstOffset = pos->lastOffset;
+
+ if (GinPageHasFullRow(page))
+ {
+ /*
+ * We have examined all pending entries for the current heap row.
+ * Break out of loop over pages.
+ */
+ break;
+ }
+ else
+ {
+ /*
+ * Advance to next page of pending entries for the current heap
+ * row. Complain if there isn't one.
+ */
+ ItemPointerData item = pos->item;
+
+ if (scanGetCandidate(scan, pos) == false ||
+ !ItemPointerEquals(&pos->item, &item))
+ elog(ERROR, "could not find additional pending pages for same heap tuple");
+ }
+ }
+
+ /*
+ * All scan keys except excludeOnly require at least one entry to match.
+ * excludeOnly keys are an exception, because their implied
+ * GIN_CAT_EMPTY_QUERY scanEntry always matches. So return "true" if all
+ * non-excludeOnly scan keys have at least one match.
+ */
+ for (i = 0; i < so->nkeys; i++)
+ {
+ if (pos->hasMatchKey[i] == false && !so->keys[i].excludeOnly)
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Collect all matched rows from pending list into bitmap.
+ */
+static void
+scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
+{
+ GinScanOpaque so = (GinScanOpaque) scan->opaque;
+ MemoryContext oldCtx;
+ bool recheck,
+ match;
+ int i;
+ pendingPosition pos;
+ Buffer metabuffer = ReadBuffer(scan->indexRelation, GIN_METAPAGE_BLKNO);
+ Page page;
+ BlockNumber blkno;
+
+ *ntids = 0;
+
+ /*
+ * Acquire predicate lock on the metapage, to conflict with any fastupdate
+ * insertions.
+ */
+ PredicateLockPage(scan->indexRelation, GIN_METAPAGE_BLKNO, scan->xs_snapshot);
+
+ LockBuffer(metabuffer, GIN_SHARE);
+ page = BufferGetPage(metabuffer);
+ TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
+ blkno = GinPageGetMeta(page)->head;
+
+ /*
+ * fetch head of list before unlocking metapage. head page must be pinned
+ * to prevent deletion by vacuum process
+ */
+ if (blkno == InvalidBlockNumber)
+ {
+ /* No pending list, so proceed with normal scan */
+ UnlockReleaseBuffer(metabuffer);
+ return;
+ }
+
+ pos.pendingBuffer = ReadBuffer(scan->indexRelation, blkno);
+ LockBuffer(pos.pendingBuffer, GIN_SHARE);
+ pos.firstOffset = FirstOffsetNumber;
+ UnlockReleaseBuffer(metabuffer);
+ pos.hasMatchKey = palloc(sizeof(bool) * so->nkeys);
+
+ /*
+ * loop for each heap row. scanGetCandidate returns full row or row's
+ * tuples from first page.
+ */
+ while (scanGetCandidate(scan, &pos))
+ {
+ /*
+ * Check entries in tuple and set up entryRes array.
+ *
+ * If pending tuples belonging to the current heap row are spread
+ * across several pages, collectMatchesForHeapRow will read all of
+ * those pages.
+ */
+ if (!collectMatchesForHeapRow(scan, &pos))
+ continue;
+
+ /*
+ * Matching of entries of one row is finished, so check row using
+ * consistent functions.
+ */
+ oldCtx = MemoryContextSwitchTo(so->tempCtx);
+ recheck = false;
+ match = true;
+
+ for (i = 0; i < so->nkeys; i++)
+ {
+ GinScanKey key = so->keys + i;
+
+ if (!key->boolConsistentFn(key))
+ {
+ match = false;
+ break;
+ }
+ recheck |= key->recheckCurItem;
+ }
+
+ MemoryContextSwitchTo(oldCtx);
+ MemoryContextReset(so->tempCtx);
+
+ if (match)
+ {
+ tbm_add_tuples(tbm, &pos.item, 1, recheck);
+ (*ntids)++;
+ }
+ }
+
+ pfree(pos.hasMatchKey);
+}
+
+
+#define GinIsVoidRes(s) ( ((GinScanOpaque) scan->opaque)->isVoidRes )
+
+int64
+gingetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
+{
+ GinScanOpaque so = (GinScanOpaque) scan->opaque;
+ int64 ntids;
+ ItemPointerData iptr;
+ bool recheck;
+
+ /*
+ * Set up the scan keys, and check for unsatisfiable query.
+ */
+ ginFreeScanKeys(so); /* there should be no keys yet, but just to be
+ * sure */
+ ginNewScanKey(scan);
+
+ if (GinIsVoidRes(scan))
+ return 0;
+
+ ntids = 0;
+
+ /*
+ * First, scan the pending list and collect any matching entries into the
+ * bitmap. After we scan a pending item, some other backend could post it
+ * into the main index, and so we might visit it a second time during the
+ * main scan. This is okay because we'll just re-set the same bit in the
+ * bitmap. (The possibility of duplicate visits is a major reason why GIN
+ * can't support the amgettuple API, however.) Note that it would not do
+ * to scan the main index before the pending list, since concurrent
+ * cleanup could then make us miss entries entirely.
+ */
+ scanPendingInsert(scan, tbm, &ntids);
+
+ /*
+ * Now scan the main index.
+ */
+ startScan(scan);
+
+ ItemPointerSetMin(&iptr);
+
+ for (;;)
+ {
+ CHECK_FOR_INTERRUPTS();
+
+ if (!scanGetItem(scan, iptr, &iptr, &recheck))
+ break;
+
+ if (ItemPointerIsLossyPage(&iptr))
+ tbm_add_page(tbm, ItemPointerGetBlockNumber(&iptr));
+ else
+ tbm_add_tuples(tbm, &iptr, 1, recheck);
+ ntids++;
+ }
+
+ return ntids;
+}