diff options
Diffstat (limited to 'src/backend/access/gist/gistget.c')
-rw-r--r-- | src/backend/access/gist/gistget.c | 803 |
1 files changed, 803 insertions, 0 deletions
diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c new file mode 100644 index 0000000..c8f7e78 --- /dev/null +++ b/src/backend/access/gist/gistget.c @@ -0,0 +1,803 @@ +/*------------------------------------------------------------------------- + * + * gistget.c + * fetch tuples from a GiST scan. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gist/gistget.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/gist_private.h" +#include "access/relscan.h" +#include "lib/pairingheap.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "utils/float.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +/* + * gistkillitems() -- set LP_DEAD state for items an indexscan caller has + * told us were killed. + * + * We re-read page here, so it's important to check page LSN. If the page + * has been modified since the last read (as determined by LSN), we cannot + * flag any entries because it is possible that the old entry was vacuumed + * away and the TID was re-used by a completely different heap tuple. + */ +static void +gistkillitems(IndexScanDesc scan) +{ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId iid; + int i; + bool killedsomething = false; + + Assert(so->curBlkno != InvalidBlockNumber); + Assert(!XLogRecPtrIsInvalid(so->curPageLSN)); + Assert(so->killedItems != NULL); + + buffer = ReadBuffer(scan->indexRelation, so->curBlkno); + if (!BufferIsValid(buffer)) + return; + + LockBuffer(buffer, GIST_SHARE); + gistcheckpage(scan->indexRelation, buffer); + page = BufferGetPage(buffer); + + /* + * If page LSN differs it means that the page was modified since the last + * read. killedItems could be not valid so LP_DEAD hints applying is not + * safe. + */ + if (BufferGetLSNAtomic(buffer) != so->curPageLSN) + { + UnlockReleaseBuffer(buffer); + so->numKilled = 0; /* reset counter */ + return; + } + + Assert(GistPageIsLeaf(page)); + + /* + * Mark all killedItems as dead. We need no additional recheck, because, + * if page was modified, curPageLSN must have changed. + */ + for (i = 0; i < so->numKilled; i++) + { + offnum = so->killedItems[i]; + iid = PageGetItemId(page, offnum); + ItemIdMarkDead(iid); + killedsomething = true; + } + + if (killedsomething) + { + GistMarkPageHasGarbage(page); + MarkBufferDirtyHint(buffer, true); + } + + UnlockReleaseBuffer(buffer); + + /* + * Always reset the scan state, so we don't look for same items on other + * pages. + */ + so->numKilled = 0; +} + +/* + * gistindex_keytest() -- does this index tuple satisfy the scan key(s)? + * + * The index tuple might represent either a heap tuple or a lower index page, + * depending on whether the containing page is a leaf page or not. + * + * On success return for a heap tuple, *recheck_p is set to indicate whether + * the quals need to be rechecked. We recheck if any of the consistent() + * functions request it. recheck is not interesting when examining a non-leaf + * entry, since we must visit the lower index page if there's any doubt. + * Similarly, *recheck_distances_p is set to indicate whether the distances + * need to be rechecked, and it is also ignored for non-leaf entries. + * + * If we are doing an ordered scan, so->distances[] is filled with distance + * data from the distance() functions before returning success. + * + * We must decompress the key in the IndexTuple before passing it to the + * sk_funcs (which actually are the opclass Consistent or Distance methods). + * + * Note that this function is always invoked in a short-lived memory context, + * so we don't need to worry about cleaning up allocated memory, either here + * or in the implementation of any Consistent or Distance methods. + */ +static bool +gistindex_keytest(IndexScanDesc scan, + IndexTuple tuple, + Page page, + OffsetNumber offset, + bool *recheck_p, + bool *recheck_distances_p) +{ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + GISTSTATE *giststate = so->giststate; + ScanKey key = scan->keyData; + int keySize = scan->numberOfKeys; + IndexOrderByDistance *distance_p; + Relation r = scan->indexRelation; + + *recheck_p = false; + *recheck_distances_p = false; + + /* + * If it's a leftover invalid tuple from pre-9.1, treat it as a match with + * minimum possible distances. This means we'll always follow it to the + * referenced page. + */ + if (GistTupleIsInvalid(tuple)) + { + int i; + + if (GistPageIsLeaf(page)) /* shouldn't happen */ + elog(ERROR, "invalid GiST tuple found on leaf page"); + for (i = 0; i < scan->numberOfOrderBys; i++) + { + so->distances[i].value = -get_float8_infinity(); + so->distances[i].isnull = false; + } + return true; + } + + /* Check whether it matches according to the Consistent functions */ + while (keySize > 0) + { + Datum datum; + bool isNull; + + datum = index_getattr(tuple, + key->sk_attno, + giststate->leafTupdesc, + &isNull); + + if (key->sk_flags & SK_ISNULL) + { + /* + * On non-leaf page we can't conclude that child hasn't NULL + * values because of assumption in GiST: union (VAL, NULL) is VAL. + * But if on non-leaf page key IS NULL, then all children are + * NULL. + */ + if (key->sk_flags & SK_SEARCHNULL) + { + if (GistPageIsLeaf(page) && !isNull) + return false; + } + else + { + Assert(key->sk_flags & SK_SEARCHNOTNULL); + if (isNull) + return false; + } + } + else if (isNull) + { + return false; + } + else + { + Datum test; + bool recheck; + GISTENTRY de; + + gistdentryinit(giststate, key->sk_attno - 1, &de, + datum, r, page, offset, + false, isNull); + + /* + * Call the Consistent function to evaluate the test. The + * arguments are the index datum (as a GISTENTRY*), the comparison + * datum, the comparison operator's strategy number and subtype + * from pg_amop, and the recheck flag. + * + * (Presently there's no need to pass the subtype since it'll + * always be zero, but might as well pass it for possible future + * use.) + * + * We initialize the recheck flag to true (the safest assumption) + * in case the Consistent function forgets to set it. + */ + recheck = true; + + test = FunctionCall5Coll(&key->sk_func, + key->sk_collation, + PointerGetDatum(&de), + key->sk_argument, + Int16GetDatum(key->sk_strategy), + ObjectIdGetDatum(key->sk_subtype), + PointerGetDatum(&recheck)); + + if (!DatumGetBool(test)) + return false; + *recheck_p |= recheck; + } + + key++; + keySize--; + } + + /* OK, it passes --- now let's compute the distances */ + key = scan->orderByData; + distance_p = so->distances; + keySize = scan->numberOfOrderBys; + while (keySize > 0) + { + Datum datum; + bool isNull; + + datum = index_getattr(tuple, + key->sk_attno, + giststate->leafTupdesc, + &isNull); + + if ((key->sk_flags & SK_ISNULL) || isNull) + { + /* Assume distance computes as null */ + distance_p->value = 0.0; + distance_p->isnull = true; + } + else + { + Datum dist; + bool recheck; + GISTENTRY de; + + gistdentryinit(giststate, key->sk_attno - 1, &de, + datum, r, page, offset, + false, isNull); + + /* + * Call the Distance function to evaluate the distance. The + * arguments are the index datum (as a GISTENTRY*), the comparison + * datum, the ordering operator's strategy number and subtype from + * pg_amop, and the recheck flag. + * + * (Presently there's no need to pass the subtype since it'll + * always be zero, but might as well pass it for possible future + * use.) + * + * If the function sets the recheck flag, the returned distance is + * a lower bound on the true distance and needs to be rechecked. + * We initialize the flag to 'false'. This flag was added in + * version 9.5; distance functions written before that won't know + * about the flag, but are expected to never be lossy. + */ + recheck = false; + dist = FunctionCall5Coll(&key->sk_func, + key->sk_collation, + PointerGetDatum(&de), + key->sk_argument, + Int16GetDatum(key->sk_strategy), + ObjectIdGetDatum(key->sk_subtype), + PointerGetDatum(&recheck)); + *recheck_distances_p |= recheck; + distance_p->value = DatumGetFloat8(dist); + distance_p->isnull = false; + } + + key++; + distance_p++; + keySize--; + } + + return true; +} + +/* + * Scan all items on the GiST index page identified by *pageItem, and insert + * them into the queue (or directly to output areas) + * + * scan: index scan we are executing + * pageItem: search queue item identifying an index page to scan + * myDistances: distances array associated with pageItem, or NULL at the root + * tbm: if not NULL, gistgetbitmap's output bitmap + * ntids: if not NULL, gistgetbitmap's output tuple counter + * + * If tbm/ntids aren't NULL, we are doing an amgetbitmap scan, and heap + * tuples should be reported directly into the bitmap. If they are NULL, + * we're doing a plain or ordered indexscan. For a plain indexscan, heap + * tuple TIDs are returned into so->pageData[]. For an ordered indexscan, + * heap tuple TIDs are pushed into individual search queue items. In an + * index-only scan, reconstructed index tuples are returned along with the + * TIDs. + * + * If we detect that the index page has split since we saw its downlink + * in the parent, we push its new right sibling onto the queue so the + * sibling will be processed next. + */ +static void +gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, + IndexOrderByDistance *myDistances, TIDBitmap *tbm, int64 *ntids) +{ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + GISTSTATE *giststate = so->giststate; + Relation r = scan->indexRelation; + Buffer buffer; + Page page; + GISTPageOpaque opaque; + OffsetNumber maxoff; + OffsetNumber i; + MemoryContext oldcxt; + + Assert(!GISTSearchItemIsHeap(*pageItem)); + + buffer = ReadBuffer(scan->indexRelation, pageItem->blkno); + LockBuffer(buffer, GIST_SHARE); + PredicateLockPage(r, BufferGetBlockNumber(buffer), scan->xs_snapshot); + gistcheckpage(scan->indexRelation, buffer); + page = BufferGetPage(buffer); + TestForOldSnapshot(scan->xs_snapshot, r, page); + opaque = GistPageGetOpaque(page); + + /* + * Check if we need to follow the rightlink. We need to follow it if the + * page was concurrently split since we visited the parent (in which case + * parentlsn < nsn), or if the system crashed after a page split but + * before the downlink was inserted into the parent. + */ + if (!XLogRecPtrIsInvalid(pageItem->data.parentlsn) && + (GistFollowRight(page) || + pageItem->data.parentlsn < GistPageGetNSN(page)) && + opaque->rightlink != InvalidBlockNumber /* sanity check */ ) + { + /* There was a page split, follow right link to add pages */ + GISTSearchItem *item; + + /* This can't happen when starting at the root */ + Assert(myDistances != NULL); + + oldcxt = MemoryContextSwitchTo(so->queueCxt); + + /* Create new GISTSearchItem for the right sibling index page */ + item = palloc(SizeOfGISTSearchItem(scan->numberOfOrderBys)); + item->blkno = opaque->rightlink; + item->data.parentlsn = pageItem->data.parentlsn; + + /* Insert it into the queue using same distances as for this page */ + memcpy(item->distances, myDistances, + sizeof(item->distances[0]) * scan->numberOfOrderBys); + + pairingheap_add(so->queue, &item->phNode); + + MemoryContextSwitchTo(oldcxt); + } + + /* + * Check if the page was deleted after we saw the downlink. There's + * nothing of interest on a deleted page. Note that we must do this after + * checking the NSN for concurrent splits! It's possible that the page + * originally contained some tuples that are visible to us, but was split + * so that all the visible tuples were moved to another page, and then + * this page was deleted. + */ + if (GistPageIsDeleted(page)) + { + UnlockReleaseBuffer(buffer); + return; + } + + so->nPageData = so->curPageData = 0; + scan->xs_hitup = NULL; /* might point into pageDataCxt */ + if (so->pageDataCxt) + MemoryContextReset(so->pageDataCxt); + + /* + * We save the LSN of the page as we read it, so that we know whether it + * safe to apply LP_DEAD hints to the page later. This allows us to drop + * the pin for MVCC scans, which allows vacuum to avoid blocking. + */ + so->curPageLSN = BufferGetLSNAtomic(buffer); + + /* + * check all tuples on page + */ + maxoff = PageGetMaxOffsetNumber(page); + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + ItemId iid = PageGetItemId(page, i); + IndexTuple it; + bool match; + bool recheck; + bool recheck_distances; + + /* + * If the scan specifies not to return killed tuples, then we treat a + * killed tuple as not passing the qual. + */ + if (scan->ignore_killed_tuples && ItemIdIsDead(iid)) + continue; + + it = (IndexTuple) PageGetItem(page, iid); + + /* + * Must call gistindex_keytest in tempCxt, and clean up any leftover + * junk afterward. + */ + oldcxt = MemoryContextSwitchTo(so->giststate->tempCxt); + + match = gistindex_keytest(scan, it, page, i, + &recheck, &recheck_distances); + + MemoryContextSwitchTo(oldcxt); + MemoryContextReset(so->giststate->tempCxt); + + /* Ignore tuple if it doesn't match */ + if (!match) + continue; + + if (tbm && GistPageIsLeaf(page)) + { + /* + * getbitmap scan, so just push heap tuple TIDs into the bitmap + * without worrying about ordering + */ + tbm_add_tuples(tbm, &it->t_tid, 1, recheck); + (*ntids)++; + } + else if (scan->numberOfOrderBys == 0 && GistPageIsLeaf(page)) + { + /* + * Non-ordered scan, so report tuples in so->pageData[] + */ + so->pageData[so->nPageData].heapPtr = it->t_tid; + so->pageData[so->nPageData].recheck = recheck; + so->pageData[so->nPageData].offnum = i; + + /* + * In an index-only scan, also fetch the data from the tuple. The + * reconstructed tuples are stored in pageDataCxt. + */ + if (scan->xs_want_itup) + { + oldcxt = MemoryContextSwitchTo(so->pageDataCxt); + so->pageData[so->nPageData].recontup = + gistFetchTuple(giststate, r, it); + MemoryContextSwitchTo(oldcxt); + } + so->nPageData++; + } + else + { + /* + * Must push item into search queue. We get here for any lower + * index page, and also for heap tuples if doing an ordered + * search. + */ + GISTSearchItem *item; + int nOrderBys = scan->numberOfOrderBys; + + oldcxt = MemoryContextSwitchTo(so->queueCxt); + + /* Create new GISTSearchItem for this item */ + item = palloc(SizeOfGISTSearchItem(scan->numberOfOrderBys)); + + if (GistPageIsLeaf(page)) + { + /* Creating heap-tuple GISTSearchItem */ + item->blkno = InvalidBlockNumber; + item->data.heap.heapPtr = it->t_tid; + item->data.heap.recheck = recheck; + item->data.heap.recheckDistances = recheck_distances; + + /* + * In an index-only scan, also fetch the data from the tuple. + */ + if (scan->xs_want_itup) + item->data.heap.recontup = gistFetchTuple(giststate, r, it); + } + else + { + /* Creating index-page GISTSearchItem */ + item->blkno = ItemPointerGetBlockNumber(&it->t_tid); + + /* + * LSN of current page is lsn of parent page for child. We + * only have a shared lock, so we need to get the LSN + * atomically. + */ + item->data.parentlsn = BufferGetLSNAtomic(buffer); + } + + /* Insert it into the queue using new distance data */ + memcpy(item->distances, so->distances, + sizeof(item->distances[0]) * nOrderBys); + + pairingheap_add(so->queue, &item->phNode); + + MemoryContextSwitchTo(oldcxt); + } + } + + UnlockReleaseBuffer(buffer); +} + +/* + * Extract next item (in order) from search queue + * + * Returns a GISTSearchItem or NULL. Caller must pfree item when done with it. + */ +static GISTSearchItem * +getNextGISTSearchItem(GISTScanOpaque so) +{ + GISTSearchItem *item; + + if (!pairingheap_is_empty(so->queue)) + { + item = (GISTSearchItem *) pairingheap_remove_first(so->queue); + } + else + { + /* Done when both heaps are empty */ + item = NULL; + } + + /* Return item; caller is responsible to pfree it */ + return item; +} + +/* + * Fetch next heap tuple in an ordered search + */ +static bool +getNextNearest(IndexScanDesc scan) +{ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + bool res = false; + + if (scan->xs_hitup) + { + /* free previously returned tuple */ + pfree(scan->xs_hitup); + scan->xs_hitup = NULL; + } + + do + { + GISTSearchItem *item = getNextGISTSearchItem(so); + + if (!item) + break; + + if (GISTSearchItemIsHeap(*item)) + { + /* found a heap item at currently minimal distance */ + scan->xs_heaptid = item->data.heap.heapPtr; + scan->xs_recheck = item->data.heap.recheck; + + index_store_float8_orderby_distances(scan, so->orderByTypes, + item->distances, + item->data.heap.recheckDistances); + + /* in an index-only scan, also return the reconstructed tuple. */ + if (scan->xs_want_itup) + scan->xs_hitup = item->data.heap.recontup; + res = true; + } + else + { + /* visit an index page, extract its items into queue */ + CHECK_FOR_INTERRUPTS(); + + gistScanPage(scan, item, item->distances, NULL, NULL); + } + + pfree(item); + } while (!res); + + return res; +} + +/* + * gistgettuple() -- Get the next tuple in the scan + */ +bool +gistgettuple(IndexScanDesc scan, ScanDirection dir) +{ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + + if (dir != ForwardScanDirection) + elog(ERROR, "GiST only supports forward scan direction"); + + if (!so->qual_ok) + return false; + + if (so->firstCall) + { + /* Begin the scan by processing the root page */ + GISTSearchItem fakeItem; + + pgstat_count_index_scan(scan->indexRelation); + + so->firstCall = false; + so->curPageData = so->nPageData = 0; + scan->xs_hitup = NULL; + if (so->pageDataCxt) + MemoryContextReset(so->pageDataCxt); + + fakeItem.blkno = GIST_ROOT_BLKNO; + memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN)); + gistScanPage(scan, &fakeItem, NULL, NULL, NULL); + } + + if (scan->numberOfOrderBys > 0) + { + /* Must fetch tuples in strict distance order */ + return getNextNearest(scan); + } + else + { + /* Fetch tuples index-page-at-a-time */ + for (;;) + { + if (so->curPageData < so->nPageData) + { + if (scan->kill_prior_tuple && so->curPageData > 0) + { + + if (so->killedItems == NULL) + { + MemoryContext oldCxt = + MemoryContextSwitchTo(so->giststate->scanCxt); + + so->killedItems = + (OffsetNumber *) palloc(MaxIndexTuplesPerPage + * sizeof(OffsetNumber)); + + MemoryContextSwitchTo(oldCxt); + } + if (so->numKilled < MaxIndexTuplesPerPage) + so->killedItems[so->numKilled++] = + so->pageData[so->curPageData - 1].offnum; + } + /* continuing to return tuples from a leaf page */ + scan->xs_heaptid = so->pageData[so->curPageData].heapPtr; + scan->xs_recheck = so->pageData[so->curPageData].recheck; + + /* in an index-only scan, also return the reconstructed tuple */ + if (scan->xs_want_itup) + scan->xs_hitup = so->pageData[so->curPageData].recontup; + + so->curPageData++; + + return true; + } + + /* + * Check the last returned tuple and add it to killedItems if + * necessary + */ + if (scan->kill_prior_tuple + && so->curPageData > 0 + && so->curPageData == so->nPageData) + { + + if (so->killedItems == NULL) + { + MemoryContext oldCxt = + MemoryContextSwitchTo(so->giststate->scanCxt); + + so->killedItems = + (OffsetNumber *) palloc(MaxIndexTuplesPerPage + * sizeof(OffsetNumber)); + + MemoryContextSwitchTo(oldCxt); + } + if (so->numKilled < MaxIndexTuplesPerPage) + so->killedItems[so->numKilled++] = + so->pageData[so->curPageData - 1].offnum; + } + /* find and process the next index page */ + do + { + GISTSearchItem *item; + + if ((so->curBlkno != InvalidBlockNumber) && (so->numKilled > 0)) + gistkillitems(scan); + + item = getNextGISTSearchItem(so); + + if (!item) + return false; + + CHECK_FOR_INTERRUPTS(); + + /* save current item BlockNumber for next gistkillitems() call */ + so->curBlkno = item->blkno; + + /* + * While scanning a leaf page, ItemPointers of matching heap + * tuples are stored in so->pageData. If there are any on + * this page, we fall out of the inner "do" and loop around to + * return them. + */ + gistScanPage(scan, item, item->distances, NULL, NULL); + + pfree(item); + } while (so->nPageData == 0); + } + } +} + +/* + * gistgetbitmap() -- Get a bitmap of all heap tuple locations + */ +int64 +gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) +{ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + int64 ntids = 0; + GISTSearchItem fakeItem; + + if (!so->qual_ok) + return 0; + + pgstat_count_index_scan(scan->indexRelation); + + /* Begin the scan by processing the root page */ + so->curPageData = so->nPageData = 0; + scan->xs_hitup = NULL; + if (so->pageDataCxt) + MemoryContextReset(so->pageDataCxt); + + fakeItem.blkno = GIST_ROOT_BLKNO; + memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN)); + gistScanPage(scan, &fakeItem, NULL, tbm, &ntids); + + /* + * While scanning a leaf page, ItemPointers of matching heap tuples will + * be stored directly into tbm, so we don't need to deal with them here. + */ + for (;;) + { + GISTSearchItem *item = getNextGISTSearchItem(so); + + if (!item) + break; + + CHECK_FOR_INTERRUPTS(); + + gistScanPage(scan, item, item->distances, tbm, &ntids); + + pfree(item); + } + + return ntids; +} + +/* + * Can we do index-only scans on the given index column? + * + * Opclasses that implement a fetch function support index-only scans. + * Opclasses without compression functions also support index-only scans. + * Included attributes always can be fetched for index-only scans. + */ +bool +gistcanreturn(Relation index, int attno) +{ + if (attno > IndexRelationGetNumberOfKeyAttributes(index) || + OidIsValid(index_getprocid(index, attno, GIST_FETCH_PROC)) || + !OidIsValid(index_getprocid(index, attno, GIST_COMPRESS_PROC))) + return true; + else + return false; +} |