Diffstat (limited to 'src/backend/access/gist/gist.c')
-rw-r--r-- | src/backend/access/gist/gist.c | 1734
1 files changed, 1734 insertions, 0 deletions
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c new file mode 100644 index 0000000..9b367b5 --- /dev/null +++ b/src/backend/access/gist/gist.c @@ -0,0 +1,1734 @@ +/*------------------------------------------------------------------------- + * + * gist.c + * interface routines for the postgres GiST index access method. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gist/gist.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/gist_private.h" +#include "access/gistscan.h" +#include "access/xloginsert.h" +#include "catalog/pg_collation.h" +#include "commands/vacuum.h" +#include "miscadmin.h" +#include "nodes/execnodes.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "utils/builtins.h" +#include "utils/index_selfuncs.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +/* non-export function prototypes */ +static void gistfixsplit(GISTInsertState *state, GISTSTATE *giststate); +static bool gistinserttuple(GISTInsertState *state, GISTInsertStack *stack, + GISTSTATE *giststate, IndexTuple tuple, OffsetNumber oldoffnum); +static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack, + GISTSTATE *giststate, + IndexTuple *tuples, int ntup, OffsetNumber oldoffnum, + Buffer leftchild, Buffer rightchild, + bool unlockbuf, bool unlockleftchild); +static void gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack, + GISTSTATE *giststate, List *splitinfo, bool unlockbuf); +static void gistprunepage(Relation rel, Page page, Buffer buffer, + Relation heapRel); + + +#define ROTATEDIST(d) do { \ + SplitedPageLayout *tmp=(SplitedPageLayout*)palloc0(sizeof(SplitedPageLayout)); \ + tmp->block.blkno = InvalidBlockNumber; \ + tmp->buffer = InvalidBuffer; \ + tmp->next = (d); \ + (d)=tmp; \ +} while(0) + + +/* + * GiST handler function: return IndexAmRoutine with access method parameters + * and callbacks. 
+ */ +Datum +gisthandler(PG_FUNCTION_ARGS) +{ + IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); + + amroutine->amstrategies = 0; + amroutine->amsupport = GISTNProcs; + amroutine->amoptsprocnum = GIST_OPTIONS_PROC; + amroutine->amcanorder = false; + amroutine->amcanorderbyop = true; + amroutine->amcanbackward = false; + amroutine->amcanunique = false; + amroutine->amcanmulticol = true; + amroutine->amoptionalkey = true; + amroutine->amsearcharray = false; + amroutine->amsearchnulls = true; + amroutine->amstorage = true; + amroutine->amclusterable = true; + amroutine->ampredlocks = true; + amroutine->amcanparallel = false; + amroutine->amcaninclude = true; + amroutine->amusemaintenanceworkmem = false; + amroutine->amparallelvacuumoptions = + VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP; + amroutine->amkeytype = InvalidOid; + + amroutine->ambuild = gistbuild; + amroutine->ambuildempty = gistbuildempty; + amroutine->aminsert = gistinsert; + amroutine->ambulkdelete = gistbulkdelete; + amroutine->amvacuumcleanup = gistvacuumcleanup; + amroutine->amcanreturn = gistcanreturn; + amroutine->amcostestimate = gistcostestimate; + amroutine->amoptions = gistoptions; + amroutine->amproperty = gistproperty; + amroutine->ambuildphasename = NULL; + amroutine->amvalidate = gistvalidate; + amroutine->amadjustmembers = gistadjustmembers; + amroutine->ambeginscan = gistbeginscan; + amroutine->amrescan = gistrescan; + amroutine->amgettuple = gistgettuple; + amroutine->amgetbitmap = gistgetbitmap; + amroutine->amendscan = gistendscan; + amroutine->ammarkpos = NULL; + amroutine->amrestrpos = NULL; + amroutine->amestimateparallelscan = NULL; + amroutine->aminitparallelscan = NULL; + amroutine->amparallelrescan = NULL; + + PG_RETURN_POINTER(amroutine); +} + +/* + * Create and return a temporary memory context for use by GiST. We + * _always_ invoke user-provided methods in a temporary memory + * context, so that memory leaks in those functions cannot cause + * problems. Also, we use some additional temporary contexts in the + * GiST code itself, to avoid the need to do some awkward manual + * memory management. + */ +MemoryContext +createTempGistContext(void) +{ + return AllocSetContextCreate(CurrentMemoryContext, + "GiST temporary context", + ALLOCSET_DEFAULT_SIZES); +} + +/* + * gistbuildempty() -- build an empty gist index in the initialization fork + */ +void +gistbuildempty(Relation index) +{ + Buffer buffer; + + /* Initialize the root page */ + buffer = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + /* Initialize and xlog buffer */ + START_CRIT_SECTION(); + GISTInitBuffer(buffer, F_LEAF); + MarkBufferDirty(buffer); + log_newpage_buffer(buffer, true); + END_CRIT_SECTION(); + + /* Unlock and release the buffer */ + UnlockReleaseBuffer(buffer); +} + +/* + * gistinsert -- wrapper for GiST tuple insertion. + * + * This is the public interface routine for tuple insertion in GiSTs. + * It doesn't do any work; just locks the relation and passes the buck. 
+ */ +bool +gistinsert(Relation r, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + IndexInfo *indexInfo) +{ + GISTSTATE *giststate = (GISTSTATE *) indexInfo->ii_AmCache; + IndexTuple itup; + MemoryContext oldCxt; + + /* Initialize GISTSTATE cache if first call in this statement */ + if (giststate == NULL) + { + oldCxt = MemoryContextSwitchTo(indexInfo->ii_Context); + giststate = initGISTstate(r); + giststate->tempCxt = createTempGistContext(); + indexInfo->ii_AmCache = (void *) giststate; + MemoryContextSwitchTo(oldCxt); + } + + oldCxt = MemoryContextSwitchTo(giststate->tempCxt); + + itup = gistFormTuple(giststate, r, + values, isnull, true /* size is currently bogus */ ); + itup->t_tid = *ht_ctid; + + gistdoinsert(r, itup, 0, giststate, heapRel, false); + + /* cleanup */ + MemoryContextSwitchTo(oldCxt); + MemoryContextReset(giststate->tempCxt); + + return false; +} + + +/* + * Place tuples from 'itup' to 'buffer'. If 'oldoffnum' is valid, the tuple + * at that offset is atomically removed along with inserting the new tuples. + * This is used to replace a tuple with a new one. + * + * If 'leftchildbuf' is valid, we're inserting the downlink for the page + * to the right of 'leftchildbuf', or updating the downlink for 'leftchildbuf'. + * F_FOLLOW_RIGHT flag on 'leftchildbuf' is cleared and NSN is set. + * + * If 'markfollowright' is true and the page is split, the left child is + * marked with F_FOLLOW_RIGHT flag. That is the normal case. During buffered + * index build, however, there is no concurrent access and the page splitting + * is done in a slightly simpler fashion, and false is passed. + * + * If there is not enough room on the page, it is split. All the split + * pages are kept pinned and locked and returned in *splitinfo, the caller + * is responsible for inserting the downlinks for them. However, if + * 'buffer' is the root page and it needs to be split, gistplacetopage() + * performs the split as one atomic operation, and *splitinfo is set to NIL. + * In that case, we continue to hold the root page locked, and the child + * pages are released; note that new tuple(s) are *not* on the root page + * but in one of the new child pages. + * + * If 'newblkno' is not NULL, returns the block number of page the first + * new/updated tuple was inserted to. Usually it's the given page, but could + * be its right sibling if the page was split. + * + * Returns 'true' if the page was split, 'false' otherwise. + */ +bool +gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, + Buffer buffer, + IndexTuple *itup, int ntup, OffsetNumber oldoffnum, + BlockNumber *newblkno, + Buffer leftchildbuf, + List **splitinfo, + bool markfollowright, + Relation heapRel, + bool is_build) +{ + BlockNumber blkno = BufferGetBlockNumber(buffer); + Page page = BufferGetPage(buffer); + bool is_leaf = (GistPageIsLeaf(page)) ? true : false; + XLogRecPtr recptr; + int i; + bool is_split; + + /* + * Refuse to modify a page that's incompletely split. This should not + * happen because we finish any incomplete splits while we walk down the + * tree. However, it's remotely possible that another concurrent inserter + * splits a parent page, and errors out before completing the split. We + * will just throw an error in that case, and leave any split we had in + * progress unfinished too. The next insert that comes along will clean up + * the mess. 
+ */ + if (GistFollowRight(page)) + elog(ERROR, "concurrent GiST page split was incomplete"); + + /* should never try to insert to a deleted page */ + Assert(!GistPageIsDeleted(page)); + + *splitinfo = NIL; + + /* + * if isupdate, remove old key: This node's key has been modified, either + * because a child split occurred or because we needed to adjust our key + * for an insert in a child node. Therefore, remove the old version of + * this node's key. + * + * for WAL replay, in the non-split case we handle this by setting up a + * one-element todelete array; in the split case, it's handled implicitly + * because the tuple vector passed to gistSplit won't include this tuple. + */ + is_split = gistnospace(page, itup, ntup, oldoffnum, freespace); + + /* + * If leaf page is full, try at first to delete dead tuples. And then + * check again. + */ + if (is_split && GistPageIsLeaf(page) && GistPageHasGarbage(page)) + { + gistprunepage(rel, page, buffer, heapRel); + is_split = gistnospace(page, itup, ntup, oldoffnum, freespace); + } + + if (is_split) + { + /* no space for insertion */ + IndexTuple *itvec; + int tlen; + SplitedPageLayout *dist = NULL, + *ptr; + BlockNumber oldrlink = InvalidBlockNumber; + GistNSN oldnsn = 0; + SplitedPageLayout rootpg; + bool is_rootsplit; + int npage; + + is_rootsplit = (blkno == GIST_ROOT_BLKNO); + + /* + * Form index tuples vector to split. If we're replacing an old tuple, + * remove the old version from the vector. + */ + itvec = gistextractpage(page, &tlen); + if (OffsetNumberIsValid(oldoffnum)) + { + /* on inner page we should remove old tuple */ + int pos = oldoffnum - FirstOffsetNumber; + + tlen--; + if (pos != tlen) + memmove(itvec + pos, itvec + pos + 1, sizeof(IndexTuple) * (tlen - pos)); + } + itvec = gistjoinvector(itvec, &tlen, itup, ntup); + dist = gistSplit(rel, page, itvec, tlen, giststate); + + /* + * Check that split didn't produce too many pages. + */ + npage = 0; + for (ptr = dist; ptr; ptr = ptr->next) + npage++; + /* in a root split, we'll add one more page to the list below */ + if (is_rootsplit) + npage++; + if (npage > GIST_MAX_SPLIT_PAGES) + elog(ERROR, "GiST page split into too many halves (%d, maximum %d)", + npage, GIST_MAX_SPLIT_PAGES); + + /* + * Set up pages to work with. Allocate new buffers for all but the + * leftmost page. The original page becomes the new leftmost page, and + * is just replaced with the new contents. + * + * For a root-split, allocate new buffers for all child pages, the + * original page is overwritten with new root page containing + * downlinks to the new child pages. + */ + ptr = dist; + if (!is_rootsplit) + { + /* save old rightlink and NSN */ + oldrlink = GistPageGetOpaque(page)->rightlink; + oldnsn = GistPageGetNSN(page); + + dist->buffer = buffer; + dist->block.blkno = BufferGetBlockNumber(buffer); + dist->page = PageGetTempPageCopySpecial(BufferGetPage(buffer)); + + /* clean all flags except F_LEAF */ + GistPageGetOpaque(dist->page)->flags = (is_leaf) ? F_LEAF : 0; + + ptr = ptr->next; + } + for (; ptr; ptr = ptr->next) + { + /* Allocate new page */ + ptr->buffer = gistNewBuffer(rel); + GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0); + ptr->page = BufferGetPage(ptr->buffer); + ptr->block.blkno = BufferGetBlockNumber(ptr->buffer); + PredicateLockPageSplit(rel, + BufferGetBlockNumber(buffer), + BufferGetBlockNumber(ptr->buffer)); + } + + /* + * Now that we know which blocks the new pages go to, set up downlink + * tuples to point to them. 
+ */ + for (ptr = dist; ptr; ptr = ptr->next) + { + ItemPointerSetBlockNumber(&(ptr->itup->t_tid), ptr->block.blkno); + GistTupleSetValid(ptr->itup); + } + + /* + * If this is a root split, we construct the new root page with the + * downlinks here directly, instead of requiring the caller to insert + * them. Add the new root page to the list along with the child pages. + */ + if (is_rootsplit) + { + IndexTuple *downlinks; + int ndownlinks = 0; + int i; + + rootpg.buffer = buffer; + rootpg.page = PageGetTempPageCopySpecial(BufferGetPage(rootpg.buffer)); + GistPageGetOpaque(rootpg.page)->flags = 0; + + /* Prepare a vector of all the downlinks */ + for (ptr = dist; ptr; ptr = ptr->next) + ndownlinks++; + downlinks = palloc(sizeof(IndexTuple) * ndownlinks); + for (i = 0, ptr = dist; ptr; ptr = ptr->next) + downlinks[i++] = ptr->itup; + + rootpg.block.blkno = GIST_ROOT_BLKNO; + rootpg.block.num = ndownlinks; + rootpg.list = gistfillitupvec(downlinks, ndownlinks, + &(rootpg.lenlist)); + rootpg.itup = NULL; + + rootpg.next = dist; + dist = &rootpg; + } + else + { + /* Prepare split-info to be returned to caller */ + for (ptr = dist; ptr; ptr = ptr->next) + { + GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo)); + + si->buf = ptr->buffer; + si->downlink = ptr->itup; + *splitinfo = lappend(*splitinfo, si); + } + } + + /* + * Fill all pages. All the pages are new, ie. freshly allocated empty + * pages, or a temporary copy of the old page. + */ + for (ptr = dist; ptr; ptr = ptr->next) + { + char *data = (char *) (ptr->list); + + for (i = 0; i < ptr->block.num; i++) + { + IndexTuple thistup = (IndexTuple) data; + + if (PageAddItem(ptr->page, (Item) data, IndexTupleSize(thistup), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(rel)); + + /* + * If this is the first inserted/updated tuple, let the caller + * know which page it landed on. + */ + if (newblkno && ItemPointerEquals(&thistup->t_tid, &(*itup)->t_tid)) + *newblkno = ptr->block.blkno; + + data += IndexTupleSize(thistup); + } + + /* Set up rightlinks */ + if (ptr->next && ptr->block.blkno != GIST_ROOT_BLKNO) + GistPageGetOpaque(ptr->page)->rightlink = + ptr->next->block.blkno; + else + GistPageGetOpaque(ptr->page)->rightlink = oldrlink; + + /* + * Mark the all but the right-most page with the follow-right + * flag. It will be cleared as soon as the downlink is inserted + * into the parent, but this ensures that if we error out before + * that, the index is still consistent. (in buffering build mode, + * any error will abort the index build anyway, so this is not + * needed.) + */ + if (ptr->next && !is_rootsplit && markfollowright) + GistMarkFollowRight(ptr->page); + else + GistClearFollowRight(ptr->page); + + /* + * Copy the NSN of the original page to all pages. The + * F_FOLLOW_RIGHT flags ensure that scans will follow the + * rightlinks until the downlinks are inserted. + */ + GistPageSetNSN(ptr->page, oldnsn); + } + + /* + * gistXLogSplit() needs to WAL log a lot of pages, prepare WAL + * insertion for that. NB: The number of pages and data segments + * specified here must match the calculations in gistXLogSplit()! + */ + if (!is_build && RelationNeedsWAL(rel)) + XLogEnsureRecordSpace(npage, 1 + npage * 2); + + START_CRIT_SECTION(); + + /* + * Must mark buffers dirty before XLogInsert, even though we'll still + * be changing their opaque fields below. 
+ */ + for (ptr = dist; ptr; ptr = ptr->next) + MarkBufferDirty(ptr->buffer); + if (BufferIsValid(leftchildbuf)) + MarkBufferDirty(leftchildbuf); + + /* + * The first page in the chain was a temporary working copy meant to + * replace the old page. Copy it over the old page. + */ + PageRestoreTempPage(dist->page, BufferGetPage(dist->buffer)); + dist->page = BufferGetPage(dist->buffer); + + /* + * Write the WAL record. + * + * If we're building a new index, however, we don't WAL-log changes + * yet. The LSN-NSN interlock between parent and child requires that + * LSNs never move backwards, so set the LSNs to a value that's + * smaller than any real or fake unlogged LSN that might be generated + * later. (There can't be any concurrent scans during index build, so + * we don't need to be able to detect concurrent splits yet.) + */ + if (is_build) + recptr = GistBuildLSN; + else + { + if (RelationNeedsWAL(rel)) + recptr = gistXLogSplit(is_leaf, + dist, oldrlink, oldnsn, leftchildbuf, + markfollowright); + else + recptr = gistGetFakeLSN(rel); + } + + for (ptr = dist; ptr; ptr = ptr->next) + PageSetLSN(ptr->page, recptr); + + /* + * Return the new child buffers to the caller. + * + * If this was a root split, we've already inserted the downlink + * pointers, in the form of a new root page. Therefore we can release + * all the new buffers, and keep just the root page locked. + */ + if (is_rootsplit) + { + for (ptr = dist->next; ptr; ptr = ptr->next) + UnlockReleaseBuffer(ptr->buffer); + } + } + else + { + /* + * Enough space. We always get here if ntup==0. + */ + START_CRIT_SECTION(); + + /* + * Delete old tuple if any, then insert new tuple(s) if any. If + * possible, use the fast path of PageIndexTupleOverwrite. + */ + if (OffsetNumberIsValid(oldoffnum)) + { + if (ntup == 1) + { + /* One-for-one replacement, so use PageIndexTupleOverwrite */ + if (!PageIndexTupleOverwrite(page, oldoffnum, (Item) *itup, + IndexTupleSize(*itup))) + elog(ERROR, "failed to add item to index page in \"%s\"", + RelationGetRelationName(rel)); + } + else + { + /* Delete old, then append new tuple(s) to page */ + PageIndexTupleDelete(page, oldoffnum); + gistfillbuffer(page, itup, ntup, InvalidOffsetNumber); + } + } + else + { + /* Just append new tuples at the end of the page */ + gistfillbuffer(page, itup, ntup, InvalidOffsetNumber); + } + + MarkBufferDirty(buffer); + + if (BufferIsValid(leftchildbuf)) + MarkBufferDirty(leftchildbuf); + + if (is_build) + recptr = GistBuildLSN; + else + { + if (RelationNeedsWAL(rel)) + { + OffsetNumber ndeloffs = 0, + deloffs[1]; + + if (OffsetNumberIsValid(oldoffnum)) + { + deloffs[0] = oldoffnum; + ndeloffs = 1; + } + + recptr = gistXLogUpdate(buffer, + deloffs, ndeloffs, itup, ntup, + leftchildbuf); + } + else + recptr = gistGetFakeLSN(rel); + } + PageSetLSN(page, recptr); + + if (newblkno) + *newblkno = blkno; + } + + /* + * If we inserted the downlink for a child page, set NSN and clear + * F_FOLLOW_RIGHT flag on the left child, so that concurrent scans know to + * follow the rightlink if and only if they looked at the parent page + * before we inserted the downlink. + * + * Note that we do this *after* writing the WAL record. That means that + * the possible full page image in the WAL record does not include these + * changes, and they must be replayed even if the page is restored from + * the full page image. There's a chicken-and-egg problem: if we updated + * the child pages first, we wouldn't know the recptr of the WAL record + * we're about to write. 
+ */ + if (BufferIsValid(leftchildbuf)) + { + Page leftpg = BufferGetPage(leftchildbuf); + + GistPageSetNSN(leftpg, recptr); + GistClearFollowRight(leftpg); + + PageSetLSN(leftpg, recptr); + } + + END_CRIT_SECTION(); + + return is_split; +} + +/* + * Workhouse routine for doing insertion into a GiST index. Note that + * this routine assumes it is invoked in a short-lived memory context, + * so it does not bother releasing palloc'd allocations. + */ +void +gistdoinsert(Relation r, IndexTuple itup, Size freespace, + GISTSTATE *giststate, Relation heapRel, bool is_build) +{ + ItemId iid; + IndexTuple idxtuple; + GISTInsertStack firststack; + GISTInsertStack *stack; + GISTInsertState state; + bool xlocked = false; + + memset(&state, 0, sizeof(GISTInsertState)); + state.freespace = freespace; + state.r = r; + state.heapRel = heapRel; + state.is_build = is_build; + + /* Start from the root */ + firststack.blkno = GIST_ROOT_BLKNO; + firststack.lsn = 0; + firststack.retry_from_parent = false; + firststack.parent = NULL; + firststack.downlinkoffnum = InvalidOffsetNumber; + state.stack = stack = &firststack; + + /* + * Walk down along the path of smallest penalty, updating the parent + * pointers with the key we're inserting as we go. If we crash in the + * middle, the tree is consistent, although the possible parent updates + * were a waste. + */ + for (;;) + { + /* + * If we split an internal page while descending the tree, we have to + * retry at the parent. (Normally, the LSN-NSN interlock below would + * also catch this and cause us to retry. But LSNs are not updated + * during index build.) + */ + while (stack->retry_from_parent) + { + if (xlocked) + LockBuffer(stack->buffer, GIST_UNLOCK); + xlocked = false; + ReleaseBuffer(stack->buffer); + state.stack = stack = stack->parent; + } + + if (XLogRecPtrIsInvalid(stack->lsn)) + stack->buffer = ReadBuffer(state.r, stack->blkno); + + /* + * Be optimistic and grab shared lock first. Swap it for an exclusive + * lock later if we need to update the page. + */ + if (!xlocked) + { + LockBuffer(stack->buffer, GIST_SHARE); + gistcheckpage(state.r, stack->buffer); + } + + stack->page = (Page) BufferGetPage(stack->buffer); + stack->lsn = xlocked ? + PageGetLSN(stack->page) : BufferGetLSNAtomic(stack->buffer); + Assert(!RelationNeedsWAL(state.r) || !XLogRecPtrIsInvalid(stack->lsn)); + + /* + * If this page was split but the downlink was never inserted to the + * parent because the inserting backend crashed before doing that, fix + * that now. + */ + if (GistFollowRight(stack->page)) + { + if (!xlocked) + { + LockBuffer(stack->buffer, GIST_UNLOCK); + LockBuffer(stack->buffer, GIST_EXCLUSIVE); + xlocked = true; + /* someone might've completed the split when we unlocked */ + if (!GistFollowRight(stack->page)) + continue; + } + gistfixsplit(&state, giststate); + + UnlockReleaseBuffer(stack->buffer); + xlocked = false; + state.stack = stack = stack->parent; + continue; + } + + if ((stack->blkno != GIST_ROOT_BLKNO && + stack->parent->lsn < GistPageGetNSN(stack->page)) || + GistPageIsDeleted(stack->page)) + { + /* + * Concurrent split or page deletion detected. There's no + * guarantee that the downlink for this page is consistent with + * the tuple we're inserting anymore, so go back to parent and + * rechoose the best child. + */ + UnlockReleaseBuffer(stack->buffer); + xlocked = false; + state.stack = stack = stack->parent; + continue; + } + + if (!GistPageIsLeaf(stack->page)) + { + /* + * This is an internal page so continue to walk down the tree. 
+ * Find the child node that has the minimum insertion penalty. + */ + BlockNumber childblkno; + IndexTuple newtup; + GISTInsertStack *item; + OffsetNumber downlinkoffnum; + + downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate); + iid = PageGetItemId(stack->page, downlinkoffnum); + idxtuple = (IndexTuple) PageGetItem(stack->page, iid); + childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); + + /* + * Check that it's not a leftover invalid tuple from pre-9.1 + */ + if (GistTupleIsInvalid(idxtuple)) + ereport(ERROR, + (errmsg("index \"%s\" contains an inner tuple marked as invalid", + RelationGetRelationName(r)), + errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."), + errhint("Please REINDEX it."))); + + /* + * Check that the key representing the target child node is + * consistent with the key we're inserting. Update it if it's not. + */ + newtup = gistgetadjusted(state.r, idxtuple, itup, giststate); + if (newtup) + { + /* + * Swap shared lock for an exclusive one. Beware, the page may + * change while we unlock/lock the page... + */ + if (!xlocked) + { + LockBuffer(stack->buffer, GIST_UNLOCK); + LockBuffer(stack->buffer, GIST_EXCLUSIVE); + xlocked = true; + stack->page = (Page) BufferGetPage(stack->buffer); + + if (PageGetLSN(stack->page) != stack->lsn) + { + /* the page was changed while we unlocked it, retry */ + continue; + } + } + + /* + * Update the tuple. + * + * We still hold the lock after gistinserttuple(), but it + * might have to split the page to make the updated tuple fit. + * In that case the updated tuple might migrate to the other + * half of the split, so we have to go back to the parent and + * descend back to the half that's a better fit for the new + * tuple. + */ + if (gistinserttuple(&state, stack, giststate, newtup, + downlinkoffnum)) + { + /* + * If this was a root split, the root page continues to be + * the parent and the updated tuple went to one of the + * child pages, so we just need to retry from the root + * page. + */ + if (stack->blkno != GIST_ROOT_BLKNO) + { + UnlockReleaseBuffer(stack->buffer); + xlocked = false; + state.stack = stack = stack->parent; + } + continue; + } + } + LockBuffer(stack->buffer, GIST_UNLOCK); + xlocked = false; + + /* descend to the chosen child */ + item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); + item->blkno = childblkno; + item->parent = stack; + item->downlinkoffnum = downlinkoffnum; + state.stack = stack = item; + } + else + { + /* + * Leaf page. Insert the new key. We've already updated all the + * parents on the way down, but we might have to split the page if + * it doesn't fit. gistinserttuple() will take care of that. + */ + + /* + * Swap shared lock for an exclusive one. Be careful, the page may + * change while we unlock/lock the page... 
+ */ + if (!xlocked) + { + LockBuffer(stack->buffer, GIST_UNLOCK); + LockBuffer(stack->buffer, GIST_EXCLUSIVE); + xlocked = true; + stack->page = (Page) BufferGetPage(stack->buffer); + stack->lsn = PageGetLSN(stack->page); + + if (stack->blkno == GIST_ROOT_BLKNO) + { + /* + * the only page that can become inner instead of leaf is + * the root page, so for root we should recheck it + */ + if (!GistPageIsLeaf(stack->page)) + { + /* + * very rare situation: during unlock/lock index with + * number of pages = 1 was increased + */ + LockBuffer(stack->buffer, GIST_UNLOCK); + xlocked = false; + continue; + } + + /* + * we don't need to check root split, because checking + * leaf/inner is enough to recognize split for root + */ + } + else if ((GistFollowRight(stack->page) || + stack->parent->lsn < GistPageGetNSN(stack->page)) || + GistPageIsDeleted(stack->page)) + { + /* + * The page was split or deleted while we momentarily + * unlocked the page. Go back to parent. + */ + UnlockReleaseBuffer(stack->buffer); + xlocked = false; + state.stack = stack = stack->parent; + continue; + } + } + + /* now state.stack->(page, buffer and blkno) points to leaf page */ + + gistinserttuple(&state, stack, giststate, itup, + InvalidOffsetNumber); + LockBuffer(stack->buffer, GIST_UNLOCK); + + /* Release any pins we might still hold before exiting */ + for (; stack; stack = stack->parent) + ReleaseBuffer(stack->buffer); + break; + } + } +} + +/* + * Traverse the tree to find path from root page to specified "child" block. + * + * returns a new insertion stack, starting from the parent of "child", up + * to the root. *downlinkoffnum is set to the offset of the downlink in the + * direct parent of child. + * + * To prevent deadlocks, this should lock only one page at a time. + */ +static GISTInsertStack * +gistFindPath(Relation r, BlockNumber child, OffsetNumber *downlinkoffnum) +{ + Page page; + Buffer buffer; + OffsetNumber i, + maxoff; + ItemId iid; + IndexTuple idxtuple; + List *fifo; + GISTInsertStack *top, + *ptr; + BlockNumber blkno; + + top = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); + top->blkno = GIST_ROOT_BLKNO; + top->downlinkoffnum = InvalidOffsetNumber; + + fifo = list_make1(top); + while (fifo != NIL) + { + /* Get next page to visit */ + top = linitial(fifo); + fifo = list_delete_first(fifo); + + buffer = ReadBuffer(r, top->blkno); + LockBuffer(buffer, GIST_SHARE); + gistcheckpage(r, buffer); + page = (Page) BufferGetPage(buffer); + + if (GistPageIsLeaf(page)) + { + /* + * Because we scan the index top-down, all the rest of the pages + * in the queue must be leaf pages as well. + */ + UnlockReleaseBuffer(buffer); + break; + } + + /* currently, internal pages are never deleted */ + Assert(!GistPageIsDeleted(page)); + + top->lsn = BufferGetLSNAtomic(buffer); + + /* + * If F_FOLLOW_RIGHT is set, the page to the right doesn't have a + * downlink. This should not normally happen.. + */ + if (GistFollowRight(page)) + elog(ERROR, "concurrent GiST page split was incomplete"); + + if (top->parent && top->parent->lsn < GistPageGetNSN(page) && + GistPageGetOpaque(page)->rightlink != InvalidBlockNumber /* sanity check */ ) + { + /* + * Page was split while we looked elsewhere. We didn't see the + * downlink to the right page when we scanned the parent, so add + * it to the queue now. + * + * Put the right page ahead of the queue, so that we visit it + * next. 
That's important, because if this is the lowest internal + * level, just above leaves, we might already have queued up some + * leaf pages, and we assume that there can't be any non-leaf + * pages behind leaf pages. + */ + ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); + ptr->blkno = GistPageGetOpaque(page)->rightlink; + ptr->downlinkoffnum = InvalidOffsetNumber; + ptr->parent = top->parent; + + fifo = lcons(ptr, fifo); + } + + maxoff = PageGetMaxOffsetNumber(page); + + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + iid = PageGetItemId(page, i); + idxtuple = (IndexTuple) PageGetItem(page, iid); + blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); + if (blkno == child) + { + /* Found it! */ + UnlockReleaseBuffer(buffer); + *downlinkoffnum = i; + return top; + } + else + { + /* Append this child to the list of pages to visit later */ + ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); + ptr->blkno = blkno; + ptr->downlinkoffnum = i; + ptr->parent = top; + + fifo = lappend(fifo, ptr); + } + } + + UnlockReleaseBuffer(buffer); + } + + elog(ERROR, "failed to re-find parent of a page in index \"%s\", block %u", + RelationGetRelationName(r), child); + return NULL; /* keep compiler quiet */ +} + +/* + * Updates the stack so that child->parent is the correct parent of the + * child. child->parent must be exclusively locked on entry, and will + * remain so at exit, but it might not be the same page anymore. + */ +static void +gistFindCorrectParent(Relation r, GISTInsertStack *child, bool is_build) +{ + GISTInsertStack *parent = child->parent; + ItemId iid; + IndexTuple idxtuple; + OffsetNumber maxoff; + GISTInsertStack *ptr; + + gistcheckpage(r, parent->buffer); + parent->page = (Page) BufferGetPage(parent->buffer); + maxoff = PageGetMaxOffsetNumber(parent->page); + + /* Check if the downlink is still where it was before */ + if (child->downlinkoffnum != InvalidOffsetNumber && child->downlinkoffnum <= maxoff) + { + iid = PageGetItemId(parent->page, child->downlinkoffnum); + idxtuple = (IndexTuple) PageGetItem(parent->page, iid); + if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno) + return; /* still there */ + } + + /* + * The page has changed since we looked. During normal operation, every + * update of a page changes its LSN, so the LSN we memorized should have + * changed too. During index build, however, we don't WAL-log the changes + * until we have built the index, so the LSN doesn't change. There is no + * concurrent activity during index build, but we might have changed the + * parent ourselves. + */ + Assert(parent->lsn != PageGetLSN(parent->page) || is_build); + + /* + * Scan the page to re-find the downlink. If the page was split, it might + * have moved to a different page, so follow the right links until we find + * it. + */ + while (true) + { + OffsetNumber i; + + maxoff = PageGetMaxOffsetNumber(parent->page); + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + iid = PageGetItemId(parent->page, i); + idxtuple = (IndexTuple) PageGetItem(parent->page, iid); + if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno) + { + /* yes!!, found */ + child->downlinkoffnum = i; + return; + } + } + + parent->blkno = GistPageGetOpaque(parent->page)->rightlink; + parent->downlinkoffnum = InvalidOffsetNumber; + UnlockReleaseBuffer(parent->buffer); + if (parent->blkno == InvalidBlockNumber) + { + /* + * End of chain and still didn't find parent. It's a very-very + * rare situation when root splitted. 
+ */ + break; + } + parent->buffer = ReadBuffer(r, parent->blkno); + LockBuffer(parent->buffer, GIST_EXCLUSIVE); + gistcheckpage(r, parent->buffer); + parent->page = (Page) BufferGetPage(parent->buffer); + } + + /* + * awful!!, we need search tree to find parent ... , but before we should + * release all old parent + */ + + ptr = child->parent->parent; /* child->parent already released above */ + while (ptr) + { + ReleaseBuffer(ptr->buffer); + ptr = ptr->parent; + } + + /* ok, find new path */ + ptr = parent = gistFindPath(r, child->blkno, &child->downlinkoffnum); + + /* read all buffers as expected by caller */ + /* note we don't lock them or gistcheckpage them here! */ + while (ptr) + { + ptr->buffer = ReadBuffer(r, ptr->blkno); + ptr->page = (Page) BufferGetPage(ptr->buffer); + ptr = ptr->parent; + } + + /* install new chain of parents to stack */ + child->parent = parent; + + /* make recursive call to normal processing */ + LockBuffer(child->parent->buffer, GIST_EXCLUSIVE); + gistFindCorrectParent(r, child, is_build); +} + +/* + * Form a downlink pointer for the page in 'buf'. + */ +static IndexTuple +gistformdownlink(Relation rel, Buffer buf, GISTSTATE *giststate, + GISTInsertStack *stack, bool is_build) +{ + Page page = BufferGetPage(buf); + OffsetNumber maxoff; + OffsetNumber offset; + IndexTuple downlink = NULL; + + maxoff = PageGetMaxOffsetNumber(page); + for (offset = FirstOffsetNumber; offset <= maxoff; offset = OffsetNumberNext(offset)) + { + IndexTuple ituple = (IndexTuple) + PageGetItem(page, PageGetItemId(page, offset)); + + if (downlink == NULL) + downlink = CopyIndexTuple(ituple); + else + { + IndexTuple newdownlink; + + newdownlink = gistgetadjusted(rel, downlink, ituple, + giststate); + if (newdownlink) + downlink = newdownlink; + } + } + + /* + * If the page is completely empty, we can't form a meaningful downlink + * for it. But we have to insert a downlink for the page. Any key will do, + * as long as its consistent with the downlink of parent page, so that we + * can legally insert it to the parent. A minimal one that matches as few + * scans as possible would be best, to keep scans from doing useless work, + * but we don't know how to construct that. So we just use the downlink of + * the original page that was split - that's as far from optimal as it can + * get but will do.. + */ + if (!downlink) + { + ItemId iid; + + LockBuffer(stack->parent->buffer, GIST_EXCLUSIVE); + gistFindCorrectParent(rel, stack, is_build); + iid = PageGetItemId(stack->parent->page, stack->downlinkoffnum); + downlink = (IndexTuple) PageGetItem(stack->parent->page, iid); + downlink = CopyIndexTuple(downlink); + LockBuffer(stack->parent->buffer, GIST_UNLOCK); + } + + ItemPointerSetBlockNumber(&(downlink->t_tid), BufferGetBlockNumber(buf)); + GistTupleSetValid(downlink); + + return downlink; +} + + +/* + * Complete the incomplete split of state->stack->page. + */ +static void +gistfixsplit(GISTInsertState *state, GISTSTATE *giststate) +{ + GISTInsertStack *stack = state->stack; + Buffer buf; + Page page; + List *splitinfo = NIL; + + ereport(LOG, + (errmsg("fixing incomplete split in index \"%s\", block %u", + RelationGetRelationName(state->r), stack->blkno))); + + Assert(GistFollowRight(stack->page)); + Assert(OffsetNumberIsValid(stack->downlinkoffnum)); + + buf = stack->buffer; + + /* + * Read the chain of split pages, following the rightlinks. Construct a + * downlink tuple for each page. 
+ */ + for (;;) + { + GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo)); + IndexTuple downlink; + + page = BufferGetPage(buf); + + /* Form the new downlink tuples to insert to parent */ + downlink = gistformdownlink(state->r, buf, giststate, stack, state->is_build); + + si->buf = buf; + si->downlink = downlink; + + splitinfo = lappend(splitinfo, si); + + if (GistFollowRight(page)) + { + /* lock next page */ + buf = ReadBuffer(state->r, GistPageGetOpaque(page)->rightlink); + LockBuffer(buf, GIST_EXCLUSIVE); + } + else + break; + } + + /* Insert the downlinks */ + gistfinishsplit(state, stack, giststate, splitinfo, false); +} + +/* + * Insert or replace a tuple in stack->buffer. If 'oldoffnum' is valid, the + * tuple at 'oldoffnum' is replaced, otherwise the tuple is inserted as new. + * 'stack' represents the path from the root to the page being updated. + * + * The caller must hold an exclusive lock on stack->buffer. The lock is still + * held on return, but the page might not contain the inserted tuple if the + * page was split. The function returns true if the page was split, false + * otherwise. + */ +static bool +gistinserttuple(GISTInsertState *state, GISTInsertStack *stack, + GISTSTATE *giststate, IndexTuple tuple, OffsetNumber oldoffnum) +{ + return gistinserttuples(state, stack, giststate, &tuple, 1, oldoffnum, + InvalidBuffer, InvalidBuffer, false, false); +} + +/* ---------------- + * An extended workhorse version of gistinserttuple(). This version allows + * inserting multiple tuples, or replacing a single tuple with multiple tuples. + * This is used to recursively update the downlinks in the parent when a page + * is split. + * + * If leftchild and rightchild are valid, we're inserting/replacing the + * downlink for rightchild, and leftchild is its left sibling. We clear the + * F_FOLLOW_RIGHT flag and update NSN on leftchild, atomically with the + * insertion of the downlink. + * + * To avoid holding locks for longer than necessary, when recursing up the + * tree to update the parents, the locking is a bit peculiar here. On entry, + * the caller must hold an exclusive lock on stack->buffer, as well as + * leftchild and rightchild if given. On return: + * + * - Lock on stack->buffer is released, if 'unlockbuf' is true. The page is + * always kept pinned, however. + * - Lock on 'leftchild' is released, if 'unlockleftchild' is true. The page + * is kept pinned. + * - Lock and pin on 'rightchild' are always released. + * + * Returns 'true' if the page had to be split. Note that if the page was + * split, the inserted/updated tuples might've been inserted to a right + * sibling of stack->buffer instead of stack->buffer itself. + */ +static bool +gistinserttuples(GISTInsertState *state, GISTInsertStack *stack, + GISTSTATE *giststate, + IndexTuple *tuples, int ntup, OffsetNumber oldoffnum, + Buffer leftchild, Buffer rightchild, + bool unlockbuf, bool unlockleftchild) +{ + List *splitinfo; + bool is_split; + + /* + * Check for any rw conflicts (in serializable isolation level) just + * before we intend to modify the page + */ + CheckForSerializableConflictIn(state->r, NULL, BufferGetBlockNumber(stack->buffer)); + + /* Insert the tuple(s) to the page, splitting the page if necessary */ + is_split = gistplacetopage(state->r, state->freespace, giststate, + stack->buffer, + tuples, ntup, + oldoffnum, NULL, + leftchild, + &splitinfo, + true, + state->heapRel, + state->is_build); + + /* + * Before recursing up in case the page was split, release locks on the + * child pages. 
We don't need to keep them locked when updating the + * parent. + */ + if (BufferIsValid(rightchild)) + UnlockReleaseBuffer(rightchild); + if (BufferIsValid(leftchild) && unlockleftchild) + LockBuffer(leftchild, GIST_UNLOCK); + + /* + * If we had to split, insert/update the downlinks in the parent. If the + * caller requested us to release the lock on stack->buffer, tell + * gistfinishsplit() to do that as soon as it's safe to do so. If we + * didn't have to split, release it ourselves. + */ + if (splitinfo) + gistfinishsplit(state, stack, giststate, splitinfo, unlockbuf); + else if (unlockbuf) + LockBuffer(stack->buffer, GIST_UNLOCK); + + return is_split; +} + +/* + * Finish an incomplete split by inserting/updating the downlinks in parent + * page. 'splitinfo' contains all the child pages involved in the split, + * from left-to-right. + * + * On entry, the caller must hold a lock on stack->buffer and all the child + * pages in 'splitinfo'. If 'unlockbuf' is true, the lock on stack->buffer is + * released on return. The child pages are always unlocked and unpinned. + */ +static void +gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack, + GISTSTATE *giststate, List *splitinfo, bool unlockbuf) +{ + GISTPageSplitInfo *right; + GISTPageSplitInfo *left; + IndexTuple tuples[2]; + + /* A split always contains at least two halves */ + Assert(list_length(splitinfo) >= 2); + + /* + * We need to insert downlinks for each new page, and update the downlink + * for the original (leftmost) page in the split. Begin at the rightmost + * page, inserting one downlink at a time until there's only two pages + * left. Finally insert the downlink for the last new page and update the + * downlink for the original page as one operation. + */ + LockBuffer(stack->parent->buffer, GIST_EXCLUSIVE); + + /* + * Insert downlinks for the siblings from right to left, until there are + * only two siblings left. + */ + for (int pos = list_length(splitinfo) - 1; pos > 1; pos--) + { + right = (GISTPageSplitInfo *) list_nth(splitinfo, pos); + left = (GISTPageSplitInfo *) list_nth(splitinfo, pos - 1); + + gistFindCorrectParent(state->r, stack, state->is_build); + if (gistinserttuples(state, stack->parent, giststate, + &right->downlink, 1, + InvalidOffsetNumber, + left->buf, right->buf, false, false)) + { + /* + * If the parent page was split, the existing downlink might have + * moved. + */ + stack->downlinkoffnum = InvalidOffsetNumber; + } + /* gistinserttuples() released the lock on right->buf. */ + } + + right = (GISTPageSplitInfo *) lsecond(splitinfo); + left = (GISTPageSplitInfo *) linitial(splitinfo); + + /* + * Finally insert downlink for the remaining right page and update the + * downlink for the original page to not contain the tuples that were + * moved to the new pages. + */ + tuples[0] = left->downlink; + tuples[1] = right->downlink; + gistFindCorrectParent(state->r, stack, state->is_build); + (void) gistinserttuples(state, stack->parent, giststate, + tuples, 2, + stack->downlinkoffnum, + left->buf, right->buf, + true, /* Unlock parent */ + unlockbuf /* Unlock stack->buffer if caller + * wants that */ + ); + + /* + * The downlink might have moved when we updated it. Even if the page + * wasn't split, because gistinserttuples() implements updating the old + * tuple by removing and re-inserting it! 
+ */ + stack->downlinkoffnum = InvalidOffsetNumber; + + Assert(left->buf == stack->buffer); + + /* + * If we split the page because we had to adjust the downlink on an + * internal page, while descending the tree for inserting a new tuple, + * then this might no longer be the correct page for the new tuple. The + * downlink to this page might not cover the new tuple anymore, it might + * need to go to the newly-created right sibling instead. Tell the caller + * to walk back up the stack, to re-check at the parent which page to + * insert to. + * + * Normally, the LSN-NSN interlock during the tree descend would also + * detect that a concurrent split happened (by ourselves), and cause us to + * retry at the parent. But that mechanism doesn't work during index + * build, because we don't do WAL-logging, and don't update LSNs, during + * index build. + */ + stack->retry_from_parent = true; +} + +/* + * gistSplit -- split a page in the tree and fill struct + * used for XLOG and real writes buffers. Function is recursive, ie + * it will split page until keys will fit in every page. + */ +SplitedPageLayout * +gistSplit(Relation r, + Page page, + IndexTuple *itup, /* contains compressed entry */ + int len, + GISTSTATE *giststate) +{ + IndexTuple *lvectup, + *rvectup; + GistSplitVector v; + int i; + SplitedPageLayout *res = NULL; + + /* this should never recurse very deeply, but better safe than sorry */ + check_stack_depth(); + + /* there's no point in splitting an empty page */ + Assert(len > 0); + + /* + * If a single tuple doesn't fit on a page, no amount of splitting will + * help. + */ + if (len == 1) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("index row size %zu exceeds maximum %zu for index \"%s\"", + IndexTupleSize(itup[0]), GiSTPageSize, + RelationGetRelationName(r)))); + + memset(v.spl_lisnull, true, + sizeof(bool) * giststate->nonLeafTupdesc->natts); + memset(v.spl_risnull, true, + sizeof(bool) * giststate->nonLeafTupdesc->natts); + gistSplitByKey(r, page, itup, len, giststate, &v, 0); + + /* form left and right vector */ + lvectup = (IndexTuple *) palloc(sizeof(IndexTuple) * (len + 1)); + rvectup = (IndexTuple *) palloc(sizeof(IndexTuple) * (len + 1)); + + for (i = 0; i < v.splitVector.spl_nleft; i++) + lvectup[i] = itup[v.splitVector.spl_left[i] - 1]; + + for (i = 0; i < v.splitVector.spl_nright; i++) + rvectup[i] = itup[v.splitVector.spl_right[i] - 1]; + + /* finalize splitting (may need another split) */ + if (!gistfitpage(rvectup, v.splitVector.spl_nright)) + { + res = gistSplit(r, page, rvectup, v.splitVector.spl_nright, giststate); + } + else + { + ROTATEDIST(res); + res->block.num = v.splitVector.spl_nright; + res->list = gistfillitupvec(rvectup, v.splitVector.spl_nright, &(res->lenlist)); + res->itup = gistFormTuple(giststate, r, v.spl_rattr, v.spl_risnull, false); + } + + if (!gistfitpage(lvectup, v.splitVector.spl_nleft)) + { + SplitedPageLayout *resptr, + *subres; + + resptr = subres = gistSplit(r, page, lvectup, v.splitVector.spl_nleft, giststate); + + /* install on list's tail */ + while (resptr->next) + resptr = resptr->next; + + resptr->next = res; + res = subres; + } + else + { + ROTATEDIST(res); + res->block.num = v.splitVector.spl_nleft; + res->list = gistfillitupvec(lvectup, v.splitVector.spl_nleft, &(res->lenlist)); + res->itup = gistFormTuple(giststate, r, v.spl_lattr, v.spl_lisnull, false); + } + + return res; +} + +/* + * Create a GISTSTATE and fill it with information about the index + */ +GISTSTATE * +initGISTstate(Relation index) +{ 
+ GISTSTATE *giststate; + MemoryContext scanCxt; + MemoryContext oldCxt; + int i; + + /* safety check to protect fixed-size arrays in GISTSTATE */ + if (index->rd_att->natts > INDEX_MAX_KEYS) + elog(ERROR, "numberOfAttributes %d > %d", + index->rd_att->natts, INDEX_MAX_KEYS); + + /* Create the memory context that will hold the GISTSTATE */ + scanCxt = AllocSetContextCreate(CurrentMemoryContext, + "GiST scan context", + ALLOCSET_DEFAULT_SIZES); + oldCxt = MemoryContextSwitchTo(scanCxt); + + /* Create and fill in the GISTSTATE */ + giststate = (GISTSTATE *) palloc(sizeof(GISTSTATE)); + + giststate->scanCxt = scanCxt; + giststate->tempCxt = scanCxt; /* caller must change this if needed */ + giststate->leafTupdesc = index->rd_att; + + /* + * The truncated tupdesc for non-leaf index tuples, which doesn't contain + * the INCLUDE attributes. + * + * It is used to form tuples during tuple adjustment and page split. + * B-tree creates shortened tuple descriptor for every truncated tuple, + * because it is doing this less often: it does not have to form truncated + * tuples during page split. Also, B-tree is not adjusting tuples on + * internal pages the way GiST does. + */ + giststate->nonLeafTupdesc = CreateTupleDescCopyConstr(index->rd_att); + giststate->nonLeafTupdesc->natts = + IndexRelationGetNumberOfKeyAttributes(index); + + for (i = 0; i < IndexRelationGetNumberOfKeyAttributes(index); i++) + { + fmgr_info_copy(&(giststate->consistentFn[i]), + index_getprocinfo(index, i + 1, GIST_CONSISTENT_PROC), + scanCxt); + fmgr_info_copy(&(giststate->unionFn[i]), + index_getprocinfo(index, i + 1, GIST_UNION_PROC), + scanCxt); + + /* opclasses are not required to provide a Compress method */ + if (OidIsValid(index_getprocid(index, i + 1, GIST_COMPRESS_PROC))) + fmgr_info_copy(&(giststate->compressFn[i]), + index_getprocinfo(index, i + 1, GIST_COMPRESS_PROC), + scanCxt); + else + giststate->compressFn[i].fn_oid = InvalidOid; + + /* opclasses are not required to provide a Decompress method */ + if (OidIsValid(index_getprocid(index, i + 1, GIST_DECOMPRESS_PROC))) + fmgr_info_copy(&(giststate->decompressFn[i]), + index_getprocinfo(index, i + 1, GIST_DECOMPRESS_PROC), + scanCxt); + else + giststate->decompressFn[i].fn_oid = InvalidOid; + + fmgr_info_copy(&(giststate->penaltyFn[i]), + index_getprocinfo(index, i + 1, GIST_PENALTY_PROC), + scanCxt); + fmgr_info_copy(&(giststate->picksplitFn[i]), + index_getprocinfo(index, i + 1, GIST_PICKSPLIT_PROC), + scanCxt); + fmgr_info_copy(&(giststate->equalFn[i]), + index_getprocinfo(index, i + 1, GIST_EQUAL_PROC), + scanCxt); + + /* opclasses are not required to provide a Distance method */ + if (OidIsValid(index_getprocid(index, i + 1, GIST_DISTANCE_PROC))) + fmgr_info_copy(&(giststate->distanceFn[i]), + index_getprocinfo(index, i + 1, GIST_DISTANCE_PROC), + scanCxt); + else + giststate->distanceFn[i].fn_oid = InvalidOid; + + /* opclasses are not required to provide a Fetch method */ + if (OidIsValid(index_getprocid(index, i + 1, GIST_FETCH_PROC))) + fmgr_info_copy(&(giststate->fetchFn[i]), + index_getprocinfo(index, i + 1, GIST_FETCH_PROC), + scanCxt); + else + giststate->fetchFn[i].fn_oid = InvalidOid; + + /* + * If the index column has a specified collation, we should honor that + * while doing comparisons. However, we may have a collatable storage + * type for a noncollatable indexed data type. If there's no index + * collation then specify default collation in case the support + * functions need collation. 
This is harmless if the support + * functions don't care about collation, so we just do it + * unconditionally. (We could alternatively call get_typcollation, + * but that seems like expensive overkill --- there aren't going to be + * any cases where a GiST storage type has a nondefault collation.) + */ + if (OidIsValid(index->rd_indcollation[i])) + giststate->supportCollation[i] = index->rd_indcollation[i]; + else + giststate->supportCollation[i] = DEFAULT_COLLATION_OID; + } + + /* No opclass information for INCLUDE attributes */ + for (; i < index->rd_att->natts; i++) + { + giststate->consistentFn[i].fn_oid = InvalidOid; + giststate->unionFn[i].fn_oid = InvalidOid; + giststate->compressFn[i].fn_oid = InvalidOid; + giststate->decompressFn[i].fn_oid = InvalidOid; + giststate->penaltyFn[i].fn_oid = InvalidOid; + giststate->picksplitFn[i].fn_oid = InvalidOid; + giststate->equalFn[i].fn_oid = InvalidOid; + giststate->distanceFn[i].fn_oid = InvalidOid; + giststate->fetchFn[i].fn_oid = InvalidOid; + giststate->supportCollation[i] = InvalidOid; + } + + MemoryContextSwitchTo(oldCxt); + + return giststate; +} + +void +freeGISTstate(GISTSTATE *giststate) +{ + /* It's sufficient to delete the scanCxt */ + MemoryContextDelete(giststate->scanCxt); +} + +/* + * gistprunepage() -- try to remove LP_DEAD items from the given page. + * Function assumes that buffer is exclusively locked. + */ +static void +gistprunepage(Relation rel, Page page, Buffer buffer, Relation heapRel) +{ + OffsetNumber deletable[MaxIndexTuplesPerPage]; + int ndeletable = 0; + OffsetNumber offnum, + maxoff; + + Assert(GistPageIsLeaf(page)); + + /* + * Scan over all items to see which ones need to be deleted according to + * LP_DEAD flags. + */ + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemId = PageGetItemId(page, offnum); + + if (ItemIdIsDead(itemId)) + deletable[ndeletable++] = offnum; + } + + if (ndeletable > 0) + { + TransactionId latestRemovedXid = InvalidTransactionId; + + if (XLogStandbyInfoActive() && RelationNeedsWAL(rel)) + latestRemovedXid = + index_compute_xid_horizon_for_tuples(rel, heapRel, buffer, + deletable, ndeletable); + + START_CRIT_SECTION(); + + PageIndexMultiDelete(page, deletable, ndeletable); + + /* + * Mark the page as not containing any LP_DEAD items. This is not + * certainly true (there might be some that have recently been marked, + * but weren't included in our target-item list), but it will almost + * always be true and it doesn't seem worth an additional page scan to + * check it. Remember that F_HAS_GARBAGE is only a hint anyway. + */ + GistClearPageHasGarbage(page); + + MarkBufferDirty(buffer); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + + recptr = gistXLogDelete(buffer, + deletable, ndeletable, + latestRemovedXid); + + PageSetLSN(page, recptr); + } + else + PageSetLSN(page, gistGetFakeLSN(rel)); + + END_CRIT_SECTION(); + } + + /* + * Note: if we didn't find any LP_DEAD items, then the page's + * F_HAS_GARBAGE hint bit is falsely set. We do not bother expending a + * separate write to clear it, however. We will clear it when we split + * the page. + */ +} |