diff options
Diffstat (limited to 'src/backend/access/gin/ginentrypage.c')
-rw-r--r-- | src/backend/access/gin/ginentrypage.c | 772 |
1 files changed, 772 insertions, 0 deletions
diff --git a/src/backend/access/gin/ginentrypage.c b/src/backend/access/gin/ginentrypage.c new file mode 100644 index 0000000..29c36bc --- /dev/null +++ b/src/backend/access/gin/ginentrypage.c @@ -0,0 +1,772 @@ +/*------------------------------------------------------------------------- + * + * ginentrypage.c + * routines for handling GIN entry tree pages. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/ginentrypage.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/gin_private.h" +#include "access/ginxlog.h" +#include "access/xloginsert.h" +#include "miscadmin.h" +#include "utils/rel.h" + +static void entrySplitPage(GinBtree btree, Buffer origbuf, + GinBtreeStack *stack, + GinBtreeEntryInsertData *insertData, + BlockNumber updateblkno, + Page *newlpage, Page *newrpage); + +/* + * Form a tuple for entry tree. + * + * If the tuple would be too big to be stored, function throws a suitable + * error if errorTooBig is true, or returns NULL if errorTooBig is false. + * + * See src/backend/access/gin/README for a description of the index tuple + * format that is being built here. We build on the assumption that we + * are making a leaf-level key entry containing a posting list of nipd items. + * If the caller is actually trying to make a posting-tree entry, non-leaf + * entry, or pending-list entry, it should pass dataSize = 0 and then overwrite + * the t_tid fields as necessary. In any case, 'data' can be NULL to skip + * filling in the posting list; the caller is responsible for filling it + * afterwards if data = NULL and nipd > 0. + */ +IndexTuple +GinFormTuple(GinState *ginstate, + OffsetNumber attnum, Datum key, GinNullCategory category, + Pointer data, Size dataSize, int nipd, + bool errorTooBig) +{ + Datum datums[2]; + bool isnull[2]; + IndexTuple itup; + uint32 newsize; + + /* Build the basic tuple: optional column number, plus key datum */ + if (ginstate->oneCol) + { + datums[0] = key; + isnull[0] = (category != GIN_CAT_NORM_KEY); + } + else + { + datums[0] = UInt16GetDatum(attnum); + isnull[0] = false; + datums[1] = key; + isnull[1] = (category != GIN_CAT_NORM_KEY); + } + + itup = index_form_tuple(ginstate->tupdesc[attnum - 1], datums, isnull); + + /* + * Determine and store offset to the posting list, making sure there is + * room for the category byte if needed. + * + * Note: because index_form_tuple MAXALIGNs the tuple size, there may well + * be some wasted pad space. Is it worth recomputing the data length to + * prevent that? That would also allow us to Assert that the real data + * doesn't overlap the GinNullCategory byte, which this code currently + * takes on faith. + */ + newsize = IndexTupleSize(itup); + + if (IndexTupleHasNulls(itup)) + { + uint32 minsize; + + Assert(category != GIN_CAT_NORM_KEY); + minsize = GinCategoryOffset(itup, ginstate) + sizeof(GinNullCategory); + newsize = Max(newsize, minsize); + } + + newsize = SHORTALIGN(newsize); + + GinSetPostingOffset(itup, newsize); + GinSetNPosting(itup, nipd); + + /* + * Add space needed for posting list, if any. Then check that the tuple + * won't be too big to store. + */ + newsize += dataSize; + + newsize = MAXALIGN(newsize); + + if (newsize > GinMaxItemSize) + { + if (errorTooBig) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("index row size %zu exceeds maximum %zu for index \"%s\"", + (Size) newsize, (Size) GinMaxItemSize, + RelationGetRelationName(ginstate->index)))); + pfree(itup); + return NULL; + } + + /* + * Resize tuple if needed + */ + if (newsize != IndexTupleSize(itup)) + { + itup = repalloc(itup, newsize); + + /* + * PostgreSQL 9.3 and earlier did not clear this new space, so we + * might find uninitialized padding when reading tuples from disk. + */ + memset((char *) itup + IndexTupleSize(itup), + 0, newsize - IndexTupleSize(itup)); + /* set new size in tuple header */ + itup->t_info &= ~INDEX_SIZE_MASK; + itup->t_info |= newsize; + } + + /* + * Copy in the posting list, if provided + */ + if (data) + { + char *ptr = GinGetPosting(itup); + + memcpy(ptr, data, dataSize); + } + + /* + * Insert category byte, if needed + */ + if (category != GIN_CAT_NORM_KEY) + { + Assert(IndexTupleHasNulls(itup)); + GinSetNullCategory(itup, ginstate, category); + } + return itup; +} + +/* + * Read item pointers from leaf entry tuple. + * + * Returns a palloc'd array of ItemPointers. The number of items is returned + * in *nitems. + */ +ItemPointer +ginReadTuple(GinState *ginstate, OffsetNumber attnum, IndexTuple itup, + int *nitems) +{ + Pointer ptr = GinGetPosting(itup); + int nipd = GinGetNPosting(itup); + ItemPointer ipd; + int ndecoded; + + if (GinItupIsCompressed(itup)) + { + if (nipd > 0) + { + ipd = ginPostingListDecode((GinPostingList *) ptr, &ndecoded); + if (nipd != ndecoded) + elog(ERROR, "number of items mismatch in GIN entry tuple, %d in tuple header, %d decoded", + nipd, ndecoded); + } + else + { + ipd = palloc(0); + } + } + else + { + ipd = (ItemPointer) palloc(sizeof(ItemPointerData) * nipd); + memcpy(ipd, ptr, sizeof(ItemPointerData) * nipd); + } + *nitems = nipd; + return ipd; +} + +/* + * Form a non-leaf entry tuple by copying the key data from the given tuple, + * which can be either a leaf or non-leaf entry tuple. + * + * Any posting list in the source tuple is not copied. The specified child + * block number is inserted into t_tid. + */ +static IndexTuple +GinFormInteriorTuple(IndexTuple itup, Page page, BlockNumber childblk) +{ + IndexTuple nitup; + + if (GinPageIsLeaf(page) && !GinIsPostingTree(itup)) + { + /* Tuple contains a posting list, just copy stuff before that */ + uint32 origsize = GinGetPostingOffset(itup); + + origsize = MAXALIGN(origsize); + nitup = (IndexTuple) palloc(origsize); + memcpy(nitup, itup, origsize); + /* ... be sure to fix the size header field ... */ + nitup->t_info &= ~INDEX_SIZE_MASK; + nitup->t_info |= origsize; + } + else + { + /* Copy the tuple as-is */ + nitup = (IndexTuple) palloc(IndexTupleSize(itup)); + memcpy(nitup, itup, IndexTupleSize(itup)); + } + + /* Now insert the correct downlink */ + GinSetDownlink(nitup, childblk); + + return nitup; +} + +/* + * Entry tree is a "static", ie tuple never deletes from it, + * so we don't use right bound, we use rightmost key instead. + */ +static IndexTuple +getRightMostTuple(Page page) +{ + OffsetNumber maxoff = PageGetMaxOffsetNumber(page); + + return (IndexTuple) PageGetItem(page, PageGetItemId(page, maxoff)); +} + +static bool +entryIsMoveRight(GinBtree btree, Page page) +{ + IndexTuple itup; + OffsetNumber attnum; + Datum key; + GinNullCategory category; + + if (GinPageRightMost(page)) + return false; + + itup = getRightMostTuple(page); + attnum = gintuple_get_attrnum(btree->ginstate, itup); + key = gintuple_get_key(btree->ginstate, itup, &category); + + if (ginCompareAttEntries(btree->ginstate, + btree->entryAttnum, btree->entryKey, btree->entryCategory, + attnum, key, category) > 0) + return true; + + return false; +} + +/* + * Find correct tuple in non-leaf page. It supposed that + * page correctly chosen and searching value SHOULD be on page + */ +static BlockNumber +entryLocateEntry(GinBtree btree, GinBtreeStack *stack) +{ + OffsetNumber low, + high, + maxoff; + IndexTuple itup = NULL; + int result; + Page page = BufferGetPage(stack->buffer); + + Assert(!GinPageIsLeaf(page)); + Assert(!GinPageIsData(page)); + + if (btree->fullScan) + { + stack->off = FirstOffsetNumber; + stack->predictNumber *= PageGetMaxOffsetNumber(page); + return btree->getLeftMostChild(btree, page); + } + + low = FirstOffsetNumber; + maxoff = high = PageGetMaxOffsetNumber(page); + Assert(high >= low); + + high++; + + while (high > low) + { + OffsetNumber mid = low + ((high - low) / 2); + + if (mid == maxoff && GinPageRightMost(page)) + { + /* Right infinity */ + result = -1; + } + else + { + OffsetNumber attnum; + Datum key; + GinNullCategory category; + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, mid)); + attnum = gintuple_get_attrnum(btree->ginstate, itup); + key = gintuple_get_key(btree->ginstate, itup, &category); + result = ginCompareAttEntries(btree->ginstate, + btree->entryAttnum, + btree->entryKey, + btree->entryCategory, + attnum, key, category); + } + + if (result == 0) + { + stack->off = mid; + Assert(GinGetDownlink(itup) != GIN_ROOT_BLKNO); + return GinGetDownlink(itup); + } + else if (result > 0) + low = mid + 1; + else + high = mid; + } + + Assert(high >= FirstOffsetNumber && high <= maxoff); + + stack->off = high; + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, high)); + Assert(GinGetDownlink(itup) != GIN_ROOT_BLKNO); + return GinGetDownlink(itup); +} + +/* + * Searches correct position for value on leaf page. + * Page should be correctly chosen. + * Returns true if value found on page. + */ +static bool +entryLocateLeafEntry(GinBtree btree, GinBtreeStack *stack) +{ + Page page = BufferGetPage(stack->buffer); + OffsetNumber low, + high; + + Assert(GinPageIsLeaf(page)); + Assert(!GinPageIsData(page)); + + if (btree->fullScan) + { + stack->off = FirstOffsetNumber; + return true; + } + + low = FirstOffsetNumber; + high = PageGetMaxOffsetNumber(page); + + if (high < low) + { + stack->off = FirstOffsetNumber; + return false; + } + + high++; + + while (high > low) + { + OffsetNumber mid = low + ((high - low) / 2); + IndexTuple itup; + OffsetNumber attnum; + Datum key; + GinNullCategory category; + int result; + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, mid)); + attnum = gintuple_get_attrnum(btree->ginstate, itup); + key = gintuple_get_key(btree->ginstate, itup, &category); + result = ginCompareAttEntries(btree->ginstate, + btree->entryAttnum, + btree->entryKey, + btree->entryCategory, + attnum, key, category); + if (result == 0) + { + stack->off = mid; + return true; + } + else if (result > 0) + low = mid + 1; + else + high = mid; + } + + stack->off = high; + return false; +} + +static OffsetNumber +entryFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber storedOff) +{ + OffsetNumber i, + maxoff = PageGetMaxOffsetNumber(page); + IndexTuple itup; + + Assert(!GinPageIsLeaf(page)); + Assert(!GinPageIsData(page)); + + /* if page isn't changed, we returns storedOff */ + if (storedOff >= FirstOffsetNumber && storedOff <= maxoff) + { + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, storedOff)); + if (GinGetDownlink(itup) == blkno) + return storedOff; + + /* + * we hope, that needed pointer goes to right. It's true if there + * wasn't a deletion + */ + for (i = storedOff + 1; i <= maxoff; i++) + { + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); + if (GinGetDownlink(itup) == blkno) + return i; + } + maxoff = storedOff - 1; + } + + /* last chance */ + for (i = FirstOffsetNumber; i <= maxoff; i++) + { + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); + if (GinGetDownlink(itup) == blkno) + return i; + } + + return InvalidOffsetNumber; +} + +static BlockNumber +entryGetLeftMostPage(GinBtree btree, Page page) +{ + IndexTuple itup; + + Assert(!GinPageIsLeaf(page)); + Assert(!GinPageIsData(page)); + Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber); + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, FirstOffsetNumber)); + return GinGetDownlink(itup); +} + +static bool +entryIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off, + GinBtreeEntryInsertData *insertData) +{ + Size releasedsz = 0; + Size addedsz; + Page page = BufferGetPage(buf); + + Assert(insertData->entry); + Assert(!GinPageIsData(page)); + + if (insertData->isDelete) + { + IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off)); + + releasedsz = MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData); + } + + addedsz = MAXALIGN(IndexTupleSize(insertData->entry)) + sizeof(ItemIdData); + + if (PageGetFreeSpace(page) + releasedsz >= addedsz) + return true; + + return false; +} + +/* + * Delete tuple on leaf page if tuples existed and we + * should update it, update old child blkno to new right page + * if child split occurred + */ +static void +entryPreparePage(GinBtree btree, Page page, OffsetNumber off, + GinBtreeEntryInsertData *insertData, BlockNumber updateblkno) +{ + Assert(insertData->entry); + Assert(!GinPageIsData(page)); + + if (insertData->isDelete) + { + Assert(GinPageIsLeaf(page)); + PageIndexTupleDelete(page, off); + } + + if (!GinPageIsLeaf(page) && updateblkno != InvalidBlockNumber) + { + IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off)); + + GinSetDownlink(itup, updateblkno); + } +} + +/* + * Prepare to insert data on an entry page. + * + * If it will fit, return GPTP_INSERT after doing whatever setup is needed + * before we enter the insertion critical section. *ptp_workspace can be + * set to pass information along to the execPlaceToPage function. + * + * If it won't fit, perform a page split and return two temporary page + * images into *newlpage and *newrpage, with result GPTP_SPLIT. + * + * In neither case should the given page buffer be modified here. + * + * Note: on insertion to an internal node, in addition to inserting the given + * item, the downlink of the existing item at stack->off will be updated to + * point to updateblkno. + */ +static GinPlaceToPageRC +entryBeginPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertPayload, BlockNumber updateblkno, + void **ptp_workspace, + Page *newlpage, Page *newrpage) +{ + GinBtreeEntryInsertData *insertData = insertPayload; + OffsetNumber off = stack->off; + + /* If it doesn't fit, deal with split case */ + if (!entryIsEnoughSpace(btree, buf, off, insertData)) + { + entrySplitPage(btree, buf, stack, insertData, updateblkno, + newlpage, newrpage); + return GPTP_SPLIT; + } + + /* Else, we're ready to proceed with insertion */ + return GPTP_INSERT; +} + +/* + * Perform data insertion after beginPlaceToPage has decided it will fit. + * + * This is invoked within a critical section, and XLOG record creation (if + * needed) is already started. The target buffer is registered in slot 0. + */ +static void +entryExecPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertPayload, BlockNumber updateblkno, + void *ptp_workspace) +{ + GinBtreeEntryInsertData *insertData = insertPayload; + Page page = BufferGetPage(buf); + OffsetNumber off = stack->off; + OffsetNumber placed; + + entryPreparePage(btree, page, off, insertData, updateblkno); + + placed = PageAddItem(page, + (Item) insertData->entry, + IndexTupleSize(insertData->entry), + off, false, false); + if (placed != off) + elog(ERROR, "failed to add item to index page in \"%s\"", + RelationGetRelationName(btree->index)); + + if (RelationNeedsWAL(btree->index) && !btree->isBuild) + { + /* + * This must be static, because it has to survive until XLogInsert, + * and we can't palloc here. Ugly, but the XLogInsert infrastructure + * isn't reentrant anyway. + */ + static ginxlogInsertEntry data; + + data.isDelete = insertData->isDelete; + data.offset = off; + + XLogRegisterBufData(0, (char *) &data, + offsetof(ginxlogInsertEntry, tuple)); + XLogRegisterBufData(0, (char *) insertData->entry, + IndexTupleSize(insertData->entry)); + } +} + +/* + * Split entry page and insert new data. + * + * Returns new temp pages to *newlpage and *newrpage. + * The original buffer is left untouched. + */ +static void +entrySplitPage(GinBtree btree, Buffer origbuf, + GinBtreeStack *stack, + GinBtreeEntryInsertData *insertData, + BlockNumber updateblkno, + Page *newlpage, Page *newrpage) +{ + OffsetNumber off = stack->off; + OffsetNumber i, + maxoff, + separator = InvalidOffsetNumber; + Size totalsize = 0; + Size lsize = 0, + size; + char *ptr; + IndexTuple itup; + Page page; + Page lpage = PageGetTempPageCopy(BufferGetPage(origbuf)); + Page rpage = PageGetTempPageCopy(BufferGetPage(origbuf)); + Size pageSize = PageGetPageSize(lpage); + PGAlignedBlock tupstore[2]; /* could need 2 pages' worth of tuples */ + + entryPreparePage(btree, lpage, off, insertData, updateblkno); + + /* + * First, append all the existing tuples and the new tuple we're inserting + * one after another in a temporary workspace. + */ + maxoff = PageGetMaxOffsetNumber(lpage); + ptr = tupstore[0].data; + for (i = FirstOffsetNumber; i <= maxoff; i++) + { + if (i == off) + { + size = MAXALIGN(IndexTupleSize(insertData->entry)); + memcpy(ptr, insertData->entry, size); + ptr += size; + totalsize += size + sizeof(ItemIdData); + } + + itup = (IndexTuple) PageGetItem(lpage, PageGetItemId(lpage, i)); + size = MAXALIGN(IndexTupleSize(itup)); + memcpy(ptr, itup, size); + ptr += size; + totalsize += size + sizeof(ItemIdData); + } + + if (off == maxoff + 1) + { + size = MAXALIGN(IndexTupleSize(insertData->entry)); + memcpy(ptr, insertData->entry, size); + ptr += size; + totalsize += size + sizeof(ItemIdData); + } + + /* + * Initialize the left and right pages, and copy all the tuples back to + * them. + */ + GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize); + GinInitPage(lpage, GinPageGetOpaque(rpage)->flags, pageSize); + + ptr = tupstore[0].data; + maxoff++; + lsize = 0; + + page = lpage; + for (i = FirstOffsetNumber; i <= maxoff; i++) + { + itup = (IndexTuple) ptr; + + /* + * Decide where to split. We try to equalize the pages' total data + * size, not number of tuples. + */ + if (lsize > totalsize / 2) + { + if (separator == InvalidOffsetNumber) + separator = i - 1; + page = rpage; + } + else + { + lsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData); + } + + if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add item to index page in \"%s\"", + RelationGetRelationName(btree->index)); + ptr += MAXALIGN(IndexTupleSize(itup)); + } + + /* return temp pages to caller */ + *newlpage = lpage; + *newrpage = rpage; +} + +/* + * Construct insertion payload for inserting the downlink for given buffer. + */ +static void * +entryPrepareDownlink(GinBtree btree, Buffer lbuf) +{ + GinBtreeEntryInsertData *insertData; + Page lpage = BufferGetPage(lbuf); + BlockNumber lblkno = BufferGetBlockNumber(lbuf); + IndexTuple itup; + + itup = getRightMostTuple(lpage); + + insertData = palloc(sizeof(GinBtreeEntryInsertData)); + insertData->entry = GinFormInteriorTuple(itup, lpage, lblkno); + insertData->isDelete = false; + + return insertData; +} + +/* + * Fills new root by rightest values from child. + * Also called from ginxlog, should not use btree + */ +void +ginEntryFillRoot(GinBtree btree, Page root, + BlockNumber lblkno, Page lpage, + BlockNumber rblkno, Page rpage) +{ + IndexTuple itup; + + itup = GinFormInteriorTuple(getRightMostTuple(lpage), lpage, lblkno); + if (PageAddItem(root, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add item to index root page"); + pfree(itup); + + itup = GinFormInteriorTuple(getRightMostTuple(rpage), rpage, rblkno); + if (PageAddItem(root, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add item to index root page"); + pfree(itup); +} + +/* + * Set up GinBtree for entry page access + * + * Note: during WAL recovery, there may be no valid data in ginstate + * other than a faked-up Relation pointer; the key datum is bogus too. + */ +void +ginPrepareEntryScan(GinBtree btree, OffsetNumber attnum, + Datum key, GinNullCategory category, + GinState *ginstate) +{ + memset(btree, 0, sizeof(GinBtreeData)); + + btree->index = ginstate->index; + btree->rootBlkno = GIN_ROOT_BLKNO; + btree->ginstate = ginstate; + + btree->findChildPage = entryLocateEntry; + btree->getLeftMostChild = entryGetLeftMostPage; + btree->isMoveRight = entryIsMoveRight; + btree->findItem = entryLocateLeafEntry; + btree->findChildPtr = entryFindChildPtr; + btree->beginPlaceToPage = entryBeginPlaceToPage; + btree->execPlaceToPage = entryExecPlaceToPage; + btree->fillRoot = ginEntryFillRoot; + btree->prepareDownlink = entryPrepareDownlink; + + btree->isData = false; + btree->fullScan = false; + btree->isBuild = false; + + btree->entryAttnum = attnum; + btree->entryKey = key; + btree->entryCategory = category; +} |