/*------------------------------------------------------------------------- * * gindatapage.c * routines for handling GIN posting tree pages. * * * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION * src/backend/access/gin/gindatapage.c *------------------------------------------------------------------------- */ #include "postgres.h" #include "access/gin_private.h" #include "access/ginxlog.h" #include "access/xloginsert.h" #include "lib/ilist.h" #include "miscadmin.h" #include "storage/predicate.h" #include "utils/rel.h" /* * Min, Max and Target size of posting lists stored on leaf pages, in bytes. * * The code can deal with any size, but random access is more efficient when * a number of smaller lists are stored, rather than one big list. If a * posting list would become larger than Max size as a result of insertions, * it is split into two. If a posting list would be smaller than minimum * size, it is merged with the next posting list. */ #define GinPostingListSegmentMaxSize 384 #define GinPostingListSegmentTargetSize 256 #define GinPostingListSegmentMinSize 128 /* * At least this many items fit in a GinPostingListSegmentMaxSize-bytes * long segment. This is used when estimating how much space is required * for N items, at minimum. */ #define MinTuplesPerSegment ((GinPostingListSegmentMaxSize - 2) / 6) /* * A working struct for manipulating a posting tree leaf page. */ typedef struct { dlist_head segments; /* a list of leafSegmentInfos */ /* * The following fields represent how the segments are split across pages, * if a page split is required. Filled in by leafRepackItems. */ dlist_node *lastleft; /* last segment on left page */ int lsize; /* total size on left page */ int rsize; /* total size on right page */ bool oldformat; /* page is in pre-9.4 format on disk */ /* * If we need WAL data representing the reconstructed leaf page, it's * stored here by computeLeafRecompressWALData. */ char *walinfo; /* buffer start */ int walinfolen; /* and length */ } disassembledLeaf; typedef struct { dlist_node node; /* linked list pointers */ /*------------- * 'action' indicates the status of this in-memory segment, compared to * what's on disk. It is one of the GIN_SEGMENT_* action codes: * * UNMODIFIED no changes * DELETE the segment is to be removed. 'seg' and 'items' are * ignored * INSERT this is a completely new segment * REPLACE this replaces an existing segment with new content * ADDITEMS like REPLACE, but no items have been removed, and we track * in detail what items have been added to this segment, in * 'modifieditems' *------------- */ char action; ItemPointerData *modifieditems; uint16 nmodifieditems; /* * The following fields represent the items in this segment. If 'items' is * not NULL, it contains a palloc'd array of the items in this segment. If * 'seg' is not NULL, it contains the items in an already-compressed * format. It can point to an on-disk page (!modified), or a palloc'd * segment in memory. If both are set, they must represent the same items. */ GinPostingList *seg; ItemPointer items; int nitems; /* # of items in 'items', if items != NULL */ } leafSegmentInfo; static ItemPointer dataLeafPageGetUncompressed(Page page, int *nitems); static void dataSplitPageInternal(GinBtree btree, Buffer origbuf, GinBtreeStack *stack, void *insertdata, BlockNumber updateblkno, Page *newlpage, Page *newrpage); static disassembledLeaf *disassembleLeaf(Page page); static bool leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining); static bool addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems); static void computeLeafRecompressWALData(disassembledLeaf *leaf); static void dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf); static void dataPlaceToPageLeafSplit(disassembledLeaf *leaf, ItemPointerData lbound, ItemPointerData rbound, Page lpage, Page rpage); /* * Read TIDs from leaf data page to single uncompressed array. The TIDs are * returned in ascending order. * * advancePast is a hint, indicating that the caller is only interested in * TIDs > advancePast. To return all items, use ItemPointerSetMin. * * Note: This function can still return items smaller than advancePast that * are in the same posting list as the items of interest, so the caller must * still check all the returned items. But passing it allows this function to * skip whole posting lists. */ ItemPointer GinDataLeafPageGetItems(Page page, int *nitems, ItemPointerData advancePast) { ItemPointer result; if (GinPageIsCompressed(page)) { GinPostingList *seg = GinDataLeafPageGetPostingList(page); Size len = GinDataLeafPageGetPostingListSize(page); Pointer endptr = ((Pointer) seg) + len; GinPostingList *next; /* Skip to the segment containing advancePast+1 */ if (ItemPointerIsValid(&advancePast)) { next = GinNextPostingListSegment(seg); while ((Pointer) next < endptr && ginCompareItemPointers(&next->first, &advancePast) <= 0) { seg = next; next = GinNextPostingListSegment(seg); } len = endptr - (Pointer) seg; } if (len > 0) result = ginPostingListDecodeAllSegments(seg, len, nitems); else { result = NULL; *nitems = 0; } } else { ItemPointer tmp = dataLeafPageGetUncompressed(page, nitems); result = palloc((*nitems) * sizeof(ItemPointerData)); memcpy(result, tmp, (*nitems) * sizeof(ItemPointerData)); } return result; } /* * Places all TIDs from leaf data page to bitmap. */ int GinDataLeafPageGetItemsToTbm(Page page, TIDBitmap *tbm) { ItemPointer uncompressed; int nitems; if (GinPageIsCompressed(page)) { GinPostingList *segment = GinDataLeafPageGetPostingList(page); Size len = GinDataLeafPageGetPostingListSize(page); nitems = ginPostingListDecodeAllSegmentsToTbm(segment, len, tbm); } else { uncompressed = dataLeafPageGetUncompressed(page, &nitems); if (nitems > 0) tbm_add_tuples(tbm, uncompressed, nitems, false); } return nitems; } /* * Get pointer to the uncompressed array of items on a pre-9.4 format * uncompressed leaf page. The number of items in the array is returned in * *nitems. */ static ItemPointer dataLeafPageGetUncompressed(Page page, int *nitems) { ItemPointer items; Assert(!GinPageIsCompressed(page)); /* * In the old pre-9.4 page format, the whole page content is used for * uncompressed items, and the number of items is stored in 'maxoff' */ items = (ItemPointer) GinDataPageGetData(page); *nitems = GinPageGetOpaque(page)->maxoff; return items; } /* * Check if we should follow the right link to find the item we're searching * for. * * Compares inserting item pointer with the right bound of the current page. */ static bool dataIsMoveRight(GinBtree btree, Page page) { ItemPointer iptr = GinDataPageGetRightBound(page); if (GinPageRightMost(page)) return false; if (GinPageIsDeleted(page)) return true; return (ginCompareItemPointers(&btree->itemptr, iptr) > 0) ? true : false; } /* * Find correct PostingItem in non-leaf page. It is assumed that this is * the correct page, and the searched value SHOULD be on the page. */ static BlockNumber dataLocateItem(GinBtree btree, GinBtreeStack *stack) { OffsetNumber low, high, maxoff; PostingItem *pitem = NULL; int result; Page page = BufferGetPage(stack->buffer); Assert(!GinPageIsLeaf(page)); Assert(GinPageIsData(page)); if (btree->fullScan) { stack->off = FirstOffsetNumber; stack->predictNumber *= GinPageGetOpaque(page)->maxoff; return btree->getLeftMostChild(btree, page); } low = FirstOffsetNumber; maxoff = high = GinPageGetOpaque(page)->maxoff; Assert(high >= low); high++; while (high > low) { OffsetNumber mid = low + ((high - low) / 2); pitem = GinDataPageGetPostingItem(page, mid); if (mid == maxoff) { /* * Right infinity, page already correctly chosen with a help of * dataIsMoveRight */ result = -1; } else { pitem = GinDataPageGetPostingItem(page, mid); result = ginCompareItemPointers(&btree->itemptr, &(pitem->key)); } if (result == 0) { stack->off = mid; return PostingItemGetBlockNumber(pitem); } else if (result > 0) low = mid + 1; else high = mid; } Assert(high >= FirstOffsetNumber && high <= maxoff); stack->off = high; pitem = GinDataPageGetPostingItem(page, high); return PostingItemGetBlockNumber(pitem); } /* * Find link to blkno on non-leaf page, returns offset of PostingItem */ static OffsetNumber dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber storedOff) { OffsetNumber i, maxoff = GinPageGetOpaque(page)->maxoff; PostingItem *pitem; Assert(!GinPageIsLeaf(page)); Assert(GinPageIsData(page)); /* if page isn't changed, we return storedOff */ if (storedOff >= FirstOffsetNumber && storedOff <= maxoff) { pitem = GinDataPageGetPostingItem(page, storedOff); if (PostingItemGetBlockNumber(pitem) == blkno) return storedOff; /* * we hope, that needed pointer goes to right. It's true if there * wasn't a deletion */ for (i = storedOff + 1; i <= maxoff; i++) { pitem = GinDataPageGetPostingItem(page, i); if (PostingItemGetBlockNumber(pitem) == blkno) return i; } maxoff = storedOff - 1; } /* last chance */ for (i = FirstOffsetNumber; i <= maxoff; i++) { pitem = GinDataPageGetPostingItem(page, i); if (PostingItemGetBlockNumber(pitem) == blkno) return i; } return InvalidOffsetNumber; } /* * Return blkno of leftmost child */ static BlockNumber dataGetLeftMostPage(GinBtree btree, Page page) { PostingItem *pitem; Assert(!GinPageIsLeaf(page)); Assert(GinPageIsData(page)); Assert(GinPageGetOpaque(page)->maxoff >= FirstOffsetNumber); pitem = GinDataPageGetPostingItem(page, FirstOffsetNumber); return PostingItemGetBlockNumber(pitem); } /* * Add PostingItem to a non-leaf page. */ void GinDataPageAddPostingItem(Page page, PostingItem *data, OffsetNumber offset) { OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff; char *ptr; Assert(PostingItemGetBlockNumber(data) != InvalidBlockNumber); Assert(!GinPageIsLeaf(page)); if (offset == InvalidOffsetNumber) { ptr = (char *) GinDataPageGetPostingItem(page, maxoff + 1); } else { ptr = (char *) GinDataPageGetPostingItem(page, offset); if (offset != maxoff + 1) memmove(ptr + sizeof(PostingItem), ptr, (maxoff - offset + 1) * sizeof(PostingItem)); } memcpy(ptr, data, sizeof(PostingItem)); maxoff++; GinPageGetOpaque(page)->maxoff = maxoff; /* * Also set pd_lower to the end of the posting items, to follow the * "standard" page layout, so that we can squeeze out the unused space * from full-page images. */ GinDataPageSetDataSize(page, maxoff * sizeof(PostingItem)); } /* * Delete posting item from non-leaf page */ void GinPageDeletePostingItem(Page page, OffsetNumber offset) { OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff; Assert(!GinPageIsLeaf(page)); Assert(offset >= FirstOffsetNumber && offset <= maxoff); if (offset != maxoff) memmove(GinDataPageGetPostingItem(page, offset), GinDataPageGetPostingItem(page, offset + 1), sizeof(PostingItem) * (maxoff - offset)); maxoff--; GinPageGetOpaque(page)->maxoff = maxoff; GinDataPageSetDataSize(page, maxoff * sizeof(PostingItem)); } /* * Prepare to insert data on a leaf data page. * * If it will fit, return GPTP_INSERT after doing whatever setup is needed * before we enter the insertion critical section. *ptp_workspace can be * set to pass information along to the execPlaceToPage function. * * If it won't fit, perform a page split and return two temporary page * images into *newlpage and *newrpage, with result GPTP_SPLIT. * * In neither case should the given page buffer be modified here. */ static GinPlaceToPageRC dataBeginPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, void *insertdata, void **ptp_workspace, Page *newlpage, Page *newrpage) { GinBtreeDataLeafInsertData *items = insertdata; ItemPointer newItems = &items->items[items->curitem]; int maxitems = items->nitem - items->curitem; Page page = BufferGetPage(buf); int i; ItemPointerData rbound; ItemPointerData lbound; bool needsplit; bool append; int segsize; Size freespace; disassembledLeaf *leaf; leafSegmentInfo *lastleftinfo; ItemPointerData maxOldItem; ItemPointerData remaining; rbound = *GinDataPageGetRightBound(page); /* * Count how many of the new items belong to this page. */ if (!GinPageRightMost(page)) { for (i = 0; i < maxitems; i++) { if (ginCompareItemPointers(&newItems[i], &rbound) > 0) { /* * This needs to go to some other location in the tree. (The * caller should've chosen the insert location so that at * least the first item goes here.) */ Assert(i > 0); break; } } maxitems = i; } /* Disassemble the data on the page */ leaf = disassembleLeaf(page); /* * Are we appending to the end of the page? IOW, are all the new items * larger than any of the existing items. */ if (!dlist_is_empty(&leaf->segments)) { lastleftinfo = dlist_container(leafSegmentInfo, node, dlist_tail_node(&leaf->segments)); if (!lastleftinfo->items) lastleftinfo->items = ginPostingListDecode(lastleftinfo->seg, &lastleftinfo->nitems); maxOldItem = lastleftinfo->items[lastleftinfo->nitems - 1]; if (ginCompareItemPointers(&newItems[0], &maxOldItem) >= 0) append = true; else append = false; } else { ItemPointerSetMin(&maxOldItem); append = true; } /* * If we're appending to the end of the page, we will append as many items * as we can fit (after splitting), and stop when the pages becomes full. * Otherwise we have to limit the number of new items to insert, because * once we start packing we can't just stop when we run out of space, * because we must make sure that all the old items still fit. */ if (GinPageIsCompressed(page)) freespace = GinDataLeafPageGetFreeSpace(page); else freespace = 0; if (append) { /* * Even when appending, trying to append more items than will fit is * not completely free, because we will merge the new items and old * items into an array below. In the best case, every new item fits in * a single byte, and we can use all the free space on the old page as * well as the new page. For simplicity, ignore segment overhead etc. */ maxitems = Min(maxitems, freespace + GinDataPageMaxDataSize); } else { /* * Calculate a conservative estimate of how many new items we can fit * on the two pages after splitting. * * We can use any remaining free space on the old page to store full * segments, as well as the new page. Each full-sized segment can hold * at least MinTuplesPerSegment items */ int nnewsegments; nnewsegments = freespace / GinPostingListSegmentMaxSize; nnewsegments += GinDataPageMaxDataSize / GinPostingListSegmentMaxSize; maxitems = Min(maxitems, nnewsegments * MinTuplesPerSegment); } /* Add the new items to the segment list */ if (!addItemsToLeaf(leaf, newItems, maxitems)) { /* all items were duplicates, we have nothing to do */ items->curitem += maxitems; return GPTP_NO_WORK; } /* * Pack the items back to compressed segments, ready for writing to disk. */ needsplit = leafRepackItems(leaf, &remaining); /* * Did all the new items fit? * * If we're appending, it's OK if they didn't. But as a sanity check, * verify that all the old items fit. */ if (ItemPointerIsValid(&remaining)) { if (!append || ItemPointerCompare(&maxOldItem, &remaining) >= 0) elog(ERROR, "could not split GIN page; all old items didn't fit"); /* Count how many of the new items did fit. */ for (i = 0; i < maxitems; i++) { if (ginCompareItemPointers(&newItems[i], &remaining) >= 0) break; } if (i == 0) elog(ERROR, "could not split GIN page; no new items fit"); maxitems = i; } if (!needsplit) { /* * Great, all the items fit on a single page. If needed, prepare data * for a WAL record describing the changes we'll make. */ if (RelationNeedsWAL(btree->index) && !btree->isBuild) computeLeafRecompressWALData(leaf); /* * We're ready to enter the critical section, but * dataExecPlaceToPageLeaf will need access to the "leaf" data. */ *ptp_workspace = leaf; if (append) elog(DEBUG2, "appended %d new items to block %u; %d bytes (%d to go)", maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, items->nitem - items->curitem - maxitems); else elog(DEBUG2, "inserted %d new items to block %u; %d bytes (%d to go)", maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, items->nitem - items->curitem - maxitems); } else { /* * Have to split. * * leafRepackItems already divided the segments between the left and * the right page. It filled the left page as full as possible, and * put the rest to the right page. When building a new index, that's * good, because the table is scanned from beginning to end and there * won't be any more insertions to the left page during the build. * This packs the index as tight as possible. But otherwise, split * 50/50, by moving segments from the left page to the right page * until they're balanced. * * As a further heuristic, when appending items to the end of the * page, try to make the left page 75% full, on the assumption that * subsequent insertions will probably also go to the end. This packs * the index somewhat tighter when appending to a table, which is very * common. */ if (!btree->isBuild) { while (dlist_has_prev(&leaf->segments, leaf->lastleft)) { lastleftinfo = dlist_container(leafSegmentInfo, node, leaf->lastleft); /* ignore deleted segments */ if (lastleftinfo->action != GIN_SEGMENT_DELETE) { segsize = SizeOfGinPostingList(lastleftinfo->seg); /* * Note that we check that the right page doesn't become * more full than the left page even when appending. It's * possible that we added enough items to make both pages * more than 75% full. */ if ((leaf->lsize - segsize) - (leaf->rsize + segsize) < 0) break; if (append) { if ((leaf->lsize - segsize) < (BLCKSZ * 3) / 4) break; } leaf->lsize -= segsize; leaf->rsize += segsize; } leaf->lastleft = dlist_prev_node(&leaf->segments, leaf->lastleft); } } Assert(leaf->lsize <= GinDataPageMaxDataSize); Assert(leaf->rsize <= GinDataPageMaxDataSize); /* * Fetch the max item in the left page's last segment; it becomes the * right bound of the page. */ lastleftinfo = dlist_container(leafSegmentInfo, node, leaf->lastleft); if (!lastleftinfo->items) lastleftinfo->items = ginPostingListDecode(lastleftinfo->seg, &lastleftinfo->nitems); lbound = lastleftinfo->items[lastleftinfo->nitems - 1]; /* * Now allocate a couple of temporary page images, and fill them. */ *newlpage = palloc(BLCKSZ); *newrpage = palloc(BLCKSZ); dataPlaceToPageLeafSplit(leaf, lbound, rbound, *newlpage, *newrpage); Assert(GinPageRightMost(page) || ginCompareItemPointers(GinDataPageGetRightBound(*newlpage), GinDataPageGetRightBound(*newrpage)) < 0); if (append) elog(DEBUG2, "appended %d items to block %u; split %d/%d (%d to go)", maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, (int) leaf->rsize, items->nitem - items->curitem - maxitems); else elog(DEBUG2, "inserted %d items to block %u; split %d/%d (%d to go)", maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, (int) leaf->rsize, items->nitem - items->curitem - maxitems); } items->curitem += maxitems; return needsplit ? GPTP_SPLIT : GPTP_INSERT; } /* * Perform data insertion after beginPlaceToPage has decided it will fit. * * This is invoked within a critical section, and XLOG record creation (if * needed) is already started. The target buffer is registered in slot 0. */ static void dataExecPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, void *insertdata, void *ptp_workspace) { disassembledLeaf *leaf = (disassembledLeaf *) ptp_workspace; /* Apply changes to page */ dataPlaceToPageLeafRecompress(buf, leaf); /* If needed, register WAL data built by computeLeafRecompressWALData */ if (RelationNeedsWAL(btree->index) && !btree->isBuild) { XLogRegisterBufData(0, leaf->walinfo, leaf->walinfolen); } } /* * Vacuum a posting tree leaf page. */ void ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs) { Page page = BufferGetPage(buffer); disassembledLeaf *leaf; bool removedsomething = false; dlist_iter iter; leaf = disassembleLeaf(page); /* Vacuum each segment. */ dlist_foreach(iter, &leaf->segments) { leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur); int oldsegsize; ItemPointer cleaned; int ncleaned; if (!seginfo->items) seginfo->items = ginPostingListDecode(seginfo->seg, &seginfo->nitems); if (seginfo->seg) oldsegsize = SizeOfGinPostingList(seginfo->seg); else oldsegsize = GinDataPageMaxDataSize; cleaned = ginVacuumItemPointers(gvs, seginfo->items, seginfo->nitems, &ncleaned); pfree(seginfo->items); seginfo->items = NULL; seginfo->nitems = 0; if (cleaned) { if (ncleaned > 0) { int npacked; seginfo->seg = ginCompressPostingList(cleaned, ncleaned, oldsegsize, &npacked); /* Removing an item never increases the size of the segment */ if (npacked != ncleaned) elog(ERROR, "could not fit vacuumed posting list"); seginfo->action = GIN_SEGMENT_REPLACE; } else { seginfo->seg = NULL; seginfo->items = NULL; seginfo->action = GIN_SEGMENT_DELETE; } seginfo->nitems = ncleaned; removedsomething = true; } } /* * If we removed any items, reconstruct the page from the pieces. * * We don't try to re-encode the segments here, even though some of them * might be really small now that we've removed some items from them. It * seems like a waste of effort, as there isn't really any benefit from * larger segments per se; larger segments only help to pack more items in * the same space. We might as well delay doing that until the next * insertion, which will need to re-encode at least part of the page * anyway. * * Also note if the page was in uncompressed, pre-9.4 format before, it is * now represented as one huge segment that contains all the items. It * might make sense to split that, to speed up random access, but we don't * bother. You'll have to REINDEX anyway if you want the full gain of the * new tighter index format. */ if (removedsomething) { bool modified; /* * Make sure we have a palloc'd copy of all segments, after the first * segment that is modified. (dataPlaceToPageLeafRecompress requires * this). */ modified = false; dlist_foreach(iter, &leaf->segments) { leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur); if (seginfo->action != GIN_SEGMENT_UNMODIFIED) modified = true; if (modified && seginfo->action != GIN_SEGMENT_DELETE) { int segsize = SizeOfGinPostingList(seginfo->seg); GinPostingList *tmp = (GinPostingList *) palloc(segsize); memcpy(tmp, seginfo->seg, segsize); seginfo->seg = tmp; } } if (RelationNeedsWAL(indexrel)) computeLeafRecompressWALData(leaf); /* Apply changes to page */ START_CRIT_SECTION(); dataPlaceToPageLeafRecompress(buffer, leaf); MarkBufferDirty(buffer); if (RelationNeedsWAL(indexrel)) { XLogRecPtr recptr; XLogBeginInsert(); XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); XLogRegisterBufData(0, leaf->walinfo, leaf->walinfolen); recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_DATA_LEAF_PAGE); PageSetLSN(page, recptr); } END_CRIT_SECTION(); } } /* * Construct a ginxlogRecompressDataLeaf record representing the changes * in *leaf. (Because this requires a palloc, we have to do it before * we enter the critical section that actually updates the page.) */ static void computeLeafRecompressWALData(disassembledLeaf *leaf) { int nmodified = 0; char *walbufbegin; char *walbufend; dlist_iter iter; int segno; ginxlogRecompressDataLeaf *recompress_xlog; /* Count the modified segments */ dlist_foreach(iter, &leaf->segments) { leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur); if (seginfo->action != GIN_SEGMENT_UNMODIFIED) nmodified++; } walbufbegin = palloc(sizeof(ginxlogRecompressDataLeaf) + BLCKSZ + /* max size needed to hold the segment data */ nmodified * 2 /* (segno + action) per action */ ); walbufend = walbufbegin; recompress_xlog = (ginxlogRecompressDataLeaf *) walbufend; walbufend += sizeof(ginxlogRecompressDataLeaf); recompress_xlog->nactions = nmodified; segno = 0; dlist_foreach(iter, &leaf->segments) { leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur); int segsize = 0; int datalen; uint8 action = seginfo->action; if (action == GIN_SEGMENT_UNMODIFIED) { segno++; continue; } if (action != GIN_SEGMENT_DELETE) segsize = SizeOfGinPostingList(seginfo->seg); /* * If storing the uncompressed list of added item pointers would take * more space than storing the compressed segment as is, do that * instead. */ if (action == GIN_SEGMENT_ADDITEMS && seginfo->nmodifieditems * sizeof(ItemPointerData) > segsize) { action = GIN_SEGMENT_REPLACE; } *((uint8 *) (walbufend++)) = segno; *(walbufend++) = action; switch (action) { case GIN_SEGMENT_DELETE: datalen = 0; break; case GIN_SEGMENT_ADDITEMS: datalen = seginfo->nmodifieditems * sizeof(ItemPointerData); memcpy(walbufend, &seginfo->nmodifieditems, sizeof(uint16)); memcpy(walbufend + sizeof(uint16), seginfo->modifieditems, datalen); datalen += sizeof(uint16); break; case GIN_SEGMENT_INSERT: case GIN_SEGMENT_REPLACE: datalen = SHORTALIGN(segsize); memcpy(walbufend, seginfo->seg, segsize); break; default: elog(ERROR, "unexpected GIN leaf action %d", action); } walbufend += datalen; if (action != GIN_SEGMENT_INSERT) segno++; } /* Pass back the constructed info via *leaf */ leaf->walinfo = walbufbegin; leaf->walinfolen = walbufend - walbufbegin; } /* * Assemble a disassembled posting tree leaf page back to a buffer. * * This just updates the target buffer; WAL stuff is caller's responsibility. * * NOTE: The segment pointers must not point directly to the same buffer, * except for segments that have not been modified and whose preceding * segments have not been modified either. */ static void dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf) { Page page = BufferGetPage(buf); char *ptr; int newsize; bool modified = false; dlist_iter iter; int segsize; /* * If the page was in pre-9.4 format before, convert the header, and force * all segments to be copied to the page whether they were modified or * not. */ if (!GinPageIsCompressed(page)) { Assert(leaf->oldformat); GinPageSetCompressed(page); GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber; modified = true; } ptr = (char *) GinDataLeafPageGetPostingList(page); newsize = 0; dlist_foreach(iter, &leaf->segments) { leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur); if (seginfo->action != GIN_SEGMENT_UNMODIFIED) modified = true; if (seginfo->action != GIN_SEGMENT_DELETE) { segsize = SizeOfGinPostingList(seginfo->seg); if (modified) memcpy(ptr, seginfo->seg, segsize); ptr += segsize; newsize += segsize; } } Assert(newsize <= GinDataPageMaxDataSize); GinDataPageSetDataSize(page, newsize); } /* * Like dataPlaceToPageLeafRecompress, but writes the disassembled leaf * segments to two pages instead of one. * * This is different from the non-split cases in that this does not modify * the original page directly, but writes to temporary in-memory copies of * the new left and right pages. */ static void dataPlaceToPageLeafSplit(disassembledLeaf *leaf, ItemPointerData lbound, ItemPointerData rbound, Page lpage, Page rpage) { char *ptr; int segsize; int lsize; int rsize; dlist_node *node; dlist_node *firstright; leafSegmentInfo *seginfo; /* Initialize temporary pages to hold the new left and right pages */ GinInitPage(lpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ); GinInitPage(rpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ); /* * Copy the segments that go to the left page. * * XXX: We should skip copying the unmodified part of the left page, like * we do when recompressing. */ lsize = 0; ptr = (char *) GinDataLeafPageGetPostingList(lpage); firstright = dlist_next_node(&leaf->segments, leaf->lastleft); for (node = dlist_head_node(&leaf->segments); node != firstright; node = dlist_next_node(&leaf->segments, node)) { seginfo = dlist_container(leafSegmentInfo, node, node); if (seginfo->action != GIN_SEGMENT_DELETE) { segsize = SizeOfGinPostingList(seginfo->seg); memcpy(ptr, seginfo->seg, segsize); ptr += segsize; lsize += segsize; } } Assert(lsize == leaf->lsize); GinDataPageSetDataSize(lpage, lsize); *GinDataPageGetRightBound(lpage) = lbound; /* Copy the segments that go to the right page */ ptr = (char *) GinDataLeafPageGetPostingList(rpage); rsize = 0; for (node = firstright; ; node = dlist_next_node(&leaf->segments, node)) { seginfo = dlist_container(leafSegmentInfo, node, node); if (seginfo->action != GIN_SEGMENT_DELETE) { segsize = SizeOfGinPostingList(seginfo->seg); memcpy(ptr, seginfo->seg, segsize); ptr += segsize; rsize += segsize; } if (!dlist_has_next(&leaf->segments, node)) break; } Assert(rsize == leaf->rsize); GinDataPageSetDataSize(rpage, rsize); *GinDataPageGetRightBound(rpage) = rbound; } /* * Prepare to insert data on an internal data page. * * If it will fit, return GPTP_INSERT after doing whatever setup is needed * before we enter the insertion critical section. *ptp_workspace can be * set to pass information along to the execPlaceToPage function. * * If it won't fit, perform a page split and return two temporary page * images into *newlpage and *newrpage, with result GPTP_SPLIT. * * In neither case should the given page buffer be modified here. * * Note: on insertion to an internal node, in addition to inserting the given * item, the downlink of the existing item at stack->off will be updated to * point to updateblkno. */ static GinPlaceToPageRC dataBeginPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack, void *insertdata, BlockNumber updateblkno, void **ptp_workspace, Page *newlpage, Page *newrpage) { Page page = BufferGetPage(buf); /* If it doesn't fit, deal with split case */ if (GinNonLeafDataPageGetFreeSpace(page) < sizeof(PostingItem)) { dataSplitPageInternal(btree, buf, stack, insertdata, updateblkno, newlpage, newrpage); return GPTP_SPLIT; } /* Else, we're ready to proceed with insertion */ return GPTP_INSERT; } /* * Perform data insertion after beginPlaceToPage has decided it will fit. * * This is invoked within a critical section, and XLOG record creation (if * needed) is already started. The target buffer is registered in slot 0. */ static void dataExecPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack, void *insertdata, BlockNumber updateblkno, void *ptp_workspace) { Page page = BufferGetPage(buf); OffsetNumber off = stack->off; PostingItem *pitem; /* Update existing downlink to point to next page (on internal page) */ pitem = GinDataPageGetPostingItem(page, off); PostingItemSetBlockNumber(pitem, updateblkno); /* Add new item */ pitem = (PostingItem *) insertdata; GinDataPageAddPostingItem(page, pitem, off); if (RelationNeedsWAL(btree->index) && !btree->isBuild) { /* * This must be static, because it has to survive until XLogInsert, * and we can't palloc here. Ugly, but the XLogInsert infrastructure * isn't reentrant anyway. */ static ginxlogInsertDataInternal data; data.offset = off; data.newitem = *pitem; XLogRegisterBufData(0, (char *) &data, sizeof(ginxlogInsertDataInternal)); } } /* * Prepare to insert data on a posting-tree data page. * * If it will fit, return GPTP_INSERT after doing whatever setup is needed * before we enter the insertion critical section. *ptp_workspace can be * set to pass information along to the execPlaceToPage function. * * If it won't fit, perform a page split and return two temporary page * images into *newlpage and *newrpage, with result GPTP_SPLIT. * * In neither case should the given page buffer be modified here. * * Note: on insertion to an internal node, in addition to inserting the given * item, the downlink of the existing item at stack->off will be updated to * point to updateblkno. * * Calls relevant function for internal or leaf page because they are handled * very differently. */ static GinPlaceToPageRC dataBeginPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, void *insertdata, BlockNumber updateblkno, void **ptp_workspace, Page *newlpage, Page *newrpage) { Page page = BufferGetPage(buf); Assert(GinPageIsData(page)); if (GinPageIsLeaf(page)) return dataBeginPlaceToPageLeaf(btree, buf, stack, insertdata, ptp_workspace, newlpage, newrpage); else return dataBeginPlaceToPageInternal(btree, buf, stack, insertdata, updateblkno, ptp_workspace, newlpage, newrpage); } /* * Perform data insertion after beginPlaceToPage has decided it will fit. * * This is invoked within a critical section, and XLOG record creation (if * needed) is already started. The target buffer is registered in slot 0. * * Calls relevant function for internal or leaf page because they are handled * very differently. */ static void dataExecPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, void *insertdata, BlockNumber updateblkno, void *ptp_workspace) { Page page = BufferGetPage(buf); if (GinPageIsLeaf(page)) dataExecPlaceToPageLeaf(btree, buf, stack, insertdata, ptp_workspace); else dataExecPlaceToPageInternal(btree, buf, stack, insertdata, updateblkno, ptp_workspace); } /* * Split internal page and insert new data. * * Returns new temp pages to *newlpage and *newrpage. * The original buffer is left untouched. */ static void dataSplitPageInternal(GinBtree btree, Buffer origbuf, GinBtreeStack *stack, void *insertdata, BlockNumber updateblkno, Page *newlpage, Page *newrpage) { Page oldpage = BufferGetPage(origbuf); OffsetNumber off = stack->off; int nitems = GinPageGetOpaque(oldpage)->maxoff; int nleftitems; int nrightitems; Size pageSize = PageGetPageSize(oldpage); ItemPointerData oldbound = *GinDataPageGetRightBound(oldpage); ItemPointer bound; Page lpage; Page rpage; OffsetNumber separator; PostingItem allitems[(BLCKSZ / sizeof(PostingItem)) + 1]; lpage = PageGetTempPage(oldpage); rpage = PageGetTempPage(oldpage); GinInitPage(lpage, GinPageGetOpaque(oldpage)->flags, pageSize); GinInitPage(rpage, GinPageGetOpaque(oldpage)->flags, pageSize); /* * First construct a new list of PostingItems, which includes all the old * items, and the new item. */ memcpy(allitems, GinDataPageGetPostingItem(oldpage, FirstOffsetNumber), (off - 1) * sizeof(PostingItem)); allitems[off - 1] = *((PostingItem *) insertdata); memcpy(&allitems[off], GinDataPageGetPostingItem(oldpage, off), (nitems - (off - 1)) * sizeof(PostingItem)); nitems++; /* Update existing downlink to point to next page */ PostingItemSetBlockNumber(&allitems[off], updateblkno); /* * When creating a new index, fit as many tuples as possible on the left * page, on the assumption that the table is scanned from beginning to * end. This packs the index as tight as possible. */ if (btree->isBuild && GinPageRightMost(oldpage)) separator = GinNonLeafDataPageGetFreeSpace(rpage) / sizeof(PostingItem); else separator = nitems / 2; nleftitems = separator; nrightitems = nitems - separator; memcpy(GinDataPageGetPostingItem(lpage, FirstOffsetNumber), allitems, nleftitems * sizeof(PostingItem)); GinPageGetOpaque(lpage)->maxoff = nleftitems; memcpy(GinDataPageGetPostingItem(rpage, FirstOffsetNumber), &allitems[separator], nrightitems * sizeof(PostingItem)); GinPageGetOpaque(rpage)->maxoff = nrightitems; /* * Also set pd_lower for both pages, like GinDataPageAddPostingItem does. */ GinDataPageSetDataSize(lpage, nleftitems * sizeof(PostingItem)); GinDataPageSetDataSize(rpage, nrightitems * sizeof(PostingItem)); /* set up right bound for left page */ bound = GinDataPageGetRightBound(lpage); *bound = GinDataPageGetPostingItem(lpage, nleftitems)->key; /* set up right bound for right page */ *GinDataPageGetRightBound(rpage) = oldbound; /* return temp pages to caller */ *newlpage = lpage; *newrpage = rpage; } /* * Construct insertion payload for inserting the downlink for given buffer. */ static void * dataPrepareDownlink(GinBtree btree, Buffer lbuf) { PostingItem *pitem = palloc(sizeof(PostingItem)); Page lpage = BufferGetPage(lbuf); PostingItemSetBlockNumber(pitem, BufferGetBlockNumber(lbuf)); pitem->key = *GinDataPageGetRightBound(lpage); return pitem; } /* * Fills new root by right bound values from child. * Also called from ginxlog, should not use btree */ void ginDataFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage) { PostingItem li, ri; li.key = *GinDataPageGetRightBound(lpage); PostingItemSetBlockNumber(&li, lblkno); GinDataPageAddPostingItem(root, &li, InvalidOffsetNumber); ri.key = *GinDataPageGetRightBound(rpage); PostingItemSetBlockNumber(&ri, rblkno); GinDataPageAddPostingItem(root, &ri, InvalidOffsetNumber); } /*** Functions to work with disassembled leaf pages ***/ /* * Disassemble page into a disassembledLeaf struct. */ static disassembledLeaf * disassembleLeaf(Page page) { disassembledLeaf *leaf; GinPostingList *seg; Pointer segbegin; Pointer segend; leaf = palloc0(sizeof(disassembledLeaf)); dlist_init(&leaf->segments); if (GinPageIsCompressed(page)) { /* * Create a leafSegmentInfo entry for each segment. */ seg = GinDataLeafPageGetPostingList(page); segbegin = (Pointer) seg; segend = segbegin + GinDataLeafPageGetPostingListSize(page); while ((Pointer) seg < segend) { leafSegmentInfo *seginfo = palloc(sizeof(leafSegmentInfo)); seginfo->action = GIN_SEGMENT_UNMODIFIED; seginfo->seg = seg; seginfo->items = NULL; seginfo->nitems = 0; dlist_push_tail(&leaf->segments, &seginfo->node); seg = GinNextPostingListSegment(seg); } leaf->oldformat = false; } else { /* * A pre-9.4 format uncompressed page is represented by a single * segment, with an array of items. The corner case is uncompressed * page containing no items, which is represented as no segments. */ ItemPointer uncompressed; int nuncompressed; leafSegmentInfo *seginfo; uncompressed = dataLeafPageGetUncompressed(page, &nuncompressed); if (nuncompressed > 0) { seginfo = palloc(sizeof(leafSegmentInfo)); seginfo->action = GIN_SEGMENT_REPLACE; seginfo->seg = NULL; seginfo->items = palloc(nuncompressed * sizeof(ItemPointerData)); memcpy(seginfo->items, uncompressed, nuncompressed * sizeof(ItemPointerData)); seginfo->nitems = nuncompressed; dlist_push_tail(&leaf->segments, &seginfo->node); } leaf->oldformat = true; } return leaf; } /* * Distribute newItems to the segments. * * Any segments that acquire new items are decoded, and the new items are * merged with the old items. * * Returns true if any new items were added. False means they were all * duplicates of existing items on the page. */ static bool addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems) { dlist_iter iter; ItemPointer nextnew = newItems; int newleft = nNewItems; bool modified = false; leafSegmentInfo *newseg; /* * If the page is completely empty, just construct one new segment to hold * all the new items. */ if (dlist_is_empty(&leaf->segments)) { newseg = palloc(sizeof(leafSegmentInfo)); newseg->seg = NULL; newseg->items = newItems; newseg->nitems = nNewItems; newseg->action = GIN_SEGMENT_INSERT; dlist_push_tail(&leaf->segments, &newseg->node); return true; } dlist_foreach(iter, &leaf->segments) { leafSegmentInfo *cur = (leafSegmentInfo *) dlist_container(leafSegmentInfo, node, iter.cur); int nthis; ItemPointer tmpitems; int ntmpitems; /* * How many of the new items fall into this segment? */ if (!dlist_has_next(&leaf->segments, iter.cur)) nthis = newleft; else { leafSegmentInfo *next; ItemPointerData next_first; next = (leafSegmentInfo *) dlist_container(leafSegmentInfo, node, dlist_next_node(&leaf->segments, iter.cur)); if (next->items) next_first = next->items[0]; else { Assert(next->seg != NULL); next_first = next->seg->first; } nthis = 0; while (nthis < newleft && ginCompareItemPointers(&nextnew[nthis], &next_first) < 0) nthis++; } if (nthis == 0) continue; /* Merge the new items with the existing items. */ if (!cur->items) cur->items = ginPostingListDecode(cur->seg, &cur->nitems); /* * Fast path for the important special case that we're appending to * the end of the page: don't let the last segment on the page grow * larger than the target, create a new segment before that happens. */ if (!dlist_has_next(&leaf->segments, iter.cur) && ginCompareItemPointers(&cur->items[cur->nitems - 1], &nextnew[0]) < 0 && cur->seg != NULL && SizeOfGinPostingList(cur->seg) >= GinPostingListSegmentTargetSize) { newseg = palloc(sizeof(leafSegmentInfo)); newseg->seg = NULL; newseg->items = nextnew; newseg->nitems = nthis; newseg->action = GIN_SEGMENT_INSERT; dlist_push_tail(&leaf->segments, &newseg->node); modified = true; break; } tmpitems = ginMergeItemPointers(cur->items, cur->nitems, nextnew, nthis, &ntmpitems); if (ntmpitems != cur->nitems) { /* * If there are no duplicates, track the added items so that we * can emit a compact ADDITEMS WAL record later on. (it doesn't * seem worth re-checking which items were duplicates, if there * were any) */ if (ntmpitems == nthis + cur->nitems && cur->action == GIN_SEGMENT_UNMODIFIED) { cur->action = GIN_SEGMENT_ADDITEMS; cur->modifieditems = nextnew; cur->nmodifieditems = nthis; } else cur->action = GIN_SEGMENT_REPLACE; cur->items = tmpitems; cur->nitems = ntmpitems; cur->seg = NULL; modified = true; } nextnew += nthis; newleft -= nthis; if (newleft == 0) break; } return modified; } /* * Recompresses all segments that have been modified. * * If not all the items fit on two pages (ie. after split), we store as * many items as fit, and set *remaining to the first item that didn't fit. * If all items fit, *remaining is set to invalid. * * Returns true if the page has to be split. */ static bool leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining) { int pgused = 0; bool needsplit = false; dlist_iter iter; int segsize; leafSegmentInfo *nextseg; int npacked; bool modified; dlist_node *cur_node; dlist_node *next_node; ItemPointerSetInvalid(remaining); /* * cannot use dlist_foreach_modify here because we insert adjacent items * while iterating. */ for (cur_node = dlist_head_node(&leaf->segments); cur_node != NULL; cur_node = next_node) { leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, cur_node); if (dlist_has_next(&leaf->segments, cur_node)) next_node = dlist_next_node(&leaf->segments, cur_node); else next_node = NULL; /* Compress the posting list, if necessary */ if (seginfo->action != GIN_SEGMENT_DELETE) { if (seginfo->seg == NULL) { if (seginfo->nitems > GinPostingListSegmentMaxSize) npacked = 0; /* no chance that it would fit. */ else { seginfo->seg = ginCompressPostingList(seginfo->items, seginfo->nitems, GinPostingListSegmentMaxSize, &npacked); } if (npacked != seginfo->nitems) { /* * Too large. Compress again to the target size, and * create a new segment to represent the remaining items. * The new segment is inserted after this one, so it will * be processed in the next iteration of this loop. */ if (seginfo->seg) pfree(seginfo->seg); seginfo->seg = ginCompressPostingList(seginfo->items, seginfo->nitems, GinPostingListSegmentTargetSize, &npacked); if (seginfo->action != GIN_SEGMENT_INSERT) seginfo->action = GIN_SEGMENT_REPLACE; nextseg = palloc(sizeof(leafSegmentInfo)); nextseg->action = GIN_SEGMENT_INSERT; nextseg->seg = NULL; nextseg->items = &seginfo->items[npacked]; nextseg->nitems = seginfo->nitems - npacked; next_node = &nextseg->node; dlist_insert_after(cur_node, next_node); } } /* * If the segment is very small, merge it with the next segment. */ if (SizeOfGinPostingList(seginfo->seg) < GinPostingListSegmentMinSize && next_node) { int nmerged; nextseg = dlist_container(leafSegmentInfo, node, next_node); if (seginfo->items == NULL) seginfo->items = ginPostingListDecode(seginfo->seg, &seginfo->nitems); if (nextseg->items == NULL) nextseg->items = ginPostingListDecode(nextseg->seg, &nextseg->nitems); nextseg->items = ginMergeItemPointers(seginfo->items, seginfo->nitems, nextseg->items, nextseg->nitems, &nmerged); Assert(nmerged == seginfo->nitems + nextseg->nitems); nextseg->nitems = nmerged; nextseg->seg = NULL; nextseg->action = GIN_SEGMENT_REPLACE; nextseg->modifieditems = NULL; nextseg->nmodifieditems = 0; if (seginfo->action == GIN_SEGMENT_INSERT) { dlist_delete(cur_node); continue; } else { seginfo->action = GIN_SEGMENT_DELETE; seginfo->seg = NULL; } } seginfo->items = NULL; seginfo->nitems = 0; } if (seginfo->action == GIN_SEGMENT_DELETE) continue; /* * OK, we now have a compressed version of this segment ready for * copying to the page. Did we exceed the size that fits on one page? */ segsize = SizeOfGinPostingList(seginfo->seg); if (pgused + segsize > GinDataPageMaxDataSize) { if (!needsplit) { /* switch to right page */ Assert(pgused > 0); leaf->lastleft = dlist_prev_node(&leaf->segments, cur_node); needsplit = true; leaf->lsize = pgused; pgused = 0; } else { /* * Filled both pages. The last segment we constructed did not * fit. */ *remaining = seginfo->seg->first; /* * remove all segments that did not fit from the list. */ while (dlist_has_next(&leaf->segments, cur_node)) dlist_delete(dlist_next_node(&leaf->segments, cur_node)); dlist_delete(cur_node); break; } } pgused += segsize; } if (!needsplit) { leaf->lsize = pgused; leaf->rsize = 0; } else leaf->rsize = pgused; Assert(leaf->lsize <= GinDataPageMaxDataSize); Assert(leaf->rsize <= GinDataPageMaxDataSize); /* * Make a palloc'd copy of every segment after the first modified one, * because as we start copying items to the original page, we might * overwrite an existing segment. */ modified = false; dlist_foreach(iter, &leaf->segments) { leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur); if (!modified && seginfo->action != GIN_SEGMENT_UNMODIFIED) { modified = true; } else if (modified && seginfo->action == GIN_SEGMENT_UNMODIFIED) { GinPostingList *tmp; segsize = SizeOfGinPostingList(seginfo->seg); tmp = palloc(segsize); memcpy(tmp, seginfo->seg, segsize); seginfo->seg = tmp; } } return needsplit; } /*** Functions that are exported to the rest of the GIN code ***/ /* * Creates new posting tree containing the given TIDs. Returns the page * number of the root of the new posting tree. * * items[] must be in sorted order with no duplicates. */ BlockNumber createPostingTree(Relation index, ItemPointerData *items, uint32 nitems, GinStatsData *buildStats, Buffer entrybuffer) { BlockNumber blkno; Buffer buffer; Page tmppage; Page page; Pointer ptr; int nrootitems; int rootsize; bool is_build = (buildStats != NULL); /* Construct the new root page in memory first. */ tmppage = (Page) palloc(BLCKSZ); GinInitPage(tmppage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ); GinPageGetOpaque(tmppage)->rightlink = InvalidBlockNumber; /* * Write as many of the items to the root page as fit. In segments of max * GinPostingListSegmentMaxSize bytes each. */ nrootitems = 0; rootsize = 0; ptr = (Pointer) GinDataLeafPageGetPostingList(tmppage); while (nrootitems < nitems) { GinPostingList *segment; int npacked; int segsize; segment = ginCompressPostingList(&items[nrootitems], nitems - nrootitems, GinPostingListSegmentMaxSize, &npacked); segsize = SizeOfGinPostingList(segment); if (rootsize + segsize > GinDataPageMaxDataSize) break; memcpy(ptr, segment, segsize); ptr += segsize; rootsize += segsize; nrootitems += npacked; pfree(segment); } GinDataPageSetDataSize(tmppage, rootsize); /* * All set. Get a new physical page, and copy the in-memory page to it. */ buffer = GinNewBuffer(index); page = BufferGetPage(buffer); blkno = BufferGetBlockNumber(buffer); /* * Copy any predicate locks from the entry tree leaf (containing posting * list) to the posting tree. */ PredicateLockPageSplit(index, BufferGetBlockNumber(entrybuffer), blkno); START_CRIT_SECTION(); PageRestoreTempPage(tmppage, page); MarkBufferDirty(buffer); if (RelationNeedsWAL(index) && !is_build) { XLogRecPtr recptr; ginxlogCreatePostingTree data; data.size = rootsize; XLogBeginInsert(); XLogRegisterData((char *) &data, sizeof(ginxlogCreatePostingTree)); XLogRegisterData((char *) GinDataLeafPageGetPostingList(page), rootsize); XLogRegisterBuffer(0, buffer, REGBUF_WILL_INIT); recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_PTREE); PageSetLSN(page, recptr); } UnlockReleaseBuffer(buffer); END_CRIT_SECTION(); /* During index build, count the newly-added data page */ if (buildStats) buildStats->nDataPages++; elog(DEBUG2, "created GIN posting tree with %d items", nrootitems); /* * Add any remaining TIDs to the newly-created posting tree. */ if (nitems > nrootitems) { ginInsertItemPointers(index, blkno, items + nrootitems, nitems - nrootitems, buildStats); } return blkno; } static void ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno) { memset(btree, 0, sizeof(GinBtreeData)); btree->index = index; btree->rootBlkno = rootBlkno; btree->findChildPage = dataLocateItem; btree->getLeftMostChild = dataGetLeftMostPage; btree->isMoveRight = dataIsMoveRight; btree->findItem = NULL; btree->findChildPtr = dataFindChildPtr; btree->beginPlaceToPage = dataBeginPlaceToPage; btree->execPlaceToPage = dataExecPlaceToPage; btree->fillRoot = ginDataFillRoot; btree->prepareDownlink = dataPrepareDownlink; btree->isData = true; btree->fullScan = false; btree->isBuild = false; } /* * Inserts array of item pointers, may execute several tree scan (very rare) */ void ginInsertItemPointers(Relation index, BlockNumber rootBlkno, ItemPointerData *items, uint32 nitem, GinStatsData *buildStats) { GinBtreeData btree; GinBtreeDataLeafInsertData insertdata; GinBtreeStack *stack; ginPrepareDataScan(&btree, index, rootBlkno); btree.isBuild = (buildStats != NULL); insertdata.items = items; insertdata.nitem = nitem; insertdata.curitem = 0; while (insertdata.curitem < insertdata.nitem) { /* search for the leaf page where the first item should go to */ btree.itemptr = insertdata.items[insertdata.curitem]; stack = ginFindLeafPage(&btree, false, true, NULL); ginInsertValue(&btree, stack, &insertdata, buildStats); } } /* * Starts a new scan on a posting tree. */ GinBtreeStack * ginScanBeginPostingTree(GinBtree btree, Relation index, BlockNumber rootBlkno, Snapshot snapshot) { GinBtreeStack *stack; ginPrepareDataScan(btree, index, rootBlkno); btree->fullScan = true; stack = ginFindLeafPage(btree, true, false, snapshot); return stack; }