diff options
Diffstat (limited to '')
-rw-r--r-- | src/backend/access/nbtree/nbtdedup.c | 1098 |
1 files changed, 1098 insertions, 0 deletions
diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c new file mode 100644 index 0000000..1cd1b59 --- /dev/null +++ b/src/backend/access/nbtree/nbtdedup.c @@ -0,0 +1,1098 @@ +/*------------------------------------------------------------------------- + * + * nbtdedup.c + * Deduplicate or bottom-up delete items in Postgres btrees. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtdedup.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/nbtxlog.h" +#include "miscadmin.h" +#include "utils/rel.h" + +static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state, + TM_IndexDeleteOp *delstate); +static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state, + OffsetNumber minoff, IndexTuple newitem); +static void _bt_singleval_fillfactor(Page page, BTDedupState state, + Size newitemsz); +#ifdef USE_ASSERT_CHECKING +static bool _bt_posting_valid(IndexTuple posting); +#endif + +/* + * Perform a deduplication pass. + * + * The general approach taken here is to perform as much deduplication as + * possible to free as much space as possible. Note, however, that "single + * value" strategy is used for !bottomupdedup callers when the page is full of + * tuples of a single value. Deduplication passes that apply the strategy + * will leave behind a few untouched tuples at the end of the page, preparing + * the page for an anticipated page split that uses nbtsplitloc.c's own single + * value strategy. Our high level goal is to delay merging the untouched + * tuples until after the page splits. + * + * When a call to _bt_bottomupdel_pass() just took place (and failed), our + * high level goal is to prevent a page split entirely by buying more time. + * We still hope that a page split can be avoided altogether. That's why + * single value strategy is not even considered for bottomupdedup callers. + * + * The page will have to be split if we cannot successfully free at least + * newitemsz (we also need space for newitem's line pointer, which isn't + * included in caller's newitemsz). + * + * Note: Caller should have already deleted all existing items with their + * LP_DEAD bits set. + */ +void +_bt_dedup_pass(Relation rel, Buffer buf, Relation heapRel, IndexTuple newitem, + Size newitemsz, bool bottomupdedup) +{ + OffsetNumber offnum, + minoff, + maxoff; + Page page = BufferGetPage(buf); + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + Page newpage; + BTDedupState state; + Size pagesaving PG_USED_FOR_ASSERTS_ONLY = 0; + bool singlevalstrat = false; + int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + + /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ + newitemsz += sizeof(ItemIdData); + + /* + * Initialize deduplication state. + * + * It would be possible for maxpostingsize (limit on posting list tuple + * size) to be set to one third of the page. However, it seems like a + * good idea to limit the size of posting lists to one sixth of a page. + * That ought to leave us with a good split point when pages full of + * duplicates can be split several times. + */ + state = (BTDedupState) palloc(sizeof(BTDedupStateData)); + state->deduplicate = true; + state->nmaxitems = 0; + state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK); + /* Metadata about base tuple of current pending posting list */ + state->base = NULL; + state->baseoff = InvalidOffsetNumber; + state->basetupsize = 0; + /* Metadata about current pending posting list TIDs */ + state->htids = palloc(state->maxpostingsize); + state->nhtids = 0; + state->nitems = 0; + /* Size of all physical tuples to be replaced by pending posting list */ + state->phystupsize = 0; + /* nintervals should be initialized to zero */ + state->nintervals = 0; + + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + + /* + * Consider applying "single value" strategy, though only if the page + * seems likely to be split in the near future + */ + if (!bottomupdedup) + singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem); + + /* + * Deduplicate items from page, and write them to newpage. + * + * Copy the original page's LSN into newpage copy. This will become the + * updated version of the page. We need this because XLogInsert will + * examine the LSN and possibly dump it in a page image. + */ + newpage = PageGetTempPageCopySpecial(page); + PageSetLSN(newpage, PageGetLSN(page)); + + /* Copy high key, if any */ + if (!P_RIGHTMOST(opaque)) + { + ItemId hitemid = PageGetItemId(page, P_HIKEY); + Size hitemsz = ItemIdGetLength(hitemid); + IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid); + + if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY, + false, false) == InvalidOffsetNumber) + elog(ERROR, "deduplication failed to add highkey"); + } + + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + + Assert(!ItemIdIsDead(itemid)); + + if (offnum == minoff) + { + /* + * No previous/base tuple for the data item -- use the data item + * as base tuple of pending posting list + */ + _bt_dedup_start_pending(state, itup, offnum); + } + else if (state->deduplicate && + _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && + _bt_dedup_save_htid(state, itup)) + { + /* + * Tuple is equal to base tuple of pending posting list. Heap + * TID(s) for itup have been saved in state. + */ + } + else + { + /* + * Tuple is not equal to pending posting list tuple, or + * _bt_dedup_save_htid() opted to not merge current item into + * pending posting list for some other reason (e.g., adding more + * TIDs would have caused posting list to exceed current + * maxpostingsize). + * + * If state contains pending posting list with more than one item, + * form new posting tuple, and actually update the page. Else + * reset the state and move on without modifying the page. + */ + pagesaving += _bt_dedup_finish_pending(newpage, state); + + if (singlevalstrat) + { + /* + * Single value strategy's extra steps. + * + * Lower maxpostingsize for sixth and final large posting list + * tuple at the point where 5 maxpostingsize-capped tuples + * have either been formed or observed. + * + * When a sixth maxpostingsize-capped item is formed/observed, + * stop merging together tuples altogether. The few tuples + * that remain at the end of the page won't be merged together + * at all (at least not until after a future page split takes + * place). + */ + if (state->nmaxitems == 5) + _bt_singleval_fillfactor(page, state, newitemsz); + else if (state->nmaxitems == 6) + { + state->deduplicate = false; + singlevalstrat = false; /* won't be back here */ + } + } + + /* itup starts new pending posting list */ + _bt_dedup_start_pending(state, itup, offnum); + } + } + + /* Handle the last item */ + pagesaving += _bt_dedup_finish_pending(newpage, state); + + /* + * If no items suitable for deduplication were found, newpage must be + * exactly the same as the original page, so just return from function. + * + * We could determine whether or not to proceed on the basis the space + * savings being sufficient to avoid an immediate page split instead. We + * don't do that because there is some small value in nbtsplitloc.c always + * operating against a page that is fully deduplicated (apart from + * newitem). Besides, most of the cost has already been paid. + */ + if (state->nintervals == 0) + { + /* cannot leak memory here */ + pfree(newpage); + pfree(state->htids); + pfree(state); + return; + } + + /* + * By here, it's clear that deduplication will definitely go ahead. + * + * Clear the BTP_HAS_GARBAGE page flag. The index must be a heapkeyspace + * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway. + * But keep things tidy. + */ + if (P_HAS_GARBAGE(opaque)) + { + BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(newpage); + + nopaque->btpo_flags &= ~BTP_HAS_GARBAGE; + } + + START_CRIT_SECTION(); + + PageRestoreTempPage(newpage, page); + MarkBufferDirty(buf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + xl_btree_dedup xlrec_dedup; + + xlrec_dedup.nintervals = state->nintervals; + + XLogBeginInsert(); + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup); + + /* + * The intervals array is not in the buffer, but pretend that it is. + * When XLogInsert stores the whole buffer, the array need not be + * stored too. + */ + XLogRegisterBufData(0, (char *) state->intervals, + state->nintervals * sizeof(BTDedupInterval)); + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + /* Local space accounting should agree with page accounting */ + Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz); + + /* cannot leak memory here */ + pfree(state->htids); + pfree(state); +} + +/* + * Perform bottom-up index deletion pass. + * + * See if duplicate index tuples (plus certain nearby tuples) are eligible to + * be deleted via bottom-up index deletion. The high level goal here is to + * entirely prevent "unnecessary" page splits caused by MVCC version churn + * from UPDATEs (when the UPDATEs don't logically modify any of the columns + * covered by the 'rel' index). This is qualitative, not quantitative -- we + * do not particularly care about once-off opportunities to delete many index + * tuples together. + * + * See nbtree/README for details on the design of nbtree bottom-up deletion. + * See access/tableam.h for a description of how we're expected to cooperate + * with the tableam. + * + * Returns true on success, in which case caller can assume page split will be + * avoided for a reasonable amount of time. Returns false when caller should + * deduplicate the page (if possible at all). + * + * Note: Occasionally we return true despite failing to delete enough items to + * avoid a split. This makes caller skip deduplication and go split the page + * right away. Our return value is always just advisory information. + * + * Note: Caller should have already deleted all existing items with their + * LP_DEAD bits set. + */ +bool +_bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, + Size newitemsz) +{ + OffsetNumber offnum, + minoff, + maxoff; + Page page = BufferGetPage(buf); + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + BTDedupState state; + TM_IndexDeleteOp delstate; + bool neverdedup; + int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + + /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ + newitemsz += sizeof(ItemIdData); + + /* Initialize deduplication state */ + state = (BTDedupState) palloc(sizeof(BTDedupStateData)); + state->deduplicate = true; + state->nmaxitems = 0; + state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */ + state->base = NULL; + state->baseoff = InvalidOffsetNumber; + state->basetupsize = 0; + state->htids = palloc(state->maxpostingsize); + state->nhtids = 0; + state->nitems = 0; + state->phystupsize = 0; + state->nintervals = 0; + + /* + * Initialize tableam state that describes bottom-up index deletion + * operation. + * + * We'll go on to ask the tableam to search for TIDs whose index tuples we + * can safely delete. The tableam will search until our leaf page space + * target is satisfied, or until the cost of continuing with the tableam + * operation seems too high. It focuses its efforts on TIDs associated + * with duplicate index tuples that we mark "promising". + * + * This space target is a little arbitrary. The tableam must be able to + * keep the costs and benefits in balance. We provide the tableam with + * exhaustive information about what might work, without directly + * concerning ourselves with avoiding work during the tableam call. Our + * role in costing the bottom-up deletion process is strictly advisory. + */ + delstate.bottomup = true; + delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz); + delstate.ndeltids = 0; + delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete)); + delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus)); + + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + + Assert(!ItemIdIsDead(itemid)); + + if (offnum == minoff) + { + /* itup starts first pending interval */ + _bt_dedup_start_pending(state, itup, offnum); + } + else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && + _bt_dedup_save_htid(state, itup)) + { + /* Tuple is equal; just added its TIDs to pending interval */ + } + else + { + /* Finalize interval -- move its TIDs to delete state */ + _bt_bottomupdel_finish_pending(page, state, &delstate); + + /* itup starts new pending interval */ + _bt_dedup_start_pending(state, itup, offnum); + } + } + /* Finalize final interval -- move its TIDs to delete state */ + _bt_bottomupdel_finish_pending(page, state, &delstate); + + /* + * We don't give up now in the event of having few (or even zero) + * promising tuples for the tableam because it's not up to us as the index + * AM to manage costs (note that the tableam might have heuristics of its + * own that work out what to do). We should at least avoid having our + * caller do a useless deduplication pass after we return in the event of + * zero promising tuples, though. + */ + neverdedup = false; + if (state->nintervals == 0) + neverdedup = true; + + pfree(state->htids); + pfree(state); + + /* Ask tableam which TIDs are deletable, then physically delete them */ + _bt_delitems_delete_check(rel, buf, heapRel, &delstate); + + pfree(delstate.deltids); + pfree(delstate.status); + + /* Report "success" to caller unconditionally to avoid deduplication */ + if (neverdedup) + return true; + + /* Don't dedup when we won't end up back here any time soon anyway */ + return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz); +} + +/* + * Create a new pending posting list tuple based on caller's base tuple. + * + * Every tuple processed by deduplication either becomes the base tuple for a + * posting list, or gets its heap TID(s) accepted into a pending posting list. + * A tuple that starts out as the base tuple for a posting list will only + * actually be rewritten within _bt_dedup_finish_pending() when it turns out + * that there are duplicates that can be merged into the base tuple. + */ +void +_bt_dedup_start_pending(BTDedupState state, IndexTuple base, + OffsetNumber baseoff) +{ + Assert(state->nhtids == 0); + Assert(state->nitems == 0); + Assert(!BTreeTupleIsPivot(base)); + + /* + * Copy heap TID(s) from new base tuple for new candidate posting list + * into working state's array + */ + if (!BTreeTupleIsPosting(base)) + { + memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData)); + state->nhtids = 1; + state->basetupsize = IndexTupleSize(base); + } + else + { + int nposting; + + nposting = BTreeTupleGetNPosting(base); + memcpy(state->htids, BTreeTupleGetPosting(base), + sizeof(ItemPointerData) * nposting); + state->nhtids = nposting; + /* basetupsize should not include existing posting list */ + state->basetupsize = BTreeTupleGetPostingOffset(base); + } + + /* + * Save new base tuple itself -- it'll be needed if we actually create a + * new posting list from new pending posting list. + * + * Must maintain physical size of all existing tuples (including line + * pointer overhead) so that we can calculate space savings on page. + */ + state->nitems = 1; + state->base = base; + state->baseoff = baseoff; + state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData); + /* Also save baseoff in pending state for interval */ + state->intervals[state->nintervals].baseoff = state->baseoff; +} + +/* + * Save itup heap TID(s) into pending posting list where possible. + * + * Returns bool indicating if the pending posting list managed by state now + * includes itup's heap TID(s). + */ +bool +_bt_dedup_save_htid(BTDedupState state, IndexTuple itup) +{ + int nhtids; + ItemPointer htids; + Size mergedtupsz; + + Assert(!BTreeTupleIsPivot(itup)); + + if (!BTreeTupleIsPosting(itup)) + { + nhtids = 1; + htids = &itup->t_tid; + } + else + { + nhtids = BTreeTupleGetNPosting(itup); + htids = BTreeTupleGetPosting(itup); + } + + /* + * Don't append (have caller finish pending posting list as-is) if + * appending heap TID(s) from itup would put us over maxpostingsize limit. + * + * This calculation needs to match the code used within _bt_form_posting() + * for new posting list tuples. + */ + mergedtupsz = MAXALIGN(state->basetupsize + + (state->nhtids + nhtids) * sizeof(ItemPointerData)); + + if (mergedtupsz > state->maxpostingsize) + { + /* + * Count this as an oversized item for single value strategy, though + * only when there are 50 TIDs in the final posting list tuple. This + * limit (which is fairly arbitrary) avoids confusion about how many + * 1/6 of a page tuples have been encountered/created by the current + * deduplication pass. + * + * Note: We deliberately don't consider which deduplication pass + * merged together tuples to create this item (could be a previous + * deduplication pass, or current pass). See _bt_do_singleval() + * comments. + */ + if (state->nhtids > 50) + state->nmaxitems++; + + return false; + } + + /* + * Save heap TIDs to pending posting list tuple -- itup can be merged into + * pending posting list + */ + state->nitems++; + memcpy(state->htids + state->nhtids, htids, + sizeof(ItemPointerData) * nhtids); + state->nhtids += nhtids; + state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData); + + return true; +} + +/* + * Finalize pending posting list tuple, and add it to the page. Final tuple + * is based on saved base tuple, and saved list of heap TIDs. + * + * Returns space saving from deduplicating to make a new posting list tuple. + * Note that this includes line pointer overhead. This is zero in the case + * where no deduplication was possible. + */ +Size +_bt_dedup_finish_pending(Page newpage, BTDedupState state) +{ + OffsetNumber tupoff; + Size tuplesz; + Size spacesaving; + + Assert(state->nitems > 0); + Assert(state->nitems <= state->nhtids); + Assert(state->intervals[state->nintervals].baseoff == state->baseoff); + + tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage)); + if (state->nitems == 1) + { + /* Use original, unchanged base tuple */ + tuplesz = IndexTupleSize(state->base); + if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff, + false, false) == InvalidOffsetNumber) + elog(ERROR, "deduplication failed to add tuple to page"); + + spacesaving = 0; + } + else + { + IndexTuple final; + + /* Form a tuple with a posting list */ + final = _bt_form_posting(state->base, state->htids, state->nhtids); + tuplesz = IndexTupleSize(final); + Assert(tuplesz <= state->maxpostingsize); + + /* Save final number of items for posting list */ + state->intervals[state->nintervals].nitems = state->nitems; + + Assert(tuplesz == MAXALIGN(IndexTupleSize(final))); + if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false, + false) == InvalidOffsetNumber) + elog(ERROR, "deduplication failed to add tuple to page"); + + pfree(final); + spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData)); + /* Increment nintervals, since we wrote a new posting list tuple */ + state->nintervals++; + Assert(spacesaving > 0 && spacesaving < BLCKSZ); + } + + /* Reset state for next pending posting list */ + state->nhtids = 0; + state->nitems = 0; + state->phystupsize = 0; + + return spacesaving; +} + +/* + * Finalize interval during bottom-up index deletion. + * + * During a bottom-up pass we expect that TIDs will be recorded in dedup state + * first, and then get moved over to delstate (in variable-sized batches) by + * calling here. Call here happens when the number of TIDs in a dedup + * interval is known, and interval gets finalized (i.e. when caller sees next + * tuple on the page is not a duplicate, or when caller runs out of tuples to + * process from leaf page). + * + * This is where bottom-up deletion determines and remembers which entries are + * duplicates. This will be important information to the tableam delete + * infrastructure later on. Plain index tuple duplicates are marked + * "promising" here, per tableam contract. + * + * Our approach to marking entries whose TIDs come from posting lists is more + * complicated. Posting lists can only be formed by a deduplication pass (or + * during an index build), so recent version churn affecting the pointed-to + * logical rows is not particularly likely. We may still give a weak signal + * about posting list tuples' entries (by marking just one of its TIDs/entries + * promising), though this is only a possibility in the event of further + * duplicate index tuples in final interval that covers posting list tuple (as + * in the plain tuple case). A weak signal/hint will be useful to the tableam + * when it has no stronger signal to go with for the deletion operation as a + * whole. + * + * The heuristics we use work well in practice because we only need to give + * the tableam the right _general_ idea about where to look. Garbage tends to + * naturally get concentrated in relatively few table blocks with workloads + * that bottom-up deletion targets. The tableam cannot possibly rank all + * available table blocks sensibly based on the hints we provide, but that's + * okay -- only the extremes matter. The tableam just needs to be able to + * predict which few table blocks will have the most tuples that are safe to + * delete for each deletion operation, with low variance across related + * deletion operations. + */ +static void +_bt_bottomupdel_finish_pending(Page page, BTDedupState state, + TM_IndexDeleteOp *delstate) +{ + bool dupinterval = (state->nitems > 1); + + Assert(state->nitems > 0); + Assert(state->nitems <= state->nhtids); + Assert(state->intervals[state->nintervals].baseoff == state->baseoff); + + for (int i = 0; i < state->nitems; i++) + { + OffsetNumber offnum = state->baseoff + i; + ItemId itemid = PageGetItemId(page, offnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + TM_IndexDelete *ideltid = &delstate->deltids[delstate->ndeltids]; + TM_IndexStatus *istatus = &delstate->status[delstate->ndeltids]; + + if (!BTreeTupleIsPosting(itup)) + { + /* Simple case: A plain non-pivot tuple */ + ideltid->tid = itup->t_tid; + ideltid->id = delstate->ndeltids; + istatus->idxoffnum = offnum; + istatus->knowndeletable = false; /* for now */ + istatus->promising = dupinterval; /* simple rule */ + istatus->freespace = ItemIdGetLength(itemid) + sizeof(ItemIdData); + + delstate->ndeltids++; + } + else + { + /* + * Complicated case: A posting list tuple. + * + * We make the conservative assumption that there can only be at + * most one affected logical row per posting list tuple. There + * will be at most one promising entry in deltids to represent + * this presumed lone logical row. Note that this isn't even + * considered unless the posting list tuple is also in an interval + * of duplicates -- this complicated rule is just a variant of the + * simple rule used to decide if plain index tuples are promising. + */ + int nitem = BTreeTupleGetNPosting(itup); + bool firstpromising = false; + bool lastpromising = false; + + Assert(_bt_posting_valid(itup)); + + if (dupinterval) + { + /* + * Complicated rule: either the first or last TID in the + * posting list gets marked promising (if any at all) + */ + BlockNumber minblocklist, + midblocklist, + maxblocklist; + ItemPointer mintid, + midtid, + maxtid; + + mintid = BTreeTupleGetHeapTID(itup); + midtid = BTreeTupleGetPostingN(itup, nitem / 2); + maxtid = BTreeTupleGetMaxHeapTID(itup); + minblocklist = ItemPointerGetBlockNumber(mintid); + midblocklist = ItemPointerGetBlockNumber(midtid); + maxblocklist = ItemPointerGetBlockNumber(maxtid); + + /* Only entry with predominant table block can be promising */ + firstpromising = (minblocklist == midblocklist); + lastpromising = (!firstpromising && + midblocklist == maxblocklist); + } + + for (int p = 0; p < nitem; p++) + { + ItemPointer htid = BTreeTupleGetPostingN(itup, p); + + ideltid->tid = *htid; + ideltid->id = delstate->ndeltids; + istatus->idxoffnum = offnum; + istatus->knowndeletable = false; /* for now */ + istatus->promising = false; + if ((firstpromising && p == 0) || + (lastpromising && p == nitem - 1)) + istatus->promising = true; + istatus->freespace = sizeof(ItemPointerData); /* at worst */ + + ideltid++; + istatus++; + delstate->ndeltids++; + } + } + } + + if (dupinterval) + { + state->intervals[state->nintervals].nitems = state->nitems; + state->nintervals++; + } + + /* Reset state for next interval */ + state->nhtids = 0; + state->nitems = 0; + state->phystupsize = 0; +} + +/* + * Determine if page non-pivot tuples (data items) are all duplicates of the + * same value -- if they are, deduplication's "single value" strategy should + * be applied. The general goal of this strategy is to ensure that + * nbtsplitloc.c (which uses its own single value strategy) will find a useful + * split point as further duplicates are inserted, and successive rightmost + * page splits occur among pages that store the same duplicate value. When + * the page finally splits, it should end up BTREE_SINGLEVAL_FILLFACTOR% full, + * just like it would if deduplication were disabled. + * + * We expect that affected workloads will require _several_ single value + * strategy deduplication passes (over a page that only stores duplicates) + * before the page is finally split. The first deduplication pass should only + * find regular non-pivot tuples. Later deduplication passes will find + * existing maxpostingsize-capped posting list tuples, which must be skipped + * over. The penultimate pass is generally the first pass that actually + * reaches _bt_singleval_fillfactor(), and so will deliberately leave behind a + * few untouched non-pivot tuples. The final deduplication pass won't free + * any space -- it will skip over everything without merging anything (it + * retraces the steps of the penultimate pass). + * + * Fortunately, having several passes isn't too expensive. Each pass (after + * the first pass) won't spend many cycles on the large posting list tuples + * left by previous passes. Each pass will find a large contiguous group of + * smaller duplicate tuples to merge together at the end of the page. + */ +static bool +_bt_do_singleval(Relation rel, Page page, BTDedupState state, + OffsetNumber minoff, IndexTuple newitem) +{ + int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + ItemId itemid; + IndexTuple itup; + + itemid = PageGetItemId(page, minoff); + itup = (IndexTuple) PageGetItem(page, itemid); + + if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) + { + itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page)); + itup = (IndexTuple) PageGetItem(page, itemid); + + if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) + return true; + } + + return false; +} + +/* + * Lower maxpostingsize when using "single value" strategy, to avoid a sixth + * and final maxpostingsize-capped tuple. The sixth and final posting list + * tuple will end up somewhat smaller than the first five. (Note: The first + * five tuples could actually just be very large duplicate tuples that + * couldn't be merged together at all. Deduplication will simply not modify + * the page when that happens.) + * + * When there are six posting lists on the page (after current deduplication + * pass goes on to create/observe a sixth very large tuple), caller should end + * its deduplication pass. It isn't useful to try to deduplicate items that + * are supposed to end up on the new right sibling page following the + * anticipated page split. A future deduplication pass of future right + * sibling page might take care of it. (This is why the first single value + * strategy deduplication pass for a given leaf page will generally find only + * plain non-pivot tuples -- see _bt_do_singleval() comments.) + */ +static void +_bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz) +{ + Size leftfree; + int reduction; + + /* This calculation needs to match nbtsplitloc.c */ + leftfree = PageGetPageSize(page) - SizeOfPageHeaderData - + MAXALIGN(sizeof(BTPageOpaqueData)); + /* Subtract size of new high key (includes pivot heap TID space) */ + leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData)); + + /* + * Reduce maxpostingsize by an amount equal to target free space on left + * half of page + */ + reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0); + if (state->maxpostingsize > reduction) + state->maxpostingsize -= reduction; + else + state->maxpostingsize = 0; +} + +/* + * Build a posting list tuple based on caller's "base" index tuple and list of + * heap TIDs. When nhtids == 1, builds a standard non-pivot tuple without a + * posting list. (Posting list tuples can never have a single heap TID, partly + * because that ensures that deduplication always reduces final MAXALIGN()'d + * size of entire tuple.) + * + * Convention is that posting list starts at a MAXALIGN()'d offset (rather + * than a SHORTALIGN()'d offset), in line with the approach taken when + * appending a heap TID to new pivot tuple/high key during suffix truncation. + * This sometimes wastes a little space that was only needed as alignment + * padding in the original tuple. Following this convention simplifies the + * space accounting used when deduplicating a page (the same convention + * simplifies the accounting for choosing a point to split a page at). + * + * Note: Caller's "htids" array must be unique and already in ascending TID + * order. Any existing heap TIDs from "base" won't automatically appear in + * returned posting list tuple (they must be included in htids array.) + */ +IndexTuple +_bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids) +{ + uint32 keysize, + newsize; + IndexTuple itup; + + if (BTreeTupleIsPosting(base)) + keysize = BTreeTupleGetPostingOffset(base); + else + keysize = IndexTupleSize(base); + + Assert(!BTreeTupleIsPivot(base)); + Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX); + Assert(keysize == MAXALIGN(keysize)); + + /* Determine final size of new tuple */ + if (nhtids > 1) + newsize = MAXALIGN(keysize + + nhtids * sizeof(ItemPointerData)); + else + newsize = keysize; + + Assert(newsize <= INDEX_SIZE_MASK); + Assert(newsize == MAXALIGN(newsize)); + + /* Allocate memory using palloc0() (matches index_form_tuple()) */ + itup = palloc0(newsize); + memcpy(itup, base, keysize); + itup->t_info &= ~INDEX_SIZE_MASK; + itup->t_info |= newsize; + if (nhtids > 1) + { + /* Form posting list tuple */ + BTreeTupleSetPosting(itup, nhtids, keysize); + memcpy(BTreeTupleGetPosting(itup), htids, + sizeof(ItemPointerData) * nhtids); + Assert(_bt_posting_valid(itup)); + } + else + { + /* Form standard non-pivot tuple */ + itup->t_info &= ~INDEX_ALT_TID_MASK; + ItemPointerCopy(htids, &itup->t_tid); + Assert(ItemPointerIsValid(&itup->t_tid)); + } + + return itup; +} + +/* + * Generate a replacement tuple by "updating" a posting list tuple so that it + * no longer has TIDs that need to be deleted. + * + * Used by both VACUUM and index deletion. Caller's vacposting argument + * points to the existing posting list tuple to be updated. + * + * On return, caller's vacposting argument will point to final "updated" + * tuple, which will be palloc()'d in caller's memory context. + */ +void +_bt_update_posting(BTVacuumPosting vacposting) +{ + IndexTuple origtuple = vacposting->itup; + uint32 keysize, + newsize; + IndexTuple itup; + int nhtids; + int ui, + d; + ItemPointer htids; + + nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids; + + Assert(_bt_posting_valid(origtuple)); + Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple)); + + /* + * Determine final size of new tuple. + * + * This calculation needs to match the code used within _bt_form_posting() + * for new posting list tuples. We avoid calling _bt_form_posting() here + * to save ourselves a second memory allocation for a htids workspace. + */ + keysize = BTreeTupleGetPostingOffset(origtuple); + if (nhtids > 1) + newsize = MAXALIGN(keysize + + nhtids * sizeof(ItemPointerData)); + else + newsize = keysize; + + Assert(newsize <= INDEX_SIZE_MASK); + Assert(newsize == MAXALIGN(newsize)); + + /* Allocate memory using palloc0() (matches index_form_tuple()) */ + itup = palloc0(newsize); + memcpy(itup, origtuple, keysize); + itup->t_info &= ~INDEX_SIZE_MASK; + itup->t_info |= newsize; + + if (nhtids > 1) + { + /* Form posting list tuple */ + BTreeTupleSetPosting(itup, nhtids, keysize); + htids = BTreeTupleGetPosting(itup); + } + else + { + /* Form standard non-pivot tuple */ + itup->t_info &= ~INDEX_ALT_TID_MASK; + htids = &itup->t_tid; + } + + ui = 0; + d = 0; + for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++) + { + if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i) + { + d++; + continue; + } + htids[ui++] = *BTreeTupleGetPostingN(origtuple, i); + } + Assert(ui == nhtids); + Assert(d == vacposting->ndeletedtids); + Assert(nhtids == 1 || _bt_posting_valid(itup)); + Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid)); + + /* vacposting arg's itup will now point to updated version */ + vacposting->itup = itup; +} + +/* + * Prepare for a posting list split by swapping heap TID in newitem with heap + * TID from original posting list (the 'oposting' heap TID located at offset + * 'postingoff'). Modifies newitem, so caller should pass their own private + * copy that can safely be modified. + * + * Returns new posting list tuple, which is palloc()'d in caller's context. + * This is guaranteed to be the same size as 'oposting'. Modified newitem is + * what caller actually inserts. (This happens inside the same critical + * section that performs an in-place update of old posting list using new + * posting list returned here.) + * + * While the keys from newitem and oposting must be opclass equal, and must + * generate identical output when run through the underlying type's output + * function, it doesn't follow that their representations match exactly. + * Caller must avoid assuming that there can't be representational differences + * that make datums from oposting bigger or smaller than the corresponding + * datums from newitem. For example, differences in TOAST input state might + * break a faulty assumption about tuple size (the executor is entitled to + * apply TOAST compression based on its own criteria). It also seems possible + * that further representational variation will be introduced in the future, + * in order to support nbtree features like page-level prefix compression. + * + * See nbtree/README for details on the design of posting list splits. + */ +IndexTuple +_bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff) +{ + int nhtids; + char *replacepos; + char *replaceposright; + Size nmovebytes; + IndexTuple nposting; + + nhtids = BTreeTupleGetNPosting(oposting); + Assert(_bt_posting_valid(oposting)); + + /* + * The postingoff argument originated as a _bt_binsrch_posting() return + * value. It will be 0 in the event of corruption that makes a leaf page + * contain a non-pivot tuple that's somehow identical to newitem (no two + * non-pivot tuples should ever have the same TID). This has been known + * to happen in the field from time to time. + * + * Perform a basic sanity check to catch this case now. + */ + if (!(postingoff > 0 && postingoff < nhtids)) + elog(ERROR, "posting list tuple with %d items cannot be split at offset %d", + nhtids, postingoff); + + /* + * Move item pointers in posting list to make a gap for the new item's + * heap TID. We shift TIDs one place to the right, losing original + * rightmost TID. (nmovebytes must not include TIDs to the left of + * postingoff, nor the existing rightmost/max TID that gets overwritten.) + */ + nposting = CopyIndexTuple(oposting); + replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff); + replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1); + nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData); + memmove(replaceposright, replacepos, nmovebytes); + + /* Fill the gap at postingoff with TID of new item (original new TID) */ + Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem)); + ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos); + + /* Now copy oposting's rightmost/max TID into new item (final new TID) */ + ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid); + + Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting), + BTreeTupleGetHeapTID(newitem)) < 0); + Assert(_bt_posting_valid(nposting)); + + return nposting; +} + +/* + * Verify posting list invariants for "posting", which must be a posting list + * tuple. Used within assertions. + */ +#ifdef USE_ASSERT_CHECKING +static bool +_bt_posting_valid(IndexTuple posting) +{ + ItemPointerData last; + ItemPointer htid; + + if (!BTreeTupleIsPosting(posting) || BTreeTupleGetNPosting(posting) < 2) + return false; + + /* Remember first heap TID for loop */ + ItemPointerCopy(BTreeTupleGetHeapTID(posting), &last); + if (!ItemPointerIsValid(&last)) + return false; + + /* Iterate, starting from second TID */ + for (int i = 1; i < BTreeTupleGetNPosting(posting); i++) + { + htid = BTreeTupleGetPostingN(posting, i); + + if (!ItemPointerIsValid(htid)) + return false; + if (ItemPointerCompare(htid, &last) <= 0) + return false; + ItemPointerCopy(htid, &last); + } + + return true; +} +#endif |