Diffstat (limited to 'src/backend/access/nbtree/nbtdedup.c')
-rw-r--r--  src/backend/access/nbtree/nbtdedup.c  | 1105
1 file changed, 1105 insertions(+), 0 deletions(-)
diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c
new file mode 100644
index 0000000..d4db0b2
--- /dev/null
+++ b/src/backend/access/nbtree/nbtdedup.c
@@ -0,0 +1,1105 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtdedup.c
+ * Deduplicate or bottom-up delete items in Postgres btrees.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtdedup.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "access/nbtxlog.h"
+#include "access/xloginsert.h"
+#include "miscadmin.h"
+#include "utils/rel.h"
+
+static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state,
+ TM_IndexDeleteOp *delstate);
+static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state,
+ OffsetNumber minoff, IndexTuple newitem);
+static void _bt_singleval_fillfactor(Page page, BTDedupState state,
+ Size newitemsz);
+#ifdef USE_ASSERT_CHECKING
+static bool _bt_posting_valid(IndexTuple posting);
+#endif
+
+/*
+ * Perform a deduplication pass.
+ *
+ * The general approach taken here is to perform as much deduplication as
+ * possible to free as much space as possible. Note, however, that "single
+ * value" strategy is used for !bottomupdedup callers when the page is full of
+ * tuples of a single value. Deduplication passes that apply the strategy
+ * will leave behind a few untouched tuples at the end of the page, preparing
+ * the page for an anticipated page split that uses nbtsplitloc.c's own single
+ * value strategy. Our high level goal is to delay merging the untouched
+ * tuples until after the page splits.
+ *
+ * When a call to _bt_bottomupdel_pass() just took place (and failed), our
+ * high level goal is to prevent a page split entirely by buying more time.
+ * We still hope that a page split can be avoided altogether. That's why
+ * single value strategy is not even considered for bottomupdedup callers.
+ *
+ * The page will have to be split if we cannot successfully free at least
+ * newitemsz (we also need space for newitem's line pointer, which isn't
+ * included in caller's newitemsz).
+ *
+ * Note: Caller should have already deleted all existing items with their
+ * LP_DEAD bits set.
+ */
+void
+_bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz,
+ bool bottomupdedup)
+{
+ OffsetNumber offnum,
+ minoff,
+ maxoff;
+ Page page = BufferGetPage(buf);
+ BTPageOpaque opaque = BTPageGetOpaque(page);
+ Page newpage;
+ BTDedupState state;
+ Size pagesaving PG_USED_FOR_ASSERTS_ONLY = 0;
+ bool singlevalstrat = false;
+ int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+
+ /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
+ newitemsz += sizeof(ItemIdData);
+
+ /*
+ * Initialize deduplication state.
+ *
+ * It would be possible for maxpostingsize (limit on posting list tuple
+ * size) to be set to one third of the page. However, it seems like a
+ * good idea to limit the size of posting lists to one sixth of a page.
+ * That ought to leave us with a good split point when pages full of
+ * duplicates can be split several times.
+ */
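+ /*
+ * As a rough illustration (assuming the default 8KB BLCKSZ): BTMaxItemSize
+ * works out to a little over 2700 bytes, so the Min() below caps
+ * maxpostingsize at roughly 1350 bytes -- about one sixth of the page, as
+ * described above.
+ */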
+ state = (BTDedupState) palloc(sizeof(BTDedupStateData));
+ state->deduplicate = true;
+ state->nmaxitems = 0;
+ state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK);
+ /* Metadata about base tuple of current pending posting list */
+ state->base = NULL;
+ state->baseoff = InvalidOffsetNumber;
+ state->basetupsize = 0;
+ /* Metadata about current pending posting list TIDs */
+ state->htids = palloc(state->maxpostingsize);
+ state->nhtids = 0;
+ state->nitems = 0;
+ /* Size of all physical tuples to be replaced by pending posting list */
+ state->phystupsize = 0;
+ /* nintervals should be initialized to zero */
+ state->nintervals = 0;
+
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /*
+ * Consider applying "single value" strategy, though only if the page
+ * seems likely to be split in the near future
+ */
+ if (!bottomupdedup)
+ singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
+
+ /*
+ * Deduplicate items from page, and write them to newpage.
+ *
+ * Copy the original page's LSN into newpage copy. This will become the
+ * updated version of the page. We need this because XLogInsert will
+ * examine the LSN and possibly dump it in a page image.
+ */
+ newpage = PageGetTempPageCopySpecial(page);
+ PageSetLSN(newpage, PageGetLSN(page));
+
+ /* Copy high key, if any */
+ if (!P_RIGHTMOST(opaque))
+ {
+ ItemId hitemid = PageGetItemId(page, P_HIKEY);
+ Size hitemsz = ItemIdGetLength(hitemid);
+ IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid);
+
+ if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "deduplication failed to add highkey");
+ }
+
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemid = PageGetItemId(page, offnum);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
+
+ Assert(!ItemIdIsDead(itemid));
+
+ if (offnum == minoff)
+ {
+ /*
+ * No previous/base tuple for the data item -- use the data item
+ * as base tuple of pending posting list
+ */
+ _bt_dedup_start_pending(state, itup, offnum);
+ }
+ else if (state->deduplicate &&
+ _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
+ _bt_dedup_save_htid(state, itup))
+ {
+ /*
+ * Tuple is equal to base tuple of pending posting list. Heap
+ * TID(s) for itup have been saved in state.
+ */
+ }
+ else
+ {
+ /*
+ * Tuple is not equal to pending posting list tuple, or
+ * _bt_dedup_save_htid() opted to not merge current item into
+ * pending posting list for some other reason (e.g., adding more
+ * TIDs would have caused posting list to exceed current
+ * maxpostingsize).
+ *
+ * If state contains pending posting list with more than one item,
+ * form new posting tuple and add it to our temp page (newpage).
+ * Else add pending interval's base tuple to the temp page as-is.
+ */
+ pagesaving += _bt_dedup_finish_pending(newpage, state);
+
+ if (singlevalstrat)
+ {
+ /*
+ * Single value strategy's extra steps.
+ *
+ * Lower maxpostingsize for sixth and final large posting list
+ * tuple at the point where 5 maxpostingsize-capped tuples
+ * have either been formed or observed.
+ *
+ * When a sixth maxpostingsize-capped item is formed/observed,
+ * stop merging together tuples altogether. The few tuples
+ * that remain at the end of the page won't be merged together
+ * at all (at least not until after a future page split takes
+ * place, when this page's newly allocated right sibling page
+ * gets its first deduplication pass).
+ */
+ if (state->nmaxitems == 5)
+ _bt_singleval_fillfactor(page, state, newitemsz);
+ else if (state->nmaxitems == 6)
+ {
+ state->deduplicate = false;
+ singlevalstrat = false; /* won't be back here */
+ }
+ }
+
+ /* itup starts new pending posting list */
+ _bt_dedup_start_pending(state, itup, offnum);
+ }
+ }
+
+ /* Handle the last item */
+ pagesaving += _bt_dedup_finish_pending(newpage, state);
+
+ /*
+ * If no items suitable for deduplication were found, newpage must be
+ * exactly the same as the original page, so just return from function.
+ *
+ * We could determine whether or not to proceed on the basis of the space
+ * savings being sufficient to avoid an immediate page split instead. We
+ * don't do that because there is some small value in nbtsplitloc.c always
+ * operating against a page that is fully deduplicated (apart from
+ * newitem). Besides, most of the cost has already been paid.
+ */
+ if (state->nintervals == 0)
+ {
+ /* cannot leak memory here */
+ pfree(newpage);
+ pfree(state->htids);
+ pfree(state);
+ return;
+ }
+
+ /*
+ * By here, it's clear that deduplication will definitely go ahead.
+ *
+ * Clear the BTP_HAS_GARBAGE page flag. The index must be a heapkeyspace
+ * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
+ * But keep things tidy.
+ */
+ if (P_HAS_GARBAGE(opaque))
+ {
+ BTPageOpaque nopaque = BTPageGetOpaque(newpage);
+
+ nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+ }
+
+ START_CRIT_SECTION();
+
+ PageRestoreTempPage(newpage, page);
+ MarkBufferDirty(buf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ XLogRecPtr recptr;
+ xl_btree_dedup xlrec_dedup;
+
+ xlrec_dedup.nintervals = state->nintervals;
+
+ XLogBeginInsert();
+ XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
+ XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup);
+
+ /*
+ * The intervals array is not in the buffer, but pretend that it is.
+ * When XLogInsert stores the whole buffer, the array need not be
+ * stored too.
+ */
+ XLogRegisterBufData(0, (char *) state->intervals,
+ state->nintervals * sizeof(BTDedupInterval));
+
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
+
+ PageSetLSN(page, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /* Local space accounting should agree with page accounting */
+ Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
+
+ /* cannot leak memory here */
+ pfree(state->htids);
+ pfree(state);
+}
+
+/*
+ * Perform bottom-up index deletion pass.
+ *
+ * See if duplicate index tuples (plus certain nearby tuples) are eligible to
+ * be deleted via bottom-up index deletion. The high level goal here is to
+ * entirely prevent "unnecessary" page splits caused by MVCC version churn
+ * from UPDATEs (when the UPDATEs don't logically modify any of the columns
+ * covered by the 'rel' index). This is qualitative, not quantitative -- we
+ * do not particularly care about once-off opportunities to delete many index
+ * tuples together.
+ *
+ * See nbtree/README for details on the design of nbtree bottom-up deletion.
+ * See access/tableam.h for a description of how we're expected to cooperate
+ * with the tableam.
+ *
+ * Returns true on success, in which case caller can assume page split will be
+ * avoided for a reasonable amount of time. Returns false when caller should
+ * deduplicate the page (if possible at all).
+ *
+ * Note: Occasionally we return true despite failing to delete enough items to
+ * avoid a split. This makes caller skip deduplication and go split the page
+ * right away. Our return value is always just advisory information.
+ *
+ * Note: Caller should have already deleted all existing items with their
+ * LP_DEAD bits set.
+ */
+bool
+_bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel,
+ Size newitemsz)
+{
+ OffsetNumber offnum,
+ minoff,
+ maxoff;
+ Page page = BufferGetPage(buf);
+ BTPageOpaque opaque = BTPageGetOpaque(page);
+ BTDedupState state;
+ TM_IndexDeleteOp delstate;
+ bool neverdedup;
+ int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+
+ /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
+ newitemsz += sizeof(ItemIdData);
+
+ /* Initialize deduplication state */
+ state = (BTDedupState) palloc(sizeof(BTDedupStateData));
+ state->deduplicate = true;
+ state->nmaxitems = 0;
+ state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */
+ state->base = NULL;
+ state->baseoff = InvalidOffsetNumber;
+ state->basetupsize = 0;
+ state->htids = palloc(state->maxpostingsize);
+ state->nhtids = 0;
+ state->nitems = 0;
+ state->phystupsize = 0;
+ state->nintervals = 0;
+
+ /*
+ * Initialize tableam state that describes bottom-up index deletion
+ * operation.
+ *
+ * We'll go on to ask the tableam to search for TIDs whose index tuples we
+ * can safely delete. The tableam will search until our leaf page space
+ * target is satisfied, or until the cost of continuing with the tableam
+ * operation seems too high. It focuses its efforts on TIDs associated
+ * with duplicate index tuples that we mark "promising".
+ *
+ * This space target is a little arbitrary. The tableam must be able to
+ * keep the costs and benefits in balance. We provide the tableam with
+ * exhaustive information about what might work, without directly
+ * concerning ourselves with avoiding work during the tableam call. Our
+ * role in costing the bottom-up deletion process is strictly advisory.
+ */
+ delstate.irel = rel;
+ delstate.iblknum = BufferGetBlockNumber(buf);
+ delstate.bottomup = true;
+ delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz);
+ delstate.ndeltids = 0;
+ delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
+ delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
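+ /*
+ * With the default 8KB BLCKSZ, the bottomupfreespace target above comes to
+ * at least 512 bytes (BLCKSZ / 16), or newitemsz when the incoming tuple
+ * is larger than that.
+ */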
+
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemid = PageGetItemId(page, offnum);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
+
+ Assert(!ItemIdIsDead(itemid));
+
+ if (offnum == minoff)
+ {
+ /* itup starts first pending interval */
+ _bt_dedup_start_pending(state, itup, offnum);
+ }
+ else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
+ _bt_dedup_save_htid(state, itup))
+ {
+ /* Tuple is equal; just added its TIDs to pending interval */
+ }
+ else
+ {
+ /* Finalize interval -- move its TIDs to delete state */
+ _bt_bottomupdel_finish_pending(page, state, &delstate);
+
+ /* itup starts new pending interval */
+ _bt_dedup_start_pending(state, itup, offnum);
+ }
+ }
+ /* Finalize final interval -- move its TIDs to delete state */
+ _bt_bottomupdel_finish_pending(page, state, &delstate);
+
+ /*
+ * We don't give up now in the event of having few (or even zero)
+ * promising tuples for the tableam because it's not up to us as the index
+ * AM to manage costs (note that the tableam might have heuristics of its
+ * own that work out what to do). We should at least avoid having our
+ * caller do a useless deduplication pass after we return in the event of
+ * zero promising tuples, though.
+ */
+ neverdedup = false;
+ if (state->nintervals == 0)
+ neverdedup = true;
+
+ pfree(state->htids);
+ pfree(state);
+
+ /* Ask tableam which TIDs are deletable, then physically delete them */
+ _bt_delitems_delete_check(rel, buf, heapRel, &delstate);
+
+ pfree(delstate.deltids);
+ pfree(delstate.status);
+
+ /* Report "success" to caller unconditionally to avoid deduplication */
+ if (neverdedup)
+ return true;
+
+ /* Don't dedup when we won't end up back here any time soon anyway */
+ return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz);
+}
+
+/*
+ * Create a new pending posting list tuple based on caller's base tuple.
+ *
+ * Every tuple processed by deduplication either becomes the base tuple for a
+ * posting list, or gets its heap TID(s) accepted into a pending posting list.
+ * A tuple that starts out as the base tuple for a posting list will only
+ * actually be rewritten within _bt_dedup_finish_pending() when it turns out
+ * that there are duplicates that can be merged into the base tuple.
+ */
+void
+_bt_dedup_start_pending(BTDedupState state, IndexTuple base,
+ OffsetNumber baseoff)
+{
+ Assert(state->nhtids == 0);
+ Assert(state->nitems == 0);
+ Assert(!BTreeTupleIsPivot(base));
+
+ /*
+ * Copy heap TID(s) from new base tuple for new candidate posting list
+ * into working state's array
+ */
+ if (!BTreeTupleIsPosting(base))
+ {
+ memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
+ state->nhtids = 1;
+ state->basetupsize = IndexTupleSize(base);
+ }
+ else
+ {
+ int nposting;
+
+ nposting = BTreeTupleGetNPosting(base);
+ memcpy(state->htids, BTreeTupleGetPosting(base),
+ sizeof(ItemPointerData) * nposting);
+ state->nhtids = nposting;
+ /* basetupsize should not include existing posting list */
+ state->basetupsize = BTreeTupleGetPostingOffset(base);
+ }
+
+ /*
+ * Save new base tuple itself -- it'll be needed if we actually create a
+ * new posting list from new pending posting list.
+ *
+ * Must maintain physical size of all existing tuples (including line
+ * pointer overhead) so that we can calculate space savings on page.
+ */
+ state->nitems = 1;
+ state->base = base;
+ state->baseoff = baseoff;
+ state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
+ /* Also save baseoff in pending state for interval */
+ state->intervals[state->nintervals].baseoff = state->baseoff;
+}
+
+/*
+ * Save itup heap TID(s) into pending posting list where possible.
+ *
+ * Returns bool indicating if the pending posting list managed by state now
+ * includes itup's heap TID(s).
+ */
+bool
+_bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
+{
+ int nhtids;
+ ItemPointer htids;
+ Size mergedtupsz;
+
+ Assert(!BTreeTupleIsPivot(itup));
+
+ if (!BTreeTupleIsPosting(itup))
+ {
+ nhtids = 1;
+ htids = &itup->t_tid;
+ }
+ else
+ {
+ nhtids = BTreeTupleGetNPosting(itup);
+ htids = BTreeTupleGetPosting(itup);
+ }
+
+ /*
+ * Don't append (have caller finish pending posting list as-is) if
+ * appending heap TID(s) from itup would put us over maxpostingsize limit.
+ *
+ * This calculation needs to match the code used within _bt_form_posting()
+ * for new posting list tuples.
+ */
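+ /*
+ * For example (assuming a single-column index whose base tuple is 16
+ * bytes): merging one more plain tuple into a pending list that already
+ * holds 3 TIDs gives MAXALIGN(16 + (3 + 1) * 6) = 40 bytes, since each
+ * ItemPointerData is 6 bytes before alignment.
+ */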
+ mergedtupsz = MAXALIGN(state->basetupsize +
+ (state->nhtids + nhtids) * sizeof(ItemPointerData));
+
+ if (mergedtupsz > state->maxpostingsize)
+ {
+ /*
+ * Count this as an oversized item for single value strategy, though
+ * only when there are 50 TIDs in the final posting list tuple. This
+ * limit (which is fairly arbitrary) avoids confusion about how many
+ * 1/6 of a page tuples have been encountered/created by the current
+ * deduplication pass.
+ *
+ * Note: We deliberately don't consider which deduplication pass
+ * merged together tuples to create this item (could be a previous
+ * deduplication pass, or current pass). See _bt_do_singleval()
+ * comments.
+ */
+ if (state->nhtids > 50)
+ state->nmaxitems++;
+
+ return false;
+ }
+
+ /*
+ * Save heap TIDs to pending posting list tuple -- itup can be merged into
+ * pending posting list
+ */
+ state->nitems++;
+ memcpy(state->htids + state->nhtids, htids,
+ sizeof(ItemPointerData) * nhtids);
+ state->nhtids += nhtids;
+ state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
+
+ return true;
+}
+
+/*
+ * Finalize pending posting list tuple, and add it to the page. Final tuple
+ * is based on saved base tuple, and saved list of heap TIDs.
+ *
+ * Returns space saving from deduplicating to make a new posting list tuple.
+ * Note that this includes line pointer overhead. This is zero in the case
+ * where no deduplication was possible.
+ */
+Size
+_bt_dedup_finish_pending(Page newpage, BTDedupState state)
+{
+ OffsetNumber tupoff;
+ Size tuplesz;
+ Size spacesaving;
+
+ Assert(state->nitems > 0);
+ Assert(state->nitems <= state->nhtids);
+ Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
+
+ tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
+ if (state->nitems == 1)
+ {
+ /* Use original, unchanged base tuple */
+ tuplesz = IndexTupleSize(state->base);
+ Assert(tuplesz == MAXALIGN(IndexTupleSize(state->base)));
+ Assert(tuplesz <= BTMaxItemSize(newpage));
+ if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "deduplication failed to add tuple to page");
+
+ spacesaving = 0;
+ }
+ else
+ {
+ IndexTuple final;
+
+ /* Form a tuple with a posting list */
+ final = _bt_form_posting(state->base, state->htids, state->nhtids);
+ tuplesz = IndexTupleSize(final);
+ Assert(tuplesz <= state->maxpostingsize);
+
+ /* Save final number of items for posting list */
+ state->intervals[state->nintervals].nitems = state->nitems;
+
+ Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
+ Assert(tuplesz <= BTMaxItemSize(newpage));
+ if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false,
+ false) == InvalidOffsetNumber)
+ elog(ERROR, "deduplication failed to add tuple to page");
+
+ pfree(final);
+ spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
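+ /*
+ * For example, replacing three plain 16 byte duplicates (16 bytes plus a
+ * 4 byte line pointer each, so phystupsize == 60) with one 40 byte posting
+ * list tuple saves 60 - (40 + 4) = 16 bytes.
+ */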
+ /* Increment nintervals, since we wrote a new posting list tuple */
+ state->nintervals++;
+ Assert(spacesaving > 0 && spacesaving < BLCKSZ);
+ }
+
+ /* Reset state for next pending posting list */
+ state->nhtids = 0;
+ state->nitems = 0;
+ state->phystupsize = 0;
+
+ return spacesaving;
+}
+
+/*
+ * Finalize interval during bottom-up index deletion.
+ *
+ * During a bottom-up pass we expect that TIDs will be recorded in dedup state
+ * first, and then get moved over to delstate (in variable-sized batches) by
+ * calling here. Call here happens when the number of TIDs in a dedup
+ * interval is known, and interval gets finalized (i.e. when caller sees next
+ * tuple on the page is not a duplicate, or when caller runs out of tuples to
+ * process from leaf page).
+ *
+ * This is where bottom-up deletion determines and remembers which entries are
+ * duplicates. This will be important information to the tableam delete
+ * infrastructure later on. Plain index tuple duplicates are marked
+ * "promising" here, per tableam contract.
+ *
+ * Our approach to marking entries whose TIDs come from posting lists is more
+ * complicated. Posting lists can only be formed by a deduplication pass (or
+ * during an index build), so recent version churn affecting the pointed-to
+ * logical rows is not particularly likely. We may still give a weak signal
+ * about posting list tuples' entries (by marking just one of its TIDs/entries
+ * promising), though this is only a possibility in the event of further
+ * duplicate index tuples in final interval that covers posting list tuple (as
+ * in the plain tuple case). A weak signal/hint will be useful to the tableam
+ * when it has no stronger signal to go with for the deletion operation as a
+ * whole.
+ *
+ * The heuristics we use work well in practice because we only need to give
+ * the tableam the right _general_ idea about where to look. Garbage tends to
+ * naturally get concentrated in relatively few table blocks with workloads
+ * that bottom-up deletion targets. The tableam cannot possibly rank all
+ * available table blocks sensibly based on the hints we provide, but that's
+ * okay -- only the extremes matter. The tableam just needs to be able to
+ * predict which few table blocks will have the most tuples that are safe to
+ * delete for each deletion operation, with low variance across related
+ * deletion operations.
+ */
+static void
+_bt_bottomupdel_finish_pending(Page page, BTDedupState state,
+ TM_IndexDeleteOp *delstate)
+{
+ bool dupinterval = (state->nitems > 1);
+
+ Assert(state->nitems > 0);
+ Assert(state->nitems <= state->nhtids);
+ Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
+
+ for (int i = 0; i < state->nitems; i++)
+ {
+ OffsetNumber offnum = state->baseoff + i;
+ ItemId itemid = PageGetItemId(page, offnum);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
+ TM_IndexDelete *ideltid = &delstate->deltids[delstate->ndeltids];
+ TM_IndexStatus *istatus = &delstate->status[delstate->ndeltids];
+
+ if (!BTreeTupleIsPosting(itup))
+ {
+ /* Simple case: A plain non-pivot tuple */
+ ideltid->tid = itup->t_tid;
+ ideltid->id = delstate->ndeltids;
+ istatus->idxoffnum = offnum;
+ istatus->knowndeletable = false; /* for now */
+ istatus->promising = dupinterval; /* simple rule */
+ istatus->freespace = ItemIdGetLength(itemid) + sizeof(ItemIdData);
+
+ delstate->ndeltids++;
+ }
+ else
+ {
+ /*
+ * Complicated case: A posting list tuple.
+ *
+ * We make the conservative assumption that there can only be at
+ * most one affected logical row per posting list tuple. There
+ * will be at most one promising entry in deltids to represent
+ * this presumed lone logical row. Note that this isn't even
+ * considered unless the posting list tuple is also in an interval
+ * of duplicates -- this complicated rule is just a variant of the
+ * simple rule used to decide if plain index tuples are promising.
+ */
+ int nitem = BTreeTupleGetNPosting(itup);
+ bool firstpromising = false;
+ bool lastpromising = false;
+
+ Assert(_bt_posting_valid(itup));
+
+ if (dupinterval)
+ {
+ /*
+ * Complicated rule: either the first or last TID in the
+ * posting list gets marked promising (if any at all)
+ */
+ BlockNumber minblocklist,
+ midblocklist,
+ maxblocklist;
+ ItemPointer mintid,
+ midtid,
+ maxtid;
+
+ mintid = BTreeTupleGetHeapTID(itup);
+ midtid = BTreeTupleGetPostingN(itup, nitem / 2);
+ maxtid = BTreeTupleGetMaxHeapTID(itup);
+ minblocklist = ItemPointerGetBlockNumber(mintid);
+ midblocklist = ItemPointerGetBlockNumber(midtid);
+ maxblocklist = ItemPointerGetBlockNumber(maxtid);
+
+ /* Only entry with predominant table block can be promising */
+ firstpromising = (minblocklist == midblocklist);
+ lastpromising = (!firstpromising &&
+ midblocklist == maxblocklist);
+ }
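+ /*
+ * For example, a posting list whose TIDs point to table blocks 7, 7, 7
+ * and 9 has minblocklist == midblocklist, so its first TID is marked
+ * promising; blocks 3, 9, 9 and 9 mark the last TID instead; blocks 2, 5
+ * and 9 mark neither.
+ */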
+
+ for (int p = 0; p < nitem; p++)
+ {
+ ItemPointer htid = BTreeTupleGetPostingN(itup, p);
+
+ ideltid->tid = *htid;
+ ideltid->id = delstate->ndeltids;
+ istatus->idxoffnum = offnum;
+ istatus->knowndeletable = false; /* for now */
+ istatus->promising = false;
+ if ((firstpromising && p == 0) ||
+ (lastpromising && p == nitem - 1))
+ istatus->promising = true;
+ istatus->freespace = sizeof(ItemPointerData); /* at worst */
+
+ ideltid++;
+ istatus++;
+ delstate->ndeltids++;
+ }
+ }
+ }
+
+ if (dupinterval)
+ {
+ state->intervals[state->nintervals].nitems = state->nitems;
+ state->nintervals++;
+ }
+
+ /* Reset state for next interval */
+ state->nhtids = 0;
+ state->nitems = 0;
+ state->phystupsize = 0;
+}
+
+/*
+ * Determine if page non-pivot tuples (data items) are all duplicates of the
+ * same value -- if they are, deduplication's "single value" strategy should
+ * be applied. The general goal of this strategy is to ensure that
+ * nbtsplitloc.c (which uses its own single value strategy) will find a useful
+ * split point as further duplicates are inserted, and successive rightmost
+ * page splits occur among pages that store the same duplicate value. When
+ * the page finally splits, it should end up BTREE_SINGLEVAL_FILLFACTOR% full,
+ * just like it would if deduplication were disabled.
+ *
+ * We expect that affected workloads will require _several_ single value
+ * strategy deduplication passes (over a page that only stores duplicates)
+ * before the page is finally split. The first deduplication pass should only
+ * find regular non-pivot tuples. Later deduplication passes will find
+ * existing maxpostingsize-capped posting list tuples, which must be skipped
+ * over. The penultimate pass is generally the first pass that actually
+ * reaches _bt_singleval_fillfactor(), and so will deliberately leave behind a
+ * few untouched non-pivot tuples. The final deduplication pass won't free
+ * any space -- it will skip over everything without merging anything (it
+ * retraces the steps of the penultimate pass).
+ *
+ * Fortunately, having several passes isn't too expensive. Each pass (after
+ * the first pass) won't spend many cycles on the large posting list tuples
+ * left by previous passes. Each pass will find a large contiguous group of
+ * smaller duplicate tuples to merge together at the end of the page.
+ */
+static bool
+_bt_do_singleval(Relation rel, Page page, BTDedupState state,
+ OffsetNumber minoff, IndexTuple newitem)
+{
+ int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+ ItemId itemid;
+ IndexTuple itup;
+
+ itemid = PageGetItemId(page, minoff);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+
+ if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
+ {
+ itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
+ itup = (IndexTuple) PageGetItem(page, itemid);
+
+ if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Lower maxpostingsize when using "single value" strategy, to avoid a sixth
+ * and final maxpostingsize-capped tuple. The sixth and final posting list
+ * tuple will end up somewhat smaller than the first five. (Note: The first
+ * five tuples could actually just be very large duplicate tuples that
+ * couldn't be merged together at all. Deduplication will simply not modify
+ * the page when that happens.)
+ *
+ * When there are six posting lists on the page (after current deduplication
+ * pass goes on to create/observe a sixth very large tuple), caller should end
+ * its deduplication pass. It isn't useful to try to deduplicate items that
+ * are supposed to end up on the new right sibling page following the
+ * anticipated page split. A future deduplication pass of future right
+ * sibling page might take care of it. (This is why the first single value
+ * strategy deduplication pass for a given leaf page will generally find only
+ * plain non-pivot tuples -- see _bt_do_singleval() comments.)
+ */
+static void
+_bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
+{
+ Size leftfree;
+ int reduction;
+
+ /* This calculation needs to match nbtsplitloc.c */
+ leftfree = PageGetPageSize(page) - SizeOfPageHeaderData -
+ MAXALIGN(sizeof(BTPageOpaqueData));
+ /* Subtract size of new high key (includes pivot heap TID space) */
+ leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData));
+
+ /*
+ * Reduce maxpostingsize by an amount equal to target free space on left
+ * half of page
+ */
+ reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0);
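+ /*
+ * Assuming the default 8KB BLCKSZ (and BTREE_SINGLEVAL_FILLFACTOR of 96),
+ * the reduction comes to roughly 4% of leftfree -- a little over 300 bytes
+ * for a typical newitemsz.
+ */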
+ if (state->maxpostingsize > reduction)
+ state->maxpostingsize -= reduction;
+ else
+ state->maxpostingsize = 0;
+}
+
+/*
+ * Build a posting list tuple based on caller's "base" index tuple and list of
+ * heap TIDs. When nhtids == 1, builds a standard non-pivot tuple without a
+ * posting list. (Posting list tuples can never have a single heap TID, partly
+ * because that ensures that deduplication always reduces final MAXALIGN()'d
+ * size of entire tuple.)
+ *
+ * Convention is that posting list starts at a MAXALIGN()'d offset (rather
+ * than a SHORTALIGN()'d offset), in line with the approach taken when
+ * appending a heap TID to new pivot tuple/high key during suffix truncation.
+ * This sometimes wastes a little space that was only needed as alignment
+ * padding in the original tuple. Following this convention simplifies the
+ * space accounting used when deduplicating a page (the same convention
+ * simplifies the accounting for choosing a point to split a page at).
+ *
+ * Note: Caller's "htids" array must be unique and already in ascending TID
+ * order. Any existing heap TIDs from "base" won't automatically appear in
+ * returned posting list tuple (they must be included in htids array.)
+ */
+IndexTuple
+_bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
+{
+ uint32 keysize,
+ newsize;
+ IndexTuple itup;
+
+ if (BTreeTupleIsPosting(base))
+ keysize = BTreeTupleGetPostingOffset(base);
+ else
+ keysize = IndexTupleSize(base);
+
+ Assert(!BTreeTupleIsPivot(base));
+ Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
+ Assert(keysize == MAXALIGN(keysize));
+
+ /* Determine final size of new tuple */
+ if (nhtids > 1)
+ newsize = MAXALIGN(keysize +
+ nhtids * sizeof(ItemPointerData));
+ else
+ newsize = keysize;
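+ /*
+ * For example, a 16 byte base key (8 byte tuple header plus one 8 byte
+ * attribute) with nhtids == 3 yields newsize = MAXALIGN(16 + 18) = 40;
+ * with nhtids == 1 the tuple keeps its original 16 bytes.
+ */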
+
+ Assert(newsize <= INDEX_SIZE_MASK);
+ Assert(newsize == MAXALIGN(newsize));
+
+ /* Allocate memory using palloc0() (matches index_form_tuple()) */
+ itup = palloc0(newsize);
+ memcpy(itup, base, keysize);
+ itup->t_info &= ~INDEX_SIZE_MASK;
+ itup->t_info |= newsize;
+ if (nhtids > 1)
+ {
+ /* Form posting list tuple */
+ BTreeTupleSetPosting(itup, nhtids, keysize);
+ memcpy(BTreeTupleGetPosting(itup), htids,
+ sizeof(ItemPointerData) * nhtids);
+ Assert(_bt_posting_valid(itup));
+ }
+ else
+ {
+ /* Form standard non-pivot tuple */
+ itup->t_info &= ~INDEX_ALT_TID_MASK;
+ ItemPointerCopy(htids, &itup->t_tid);
+ Assert(ItemPointerIsValid(&itup->t_tid));
+ }
+
+ return itup;
+}
+
+/*
+ * Generate a replacement tuple by "updating" a posting list tuple so that it
+ * no longer has TIDs that need to be deleted.
+ *
+ * Used by both VACUUM and index deletion. Caller's vacposting argument
+ * points to the existing posting list tuple to be updated.
+ *
+ * On return, caller's vacposting argument will point to final "updated"
+ * tuple, which will be palloc()'d in caller's memory context.
+ */
+void
+_bt_update_posting(BTVacuumPosting vacposting)
+{
+ IndexTuple origtuple = vacposting->itup;
+ uint32 keysize,
+ newsize;
+ IndexTuple itup;
+ int nhtids;
+ int ui,
+ d;
+ ItemPointer htids;
+
+ nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;
+
+ Assert(_bt_posting_valid(origtuple));
+ Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));
+
+ /*
+ * Determine final size of new tuple.
+ *
+ * This calculation needs to match the code used within _bt_form_posting()
+ * for new posting list tuples. We avoid calling _bt_form_posting() here
+ * to save ourselves a second memory allocation for a htids workspace.
+ */
+ keysize = BTreeTupleGetPostingOffset(origtuple);
+ if (nhtids > 1)
+ newsize = MAXALIGN(keysize +
+ nhtids * sizeof(ItemPointerData));
+ else
+ newsize = keysize;
+
+ Assert(newsize <= INDEX_SIZE_MASK);
+ Assert(newsize == MAXALIGN(newsize));
+
+ /* Allocate memory using palloc0() (matches index_form_tuple()) */
+ itup = palloc0(newsize);
+ memcpy(itup, origtuple, keysize);
+ itup->t_info &= ~INDEX_SIZE_MASK;
+ itup->t_info |= newsize;
+
+ if (nhtids > 1)
+ {
+ /* Form posting list tuple */
+ BTreeTupleSetPosting(itup, nhtids, keysize);
+ htids = BTreeTupleGetPosting(itup);
+ }
+ else
+ {
+ /* Form standard non-pivot tuple */
+ itup->t_info &= ~INDEX_ALT_TID_MASK;
+ htids = &itup->t_tid;
+ }
+
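+ /*
+ * For example, if origtuple holds 4 TIDs and deletetids[] is {1, 3}, the
+ * loop below copies TIDs 0 and 2 into the new tuple, leaving ui == 2 ==
+ * nhtids when it finishes.
+ */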
+ ui = 0;
+ d = 0;
+ for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
+ {
+ if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
+ {
+ d++;
+ continue;
+ }
+ htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
+ }
+ Assert(ui == nhtids);
+ Assert(d == vacposting->ndeletedtids);
+ Assert(nhtids == 1 || _bt_posting_valid(itup));
+ Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid));
+
+ /* vacposting arg's itup will now point to updated version */
+ vacposting->itup = itup;
+}
+
+/*
+ * Prepare for a posting list split by swapping heap TID in newitem with heap
+ * TID from original posting list (the 'oposting' heap TID located at offset
+ * 'postingoff'). Modifies newitem, so caller should pass their own private
+ * copy that can safely be modified.
+ *
+ * Returns new posting list tuple, which is palloc()'d in caller's context.
+ * This is guaranteed to be the same size as 'oposting'. Modified newitem is
+ * what caller actually inserts. (This happens inside the same critical
+ * section that performs an in-place update of old posting list using new
+ * posting list returned here.)
+ *
+ * While the keys from newitem and oposting must be opclass equal, and must
+ * generate identical output when run through the underlying type's output
+ * function, it doesn't follow that their representations match exactly.
+ * Caller must avoid assuming that there can't be representational differences
+ * that make datums from oposting bigger or smaller than the corresponding
+ * datums from newitem. For example, differences in TOAST input state might
+ * break a faulty assumption about tuple size (the executor is entitled to
+ * apply TOAST compression based on its own criteria). It also seems possible
+ * that further representational variation will be introduced in the future,
+ * in order to support nbtree features like page-level prefix compression.
+ *
+ * See nbtree/README for details on the design of posting list splits.
+ */
+IndexTuple
+_bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
+{
+ int nhtids;
+ char *replacepos;
+ char *replaceposright;
+ Size nmovebytes;
+ IndexTuple nposting;
+
+ nhtids = BTreeTupleGetNPosting(oposting);
+ Assert(_bt_posting_valid(oposting));
+
+ /*
+ * The postingoff argument originated as a _bt_binsrch_posting() return
+ * value. It will be 0 in the event of corruption that makes a leaf page
+ * contain a non-pivot tuple that's somehow identical to newitem (no two
+ * non-pivot tuples should ever have the same TID). This has been known
+ * to happen in the field from time to time.
+ *
+ * Perform a basic sanity check to catch this case now.
+ */
+ if (!(postingoff > 0 && postingoff < nhtids))
+ elog(ERROR, "posting list tuple with %d items cannot be split at offset %d",
+ nhtids, postingoff);
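+
+ /*
+ * For example, if oposting holds TIDs (1,1) (1,3) (1,4) and newitem's TID
+ * is (1,2), then postingoff is 1: the memmove below shifts (1,3) one place
+ * to the right (overwriting (1,4)), newitem's original TID fills the gap,
+ * and newitem then takes over the old maximum TID (1,4). The resulting
+ * nposting holds (1,1) (1,2) (1,3) and is the same size as oposting.
+ */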
+
+ /*
+ * Move item pointers in posting list to make a gap for the new item's
+ * heap TID. We shift TIDs one place to the right, losing original
+ * rightmost TID. (nmovebytes must not include TIDs to the left of
+ * postingoff, nor the existing rightmost/max TID that gets overwritten.)
+ */
+ nposting = CopyIndexTuple(oposting);
+ replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
+ replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
+ nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
+ memmove(replaceposright, replacepos, nmovebytes);
+
+ /* Fill the gap at postingoff with TID of new item (original new TID) */
+ Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
+ ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);
+
+ /* Now copy oposting's rightmost/max TID into new item (final new TID) */
+ ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid);
+
+ Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
+ BTreeTupleGetHeapTID(newitem)) < 0);
+ Assert(_bt_posting_valid(nposting));
+
+ return nposting;
+}
+
+/*
+ * Verify posting list invariants for "posting", which must be a posting list
+ * tuple. Used within assertions.
+ */
+#ifdef USE_ASSERT_CHECKING
+static bool
+_bt_posting_valid(IndexTuple posting)
+{
+ ItemPointerData last;
+ ItemPointer htid;
+
+ if (!BTreeTupleIsPosting(posting) || BTreeTupleGetNPosting(posting) < 2)
+ return false;
+
+ /* Remember first heap TID for loop */
+ ItemPointerCopy(BTreeTupleGetHeapTID(posting), &last);
+ if (!ItemPointerIsValid(&last))
+ return false;
+
+ /* Iterate, starting from second TID */
+ for (int i = 1; i < BTreeTupleGetNPosting(posting); i++)
+ {
+ htid = BTreeTupleGetPostingN(posting, i);
+
+ if (!ItemPointerIsValid(htid))
+ return false;
+ if (ItemPointerCompare(htid, &last) <= 0)
+ return false;
+ ItemPointerCopy(htid, &last);
+ }
+
+ return true;
+}
+#endif