/*------------------------------------------------------------------------- * * nbtsplitloc.c * Choose split point code for Postgres btree implementation. * * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION * src/backend/access/nbtree/nbtsplitloc.c * *------------------------------------------------------------------------- */ #include "postgres.h" #include "access/nbtree.h" #include "storage/lmgr.h" typedef enum { /* strategy for searching through materialized list of split points */ SPLIT_DEFAULT, /* give some weight to truncation */ SPLIT_MANY_DUPLICATES, /* find minimally distinguishing point */ SPLIT_SINGLE_VALUE /* leave left page almost full */ } FindSplitStrat; typedef struct { /* details of free space left by split */ int16 curdelta; /* current leftfree/rightfree delta */ int16 leftfree; /* space left on left page post-split */ int16 rightfree; /* space left on right page post-split */ /* split point identifying fields (returned by _bt_findsplitloc) */ OffsetNumber firstrightoff; /* first origpage item on rightpage */ bool newitemonleft; /* new item goes on left, or right? */ } SplitPoint; typedef struct { /* context data for _bt_recsplitloc */ Relation rel; /* index relation */ Page origpage; /* page undergoing split */ IndexTuple newitem; /* new item (cause of page split) */ Size newitemsz; /* size of newitem (includes line pointer) */ bool is_leaf; /* T if splitting a leaf page */ bool is_rightmost; /* T if splitting rightmost page on level */ OffsetNumber newitemoff; /* where the new item is to be inserted */ int leftspace; /* space available for items on left page */ int rightspace; /* space available for items on right page */ int olddataitemstotal; /* space taken by old items */ Size minfirstrightsz; /* smallest firstright size */ /* candidate split point data */ int maxsplits; /* maximum number of splits */ int nsplits; /* current number of splits */ SplitPoint *splits; /* all candidate split points for page */ int interval; /* current range of acceptable split points */ } FindSplitData; static void _bt_recsplitloc(FindSplitData *state, OffsetNumber firstrightoff, bool newitemonleft, int olddataitemstoleft, Size firstrightofforigpagetuplesz); static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult, bool usemult); static int _bt_splitcmp(const void *arg1, const void *arg2); static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, int leaffillfactor, bool *usemult); static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid); static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, bool *newitemonleft, FindSplitStrat strategy); static int _bt_defaultinterval(FindSplitData *state); static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage, SplitPoint *rightpage, FindSplitStrat *strategy); static void _bt_interval_edges(FindSplitData *state, SplitPoint **leftinterval, SplitPoint **rightinterval); static inline int _bt_split_penalty(FindSplitData *state, SplitPoint *split); static inline IndexTuple _bt_split_lastleft(FindSplitData *state, SplitPoint *split); static inline IndexTuple _bt_split_firstright(FindSplitData *state, SplitPoint *split); /* * _bt_findsplitloc() -- find an appropriate place to split a page. * * The main goal here is to equalize the free space that will be on each * split page, *after accounting for the inserted tuple*. (If we fail to * account for it, we might find ourselves with too little room on the page * that it needs to go into!) * * If the page is the rightmost page on its level, we instead try to arrange * to leave the left split page fillfactor% full. In this way, when we are * inserting successively increasing keys (consider sequences, timestamps, * etc) we will end up with a tree whose pages are about fillfactor% full, * instead of the 50% full result that we'd get without this special case. * This is the same as nbtsort.c produces for a newly-created tree. Note * that leaf and nonleaf pages use different fillfactors. Note also that * there are a number of further special cases where fillfactor is not * applied in the standard way. * * We are passed the intended insert position of the new tuple, expressed as * the offsetnumber of the tuple it must go in front of (this could be * maxoff+1 if the tuple is to go at the end). The new tuple itself is also * passed, since it's needed to give some weight to how effective suffix * truncation will be. The implementation picks the split point that * maximizes the effectiveness of suffix truncation from a small list of * alternative candidate split points that leave each side of the split with * about the same share of free space. Suffix truncation is secondary to * equalizing free space, except in cases with large numbers of duplicates. * Note that it is always assumed that caller goes on to perform truncation, * even with pg_upgrade'd indexes where that isn't actually the case * (!heapkeyspace indexes). See nbtree/README for more information about * suffix truncation. * * We return the index of the first existing tuple that should go on the * righthand page (which is called firstrightoff), plus a boolean * indicating whether the new tuple goes on the left or right page. You * can think of the returned state as a point _between_ two adjacent data * items (laftleft and firstright data items) on an imaginary version of * origpage that already includes newitem. The bool is necessary to * disambiguate the case where firstrightoff == newitemoff (i.e. it is * sometimes needed to determine if the firstright tuple for the split is * newitem rather than the tuple from origpage at offset firstrightoff). */ OffsetNumber _bt_findsplitloc(Relation rel, Page origpage, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool *newitemonleft) { BTPageOpaque opaque; int leftspace, rightspace, olddataitemstotal, olddataitemstoleft, perfectpenalty, leaffillfactor; FindSplitData state; FindSplitStrat strategy; ItemId itemid; OffsetNumber offnum, maxoff, firstrightoff; double fillfactormult; bool usemult; SplitPoint leftpage, rightpage; opaque = BTPageGetOpaque(origpage); maxoff = PageGetMaxOffsetNumber(origpage); /* Total free space available on a btree page, after fixed overhead */ leftspace = rightspace = PageGetPageSize(origpage) - SizeOfPageHeaderData - MAXALIGN(sizeof(BTPageOpaqueData)); /* The right page will have the same high key as the old page */ if (!P_RIGHTMOST(opaque)) { itemid = PageGetItemId(origpage, P_HIKEY); rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData)); } /* Count up total space in data items before actually scanning 'em */ olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(origpage); leaffillfactor = BTGetFillFactor(rel); /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ newitemsz += sizeof(ItemIdData); state.rel = rel; state.origpage = origpage; state.newitem = newitem; state.newitemsz = newitemsz; state.is_leaf = P_ISLEAF(opaque); state.is_rightmost = P_RIGHTMOST(opaque); state.leftspace = leftspace; state.rightspace = rightspace; state.olddataitemstotal = olddataitemstotal; state.minfirstrightsz = SIZE_MAX; state.newitemoff = newitemoff; /* newitem cannot be a posting list item */ Assert(!BTreeTupleIsPosting(newitem)); /* * nsplits should never exceed maxoff because there will be at most as * many candidate split points as there are points _between_ tuples, once * you imagine that the new item is already on the original page (the * final number of splits may be slightly lower because not all points * between tuples will be legal). */ state.maxsplits = maxoff; state.splits = palloc(sizeof(SplitPoint) * state.maxsplits); state.nsplits = 0; /* * Scan through the data items and calculate space usage for a split at * each possible position */ olddataitemstoleft = 0; for (offnum = P_FIRSTDATAKEY(opaque); offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { Size itemsz; itemid = PageGetItemId(origpage, offnum); itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData); /* * When item offset number is not newitemoff, neither side of the * split can be newitem. Record a split after the previous data item * from original page, but before the current data item from original * page. (_bt_recsplitloc() will reject the split when there are no * previous items, which we rely on.) */ if (offnum < newitemoff) _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz); else if (offnum > newitemoff) _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz); else { /* * Record a split after all "offnum < newitemoff" original page * data items, but before newitem */ _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz); /* * Record a split after newitem, but before data item from * original page at offset newitemoff/current offset */ _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz); } olddataitemstoleft += itemsz; } /* * Record a split after all original page data items, but before newitem. * (Though only when it's possible that newitem will end up alone on new * right page.) */ Assert(olddataitemstoleft == olddataitemstotal); if (newitemoff > maxoff) _bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0); /* * I believe it is not possible to fail to find a feasible split, but just * in case ... */ if (state.nsplits == 0) elog(ERROR, "could not find a feasible split point for index \"%s\"", RelationGetRelationName(rel)); /* * Start search for a split point among list of legal split points. Give * primary consideration to equalizing available free space in each half * of the split initially (start with default strategy), while applying * rightmost and split-after-new-item optimizations where appropriate. * Either of the two other fallback strategies may be required for cases * with a large number of duplicates around the original/space-optimal * split point. * * Default strategy gives some weight to suffix truncation in deciding a * split point on leaf pages. It attempts to select a split point where a * distinguishing attribute appears earlier in the new high key for the * left side of the split, in order to maximize the number of trailing * attributes that can be truncated away. Only candidate split points * that imply an acceptable balance of free space on each side are * considered. See _bt_defaultinterval(). */ if (!state.is_leaf) { /* fillfactormult only used on rightmost page */ usemult = state.is_rightmost; fillfactormult = BTREE_NONLEAF_FILLFACTOR / 100.0; } else if (state.is_rightmost) { /* Rightmost leaf page -- fillfactormult always used */ usemult = true; fillfactormult = leaffillfactor / 100.0; } else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult)) { /* * New item inserted at rightmost point among a localized grouping on * a leaf page -- apply "split after new item" optimization, either by * applying leaf fillfactor multiplier, or by choosing the exact split * point that leaves newitem as lastleft. (usemult is set for us.) */ if (usemult) { /* fillfactormult should be set based on leaf fillfactor */ fillfactormult = leaffillfactor / 100.0; } else { /* find precise split point after newitemoff */ for (int i = 0; i < state.nsplits; i++) { SplitPoint *split = state.splits + i; if (split->newitemonleft && newitemoff == split->firstrightoff) { pfree(state.splits); *newitemonleft = true; return newitemoff; } } /* * Cannot legally split after newitemoff; proceed with split * without using fillfactor multiplier. This is defensive, and * should never be needed in practice. */ fillfactormult = 0.50; } } else { /* Other leaf page. 50:50 page split. */ usemult = false; /* fillfactormult not used, but be tidy */ fillfactormult = 0.50; } /* * Save leftmost and rightmost splits for page before original ordinal * sort order is lost by delta/fillfactormult sort */ leftpage = state.splits[0]; rightpage = state.splits[state.nsplits - 1]; /* Give split points a fillfactormult-wise delta, and sort on deltas */ _bt_deltasortsplits(&state, fillfactormult, usemult); /* Determine split interval for default strategy */ state.interval = _bt_defaultinterval(&state); /* * Determine if default strategy/split interval will produce a * sufficiently distinguishing split, or if we should change strategies. * Alternative strategies change the range of split points that are * considered acceptable (split interval), and possibly change * fillfactormult, in order to deal with pages with a large number of * duplicates gracefully. * * Pass low and high splits for the entire page (actually, they're for an * imaginary version of the page that includes newitem). These are used * when the initial split interval encloses split points that are full of * duplicates, and we need to consider if it's even possible to avoid * appending a heap TID. */ perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy); if (strategy == SPLIT_DEFAULT) { /* * Default strategy worked out (always works out with internal page). * Original split interval still stands. */ } /* * Many duplicates strategy is used when a heap TID would otherwise be * appended, but the page isn't completely full of logical duplicates. * * The split interval is widened to include all legal candidate split * points. There might be a few as two distinct values in the whole-page * split interval, though it's also possible that most of the values on * the page are unique. The final split point will either be to the * immediate left or to the immediate right of the group of duplicate * tuples that enclose the first/delta-optimal split point (perfect * penalty was set so that the lowest delta split point that avoids * appending a heap TID will be chosen). Maximizing the number of * attributes that can be truncated away is not a goal of the many * duplicates strategy. * * Single value strategy is used when it is impossible to avoid appending * a heap TID. It arranges to leave the left page very full. This * maximizes space utilization in cases where tuples with the same * attribute values span many pages. Newly inserted duplicates will tend * to have higher heap TID values, so we'll end up splitting to the right * consistently. (Single value strategy is harmless though not * particularly useful with !heapkeyspace indexes.) */ else if (strategy == SPLIT_MANY_DUPLICATES) { Assert(state.is_leaf); /* Shouldn't try to truncate away extra user attributes */ Assert(perfectpenalty == IndexRelationGetNumberOfKeyAttributes(state.rel)); /* No need to resort splits -- no change in fillfactormult/deltas */ state.interval = state.nsplits; } else if (strategy == SPLIT_SINGLE_VALUE) { Assert(state.is_leaf); /* Split near the end of the page */ usemult = true; fillfactormult = BTREE_SINGLEVAL_FILLFACTOR / 100.0; /* Resort split points with new delta */ _bt_deltasortsplits(&state, fillfactormult, usemult); /* Appending a heap TID is unavoidable, so interval of 1 is fine */ state.interval = 1; } /* * Search among acceptable split points (using final split interval) for * the entry that has the lowest penalty, and is therefore expected to * maximize fan-out. Sets *newitemonleft for us. */ firstrightoff = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft, strategy); pfree(state.splits); return firstrightoff; } /* * Subroutine to record a particular point between two tuples (possibly the * new item) on page (ie, combination of firstrightoff and newitemonleft * settings) in *state for later analysis. This is also a convenient point to * check if the split is legal (if it isn't, it won't be recorded). * * firstrightoff is the offset of the first item on the original page that * goes to the right page, and firstrightofforigpagetuplesz is the size of * that tuple. firstrightoff can be > max offset, which means that all the * old items go to the left page and only the new item goes to the right page. * We don't actually use firstrightofforigpagetuplesz in that case (actually, * we don't use it for _any_ split where the firstright tuple happens to be * newitem). * * olddataitemstoleft is the total size of all old items to the left of the * split point that is recorded here when legal. Should not include * newitemsz, since that is handled here. */ static void _bt_recsplitloc(FindSplitData *state, OffsetNumber firstrightoff, bool newitemonleft, int olddataitemstoleft, Size firstrightofforigpagetuplesz) { int16 leftfree, rightfree; Size firstrightsz; Size postingsz = 0; bool newitemisfirstright; /* Is the new item going to be split point's firstright tuple? */ newitemisfirstright = (firstrightoff == state->newitemoff && !newitemonleft); if (newitemisfirstright) firstrightsz = state->newitemsz; else { firstrightsz = firstrightofforigpagetuplesz; /* * Calculate suffix truncation space saving when firstright tuple is a * posting list tuple, though only when the tuple is over 64 bytes * including line pointer overhead (arbitrary). This avoids accessing * the tuple in cases where its posting list must be very small (if * tuple has one at all). * * Note: We don't do this in the case where firstright tuple is * newitem, since newitem cannot have a posting list. */ if (state->is_leaf && firstrightsz > 64) { ItemId itemid; IndexTuple newhighkey; itemid = PageGetItemId(state->origpage, firstrightoff); newhighkey = (IndexTuple) PageGetItem(state->origpage, itemid); if (BTreeTupleIsPosting(newhighkey)) postingsz = IndexTupleSize(newhighkey) - BTreeTupleGetPostingOffset(newhighkey); } } /* Account for all the old tuples */ leftfree = state->leftspace - olddataitemstoleft; rightfree = state->rightspace - (state->olddataitemstotal - olddataitemstoleft); /* * The first item on the right page becomes the high key of the left page; * therefore it counts against left space as well as right space (we * cannot assume that suffix truncation will make it any smaller). When * index has included attributes, then those attributes of left page high * key will be truncated leaving that page with slightly more free space. * However, that shouldn't affect our ability to find valid split * location, since we err in the direction of being pessimistic about free * space on the left half. Besides, even when suffix truncation of * non-TID attributes occurs, the new high key often won't even be a * single MAXALIGN() quantum smaller than the firstright tuple it's based * on. * * If we are on the leaf level, assume that suffix truncation cannot avoid * adding a heap TID to the left half's new high key when splitting at the * leaf level. In practice the new high key will often be smaller and * will rarely be larger, but conservatively assume the worst case. We do * go to the trouble of subtracting away posting list overhead, though * only when it looks like it will make an appreciable difference. * (Posting lists are the only case where truncation will typically make * the final high key far smaller than firstright, so being a bit more * precise there noticeably improves the balance of free space.) */ if (state->is_leaf) leftfree -= (int16) (firstrightsz + MAXALIGN(sizeof(ItemPointerData)) - postingsz); else leftfree -= (int16) firstrightsz; /* account for the new item */ if (newitemonleft) leftfree -= (int16) state->newitemsz; else rightfree -= (int16) state->newitemsz; /* * If we are not on the leaf level, we will be able to discard the key * data from the first item that winds up on the right page. */ if (!state->is_leaf) rightfree += (int16) firstrightsz - (int16) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData)); /* Record split if legal */ if (leftfree >= 0 && rightfree >= 0) { Assert(state->nsplits < state->maxsplits); /* Determine smallest firstright tuple size among legal splits */ state->minfirstrightsz = Min(state->minfirstrightsz, firstrightsz); state->splits[state->nsplits].curdelta = 0; state->splits[state->nsplits].leftfree = leftfree; state->splits[state->nsplits].rightfree = rightfree; state->splits[state->nsplits].firstrightoff = firstrightoff; state->splits[state->nsplits].newitemonleft = newitemonleft; state->nsplits++; } } /* * Subroutine to assign space deltas to materialized array of candidate split * points based on current fillfactor, and to sort array using that fillfactor */ static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult, bool usemult) { for (int i = 0; i < state->nsplits; i++) { SplitPoint *split = state->splits + i; int16 delta; if (usemult) delta = fillfactormult * split->leftfree - (1.0 - fillfactormult) * split->rightfree; else delta = split->leftfree - split->rightfree; if (delta < 0) delta = -delta; /* Save delta */ split->curdelta = delta; } qsort(state->splits, state->nsplits, sizeof(SplitPoint), _bt_splitcmp); } /* * qsort-style comparator used by _bt_deltasortsplits() */ static int _bt_splitcmp(const void *arg1, const void *arg2) { SplitPoint *split1 = (SplitPoint *) arg1; SplitPoint *split2 = (SplitPoint *) arg2; if (split1->curdelta > split2->curdelta) return 1; if (split1->curdelta < split2->curdelta) return -1; return 0; } /* * Subroutine to determine whether or not a non-rightmost leaf page should be * split immediately after the would-be original page offset for the * new/incoming tuple (or should have leaf fillfactor applied when new item is * to the right on original page). This is appropriate when there is a * pattern of localized monotonically increasing insertions into a composite * index, where leading attribute values form local groupings, and we * anticipate further insertions of the same/current grouping (new item's * grouping) in the near future. This can be thought of as a variation on * applying leaf fillfactor during rightmost leaf page splits, since cases * that benefit will converge on packing leaf pages leaffillfactor% full over * time. * * We may leave extra free space remaining on the rightmost page of a "most * significant column" grouping of tuples if that grouping never ends up * having future insertions that use the free space. That effect is * self-limiting; a future grouping that becomes the "nearest on the right" * grouping of the affected grouping usually puts the extra free space to good * use. * * Caller uses optimization when routine returns true, though the exact action * taken by caller varies. Caller uses original leaf page fillfactor in * standard way rather than using the new item offset directly when *usemult * was also set to true here. Otherwise, caller applies optimization by * locating the legal split point that makes the new tuple the lastleft tuple * for the split. */ static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, int leaffillfactor, bool *usemult) { int16 nkeyatts; ItemId itemid; IndexTuple tup; int keepnatts; Assert(state->is_leaf && !state->is_rightmost); nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel); /* Single key indexes not considered here */ if (nkeyatts == 1) return false; /* Ascending insertion pattern never inferred when new item is first */ if (state->newitemoff == P_FIRSTKEY) return false; /* * Only apply optimization on pages with equisized tuples, since ordinal * keys are likely to be fixed-width. Testing if the new tuple is * variable width directly might also work, but that fails to apply the * optimization to indexes with a numeric_ops attribute. * * Conclude that page has equisized tuples when the new item is the same * width as the smallest item observed during pass over page, and other * non-pivot tuples must be the same width as well. (Note that the * possibly-truncated existing high key isn't counted in * olddataitemstotal, and must be subtracted from maxoff.) */ if (state->newitemsz != state->minfirstrightsz) return false; if (state->newitemsz * (maxoff - 1) != state->olddataitemstotal) return false; /* * Avoid applying optimization when tuples are wider than a tuple * consisting of two non-NULL int8/int64 attributes (or four non-NULL * int4/int32 attributes) */ if (state->newitemsz > MAXALIGN(sizeof(IndexTupleData) + sizeof(int64) * 2) + sizeof(ItemIdData)) return false; /* * At least the first attribute's value must be equal to the corresponding * value in previous tuple to apply optimization. New item cannot be a * duplicate, either. * * Handle case where new item is to the right of all items on the existing * page. This is suggestive of monotonically increasing insertions in * itself, so the "heap TID adjacency" test is not applied here. */ if (state->newitemoff > maxoff) { itemid = PageGetItemId(state->origpage, maxoff); tup = (IndexTuple) PageGetItem(state->origpage, itemid); keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); if (keepnatts > 1 && keepnatts <= nkeyatts) { *usemult = true; return true; } return false; } /* * "Low cardinality leading column, high cardinality suffix column" * indexes with a random insertion pattern (e.g., an index with a boolean * column, such as an index on '(book_is_in_print, book_isbn)') present us * with a risk of consistently misapplying the optimization. We're * willing to accept very occasional misapplication of the optimization, * provided the cases where we get it wrong are rare and self-limiting. * * Heap TID adjacency strongly suggests that the item just to the left was * inserted very recently, which limits overapplication of the * optimization. Besides, all inappropriate cases triggered here will * still split in the middle of the page on average. */ itemid = PageGetItemId(state->origpage, OffsetNumberPrev(state->newitemoff)); tup = (IndexTuple) PageGetItem(state->origpage, itemid); /* Do cheaper test first */ if (BTreeTupleIsPosting(tup) || !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) return false; /* Check same conditions as rightmost item case, too */ keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); if (keepnatts > 1 && keepnatts <= nkeyatts) { double interp = (double) state->newitemoff / ((double) maxoff + 1); double leaffillfactormult = (double) leaffillfactor / 100.0; /* * Don't allow caller to split after a new item when it will result in * a split point to the right of the point that a leaf fillfactor * split would use -- have caller apply leaf fillfactor instead */ *usemult = interp > leaffillfactormult; return true; } return false; } /* * Subroutine for determining if two heap TIDS are "adjacent". * * Adjacent means that the high TID is very likely to have been inserted into * heap relation immediately after the low TID, probably during the current * transaction. */ static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid) { BlockNumber lowblk, highblk; lowblk = ItemPointerGetBlockNumber(lowhtid); highblk = ItemPointerGetBlockNumber(highhtid); /* Make optimistic assumption of adjacency when heap blocks match */ if (lowblk == highblk) return true; /* When heap block one up, second offset should be FirstOffsetNumber */ if (lowblk + 1 == highblk && ItemPointerGetOffsetNumber(highhtid) == FirstOffsetNumber) return true; return false; } /* * Subroutine to find the "best" split point among candidate split points. * The best split point is the split point with the lowest penalty among split * points that fall within current/final split interval. Penalty is an * abstract score, with a definition that varies depending on whether we're * splitting a leaf page or an internal page. See _bt_split_penalty() for * details. * * "perfectpenalty" is assumed to be the lowest possible penalty among * candidate split points. This allows us to return early without wasting * cycles on calculating the first differing attribute for all candidate * splits when that clearly cannot improve our choice (or when we only want a * minimally distinguishing split point, and don't want to make the split any * more unbalanced than is necessary). * * We return the index of the first existing tuple that should go on the right * page, plus a boolean indicating if new item is on left of split point. */ static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, bool *newitemonleft, FindSplitStrat strategy) { int bestpenalty, lowsplit; int highsplit = Min(state->interval, state->nsplits); SplitPoint *final; bestpenalty = INT_MAX; lowsplit = 0; for (int i = lowsplit; i < highsplit; i++) { int penalty; penalty = _bt_split_penalty(state, state->splits + i); if (penalty < bestpenalty) { bestpenalty = penalty; lowsplit = i; } if (penalty <= perfectpenalty) break; } final = &state->splits[lowsplit]; /* * There is a risk that the "many duplicates" strategy will repeatedly do * the wrong thing when there are monotonically decreasing insertions to * the right of a large group of duplicates. Repeated splits could leave * a succession of right half pages with free space that can never be * used. This must be avoided. * * Consider the example of the leftmost page in a single integer attribute * NULLS FIRST index which is almost filled with NULLs. Monotonically * decreasing integer insertions might cause the same leftmost page to * split repeatedly at the same point. Each split derives its new high * key from the lowest current value to the immediate right of the large * group of NULLs, which will always be higher than all future integer * insertions, directing all future integer insertions to the same * leftmost page. */ if (strategy == SPLIT_MANY_DUPLICATES && !state->is_rightmost && !final->newitemonleft && final->firstrightoff >= state->newitemoff && final->firstrightoff < state->newitemoff + 9) { /* * Avoid the problem by performing a 50:50 split when the new item is * just to the right of the would-be "many duplicates" split point. * (Note that the test used for an insert that is "just to the right" * of the split point is conservative.) */ final = &state->splits[0]; } *newitemonleft = final->newitemonleft; return final->firstrightoff; } #define LEAF_SPLIT_DISTANCE 0.050 #define INTERNAL_SPLIT_DISTANCE 0.075 /* * Return a split interval to use for the default strategy. This is a limit * on the number of candidate split points to give further consideration to. * Only a fraction of all candidate splits points (those located at the start * of the now-sorted splits array) fall within the split interval. Split * interval is applied within _bt_bestsplitloc(). * * Split interval represents an acceptable range of split points -- those that * have leftfree and rightfree values that are acceptably balanced. The final * split point chosen is the split point with the lowest "penalty" among split * points in this split interval (unless we change our entire strategy, in * which case the interval also changes -- see _bt_strategy()). * * The "Prefix B-Trees" paper calls split interval sigma l for leaf splits, * and sigma b for internal ("branch") splits. It's hard to provide a * theoretical justification for the size of the split interval, though it's * clear that a small split interval can make tuples on level L+1 much smaller * on average, without noticeably affecting space utilization on level L. * (Note that the way that we calculate split interval might need to change if * suffix truncation is taught to truncate tuples "within" the last * attribute/datum for data types like text, which is more or less how it is * assumed to work in the paper.) */ static int _bt_defaultinterval(FindSplitData *state) { SplitPoint *spaceoptimal; int16 tolerance, lowleftfree, lowrightfree, highleftfree, highrightfree; /* * Determine leftfree and rightfree values that are higher and lower than * we're willing to tolerate. Note that the final split interval will be * about 10% of nsplits in the common case where all non-pivot tuples * (data items) from a leaf page are uniformly sized. We're a bit more * aggressive when splitting internal pages. */ if (state->is_leaf) tolerance = state->olddataitemstotal * LEAF_SPLIT_DISTANCE; else tolerance = state->olddataitemstotal * INTERNAL_SPLIT_DISTANCE; /* First candidate split point is the most evenly balanced */ spaceoptimal = state->splits; lowleftfree = spaceoptimal->leftfree - tolerance; lowrightfree = spaceoptimal->rightfree - tolerance; highleftfree = spaceoptimal->leftfree + tolerance; highrightfree = spaceoptimal->rightfree + tolerance; /* * Iterate through split points, starting from the split immediately after * 'spaceoptimal'. Find the first split point that divides free space so * unevenly that including it in the split interval would be unacceptable. */ for (int i = 1; i < state->nsplits; i++) { SplitPoint *split = state->splits + i; /* Cannot use curdelta here, since its value is often weighted */ if (split->leftfree < lowleftfree || split->rightfree < lowrightfree || split->leftfree > highleftfree || split->rightfree > highrightfree) return i; } return state->nsplits; } /* * Subroutine to decide whether split should use default strategy/initial * split interval, or whether it should finish splitting the page using * alternative strategies (this is only possible with leaf pages). * * Caller uses alternative strategy (or sticks with default strategy) based * on how *strategy is set here. Return value is "perfect penalty", which is * passed to _bt_bestsplitloc() as a final constraint on how far caller is * willing to go to avoid appending a heap TID when using the many duplicates * strategy (it also saves _bt_bestsplitloc() useless cycles). */ static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage, SplitPoint *rightpage, FindSplitStrat *strategy) { IndexTuple leftmost, rightmost; SplitPoint *leftinterval, *rightinterval; int perfectpenalty; int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel); /* Assume that alternative strategy won't be used for now */ *strategy = SPLIT_DEFAULT; /* * Use smallest observed firstright item size for entire page (actually, * entire imaginary version of page that includes newitem) as perfect * penalty on internal pages. This can save cycles in the common case * where most or all splits (not just splits within interval) have * firstright tuples that are the same size. */ if (!state->is_leaf) return state->minfirstrightsz; /* * Use leftmost and rightmost tuples from leftmost and rightmost splits in * current split interval */ _bt_interval_edges(state, &leftinterval, &rightinterval); leftmost = _bt_split_lastleft(state, leftinterval); rightmost = _bt_split_firstright(state, rightinterval); /* * If initial split interval can produce a split point that will at least * avoid appending a heap TID in new high key, we're done. Finish split * with default strategy and initial split interval. */ perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); if (perfectpenalty <= indnkeyatts) return perfectpenalty; /* * Work out how caller should finish split when even their "perfect" * penalty for initial/default split interval indicates that the interval * does not contain even a single split that avoids appending a heap TID. * * Use the leftmost split's lastleft tuple and the rightmost split's * firstright tuple to assess every possible split. */ leftmost = _bt_split_lastleft(state, leftpage); rightmost = _bt_split_firstright(state, rightpage); /* * If page (including new item) has many duplicates but is not entirely * full of duplicates, a many duplicates strategy split will be performed. * If page is entirely full of duplicates, a single value strategy split * will be performed. */ perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); if (perfectpenalty <= indnkeyatts) { *strategy = SPLIT_MANY_DUPLICATES; /* * Many duplicates strategy should split at either side the group of * duplicates that enclose the delta-optimal split point. Return * indnkeyatts rather than the true perfect penalty to make that * happen. (If perfectpenalty was returned here then low cardinality * composite indexes could have continual unbalanced splits.) * * Note that caller won't go through with a many duplicates split in * rare cases where it looks like there are ever-decreasing insertions * to the immediate right of the split point. This must happen just * before a final decision is made, within _bt_bestsplitloc(). */ return indnkeyatts; } /* * Single value strategy is only appropriate with ever-increasing heap * TIDs; otherwise, original default strategy split should proceed to * avoid pathological performance. Use page high key to infer if this is * the rightmost page among pages that store the same duplicate value. * This should not prevent insertions of heap TIDs that are slightly out * of order from using single value strategy, since that's expected with * concurrent inserters of the same duplicate value. */ else if (state->is_rightmost) *strategy = SPLIT_SINGLE_VALUE; else { ItemId itemid; IndexTuple hikey; itemid = PageGetItemId(state->origpage, P_HIKEY); hikey = (IndexTuple) PageGetItem(state->origpage, itemid); perfectpenalty = _bt_keep_natts_fast(state->rel, hikey, state->newitem); if (perfectpenalty <= indnkeyatts) *strategy = SPLIT_SINGLE_VALUE; else { /* * Have caller finish split using default strategy, since page * does not appear to be the rightmost page for duplicates of the * value the page is filled with */ } } return perfectpenalty; } /* * Subroutine to locate leftmost and rightmost splits for current/default * split interval. Note that it will be the same split iff there is only one * split in interval. */ static void _bt_interval_edges(FindSplitData *state, SplitPoint **leftinterval, SplitPoint **rightinterval) { int highsplit = Min(state->interval, state->nsplits); SplitPoint *deltaoptimal; deltaoptimal = state->splits; *leftinterval = NULL; *rightinterval = NULL; /* * Delta is an absolute distance to optimal split point, so both the * leftmost and rightmost split point will usually be at the end of the * array */ for (int i = highsplit - 1; i >= 0; i--) { SplitPoint *distant = state->splits + i; if (distant->firstrightoff < deltaoptimal->firstrightoff) { if (*leftinterval == NULL) *leftinterval = distant; } else if (distant->firstrightoff > deltaoptimal->firstrightoff) { if (*rightinterval == NULL) *rightinterval = distant; } else if (!distant->newitemonleft && deltaoptimal->newitemonleft) { /* * "incoming tuple will become firstright" (distant) is to the * left of "incoming tuple will become lastleft" (delta-optimal) */ Assert(distant->firstrightoff == state->newitemoff); if (*leftinterval == NULL) *leftinterval = distant; } else if (distant->newitemonleft && !deltaoptimal->newitemonleft) { /* * "incoming tuple will become lastleft" (distant) is to the right * of "incoming tuple will become firstright" (delta-optimal) */ Assert(distant->firstrightoff == state->newitemoff); if (*rightinterval == NULL) *rightinterval = distant; } else { /* There was only one or two splits in initial split interval */ Assert(distant == deltaoptimal); if (*leftinterval == NULL) *leftinterval = distant; if (*rightinterval == NULL) *rightinterval = distant; } if (*leftinterval && *rightinterval) return; } Assert(false); } /* * Subroutine to find penalty for caller's candidate split point. * * On leaf pages, penalty is the attribute number that distinguishes each side * of a split. It's the last attribute that needs to be included in new high * key for left page. It can be greater than the number of key attributes in * cases where a heap TID will need to be appended during truncation. * * On internal pages, penalty is simply the size of the firstright tuple for * the split (including line pointer overhead). This tuple will become the * new high key for the left page. */ static inline int _bt_split_penalty(FindSplitData *state, SplitPoint *split) { IndexTuple lastleft; IndexTuple firstright; if (!state->is_leaf) { ItemId itemid; if (!split->newitemonleft && split->firstrightoff == state->newitemoff) return state->newitemsz; itemid = PageGetItemId(state->origpage, split->firstrightoff); return MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData); } lastleft = _bt_split_lastleft(state, split); firstright = _bt_split_firstright(state, split); return _bt_keep_natts_fast(state->rel, lastleft, firstright); } /* * Subroutine to get a lastleft IndexTuple for a split point */ static inline IndexTuple _bt_split_lastleft(FindSplitData *state, SplitPoint *split) { ItemId itemid; if (split->newitemonleft && split->firstrightoff == state->newitemoff) return state->newitem; itemid = PageGetItemId(state->origpage, OffsetNumberPrev(split->firstrightoff)); return (IndexTuple) PageGetItem(state->origpage, itemid); } /* * Subroutine to get a firstright IndexTuple for a split point */ static inline IndexTuple _bt_split_firstright(FindSplitData *state, SplitPoint *split) { ItemId itemid; if (!split->newitemonleft && split->firstrightoff == state->newitemoff) return state->newitem; itemid = PageGetItemId(state->origpage, split->firstrightoff); return (IndexTuple) PageGetItem(state->origpage, itemid); }