Diffstat

 -rw-r--r--   src/backend/access/nbtree/Makefile        |   28
 -rw-r--r--   src/backend/access/nbtree/README          | 1056
 -rw-r--r--   src/backend/access/nbtree/nbtcompare.c    |  335
 -rw-r--r--   src/backend/access/nbtree/nbtdedup.c      | 1098
 -rw-r--r--   src/backend/access/nbtree/nbtinsert.c     | 3009
 -rw-r--r--   src/backend/access/nbtree/nbtpage.c       | 3073
 -rw-r--r--   src/backend/access/nbtree/nbtree.c        | 1446
 -rw-r--r--   src/backend/access/nbtree/nbtsearch.c     | 2501
 -rw-r--r--   src/backend/access/nbtree/nbtsort.c       | 2016
 -rw-r--r--   src/backend/access/nbtree/nbtsplitloc.c   | 1190
 -rw-r--r--   src/backend/access/nbtree/nbtutils.c      | 2751
 -rw-r--r--   src/backend/access/nbtree/nbtvalidate.c   |  380
 -rw-r--r--   src/backend/access/nbtree/nbtxlog.c       | 1126

 13 files changed, 20009 insertions, 0 deletions
diff --git a/src/backend/access/nbtree/Makefile b/src/backend/access/nbtree/Makefile
new file mode 100644
index 0000000..d69808e
--- /dev/null
+++ b/src/backend/access/nbtree/Makefile
@@ -0,0 +1,28 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+#    Makefile for access/nbtree
+#
+# IDENTIFICATION
+#    src/backend/access/nbtree/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/access/nbtree
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+	nbtcompare.o \
+	nbtdedup.o \
+	nbtinsert.o \
+	nbtpage.o \
+	nbtree.o \
+	nbtsearch.o \
+	nbtsort.o \
+	nbtsplitloc.o \
+	nbtutils.o \
+	nbtvalidate.o \
+	nbtxlog.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
new file mode 100644
index 0000000..bfe33b6
--- /dev/null
+++ b/src/backend/access/nbtree/README
@@ -0,0 +1,1056 @@
+src/backend/access/nbtree/README
+
+Btree Indexing
+==============
+
+This directory contains a correct implementation of Lehman and Yao's
+high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
+Efficient Locking for Concurrent Operations on B-Trees, ACM Transactions
+on Database Systems, Vol 6, No. 4, December 1981, pp 650-670).  We also
+use a simplified version of the deletion logic described in Lanin and
+Shasha (V. Lanin and D. Shasha, A Symmetric Concurrent B-Tree Algorithm,
+Proceedings of 1986 Fall Joint Computer Conference, pp 380-389).
+
+The basic Lehman & Yao Algorithm
+--------------------------------
+
+Compared to a classic B-tree, L&Y adds a right-link pointer to each page,
+to the page's right sibling.  It also adds a "high key" to each page, which
+is an upper bound on the keys that are allowed on that page.  These two
+additions make it possible to detect a concurrent page split, which allows
+the tree to be searched without holding any read locks (except to keep a
+single page from being modified while reading it).
+
+When a search follows a downlink to a child page, it compares the page's
+high key with the search key.  If the search key is greater than the high
+key, the page must've been split concurrently, and you must follow the
+right-link to find the new page containing the key range you're looking
+for.  This might need to be repeated, if the page has been split more than
+once.
+
+Lehman and Yao talk about alternating "separator" keys and downlinks in
+internal pages rather than tuples or records.  We use the term "pivot"
+tuple to refer to tuples which don't point to heap tuples, that are used
+only for tree navigation.  All tuples on non-leaf pages and high keys on
+leaf pages are pivot tuples.  Since pivot tuples are only used to represent
+which part of the key space belongs on each page, they can have attribute
+values copied from non-pivot tuples that were deleted and killed by VACUUM
+some time ago.  A pivot tuple may contain a "separator" key and downlink,
+just a separator key (i.e. the downlink value is implicitly undefined), or
+just a downlink (i.e. all attributes are truncated away).
+
+The requirement that all btree keys be unique is satisfied by treating heap
+TID as a tiebreaker attribute.  Logical duplicates are sorted in heap TID
+order.
This is necessary because Lehman and Yao also require that the key +range for a subtree S is described by Ki < v <= Ki+1 where Ki and Ki+1 are +the adjacent keys in the parent page (Ki must be _strictly_ less than v, +which is assured by having reliably unique keys). Keys are always unique +on their level, with the exception of a leaf page's high key, which can be +fully equal to the last item on the page. + +The Postgres implementation of suffix truncation must make sure that the +Lehman and Yao invariants hold, and represents that absent/truncated +attributes in pivot tuples have the sentinel value "minus infinity". The +later section on suffix truncation will be helpful if it's unclear how the +Lehman & Yao invariants work with a real world example. + +Differences to the Lehman & Yao algorithm +----------------------------------------- + +We have made the following changes in order to incorporate the L&Y algorithm +into Postgres: + +Lehman and Yao don't require read locks, but assume that in-memory +copies of tree pages are unshared. Postgres shares in-memory buffers +among backends. As a result, we do page-level read locking on btree +pages in order to guarantee that no record is modified while we are +examining it. This reduces concurrency but guarantees correct +behavior. + +We support the notion of an ordered "scan" of an index as well as +insertions, deletions, and simple lookups. A scan in the forward +direction is no problem, we just use the right-sibling pointers that +L&Y require anyway. (Thus, once we have descended the tree to the +correct start point for the scan, the scan looks only at leaf pages +and never at higher tree levels.) To support scans in the backward +direction, we also store a "left sibling" link much like the "right +sibling". (This adds an extra step to the L&Y split algorithm: while +holding the write lock on the page being split, we also lock its former +right sibling to update that page's left-link. This is safe since no +writer of that page can be interested in acquiring a write lock on our +page.) A backwards scan has one additional bit of complexity: after +following the left-link we must account for the possibility that the +left sibling page got split before we could read it. So, we have to +move right until we find a page whose right-link matches the page we +came from. (Actually, it's even harder than that; see page deletion +discussion below.) + +Page read locks are held only for as long as a scan is examining a page. +To minimize lock/unlock traffic, an index scan always searches a leaf page +to identify all the matching items at once, copying their heap tuple IDs +into backend-local storage. The heap tuple IDs are then processed while +not holding any page lock within the index. We do continue to hold a pin +on the leaf page in some circumstances, to protect against concurrent +deletions (see below). In this state the scan is effectively stopped +"between" pages, either before or after the page it has pinned. This is +safe in the presence of concurrent insertions and even page splits, because +items are never moved across pre-existing page boundaries --- so the scan +cannot miss any items it should have seen, nor accidentally return the same +item twice. The scan must remember the page's right-link at the time it +was scanned, since that is the page to move right to; if we move right to +the current right-link then we'd re-scan any items moved by a page split. 
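To make the move-right behavior concrete, the following is a minimal standalone
sketch, not part of the patch above: the Page struct, its fields, and the use of
plain integer keys are invented for illustration, and all buffer pin/lock
management is omitted.

    #include <stdbool.h>
    #include <stddef.h>

    typedef struct Page
    {
        int          high_key;      /* upper bound on keys allowed on this page */
        bool         is_rightmost;  /* rightmost page's high key is +infinity */
        struct Page *right_link;    /* L&Y right-link to the right sibling */
    } Page;

    /*
     * After arriving on "page" (say, by following a downlink), keep moving
     * right while the search key exceeds the page's high key; that condition
     * means the page was split after we obtained the pointer we followed.
     */
    Page *
    move_right_if_needed(Page *page, int search_key)
    {
        while (!page->is_rightmost && search_key > page->high_key)
            page = page->right_link;
        return page;
    }

    int
    main(void)
    {
        Page right = { .high_key = 0, .is_rightmost = true, .right_link = NULL };
        Page left = { .high_key = 40, .is_rightmost = false, .right_link = &right };

        /* A search key of 55 exceeds the left page's high key, so a split must
         * have moved that part of the key space to the right sibling. */
        return move_right_if_needed(&left, 55) == &right ? 0 : 1;
    }

Only the key-space test is shown here; the real code also has to acquire and
release page locks while stepping right.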
+We don't similarly remember the left-link, since it's best to use the most +up-to-date left-link when trying to move left (see detailed move-left +algorithm below). + +In most cases we release our lock and pin on a page before attempting +to acquire pin and lock on the page we are moving to. In a few places +it is necessary to lock the next page before releasing the current one. +This is safe when moving right or up, but not when moving left or down +(else we'd create the possibility of deadlocks). + +Lehman and Yao fail to discuss what must happen when the root page +becomes full and must be split. Our implementation is to split the +root in the same way that any other page would be split, then construct +a new root page holding pointers to both of the resulting pages (which +now become siblings on the next level of the tree). The new root page +is then installed by altering the root pointer in the meta-data page (see +below). This works because the root is not treated specially in any +other way --- in particular, searches will move right using its link +pointer if the link is set. Therefore, searches will find the data +that's been moved into the right sibling even if they read the meta-data +page before it got updated. This is the same reasoning that makes a +split of a non-root page safe. The locking considerations are similar too. + +When an inserter recurses up the tree, splitting internal pages to insert +links to pages inserted on the level below, it is possible that it will +need to access a page above the level that was the root when it began its +descent (or more accurately, the level that was the root when it read the +meta-data page). In this case the stack it made while descending does not +help for finding the correct page. When this happens, we find the correct +place by re-descending the tree until we reach the level one above the +level we need to insert a link to, and then moving right as necessary. +(Typically this will take only two fetches, the meta-data page and the new +root, but in principle there could have been more than one root split +since we saw the root. We can identify the correct tree level by means of +the level numbers stored in each page. The situation is rare enough that +we do not need a more efficient solution.) + +Lehman and Yao must couple/chain locks as part of moving right when +relocating a child page's downlink during an ascent of the tree. This is +the only point where Lehman and Yao have to simultaneously hold three +locks (a lock on the child, the original parent, and the original parent's +right sibling). We don't need to couple internal page locks for pages on +the same level, though. We match a child's block number to a downlink +from a pivot tuple one level up, whereas Lehman and Yao match on the +separator key associated with the downlink that was followed during the +initial descent. We can release the lock on the original parent page +before acquiring a lock on its right sibling, since there is never any +need to deal with the case where the separator key that we must relocate +becomes the original parent's high key. Lanin and Shasha don't couple +locks here either, though they also don't couple locks between levels +during ascents. They are willing to "wait and try again" to avoid races. +Their algorithm is optimistic, which means that "an insertion holds no +more than one write lock at a time during its ascent". 
We more or less +stick with Lehman and Yao's approach of conservatively coupling parent and +child locks when ascending the tree, since it's far simpler. + +Lehman and Yao assume fixed-size keys, but we must deal with +variable-size keys. Therefore there is not a fixed maximum number of +keys per page; we just stuff in as many as will fit. When we split a +page, we try to equalize the number of bytes, not items, assigned to +pages (though suffix truncation is also considered). Note we must include +the incoming item in this calculation, otherwise it is possible to find +that the incoming item doesn't fit on the split page where it needs to go! + +Deleting index tuples during VACUUM +----------------------------------- + +Before deleting a leaf item, we get a super-exclusive lock on the target +page, so that no other backend has a pin on the page when the deletion +starts. This is not necessary for correctness in terms of the btree index +operations themselves; as explained above, index scans logically stop +"between" pages and so can't lose their place. The reason we do it is to +provide an interlock between VACUUM and indexscans. Since VACUUM deletes +index entries before reclaiming heap tuple line pointers, the +super-exclusive lock guarantees that VACUUM can't reclaim for re-use a +line pointer that an indexscanning process might be about to visit. This +guarantee works only for simple indexscans that visit the heap in sync +with the index scan, not for bitmap scans. We only need the guarantee +when using non-MVCC snapshot rules; when using an MVCC snapshot, it +doesn't matter if the heap tuple is replaced with an unrelated tuple at +the same TID, because the new tuple won't be visible to our scan anyway. +Therefore, a scan using an MVCC snapshot which has no other confounding +factors will not hold the pin after the page contents are read. The +current reasons for exceptions, where a pin is still needed, are if the +index is not WAL-logged or if the scan is an index-only scan. If later +work allows the pin to be dropped for all cases we will be able to +simplify the vacuum code, since the concept of a super-exclusive lock +for btree indexes will no longer be needed. + +Because a pin is not always held, and a page can be split even while +someone does hold a pin on it, it is possible that an indexscan will +return items that are no longer stored on the page it has a pin on, but +rather somewhere to the right of that page. To ensure that VACUUM can't +prematurely remove such heap tuples, we require btbulkdelete to obtain a +super-exclusive lock on every leaf page in the index, even pages that +don't contain any deletable tuples. Any scan which could yield incorrect +results if the tuple at a TID matching the scan's range and filter +conditions were replaced by a different tuple while the scan is in +progress must hold the pin on each index page until all index entries read +from the page have been processed. This guarantees that the btbulkdelete +call cannot return while any indexscan is still holding a copy of a +deleted index tuple if the scan could be confused by that. Note that this +requirement does not say that btbulkdelete must visit the pages in any +particular order. (See also simple deletion and bottom-up deletion, +below.) + +There is no such interlocking for deletion of items in internal pages, +since backends keep no lock nor pin on a page they have descended past. 
+Hence, when a backend is ascending the tree using its stack, it must +be prepared for the possibility that the item it wants is to the left of +the recorded position (but it can't have moved left out of the recorded +page). Since we hold a lock on the lower page (per L&Y) until we have +re-found the parent item that links to it, we can be assured that the +parent item does still exist and can't have been deleted. + +VACUUM's linear scan, concurrent page splits +-------------------------------------------- + +VACUUM accesses the index by doing a linear scan to search for deletable +TIDs, while considering the possibility of deleting empty pages in +passing. This is in physical/block order, not logical/keyspace order. +The tricky part of this is avoiding missing any deletable tuples in the +presence of concurrent page splits: a page split could easily move some +tuples from a page not yet passed over by the sequential scan to a +lower-numbered page already passed over. + +To implement this, we provide a "vacuum cycle ID" mechanism that makes it +possible to determine whether a page has been split since the current +btbulkdelete cycle started. If btbulkdelete finds a page that has been +split since it started, and has a right-link pointing to a lower page +number, then it temporarily suspends its sequential scan and visits that +page instead. It must continue to follow right-links and vacuum dead +tuples until reaching a page that either hasn't been split since +btbulkdelete started, or is above the location of the outer sequential +scan. Then it can resume the sequential scan. This ensures that all +tuples are visited. It may be that some tuples are visited twice, but +that has no worse effect than an inaccurate index tuple count (and we +can't guarantee an accurate count anyway in the face of concurrent +activity). Note that this still works if the has-been-recently-split test +has a small probability of false positives, so long as it never gives a +false negative. This makes it possible to implement the test with a small +counter value stored on each index page. + +Deleting entire pages during VACUUM +----------------------------------- + +We consider deleting an entire page from the btree only when it's become +completely empty of items. (Merging partly-full pages would allow better +space reuse, but it seems impractical to move existing data items left or +right to make this happen --- a scan moving in the opposite direction +might miss the items if so.) Also, we *never* delete the rightmost page +on a tree level (this restriction simplifies the traversal algorithms, as +explained below). Page deletion always begins from an empty leaf page. An +internal page can only be deleted as part of deleting an entire subtree. +This is always a "skinny" subtree consisting of a "chain" of internal pages +plus a single leaf page. There is one page on each level of the subtree, +and each level/page covers the same key space. + +Deleting a leaf page is a two-stage process. In the first stage, the page +is unlinked from its parent, and marked as half-dead. The parent page must +be found using the same type of search as used to find the parent during an +insertion split. We lock the target and the parent pages, change the +target's downlink to point to the right sibling, and remove its old +downlink. This causes the target page's key space to effectively belong to +its right sibling. 
(Neither the left nor right sibling pages need to +change their "high key" if any; so there is no problem with possibly not +having enough space to replace a high key.) At the same time, we mark the +target page as half-dead, which causes any subsequent searches to ignore it +and move right (or left, in a backwards scan). This leaves the tree in a +similar state as during a page split: the page has no downlink pointing to +it, but it's still linked to its siblings. + +(Note: Lanin and Shasha prefer to make the key space move left, but their +argument for doing so hinges on not having left-links, which we have +anyway. So we simplify the algorithm by moving the key space right. This +is only possible because we don't match on a separator key when ascending +the tree during a page split, unlike Lehman and Yao/Lanin and Shasha -- it +doesn't matter if the downlink is re-found in a pivot tuple whose separator +key does not match the one encountered when inserter initially descended +the tree.) + +To preserve consistency on the parent level, we cannot merge the key space +of a page into its right sibling unless the right sibling is a child of +the same parent --- otherwise, the parent's key space assignment changes +too, meaning we'd have to make bounding-key updates in its parent, and +perhaps all the way up the tree. Since we can't possibly do that +atomically, we forbid this case. That means that the rightmost child of a +parent node can't be deleted unless it's the only remaining child, in which +case we will delete the parent too (see below). + +In the second-stage, the half-dead leaf page is unlinked from its siblings. +We first lock the left sibling (if any) of the target, the target page +itself, and its right sibling (there must be one) in that order. Then we +update the side-links in the siblings, and mark the target page deleted. + +When we're about to delete the last remaining child of a parent page, things +are slightly more complicated. In the first stage, we leave the immediate +parent of the leaf page alone, and remove the downlink to the parent page +instead, from the grandparent. If it's the last child of the grandparent +too, we recurse up until we find a parent with more than one child, and +remove the downlink of that page. The leaf page is marked as half-dead, and +the block number of the page whose downlink was removed is stashed in the +half-dead leaf page. This leaves us with a chain of internal pages, with +one downlink each, leading to the half-dead leaf page, and no downlink +pointing to the topmost page in the chain. + +While we recurse up to find the topmost parent in the chain, we keep the +leaf page locked, but don't need to hold locks on the intermediate pages +between the leaf and the topmost parent -- insertions into upper tree levels +happen only as a result of splits of child pages, and that can't happen as +long as we're keeping the leaf locked. The internal pages in the chain +cannot acquire new children afterwards either, because the leaf page is +marked as half-dead and won't be split. + +Removing the downlink to the top of the to-be-deleted subtree/chain +effectively transfers the key space to the right sibling for all the +intermediate levels too, in one atomic operation. A concurrent search might +still visit the intermediate pages, but it will move right when it reaches +the half-dead page at the leaf level. 
In particular, the search will move to +the subtree to the right of the half-dead leaf page/to-be-deleted subtree, +since the half-dead leaf page's right sibling must be a "cousin" page, not a +"true" sibling page (or a second cousin page when the to-be-deleted chain +starts at leaf page's grandparent page, and so on). + +In the second stage, the topmost page in the chain is unlinked from its +siblings, and the half-dead leaf page is updated to point to the next page +down in the chain. This is repeated until there are no internal pages left +in the chain. Finally, the half-dead leaf page itself is unlinked from its +siblings. + +A deleted page cannot be recycled immediately, since there may be other +processes waiting to reference it (ie, search processes that just left the +parent, or scans moving right or left from one of the siblings). These +processes must be able to observe a deleted page for some time after the +deletion operation, in order to be able to at least recover from it (they +recover by moving right, as with concurrent page splits). Searchers never +have to worry about concurrent page recycling. + +See "Placing deleted pages in the FSM" section below for a description of +when and how deleted pages become safe for VACUUM to make recyclable. + +Page deletion and backwards scans +--------------------------------- + +Moving left in a backward scan is complicated because we must consider +the possibility that the left sibling was just split (meaning we must find +the rightmost page derived from the left sibling), plus the possibility +that the page we were just on has now been deleted and hence isn't in the +sibling chain at all anymore. So the move-left algorithm becomes: + +0. Remember the page we are on as the "original page". +1. Follow the original page's left-link (we're done if this is zero). +2. If the current page is live and its right-link matches the "original + page", we are done. +3. Otherwise, move right one or more times looking for a live page whose + right-link matches the "original page". If found, we are done. (In + principle we could scan all the way to the right end of the index, but + in practice it seems better to give up after a small number of tries. + It's unlikely the original page's sibling split more than a few times + while we were in flight to it; if we do not find a matching link in a + few tries, then most likely the original page is deleted.) +4. Return to the "original page". If it is still live, return to step 1 + (we guessed wrong about it being deleted, and should restart with its + current left-link). If it is dead, move right until a non-dead page + is found (there must be one, since rightmost pages are never deleted), + mark that as the new "original page", and return to step 1. + +This algorithm is correct because the live page found by step 4 will have +the same left keyspace boundary as the page we started from. Therefore, +when we ultimately exit, it must be on a page whose right keyspace +boundary matches the left boundary of where we started --- which is what +we need to be sure we don't miss or re-scan any items. + +Page deletion and tree height +----------------------------- + +Because we never delete the rightmost page of any level (and in particular +never delete the root), it's impossible for the height of the tree to +decrease. After massive deletions we might have a scenario in which the +tree is "skinny", with several single-page levels below the root. 
+Operations will still be correct in this case, but we'd waste cycles +descending through the single-page levels. To handle this we use an idea +from Lanin and Shasha: we keep track of the "fast root" level, which is +the lowest single-page level. The meta-data page keeps a pointer to this +level as well as the true root. All ordinary operations initiate their +searches at the fast root not the true root. When we split a page that is +alone on its level or delete the next-to-last page on a level (both cases +are easily detected), we have to make sure that the fast root pointer is +adjusted appropriately. In the split case, we do this work as part of the +atomic update for the insertion into the parent level; in the delete case +as part of the atomic update for the delete (either way, the metapage has +to be the last page locked in the update to avoid deadlock risks). This +avoids race conditions if two such operations are executing concurrently. + +Placing deleted pages in the FSM +-------------------------------- + +Recycling a page is decoupled from page deletion. A deleted page can only +be put in the FSM to be recycled once there is no possible scan or search +that has a reference to it; until then, it must stay in place with its +sibling links undisturbed, as a tombstone that allows concurrent searches +to detect and then recover from concurrent deletions (which are rather +like concurrent page splits to searchers). This design is an +implementation of what Lanin and Shasha call "the drain technique". + +We implement the technique by waiting until all active snapshots and +registered snapshots as of the page deletion are gone; which is overly +strong, but is simple to implement within Postgres. When marked fully +dead, a deleted page is labeled with the next-transaction counter value. +VACUUM can reclaim the page for re-use when the stored XID is guaranteed +to be "visible to everyone". As collateral damage, we wait for snapshots +taken until the next transaction to allocate an XID commits. We also wait +for running XIDs with no snapshots. + +Prior to PostgreSQL 14, VACUUM would only place _old_ deleted pages that +it encounters during its linear scan (pages deleted by a previous VACUUM +operation) in the FSM. Newly deleted pages were never placed in the FSM, +because that was assumed to _always_ be unsafe. That assumption was +unnecessarily pessimistic in practice, though -- it often doesn't take +very long for newly deleted pages to become safe to place in the FSM. +There is no truly principled way to predict when deleted pages will become +safe to place in the FSM for recycling -- it might become safe almost +immediately (long before the current VACUUM completes), or it might not +even be safe by the time the next VACUUM takes place. Recycle safety is +purely a question of maintaining the consistency (or at least the apparent +consistency) of a physical data structure. The state within the backend +running VACUUM is simply not relevant. + +PostgreSQL 14 added the ability for VACUUM to consider if it's possible to +recycle newly deleted pages at the end of the full index scan where the +page deletion took place. It is convenient to check if it's safe at that +point. This does require that VACUUM keep around a little bookkeeping +information about newly deleted pages, but that's very cheap. 
Using +in-memory state for this avoids the need to revisit newly deleted pages a +second time later on -- we can just use safexid values from the local +bookkeeping state to determine recycle safety in a deferred fashion. + +The need for additional FSM indirection after a page deletion operation +takes place is a natural consequence of the highly permissive rules for +index scans with Lehman and Yao's design. In general an index scan +doesn't have to hold a lock or even a pin on any page when it descends the +tree (nothing that you'd usually think of as an interlock is held "between +levels"). At the same time, index scans cannot be allowed to land on a +truly unrelated page due to concurrent recycling (not to be confused with +concurrent deletion), because that results in wrong answers to queries. +Simpler approaches to page deletion that don't need to defer recycling are +possible, but none seem compatible with Lehman and Yao's design. + +Placing an already-deleted page in the FSM to be recycled when needed +doesn't actually change the state of the page. The page will be changed +whenever it is subsequently taken from the FSM for reuse. The deleted +page's contents will be overwritten by the split operation (it will become +the new right sibling page). + +Fastpath For Index Insertion +---------------------------- + +We optimize for a common case of insertion of increasing index key +values by caching the last page to which this backend inserted the last +value, if this page was the rightmost leaf page. For the next insert, we +can then quickly check if the cached page is still the rightmost leaf +page and also the correct place to hold the current value. We can avoid +the cost of walking down the tree in such common cases. + +The optimization works on the assumption that there can only be one +non-ignorable leaf rightmost page, and so not even a visible-to-everyone +style interlock is required. We cannot fail to detect that our hint was +invalidated, because there can only be one such page in the B-Tree at +any time. It's possible that the page will be deleted and recycled +without a backend's cached page also being detected as invalidated, but +only when we happen to recycle a block that once again gets recycled as the +rightmost leaf page. + +Simple deletion +--------------- + +If a process visits a heap tuple and finds that it's dead and removable +(ie, dead to all open transactions, not only that process), then we can +return to the index and mark the corresponding index entry "known dead", +allowing subsequent index scans to skip visiting the heap tuple. The +"known dead" marking works by setting the index item's lp_flags state +to LP_DEAD. This is currently only done in plain indexscans, not bitmap +scans, because only plain scans visit the heap and index "in sync" and so +there's not a convenient way to do it for bitmap scans. Note also that +LP_DEAD bits are often set when checking a unique index for conflicts on +insert (this is simpler because it takes place when we hold an exclusive +lock on the leaf page). + +Once an index tuple has been marked LP_DEAD it can actually be deleted +from the index immediately; since index scans only stop "between" pages, +no scan can lose its place from such a deletion. We separate the steps +because we allow LP_DEAD to be set with only a share lock (it's exactly +like a hint bit for a heap tuple), but physically removing tuples requires +exclusive lock. 
Also, delaying the deletion often allows us to pick up +extra index tuples that weren't initially safe for index scans to mark +LP_DEAD. We do this with index tuples whose TIDs point to the same table +blocks as an LP_DEAD-marked tuple. They're practically free to check in +passing, and have a pretty good chance of being safe to delete due to +various locality effects. + +We only try to delete LP_DEAD tuples (and nearby tuples) when we are +otherwise faced with having to split a page to do an insertion (and hence +have exclusive lock on it already). Deduplication and bottom-up index +deletion can also prevent a page split, but simple deletion is always our +preferred approach. (Note that posting list tuples can only have their +LP_DEAD bit set when every table TID within the posting list is known +dead. This isn't much of a problem in practice because LP_DEAD bits are +just a starting point for simple deletion -- we still manage to perform +granular deletes of posting list TIDs quite often.) + +It's sufficient to have an exclusive lock on the index page, not a +super-exclusive lock, to do deletion of LP_DEAD items. It might seem +that this breaks the interlock between VACUUM and indexscans, but that is +not so: as long as an indexscanning process has a pin on the page where +the index item used to be, VACUUM cannot complete its btbulkdelete scan +and so cannot remove the heap tuple. This is another reason why +btbulkdelete has to get a super-exclusive lock on every leaf page, not only +the ones where it actually sees items to delete. + +LP_DEAD setting by index scans cannot be sure that a TID whose index tuple +it had planned on LP_DEAD-setting has not been recycled by VACUUM if it +drops its pin in the meantime. It must conservatively also remember the +LSN of the page, and only act to set LP_DEAD bits when the LSN has not +changed at all. (Avoiding dropping the pin entirely also makes it safe, of +course.) + +Bottom-Up deletion +------------------ + +We attempt to delete whatever duplicates happen to be present on the page +when the duplicates are suspected to be caused by version churn from +successive UPDATEs. This only happens when we receive an executor hint +indicating that optimizations like heapam's HOT have not worked out for +the index -- the incoming tuple must be a logically unchanged duplicate +which is needed for MVCC purposes, suggesting that that might well be the +dominant source of new index tuples on the leaf page in question. (Also, +bottom-up deletion is triggered within unique indexes in cases with +continual INSERT and DELETE related churn, since that is easy to detect +without any external hint.) + +Simple deletion will already have failed to prevent a page split when a +bottom-up deletion pass takes place (often because no LP_DEAD bits were +ever set on the page). The two mechanisms have closely related +implementations. The same WAL records are used for each operation, and +the same tableam infrastructure is used to determine what TIDs/tuples are +actually safe to delete. The implementations only differ in how they pick +TIDs to consider for deletion, and whether or not the tableam will give up +before accessing all table blocks (bottom-up deletion lives with the +uncertainty of its success by keeping the cost of failure low). Even +still, the two mechanisms are clearly distinct at the conceptual level. 
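Returning to LP_DEAD setting by scans that have dropped their pin, the LSN rule
described above can be sketched as follows.  The types and the fixed-size
hint-bit array are invented for illustration; this is not the actual nbtree
page representation.

    #include <stdbool.h>
    #include <stdint.h>

    typedef uint64_t PageLSN;

    typedef struct LeafPageState
    {
        PageLSN lsn;             /* advances whenever the page is changed */
        bool    item_dead[256];  /* models the LP_DEAD hint bits */
    } LeafPageState;

    /*
     * Set LP_DEAD on the remembered items only if the page is provably
     * unchanged since the scan read it.  If the LSN moved while we held no
     * pin, a remembered line pointer could already have been removed and
     * recycled for an unrelated tuple, so we must do nothing.
     */
    bool
    set_lp_dead_if_unchanged(LeafPageState *page, PageLSN lsn_when_read,
                             const int *kill_items, int nkill)
    {
        if (page->lsn != lsn_when_read)
            return false;

        for (int i = 0; i < nkill; i++)
            page->item_dead[kill_items[i]] = true;
        return true;
    }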
+ +Bottom-up index deletion is driven entirely by heuristics (whereas simple +deletion is guaranteed to delete at least those index tuples that are +already LP_DEAD marked -- there must be at least one). We have no +certainty that we'll find even one index tuple to delete. That's why we +closely cooperate with the tableam to keep the costs it pays in balance +with the benefits we receive. The interface that we use for this is +described in detail in access/tableam.h. + +Bottom-up index deletion can be thought of as a backstop mechanism against +unnecessary version-driven page splits. It is based in part on an idea +from generational garbage collection: the "generational hypothesis". This +is the empirical observation that "most objects die young". Within +nbtree, new index tuples often quickly appear in the same place, and then +quickly become garbage. There can be intense concentrations of garbage in +relatively few leaf pages with certain workloads (or there could be in +earlier versions of PostgreSQL without bottom-up index deletion, at +least). See doc/src/sgml/btree.sgml for a high-level description of the +design principles behind bottom-up index deletion in nbtree, including +details of how it complements VACUUM. + +We expect to find a reasonably large number of tuples that are safe to +delete within each bottom-up pass. If we don't then we won't need to +consider the question of bottom-up deletion for the same leaf page for +quite a while (usually because the page splits, which resolves the +situation for the time being). We expect to perform regular bottom-up +deletion operations against pages that are at constant risk of unnecessary +page splits caused only by version churn. When the mechanism works well +we'll constantly be "on the verge" of having version-churn-driven page +splits, but never actually have even one. + +Our duplicate heuristics work well despite being fairly simple. +Unnecessary page splits only occur when there are truly pathological +levels of version churn (in theory a small amount of version churn could +make a page split occur earlier than strictly necessary, but that's pretty +harmless). We don't have to understand the underlying workload; we only +have to understand the general nature of the pathology that we target. +Version churn is easy to spot when it is truly pathological. Affected +leaf pages are fairly homogeneous. + +WAL Considerations +------------------ + +The insertion and deletion algorithms in themselves don't guarantee btree +consistency after a crash. To provide robustness, we depend on WAL +replay. A single WAL entry is effectively an atomic action --- we can +redo it from the log if it fails to complete. + +Ordinary item insertions (that don't force a page split) are of course +single WAL entries, since they only affect one page. The same for +leaf-item deletions (if the deletion brings the leaf page to zero items, +it is now a candidate to be deleted, but that is a separate action). + +An insertion that causes a page split is logged as a single WAL entry for +the changes occurring on the insertion's level --- including update of the +right sibling's left-link --- followed by a second WAL entry for the +insertion on the parent level (which might itself be a page split, requiring +an additional insertion above that, etc). + +For a root split, the follow-on WAL entry is a "new root" entry rather than +an "insertion" entry, but details are otherwise much the same. 
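The way a split is broken into separate atomic actions can be sketched with a
toy log.  The record layout and names below are invented for illustration and
are not PostgreSQL's actual WAL format.

    #include <stdio.h>

    typedef enum { WAL_SPLIT, WAL_INSERT_DOWNLINK, WAL_NEW_ROOT } WalRecType;

    typedef struct WalRecord
    {
        WalRecType type;
        unsigned   left_block;   /* original (left) half of the split */
        unsigned   right_block;  /* newly allocated right half */
    } WalRecord;

    static WalRecord wal_log[16];
    static int       wal_len;

    /* Atomic action 1: the split on the insertion's level.  One record covers
     * the new right page, the modified left page, and the update of the old
     * right sibling's left-link. */
    static void
    log_split(unsigned left, unsigned right)
    {
        wal_log[wal_len++] = (WalRecord){ WAL_SPLIT, left, right };
    }

    /* Atomic action 2: the follow-on insertion of the downlink into the parent
     * level, or a "new root" record when the root itself was split.  A crash
     * between the two actions leaves an incomplete split that must be finished
     * later, as described below. */
    static void
    log_parent_insert(unsigned left, unsigned right, int was_root)
    {
        wal_log[wal_len++] = (WalRecord){ was_root ? WAL_NEW_ROOT : WAL_INSERT_DOWNLINK,
                                          left, right };
    }

    int
    main(void)
    {
        log_split(7, 12);
        log_parent_insert(7, 12, 0);
        printf("%d WAL records for one leaf split\n", wal_len);
        return 0;
    }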
+ +Because splitting involves multiple atomic actions, it's possible that the +system crashes between splitting a page and inserting the downlink for the +new half to the parent. After recovery, the downlink for the new page will +be missing. The search algorithm works correctly, as the page will be found +by following the right-link from its left sibling, although if a lot of +downlinks in the tree are missing, performance will suffer. A more serious +consequence is that if the page without a downlink gets split again, the +insertion algorithm will fail to find the location in the parent level to +insert the downlink. + +Our approach is to create any missing downlinks on-the-fly, when searching +the tree for a new insertion. It could be done during searches, too, but +it seems best not to put any extra updates in what would otherwise be a +read-only operation (updating is not possible in hot standby mode anyway). +It would seem natural to add the missing downlinks in VACUUM, but since +inserting a downlink might require splitting a page, it might fail if you +run out of disk space. That would be bad during VACUUM - the reason for +running VACUUM in the first place might be that you run out of disk space, +and now VACUUM won't finish because you're out of disk space. In contrast, +an insertion can require enlarging the physical file anyway. There is one +minor exception: VACUUM finishes interrupted splits of internal pages when +deleting their children. This allows the code for re-finding parent items +to be used by both page splits and page deletion. + +To identify missing downlinks, when a page is split, the left page is +flagged to indicate that the split is not yet complete (INCOMPLETE_SPLIT). +When the downlink is inserted to the parent, the flag is cleared atomically +with the insertion. The child page is kept locked until the insertion in +the parent is finished and the flag in the child cleared, but can be +released immediately after that, before recursing up the tree if the parent +also needs to be split. This ensures that incompletely split pages should +not be seen under normal circumstances; only if insertion to the parent +has failed for some reason. (It's also possible for a reader to observe +a page with the incomplete split flag set during recovery; see later +section on "Scans during Recovery" for details.) + +We flag the left page, even though it's the right page that's missing the +downlink, because it's more convenient to know already when following the +right-link from the left page to the right page that it will need to have +its downlink inserted to the parent. + +When splitting a non-root page that is alone on its level, the required +metapage update (of the "fast root" link) is performed and logged as part +of the insertion into the parent level. When splitting the root page, the +metapage update is handled as part of the "new root" action. + +Each step in page deletion is logged as a separate WAL entry: marking the +leaf as half-dead and removing the downlink is one record, and unlinking a +page is a second record. If vacuum is interrupted for some reason, or the +system crashes, the tree is consistent for searches and insertions. The +next VACUUM will find the half-dead leaf page and continue the deletion. + +Before 9.4, we used to keep track of incomplete splits and page deletions +during recovery and finish them immediately at end of recovery, instead of +doing it lazily at the next insertion or vacuum. 
However, that made the +recovery much more complicated, and only fixed the problem when crash +recovery was performed. An incomplete split can also occur if an otherwise +recoverable error, like out-of-memory or out-of-disk-space, happens while +inserting the downlink to the parent. + +Scans during Recovery +--------------------- + +nbtree indexes support read queries in Hot Standby mode. Every atomic +action/WAL record makes isolated changes that leave the tree in a +consistent state for readers. Readers lock pages according to the same +rules that readers follow on the primary. (Readers may have to move +right to recover from a "concurrent" page split or page deletion, just +like on the primary.) + +However, there are a couple of differences in how pages are locked by +replay/the startup process as compared to the original write operation +on the primary. The exceptions involve page splits and page deletions. +The first phase and second phase of a page split are processed +independently during replay, since they are independent atomic actions. +We do not attempt to recreate the coupling of parent and child page +write locks that took place on the primary. This is safe because readers +never care about the incomplete split flag anyway. Holding on to an +extra write lock on the primary is only necessary so that a second +writer cannot observe the incomplete split flag before the first writer +finishes the split. If we let concurrent writers on the primary observe +an incomplete split flag on the same page, each writer would attempt to +complete the unfinished split, corrupting the parent page. (Similarly, +replay of page deletion records does not hold a write lock on the target +leaf page throughout; only the primary needs to block out concurrent +writers that insert on to the page being deleted.) + +WAL replay holds same-level locks in a way that matches the approach +taken during original execution, though. This prevent readers from +observing same-level inconsistencies. It's probably possible to be more +lax about how same-level locks are acquired during recovery (most kinds +of readers could still move right to recover if we didn't couple +same-level locks), but we prefer to be conservative here. + +During recovery all index scans start with ignore_killed_tuples = false +and we never set kill_prior_tuple. We do this because the oldest xmin +on the standby server can be older than the oldest xmin on the primary +server, which means tuples can be marked LP_DEAD even when they are +still visible on the standby. We don't WAL log tuple LP_DEAD bits, but +they can still appear in the standby because of full page writes. So +we must always ignore them in standby, and that means it's not worth +setting them either. (When LP_DEAD-marked tuples are eventually deleted +on the primary, the deletion is WAL-logged. Queries that run on a +standby therefore get much of the benefit of any LP_DEAD setting that +takes place on the primary.) + +Note that we talk about scans that are started during recovery. We go to +a little trouble to allow a scan to start during recovery and end during +normal running after recovery has completed. This is a key capability +because it allows running applications to continue while the standby +changes state into a normally running server. + +The interlocking required to avoid returning incorrect results from +non-MVCC scans is not required on standby nodes. 
We still get a +super-exclusive lock ("cleanup lock") when replaying VACUUM records +during recovery, but recovery does not need to lock every leaf page +(only those leaf pages that have items to delete). That is safe because +HeapTupleSatisfiesUpdate(), HeapTupleSatisfiesSelf(), +HeapTupleSatisfiesDirty() and HeapTupleSatisfiesVacuum() are only ever +used during write transactions, which cannot exist on the standby. MVCC +scans are already protected by definition, so HeapTupleSatisfiesMVCC() +is not a problem. The optimizer looks at the boundaries of value ranges +using HeapTupleSatisfiesNonVacuumable() with an index-only scan, which +is also safe. That leaves concern only for HeapTupleSatisfiesToast(). + +HeapTupleSatisfiesToast() doesn't use MVCC semantics, though that's +because it doesn't need to - if the main heap row is visible then the +toast rows will also be visible. So as long as we follow a toast +pointer from a visible (live) tuple the corresponding toast rows +will also be visible, so we do not need to recheck MVCC on them. + +Other Things That Are Handy to Know +----------------------------------- + +Page zero of every btree is a meta-data page. This page stores the +location of the root page --- both the true root and the current effective +root ("fast" root). To avoid fetching the metapage for every single index +search, we cache a copy of the meta-data information in the index's +relcache entry (rd_amcache). This is a bit ticklish since using the cache +implies following a root page pointer that could be stale. However, a +backend following a cached pointer can sufficiently verify whether it +reached the intended page; either by checking the is-root flag when it +is going to the true root, or by checking that the page has no siblings +when going to the fast root. At worst, this could result in descending +some extra tree levels if we have a cached pointer to a fast root that is +now above the real fast root. Such cases shouldn't arise often enough to +be worth optimizing; and in any case we can expect a relcache flush will +discard the cached metapage before long, since a VACUUM that's moved the +fast root pointer can be expected to issue a statistics update for the +index. + +The algorithm assumes we can fit at least three items per page +(a "high key" and two real data items). Therefore it's unsafe +to accept items larger than 1/3rd page size. Larger items would +work sometimes, but could cause failures later on depending on +what else gets put on their page. + +"ScanKey" data structures are used in two fundamentally different ways +in this code, which we describe as "search" scankeys and "insertion" +scankeys. A search scankey is the kind passed to btbeginscan() or +btrescan() from outside the btree code. The sk_func pointers in a search +scankey point to comparison functions that return boolean, such as int4lt. +There might be more than one scankey entry for a given index column, or +none at all. (We require the keys to appear in index column order, but +the order of multiple keys for a given column is unspecified.) An +insertion scankey ("BTScanInsert" data structure) uses a similar +array-of-ScanKey data structure, but the sk_func pointers point to btree +comparison support functions (ie, 3-way comparators that return int4 values +interpreted as <0, =0, >0). In an insertion scankey there is at most one +entry per index column. There is also other data about the rules used to +locate where to begin the scan, such as whether or not the scan is a +"nextkey" scan. 
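To illustrate how a 3-way comparator plus the "nextkey" flag pick the starting
position, here is a self-contained sketch over a sorted array of integers.  The
real code positions itself among index tuples using per-attribute support
functions, but the positioning rule is the same.

    #include <stdio.h>

    static int
    cmp_int(int a, int b)
    {
        return (a > b) - (a < b);   /* 3-way comparator: <0, 0, >0 */
    }

    /*
     * Return the first array index whose key is >= the scankey (nextkey = 0),
     * or > the scankey (nextkey = 1).
     */
    static int
    first_position(const int *keys, int nkeys, int scankey, int nextkey)
    {
        int lo = 0, hi = nkeys;

        while (lo < hi)
        {
            int mid = lo + (hi - lo) / 2;
            int c = cmp_int(keys[mid], scankey);

            /* nextkey semantics: treat "equal" as "still too small" */
            if (c < 0 || (nextkey && c == 0))
                lo = mid + 1;
            else
                hi = mid;
        }
        return lo;
    }

    int
    main(void)
    {
        int keys[] = {10, 20, 20, 20, 30};

        printf("%d %d\n",
               first_position(keys, 5, 20, 0),    /* 1: first key >= 20 */
               first_position(keys, 5, 20, 1));   /* 4: first key > 20 */
        return 0;
    }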
Insertion scankeys are built within the btree code (eg, by +_bt_mkscankey()) and are used to locate the starting point of a scan, as +well as for locating the place to insert a new index tuple. (Note: in the +case of an insertion scankey built from a search scankey or built from a +truncated pivot tuple, there might be fewer keys than index columns, +indicating that we have no constraints for the remaining index columns.) +After we have located the starting point of a scan, the original search +scankey is consulted as each index entry is sequentially scanned to decide +whether to return the entry and whether the scan can stop (see +_bt_checkkeys()). + +Notes about suffix truncation +----------------------------- + +We truncate away suffix key attributes that are not needed for a page high +key during a leaf page split. The remaining attributes must distinguish +the last index tuple on the post-split left page as belonging on the left +page, and the first index tuple on the post-split right page as belonging +on the right page. Tuples logically retain truncated key attributes, +though they implicitly have "negative infinity" as their value, and have no +storage overhead. Since the high key is subsequently reused as the +downlink in the parent page for the new right page, suffix truncation makes +pivot tuples short. INCLUDE indexes are guaranteed to have non-key +attributes truncated at the time of a leaf page split, but may also have +some key attributes truncated away, based on the usual criteria for key +attributes. They are not a special case, since non-key attributes are +merely payload to B-Tree searches. + +The goal of suffix truncation of key attributes is to improve index +fan-out. The technique was first described by Bayer and Unterauer (R.Bayer +and K.Unterauer, Prefix B-Trees, ACM Transactions on Database Systems, Vol +2, No. 1, March 1977, pp 11-26). The Postgres implementation is loosely +based on their paper. Note that Postgres only implements what the paper +refers to as simple prefix B-Trees. Note also that the paper assumes that +the tree has keys that consist of single strings that maintain the "prefix +property", much like strings that are stored in a suffix tree (comparisons +of earlier bytes must always be more significant than comparisons of later +bytes, and, in general, the strings must compare in a way that doesn't +break transitive consistency as they're split into pieces). Suffix +truncation in Postgres currently only works at the whole-attribute +granularity, but it would be straightforward to invent opclass +infrastructure that manufactures a smaller attribute value in the case of +variable-length types, such as text. An opclass support function could +manufacture the shortest possible key value that still correctly separates +each half of a leaf page split. + +There is sophisticated criteria for choosing a leaf page split point. The +general idea is to make suffix truncation effective without unduly +influencing the balance of space for each half of the page split. The +choice of leaf split point can be thought of as a choice among points +*between* items on the page to be split, at least if you pretend that the +incoming tuple was placed on the page already (you have to pretend because +there won't actually be enough space for it on the page). Choosing the +split point between two index tuples where the first non-equal attribute +appears as early as possible results in truncating away as many suffix +attributes as possible. 
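As a sketch of the whole-attribute truncation rule just described, with
attribute values modeled as plain ints (the real code compares datums through
opclass support functions and also handles the heap TID tiebreaker):

    /*
     * Return how many leading attributes the new pivot tuple (high key) must
     * keep in order to distinguish the last tuple on the post-split left page
     * from the first tuple on the post-split right page.
     */
    int
    pivot_attrs_to_keep(const int *lastleft, const int *firstright, int natts)
    {
        for (int i = 0; i < natts; i++)
        {
            if (lastleft[i] != firstright[i])
                return i + 1;   /* keep through the first distinguishing attribute */
        }

        /*
         * All user attributes are equal; the heap TID tiebreaker attribute must
         * be kept as the distinguishing attribute instead.
         */
        return natts + 1;
    }

Everything beyond the returned attribute count is truncated away, and the
truncated attributes behave as "minus infinity" in later comparisons.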
Evenly balancing space among each half of the +split is usually the first concern, but even small adjustments in the +precise split point can allow truncation to be far more effective. + +Suffix truncation is primarily valuable because it makes pivot tuples +smaller, which delays splits of internal pages, but that isn't the only +reason why it's effective. Even truncation that doesn't make pivot tuples +smaller due to alignment still prevents pivot tuples from being more +restrictive than truly necessary in how they describe which values belong +on which pages. + +While it's not possible to correctly perform suffix truncation during +internal page splits, it's still useful to be discriminating when splitting +an internal page. The split point that implies a downlink be inserted in +the parent that's the smallest one available within an acceptable range of +the fillfactor-wise optimal split point is chosen. This idea also comes +from the Prefix B-Tree paper. This process has much in common with what +happens at the leaf level to make suffix truncation effective. The overall +effect is that suffix truncation tends to produce smaller, more +discriminating pivot tuples, especially early in the lifetime of the index, +while biasing internal page splits makes the earlier, smaller pivot tuples +end up in the root page, delaying root page splits. + +Logical duplicates are given special consideration. The logic for +selecting a split point goes to great lengths to avoid having duplicates +span more than one page, and almost always manages to pick a split point +between two user-key-distinct tuples, accepting a completely lopsided split +if it must. When a page that's already full of duplicates must be split, +the fallback strategy assumes that duplicates are mostly inserted in +ascending heap TID order. The page is split in a way that leaves the left +half of the page mostly full, and the right half of the page mostly empty. +The overall effect is that leaf page splits gracefully adapt to inserts of +large groups of duplicates, maximizing space utilization. Note also that +"trapping" large groups of duplicates on the same leaf page like this makes +deduplication more efficient. Deduplication can be performed infrequently, +without merging together existing posting list tuples too often. + +Notes about deduplication +------------------------- + +We deduplicate non-pivot tuples in non-unique indexes to reduce storage +overhead, and to avoid (or at least delay) page splits. Note that the +goals for deduplication in unique indexes are rather different; see later +section for details. Deduplication alters the physical representation of +tuples without changing the logical contents of the index, and without +adding overhead to read queries. Non-pivot tuples are merged together +into a single physical tuple with a posting list (a simple array of heap +TIDs with the standard item pointer format). Deduplication is always +applied lazily, at the point where it would otherwise be necessary to +perform a page split. It occurs only when LP_DEAD items have been +removed, as our last line of defense against splitting a leaf page +(bottom-up index deletion may be attempted first, as our second last line +of defense). We can set the LP_DEAD bit with posting list tuples, though +only when all TIDs are known dead. + +Our lazy approach to deduplication allows the page space accounting used +during page splits to have absolutely minimal special case logic for +posting lists. 
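The physical transformation performed by a deduplication pass can be sketched
like this; the types are invented for illustration (real posting lists store
ItemPointerData inside ordinary index tuples and are subject to page space
limits).

    #include <stdio.h>

    typedef struct HeapTid { unsigned block; unsigned short offset; } HeapTid;

    #define MAX_POSTING_TIDS 32

    typedef struct PostingTuple
    {
        int     key;                       /* the duplicated key value */
        int     ntids;                     /* number of heap TIDs merged */
        HeapTid tids[MAX_POSTING_TIDS];    /* kept in ascending TID order */
    } PostingTuple;

    /* Merge duplicates of 'key' into a single posting tuple; the input TIDs are
     * assumed to already be in ascending TID order. */
    static PostingTuple
    deduplicate(int key, const HeapTid *tids, int ntids)
    {
        PostingTuple post = { .key = key, .ntids = 0 };

        for (int i = 0; i < ntids && i < MAX_POSTING_TIDS; i++)
            post.tids[post.ntids++] = tids[i];
        return post;
    }

    int
    main(void)
    {
        HeapTid dups[] = { {100, 3}, {100, 7}, {102, 1} };
        PostingTuple t = deduplicate(42, dups, 3);

        printf("key %d holds %d TIDs\n", t.key, t.ntids);
        return 0;
    }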
Posting lists can be thought of as extra payload that +suffix truncation will reliably truncate away as needed during page +splits, just like non-key columns from an INCLUDE index tuple. +Incoming/new tuples can generally be treated as non-overlapping plain +items (though see section on posting list splits for information about how +overlapping new/incoming items are really handled). + +The representation of posting lists is almost identical to the posting +lists used by GIN, so it would be straightforward to apply GIN's varbyte +encoding compression scheme to individual posting lists. Posting list +compression would break the assumptions made by posting list splits about +page space accounting (see later section), so it's not clear how +compression could be integrated with nbtree. Besides, posting list +compression does not offer a compelling trade-off for nbtree, since in +general nbtree is optimized for consistent performance with many +concurrent readers and writers. Compression would also make the deletion +of a subset of TIDs from a posting list slow and complicated, which would +be a big problem for workloads that depend heavily on bottom-up index +deletion. + +A major goal of our lazy approach to deduplication is to limit the +performance impact of deduplication with random updates. Even concurrent +append-only inserts of the same key value will tend to have inserts of +individual index tuples in an order that doesn't quite match heap TID +order. Delaying deduplication minimizes page level fragmentation. + +Deduplication in unique indexes +------------------------------- + +Very often, the number of distinct values that can ever be placed on +almost any given leaf page in a unique index is fixed and permanent. For +example, a primary key on an identity column will usually only have leaf +page splits caused by the insertion of new logical rows within the +rightmost leaf page. If there is a split of a non-rightmost leaf page, +then the split must have been triggered by inserts associated with UPDATEs +of existing logical rows. Splitting a leaf page purely to store multiple +versions is a false economy. In effect, we're permanently degrading the +index structure just to absorb a temporary burst of duplicates. + +Deduplication in unique indexes helps to prevent these pathological page +splits. Storing duplicates in a space efficient manner is not the goal, +since in the long run there won't be any duplicates anyway. Rather, we're +buying time for standard garbage collection mechanisms to run before a +page split is needed. + +Unique index leaf pages only get a deduplication pass when an insertion +(that might have to split the page) observed an existing duplicate on the +page in passing. This is based on the assumption that deduplication will +only work out when _all_ new insertions are duplicates from UPDATEs. This +may mean that we miss an opportunity to delay a page split, but that's +okay because our ultimate goal is to delay leaf page splits _indefinitely_ +(i.e. to prevent them altogether). There is little point in trying to +delay a split that is probably inevitable anyway. This allows us to avoid +the overhead of attempting to deduplicate with unique indexes that always +have few or no duplicates. + +Note: Avoiding "unnecessary" page splits driven by version churn is also +the goal of bottom-up index deletion, which was added to PostgreSQL 14. 
+Bottom-up index deletion is now the preferred way to deal with this +problem (with all kinds of indexes, though especially with unique +indexes). Still, deduplication can sometimes augment bottom-up index +deletion. When deletion cannot free tuples (due to an old snapshot +holding up cleanup), falling back on deduplication provides additional +capacity. Delaying the page split by deduplicating can allow a future +bottom-up deletion pass of the same page to succeed. + +Posting list splits +------------------- + +When the incoming tuple happens to overlap with an existing posting list, +a posting list split is performed. Like a page split, a posting list +split resolves a situation where a new/incoming item "won't fit", while +inserting the incoming item in passing (i.e. as part of the same atomic +action). It's possible (though not particularly likely) that an insert of +a new item on to an almost-full page will overlap with a posting list, +resulting in both a posting list split and a page split. Even then, the +atomic action that splits the posting list also inserts the new item +(since page splits always insert the new item in passing). Including the +posting list split in the same atomic action as the insert avoids problems +caused by concurrent inserts into the same posting list -- the exact +details of how we change the posting list depend upon the new item, and +vice-versa. A single atomic action also minimizes the volume of extra +WAL required for a posting list split, since we don't have to explicitly +WAL-log the original posting list tuple. + +Despite piggy-backing on the same atomic action that inserts a new tuple, +posting list splits can be thought of as a separate, extra action to the +insert itself (or to the page split itself). Posting list splits +conceptually "rewrite" an insert that overlaps with an existing posting +list into an insert that adds its final new item just to the right of the +posting list instead. The size of the posting list won't change, and so +page space accounting code does not need to care about posting list splits +at all. This is an important upside of our design; the page split point +choice logic is very subtle even without it needing to deal with posting +list splits. + +Only a few isolated extra steps are required to preserve the illusion that +the new item never overlapped with an existing posting list in the first +place: the heap TID of the incoming tuple has its TID replaced with the +rightmost/max heap TID from the existing/originally overlapping posting +list. Similarly, the original incoming item's TID is relocated to the +appropriate offset in the posting list (we usually shift TIDs out of the +way to make a hole for it). Finally, the posting-split-with-page-split +case must generate a new high key based on an imaginary version of the +original page that has both the final new item and the after-list-split +posting tuple (page splits usually just operate against an imaginary +version that contains the new item/item that won't fit). + +This approach avoids inventing an "eager" atomic posting split operation +that splits the posting list without simultaneously finishing the insert +of the incoming item. This alternative design might seem cleaner, but it +creates subtle problems for page space accounting. 
In general, there +might not be enough free space on the page to split a posting list such +that the incoming/new item no longer overlaps with either posting list +half --- the operation could fail before the actual retail insert of the +new item even begins. We'd end up having to handle posting list splits +that need a page split anyway. Besides, supporting variable "split points" +while splitting posting lists won't actually improve overall space +utilization. + +Notes About Data Representation +------------------------------- + +The right-sibling link required by L&Y is kept in the page "opaque +data" area, as is the left-sibling link, the page level, and some flags. +The page level counts upwards from zero at the leaf level, to the tree +depth minus 1 at the root. (Counting up from the leaves ensures that we +don't need to renumber any existing pages when splitting the root.) + +The Postgres disk block data format (an array of items) doesn't fit +Lehman and Yao's alternating-keys-and-pointers notion of a disk page, +so we have to play some games. (The alternating-keys-and-pointers +notion is important for internal page splits, which conceptually split +at the middle of an existing pivot tuple -- the tuple's "separator" key +goes on the left side of the split as the left side's new high key, +while the tuple's pointer/downlink goes on the right side as the +first/minus infinity downlink.) + +On a page that is not rightmost in its tree level, the "high key" is +kept in the page's first item, and real data items start at item 2. +The link portion of the "high key" item goes unused. A page that is +rightmost has no "high key" (it's implicitly positive infinity), so +data items start with the first item. Putting the high key at the +left, rather than the right, may seem odd, but it avoids moving the +high key as we add data items. + +On a leaf page, the data items are simply links to (TIDs of) tuples +in the relation being indexed, with the associated key values. + +On a non-leaf page, the data items are down-links to child pages with +bounding keys. The key in each data item is a strict lower bound for +keys on that child page, so logically the key is to the left of that +downlink. The high key (if present) is the upper bound for the last +downlink. The first data item on each such page has no lower bound +--- or lower bound of minus infinity, if you prefer. The comparison +routines must treat it accordingly. The actual key stored in the +item is irrelevant, and need not be stored at all. This arrangement +corresponds to the fact that an L&Y non-leaf page has one more pointer +than key. Suffix truncation's negative infinity attributes behave in +the same way. diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c new file mode 100644 index 0000000..7ac73cb --- /dev/null +++ b/src/backend/access/nbtree/nbtcompare.c @@ -0,0 +1,335 @@ +/*------------------------------------------------------------------------- + * + * nbtcompare.c + * Comparison functions for btree access method. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtcompare.c + * + * NOTES + * + * These functions are stored in pg_amproc. For each operator class + * defined on btrees, they compute + * + * compare(a, b): + * < 0 if a < b, + * = 0 if a == b, + * > 0 if a > b. 
+ * + * The result is always an int32 regardless of the input datatype. + * + * Although any negative int32 is acceptable for reporting "<", + * and any positive int32 is acceptable for reporting ">", routines + * that work on 32-bit or wider datatypes can't just return "a - b". + * That could overflow and give the wrong answer. + * + * NOTE: it is critical that the comparison function impose a total order + * on all non-NULL values of the data type, and that the datatype's + * boolean comparison operators (= < >= etc) yield results consistent + * with the comparison routine. Otherwise bad behavior may ensue. + * (For example, the comparison operators must NOT punt when faced with + * NAN or other funny values; you must devise some collation sequence for + * all such values.) If the datatype is not trivial, this is most + * reliably done by having the boolean operators invoke the same + * three-way comparison code that the btree function does. Therefore, + * this file contains only btree support for "trivial" datatypes --- + * all others are in the /utils/adt/ files that implement their datatypes. + * + * NOTE: these routines must not leak memory, since memory allocated + * during an index access won't be recovered till end of query. This + * primarily affects comparison routines for toastable datatypes; + * they have to be careful to free any detoasted copy of an input datum. + * + * NOTE: we used to forbid comparison functions from returning INT_MIN, + * but that proves to be too error-prone because some platforms' versions + * of memcmp() etc can return INT_MIN. As a means of stress-testing + * callers, this file can be compiled with STRESS_SORT_INT_MIN defined + * to cause many of these functions to return INT_MIN or INT_MAX instead of + * their customary -1/+1. For production, though, that's not a good idea + * since users or third-party code might expect the traditional results. 
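+ *
+ * As an editorial illustration of the total-order requirement above (this
+ * sketch is not part of the original file, and the function name is
+ * hypothetical), a three-way comparator for C doubles could impose a total
+ * order by sorting NaN after every non-NaN value and treating all NaNs as
+ * equal to one another, using isnan() from <math.h>; any comparator kept in
+ * the /utils/adt/ files for a float-like type has to pick some such
+ * convention for NaN:
+ *
+ *     static int
+ *     example_total_order_double_cmp(double a, double b)
+ *     {
+ *         if (isnan(a))
+ *             return isnan(b) ? 0 : 1;
+ *         if (isnan(b))
+ *             return -1;
+ *         if (a < b)
+ *             return -1;
+ *         if (a > b)
+ *             return 1;
+ *         return 0;
+ *     }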
+ *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <limits.h> + +#include "utils/builtins.h" +#include "utils/sortsupport.h" + +#ifdef STRESS_SORT_INT_MIN +#define A_LESS_THAN_B INT_MIN +#define A_GREATER_THAN_B INT_MAX +#else +#define A_LESS_THAN_B (-1) +#define A_GREATER_THAN_B 1 +#endif + + +Datum +btboolcmp(PG_FUNCTION_ARGS) +{ + bool a = PG_GETARG_BOOL(0); + bool b = PG_GETARG_BOOL(1); + + PG_RETURN_INT32((int32) a - (int32) b); +} + +Datum +btint2cmp(PG_FUNCTION_ARGS) +{ + int16 a = PG_GETARG_INT16(0); + int16 b = PG_GETARG_INT16(1); + + PG_RETURN_INT32((int32) a - (int32) b); +} + +static int +btint2fastcmp(Datum x, Datum y, SortSupport ssup) +{ + int16 a = DatumGetInt16(x); + int16 b = DatumGetInt16(y); + + return (int) a - (int) b; +} + +Datum +btint2sortsupport(PG_FUNCTION_ARGS) +{ + SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); + + ssup->comparator = btint2fastcmp; + PG_RETURN_VOID(); +} + +Datum +btint4cmp(PG_FUNCTION_ARGS) +{ + int32 a = PG_GETARG_INT32(0); + int32 b = PG_GETARG_INT32(1); + + if (a > b) + PG_RETURN_INT32(A_GREATER_THAN_B); + else if (a == b) + PG_RETURN_INT32(0); + else + PG_RETURN_INT32(A_LESS_THAN_B); +} + +static int +btint4fastcmp(Datum x, Datum y, SortSupport ssup) +{ + int32 a = DatumGetInt32(x); + int32 b = DatumGetInt32(y); + + if (a > b) + return A_GREATER_THAN_B; + else if (a == b) + return 0; + else + return A_LESS_THAN_B; +} + +Datum +btint4sortsupport(PG_FUNCTION_ARGS) +{ + SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); + + ssup->comparator = btint4fastcmp; + PG_RETURN_VOID(); +} + +Datum +btint8cmp(PG_FUNCTION_ARGS) +{ + int64 a = PG_GETARG_INT64(0); + int64 b = PG_GETARG_INT64(1); + + if (a > b) + PG_RETURN_INT32(A_GREATER_THAN_B); + else if (a == b) + PG_RETURN_INT32(0); + else + PG_RETURN_INT32(A_LESS_THAN_B); +} + +static int +btint8fastcmp(Datum x, Datum y, SortSupport ssup) +{ + int64 a = DatumGetInt64(x); + int64 b = DatumGetInt64(y); + + if (a > b) + return A_GREATER_THAN_B; + else if (a == b) + return 0; + else + return A_LESS_THAN_B; +} + +Datum +btint8sortsupport(PG_FUNCTION_ARGS) +{ + SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); + + ssup->comparator = btint8fastcmp; + PG_RETURN_VOID(); +} + +Datum +btint48cmp(PG_FUNCTION_ARGS) +{ + int32 a = PG_GETARG_INT32(0); + int64 b = PG_GETARG_INT64(1); + + if (a > b) + PG_RETURN_INT32(A_GREATER_THAN_B); + else if (a == b) + PG_RETURN_INT32(0); + else + PG_RETURN_INT32(A_LESS_THAN_B); +} + +Datum +btint84cmp(PG_FUNCTION_ARGS) +{ + int64 a = PG_GETARG_INT64(0); + int32 b = PG_GETARG_INT32(1); + + if (a > b) + PG_RETURN_INT32(A_GREATER_THAN_B); + else if (a == b) + PG_RETURN_INT32(0); + else + PG_RETURN_INT32(A_LESS_THAN_B); +} + +Datum +btint24cmp(PG_FUNCTION_ARGS) +{ + int16 a = PG_GETARG_INT16(0); + int32 b = PG_GETARG_INT32(1); + + if (a > b) + PG_RETURN_INT32(A_GREATER_THAN_B); + else if (a == b) + PG_RETURN_INT32(0); + else + PG_RETURN_INT32(A_LESS_THAN_B); +} + +Datum +btint42cmp(PG_FUNCTION_ARGS) +{ + int32 a = PG_GETARG_INT32(0); + int16 b = PG_GETARG_INT16(1); + + if (a > b) + PG_RETURN_INT32(A_GREATER_THAN_B); + else if (a == b) + PG_RETURN_INT32(0); + else + PG_RETURN_INT32(A_LESS_THAN_B); +} + +Datum +btint28cmp(PG_FUNCTION_ARGS) +{ + int16 a = PG_GETARG_INT16(0); + int64 b = PG_GETARG_INT64(1); + + if (a > b) + PG_RETURN_INT32(A_GREATER_THAN_B); + else if (a == b) + PG_RETURN_INT32(0); + else + PG_RETURN_INT32(A_LESS_THAN_B); +} + +Datum +btint82cmp(PG_FUNCTION_ARGS) +{ + int64 a = 
PG_GETARG_INT64(0); + int16 b = PG_GETARG_INT16(1); + + if (a > b) + PG_RETURN_INT32(A_GREATER_THAN_B); + else if (a == b) + PG_RETURN_INT32(0); + else + PG_RETURN_INT32(A_LESS_THAN_B); +} + +Datum +btoidcmp(PG_FUNCTION_ARGS) +{ + Oid a = PG_GETARG_OID(0); + Oid b = PG_GETARG_OID(1); + + if (a > b) + PG_RETURN_INT32(A_GREATER_THAN_B); + else if (a == b) + PG_RETURN_INT32(0); + else + PG_RETURN_INT32(A_LESS_THAN_B); +} + +static int +btoidfastcmp(Datum x, Datum y, SortSupport ssup) +{ + Oid a = DatumGetObjectId(x); + Oid b = DatumGetObjectId(y); + + if (a > b) + return A_GREATER_THAN_B; + else if (a == b) + return 0; + else + return A_LESS_THAN_B; +} + +Datum +btoidsortsupport(PG_FUNCTION_ARGS) +{ + SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); + + ssup->comparator = btoidfastcmp; + PG_RETURN_VOID(); +} + +Datum +btoidvectorcmp(PG_FUNCTION_ARGS) +{ + oidvector *a = (oidvector *) PG_GETARG_POINTER(0); + oidvector *b = (oidvector *) PG_GETARG_POINTER(1); + int i; + + /* We arbitrarily choose to sort first by vector length */ + if (a->dim1 != b->dim1) + PG_RETURN_INT32(a->dim1 - b->dim1); + + for (i = 0; i < a->dim1; i++) + { + if (a->values[i] != b->values[i]) + { + if (a->values[i] > b->values[i]) + PG_RETURN_INT32(A_GREATER_THAN_B); + else + PG_RETURN_INT32(A_LESS_THAN_B); + } + } + PG_RETURN_INT32(0); +} + +Datum +btcharcmp(PG_FUNCTION_ARGS) +{ + char a = PG_GETARG_CHAR(0); + char b = PG_GETARG_CHAR(1); + + /* Be careful to compare chars as unsigned */ + PG_RETURN_INT32((int32) ((uint8) a) - (int32) ((uint8) b)); +} diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c new file mode 100644 index 0000000..1cd1b59 --- /dev/null +++ b/src/backend/access/nbtree/nbtdedup.c @@ -0,0 +1,1098 @@ +/*------------------------------------------------------------------------- + * + * nbtdedup.c + * Deduplicate or bottom-up delete items in Postgres btrees. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtdedup.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/nbtxlog.h" +#include "miscadmin.h" +#include "utils/rel.h" + +static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state, + TM_IndexDeleteOp *delstate); +static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state, + OffsetNumber minoff, IndexTuple newitem); +static void _bt_singleval_fillfactor(Page page, BTDedupState state, + Size newitemsz); +#ifdef USE_ASSERT_CHECKING +static bool _bt_posting_valid(IndexTuple posting); +#endif + +/* + * Perform a deduplication pass. + * + * The general approach taken here is to perform as much deduplication as + * possible to free as much space as possible. Note, however, that "single + * value" strategy is used for !bottomupdedup callers when the page is full of + * tuples of a single value. Deduplication passes that apply the strategy + * will leave behind a few untouched tuples at the end of the page, preparing + * the page for an anticipated page split that uses nbtsplitloc.c's own single + * value strategy. Our high level goal is to delay merging the untouched + * tuples until after the page splits. + * + * When a call to _bt_bottomupdel_pass() just took place (and failed), our + * high level goal is to prevent a page split entirely by buying more time. 
+ * We still hope that a page split can be avoided altogether. That's why + * single value strategy is not even considered for bottomupdedup callers. + * + * The page will have to be split if we cannot successfully free at least + * newitemsz (we also need space for newitem's line pointer, which isn't + * included in caller's newitemsz). + * + * Note: Caller should have already deleted all existing items with their + * LP_DEAD bits set. + */ +void +_bt_dedup_pass(Relation rel, Buffer buf, Relation heapRel, IndexTuple newitem, + Size newitemsz, bool bottomupdedup) +{ + OffsetNumber offnum, + minoff, + maxoff; + Page page = BufferGetPage(buf); + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + Page newpage; + BTDedupState state; + Size pagesaving PG_USED_FOR_ASSERTS_ONLY = 0; + bool singlevalstrat = false; + int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + + /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ + newitemsz += sizeof(ItemIdData); + + /* + * Initialize deduplication state. + * + * It would be possible for maxpostingsize (limit on posting list tuple + * size) to be set to one third of the page. However, it seems like a + * good idea to limit the size of posting lists to one sixth of a page. + * That ought to leave us with a good split point when pages full of + * duplicates can be split several times. + */ + state = (BTDedupState) palloc(sizeof(BTDedupStateData)); + state->deduplicate = true; + state->nmaxitems = 0; + state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK); + /* Metadata about base tuple of current pending posting list */ + state->base = NULL; + state->baseoff = InvalidOffsetNumber; + state->basetupsize = 0; + /* Metadata about current pending posting list TIDs */ + state->htids = palloc(state->maxpostingsize); + state->nhtids = 0; + state->nitems = 0; + /* Size of all physical tuples to be replaced by pending posting list */ + state->phystupsize = 0; + /* nintervals should be initialized to zero */ + state->nintervals = 0; + + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + + /* + * Consider applying "single value" strategy, though only if the page + * seems likely to be split in the near future + */ + if (!bottomupdedup) + singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem); + + /* + * Deduplicate items from page, and write them to newpage. + * + * Copy the original page's LSN into newpage copy. This will become the + * updated version of the page. We need this because XLogInsert will + * examine the LSN and possibly dump it in a page image. 
+ */ + newpage = PageGetTempPageCopySpecial(page); + PageSetLSN(newpage, PageGetLSN(page)); + + /* Copy high key, if any */ + if (!P_RIGHTMOST(opaque)) + { + ItemId hitemid = PageGetItemId(page, P_HIKEY); + Size hitemsz = ItemIdGetLength(hitemid); + IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid); + + if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY, + false, false) == InvalidOffsetNumber) + elog(ERROR, "deduplication failed to add highkey"); + } + + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + + Assert(!ItemIdIsDead(itemid)); + + if (offnum == minoff) + { + /* + * No previous/base tuple for the data item -- use the data item + * as base tuple of pending posting list + */ + _bt_dedup_start_pending(state, itup, offnum); + } + else if (state->deduplicate && + _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && + _bt_dedup_save_htid(state, itup)) + { + /* + * Tuple is equal to base tuple of pending posting list. Heap + * TID(s) for itup have been saved in state. + */ + } + else + { + /* + * Tuple is not equal to pending posting list tuple, or + * _bt_dedup_save_htid() opted to not merge current item into + * pending posting list for some other reason (e.g., adding more + * TIDs would have caused posting list to exceed current + * maxpostingsize). + * + * If state contains pending posting list with more than one item, + * form new posting tuple, and actually update the page. Else + * reset the state and move on without modifying the page. + */ + pagesaving += _bt_dedup_finish_pending(newpage, state); + + if (singlevalstrat) + { + /* + * Single value strategy's extra steps. + * + * Lower maxpostingsize for sixth and final large posting list + * tuple at the point where 5 maxpostingsize-capped tuples + * have either been formed or observed. + * + * When a sixth maxpostingsize-capped item is formed/observed, + * stop merging together tuples altogether. The few tuples + * that remain at the end of the page won't be merged together + * at all (at least not until after a future page split takes + * place). + */ + if (state->nmaxitems == 5) + _bt_singleval_fillfactor(page, state, newitemsz); + else if (state->nmaxitems == 6) + { + state->deduplicate = false; + singlevalstrat = false; /* won't be back here */ + } + } + + /* itup starts new pending posting list */ + _bt_dedup_start_pending(state, itup, offnum); + } + } + + /* Handle the last item */ + pagesaving += _bt_dedup_finish_pending(newpage, state); + + /* + * If no items suitable for deduplication were found, newpage must be + * exactly the same as the original page, so just return from function. + * + * We could determine whether or not to proceed on the basis the space + * savings being sufficient to avoid an immediate page split instead. We + * don't do that because there is some small value in nbtsplitloc.c always + * operating against a page that is fully deduplicated (apart from + * newitem). Besides, most of the cost has already been paid. + */ + if (state->nintervals == 0) + { + /* cannot leak memory here */ + pfree(newpage); + pfree(state->htids); + pfree(state); + return; + } + + /* + * By here, it's clear that deduplication will definitely go ahead. + * + * Clear the BTP_HAS_GARBAGE page flag. The index must be a heapkeyspace + * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway. + * But keep things tidy. 
+ */ + if (P_HAS_GARBAGE(opaque)) + { + BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(newpage); + + nopaque->btpo_flags &= ~BTP_HAS_GARBAGE; + } + + START_CRIT_SECTION(); + + PageRestoreTempPage(newpage, page); + MarkBufferDirty(buf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + xl_btree_dedup xlrec_dedup; + + xlrec_dedup.nintervals = state->nintervals; + + XLogBeginInsert(); + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup); + + /* + * The intervals array is not in the buffer, but pretend that it is. + * When XLogInsert stores the whole buffer, the array need not be + * stored too. + */ + XLogRegisterBufData(0, (char *) state->intervals, + state->nintervals * sizeof(BTDedupInterval)); + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + /* Local space accounting should agree with page accounting */ + Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz); + + /* cannot leak memory here */ + pfree(state->htids); + pfree(state); +} + +/* + * Perform bottom-up index deletion pass. + * + * See if duplicate index tuples (plus certain nearby tuples) are eligible to + * be deleted via bottom-up index deletion. The high level goal here is to + * entirely prevent "unnecessary" page splits caused by MVCC version churn + * from UPDATEs (when the UPDATEs don't logically modify any of the columns + * covered by the 'rel' index). This is qualitative, not quantitative -- we + * do not particularly care about once-off opportunities to delete many index + * tuples together. + * + * See nbtree/README for details on the design of nbtree bottom-up deletion. + * See access/tableam.h for a description of how we're expected to cooperate + * with the tableam. + * + * Returns true on success, in which case caller can assume page split will be + * avoided for a reasonable amount of time. Returns false when caller should + * deduplicate the page (if possible at all). + * + * Note: Occasionally we return true despite failing to delete enough items to + * avoid a split. This makes caller skip deduplication and go split the page + * right away. Our return value is always just advisory information. + * + * Note: Caller should have already deleted all existing items with their + * LP_DEAD bits set. + */ +bool +_bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, + Size newitemsz) +{ + OffsetNumber offnum, + minoff, + maxoff; + Page page = BufferGetPage(buf); + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + BTDedupState state; + TM_IndexDeleteOp delstate; + bool neverdedup; + int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + + /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ + newitemsz += sizeof(ItemIdData); + + /* Initialize deduplication state */ + state = (BTDedupState) palloc(sizeof(BTDedupStateData)); + state->deduplicate = true; + state->nmaxitems = 0; + state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */ + state->base = NULL; + state->baseoff = InvalidOffsetNumber; + state->basetupsize = 0; + state->htids = palloc(state->maxpostingsize); + state->nhtids = 0; + state->nitems = 0; + state->phystupsize = 0; + state->nintervals = 0; + + /* + * Initialize tableam state that describes bottom-up index deletion + * operation. + * + * We'll go on to ask the tableam to search for TIDs whose index tuples we + * can safely delete. 
The tableam will search until our leaf page space + * target is satisfied, or until the cost of continuing with the tableam + * operation seems too high. It focuses its efforts on TIDs associated + * with duplicate index tuples that we mark "promising". + * + * This space target is a little arbitrary. The tableam must be able to + * keep the costs and benefits in balance. We provide the tableam with + * exhaustive information about what might work, without directly + * concerning ourselves with avoiding work during the tableam call. Our + * role in costing the bottom-up deletion process is strictly advisory. + */ + delstate.bottomup = true; + delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz); + delstate.ndeltids = 0; + delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete)); + delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus)); + + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + + Assert(!ItemIdIsDead(itemid)); + + if (offnum == minoff) + { + /* itup starts first pending interval */ + _bt_dedup_start_pending(state, itup, offnum); + } + else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && + _bt_dedup_save_htid(state, itup)) + { + /* Tuple is equal; just added its TIDs to pending interval */ + } + else + { + /* Finalize interval -- move its TIDs to delete state */ + _bt_bottomupdel_finish_pending(page, state, &delstate); + + /* itup starts new pending interval */ + _bt_dedup_start_pending(state, itup, offnum); + } + } + /* Finalize final interval -- move its TIDs to delete state */ + _bt_bottomupdel_finish_pending(page, state, &delstate); + + /* + * We don't give up now in the event of having few (or even zero) + * promising tuples for the tableam because it's not up to us as the index + * AM to manage costs (note that the tableam might have heuristics of its + * own that work out what to do). We should at least avoid having our + * caller do a useless deduplication pass after we return in the event of + * zero promising tuples, though. + */ + neverdedup = false; + if (state->nintervals == 0) + neverdedup = true; + + pfree(state->htids); + pfree(state); + + /* Ask tableam which TIDs are deletable, then physically delete them */ + _bt_delitems_delete_check(rel, buf, heapRel, &delstate); + + pfree(delstate.deltids); + pfree(delstate.status); + + /* Report "success" to caller unconditionally to avoid deduplication */ + if (neverdedup) + return true; + + /* Don't dedup when we won't end up back here any time soon anyway */ + return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz); +} + +/* + * Create a new pending posting list tuple based on caller's base tuple. + * + * Every tuple processed by deduplication either becomes the base tuple for a + * posting list, or gets its heap TID(s) accepted into a pending posting list. + * A tuple that starts out as the base tuple for a posting list will only + * actually be rewritten within _bt_dedup_finish_pending() when it turns out + * that there are duplicates that can be merged into the base tuple. 
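+ *
+ * Editorial worked example: if the base tuple is a plain tuple with heap
+ * TID (10,1), and duplicates with heap TIDs (10,2) and (12,5) are then
+ * accepted by _bt_dedup_save_htid(), the pending state ends up with
+ * nitems == 3 (three physical tuples to be replaced), nhtids == 3, and
+ * htids[] == {(10,1), (10,2), (12,5)}.  Had the base tuple itself been an
+ * existing posting list with two TIDs, nitems would still count physical
+ * tuples (3) while nhtids would count heap TIDs (4).  phystupsize
+ * accumulates the MAXALIGN()'d size of every replaced tuple plus its line
+ * pointer, which is what later lets _bt_dedup_finish_pending() report the
+ * space actually saved.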
+ */ +void +_bt_dedup_start_pending(BTDedupState state, IndexTuple base, + OffsetNumber baseoff) +{ + Assert(state->nhtids == 0); + Assert(state->nitems == 0); + Assert(!BTreeTupleIsPivot(base)); + + /* + * Copy heap TID(s) from new base tuple for new candidate posting list + * into working state's array + */ + if (!BTreeTupleIsPosting(base)) + { + memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData)); + state->nhtids = 1; + state->basetupsize = IndexTupleSize(base); + } + else + { + int nposting; + + nposting = BTreeTupleGetNPosting(base); + memcpy(state->htids, BTreeTupleGetPosting(base), + sizeof(ItemPointerData) * nposting); + state->nhtids = nposting; + /* basetupsize should not include existing posting list */ + state->basetupsize = BTreeTupleGetPostingOffset(base); + } + + /* + * Save new base tuple itself -- it'll be needed if we actually create a + * new posting list from new pending posting list. + * + * Must maintain physical size of all existing tuples (including line + * pointer overhead) so that we can calculate space savings on page. + */ + state->nitems = 1; + state->base = base; + state->baseoff = baseoff; + state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData); + /* Also save baseoff in pending state for interval */ + state->intervals[state->nintervals].baseoff = state->baseoff; +} + +/* + * Save itup heap TID(s) into pending posting list where possible. + * + * Returns bool indicating if the pending posting list managed by state now + * includes itup's heap TID(s). + */ +bool +_bt_dedup_save_htid(BTDedupState state, IndexTuple itup) +{ + int nhtids; + ItemPointer htids; + Size mergedtupsz; + + Assert(!BTreeTupleIsPivot(itup)); + + if (!BTreeTupleIsPosting(itup)) + { + nhtids = 1; + htids = &itup->t_tid; + } + else + { + nhtids = BTreeTupleGetNPosting(itup); + htids = BTreeTupleGetPosting(itup); + } + + /* + * Don't append (have caller finish pending posting list as-is) if + * appending heap TID(s) from itup would put us over maxpostingsize limit. + * + * This calculation needs to match the code used within _bt_form_posting() + * for new posting list tuples. + */ + mergedtupsz = MAXALIGN(state->basetupsize + + (state->nhtids + nhtids) * sizeof(ItemPointerData)); + + if (mergedtupsz > state->maxpostingsize) + { + /* + * Count this as an oversized item for single value strategy, though + * only when there are 50 TIDs in the final posting list tuple. This + * limit (which is fairly arbitrary) avoids confusion about how many + * 1/6 of a page tuples have been encountered/created by the current + * deduplication pass. + * + * Note: We deliberately don't consider which deduplication pass + * merged together tuples to create this item (could be a previous + * deduplication pass, or current pass). See _bt_do_singleval() + * comments. + */ + if (state->nhtids > 50) + state->nmaxitems++; + + return false; + } + + /* + * Save heap TIDs to pending posting list tuple -- itup can be merged into + * pending posting list + */ + state->nitems++; + memcpy(state->htids + state->nhtids, htids, + sizeof(ItemPointerData) * nhtids); + state->nhtids += nhtids; + state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData); + + return true; +} + +/* + * Finalize pending posting list tuple, and add it to the page. Final tuple + * is based on saved base tuple, and saved list of heap TIDs. + * + * Returns space saving from deduplicating to make a new posting list tuple. + * Note that this includes line pointer overhead. 
This is zero in the case + * where no deduplication was possible. + */ +Size +_bt_dedup_finish_pending(Page newpage, BTDedupState state) +{ + OffsetNumber tupoff; + Size tuplesz; + Size spacesaving; + + Assert(state->nitems > 0); + Assert(state->nitems <= state->nhtids); + Assert(state->intervals[state->nintervals].baseoff == state->baseoff); + + tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage)); + if (state->nitems == 1) + { + /* Use original, unchanged base tuple */ + tuplesz = IndexTupleSize(state->base); + if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff, + false, false) == InvalidOffsetNumber) + elog(ERROR, "deduplication failed to add tuple to page"); + + spacesaving = 0; + } + else + { + IndexTuple final; + + /* Form a tuple with a posting list */ + final = _bt_form_posting(state->base, state->htids, state->nhtids); + tuplesz = IndexTupleSize(final); + Assert(tuplesz <= state->maxpostingsize); + + /* Save final number of items for posting list */ + state->intervals[state->nintervals].nitems = state->nitems; + + Assert(tuplesz == MAXALIGN(IndexTupleSize(final))); + if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false, + false) == InvalidOffsetNumber) + elog(ERROR, "deduplication failed to add tuple to page"); + + pfree(final); + spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData)); + /* Increment nintervals, since we wrote a new posting list tuple */ + state->nintervals++; + Assert(spacesaving > 0 && spacesaving < BLCKSZ); + } + + /* Reset state for next pending posting list */ + state->nhtids = 0; + state->nitems = 0; + state->phystupsize = 0; + + return spacesaving; +} + +/* + * Finalize interval during bottom-up index deletion. + * + * During a bottom-up pass we expect that TIDs will be recorded in dedup state + * first, and then get moved over to delstate (in variable-sized batches) by + * calling here. Call here happens when the number of TIDs in a dedup + * interval is known, and interval gets finalized (i.e. when caller sees next + * tuple on the page is not a duplicate, or when caller runs out of tuples to + * process from leaf page). + * + * This is where bottom-up deletion determines and remembers which entries are + * duplicates. This will be important information to the tableam delete + * infrastructure later on. Plain index tuple duplicates are marked + * "promising" here, per tableam contract. + * + * Our approach to marking entries whose TIDs come from posting lists is more + * complicated. Posting lists can only be formed by a deduplication pass (or + * during an index build), so recent version churn affecting the pointed-to + * logical rows is not particularly likely. We may still give a weak signal + * about posting list tuples' entries (by marking just one of its TIDs/entries + * promising), though this is only a possibility in the event of further + * duplicate index tuples in final interval that covers posting list tuple (as + * in the plain tuple case). A weak signal/hint will be useful to the tableam + * when it has no stronger signal to go with for the deletion operation as a + * whole. + * + * The heuristics we use work well in practice because we only need to give + * the tableam the right _general_ idea about where to look. Garbage tends to + * naturally get concentrated in relatively few table blocks with workloads + * that bottom-up deletion targets. The tableam cannot possibly rank all + * available table blocks sensibly based on the hints we provide, but that's + * okay -- only the extremes matter. 
The tableam just needs to be able to + * predict which few table blocks will have the most tuples that are safe to + * delete for each deletion operation, with low variance across related + * deletion operations. + */ +static void +_bt_bottomupdel_finish_pending(Page page, BTDedupState state, + TM_IndexDeleteOp *delstate) +{ + bool dupinterval = (state->nitems > 1); + + Assert(state->nitems > 0); + Assert(state->nitems <= state->nhtids); + Assert(state->intervals[state->nintervals].baseoff == state->baseoff); + + for (int i = 0; i < state->nitems; i++) + { + OffsetNumber offnum = state->baseoff + i; + ItemId itemid = PageGetItemId(page, offnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + TM_IndexDelete *ideltid = &delstate->deltids[delstate->ndeltids]; + TM_IndexStatus *istatus = &delstate->status[delstate->ndeltids]; + + if (!BTreeTupleIsPosting(itup)) + { + /* Simple case: A plain non-pivot tuple */ + ideltid->tid = itup->t_tid; + ideltid->id = delstate->ndeltids; + istatus->idxoffnum = offnum; + istatus->knowndeletable = false; /* for now */ + istatus->promising = dupinterval; /* simple rule */ + istatus->freespace = ItemIdGetLength(itemid) + sizeof(ItemIdData); + + delstate->ndeltids++; + } + else + { + /* + * Complicated case: A posting list tuple. + * + * We make the conservative assumption that there can only be at + * most one affected logical row per posting list tuple. There + * will be at most one promising entry in deltids to represent + * this presumed lone logical row. Note that this isn't even + * considered unless the posting list tuple is also in an interval + * of duplicates -- this complicated rule is just a variant of the + * simple rule used to decide if plain index tuples are promising. + */ + int nitem = BTreeTupleGetNPosting(itup); + bool firstpromising = false; + bool lastpromising = false; + + Assert(_bt_posting_valid(itup)); + + if (dupinterval) + { + /* + * Complicated rule: either the first or last TID in the + * posting list gets marked promising (if any at all) + */ + BlockNumber minblocklist, + midblocklist, + maxblocklist; + ItemPointer mintid, + midtid, + maxtid; + + mintid = BTreeTupleGetHeapTID(itup); + midtid = BTreeTupleGetPostingN(itup, nitem / 2); + maxtid = BTreeTupleGetMaxHeapTID(itup); + minblocklist = ItemPointerGetBlockNumber(mintid); + midblocklist = ItemPointerGetBlockNumber(midtid); + maxblocklist = ItemPointerGetBlockNumber(maxtid); + + /* Only entry with predominant table block can be promising */ + firstpromising = (minblocklist == midblocklist); + lastpromising = (!firstpromising && + midblocklist == maxblocklist); + } + + for (int p = 0; p < nitem; p++) + { + ItemPointer htid = BTreeTupleGetPostingN(itup, p); + + ideltid->tid = *htid; + ideltid->id = delstate->ndeltids; + istatus->idxoffnum = offnum; + istatus->knowndeletable = false; /* for now */ + istatus->promising = false; + if ((firstpromising && p == 0) || + (lastpromising && p == nitem - 1)) + istatus->promising = true; + istatus->freespace = sizeof(ItemPointerData); /* at worst */ + + ideltid++; + istatus++; + delstate->ndeltids++; + } + } + } + + if (dupinterval) + { + state->intervals[state->nintervals].nitems = state->nitems; + state->nintervals++; + } + + /* Reset state for next interval */ + state->nhtids = 0; + state->nitems = 0; + state->phystupsize = 0; +} + +/* + * Determine if page non-pivot tuples (data items) are all duplicates of the + * same value -- if they are, deduplication's "single value" strategy should + * be applied. 
The general goal of this strategy is to ensure that + * nbtsplitloc.c (which uses its own single value strategy) will find a useful + * split point as further duplicates are inserted, and successive rightmost + * page splits occur among pages that store the same duplicate value. When + * the page finally splits, it should end up BTREE_SINGLEVAL_FILLFACTOR% full, + * just like it would if deduplication were disabled. + * + * We expect that affected workloads will require _several_ single value + * strategy deduplication passes (over a page that only stores duplicates) + * before the page is finally split. The first deduplication pass should only + * find regular non-pivot tuples. Later deduplication passes will find + * existing maxpostingsize-capped posting list tuples, which must be skipped + * over. The penultimate pass is generally the first pass that actually + * reaches _bt_singleval_fillfactor(), and so will deliberately leave behind a + * few untouched non-pivot tuples. The final deduplication pass won't free + * any space -- it will skip over everything without merging anything (it + * retraces the steps of the penultimate pass). + * + * Fortunately, having several passes isn't too expensive. Each pass (after + * the first pass) won't spend many cycles on the large posting list tuples + * left by previous passes. Each pass will find a large contiguous group of + * smaller duplicate tuples to merge together at the end of the page. + */ +static bool +_bt_do_singleval(Relation rel, Page page, BTDedupState state, + OffsetNumber minoff, IndexTuple newitem) +{ + int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + ItemId itemid; + IndexTuple itup; + + itemid = PageGetItemId(page, minoff); + itup = (IndexTuple) PageGetItem(page, itemid); + + if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) + { + itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page)); + itup = (IndexTuple) PageGetItem(page, itemid); + + if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) + return true; + } + + return false; +} + +/* + * Lower maxpostingsize when using "single value" strategy, to avoid a sixth + * and final maxpostingsize-capped tuple. The sixth and final posting list + * tuple will end up somewhat smaller than the first five. (Note: The first + * five tuples could actually just be very large duplicate tuples that + * couldn't be merged together at all. Deduplication will simply not modify + * the page when that happens.) + * + * When there are six posting lists on the page (after current deduplication + * pass goes on to create/observe a sixth very large tuple), caller should end + * its deduplication pass. It isn't useful to try to deduplicate items that + * are supposed to end up on the new right sibling page following the + * anticipated page split. A future deduplication pass of future right + * sibling page might take care of it. (This is why the first single value + * strategy deduplication pass for a given leaf page will generally find only + * plain non-pivot tuples -- see _bt_do_singleval() comments.) 
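+ *
+ * Editorial illustration of the intended end state: on a leaf page that
+ * contains nothing but duplicates of one value, repeated single value
+ * strategy passes tend to leave the page with five maxpostingsize-capped
+ * posting list tuples, one somewhat smaller sixth posting list tuple (its
+ * cap having been reduced here), and a tail of untouched tuples at the end
+ * of the page.  That layout gives nbtsplitloc.c's single value strategy a
+ * natural split point when the page finally does fill up, leaving the left
+ * half about BTREE_SINGLEVAL_FILLFACTOR% full.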
+ */ +static void +_bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz) +{ + Size leftfree; + int reduction; + + /* This calculation needs to match nbtsplitloc.c */ + leftfree = PageGetPageSize(page) - SizeOfPageHeaderData - + MAXALIGN(sizeof(BTPageOpaqueData)); + /* Subtract size of new high key (includes pivot heap TID space) */ + leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData)); + + /* + * Reduce maxpostingsize by an amount equal to target free space on left + * half of page + */ + reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0); + if (state->maxpostingsize > reduction) + state->maxpostingsize -= reduction; + else + state->maxpostingsize = 0; +} + +/* + * Build a posting list tuple based on caller's "base" index tuple and list of + * heap TIDs. When nhtids == 1, builds a standard non-pivot tuple without a + * posting list. (Posting list tuples can never have a single heap TID, partly + * because that ensures that deduplication always reduces final MAXALIGN()'d + * size of entire tuple.) + * + * Convention is that posting list starts at a MAXALIGN()'d offset (rather + * than a SHORTALIGN()'d offset), in line with the approach taken when + * appending a heap TID to new pivot tuple/high key during suffix truncation. + * This sometimes wastes a little space that was only needed as alignment + * padding in the original tuple. Following this convention simplifies the + * space accounting used when deduplicating a page (the same convention + * simplifies the accounting for choosing a point to split a page at). + * + * Note: Caller's "htids" array must be unique and already in ascending TID + * order. Any existing heap TIDs from "base" won't automatically appear in + * returned posting list tuple (they must be included in htids array.) + */ +IndexTuple +_bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids) +{ + uint32 keysize, + newsize; + IndexTuple itup; + + if (BTreeTupleIsPosting(base)) + keysize = BTreeTupleGetPostingOffset(base); + else + keysize = IndexTupleSize(base); + + Assert(!BTreeTupleIsPivot(base)); + Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX); + Assert(keysize == MAXALIGN(keysize)); + + /* Determine final size of new tuple */ + if (nhtids > 1) + newsize = MAXALIGN(keysize + + nhtids * sizeof(ItemPointerData)); + else + newsize = keysize; + + Assert(newsize <= INDEX_SIZE_MASK); + Assert(newsize == MAXALIGN(newsize)); + + /* Allocate memory using palloc0() (matches index_form_tuple()) */ + itup = palloc0(newsize); + memcpy(itup, base, keysize); + itup->t_info &= ~INDEX_SIZE_MASK; + itup->t_info |= newsize; + if (nhtids > 1) + { + /* Form posting list tuple */ + BTreeTupleSetPosting(itup, nhtids, keysize); + memcpy(BTreeTupleGetPosting(itup), htids, + sizeof(ItemPointerData) * nhtids); + Assert(_bt_posting_valid(itup)); + } + else + { + /* Form standard non-pivot tuple */ + itup->t_info &= ~INDEX_ALT_TID_MASK; + ItemPointerCopy(htids, &itup->t_tid); + Assert(ItemPointerIsValid(&itup->t_tid)); + } + + return itup; +} + +/* + * Generate a replacement tuple by "updating" a posting list tuple so that it + * no longer has TIDs that need to be deleted. + * + * Used by both VACUUM and index deletion. Caller's vacposting argument + * points to the existing posting list tuple to be updated. + * + * On return, caller's vacposting argument will point to final "updated" + * tuple, which will be palloc()'d in caller's memory context. 
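+ *
+ * Editorial worked example: if the original posting list holds the heap
+ * TIDs {(1,1), (1,2), (2,1), (3,7)} and vacposting->deletetids[] contains
+ * the 0-based positions {1, 3}, the replacement tuple's posting list is
+ * {(1,1), (2,1)}.  If only one TID had survived, the replacement would be a
+ * plain non-pivot tuple whose t_tid is that lone heap TID (never a
+ * single-TID posting list).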
+ */ +void +_bt_update_posting(BTVacuumPosting vacposting) +{ + IndexTuple origtuple = vacposting->itup; + uint32 keysize, + newsize; + IndexTuple itup; + int nhtids; + int ui, + d; + ItemPointer htids; + + nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids; + + Assert(_bt_posting_valid(origtuple)); + Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple)); + + /* + * Determine final size of new tuple. + * + * This calculation needs to match the code used within _bt_form_posting() + * for new posting list tuples. We avoid calling _bt_form_posting() here + * to save ourselves a second memory allocation for a htids workspace. + */ + keysize = BTreeTupleGetPostingOffset(origtuple); + if (nhtids > 1) + newsize = MAXALIGN(keysize + + nhtids * sizeof(ItemPointerData)); + else + newsize = keysize; + + Assert(newsize <= INDEX_SIZE_MASK); + Assert(newsize == MAXALIGN(newsize)); + + /* Allocate memory using palloc0() (matches index_form_tuple()) */ + itup = palloc0(newsize); + memcpy(itup, origtuple, keysize); + itup->t_info &= ~INDEX_SIZE_MASK; + itup->t_info |= newsize; + + if (nhtids > 1) + { + /* Form posting list tuple */ + BTreeTupleSetPosting(itup, nhtids, keysize); + htids = BTreeTupleGetPosting(itup); + } + else + { + /* Form standard non-pivot tuple */ + itup->t_info &= ~INDEX_ALT_TID_MASK; + htids = &itup->t_tid; + } + + ui = 0; + d = 0; + for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++) + { + if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i) + { + d++; + continue; + } + htids[ui++] = *BTreeTupleGetPostingN(origtuple, i); + } + Assert(ui == nhtids); + Assert(d == vacposting->ndeletedtids); + Assert(nhtids == 1 || _bt_posting_valid(itup)); + Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid)); + + /* vacposting arg's itup will now point to updated version */ + vacposting->itup = itup; +} + +/* + * Prepare for a posting list split by swapping heap TID in newitem with heap + * TID from original posting list (the 'oposting' heap TID located at offset + * 'postingoff'). Modifies newitem, so caller should pass their own private + * copy that can safely be modified. + * + * Returns new posting list tuple, which is palloc()'d in caller's context. + * This is guaranteed to be the same size as 'oposting'. Modified newitem is + * what caller actually inserts. (This happens inside the same critical + * section that performs an in-place update of old posting list using new + * posting list returned here.) + * + * While the keys from newitem and oposting must be opclass equal, and must + * generate identical output when run through the underlying type's output + * function, it doesn't follow that their representations match exactly. + * Caller must avoid assuming that there can't be representational differences + * that make datums from oposting bigger or smaller than the corresponding + * datums from newitem. For example, differences in TOAST input state might + * break a faulty assumption about tuple size (the executor is entitled to + * apply TOAST compression based on its own criteria). It also seems possible + * that further representational variation will be introduced in the future, + * in order to support nbtree features like page-level prefix compression. + * + * See nbtree/README for details on the design of posting list splits. 
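+ *
+ * Editorial worked example: suppose oposting holds the heap TIDs
+ * {(10,1), (10,2), (10,4)} and newitem arrives with heap TID (10,3), so
+ * that _bt_binsrch_posting() reports postingoff == 2.  The nposting tuple
+ * returned here holds {(10,1), (10,2), (10,3)}, while newitem leaves with
+ * its TID swapped to (10,4), the original rightmost TID.  Caller then
+ * inserts newitem just to the right of the (in-place updated) posting
+ * list, as described in the README's "Posting list splits" section.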
+ */ +IndexTuple +_bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff) +{ + int nhtids; + char *replacepos; + char *replaceposright; + Size nmovebytes; + IndexTuple nposting; + + nhtids = BTreeTupleGetNPosting(oposting); + Assert(_bt_posting_valid(oposting)); + + /* + * The postingoff argument originated as a _bt_binsrch_posting() return + * value. It will be 0 in the event of corruption that makes a leaf page + * contain a non-pivot tuple that's somehow identical to newitem (no two + * non-pivot tuples should ever have the same TID). This has been known + * to happen in the field from time to time. + * + * Perform a basic sanity check to catch this case now. + */ + if (!(postingoff > 0 && postingoff < nhtids)) + elog(ERROR, "posting list tuple with %d items cannot be split at offset %d", + nhtids, postingoff); + + /* + * Move item pointers in posting list to make a gap for the new item's + * heap TID. We shift TIDs one place to the right, losing original + * rightmost TID. (nmovebytes must not include TIDs to the left of + * postingoff, nor the existing rightmost/max TID that gets overwritten.) + */ + nposting = CopyIndexTuple(oposting); + replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff); + replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1); + nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData); + memmove(replaceposright, replacepos, nmovebytes); + + /* Fill the gap at postingoff with TID of new item (original new TID) */ + Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem)); + ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos); + + /* Now copy oposting's rightmost/max TID into new item (final new TID) */ + ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid); + + Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting), + BTreeTupleGetHeapTID(newitem)) < 0); + Assert(_bt_posting_valid(nposting)); + + return nposting; +} + +/* + * Verify posting list invariants for "posting", which must be a posting list + * tuple. Used within assertions. + */ +#ifdef USE_ASSERT_CHECKING +static bool +_bt_posting_valid(IndexTuple posting) +{ + ItemPointerData last; + ItemPointer htid; + + if (!BTreeTupleIsPosting(posting) || BTreeTupleGetNPosting(posting) < 2) + return false; + + /* Remember first heap TID for loop */ + ItemPointerCopy(BTreeTupleGetHeapTID(posting), &last); + if (!ItemPointerIsValid(&last)) + return false; + + /* Iterate, starting from second TID */ + for (int i = 1; i < BTreeTupleGetNPosting(posting); i++) + { + htid = BTreeTupleGetPostingN(posting, i); + + if (!ItemPointerIsValid(htid)) + return false; + if (ItemPointerCompare(htid, &last) <= 0) + return false; + ItemPointerCopy(htid, &last); + } + + return true; +} +#endif diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c new file mode 100644 index 0000000..1241c56 --- /dev/null +++ b/src/backend/access/nbtree/nbtinsert.c @@ -0,0 +1,3009 @@ +/*------------------------------------------------------------------------- + * + * nbtinsert.c + * Item insertion in Lehman and Yao btrees for Postgres. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtinsert.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/nbtxlog.h" +#include "access/transam.h" +#include "access/xloginsert.h" +#include "lib/qunique.h" +#include "miscadmin.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "storage/smgr.h" + +/* Minimum tree height for application of fastpath optimization */ +#define BTREE_FASTPATH_MIN_LEVEL 2 + + +static BTStack _bt_search_insert(Relation rel, BTInsertState insertstate); +static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, + Relation heapRel, + IndexUniqueCheck checkUnique, bool *is_unique, + uint32 *speculativeToken); +static OffsetNumber _bt_findinsertloc(Relation rel, + BTInsertState insertstate, + bool checkingunique, + bool indexUnchanged, + BTStack stack, + Relation heapRel); +static void _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack); +static void _bt_insertonpg(Relation rel, BTScanInsert itup_key, + Buffer buf, + Buffer cbuf, + BTStack stack, + IndexTuple itup, + Size itemsz, + OffsetNumber newitemoff, + int postingoff, + bool split_only_page); +static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, + Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, + IndexTuple newitem, IndexTuple orignewitem, + IndexTuple nposting, uint16 postingoff); +static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, + BTStack stack, bool isroot, bool isonly); +static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); +static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, + OffsetNumber itup_off, bool newfirstdataitem); +static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel, + BTInsertState insertstate, + bool simpleonly, bool checkingunique, + bool uniquedup, bool indexUnchanged); +static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel, + OffsetNumber *deletable, int ndeletable, + IndexTuple newitem, OffsetNumber minoff, + OffsetNumber maxoff); +static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable, + int ndeletable, IndexTuple newitem, + int *nblocks); +static inline int _bt_blk_cmp(const void *arg1, const void *arg2); + +/* + * _bt_doinsert() -- Handle insertion of a single index tuple in the tree. + * + * This routine is called by the public interface routine, btinsert. + * By here, itup is filled in, including the TID. + * + * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this + * will allow duplicates. Otherwise (UNIQUE_CHECK_YES or + * UNIQUE_CHECK_EXISTING) it will throw error for a duplicate. + * For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and + * don't actually insert. + * + * indexUnchanged executor hint indicates if itup is from an + * UPDATE that didn't logically change the indexed value, but + * must nevertheless have a new entry to point to a successor + * version. + * + * The result value is only significant for UNIQUE_CHECK_PARTIAL: + * it must be true if the entry is known unique, else false. + * (In the current implementation we'll also return true after a + * successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but + * that's just a coding artifact.) 
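+ *
+ * Editorial sketch of how this routine is typically driven (loosely
+ * modeled on the btinsert() shim mentioned above, not a verbatim copy of
+ * it; the caller is responsible for filling in the new tuple's TID first,
+ * per the contract described above):
+ *
+ *     itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
+ *     itup->t_tid = *ht_ctid;
+ *     result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
+ *     pfree(itup);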
+ */ +bool +_bt_doinsert(Relation rel, IndexTuple itup, + IndexUniqueCheck checkUnique, bool indexUnchanged, + Relation heapRel) +{ + bool is_unique = false; + BTInsertStateData insertstate; + BTScanInsert itup_key; + BTStack stack; + bool checkingunique = (checkUnique != UNIQUE_CHECK_NO); + + /* we need an insertion scan key to do our search, so build one */ + itup_key = _bt_mkscankey(rel, itup); + + if (checkingunique) + { + if (!itup_key->anynullkeys) + { + /* No (heapkeyspace) scantid until uniqueness established */ + itup_key->scantid = NULL; + } + else + { + /* + * Scan key for new tuple contains NULL key values. Bypass + * checkingunique steps. They are unnecessary because core code + * considers NULL unequal to every value, including NULL. + * + * This optimization avoids O(N^2) behavior within the + * _bt_findinsertloc() heapkeyspace path when a unique index has a + * large number of "duplicates" with NULL key values. + */ + checkingunique = false; + /* Tuple is unique in the sense that core code cares about */ + Assert(checkUnique != UNIQUE_CHECK_EXISTING); + is_unique = true; + } + } + + /* + * Fill in the BTInsertState working area, to track the current page and + * position within the page to insert on. + * + * Note that itemsz is passed down to lower level code that deals with + * inserting the item. It must be MAXALIGN()'d. This ensures that space + * accounting code consistently considers the alignment overhead that we + * expect PageAddItem() will add later. (Actually, index_form_tuple() is + * already conservative about alignment, but we don't rely on that from + * this distance. Besides, preserving the "true" tuple size in index + * tuple headers for the benefit of nbtsplitloc.c might happen someday. + * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.) + */ + insertstate.itup = itup; + insertstate.itemsz = MAXALIGN(IndexTupleSize(itup)); + insertstate.itup_key = itup_key; + insertstate.bounds_valid = false; + insertstate.buf = InvalidBuffer; + insertstate.postingoff = 0; + +search: + + /* + * Find and lock the leaf page that the tuple should be added to by + * searching from the root page. insertstate.buf will hold a buffer that + * is locked in exclusive mode afterwards. + */ + stack = _bt_search_insert(rel, &insertstate); + + /* + * checkingunique inserts are not allowed to go ahead when two tuples with + * equal key attribute values would be visible to new MVCC snapshots once + * the xact commits. Check for conflicts in the locked page/buffer (if + * needed) here. + * + * It might be necessary to check a page to the right in _bt_check_unique, + * though that should be very rare. In practice the first page the value + * could be on (with scantid omitted) is almost always also the only page + * that a matching tuple might be found on. This is due to the behavior + * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can + * only be allowed to cross a page boundary when there is no candidate + * leaf page split point that avoids it. Also, _bt_check_unique can use + * the leaf page high key to determine that there will be no duplicates on + * the right sibling without actually visiting it (it uses the high key in + * cases where the new item happens to belong at the far right of the leaf + * page). + * + * NOTE: obviously, _bt_check_unique can only detect keys that are already + * in the index; so it cannot defend against concurrent insertions of the + * same key. 
We protect against that by means of holding a write lock on + * the first page the value could be on, with omitted/-inf value for the + * implicit heap TID tiebreaker attribute. Any other would-be inserter of + * the same key must acquire a write lock on the same page, so only one + * would-be inserter can be making the check at one time. Furthermore, + * once we are past the check we hold write locks continuously until we + * have performed our insertion, so no later inserter can fail to see our + * insertion. (This requires some care in _bt_findinsertloc.) + * + * If we must wait for another xact, we release the lock while waiting, + * and then must perform a new search. + * + * For a partial uniqueness check, we don't wait for the other xact. Just + * let the tuple in and return false for possibly non-unique, or true for + * definitely unique. + */ + if (checkingunique) + { + TransactionId xwait; + uint32 speculativeToken; + + xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique, + &is_unique, &speculativeToken); + + if (unlikely(TransactionIdIsValid(xwait))) + { + /* Have to wait for the other guy ... */ + _bt_relbuf(rel, insertstate.buf); + insertstate.buf = InvalidBuffer; + + /* + * If it's a speculative insertion, wait for it to finish (ie. to + * go ahead with the insertion, or kill the tuple). Otherwise + * wait for the transaction to finish as usual. + */ + if (speculativeToken) + SpeculativeInsertionWait(xwait, speculativeToken); + else + XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex); + + /* start over... */ + if (stack) + _bt_freestack(stack); + goto search; + } + + /* Uniqueness is established -- restore heap tid as scantid */ + if (itup_key->heapkeyspace) + itup_key->scantid = &itup->t_tid; + } + + if (checkUnique != UNIQUE_CHECK_EXISTING) + { + OffsetNumber newitemoff; + + /* + * The only conflict predicate locking cares about for indexes is when + * an index tuple insert conflicts with an existing lock. We don't + * know the actual page we're going to insert on for sure just yet in + * checkingunique and !heapkeyspace cases, but it's okay to use the + * first page the value could be on (with scantid omitted) instead. + */ + CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf)); + + /* + * Do the insertion. Note that insertstate contains cached binary + * search bounds established within _bt_check_unique when insertion is + * checkingunique. + */ + newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique, + indexUnchanged, stack, heapRel); + _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack, + itup, insertstate.itemsz, newitemoff, + insertstate.postingoff, false); + } + else + { + /* just release the buffer */ + _bt_relbuf(rel, insertstate.buf); + } + + /* be tidy */ + if (stack) + _bt_freestack(stack); + pfree(itup_key); + + return is_unique; +} + +/* + * _bt_search_insert() -- _bt_search() wrapper for inserts + * + * Search the tree for a particular scankey, or more precisely for the first + * leaf page it could be on. Try to make use of the fastpath optimization's + * rightmost leaf page cache before actually searching the tree from the root + * page, though. + * + * Return value is a stack of parent-page pointers (though see notes about + * fastpath optimization and page splits below). insertstate->buf is set to + * the address of the leaf-page buffer, which is write-locked and pinned in + * all cases (if necessary by creating a new empty root page for caller). 
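+ * (When the fastpath described below is used, no descent stack is built and
+ * the returned stack is NULL.)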
+ * + * The fastpath optimization avoids most of the work of searching the tree + * repeatedly when a single backend inserts successive new tuples on the + * rightmost leaf page of an index. A backend cache of the rightmost leaf + * page is maintained within _bt_insertonpg(), and used here. The cache is + * invalidated here when an insert of a non-pivot tuple must take place on a + * non-rightmost leaf page. + * + * The optimization helps with indexes on an auto-incremented field. It also + * helps with indexes on datetime columns, as well as indexes with lots of + * NULL values. (NULLs usually get inserted in the rightmost page for single + * column indexes, since they usually get treated as coming after everything + * else in the key space. Individual NULL tuples will generally be placed on + * the rightmost leaf page due to the influence of the heap TID column.) + * + * Note that we avoid applying the optimization when there is insufficient + * space on the rightmost page to fit caller's new item. This is necessary + * because we'll need to return a real descent stack when a page split is + * expected (actually, caller can cope with a leaf page split that uses a NULL + * stack, but that's very slow and so must be avoided). Note also that the + * fastpath optimization acquires the lock on the page conditionally as a way + * of reducing extra contention when there are concurrent insertions into the + * rightmost page (we give up if we'd have to wait for the lock). We assume + * that it isn't useful to apply the optimization when there is contention, + * since each per-backend cache won't stay valid for long. + */ +static BTStack +_bt_search_insert(Relation rel, BTInsertState insertstate) +{ + Assert(insertstate->buf == InvalidBuffer); + Assert(!insertstate->bounds_valid); + Assert(insertstate->postingoff == 0); + + if (RelationGetTargetBlock(rel) != InvalidBlockNumber) + { + /* Simulate a _bt_getbuf() call with conditional locking */ + insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel)); + if (_bt_conditionallockbuf(rel, insertstate->buf)) + { + Page page; + BTPageOpaque opaque; + + _bt_checkpage(rel, insertstate->buf); + page = BufferGetPage(insertstate->buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* + * Check if the page is still the rightmost leaf page and has + * enough free space to accommodate the new tuple. Also check + * that the insertion scan key is strictly greater than the first + * non-pivot tuple on the page. (Note that we expect itup_key's + * scantid to be unset when our caller is a checkingunique + * inserter.) + */ + if (P_RIGHTMOST(opaque) && + P_ISLEAF(opaque) && + !P_IGNORE(opaque) && + PageGetFreeSpace(page) > insertstate->itemsz && + PageGetMaxOffsetNumber(page) >= P_HIKEY && + _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0) + { + /* + * Caller can use the fastpath optimization because cached + * block is still rightmost leaf page, which can fit caller's + * new tuple without splitting. Keep block in local cache for + * next insert, and have caller use NULL stack. + * + * Note that _bt_insert_parent() has an assertion that catches + * leaf page splits that somehow follow from a fastpath insert + * (it should only be passed a NULL stack when it must deal + * with a concurrent root page split, and never because a NULL + * stack was returned here). 
+ */ + return NULL; + } + + /* Page unsuitable for caller, drop lock and pin */ + _bt_relbuf(rel, insertstate->buf); + } + else + { + /* Lock unavailable, drop pin */ + ReleaseBuffer(insertstate->buf); + } + + /* Forget block, since cache doesn't appear to be useful */ + RelationSetTargetBlock(rel, InvalidBlockNumber); + } + + /* Cannot use optimization -- descend tree, return proper descent stack */ + return _bt_search(rel, insertstate->itup_key, &insertstate->buf, BT_WRITE, + NULL); +} + +/* + * _bt_check_unique() -- Check for violation of unique index constraint + * + * Returns InvalidTransactionId if there is no conflict, else an xact ID + * we must wait for to see if it commits a conflicting tuple. If an actual + * conflict is detected, no return --- just ereport(). If an xact ID is + * returned, and the conflicting tuple still has a speculative insertion in + * progress, *speculativeToken is set to non-zero, and the caller can wait for + * the verdict on the insertion using SpeculativeInsertionWait(). + * + * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return + * InvalidTransactionId because we don't want to wait. In this case we + * set *is_unique to false if there is a potential conflict, and the + * core code must redo the uniqueness check later. + * + * As a side-effect, sets state in insertstate that can later be used by + * _bt_findinsertloc() to reuse most of the binary search work we do + * here. + * + * Do not call here when there are NULL values in scan key. NULL should be + * considered unequal to NULL when checking for duplicates, but we are not + * prepared to handle that correctly. + */ +static TransactionId +_bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, + IndexUniqueCheck checkUnique, bool *is_unique, + uint32 *speculativeToken) +{ + IndexTuple itup = insertstate->itup; + IndexTuple curitup = NULL; + ItemId curitemid = NULL; + BTScanInsert itup_key = insertstate->itup_key; + SnapshotData SnapshotDirty; + OffsetNumber offset; + OffsetNumber maxoff; + Page page; + BTPageOpaque opaque; + Buffer nbuf = InvalidBuffer; + bool found = false; + bool inposting = false; + bool prevalldead = true; + int curposti = 0; + + /* Assume unique until we find a duplicate */ + *is_unique = true; + + InitDirtySnapshot(SnapshotDirty); + + page = BufferGetPage(insertstate->buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + maxoff = PageGetMaxOffsetNumber(page); + + /* + * Find the first tuple with the same key. + * + * This also saves the binary search bounds in insertstate. We use them + * in the fastpath below, but also in the _bt_findinsertloc() call later. + */ + Assert(!insertstate->bounds_valid); + offset = _bt_binsrch_insert(rel, insertstate); + + /* + * Scan over all equal tuples, looking for live conflicts. + */ + Assert(!insertstate->bounds_valid || insertstate->low == offset); + Assert(!itup_key->anynullkeys); + Assert(itup_key->scantid == NULL); + for (;;) + { + /* + * Each iteration of the loop processes one heap TID, not one index + * tuple. Current offset number for page isn't usually advanced on + * iterations that process heap TIDs from posting list tuples. + * + * "inposting" state is set when _inside_ a posting list --- not when + * we're at the start (or end) of a posting list. We advance curposti + * at the end of the iteration when inside a posting list tuple. 
In + * general, every loop iteration either advances the page offset or + * advances curposti --- an iteration that handles the rightmost/max + * heap TID in a posting list finally advances the page offset (and + * unsets "inposting"). + * + * Make sure the offset points to an actual index tuple before trying + * to examine it... + */ + if (offset <= maxoff) + { + /* + * Fastpath: In most cases, we can use cached search bounds to + * limit our consideration to items that are definitely + * duplicates. This fastpath doesn't apply when the original page + * is empty, or when initial offset is past the end of the + * original page, which may indicate that we need to examine a + * second or subsequent page. + * + * Note that this optimization allows us to avoid calling + * _bt_compare() directly when there are no duplicates, as long as + * the offset where the key will go is not at the end of the page. + */ + if (nbuf == InvalidBuffer && offset == insertstate->stricthigh) + { + Assert(insertstate->bounds_valid); + Assert(insertstate->low >= P_FIRSTDATAKEY(opaque)); + Assert(insertstate->low <= insertstate->stricthigh); + Assert(_bt_compare(rel, itup_key, page, offset) < 0); + break; + } + + /* + * We can skip items that are already marked killed. + * + * In the presence of heavy update activity an index may contain + * many killed items with the same key; running _bt_compare() on + * each killed item gets expensive. Just advance over killed + * items as quickly as we can. We only apply _bt_compare() when + * we get to a non-killed item. We could reuse the bounds to + * avoid _bt_compare() calls for known equal tuples, but it + * doesn't seem worth it. + */ + if (!inposting) + curitemid = PageGetItemId(page, offset); + if (inposting || !ItemIdIsDead(curitemid)) + { + ItemPointerData htid; + bool all_dead = false; + + if (!inposting) + { + /* Plain tuple, or first TID in posting list tuple */ + if (_bt_compare(rel, itup_key, page, offset) != 0) + break; /* we're past all the equal tuples */ + + /* Advanced curitup */ + curitup = (IndexTuple) PageGetItem(page, curitemid); + Assert(!BTreeTupleIsPivot(curitup)); + } + + /* okay, we gotta fetch the heap tuple using htid ... */ + if (!BTreeTupleIsPosting(curitup)) + { + /* ... htid is from simple non-pivot tuple */ + Assert(!inposting); + htid = curitup->t_tid; + } + else if (!inposting) + { + /* ... htid is first TID in new posting list */ + inposting = true; + prevalldead = true; + curposti = 0; + htid = *BTreeTupleGetPostingN(curitup, 0); + } + else + { + /* ... htid is second or subsequent TID in posting list */ + Assert(curposti > 0); + htid = *BTreeTupleGetPostingN(curitup, curposti); + } + + /* + * If we are doing a recheck, we expect to find the tuple we + * are rechecking. It's not a duplicate, but we have to keep + * scanning. + */ + if (checkUnique == UNIQUE_CHECK_EXISTING && + ItemPointerCompare(&htid, &itup->t_tid) == 0) + { + found = true; + } + + /* + * Check if there's any table tuples for this index entry + * satisfying SnapshotDirty. This is necessary because for AMs + * with optimizations like heap's HOT, we have just a single + * index entry for the entire chain. + */ + else if (table_index_fetch_tuple_check(heapRel, &htid, + &SnapshotDirty, + &all_dead)) + { + TransactionId xwait; + + /* + * It is a duplicate. If we are only doing a partial + * check, then don't bother checking if the tuple is being + * updated in another transaction. 
Just return the fact + * that it is a potential conflict and leave the full + * check till later. Don't invalidate binary search + * bounds. + */ + if (checkUnique == UNIQUE_CHECK_PARTIAL) + { + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + *is_unique = false; + return InvalidTransactionId; + } + + /* + * If this tuple is being updated by other transaction + * then we have to wait for its commit/abort. + */ + xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ? + SnapshotDirty.xmin : SnapshotDirty.xmax; + + if (TransactionIdIsValid(xwait)) + { + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + /* Tell _bt_doinsert to wait... */ + *speculativeToken = SnapshotDirty.speculativeToken; + /* Caller releases lock on buf immediately */ + insertstate->bounds_valid = false; + return xwait; + } + + /* + * Otherwise we have a definite conflict. But before + * complaining, look to see if the tuple we want to insert + * is itself now committed dead --- if so, don't complain. + * This is a waste of time in normal scenarios but we must + * do it to support CREATE INDEX CONCURRENTLY. + * + * We must follow HOT-chains here because during + * concurrent index build, we insert the root TID though + * the actual tuple may be somewhere in the HOT-chain. + * While following the chain we might not stop at the + * exact tuple which triggered the insert, but that's OK + * because if we find a live tuple anywhere in this chain, + * we have a unique key conflict. The other live tuple is + * not part of this chain because it had a different index + * entry. + */ + htid = itup->t_tid; + if (table_index_fetch_tuple_check(heapRel, &htid, + SnapshotSelf, NULL)) + { + /* Normal case --- it's still live */ + } + else + { + /* + * It's been deleted, so no error, and no need to + * continue searching + */ + break; + } + + /* + * Check for a conflict-in as we would if we were going to + * write to this page. We aren't actually going to write, + * but we want a chance to report SSI conflicts that would + * otherwise be masked by this unique constraint + * violation. + */ + CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf)); + + /* + * This is a definite conflict. Break the tuple down into + * datums and report the error. But first, make sure we + * release the buffer locks we're holding --- + * BuildIndexValueDescription could make catalog accesses, + * which in the worst case might touch this same index and + * cause deadlocks. + */ + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + _bt_relbuf(rel, insertstate->buf); + insertstate->buf = InvalidBuffer; + insertstate->bounds_valid = false; + + { + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + char *key_desc; + + index_deform_tuple(itup, RelationGetDescr(rel), + values, isnull); + + key_desc = BuildIndexValueDescription(rel, values, + isnull); + + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("duplicate key value violates unique constraint \"%s\"", + RelationGetRelationName(rel)), + key_desc ? errdetail("Key %s already exists.", + key_desc) : 0, + errtableconstraint(heapRel, + RelationGetRelationName(rel)))); + } + } + else if (all_dead && (!inposting || + (prevalldead && + curposti == BTreeTupleGetNPosting(curitup) - 1))) + { + /* + * The conflicting tuple (or all HOT chains pointed to by + * all posting list TIDs) is dead to everyone, so mark the + * index entry killed. 
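+ *
+ * Setting LP_DEAD here is only a hint: the entry is physically
+ * removed later, e.g. by _bt_delete_or_dedup_one_page() when space
+ * is needed on the page, or by VACUUM.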
+ */ + ItemIdMarkDead(curitemid); + opaque->btpo_flags |= BTP_HAS_GARBAGE; + + /* + * Mark buffer with a dirty hint, since state is not + * crucial. Be sure to mark the proper buffer dirty. + */ + if (nbuf != InvalidBuffer) + MarkBufferDirtyHint(nbuf, true); + else + MarkBufferDirtyHint(insertstate->buf, true); + } + + /* + * Remember if posting list tuple has even a single HOT chain + * whose members are not all dead + */ + if (!all_dead && inposting) + prevalldead = false; + } + } + + if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1) + { + /* Advance to next TID in same posting list */ + curposti++; + continue; + } + else if (offset < maxoff) + { + /* Advance to next tuple */ + curposti = 0; + inposting = false; + offset = OffsetNumberNext(offset); + } + else + { + int highkeycmp; + + /* If scankey == hikey we gotta check the next page too */ + if (P_RIGHTMOST(opaque)) + break; + highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY); + Assert(highkeycmp <= 0); + if (highkeycmp != 0) + break; + /* Advance to next non-dead page --- there must be one */ + for (;;) + { + BlockNumber nblkno = opaque->btpo_next; + + nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ); + page = BufferGetPage(nbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (!P_IGNORE(opaque)) + break; + if (P_RIGHTMOST(opaque)) + elog(ERROR, "fell off the end of index \"%s\"", + RelationGetRelationName(rel)); + } + /* Will also advance to next tuple */ + curposti = 0; + inposting = false; + maxoff = PageGetMaxOffsetNumber(page); + offset = P_FIRSTDATAKEY(opaque); + /* Don't invalidate binary search bounds */ + } + } + + /* + * If we are doing a recheck then we should have found the tuple we are + * checking. Otherwise there's something very wrong --- probably, the + * index is on a non-immutable expression. + */ + if (checkUnique == UNIQUE_CHECK_EXISTING && !found) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to re-find tuple within index \"%s\"", + RelationGetRelationName(rel)), + errhint("This may be because of a non-immutable index expression."), + errtableconstraint(heapRel, + RelationGetRelationName(rel)))); + + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + + return InvalidTransactionId; +} + + +/* + * _bt_findinsertloc() -- Finds an insert location for a tuple + * + * On entry, insertstate buffer contains the page the new tuple belongs + * on. It is exclusive-locked and pinned by the caller. + * + * If 'checkingunique' is true, the buffer on entry is the first page + * that contains duplicates of the new key. If there are duplicates on + * multiple pages, the correct insertion position might be some page to + * the right, rather than the first page. In that case, this function + * moves right to the correct target page. + * + * (In a !heapkeyspace index, there can be multiple pages with the same + * high key, where the new tuple could legitimately be placed on. In + * that case, the caller passes the first page containing duplicates, + * just like when checkingunique=true. If that page doesn't have enough + * room for the new tuple, this function moves right, trying to find a + * legal page that does.) + * + * If 'indexUnchanged' is true, this is for an UPDATE that didn't + * logically change the indexed value, but must nevertheless have a new + * entry to point to a successor version. This hint from the executor + * will influence our behavior when the page might have to be split and + * we must consider our options. 
Bottom-up index deletion can avoid + * pathological version-driven page splits, but we only want to go to the + * trouble of trying it when we already have moderate confidence that + * it's appropriate. The hint should not significantly affect our + * behavior over time unless practically all inserts on to the leaf page + * get the hint. + * + * On exit, insertstate buffer contains the chosen insertion page, and + * the offset within that page is returned. If _bt_findinsertloc needed + * to move right, the lock and pin on the original page are released, and + * the new buffer is exclusively locked and pinned instead. + * + * If insertstate contains cached binary search bounds, we will take + * advantage of them. This avoids repeating comparisons that we made in + * _bt_check_unique() already. + * + * If there is not enough room on the page for the new tuple, we try to + * make room by removing any LP_DEAD tuples. + */ +static OffsetNumber +_bt_findinsertloc(Relation rel, + BTInsertState insertstate, + bool checkingunique, + bool indexUnchanged, + BTStack stack, + Relation heapRel) +{ + BTScanInsert itup_key = insertstate->itup_key; + Page page = BufferGetPage(insertstate->buf); + BTPageOpaque opaque; + OffsetNumber newitemoff; + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* Check 1/3 of a page restriction */ + if (unlikely(insertstate->itemsz > BTMaxItemSize(page))) + _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page, + insertstate->itup); + + Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque)); + Assert(!insertstate->bounds_valid || checkingunique); + Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL); + Assert(itup_key->heapkeyspace || itup_key->scantid == NULL); + Assert(!itup_key->allequalimage || itup_key->heapkeyspace); + + if (itup_key->heapkeyspace) + { + /* Keep track of whether checkingunique duplicate seen */ + bool uniquedup = indexUnchanged; + + /* + * If we're inserting into a unique index, we may have to walk right + * through leaf pages to find the one leaf page that we must insert on + * to. + * + * This is needed for checkingunique callers because a scantid was not + * used when we called _bt_search(). scantid can only be set after + * _bt_check_unique() has checked for duplicates. The buffer + * initially stored in insertstate->buf has the page where the first + * duplicate key might be found, which isn't always the page that new + * tuple belongs on. The heap TID attribute for new tuple (scantid) + * could force us to insert on a sibling page, though that should be + * very rare in practice. + */ + if (checkingunique) + { + if (insertstate->low < insertstate->stricthigh) + { + /* Encountered a duplicate in _bt_check_unique() */ + Assert(insertstate->bounds_valid); + uniquedup = true; + } + + for (;;) + { + /* + * Does the new tuple belong on this page? + * + * The earlier _bt_check_unique() call may well have + * established a strict upper bound on the offset for the new + * item. If it's not the last item of the page (i.e. if there + * is at least one tuple on the page that goes after the tuple + * we're inserting) then we know that the tuple belongs on + * this page. We can skip the high key check. 
+ */ + if (insertstate->bounds_valid && + insertstate->low <= insertstate->stricthigh && + insertstate->stricthigh <= PageGetMaxOffsetNumber(page)) + break; + + /* Test '<=', not '!=', since scantid is set now */ + if (P_RIGHTMOST(opaque) || + _bt_compare(rel, itup_key, page, P_HIKEY) <= 0) + break; + + _bt_stepright(rel, insertstate, stack); + /* Update local state after stepping right */ + page = BufferGetPage(insertstate->buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + /* Assume duplicates (if checkingunique) */ + uniquedup = true; + } + } + + /* + * If the target page cannot fit newitem, try to avoid splitting the + * page on insert by performing deletion or deduplication now + */ + if (PageGetFreeSpace(page) < insertstate->itemsz) + _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false, + checkingunique, uniquedup, + indexUnchanged); + } + else + { + /*---------- + * This is a !heapkeyspace (version 2 or 3) index. The current page + * is the first page that we could insert the new tuple to, but there + * may be other pages to the right that we could opt to use instead. + * + * If the new key is equal to one or more existing keys, we can + * legitimately place it anywhere in the series of equal keys. In + * fact, if the new key is equal to the page's "high key" we can place + * it on the next page. If it is equal to the high key, and there's + * not room to insert the new tuple on the current page without + * splitting, then we move right hoping to find more free space and + * avoid a split. + * + * Keep scanning right until we + * (a) find a page with enough free space, + * (b) reach the last page where the tuple can legally go, or + * (c) get tired of searching. + * (c) is not flippant; it is important because if there are many + * pages' worth of equal keys, it's better to split one of the early + * pages than to scan all the way to the end of the run of equal keys + * on every insert. We implement "get tired" as a random choice, + * since stopping after scanning a fixed number of pages wouldn't work + * well (we'd never reach the right-hand side of previously split + * pages). The probability of moving right is set at 0.99, which may + * seem too high to change the behavior much, but it does an excellent + * job of preventing O(N^2) behavior with many equal keys. + *---------- + */ + while (PageGetFreeSpace(page) < insertstate->itemsz) + { + /* + * Before considering moving right, see if we can obtain enough + * space by erasing LP_DEAD items + */ + if (P_HAS_GARBAGE(opaque)) + { + /* Perform simple deletion */ + _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true, + false, false, false); + + if (PageGetFreeSpace(page) >= insertstate->itemsz) + break; /* OK, now we have enough space */ + } + + /* + * Nope, so check conditions (b) and (c) enumerated above + * + * The earlier _bt_check_unique() call may well have established a + * strict upper bound on the offset for the new item. If it's not + * the last item of the page (i.e. if there is at least one tuple + * on the page that's greater than the tuple we're inserting to) + * then we know that the tuple belongs on this page. We can skip + * the high key check. 
+ */ + if (insertstate->bounds_valid && + insertstate->low <= insertstate->stricthigh && + insertstate->stricthigh <= PageGetMaxOffsetNumber(page)) + break; + + if (P_RIGHTMOST(opaque) || + _bt_compare(rel, itup_key, page, P_HIKEY) != 0 || + random() <= (MAX_RANDOM_VALUE / 100)) + break; + + _bt_stepright(rel, insertstate, stack); + /* Update local state after stepping right */ + page = BufferGetPage(insertstate->buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + } + } + + /* + * We should now be on the correct page. Find the offset within the page + * for the new tuple. (Possibly reusing earlier search bounds.) + */ + Assert(P_RIGHTMOST(opaque) || + _bt_compare(rel, itup_key, page, P_HIKEY) <= 0); + + newitemoff = _bt_binsrch_insert(rel, insertstate); + + if (insertstate->postingoff == -1) + { + /* + * There is an overlapping posting list tuple with its LP_DEAD bit + * set. We don't want to unnecessarily unset its LP_DEAD bit while + * performing a posting list split, so perform simple index tuple + * deletion early. + */ + _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true, + false, false, false); + + /* + * Do new binary search. New insert location cannot overlap with any + * posting list now. + */ + Assert(!insertstate->bounds_valid); + insertstate->postingoff = 0; + newitemoff = _bt_binsrch_insert(rel, insertstate); + Assert(insertstate->postingoff == 0); + } + + return newitemoff; +} + +/* + * Step right to next non-dead page, during insertion. + * + * This is a bit more complicated than moving right in a search. We must + * write-lock the target page before releasing write lock on current page; + * else someone else's _bt_check_unique scan could fail to see our insertion. + * Write locks on intermediate dead pages won't do because we don't know when + * they will get de-linked from the tree. + * + * This is more aggressive than it needs to be for non-unique !heapkeyspace + * indexes. + */ +static void +_bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack) +{ + Page page; + BTPageOpaque opaque; + Buffer rbuf; + BlockNumber rblkno; + + page = BufferGetPage(insertstate->buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + rbuf = InvalidBuffer; + rblkno = opaque->btpo_next; + for (;;) + { + rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE); + page = BufferGetPage(rbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* + * If this page was incompletely split, finish the split now. We do + * this while holding a lock on the left sibling, which is not good + * because finishing the split could be a fairly lengthy operation. + * But this should happen very seldom. + */ + if (P_INCOMPLETE_SPLIT(opaque)) + { + _bt_finish_split(rel, rbuf, stack); + rbuf = InvalidBuffer; + continue; + } + + if (!P_IGNORE(opaque)) + break; + if (P_RIGHTMOST(opaque)) + elog(ERROR, "fell off the end of index \"%s\"", + RelationGetRelationName(rel)); + + rblkno = opaque->btpo_next; + } + /* rbuf locked; unlock buf, update state for caller */ + _bt_relbuf(rel, insertstate->buf); + insertstate->buf = rbuf; + insertstate->bounds_valid = false; +} + +/*---------- + * _bt_insertonpg() -- Insert a tuple on a particular page in the index. + * + * This recursive procedure does the following things: + * + * + if postingoff != 0, splits existing posting list tuple + * (since it overlaps with new 'itup' tuple). + * + if necessary, splits the target page, using 'itup_key' for + * suffix truncation on leaf pages (caller passes NULL for + * non-leaf pages). 
+ * + inserts the new tuple (might be split from posting list). + * + if the page was split, pops the parent stack, and finds the + * right place to insert the new child pointer (by walking + * right using information stored in the parent stack). + * + invokes itself with the appropriate tuple for the right + * child page on the parent. + * + updates the metapage if a true root or fast root is split. + * + * On entry, we must have the correct buffer in which to do the + * insertion, and the buffer must be pinned and write-locked. On return, + * we will have dropped both the pin and the lock on the buffer. + * + * This routine only performs retail tuple insertions. 'itup' should + * always be either a non-highkey leaf item, or a downlink (new high + * key items are created indirectly, when a page is split). When + * inserting to a non-leaf page, 'cbuf' is the left-sibling of the page + * we're inserting the downlink for. This function will clear the + * INCOMPLETE_SPLIT flag on it, and release the buffer. + *---------- + */ +static void +_bt_insertonpg(Relation rel, + BTScanInsert itup_key, + Buffer buf, + Buffer cbuf, + BTStack stack, + IndexTuple itup, + Size itemsz, + OffsetNumber newitemoff, + int postingoff, + bool split_only_page) +{ + Page page; + BTPageOpaque opaque; + bool isleaf, + isroot, + isrightmost, + isonly; + IndexTuple oposting = NULL; + IndexTuple origitup = NULL; + IndexTuple nposting = NULL; + + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + isleaf = P_ISLEAF(opaque); + isroot = P_ISROOT(opaque); + isrightmost = P_RIGHTMOST(opaque); + isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque); + + /* child buffer must be given iff inserting on an internal page */ + Assert(isleaf == !BufferIsValid(cbuf)); + /* tuple must have appropriate number of attributes */ + Assert(!isleaf || + BTreeTupleGetNAtts(itup, rel) == + IndexRelationGetNumberOfAttributes(rel)); + Assert(isleaf || + BTreeTupleGetNAtts(itup, rel) <= + IndexRelationGetNumberOfKeyAttributes(rel)); + Assert(!BTreeTupleIsPosting(itup)); + Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz); + /* Caller must always finish incomplete split for us */ + Assert(!P_INCOMPLETE_SPLIT(opaque)); + + /* + * Every internal page should have exactly one negative infinity item at + * all times. Only _bt_split() and _bt_newroot() should add items that + * become negative infinity items through truncation, since they're the + * only routines that allocate new internal pages. + */ + Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque)); + + /* + * Do we need to split an existing posting list item? + */ + if (postingoff != 0) + { + ItemId itemid = PageGetItemId(page, newitemoff); + + /* + * The new tuple is a duplicate with a heap TID that falls inside the + * range of an existing posting list tuple on a leaf page. Prepare to + * split an existing posting list. Overwriting the posting list with + * its post-split version is treated as an extra step in either the + * insert or page split critical section. + */ + Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage); + oposting = (IndexTuple) PageGetItem(page, itemid); + + /* + * postingoff value comes from earlier call to _bt_binsrch_posting(). + * Its binary search might think that a plain tuple must be a posting + * list tuple that needs to be split. This can happen with corruption + * involving an existing plain tuple that is a duplicate of the new + * item, up to and including its table TID. Check for that here in + * passing. 
+ * + * Also verify that our caller has made sure that the existing posting + * list tuple does not have its LP_DEAD bit set. + */ + if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"", + ItemPointerGetBlockNumber(&itup->t_tid), + ItemPointerGetOffsetNumber(&itup->t_tid), + newitemoff, BufferGetBlockNumber(buf), + RelationGetRelationName(rel)))); + + /* use a mutable copy of itup as our itup from here on */ + origitup = itup; + itup = CopyIndexTuple(origitup); + nposting = _bt_swap_posting(itup, oposting, postingoff); + /* itup now contains rightmost/max TID from oposting */ + + /* Alter offset so that newitem goes after posting list */ + newitemoff = OffsetNumberNext(newitemoff); + } + + /* + * Do we need to split the page to fit the item on it? + * + * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result, + * so this comparison is correct even though we appear to be accounting + * only for the item and not for its line pointer. + */ + if (PageGetFreeSpace(page) < itemsz) + { + Buffer rbuf; + + Assert(!split_only_page); + + /* split the buffer into left and right halves */ + rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup, + origitup, nposting, postingoff); + PredicateLockPageSplit(rel, + BufferGetBlockNumber(buf), + BufferGetBlockNumber(rbuf)); + + /*---------- + * By here, + * + * + our target page has been split; + * + the original tuple has been inserted; + * + we have write locks on both the old (left half) + * and new (right half) buffers, after the split; and + * + we know the key we want to insert into the parent + * (it's the "high key" on the left child page). + * + * We're ready to do the parent insertion. We need to hold onto the + * locks for the child pages until we locate the parent, but we can + * at least release the lock on the right child before doing the + * actual insertion. The lock on the left child will be released + * last of all by parent insertion, where it is the 'cbuf' of parent + * page. + *---------- + */ + _bt_insert_parent(rel, buf, rbuf, stack, isroot, isonly); + } + else + { + Buffer metabuf = InvalidBuffer; + Page metapg = NULL; + BTMetaPageData *metad = NULL; + BlockNumber blockcache; + + /* + * If we are doing this insert because we split a page that was the + * only one on its tree level, but was not the root, it may have been + * the "fast root". We need to ensure that the fast root link points + * at or above the current page. We can safely acquire a lock on the + * metapage here --- see comments for _bt_newroot(). + */ + if (unlikely(split_only_page)) + { + Assert(!isleaf); + Assert(BufferIsValid(cbuf)); + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + metapg = BufferGetPage(metabuf); + metad = BTPageGetMeta(metapg); + + if (metad->btm_fastlevel >= opaque->btpo_level) + { + /* no update wanted */ + _bt_relbuf(rel, metabuf); + metabuf = InvalidBuffer; + } + } + + /* Do the update. 
No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + if (postingoff != 0) + memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting))); + + if (PageAddItem(page, (Item) itup, itemsz, newitemoff, false, + false) == InvalidOffsetNumber) + elog(PANIC, "failed to add new item to block %u in index \"%s\"", + BufferGetBlockNumber(buf), RelationGetRelationName(rel)); + + MarkBufferDirty(buf); + + if (BufferIsValid(metabuf)) + { + /* upgrade meta-page if needed */ + if (metad->btm_version < BTREE_NOVAC_VERSION) + _bt_upgrademetapage(metapg); + metad->btm_fastroot = BufferGetBlockNumber(buf); + metad->btm_fastlevel = opaque->btpo_level; + MarkBufferDirty(metabuf); + } + + /* + * Clear INCOMPLETE_SPLIT flag on child if inserting the new item + * finishes a split + */ + if (!isleaf) + { + Page cpage = BufferGetPage(cbuf); + BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage); + + Assert(P_INCOMPLETE_SPLIT(cpageop)); + cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; + MarkBufferDirty(cbuf); + } + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_btree_insert xlrec; + xl_btree_metadata xlmeta; + uint8 xlinfo; + XLogRecPtr recptr; + uint16 upostingoff; + + xlrec.offnum = newitemoff; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert); + + if (isleaf && postingoff == 0) + { + /* Simple leaf insert */ + xlinfo = XLOG_BTREE_INSERT_LEAF; + } + else if (postingoff != 0) + { + /* + * Leaf insert with posting list split. Must include + * postingoff field before newitem/orignewitem. + */ + Assert(isleaf); + xlinfo = XLOG_BTREE_INSERT_POST; + } + else + { + /* Internal page insert, which finishes a split on cbuf */ + xlinfo = XLOG_BTREE_INSERT_UPPER; + XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD); + + if (BufferIsValid(metabuf)) + { + /* Actually, it's an internal page insert + meta update */ + xlinfo = XLOG_BTREE_INSERT_META; + + Assert(metad->btm_version >= BTREE_NOVAC_VERSION); + xlmeta.version = metad->btm_version; + xlmeta.root = metad->btm_root; + xlmeta.level = metad->btm_level; + xlmeta.fastroot = metad->btm_fastroot; + xlmeta.fastlevel = metad->btm_fastlevel; + xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages; + xlmeta.allequalimage = metad->btm_allequalimage; + + XLogRegisterBuffer(2, metabuf, + REGBUF_WILL_INIT | REGBUF_STANDARD); + XLogRegisterBufData(2, (char *) &xlmeta, + sizeof(xl_btree_metadata)); + } + } + + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + if (postingoff == 0) + { + /* Just log itup from caller */ + XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup)); + } + else + { + /* + * Insert with posting list split (XLOG_BTREE_INSERT_POST + * record) case. + * + * Log postingoff. Also log origitup, not itup. REDO routine + * must reconstruct final itup (as well as nposting) using + * _bt_swap_posting(). + */ + upostingoff = postingoff; + + XLogRegisterBufData(0, (char *) &upostingoff, sizeof(uint16)); + XLogRegisterBufData(0, (char *) origitup, + IndexTupleSize(origitup)); + } + + recptr = XLogInsert(RM_BTREE_ID, xlinfo); + + if (BufferIsValid(metabuf)) + PageSetLSN(metapg, recptr); + if (!isleaf) + PageSetLSN(BufferGetPage(cbuf), recptr); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + /* Release subsidiary buffers */ + if (BufferIsValid(metabuf)) + _bt_relbuf(rel, metabuf); + if (!isleaf) + _bt_relbuf(rel, cbuf); + + /* + * Cache the block number if this is the rightmost leaf page. Cache + * may be used by a future inserter within _bt_search_insert(). 
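+ *
+ * (We only remember the block number at this point; the
+ * RelationSetTargetBlock() call is deferred until after the buffer
+ * lock is released, and is skipped for small indexes -- see below.)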
+ */ + blockcache = InvalidBlockNumber; + if (isrightmost && isleaf && !isroot) + blockcache = BufferGetBlockNumber(buf); + + /* Release buffer for insertion target block */ + _bt_relbuf(rel, buf); + + /* + * If we decided to cache the insertion target block before releasing + * its buffer lock, then cache it now. Check the height of the tree + * first, though. We don't go for the optimization with small + * indexes. Defer final check to this point to ensure that we don't + * call _bt_getrootheight while holding a buffer lock. + */ + if (BlockNumberIsValid(blockcache) && + _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL) + RelationSetTargetBlock(rel, blockcache); + } + + /* be tidy */ + if (postingoff != 0) + { + /* itup is actually a modified copy of caller's original */ + pfree(nposting); + pfree(itup); + } +} + +/* + * _bt_split() -- split a page in the btree. + * + * On entry, buf is the page to split, and is pinned and write-locked. + * newitemoff etc. tell us about the new item that must be inserted + * along with the data from the original page. + * + * itup_key is used for suffix truncation on leaf pages (internal + * page callers pass NULL). When splitting a non-leaf page, 'cbuf' + * is the left-sibling of the page we're inserting the downlink for. + * This function will clear the INCOMPLETE_SPLIT flag on it, and + * release the buffer. + * + * orignewitem, nposting, and postingoff are needed when an insert of + * orignewitem results in both a posting list split and a page split. + * These extra posting list split details are used here in the same + * way as they are used in the more common case where a posting list + * split does not coincide with a page split. We need to deal with + * posting list splits directly in order to ensure that everything + * that follows from the insert of orignewitem is handled as a single + * atomic operation (though caller's insert of a new pivot/downlink + * into parent page will still be a separate operation). See + * nbtree/README for details on the design of posting list splits. + * + * Returns the new right sibling of buf, pinned and write-locked. + * The pin and lock on buf are maintained. + */ +static Buffer +_bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, + OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, + IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff) +{ + Buffer rbuf; + Page origpage; + Page leftpage, + rightpage; + BlockNumber origpagenumber, + rightpagenumber; + BTPageOpaque ropaque, + lopaque, + oopaque; + Buffer sbuf = InvalidBuffer; + Page spage = NULL; + BTPageOpaque sopaque = NULL; + Size itemsz; + ItemId itemid; + IndexTuple firstright, + lefthighkey; + OffsetNumber firstrightoff; + OffsetNumber afterleftoff, + afterrightoff, + minusinfoff; + OffsetNumber origpagepostingoff; + OffsetNumber maxoff; + OffsetNumber i; + bool newitemonleft, + isleaf, + isrightmost; + + /* + * origpage is the original page to be split. leftpage is a temporary + * buffer that receives the left-sibling data, which will be copied back + * into origpage on success. rightpage is the new page that will receive + * the right-sibling data. + * + * leftpage is allocated after choosing a split point. rightpage's new + * buffer isn't acquired until after leftpage is initialized and has new + * high key, the last point where splitting the page may fail (barring + * corruption). 
Failing before acquiring new buffer won't have lasting + * consequences, since origpage won't have been modified and leftpage is + * only workspace. + */ + origpage = BufferGetPage(buf); + oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage); + isleaf = P_ISLEAF(oopaque); + isrightmost = P_RIGHTMOST(oopaque); + maxoff = PageGetMaxOffsetNumber(origpage); + origpagenumber = BufferGetBlockNumber(buf); + + /* + * Choose a point to split origpage at. + * + * A split point can be thought of as a point _between_ two existing data + * items on origpage (the lastleft and firstright tuples), provided you + * pretend that the new item that didn't fit is already on origpage. + * + * Since origpage does not actually contain newitem, the representation of + * split points needs to work with two boundary cases: splits where + * newitem is lastleft, and splits where newitem is firstright. + * newitemonleft resolves the ambiguity that would otherwise exist when + * newitemoff == firstrightoff. In all other cases it's clear which side + * of the split every tuple goes on from context. newitemonleft is + * usually (but not always) redundant information. + * + * firstrightoff is supposed to be an origpage offset number, but it's + * possible that its value will be maxoff+1, which is "past the end" of + * origpage. This happens in the rare case where newitem goes after all + * existing items (i.e. newitemoff is maxoff+1) and we end up splitting + * origpage at the point that leaves newitem alone on new right page. Any + * "!newitemonleft && newitemoff == firstrightoff" split point makes + * newitem the firstright tuple, though, so this case isn't a special + * case. + */ + firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz, + newitem, &newitemonleft); + + /* Allocate temp buffer for leftpage */ + leftpage = PageGetTempPage(origpage); + _bt_pageinit(leftpage, BufferGetPageSize(buf)); + lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage); + + /* + * leftpage won't be the root when we're done. Also, clear the SPLIT_END + * and HAS_GARBAGE flags. + */ + lopaque->btpo_flags = oopaque->btpo_flags; + lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE); + /* set flag in leftpage indicating that rightpage has no downlink yet */ + lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT; + lopaque->btpo_prev = oopaque->btpo_prev; + /* handle btpo_next after rightpage buffer acquired */ + lopaque->btpo_level = oopaque->btpo_level; + /* handle btpo_cycleid after rightpage buffer acquired */ + + /* + * Copy the original page's LSN into leftpage, which will become the + * updated version of the page. We need this because XLogInsert will + * examine the LSN and possibly dump it in a page image. + */ + PageSetLSN(leftpage, PageGetLSN(origpage)); + + /* + * Determine page offset number of existing overlapped-with-orignewitem + * posting list when it is necessary to perform a posting list split in + * passing. Note that newitem was already changed by caller (newitem no + * longer has the orignewitem TID). + * + * This page offset number (origpagepostingoff) will be used to pretend + * that the posting split has already taken place, even though the + * required modifications to origpage won't occur until we reach the + * critical section. The lastleft and firstright tuples of our page split + * point should, in effect, come from an imaginary version of origpage + * that has the nposting tuple instead of the original posting list tuple. 
+ * + * Note: _bt_findsplitloc() should have compensated for coinciding posting + * list splits in just the same way, at least in theory. It doesn't + * bother with that, though. In practice it won't affect its choice of + * split point. + */ + origpagepostingoff = InvalidOffsetNumber; + if (postingoff != 0) + { + Assert(isleaf); + Assert(ItemPointerCompare(&orignewitem->t_tid, + &newitem->t_tid) < 0); + Assert(BTreeTupleIsPosting(nposting)); + origpagepostingoff = OffsetNumberPrev(newitemoff); + } + + /* + * The high key for the new left page is a possibly-truncated copy of + * firstright on the leaf level (it's "firstright itself" on internal + * pages; see !isleaf comments below). This may seem to be contrary to + * Lehman & Yao's approach of using a copy of lastleft as the new high key + * when splitting on the leaf level. It isn't, though. + * + * Suffix truncation will leave the left page's high key fully equal to + * lastleft when lastleft and firstright are equal prior to heap TID (that + * is, the tiebreaker TID value comes from lastleft). It isn't actually + * necessary for a new leaf high key to be a copy of lastleft for the L&Y + * "subtree" invariant to hold. It's sufficient to make sure that the new + * leaf high key is strictly less than firstright, and greater than or + * equal to (not necessarily equal to) lastleft. In other words, when + * suffix truncation isn't possible during a leaf page split, we take + * L&Y's exact approach to generating a new high key for the left page. + * (Actually, that is slightly inaccurate. We don't just use a copy of + * lastleft. A tuple with all the keys from firstright but the max heap + * TID from lastleft is used, to avoid introducing a special case.) + */ + if (!newitemonleft && newitemoff == firstrightoff) + { + /* incoming tuple becomes firstright */ + itemsz = newitemsz; + firstright = newitem; + } + else + { + /* existing item at firstrightoff becomes firstright */ + itemid = PageGetItemId(origpage, firstrightoff); + itemsz = ItemIdGetLength(itemid); + firstright = (IndexTuple) PageGetItem(origpage, itemid); + if (firstrightoff == origpagepostingoff) + firstright = nposting; + } + + if (isleaf) + { + IndexTuple lastleft; + + /* Attempt suffix truncation for leaf page splits */ + if (newitemonleft && newitemoff == firstrightoff) + { + /* incoming tuple becomes lastleft */ + lastleft = newitem; + } + else + { + OffsetNumber lastleftoff; + + /* existing item before firstrightoff becomes lastleft */ + lastleftoff = OffsetNumberPrev(firstrightoff); + Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque)); + itemid = PageGetItemId(origpage, lastleftoff); + lastleft = (IndexTuple) PageGetItem(origpage, itemid); + if (lastleftoff == origpagepostingoff) + lastleft = nposting; + } + + lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key); + itemsz = IndexTupleSize(lefthighkey); + } + else + { + /* + * Don't perform suffix truncation on a copy of firstright to make + * left page high key for internal page splits. Must use firstright + * as new high key directly. + * + * Each distinct separator key value originates as a leaf level high + * key; all other separator keys/pivot tuples are copied from one + * level down. A separator key in a grandparent page must be + * identical to high key in rightmost parent page of the subtree to + * its left, which must itself be identical to high key in rightmost + * child page of that same subtree (this even applies to separator + * from grandparent's high key). 
There must always be an unbroken + * "seam" of identical separator keys that guide index scans at every + * level, starting from the grandparent. That's why suffix truncation + * is unsafe here. + * + * Internal page splits will truncate firstright into a "negative + * infinity" data item when it gets inserted on the new right page + * below, though. This happens during the call to _bt_pgaddtup() for + * the new first data item for right page. Do not confuse this + * mechanism with suffix truncation. It is just a convenient way of + * implementing page splits that split the internal page "inside" + * firstright. The lefthighkey separator key cannot appear a second + * time in the right page (only firstright's downlink goes in right + * page). + */ + lefthighkey = firstright; + } + + /* + * Add new high key to leftpage + */ + afterleftoff = P_HIKEY; + + Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0); + Assert(BTreeTupleGetNAtts(lefthighkey, rel) <= + IndexRelationGetNumberOfKeyAttributes(rel)); + Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey))); + if (PageAddItem(leftpage, (Item) lefthighkey, itemsz, afterleftoff, false, + false) == InvalidOffsetNumber) + elog(ERROR, "failed to add high key to the left sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + afterleftoff = OffsetNumberNext(afterleftoff); + + /* + * Acquire a new right page to split into, now that left page has a new + * high key. From here on, it's not okay to throw an error without + * zeroing rightpage first. This coding rule ensures that we won't + * confuse future VACUUM operations, which might otherwise try to re-find + * a downlink to a leftover junk page as the page undergoes deletion. + * + * It would be reasonable to start the critical section just after the new + * rightpage buffer is acquired instead; that would allow us to avoid + * leftover junk pages without bothering to zero rightpage. We do it this + * way because it avoids an unnecessary PANIC when either origpage or its + * existing sibling page are corrupt. + */ + rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); + rightpage = BufferGetPage(rbuf); + rightpagenumber = BufferGetBlockNumber(rbuf); + /* rightpage was initialized by _bt_getbuf */ + ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage); + + /* + * Finish off remaining leftpage special area fields. They cannot be set + * before both origpage (leftpage) and rightpage buffers are acquired and + * locked. + * + * btpo_cycleid is only used with leaf pages, though we set it here in all + * cases just to be consistent. + */ + lopaque->btpo_next = rightpagenumber; + lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel); + + /* + * rightpage won't be the root when we're done. Also, clear the SPLIT_END + * and HAS_GARBAGE flags. + */ + ropaque->btpo_flags = oopaque->btpo_flags; + ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE); + ropaque->btpo_prev = origpagenumber; + ropaque->btpo_next = oopaque->btpo_next; + ropaque->btpo_level = oopaque->btpo_level; + ropaque->btpo_cycleid = lopaque->btpo_cycleid; + + /* + * Add new high key to rightpage where necessary. + * + * If the page we're splitting is not the rightmost page at its level in + * the tree, then the first entry on the page is the high key from + * origpage. 
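+ * (A rightmost page has no high key at all, so when origpage is the
+ * rightmost page on its level there is nothing to carry over here.)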
+ */ + afterrightoff = P_HIKEY; + + if (!isrightmost) + { + IndexTuple righthighkey; + + itemid = PageGetItemId(origpage, P_HIKEY); + itemsz = ItemIdGetLength(itemid); + righthighkey = (IndexTuple) PageGetItem(origpage, itemid); + Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0); + Assert(BTreeTupleGetNAtts(righthighkey, rel) <= + IndexRelationGetNumberOfKeyAttributes(rel)); + if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff, + false, false) == InvalidOffsetNumber) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add high key to the right sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + afterrightoff = OffsetNumberNext(afterrightoff); + } + + /* + * Internal page splits truncate first data item on right page -- it + * becomes "minus infinity" item for the page. Set this up here. + */ + minusinfoff = InvalidOffsetNumber; + if (!isleaf) + minusinfoff = afterrightoff; + + /* + * Now transfer all the data items (non-pivot tuples in isleaf case, or + * additional pivot tuples in !isleaf case) to the appropriate page. + * + * Note: we *must* insert at least the right page's items in item-number + * order, for the benefit of _bt_restore_page(). + */ + for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i)) + { + IndexTuple dataitem; + + itemid = PageGetItemId(origpage, i); + itemsz = ItemIdGetLength(itemid); + dataitem = (IndexTuple) PageGetItem(origpage, itemid); + + /* replace original item with nposting due to posting split? */ + if (i == origpagepostingoff) + { + Assert(BTreeTupleIsPosting(dataitem)); + Assert(itemsz == MAXALIGN(IndexTupleSize(nposting))); + dataitem = nposting; + } + + /* does new item belong before this one? 
*/ + else if (i == newitemoff) + { + if (newitemonleft) + { + Assert(newitemoff <= firstrightoff); + if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff, + false)) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add new item to the left sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + afterleftoff = OffsetNumberNext(afterleftoff); + } + else + { + Assert(newitemoff >= firstrightoff); + if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff, + afterrightoff == minusinfoff)) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add new item to the right sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + afterrightoff = OffsetNumberNext(afterrightoff); + } + } + + /* decide which page to put it on */ + if (i < firstrightoff) + { + if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false)) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add old item to the left sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + afterleftoff = OffsetNumberNext(afterleftoff); + } + else + { + if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff, + afterrightoff == minusinfoff)) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add old item to the right sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + afterrightoff = OffsetNumberNext(afterrightoff); + } + } + + /* Handle case where newitem goes at the end of rightpage */ + if (i <= newitemoff) + { + /* + * Can't have newitemonleft here; that would imply we were told to put + * *everything* on the left page, which cannot fit (if it could, we'd + * not be splitting the page). + */ + Assert(!newitemonleft && newitemoff == maxoff + 1); + if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff, + afterrightoff == minusinfoff)) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add new item to the right sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + afterrightoff = OffsetNumberNext(afterrightoff); + } + + /* + * We have to grab the original right sibling (if any) and update its prev + * link. We are guaranteed that this is deadlock-free, since we couple + * the locks in the standard order: left to right. + */ + if (!isrightmost) + { + sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE); + spage = BufferGetPage(sbuf); + sopaque = (BTPageOpaque) PageGetSpecialPointer(spage); + if (sopaque->btpo_prev != origpagenumber) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("right sibling's left-link doesn't match: " + "block %u links to %u instead of expected %u in index \"%s\"", + oopaque->btpo_next, sopaque->btpo_prev, origpagenumber, + RelationGetRelationName(rel)))); + } + + /* + * Check to see if we can set the SPLIT_END flag in the right-hand + * split page; this can save some I/O for vacuum since it need not + * proceed to the right sibling. We can set the flag if the right + * sibling has a different cycleid: that means it could not be part of + * a group of pages that were all split off from the same ancestor + * page. 
If you're confused, imagine that page A splits to A B and + * then again, yielding A C B, while vacuum is in progress. Tuples + * originally in A could now be in either B or C, hence vacuum must + * examine both pages. But if D, our right sibling, has a different + * cycleid then it could not contain any tuples that were in A when + * the vacuum started. + */ + if (sopaque->btpo_cycleid != ropaque->btpo_cycleid) + ropaque->btpo_flags |= BTP_SPLIT_END; + } + + /* + * Right sibling is locked, new siblings are prepared, but original page + * is not updated yet. + * + * NO EREPORT(ERROR) till right sibling is updated. We can get away with + * not starting the critical section till here because we haven't been + * scribbling on the original page yet; see comments above. + */ + START_CRIT_SECTION(); + + /* + * By here, the original data page has been split into two new halves, and + * these are correct. The algorithm requires that the left page never + * move during a split, so we copy the new left page back on top of the + * original. We need to do this before writing the WAL record, so that + * XLogInsert can WAL log an image of the page if necessary. + */ + PageRestoreTempPage(leftpage, origpage); + /* leftpage, lopaque must not be used below here */ + + MarkBufferDirty(buf); + MarkBufferDirty(rbuf); + + if (!isrightmost) + { + sopaque->btpo_prev = rightpagenumber; + MarkBufferDirty(sbuf); + } + + /* + * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes + * a split + */ + if (!isleaf) + { + Page cpage = BufferGetPage(cbuf); + BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage); + + cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; + MarkBufferDirty(cbuf); + } + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_btree_split xlrec; + uint8 xlinfo; + XLogRecPtr recptr; + + xlrec.level = ropaque->btpo_level; + /* See comments below on newitem, orignewitem, and posting lists */ + xlrec.firstrightoff = firstrightoff; + xlrec.newitemoff = newitemoff; + xlrec.postingoff = 0; + if (postingoff != 0 && origpagepostingoff < firstrightoff) + xlrec.postingoff = postingoff; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit); + + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT); + /* Log original right sibling, since we've changed its prev-pointer */ + if (!isrightmost) + XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD); + if (!isleaf) + XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD); + + /* + * Log the new item, if it was inserted on the left page. (If it was + * put on the right page, we don't need to explicitly WAL log it + * because it's included with all the other items on the right page.) + * Show the new item as belonging to the left page buffer, so that it + * is not stored if XLogInsert decides it needs a full-page image of + * the left page. We always store newitemoff in the record, though. + * + * The details are sometimes slightly different for page splits that + * coincide with a posting list split. If both the replacement + * posting list and newitem go on the right page, then we don't need + * to log anything extra, just like the simple !newitemonleft + * no-posting-split case (postingoff is set to zero in the WAL record, + * so recovery doesn't need to process a posting list split at all). + * Otherwise, we set postingoff and log orignewitem instead of + * newitem, despite having actually inserted newitem. REDO routine + * must reconstruct nposting and newitem using _bt_swap_posting(). 
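+ *
+ * To summarize the registrations in this WAL record (a condensed
+ * sketch of the code in this block, not additional behavior):
+ *
+ *   block 0: left page (buf), REGBUF_STANDARD; its bufdata carries
+ *            newitem or orignewitem only when needed, always followed
+ *            by the left page's new high key
+ *   block 1: right page (rbuf), REGBUF_WILL_INIT; its bufdata carries
+ *            the right page's tuples in item-number order
+ *   block 2: old right sibling (sbuf), registered only when
+ *            !isrightmost
+ *   block 3: child whose INCOMPLETE_SPLIT flag we clear (cbuf),
+ *            registered only when !isleaf
+ *
+ * The record itself is logged as XLOG_BTREE_SPLIT_L or
+ * XLOG_BTREE_SPLIT_R, depending on newitemonleft.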
+ * + * Note: It's possible that our page split point is the point that + * makes the posting list lastleft and newitem firstright. This is + * the only case where we log orignewitem/newitem despite newitem + * going on the right page. If XLogInsert decides that it can omit + * orignewitem due to logging a full-page image of the left page, + * everything still works out, since recovery only needs to log + * orignewitem for items on the left page (just like the regular + * newitem-logged case). + */ + if (newitemonleft && xlrec.postingoff == 0) + XLogRegisterBufData(0, (char *) newitem, newitemsz); + else if (xlrec.postingoff != 0) + { + Assert(isleaf); + Assert(newitemonleft || firstrightoff == newitemoff); + Assert(newitemsz == IndexTupleSize(orignewitem)); + XLogRegisterBufData(0, (char *) orignewitem, newitemsz); + } + + /* Log the left page's new high key */ + if (!isleaf) + { + /* lefthighkey isn't local copy, get current pointer */ + itemid = PageGetItemId(origpage, P_HIKEY); + lefthighkey = (IndexTuple) PageGetItem(origpage, itemid); + } + XLogRegisterBufData(0, (char *) lefthighkey, + MAXALIGN(IndexTupleSize(lefthighkey))); + + /* + * Log the contents of the right page in the format understood by + * _bt_restore_page(). The whole right page will be recreated. + * + * Direct access to page is not good but faster - we should implement + * some new func in page API. Note we only store the tuples + * themselves, knowing that they were inserted in item-number order + * and so the line pointers can be reconstructed. See comments for + * _bt_restore_page(). + */ + XLogRegisterBufData(1, + (char *) rightpage + ((PageHeader) rightpage)->pd_upper, + ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper); + + xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R; + recptr = XLogInsert(RM_BTREE_ID, xlinfo); + + PageSetLSN(origpage, recptr); + PageSetLSN(rightpage, recptr); + if (!isrightmost) + PageSetLSN(spage, recptr); + if (!isleaf) + PageSetLSN(BufferGetPage(cbuf), recptr); + } + + END_CRIT_SECTION(); + + /* release the old right sibling */ + if (!isrightmost) + _bt_relbuf(rel, sbuf); + + /* release the child */ + if (!isleaf) + _bt_relbuf(rel, cbuf); + + /* be tidy */ + if (isleaf) + pfree(lefthighkey); + + /* split's done */ + return rbuf; +} + +/* + * _bt_insert_parent() -- Insert downlink into parent, completing split. + * + * On entry, buf and rbuf are the left and right split pages, which we + * still hold write locks on. Both locks will be released here. We + * release the rbuf lock once we have a write lock on the page that we + * intend to insert a downlink to rbuf on (i.e. buf's current parent page). + * The lock on buf is released at the same point as the lock on the parent + * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same + * atomic operation that completes the split by inserting a new downlink. + * + * stack - stack showing how we got here. Will be NULL when splitting true + * root, or during concurrent root split, where we can be inefficient + * isroot - we split the true root + * isonly - we split a page alone on its level (might have been fast root) + */ +static void +_bt_insert_parent(Relation rel, + Buffer buf, + Buffer rbuf, + BTStack stack, + bool isroot, + bool isonly) +{ + /* + * Here we have to do something Lehman and Yao don't talk about: deal with + * a root split and construction of a new root. 
If our stack is empty + * then we have just split a node on what had been the root level when we + * descended the tree. If it was still the root then we perform a + * new-root construction. If it *wasn't* the root anymore, search to find + * the next higher level that someone constructed meanwhile, and find the + * right place to insert as for the normal case. + * + * If we have to search for the parent level, we do so by re-descending + * from the root. This is not super-efficient, but it's rare enough not + * to matter. + */ + if (isroot) + { + Buffer rootbuf; + + Assert(stack == NULL); + Assert(isonly); + /* create a new root node and update the metapage */ + rootbuf = _bt_newroot(rel, buf, rbuf); + /* release the split buffers */ + _bt_relbuf(rel, rootbuf); + _bt_relbuf(rel, rbuf); + _bt_relbuf(rel, buf); + } + else + { + BlockNumber bknum = BufferGetBlockNumber(buf); + BlockNumber rbknum = BufferGetBlockNumber(rbuf); + Page page = BufferGetPage(buf); + IndexTuple new_item; + BTStackData fakestack; + IndexTuple ritem; + Buffer pbuf; + + if (stack == NULL) + { + BTPageOpaque opaque; + + elog(DEBUG2, "concurrent ROOT page split"); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* + * We should never reach here when a leaf page split takes place + * despite the insert of newitem being able to apply the fastpath + * optimization. Make sure of that with an assertion. + * + * This is more of a performance issue than a correctness issue. + * The fastpath won't have a descent stack. Using a phony stack + * here works, but never rely on that. The fastpath should be + * rejected within _bt_search_insert() when the rightmost leaf + * page will split, since it's faster to go through _bt_search() + * and get a stack in the usual way. + */ + Assert(!(P_ISLEAF(opaque) && + BlockNumberIsValid(RelationGetTargetBlock(rel)))); + + /* Find the leftmost page at the next level up */ + pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL); + /* Set up a phony stack entry pointing there */ + stack = &fakestack; + stack->bts_blkno = BufferGetBlockNumber(pbuf); + stack->bts_offset = InvalidOffsetNumber; + stack->bts_parent = NULL; + _bt_relbuf(rel, pbuf); + } + + /* get high key from left, a strict lower bound for new right page */ + ritem = (IndexTuple) PageGetItem(page, + PageGetItemId(page, P_HIKEY)); + + /* form an index tuple that points at the new right page */ + new_item = CopyIndexTuple(ritem); + BTreeTupleSetDownLink(new_item, rbknum); + + /* + * Re-find and write lock the parent of buf. + * + * It's possible that the location of buf's downlink has changed since + * our initial _bt_search() descent. _bt_getstackbuf() will detect + * and recover from this, updating the stack, which ensures that the + * new downlink will be inserted at the correct offset. Even buf's + * parent may have changed. + */ + pbuf = _bt_getstackbuf(rel, stack, bknum); + + /* + * Unlock the right child. The left child will be unlocked in + * _bt_insertonpg(). + * + * Unlocking the right child must be delayed until here to ensure that + * no concurrent VACUUM operation can become confused. Page deletion + * cannot be allowed to fail to re-find a downlink for the rbuf page. + * (Actually, this is just a vestige of how things used to work. The + * page deletion code is expected to check for the INCOMPLETE_SPLIT + * flag on the left child. It won't attempt deletion of the right + * child until the split is complete. Despite all this, we opt to + * conservatively delay unlocking the right child until here.) 
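+ *
+ * (In outline, the steps that remain after this point are: release
+ * rbuf just below, then have _bt_insertonpg() insert new_item into the
+ * parent page pbuf.  That call clears buf's INCOMPLETE_SPLIT flag and
+ * releases the locks on pbuf and buf, and it may split the parent
+ * itself, recursing back into _bt_insert_parent().)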
+ */ + _bt_relbuf(rel, rbuf); + + if (pbuf == InvalidBuffer) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u", + RelationGetRelationName(rel), bknum, rbknum))); + + /* Recursively insert into the parent */ + _bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent, + new_item, MAXALIGN(IndexTupleSize(new_item)), + stack->bts_offset + 1, 0, isonly); + + /* be tidy */ + pfree(new_item); + } +} + +/* + * _bt_finish_split() -- Finish an incomplete split + * + * A crash or other failure can leave a split incomplete. The insertion + * routines won't allow to insert on a page that is incompletely split. + * Before inserting on such a page, call _bt_finish_split(). + * + * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked + * and unpinned. + */ +void +_bt_finish_split(Relation rel, Buffer lbuf, BTStack stack) +{ + Page lpage = BufferGetPage(lbuf); + BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage); + Buffer rbuf; + Page rpage; + BTPageOpaque rpageop; + bool wasroot; + bool wasonly; + + Assert(P_INCOMPLETE_SPLIT(lpageop)); + + /* Lock right sibling, the one missing the downlink */ + rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE); + rpage = BufferGetPage(rbuf); + rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage); + + /* Could this be a root split? */ + if (!stack) + { + Buffer metabuf; + Page metapg; + BTMetaPageData *metad; + + /* acquire lock on the metapage */ + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + metapg = BufferGetPage(metabuf); + metad = BTPageGetMeta(metapg); + + wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf)); + + _bt_relbuf(rel, metabuf); + } + else + wasroot = false; + + /* Was this the only page on the level before split? */ + wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop)); + + elog(DEBUG1, "finishing incomplete split of %u/%u", + BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf)); + + _bt_insert_parent(rel, lbuf, rbuf, stack, wasroot, wasonly); +} + +/* + * _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot + * tuple whose downlink points to child page. + * + * Caller passes child's block number, which is used to identify + * associated pivot tuple in parent page using a linear search that + * matches on pivot's downlink/block number. The expected location of + * the pivot tuple is taken from the stack one level above the child + * page. This is used as a starting point. Insertions into the + * parent level could cause the pivot tuple to move right; deletions + * could cause it to move left, but not left of the page we previously + * found it on. + * + * Caller can use its stack to relocate the pivot tuple/downlink for + * any same-level page to the right of the page found by its initial + * descent. This is necessary because of the possibility that caller + * moved right to recover from a concurrent page split. It's also + * convenient for certain callers to be able to step right when there + * wasn't a concurrent page split, while still using their original + * stack. For example, the checkingunique _bt_doinsert() case may + * have to step right when there are many physical duplicates, and its + * scantid forces an insertion to the right of the "first page the + * value could be on". (This is also relied on by all of our callers + * when dealing with !heapkeyspace indexes.) + * + * Returns write-locked parent page buffer, or InvalidBuffer if pivot + * tuple not found (should not happen). 
Adjusts bts_blkno & + * bts_offset if changed. Page split caller should insert its new + * pivot tuple for its new right sibling page on parent page, at the + * offset number bts_offset + 1. + */ +Buffer +_bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child) +{ + BlockNumber blkno; + OffsetNumber start; + + blkno = stack->bts_blkno; + start = stack->bts_offset; + + for (;;) + { + Buffer buf; + Page page; + BTPageOpaque opaque; + + buf = _bt_getbuf(rel, blkno, BT_WRITE); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + if (P_INCOMPLETE_SPLIT(opaque)) + { + _bt_finish_split(rel, buf, stack->bts_parent); + continue; + } + + if (!P_IGNORE(opaque)) + { + OffsetNumber offnum, + minoff, + maxoff; + ItemId itemid; + IndexTuple item; + + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + + /* + * start = InvalidOffsetNumber means "search the whole page". We + * need this test anyway due to possibility that page has a high + * key now when it didn't before. + */ + if (start < minoff) + start = minoff; + + /* + * Need this check too, to guard against possibility that page + * split since we visited it originally. + */ + if (start > maxoff) + start = OffsetNumberNext(maxoff); + + /* + * These loops will check every item on the page --- but in an + * order that's attuned to the probability of where it actually + * is. Scan to the right first, then to the left. + */ + for (offnum = start; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + itemid = PageGetItemId(page, offnum); + item = (IndexTuple) PageGetItem(page, itemid); + + if (BTreeTupleGetDownLink(item) == child) + { + /* Return accurate pointer to where link is now */ + stack->bts_blkno = blkno; + stack->bts_offset = offnum; + return buf; + } + } + + for (offnum = OffsetNumberPrev(start); + offnum >= minoff; + offnum = OffsetNumberPrev(offnum)) + { + itemid = PageGetItemId(page, offnum); + item = (IndexTuple) PageGetItem(page, itemid); + + if (BTreeTupleGetDownLink(item) == child) + { + /* Return accurate pointer to where link is now */ + stack->bts_blkno = blkno; + stack->bts_offset = offnum; + return buf; + } + } + } + + /* + * The item we're looking for moved right at least one page. + * + * Lehman and Yao couple/chain locks when moving right here, which we + * can avoid. See nbtree/README. + */ + if (P_RIGHTMOST(opaque)) + { + _bt_relbuf(rel, buf); + return InvalidBuffer; + } + blkno = opaque->btpo_next; + start = InvalidOffsetNumber; + _bt_relbuf(rel, buf); + } +} + +/* + * _bt_newroot() -- Create a new root page for the index. + * + * We've just split the old root page and need to create a new one. + * In order to do this, we add a new root page to the file, then lock + * the metadata page and update it. This is guaranteed to be deadlock- + * free, because all readers release their locks on the metadata page + * before trying to lock the root, and all writers lock the root before + * trying to lock the metadata page. We have a write lock on the old + * root page, so we have not introduced any cycles into the waits-for + * graph. + * + * On entry, lbuf (the old root) and rbuf (its new peer) are write- + * locked. On exit, a new root page exists with entries for the + * two new children, metapage is updated and unlocked/unpinned. + * The new root buffer is returned to caller which has to unlock/unpin + * lbuf, rbuf & rootbuf. 
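+ *
+ * Sketch of the resulting new root page (it is the rightmost page on
+ * its level, so it has no high key and P_HIKEY is its first data item):
+ *
+ *   at P_HIKEY:    "minus infinity" pivot tuple with zero key
+ *                  attributes, downlink -> lbuf (old root, left half)
+ *   at P_FIRSTKEY: copy of lbuf's high key,
+ *                  downlink -> rbuf (new right half)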
+ */ +static Buffer +_bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) +{ + Buffer rootbuf; + Page lpage, + rootpage; + BlockNumber lbkno, + rbkno; + BlockNumber rootblknum; + BTPageOpaque rootopaque; + BTPageOpaque lopaque; + ItemId itemid; + IndexTuple item; + IndexTuple left_item; + Size left_item_sz; + IndexTuple right_item; + Size right_item_sz; + Buffer metabuf; + Page metapg; + BTMetaPageData *metad; + + lbkno = BufferGetBlockNumber(lbuf); + rbkno = BufferGetBlockNumber(rbuf); + lpage = BufferGetPage(lbuf); + lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage); + + /* get a new root page */ + rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); + rootpage = BufferGetPage(rootbuf); + rootblknum = BufferGetBlockNumber(rootbuf); + + /* acquire lock on the metapage */ + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + metapg = BufferGetPage(metabuf); + metad = BTPageGetMeta(metapg); + + /* + * Create downlink item for left page (old root). The key value used is + * "minus infinity", a sentinel value that's reliably less than any real + * key value that could appear in the left page. + */ + left_item_sz = sizeof(IndexTupleData); + left_item = (IndexTuple) palloc(left_item_sz); + left_item->t_info = left_item_sz; + BTreeTupleSetDownLink(left_item, lbkno); + BTreeTupleSetNAtts(left_item, 0, false); + + /* + * Create downlink item for right page. The key for it is obtained from + * the "high key" position in the left page. + */ + itemid = PageGetItemId(lpage, P_HIKEY); + right_item_sz = ItemIdGetLength(itemid); + item = (IndexTuple) PageGetItem(lpage, itemid); + right_item = CopyIndexTuple(item); + BTreeTupleSetDownLink(right_item, rbkno); + + /* NO EREPORT(ERROR) from here till newroot op is logged */ + START_CRIT_SECTION(); + + /* upgrade metapage if needed */ + if (metad->btm_version < BTREE_NOVAC_VERSION) + _bt_upgrademetapage(metapg); + + /* set btree special data */ + rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); + rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE; + rootopaque->btpo_flags = BTP_ROOT; + rootopaque->btpo_level = + ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo_level + 1; + rootopaque->btpo_cycleid = 0; + + /* update metapage data */ + metad->btm_root = rootblknum; + metad->btm_level = rootopaque->btpo_level; + metad->btm_fastroot = rootblknum; + metad->btm_fastlevel = rootopaque->btpo_level; + + /* + * Insert the left page pointer into the new root page. The root page is + * the rightmost page on its level so there is no "high key" in it; the + * two items will go into positions P_HIKEY and P_FIRSTKEY. + * + * Note: we *must* insert the two items in item-number order, for the + * benefit of _bt_restore_page(). + */ + Assert(BTreeTupleGetNAtts(left_item, rel) == 0); + if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY, + false, false) == InvalidOffsetNumber) + elog(PANIC, "failed to add leftkey to new root page" + " while splitting block %u of index \"%s\"", + BufferGetBlockNumber(lbuf), RelationGetRelationName(rel)); + + /* + * insert the right page pointer into the new root page. 
+ */ + Assert(BTreeTupleGetNAtts(right_item, rel) > 0); + Assert(BTreeTupleGetNAtts(right_item, rel) <= + IndexRelationGetNumberOfKeyAttributes(rel)); + if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY, + false, false) == InvalidOffsetNumber) + elog(PANIC, "failed to add rightkey to new root page" + " while splitting block %u of index \"%s\"", + BufferGetBlockNumber(lbuf), RelationGetRelationName(rel)); + + /* Clear the incomplete-split flag in the left child */ + Assert(P_INCOMPLETE_SPLIT(lopaque)); + lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; + MarkBufferDirty(lbuf); + + MarkBufferDirty(rootbuf); + MarkBufferDirty(metabuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_btree_newroot xlrec; + XLogRecPtr recptr; + xl_btree_metadata md; + + xlrec.rootblk = rootblknum; + xlrec.level = metad->btm_level; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot); + + XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT); + XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD); + XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); + + Assert(metad->btm_version >= BTREE_NOVAC_VERSION); + md.version = metad->btm_version; + md.root = rootblknum; + md.level = metad->btm_level; + md.fastroot = rootblknum; + md.fastlevel = metad->btm_level; + md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages; + md.allequalimage = metad->btm_allequalimage; + + XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); + + /* + * Direct access to page is not good but faster - we should implement + * some new func in page API. + */ + XLogRegisterBufData(0, + (char *) rootpage + ((PageHeader) rootpage)->pd_upper, + ((PageHeader) rootpage)->pd_special - + ((PageHeader) rootpage)->pd_upper); + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT); + + PageSetLSN(lpage, recptr); + PageSetLSN(rootpage, recptr); + PageSetLSN(metapg, recptr); + } + + END_CRIT_SECTION(); + + /* done with metapage */ + _bt_relbuf(rel, metabuf); + + pfree(left_item); + pfree(right_item); + + return rootbuf; +} + +/* + * _bt_pgaddtup() -- add a data item to a particular page during split. + * + * The difference between this routine and a bare PageAddItem call is + * that this code can deal with the first data item on an internal btree + * page in passing. This data item (which is called "firstright" within + * _bt_split()) has a key that must be treated as minus infinity after + * the split. Therefore, we truncate away all attributes when caller + * specifies it's the first data item on page (downlink is not changed, + * though). This extra step is only needed for the right page of an + * internal page split. There is no need to do this for the first data + * item on the existing/left page, since that will already have been + * truncated during an earlier page split. + * + * See _bt_split() for a high level explanation of why we truncate here. + * Note that this routine has nothing to do with suffix truncation, + * despite using some of the same infrastructure. 
+ */ +static inline bool +_bt_pgaddtup(Page page, + Size itemsize, + IndexTuple itup, + OffsetNumber itup_off, + bool newfirstdataitem) +{ + IndexTupleData trunctuple; + + if (newfirstdataitem) + { + trunctuple = *itup; + trunctuple.t_info = sizeof(IndexTupleData); + BTreeTupleSetNAtts(&trunctuple, 0, false); + itup = &trunctuple; + itemsize = sizeof(IndexTupleData); + } + + if (unlikely(PageAddItem(page, (Item) itup, itemsize, itup_off, false, + false) == InvalidOffsetNumber)) + return false; + + return true; +} + +/* + * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split. + * + * There are three operations performed here: simple index deletion, bottom-up + * index deletion, and deduplication. If all three operations fail to free + * enough space for the incoming item then caller will go on to split the + * page. We always consider simple deletion first. If that doesn't work out + * we consider alternatives. Callers that only want us to consider simple + * deletion (without any fallback) ask for that using the 'simpleonly' + * argument. + * + * We usually pick only one alternative "complex" operation when simple + * deletion alone won't prevent a page split. The 'checkingunique', + * 'uniquedup', and 'indexUnchanged' arguments are used for that. + * + * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page + * level flag was found set. The flag was useful back when there wasn't + * necessarily one single page for a duplicate tuple to go on (before heap TID + * became a part of the key space in version 4 indexes). But we don't + * actually look at the flag anymore (it's not a gating condition for our + * caller). That would cause us to miss tuples that are safe to delete, + * without getting any benefit in return. We know that the alternative is to + * split the page; scanning the line pointer array in passing won't have + * noticeable overhead. (We still maintain the BTP_HAS_GARBAGE flag despite + * all this because !heapkeyspace indexes must still do a "getting tired" + * linear search, and so are likely to get some benefit from using it as a + * gating condition.) + */ +static void +_bt_delete_or_dedup_one_page(Relation rel, Relation heapRel, + BTInsertState insertstate, + bool simpleonly, bool checkingunique, + bool uniquedup, bool indexUnchanged) +{ + OffsetNumber deletable[MaxIndexTuplesPerPage]; + int ndeletable = 0; + OffsetNumber offnum, + minoff, + maxoff; + Buffer buffer = insertstate->buf; + BTScanInsert itup_key = insertstate->itup_key; + Page page = BufferGetPage(buffer); + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + Assert(P_ISLEAF(opaque)); + Assert(simpleonly || itup_key->heapkeyspace); + Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged)); + + /* + * Scan over all items to see which ones need to be deleted according to + * LP_DEAD flags. We'll usually manage to delete a few extra items that + * are not marked LP_DEAD in passing. Often the extra items that actually + * end up getting deleted are items that would have had their LP_DEAD bit + * set before long anyway (if we opted not to include them as extras). 
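+ *
+ * (Taken together with the checks further down, the order of attempts
+ * in this function is roughly:
+ *
+ *   1. simple deletion of any LP_DEAD items found by the scan below;
+ *   2. return early if that freed enough space, if the caller asked
+ *      for simpleonly, or if this is a checkingunique caller with no
+ *      known duplicates;
+ *   3. bottom-up deletion, when indexUnchanged or uniquedup;
+ *   4. a deduplication pass, when deduplication is enabled and the
+ *      index is allequalimage.)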
+ */ + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemId = PageGetItemId(page, offnum); + + if (ItemIdIsDead(itemId)) + deletable[ndeletable++] = offnum; + } + + if (ndeletable > 0) + { + _bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable, + insertstate->itup, minoff, maxoff); + insertstate->bounds_valid = false; + + /* Return when a page split has already been avoided */ + if (PageGetFreeSpace(page) >= insertstate->itemsz) + return; + + /* Might as well assume duplicates (if checkingunique) */ + uniquedup = true; + } + + /* + * We're done with simple deletion. Return early with callers that only + * call here so that simple deletion can be considered. This includes + * callers that explicitly ask for this and checkingunique callers that + * probably don't have any version churn duplicates on the page. + * + * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we + * return at this point (or when we go on the try either or both of our + * other strategies and they also fail). We do not bother expending a + * separate write to clear it, however. Caller will definitely clear it + * when it goes on to split the page (note also that the deduplication + * process will clear the flag in passing, just to keep things tidy). + */ + if (simpleonly || (checkingunique && !uniquedup)) + { + Assert(!indexUnchanged); + return; + } + + /* Assume bounds about to be invalidated (this is almost certain now) */ + insertstate->bounds_valid = false; + + /* + * Perform bottom-up index deletion pass when executor hint indicated that + * incoming item is logically unchanged, or for a unique index that is + * known to have physical duplicates for some other reason. (There is a + * large overlap between these two cases for a unique index. It's worth + * having both triggering conditions in order to apply the optimization in + * the event of successive related INSERT and DELETE statements.) + * + * We'll go on to do a deduplication pass when a bottom-up pass fails to + * delete an acceptable amount of free space (a significant fraction of + * the page, or space for the new item, whichever is greater). + * + * Note: Bottom-up index deletion uses the same equality/equivalence + * routines as deduplication internally. However, it does not merge + * together index tuples, so the same correctness considerations do not + * apply. We deliberately omit an index-is-allequalimage test here. + */ + if ((indexUnchanged || uniquedup) && + _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz)) + return; + + /* Perform deduplication pass (when enabled and index-is-allequalimage) */ + if (BTGetDeduplicateItems(rel) && itup_key->allequalimage) + _bt_dedup_pass(rel, buffer, heapRel, insertstate->itup, + insertstate->itemsz, (indexUnchanged || uniquedup)); +} + +/* + * _bt_simpledel_pass - Simple index tuple deletion pass. + * + * We delete all LP_DEAD-set index tuples on a leaf page. The offset numbers + * of all such tuples are determined by caller (caller passes these to us as + * its 'deletable' argument). + * + * We might also delete extra index tuples that turn out to be safe to delete + * in passing (though they must be cheap to check in passing to begin with). + * There is no certainty that any extra tuples will be deleted, though. 
The + * high level goal of the approach we take is to get the most out of each call + * here (without noticeably increasing the per-call overhead compared to what + * we need to do just to be able to delete the page's LP_DEAD-marked index + * tuples). + * + * The number of extra index tuples that turn out to be deletable might + * greatly exceed the number of LP_DEAD-marked index tuples due to various + * locality related effects. For example, it's possible that the total number + * of table blocks (pointed to by all TIDs on the leaf page) is naturally + * quite low, in which case we might end up checking if it's possible to + * delete _most_ index tuples on the page (without the tableam needing to + * access additional table blocks). The tableam will sometimes stumble upon + * _many_ extra deletable index tuples in indexes where this pattern is + * common. + * + * See nbtree/README for further details on simple index tuple deletion. + */ +static void +_bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel, + OffsetNumber *deletable, int ndeletable, IndexTuple newitem, + OffsetNumber minoff, OffsetNumber maxoff) +{ + Page page = BufferGetPage(buffer); + BlockNumber *deadblocks; + int ndeadblocks; + TM_IndexDeleteOp delstate; + OffsetNumber offnum; + + /* Get array of table blocks pointed to by LP_DEAD-set tuples */ + deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem, + &ndeadblocks); + + /* Initialize tableam state that describes index deletion operation */ + delstate.bottomup = false; + delstate.bottomupfreespace = 0; + delstate.ndeltids = 0; + delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete)); + delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus)); + + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids]; + TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids]; + BlockNumber tidblock; + void *match; + + if (!BTreeTupleIsPosting(itup)) + { + tidblock = ItemPointerGetBlockNumber(&itup->t_tid); + match = bsearch(&tidblock, deadblocks, ndeadblocks, + sizeof(BlockNumber), _bt_blk_cmp); + + if (!match) + { + Assert(!ItemIdIsDead(itemid)); + continue; + } + + /* + * TID's table block is among those pointed to by the TIDs from + * LP_DEAD-bit set tuples on page -- add TID to deltids + */ + odeltid->tid = itup->t_tid; + odeltid->id = delstate.ndeltids; + ostatus->idxoffnum = offnum; + ostatus->knowndeletable = ItemIdIsDead(itemid); + ostatus->promising = false; /* unused */ + ostatus->freespace = 0; /* unused */ + + delstate.ndeltids++; + } + else + { + int nitem = BTreeTupleGetNPosting(itup); + + for (int p = 0; p < nitem; p++) + { + ItemPointer tid = BTreeTupleGetPostingN(itup, p); + + tidblock = ItemPointerGetBlockNumber(tid); + match = bsearch(&tidblock, deadblocks, ndeadblocks, + sizeof(BlockNumber), _bt_blk_cmp); + + if (!match) + { + Assert(!ItemIdIsDead(itemid)); + continue; + } + + /* + * TID's table block is among those pointed to by the TIDs + * from LP_DEAD-bit set tuples on page -- add TID to deltids + */ + odeltid->tid = *tid; + odeltid->id = delstate.ndeltids; + ostatus->idxoffnum = offnum; + ostatus->knowndeletable = ItemIdIsDead(itemid); + ostatus->promising = false; /* unused */ + ostatus->freespace = 0; /* unused */ + + odeltid++; + ostatus++; + delstate.ndeltids++; + } + } + } + + pfree(deadblocks); + + 
Assert(delstate.ndeltids >= ndeletable); + + /* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */ + _bt_delitems_delete_check(rel, buffer, heapRel, &delstate); + + pfree(delstate.deltids); + pfree(delstate.status); +} + +/* + * _bt_deadblocks() -- Get LP_DEAD related table blocks. + * + * Builds sorted and unique-ified array of table block numbers from index + * tuple TIDs whose line pointers are marked LP_DEAD. Also adds the table + * block from incoming newitem just in case it isn't among the LP_DEAD-related + * table blocks. + * + * Always counting the newitem's table block as an LP_DEAD related block makes + * sense because the cost is consistently low; it is practically certain that + * the table block will not incur a buffer miss in tableam. On the other hand + * the benefit is often quite high. There is a decent chance that there will + * be some deletable items from this block, since in general most garbage + * tuples became garbage in the recent past (in many cases this won't be the + * first logical row that core code added to/modified in table block + * recently). + * + * Returns final array, and sets *nblocks to its final size for caller. + */ +static BlockNumber * +_bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable, + IndexTuple newitem, int *nblocks) +{ + int spacentids, + ntids; + BlockNumber *tidblocks; + + /* + * Accumulate each TID's block in array whose initial size has space for + * one table block per LP_DEAD-set tuple (plus space for the newitem table + * block). Array will only need to grow when there are LP_DEAD-marked + * posting list tuples (which is not that common). + */ + spacentids = ndeletable + 1; + ntids = 0; + tidblocks = (BlockNumber *) palloc(sizeof(BlockNumber) * spacentids); + + /* + * First add the table block for the incoming newitem. This is the one + * case where simple deletion can visit a table block that doesn't have + * any known deletable items. 
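+ *
+ * For example, if the LP_DEAD items' TIDs point into table blocks 7, 3
+ * and 7, and newitem points into table block 2, then after the
+ * qsort/qunique step at the bottom the returned array is {2, 3, 7} and
+ * *nblocks is set to 3.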
+ */ + Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem)); + tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid); + + for (int i = 0; i < ndeletable; i++) + { + ItemId itemid = PageGetItemId(page, deletable[i]); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + + Assert(ItemIdIsDead(itemid)); + + if (!BTreeTupleIsPosting(itup)) + { + if (ntids + 1 > spacentids) + { + spacentids *= 2; + tidblocks = (BlockNumber *) + repalloc(tidblocks, sizeof(BlockNumber) * spacentids); + } + + tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid); + } + else + { + int nposting = BTreeTupleGetNPosting(itup); + + if (ntids + nposting > spacentids) + { + spacentids = Max(spacentids * 2, ntids + nposting); + tidblocks = (BlockNumber *) + repalloc(tidblocks, sizeof(BlockNumber) * spacentids); + } + + for (int j = 0; j < nposting; j++) + { + ItemPointer tid = BTreeTupleGetPostingN(itup, j); + + tidblocks[ntids++] = ItemPointerGetBlockNumber(tid); + } + } + } + + qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp); + *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp); + + return tidblocks; +} + +/* + * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass + */ +static inline int +_bt_blk_cmp(const void *arg1, const void *arg2) +{ + BlockNumber b1 = *((BlockNumber *) arg1); + BlockNumber b2 = *((BlockNumber *) arg2); + + if (b1 < b2) + return -1; + else if (b1 > b2) + return 1; + + return 0; +} diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c new file mode 100644 index 0000000..ebec8fa --- /dev/null +++ b/src/backend/access/nbtree/nbtpage.c @@ -0,0 +1,3073 @@ +/*------------------------------------------------------------------------- + * + * nbtpage.c + * BTree-specific page management code for the Postgres btree access + * method. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtpage.c + * + * NOTES + * Postgres btree pages look like ordinary relation pages. The opaque + * data at high addresses includes pointers to left and right siblings + * and flag data describing page state. The first page in a btree, page + * zero, is special -- it stores meta-information describing the tree. + * Pages one and higher store the actual tree data. 
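+ *
+ * For reference, the metapage holds a BTMetaPageData struct whose
+ * principal fields (as initialized by _bt_initmetapage() below) are
+ * btm_magic, btm_version, btm_root, btm_level, btm_fastroot,
+ * btm_fastlevel, btm_last_cleanup_num_delpages,
+ * btm_last_cleanup_num_heap_tuples (unused as of PostgreSQL 14) and
+ * btm_allequalimage.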
+ * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/nbtxlog.h" +#include "access/tableam.h" +#include "access/transam.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "miscadmin.h" +#include "storage/indexfsm.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "storage/procarray.h" +#include "utils/memdebug.h" +#include "utils/memutils.h" +#include "utils/snapmgr.h" + +static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf); +static void _bt_log_reuse_page(Relation rel, BlockNumber blkno, + FullTransactionId safexid); +static void _bt_delitems_delete(Relation rel, Buffer buf, + TransactionId latestRemovedXid, + OffsetNumber *deletable, int ndeletable, + BTVacuumPosting *updatable, int nupdatable); +static char *_bt_delitems_update(BTVacuumPosting *updatable, int nupdatable, + OffsetNumber *updatedoffsets, + Size *updatedbuflen, bool needswal); +static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, + BTStack stack); +static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, + BlockNumber scanblkno, + bool *rightsib_empty, + BTVacState *vstate); +static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child, + BTStack stack, + Buffer *subtreeparent, + OffsetNumber *poffset, + BlockNumber *topparent, + BlockNumber *topparentrightsib); +static void _bt_pendingfsm_add(BTVacState *vstate, BlockNumber target, + FullTransactionId safexid); + +/* + * _bt_initmetapage() -- Fill a page buffer with a correct metapage image + */ +void +_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, + bool allequalimage) +{ + BTMetaPageData *metad; + BTPageOpaque metaopaque; + + _bt_pageinit(page, BLCKSZ); + + metad = BTPageGetMeta(page); + metad->btm_magic = BTREE_MAGIC; + metad->btm_version = BTREE_VERSION; + metad->btm_root = rootbknum; + metad->btm_level = level; + metad->btm_fastroot = rootbknum; + metad->btm_fastlevel = level; + metad->btm_last_cleanup_num_delpages = 0; + metad->btm_last_cleanup_num_heap_tuples = -1.0; + metad->btm_allequalimage = allequalimage; + + metaopaque = (BTPageOpaque) PageGetSpecialPointer(page); + metaopaque->btpo_flags = BTP_META; + + /* + * Set pd_lower just past the end of the metadata. This is essential, + * because without doing so, metadata will be lost if xlog.c compresses + * the page. + */ + ((PageHeader) page)->pd_lower = + ((char *) metad + sizeof(BTMetaPageData)) - (char *) page; +} + +/* + * _bt_upgrademetapage() -- Upgrade a meta-page from an old format to version + * 3, the last version that can be updated without broadly affecting + * on-disk compatibility. (A REINDEX is required to upgrade to v4.) + * + * This routine does purely in-memory image upgrade. Caller is + * responsible for locking, WAL-logging etc. 
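+ *
+ * A sketch of the typical caller pattern (as in _bt_set_cleanup_info()
+ * and _bt_getroot() below, and _bt_newroot() in nbtinsert.c):
+ *
+ *   START_CRIT_SECTION();
+ *   if (metad->btm_version < BTREE_NOVAC_VERSION)
+ *       _bt_upgrademetapage(metapg);
+ *   ... modify metapage fields ...
+ *   MarkBufferDirty(metabuf);
+ *   ... XLogInsert() when RelationNeedsWAL(rel) ...
+ *   END_CRIT_SECTION();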
+ */ +void +_bt_upgrademetapage(Page page) +{ + BTMetaPageData *metad; + BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY; + + metad = BTPageGetMeta(page); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* It must be really a meta page of upgradable version */ + Assert(metaopaque->btpo_flags & BTP_META); + Assert(metad->btm_version < BTREE_NOVAC_VERSION); + Assert(metad->btm_version >= BTREE_MIN_VERSION); + + /* Set version number and fill extra fields added into version 3 */ + metad->btm_version = BTREE_NOVAC_VERSION; + metad->btm_last_cleanup_num_delpages = 0; + metad->btm_last_cleanup_num_heap_tuples = -1.0; + /* Only a REINDEX can set this field */ + Assert(!metad->btm_allequalimage); + metad->btm_allequalimage = false; + + /* Adjust pd_lower (see _bt_initmetapage() for details) */ + ((PageHeader) page)->pd_lower = + ((char *) metad + sizeof(BTMetaPageData)) - (char *) page; +} + +/* + * Get metadata from share-locked buffer containing metapage, while performing + * standard sanity checks. + * + * Callers that cache data returned here in local cache should note that an + * on-the-fly upgrade using _bt_upgrademetapage() can change the version field + * and BTREE_NOVAC_VERSION specific fields without invalidating local cache. + */ +static BTMetaPageData * +_bt_getmeta(Relation rel, Buffer metabuf) +{ + Page metapg; + BTPageOpaque metaopaque; + BTMetaPageData *metad; + + metapg = BufferGetPage(metabuf); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); + metad = BTPageGetMeta(metapg); + + /* sanity-check the metapage */ + if (!P_ISMETA(metaopaque) || + metad->btm_magic != BTREE_MAGIC) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" is not a btree", + RelationGetRelationName(rel)))); + + if (metad->btm_version < BTREE_MIN_VERSION || + metad->btm_version > BTREE_VERSION) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("version mismatch in index \"%s\": file version %d, " + "current version %d, minimal supported version %d", + RelationGetRelationName(rel), + metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION))); + + return metad; +} + +/* + * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup + * + * Called by btvacuumcleanup when btbulkdelete was never called because no + * index tuples needed to be deleted. + */ +bool +_bt_vacuum_needs_cleanup(Relation rel) +{ + Buffer metabuf; + Page metapg; + BTMetaPageData *metad; + uint32 btm_version; + BlockNumber prev_num_delpages; + + /* + * Copy details from metapage to local variables quickly. + * + * Note that we deliberately avoid using cached version of metapage here. + */ + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metapg = BufferGetPage(metabuf); + metad = BTPageGetMeta(metapg); + btm_version = metad->btm_version; + + if (btm_version < BTREE_NOVAC_VERSION) + { + /* + * Metapage needs to be dynamically upgraded to store fields that are + * only present when btm_version >= BTREE_NOVAC_VERSION + */ + _bt_relbuf(rel, metabuf); + return true; + } + + prev_num_delpages = metad->btm_last_cleanup_num_delpages; + _bt_relbuf(rel, metabuf); + + /* + * Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the + * total size of the index. We can reasonably expect (though are not + * guaranteed) to be able to recycle this many pages if we decide to do a + * btvacuumscan call during the ongoing btvacuumcleanup. For further + * details see the nbtree/README section on placing deleted pages in the + * FSM. 
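+ *
+ * For example, in a 10,000-block index the check below fires once
+ * prev_num_delpages exceeds 500 (10,000 / 20).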
+ */ + if (prev_num_delpages > 0 && + prev_num_delpages > RelationGetNumberOfBlocks(rel) / 20) + return true; + + return false; +} + +/* + * _bt_set_cleanup_info() -- Update metapage for btvacuumcleanup. + * + * Called at the end of btvacuumcleanup, when num_delpages value has been + * finalized. + */ +void +_bt_set_cleanup_info(Relation rel, BlockNumber num_delpages) +{ + Buffer metabuf; + Page metapg; + BTMetaPageData *metad; + + /* + * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage + * field started out as a TransactionId field called btm_oldest_btpo_xact. + * Both "versions" are just uint32 fields. It was convenient to repurpose + * the field when we began to use 64-bit XIDs in deleted pages. + * + * It's possible that a pg_upgrade'd database will contain an XID value in + * what is now recognized as the metapage's btm_last_cleanup_num_delpages + * field. _bt_vacuum_needs_cleanup() may even believe that this value + * indicates that there are lots of pages that it needs to recycle, when + * in reality there are only one or two. The worst that can happen is + * that there will be a call to btvacuumscan a little earlier, which will + * set btm_last_cleanup_num_delpages to a sane value when we're called. + * + * Note also that the metapage's btm_last_cleanup_num_heap_tuples field is + * no longer used as of PostgreSQL 14. We set it to -1.0 on rewrite, just + * to be consistent. + */ + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metapg = BufferGetPage(metabuf); + metad = BTPageGetMeta(metapg); + + /* Don't miss chance to upgrade index/metapage when BTREE_MIN_VERSION */ + if (metad->btm_version >= BTREE_NOVAC_VERSION && + metad->btm_last_cleanup_num_delpages == num_delpages) + { + /* Usually means index continues to have num_delpages of 0 */ + _bt_relbuf(rel, metabuf); + return; + } + + /* trade in our read lock for a write lock */ + _bt_unlockbuf(rel, metabuf); + _bt_lockbuf(rel, metabuf, BT_WRITE); + + START_CRIT_SECTION(); + + /* upgrade meta-page if needed */ + if (metad->btm_version < BTREE_NOVAC_VERSION) + _bt_upgrademetapage(metapg); + + /* update cleanup-related information */ + metad->btm_last_cleanup_num_delpages = num_delpages; + metad->btm_last_cleanup_num_heap_tuples = -1.0; + MarkBufferDirty(metabuf); + + /* write wal record if needed */ + if (RelationNeedsWAL(rel)) + { + xl_btree_metadata md; + XLogRecPtr recptr; + + XLogBeginInsert(); + XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); + + Assert(metad->btm_version >= BTREE_NOVAC_VERSION); + md.version = metad->btm_version; + md.root = metad->btm_root; + md.level = metad->btm_level; + md.fastroot = metad->btm_fastroot; + md.fastlevel = metad->btm_fastlevel; + md.last_cleanup_num_delpages = num_delpages; + md.allequalimage = metad->btm_allequalimage; + + XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata)); + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP); + + PageSetLSN(metapg, recptr); + } + + END_CRIT_SECTION(); + + _bt_relbuf(rel, metabuf); +} + +/* + * _bt_getroot() -- Get the root page of the btree. + * + * Since the root page can move around the btree file, we have to read + * its location from the metadata page, and then read the root page + * itself. If no root page exists yet, we have to create one. + * + * The access type parameter (BT_READ or BT_WRITE) controls whether + * a new root page will be created or not. If access = BT_READ, + * and no root page exists, we just return InvalidBuffer. 
For + * BT_WRITE, we try to create the root page if it doesn't exist. + * NOTE that the returned root page will have only a read lock set + * on it even if access = BT_WRITE! + * + * The returned page is not necessarily the true root --- it could be + * a "fast root" (a page that is alone in its level due to deletions). + * Also, if the root page is split while we are "in flight" to it, + * what we will return is the old root, which is now just the leftmost + * page on a probably-not-very-wide level. For most purposes this is + * as good as or better than the true root, so we do not bother to + * insist on finding the true root. We do, however, guarantee to + * return a live (not deleted or half-dead) page. + * + * On successful return, the root page is pinned and read-locked. + * The metadata page is not locked or pinned on exit. + */ +Buffer +_bt_getroot(Relation rel, int access) +{ + Buffer metabuf; + Buffer rootbuf; + Page rootpage; + BTPageOpaque rootopaque; + BlockNumber rootblkno; + uint32 rootlevel; + BTMetaPageData *metad; + + /* + * Try to use previously-cached metapage data to find the root. This + * normally saves one buffer access per index search, which is a very + * helpful savings in bufmgr traffic and hence contention. + */ + if (rel->rd_amcache != NULL) + { + metad = (BTMetaPageData *) rel->rd_amcache; + /* We shouldn't have cached it if any of these fail */ + Assert(metad->btm_magic == BTREE_MAGIC); + Assert(metad->btm_version >= BTREE_MIN_VERSION); + Assert(metad->btm_version <= BTREE_VERSION); + Assert(!metad->btm_allequalimage || + metad->btm_version > BTREE_NOVAC_VERSION); + Assert(metad->btm_root != P_NONE); + + rootblkno = metad->btm_fastroot; + Assert(rootblkno != P_NONE); + rootlevel = metad->btm_fastlevel; + + rootbuf = _bt_getbuf(rel, rootblkno, BT_READ); + rootpage = BufferGetPage(rootbuf); + rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); + + /* + * Since the cache might be stale, we check the page more carefully + * here than normal. We *must* check that it's not deleted. If it's + * not alone on its level, then we reject too --- this may be overly + * paranoid but better safe than sorry. Note we don't check P_ISROOT, + * because that's not set in a "fast root". + */ + if (!P_IGNORE(rootopaque) && + rootopaque->btpo_level == rootlevel && + P_LEFTMOST(rootopaque) && + P_RIGHTMOST(rootopaque)) + { + /* OK, accept cached page as the root */ + return rootbuf; + } + _bt_relbuf(rel, rootbuf); + /* Cache is stale, throw it away */ + if (rel->rd_amcache) + pfree(rel->rd_amcache); + rel->rd_amcache = NULL; + } + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metad = _bt_getmeta(rel, metabuf); + + /* if no root page initialized yet, do it */ + if (metad->btm_root == P_NONE) + { + Page metapg; + + /* If access = BT_READ, caller doesn't want us to create root yet */ + if (access == BT_READ) + { + _bt_relbuf(rel, metabuf); + return InvalidBuffer; + } + + /* trade in our read lock for a write lock */ + _bt_unlockbuf(rel, metabuf); + _bt_lockbuf(rel, metabuf, BT_WRITE); + + /* + * Race condition: if someone else initialized the metadata between + * the time we released the read lock and acquired the write lock, we + * must avoid doing it again. + */ + if (metad->btm_root != P_NONE) + { + /* + * Metadata initialized by someone else. In order to guarantee no + * deadlocks, we have to release the metadata page and start all + * over again. (Is that really true? But it's hardly worth trying + * to optimize this case.) 
+ */ + _bt_relbuf(rel, metabuf); + return _bt_getroot(rel, access); + } + + /* + * Get, initialize, write, and leave a lock of the appropriate type on + * the new root page. Since this is the first page in the tree, it's + * a leaf as well as the root. + */ + rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); + rootblkno = BufferGetBlockNumber(rootbuf); + rootpage = BufferGetPage(rootbuf); + rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); + rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE; + rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT); + rootopaque->btpo_level = 0; + rootopaque->btpo_cycleid = 0; + /* Get raw page pointer for metapage */ + metapg = BufferGetPage(metabuf); + + /* NO ELOG(ERROR) till meta is updated */ + START_CRIT_SECTION(); + + /* upgrade metapage if needed */ + if (metad->btm_version < BTREE_NOVAC_VERSION) + _bt_upgrademetapage(metapg); + + metad->btm_root = rootblkno; + metad->btm_level = 0; + metad->btm_fastroot = rootblkno; + metad->btm_fastlevel = 0; + metad->btm_last_cleanup_num_delpages = 0; + metad->btm_last_cleanup_num_heap_tuples = -1.0; + + MarkBufferDirty(rootbuf); + MarkBufferDirty(metabuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_btree_newroot xlrec; + XLogRecPtr recptr; + xl_btree_metadata md; + + XLogBeginInsert(); + XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT); + XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); + + Assert(metad->btm_version >= BTREE_NOVAC_VERSION); + md.version = metad->btm_version; + md.root = rootblkno; + md.level = 0; + md.fastroot = rootblkno; + md.fastlevel = 0; + md.last_cleanup_num_delpages = 0; + md.allequalimage = metad->btm_allequalimage; + + XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); + + xlrec.rootblk = rootblkno; + xlrec.level = 0; + + XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot); + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT); + + PageSetLSN(rootpage, recptr); + PageSetLSN(metapg, recptr); + } + + END_CRIT_SECTION(); + + /* + * swap root write lock for read lock. There is no danger of anyone + * else accessing the new root page while it's unlocked, since no one + * else knows where it is yet. + */ + _bt_unlockbuf(rel, rootbuf); + _bt_lockbuf(rel, rootbuf, BT_READ); + + /* okay, metadata is correct, release lock on it without caching */ + _bt_relbuf(rel, metabuf); + } + else + { + rootblkno = metad->btm_fastroot; + Assert(rootblkno != P_NONE); + rootlevel = metad->btm_fastlevel; + + /* + * Cache the metapage data for next time + */ + rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt, + sizeof(BTMetaPageData)); + memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData)); + + /* + * We are done with the metapage; arrange to release it via first + * _bt_relandgetbuf call + */ + rootbuf = metabuf; + + for (;;) + { + rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ); + rootpage = BufferGetPage(rootbuf); + rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); + + if (!P_IGNORE(rootopaque)) + break; + + /* it's dead, Jim. step right one page */ + if (P_RIGHTMOST(rootopaque)) + elog(ERROR, "no live root page found in index \"%s\"", + RelationGetRelationName(rel)); + rootblkno = rootopaque->btpo_next; + } + + if (rootopaque->btpo_level != rootlevel) + elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u", + rootblkno, RelationGetRelationName(rel), + rootopaque->btpo_level, rootlevel); + } + + /* + * By here, we have a pin and read lock on the root page, and no lock set + * on the metadata page. 
Return the root page's buffer. + */ + return rootbuf; +} + +/* + * _bt_gettrueroot() -- Get the true root page of the btree. + * + * This is the same as the BT_READ case of _bt_getroot(), except + * we follow the true-root link not the fast-root link. + * + * By the time we acquire lock on the root page, it might have been split and + * not be the true root anymore. This is okay for the present uses of this + * routine; we only really need to be able to move up at least one tree level + * from whatever non-root page we were at. If we ever do need to lock the + * one true root page, we could loop here, re-reading the metapage on each + * failure. (Note that it wouldn't do to hold the lock on the metapage while + * moving to the root --- that'd deadlock against any concurrent root split.) + */ +Buffer +_bt_gettrueroot(Relation rel) +{ + Buffer metabuf; + Page metapg; + BTPageOpaque metaopaque; + Buffer rootbuf; + Page rootpage; + BTPageOpaque rootopaque; + BlockNumber rootblkno; + uint32 rootlevel; + BTMetaPageData *metad; + + /* + * We don't try to use cached metapage data here, since (a) this path is + * not performance-critical, and (b) if we are here it suggests our cache + * is out-of-date anyway. In light of point (b), it's probably safest to + * actively flush any cached metapage info. + */ + if (rel->rd_amcache) + pfree(rel->rd_amcache); + rel->rd_amcache = NULL; + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metapg = BufferGetPage(metabuf); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); + metad = BTPageGetMeta(metapg); + + if (!P_ISMETA(metaopaque) || + metad->btm_magic != BTREE_MAGIC) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" is not a btree", + RelationGetRelationName(rel)))); + + if (metad->btm_version < BTREE_MIN_VERSION || + metad->btm_version > BTREE_VERSION) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("version mismatch in index \"%s\": file version %d, " + "current version %d, minimal supported version %d", + RelationGetRelationName(rel), + metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION))); + + /* if no root page initialized yet, fail */ + if (metad->btm_root == P_NONE) + { + _bt_relbuf(rel, metabuf); + return InvalidBuffer; + } + + rootblkno = metad->btm_root; + rootlevel = metad->btm_level; + + /* + * We are done with the metapage; arrange to release it via first + * _bt_relandgetbuf call + */ + rootbuf = metabuf; + + for (;;) + { + rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ); + rootpage = BufferGetPage(rootbuf); + rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); + + if (!P_IGNORE(rootopaque)) + break; + + /* it's dead, Jim. step right one page */ + if (P_RIGHTMOST(rootopaque)) + elog(ERROR, "no live root page found in index \"%s\"", + RelationGetRelationName(rel)); + rootblkno = rootopaque->btpo_next; + } + + if (rootopaque->btpo_level != rootlevel) + elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u", + rootblkno, RelationGetRelationName(rel), + rootopaque->btpo_level, rootlevel); + + return rootbuf; +} + +/* + * _bt_getrootheight() -- Get the height of the btree search tree. + * + * We return the level (counting from zero) of the current fast root. + * This represents the number of tree levels we'd have to descend through + * to start any btree index search. + * + * This is used by the planner for cost-estimation purposes. 
Since it's + * only an estimate, slightly-stale data is fine, hence we don't worry + * about updating previously cached data. + */ +int +_bt_getrootheight(Relation rel) +{ + BTMetaPageData *metad; + + if (rel->rd_amcache == NULL) + { + Buffer metabuf; + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metad = _bt_getmeta(rel, metabuf); + + /* + * If there's no root page yet, _bt_getroot() doesn't expect a cache + * to be made, so just stop here and report the index height is zero. + * (XXX perhaps _bt_getroot() should be changed to allow this case.) + */ + if (metad->btm_root == P_NONE) + { + _bt_relbuf(rel, metabuf); + return 0; + } + + /* + * Cache the metapage data for next time + */ + rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt, + sizeof(BTMetaPageData)); + memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData)); + _bt_relbuf(rel, metabuf); + } + + /* Get cached page */ + metad = (BTMetaPageData *) rel->rd_amcache; + /* We shouldn't have cached it if any of these fail */ + Assert(metad->btm_magic == BTREE_MAGIC); + Assert(metad->btm_version >= BTREE_MIN_VERSION); + Assert(metad->btm_version <= BTREE_VERSION); + Assert(!metad->btm_allequalimage || + metad->btm_version > BTREE_NOVAC_VERSION); + Assert(metad->btm_fastroot != P_NONE); + + return metad->btm_fastlevel; +} + +/* + * _bt_metaversion() -- Get version/status info from metapage. + * + * Sets caller's *heapkeyspace and *allequalimage arguments using data + * from the B-Tree metapage (could be locally-cached version). This + * information needs to be stashed in insertion scankey, so we provide a + * single function that fetches both at once. + * + * This is used to determine the rules that must be used to descend a + * btree. Version 4 indexes treat heap TID as a tiebreaker attribute. + * pg_upgrade'd version 3 indexes need extra steps to preserve reasonable + * performance when inserting a new BTScanInsert-wise duplicate tuple + * among many leaf pages already full of such duplicates. + * + * Also sets allequalimage field, which indicates whether or not it is + * safe to apply deduplication. We rely on the assumption that + * btm_allequalimage will be zero'ed on heapkeyspace indexes that were + * pg_upgrade'd from Postgres 12. + */ +void +_bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage) +{ + BTMetaPageData *metad; + + if (rel->rd_amcache == NULL) + { + Buffer metabuf; + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metad = _bt_getmeta(rel, metabuf); + + /* + * If there's no root page yet, _bt_getroot() doesn't expect a cache + * to be made, so just stop here. (XXX perhaps _bt_getroot() should + * be changed to allow this case.) + */ + if (metad->btm_root == P_NONE) + { + *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION; + *allequalimage = metad->btm_allequalimage; + + _bt_relbuf(rel, metabuf); + return; + } + + /* + * Cache the metapage data for next time + * + * An on-the-fly version upgrade performed by _bt_upgrademetapage() + * can change the nbtree version for an index without invalidating any + * local cache. This is okay because it can only happen when moving + * from version 2 to version 3, both of which are !heapkeyspace + * versions. 
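Both _bt_getrootheight() and _bt_metaversion() follow the same load-once, validate-always pattern around rel->rd_amcache: read the metapage a single time, keep a private snapshot, and re-assert the snapshot's invariants on every later use. As a rough standalone illustration of that shape (not backend code; DemoMeta, demo_read_meta and demo_get_root_height are invented names):

#include <assert.h>
#include <stdio.h>

#define DEMO_MAGIC		0x053162
#define DEMO_VERSION	4

typedef struct DemoMeta
{
	unsigned int	magic;
	unsigned int	version;
	unsigned int	fastlevel;		/* stand-in for btm_fastlevel */
} DemoMeta;

static DemoMeta *cached = NULL;		/* stand-in for rel->rd_amcache */
static DemoMeta	cached_storage;

/* Pretend to read and validate the metapage; done at most once */
static DemoMeta
demo_read_meta(void)
{
	DemoMeta	m = {DEMO_MAGIC, DEMO_VERSION, 2};

	return m;
}

static unsigned int
demo_get_root_height(void)
{
	if (cached == NULL)
	{
		cached_storage = demo_read_meta();
		cached = &cached_storage;
	}

	/* We shouldn't have cached it if any of these fail */
	assert(cached->magic == DEMO_MAGIC);
	assert(cached->version == DEMO_VERSION);

	return cached->fastlevel;
}

int
main(void)
{
	printf("height = %u\n", demo_get_root_height());	/* reads "metapage" */
	printf("height = %u\n", demo_get_root_height());	/* served from cache */
	return 0;
}

The assertions document the same contract as the real functions: a snapshot is only trusted because it was validated before it was cached, so slightly-stale data is acceptable but structurally invalid data is not.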
+ */ + rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt, + sizeof(BTMetaPageData)); + memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData)); + _bt_relbuf(rel, metabuf); + } + + /* Get cached page */ + metad = (BTMetaPageData *) rel->rd_amcache; + /* We shouldn't have cached it if any of these fail */ + Assert(metad->btm_magic == BTREE_MAGIC); + Assert(metad->btm_version >= BTREE_MIN_VERSION); + Assert(metad->btm_version <= BTREE_VERSION); + Assert(!metad->btm_allequalimage || + metad->btm_version > BTREE_NOVAC_VERSION); + Assert(metad->btm_fastroot != P_NONE); + + *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION; + *allequalimage = metad->btm_allequalimage; +} + +/* + * _bt_checkpage() -- Verify that a freshly-read page looks sane. + */ +void +_bt_checkpage(Relation rel, Buffer buf) +{ + Page page = BufferGetPage(buf); + + /* + * ReadBuffer verifies that every newly-read page passes + * PageHeaderIsValid, which means it either contains a reasonably sane + * page header or is all-zero. We have to defend against the all-zero + * case, however. + */ + if (PageIsNew(page)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" contains unexpected zero page at block %u", + RelationGetRelationName(rel), + BufferGetBlockNumber(buf)), + errhint("Please REINDEX it."))); + + /* + * Additionally check that the special area looks sane. + */ + if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData))) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" contains corrupted page at block %u", + RelationGetRelationName(rel), + BufferGetBlockNumber(buf)), + errhint("Please REINDEX it."))); +} + +/* + * Log the reuse of a page from the FSM. + */ +static void +_bt_log_reuse_page(Relation rel, BlockNumber blkno, FullTransactionId safexid) +{ + xl_btree_reuse_page xlrec_reuse; + + /* + * Note that we don't register the buffer with the record, because this + * operation doesn't modify the page. This record only exists to provide a + * conflict point for Hot Standby. + */ + + /* XLOG stuff */ + xlrec_reuse.node = rel->rd_node; + xlrec_reuse.block = blkno; + xlrec_reuse.latestRemovedFullXid = safexid; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage); + + XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE); +} + +/* + * _bt_getbuf() -- Get a buffer by block number for read or write. + * + * blkno == P_NEW means to get an unallocated index page. The page + * will be initialized before returning it. + * + * The general rule in nbtree is that it's never okay to access a + * page without holding both a buffer pin and a buffer lock on + * the page's buffer. + * + * When this routine returns, the appropriate lock is set on the + * requested buffer and its reference count has been incremented + * (ie, the buffer is "locked and pinned"). Also, we apply + * _bt_checkpage to sanity-check the page (except in P_NEW case), + * and perform Valgrind client requests that help Valgrind detect + * unsafe page accesses. + * + * Note: raw LockBuffer() calls are disallowed in nbtree; all + * buffer lock requests need to go through wrapper functions such + * as _bt_lockbuf(). 
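The reason raw LockBuffer() calls are funnelled through _bt_lockbuf() and friends is that the wrapper is the single place where "this page may now be read" and "this page may no longer be read" bookkeeping can be attached. A very loose standalone sketch of that idea, using a plain flag where the real code uses Valgrind client requests (DemoBuf, demo_lockbuf and demo_readpage are invented names):

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

typedef struct DemoBuf
{
	pthread_rwlock_t	lock;
	int					accessible;	/* stand-in for Valgrind addressability */
	int					payload;	/* stand-in for the page contents */
} DemoBuf;

static void
demo_lockbuf(DemoBuf *buf, int exclusive)
{
	if (exclusive)
		pthread_rwlock_wrlock(&buf->lock);
	else
		pthread_rwlock_rdlock(&buf->lock);
	buf->accessible = 1;			/* contents may be inspected from here on */
}

static void
demo_unlockbuf(DemoBuf *buf)
{
	assert(buf->accessible);
	buf->accessible = 0;			/* contents must not be touched afterwards */
	pthread_rwlock_unlock(&buf->lock);
}

static int
demo_readpage(DemoBuf *buf)
{
	assert(buf->accessible);		/* the backend would get a Valgrind error */
	return buf->payload;
}

int
main(void)
{
	DemoBuf		buf;

	pthread_rwlock_init(&buf.lock, NULL);
	buf.accessible = 0;
	buf.payload = 42;

	demo_lockbuf(&buf, 0);
	printf("payload = %d\n", demo_readpage(&buf));
	demo_unlockbuf(&buf);
	return 0;
}

This is a single-threaded toy; the point is only that the instrumentation lives in the wrapper, which is why bypassing the wrapper is disallowed.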
+ */ +Buffer +_bt_getbuf(Relation rel, BlockNumber blkno, int access) +{ + Buffer buf; + + if (blkno != P_NEW) + { + /* Read an existing block of the relation */ + buf = ReadBuffer(rel, blkno); + _bt_lockbuf(rel, buf, access); + _bt_checkpage(rel, buf); + } + else + { + bool needLock; + Page page; + + Assert(access == BT_WRITE); + + /* + * First see if the FSM knows of any free pages. + * + * We can't trust the FSM's report unreservedly; we have to check that + * the page is still free. (For example, an already-free page could + * have been re-used between the time the last VACUUM scanned it and + * the time the VACUUM made its FSM updates.) + * + * In fact, it's worse than that: we can't even assume that it's safe + * to take a lock on the reported page. If somebody else has a lock + * on it, or even worse our own caller does, we could deadlock. (The + * own-caller scenario is actually not improbable. Consider an index + * on a serial or timestamp column. Nearly all splits will be at the + * rightmost page, so it's entirely likely that _bt_split will call us + * while holding a lock on the page most recently acquired from FSM. A + * VACUUM running concurrently with the previous split could well have + * placed that page back in FSM.) + * + * To get around that, we ask for only a conditional lock on the + * reported page. If we fail, then someone else is using the page, + * and we may reasonably assume it's not free. (If we happen to be + * wrong, the worst consequence is the page will be lost to use till + * the next VACUUM, which is no big problem.) + */ + for (;;) + { + blkno = GetFreeIndexPage(rel); + if (blkno == InvalidBlockNumber) + break; + buf = ReadBuffer(rel, blkno); + if (_bt_conditionallockbuf(rel, buf)) + { + page = BufferGetPage(buf); + + /* + * It's possible to find an all-zeroes page in an index. For + * example, a backend might successfully extend the relation + * one page and then crash before it is able to make a WAL + * entry for adding the page. If we find a zeroed page then + * reclaim it immediately. + */ + if (PageIsNew(page)) + { + /* Okay to use page. Initialize and return it. */ + _bt_pageinit(page, BufferGetPageSize(buf)); + return buf; + } + + if (BTPageIsRecyclable(page)) + { + /* + * If we are generating WAL for Hot Standby then create a + * WAL record that will allow us to conflict with queries + * running on standby, in case they have snapshots older + * than safexid value + */ + if (XLogStandbyInfoActive() && RelationNeedsWAL(rel)) + _bt_log_reuse_page(rel, blkno, + BTPageGetDeleteXid(page)); + + /* Okay to use page. Re-initialize and return it. */ + _bt_pageinit(page, BufferGetPageSize(buf)); + return buf; + } + elog(DEBUG2, "FSM returned nonrecyclable page"); + _bt_relbuf(rel, buf); + } + else + { + elog(DEBUG2, "FSM returned nonlockable page"); + /* couldn't get lock, so just drop pin */ + ReleaseBuffer(buf); + } + } + + /* + * Extend the relation by one page. + * + * We have to use a lock to ensure no one else is extending the rel at + * the same time, else we will both try to initialize the same new + * page. We can skip locking for new or temp relations, however, + * since no one else could be accessing them. + */ + needLock = !RELATION_IS_LOCAL(rel); + + if (needLock) + LockRelationForExtension(rel, ExclusiveLock); + + buf = ReadBuffer(rel, P_NEW); + + /* Acquire buffer lock on new page */ + _bt_lockbuf(rel, buf, BT_WRITE); + + /* + * Release the file-extension lock; it's now OK for someone else to + * extend the relation some more. 
Note that we cannot release this + * lock before we have buffer lock on the new page, or we risk a race + * condition against btvacuumscan --- see comments therein. + */ + if (needLock) + UnlockRelationForExtension(rel, ExclusiveLock); + + /* Initialize the new page before returning it */ + page = BufferGetPage(buf); + Assert(PageIsNew(page)); + _bt_pageinit(page, BufferGetPageSize(buf)); + } + + /* ref count and lock type are correct */ + return buf; +} + +/* + * _bt_relandgetbuf() -- release a locked buffer and get another one. + * + * This is equivalent to _bt_relbuf followed by _bt_getbuf, with the + * exception that blkno may not be P_NEW. Also, if obuf is InvalidBuffer + * then it reduces to just _bt_getbuf; allowing this case simplifies some + * callers. + * + * The original motivation for using this was to avoid two entries to the + * bufmgr when one would do. However, now it's mainly just a notational + * convenience. The only case where it saves work over _bt_relbuf/_bt_getbuf + * is when the target page is the same one already in the buffer. + */ +Buffer +_bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access) +{ + Buffer buf; + + Assert(blkno != P_NEW); + if (BufferIsValid(obuf)) + _bt_unlockbuf(rel, obuf); + buf = ReleaseAndReadBuffer(obuf, rel, blkno); + _bt_lockbuf(rel, buf, access); + + _bt_checkpage(rel, buf); + return buf; +} + +/* + * _bt_relbuf() -- release a locked buffer. + * + * Lock and pin (refcount) are both dropped. + */ +void +_bt_relbuf(Relation rel, Buffer buf) +{ + _bt_unlockbuf(rel, buf); + ReleaseBuffer(buf); +} + +/* + * _bt_lockbuf() -- lock a pinned buffer. + * + * Lock is acquired without acquiring another pin. This is like a raw + * LockBuffer() call, but performs extra steps needed by Valgrind. + * + * Note: Caller may need to call _bt_checkpage() with buf when pin on buf + * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf(). + */ +void +_bt_lockbuf(Relation rel, Buffer buf, int access) +{ + /* LockBuffer() asserts that pin is held by this backend */ + LockBuffer(buf, access); + + /* + * It doesn't matter that _bt_unlockbuf() won't get called in the event of + * an nbtree error (e.g. a unique violation error). That won't cause + * Valgrind false positives. + * + * The nbtree client requests are superimposed on top of the bufmgr.c + * buffer pin client requests. In the event of an nbtree error the buffer + * will certainly get marked as defined when the backend once again + * acquires its first pin on the buffer. (Of course, if the backend never + * touches the buffer again then it doesn't matter that it remains + * non-accessible to Valgrind.) + * + * Note: When an IndexTuple C pointer gets computed using an ItemId read + * from a page while a lock was held, the C pointer becomes unsafe to + * dereference forever as soon as the lock is released. Valgrind can only + * detect cases where the pointer gets dereferenced with no _current_ + * lock/pin held, though. + */ + if (!RelationUsesLocalBuffers(rel)) + VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ); +} + +/* + * _bt_unlockbuf() -- unlock a pinned buffer. + */ +void +_bt_unlockbuf(Relation rel, Buffer buf) +{ + /* + * Buffer is pinned and locked, which means that it is expected to be + * defined and addressable. Check that proactively. 
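The free-page path of _bt_getbuf() above boils down to "try-lock or skip, then recheck under the lock". The following standalone sketch shows that pattern with a toy page array and pthread mutexes (DemoPage and demo_claim_free_page are invented names; it is not how the buffer manager is actually driven):

#include <pthread.h>
#include <stdio.h>

#define NPAGES 4

typedef struct DemoPage
{
	pthread_mutex_t lock;
	int			free;			/* stand-in for the "still recyclable" recheck */
} DemoPage;

static DemoPage pages[NPAGES];

/* Return the index of a reclaimed page, or -1 if none could be claimed */
static int
demo_claim_free_page(void)
{
	for (int i = 0; i < NPAGES; i++)
	{
		if (pthread_mutex_trylock(&pages[i].lock) != 0)
			continue;			/* somebody else has it locked: skip it */

		if (pages[i].free)
		{
			/* recheck succeeded: reuse the page (a real caller would keep
			 * the lock; dropped here only to keep the toy short) */
			pages[i].free = 0;
			pthread_mutex_unlock(&pages[i].lock);
			return i;
		}

		/* free-space hint was stale; keep looking */
		pthread_mutex_unlock(&pages[i].lock);
	}
	return -1;					/* caller would extend the relation instead */
}

int
main(void)
{
	for (int i = 0; i < NPAGES; i++)
	{
		pthread_mutex_init(&pages[i].lock, NULL);
		pages[i].free = (i == 2);	/* only page 2 is actually free */
	}

	printf("claimed page %d\n", demo_claim_free_page());
	printf("claimed page %d\n", demo_claim_free_page());
	return 0;
}

Failing the trylock is treated as "somebody is using it", which is exactly the conservative assumption argued for above; a wrongly skipped page is merely reconsidered later.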
+ */ + VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ); + + /* LockBuffer() asserts that pin is held by this backend */ + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + if (!RelationUsesLocalBuffers(rel)) + VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ); +} + +/* + * _bt_conditionallockbuf() -- conditionally BT_WRITE lock pinned + * buffer. + * + * Note: Caller may need to call _bt_checkpage() with buf when pin on buf + * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf(). + */ +bool +_bt_conditionallockbuf(Relation rel, Buffer buf) +{ + /* ConditionalLockBuffer() asserts that pin is held by this backend */ + if (!ConditionalLockBuffer(buf)) + return false; + + if (!RelationUsesLocalBuffers(rel)) + VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ); + + return true; +} + +/* + * _bt_upgradelockbufcleanup() -- upgrade lock to super-exclusive/cleanup + * lock. + */ +void +_bt_upgradelockbufcleanup(Relation rel, Buffer buf) +{ + /* + * Buffer is pinned and locked, which means that it is expected to be + * defined and addressable. Check that proactively. + */ + VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ); + + /* LockBuffer() asserts that pin is held by this backend */ + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + LockBufferForCleanup(buf); +} + +/* + * _bt_pageinit() -- Initialize a new page. + * + * On return, the page header is initialized; data space is empty; + * special space is zeroed out. + */ +void +_bt_pageinit(Page page, Size size) +{ + PageInit(page, size, sizeof(BTPageOpaqueData)); +} + +/* + * Delete item(s) from a btree leaf page during VACUUM. + * + * This routine assumes that the caller has a super-exclusive write lock on + * the buffer. Also, the given deletable and updatable arrays *must* be + * sorted in ascending order. + * + * Routine deals with deleting TIDs when some (but not all) of the heap TIDs + * in an existing posting list item are to be removed. This works by + * updating/overwriting an existing item with caller's new version of the item + * (a version that lacks the TIDs that are to be deleted). + * + * We record VACUUMs and b-tree deletes differently in WAL. Deletes must + * generate their own latestRemovedXid by accessing the table directly, + * whereas VACUUMs rely on the initial VACUUM table scan performing + * WAL-logging that takes care of the issue for the table's indexes + * indirectly. Also, we remove the VACUUM cycle ID from pages, which b-tree + * deletes don't do. + */ +void +_bt_delitems_vacuum(Relation rel, Buffer buf, + OffsetNumber *deletable, int ndeletable, + BTVacuumPosting *updatable, int nupdatable) +{ + Page page = BufferGetPage(buf); + BTPageOpaque opaque; + bool needswal = RelationNeedsWAL(rel); + char *updatedbuf = NULL; + Size updatedbuflen = 0; + OffsetNumber updatedoffsets[MaxIndexTuplesPerPage]; + + /* Shouldn't be called unless there's something to do */ + Assert(ndeletable > 0 || nupdatable > 0); + + /* Generate new version of posting lists without deleted TIDs */ + if (nupdatable > 0) + updatedbuf = _bt_delitems_update(updatable, nupdatable, + updatedoffsets, &updatedbuflen, + needswal); + + /* No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + /* + * Handle posting tuple updates. + * + * Deliberately do this before handling simple deletes. If we did it the + * other way around (i.e. WAL record order -- simple deletes before + * updates) then we'd have to make compensating changes to the 'updatable' + * array of offset numbers. 
+ * + * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it + * happens to already be set. It's important that we not interfere with + * _bt_delitems_delete(). + */ + for (int i = 0; i < nupdatable; i++) + { + OffsetNumber updatedoffset = updatedoffsets[i]; + IndexTuple itup; + Size itemsz; + + itup = updatable[i]->itup; + itemsz = MAXALIGN(IndexTupleSize(itup)); + if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup, + itemsz)) + elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"", + BufferGetBlockNumber(buf), RelationGetRelationName(rel)); + } + + /* Now handle simple deletes of entire tuples */ + if (ndeletable > 0) + PageIndexMultiDelete(page, deletable, ndeletable); + + /* + * We can clear the vacuum cycle ID since this page has certainly been + * processed by the current vacuum scan. + */ + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + opaque->btpo_cycleid = 0; + + /* + * Clear the BTP_HAS_GARBAGE page flag. + * + * This flag indicates the presence of LP_DEAD items on the page (though + * not reliably). Note that we only rely on it with pg_upgrade'd + * !heapkeyspace indexes. That's why clearing it here won't usually + * interfere with _bt_delitems_delete(). + */ + opaque->btpo_flags &= ~BTP_HAS_GARBAGE; + + MarkBufferDirty(buf); + + /* XLOG stuff */ + if (needswal) + { + XLogRecPtr recptr; + xl_btree_vacuum xlrec_vacuum; + + xlrec_vacuum.ndeleted = ndeletable; + xlrec_vacuum.nupdated = nupdatable; + + XLogBeginInsert(); + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum); + + if (ndeletable > 0) + XLogRegisterBufData(0, (char *) deletable, + ndeletable * sizeof(OffsetNumber)); + + if (nupdatable > 0) + { + XLogRegisterBufData(0, (char *) updatedoffsets, + nupdatable * sizeof(OffsetNumber)); + XLogRegisterBufData(0, updatedbuf, updatedbuflen); + } + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + /* can't leak memory here */ + if (updatedbuf != NULL) + pfree(updatedbuf); + /* free tuples allocated within _bt_delitems_update() */ + for (int i = 0; i < nupdatable; i++) + pfree(updatable[i]->itup); +} + +/* + * Delete item(s) from a btree leaf page during single-page cleanup. + * + * This routine assumes that the caller has pinned and write locked the + * buffer. Also, the given deletable and updatable arrays *must* be sorted in + * ascending order. + * + * Routine deals with deleting TIDs when some (but not all) of the heap TIDs + * in an existing posting list item are to be removed. This works by + * updating/overwriting an existing item with caller's new version of the item + * (a version that lacks the TIDs that are to be deleted). + * + * This is nearly the same as _bt_delitems_vacuum as far as what it does to + * the page, but it needs its own latestRemovedXid from caller (caller gets + * this from tableam). This is used by the REDO routine to generate recovery + * conflicts. The other difference is that only _bt_delitems_vacuum will + * clear page's VACUUM cycle ID. 
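Both deletion routines apply the posting-list overwrites before the offset-based multi-delete, because deletion compacts the item array and would invalidate the offsets recorded for the overwrite step. A minimal standalone illustration with a plain array (demo_delete is an invented name, standing in loosely for PageIndexMultiDelete()):

#include <stdio.h>

static void
demo_delete(int *items, int *nitems, const int *deloffsets, int ndel)
{
	int			keep = 0;

	/* compact the array, skipping offsets listed for deletion */
	for (int i = 0; i < *nitems; i++)
	{
		int			doomed = 0;

		for (int j = 0; j < ndel; j++)
			if (deloffsets[j] == i)
				doomed = 1;
		if (!doomed)
			items[keep++] = items[i];
	}
	*nitems = keep;
}

int
main(void)
{
	int			items[] = {10, 20, 30, 40, 50};
	int			nitems = 5;
	int			updoffsets[] = {3};	/* overwrite the item at offset 3 */
	int			deloffsets[] = {1};	/* delete the item at offset 1 */

	/* Overwrite first, while the recorded offsets are still valid... */
	items[updoffsets[0]] = 41;

	/* ...then delete; all deletions are applied in one compaction pass */
	demo_delete(items, &nitems, deloffsets, 1);

	for (int i = 0; i < nitems; i++)
		printf("%d ", items[i]);	/* prints: 10 30 41 50 */
	printf("\n");
	return 0;
}

Reversing the two steps would make offset 3 point at a different element after compaction, which is the "compensating changes" problem the comment above avoids.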
+ */ +static void +_bt_delitems_delete(Relation rel, Buffer buf, TransactionId latestRemovedXid, + OffsetNumber *deletable, int ndeletable, + BTVacuumPosting *updatable, int nupdatable) +{ + Page page = BufferGetPage(buf); + BTPageOpaque opaque; + bool needswal = RelationNeedsWAL(rel); + char *updatedbuf = NULL; + Size updatedbuflen = 0; + OffsetNumber updatedoffsets[MaxIndexTuplesPerPage]; + + /* Shouldn't be called unless there's something to do */ + Assert(ndeletable > 0 || nupdatable > 0); + + /* Generate new versions of posting lists without deleted TIDs */ + if (nupdatable > 0) + updatedbuf = _bt_delitems_update(updatable, nupdatable, + updatedoffsets, &updatedbuflen, + needswal); + + /* No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + /* Handle updates and deletes just like _bt_delitems_vacuum */ + for (int i = 0; i < nupdatable; i++) + { + OffsetNumber updatedoffset = updatedoffsets[i]; + IndexTuple itup; + Size itemsz; + + itup = updatable[i]->itup; + itemsz = MAXALIGN(IndexTupleSize(itup)); + if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup, + itemsz)) + elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"", + BufferGetBlockNumber(buf), RelationGetRelationName(rel)); + } + + if (ndeletable > 0) + PageIndexMultiDelete(page, deletable, ndeletable); + + /* + * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID at + * this point. The VACUUM command alone controls vacuum cycle IDs. + */ + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* + * Clear the BTP_HAS_GARBAGE page flag. + * + * This flag indicates the presence of LP_DEAD items on the page (though + * not reliably). Note that we only rely on it with pg_upgrade'd + * !heapkeyspace indexes. + */ + opaque->btpo_flags &= ~BTP_HAS_GARBAGE; + + MarkBufferDirty(buf); + + /* XLOG stuff */ + if (needswal) + { + XLogRecPtr recptr; + xl_btree_delete xlrec_delete; + + xlrec_delete.latestRemovedXid = latestRemovedXid; + xlrec_delete.ndeleted = ndeletable; + xlrec_delete.nupdated = nupdatable; + + XLogBeginInsert(); + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete); + + if (ndeletable > 0) + XLogRegisterBufData(0, (char *) deletable, + ndeletable * sizeof(OffsetNumber)); + + if (nupdatable > 0) + { + XLogRegisterBufData(0, (char *) updatedoffsets, + nupdatable * sizeof(OffsetNumber)); + XLogRegisterBufData(0, updatedbuf, updatedbuflen); + } + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + /* can't leak memory here */ + if (updatedbuf != NULL) + pfree(updatedbuf); + /* free tuples allocated within _bt_delitems_update() */ + for (int i = 0; i < nupdatable; i++) + pfree(updatable[i]->itup); +} + +/* + * Set up state needed to delete TIDs from posting list tuples via "updating" + * the tuple. Performs steps common to both _bt_delitems_vacuum and + * _bt_delitems_delete. These steps must take place before each function's + * critical section begins. + * + * updatable and nupdatable are inputs, though note that we will use + * _bt_update_posting() to replace the original itup with a pointer to a final + * version in palloc()'d memory. Caller should free the tuples when its done. + * + * The first nupdatable entries from updatedoffsets are set to the page offset + * number for posting list tuples that caller updates. 
This is mostly useful + * because caller may need to WAL-log the page offsets (though we always do + * this for caller out of convenience). + * + * Returns buffer consisting of an array of xl_btree_update structs that + * describe the steps we perform here for caller (though only when needswal is + * true). Also sets *updatedbuflen to the final size of the buffer. This + * buffer is used by caller when WAL logging is required. + */ +static char * +_bt_delitems_update(BTVacuumPosting *updatable, int nupdatable, + OffsetNumber *updatedoffsets, Size *updatedbuflen, + bool needswal) +{ + char *updatedbuf = NULL; + Size buflen = 0; + + /* Shouldn't be called unless there's something to do */ + Assert(nupdatable > 0); + + for (int i = 0; i < nupdatable; i++) + { + BTVacuumPosting vacposting = updatable[i]; + Size itemsz; + + /* Replace work area IndexTuple with updated version */ + _bt_update_posting(vacposting); + + /* Keep track of size of xl_btree_update for updatedbuf in passing */ + itemsz = SizeOfBtreeUpdate + vacposting->ndeletedtids * sizeof(uint16); + buflen += itemsz; + + /* Build updatedoffsets buffer in passing */ + updatedoffsets[i] = vacposting->updatedoffset; + } + + /* XLOG stuff */ + if (needswal) + { + Size offset = 0; + + /* Allocate, set final size for caller */ + updatedbuf = palloc(buflen); + *updatedbuflen = buflen; + for (int i = 0; i < nupdatable; i++) + { + BTVacuumPosting vacposting = updatable[i]; + Size itemsz; + xl_btree_update update; + + update.ndeletedtids = vacposting->ndeletedtids; + memcpy(updatedbuf + offset, &update.ndeletedtids, + SizeOfBtreeUpdate); + offset += SizeOfBtreeUpdate; + + itemsz = update.ndeletedtids * sizeof(uint16); + memcpy(updatedbuf + offset, vacposting->deletetids, itemsz); + offset += itemsz; + } + } + + return updatedbuf; +} + +/* + * Comparator used by _bt_delitems_delete_check() to restore deltids array + * back to its original leaf-page-wise sort order + */ +static int +_bt_delitems_cmp(const void *a, const void *b) +{ + TM_IndexDelete *indexdelete1 = (TM_IndexDelete *) a; + TM_IndexDelete *indexdelete2 = (TM_IndexDelete *) b; + + if (indexdelete1->id > indexdelete2->id) + return 1; + if (indexdelete1->id < indexdelete2->id) + return -1; + + Assert(false); + + return 0; +} + +/* + * Try to delete item(s) from a btree leaf page during single-page cleanup. + * + * nbtree interface to table_index_delete_tuples(). Deletes a subset of index + * tuples from caller's deltids array: those whose TIDs are found safe to + * delete by the tableam (or already marked LP_DEAD in index, and so already + * known to be deletable by our simple index deletion caller). We physically + * delete index tuples from buf leaf page last of all (for index tuples where + * that is known to be safe following our table_index_delete_tuples() call). + * + * Simple index deletion caller only includes TIDs from index tuples marked + * LP_DEAD, as well as extra TIDs it found on the same leaf page that can be + * included without increasing the total number of distinct table blocks for + * the deletion operation as a whole. This approach often allows us to delete + * some extra index tuples that were practically free for tableam to check in + * passing (when they actually turn out to be safe to delete). It probably + * only makes sense for the tableam to go ahead with these extra checks when + * it is block-oriented (otherwise the checks probably won't be practically + * free, which we rely on). 
The tableam interface requires the tableam side + * to handle the problem, though, so this is okay (we as an index AM are free + * to make the simplifying assumption that all tableams must be block-based). + * + * Bottom-up index deletion caller provides all the TIDs from the leaf page, + * without expecting that tableam will check most of them. The tableam has + * considerable discretion around which entries/blocks it checks. Our role in + * costing the bottom-up deletion operation is strictly advisory. + * + * Note: Caller must have added deltids entries (i.e. entries that go in + * delstate's main array) in leaf-page-wise order: page offset number order, + * TID order among entries taken from the same posting list tuple (tiebreak on + * TID). This order is convenient to work with here. + * + * Note: We also rely on the id field of each deltids element "capturing" this + * original leaf-page-wise order. That is, we expect to be able to get back + * to the original leaf-page-wise order just by sorting deltids on the id + * field (tableam will sort deltids for its own reasons, so we'll need to put + * it back in leaf-page-wise order afterwards). + */ +void +_bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, + TM_IndexDeleteOp *delstate) +{ + Page page = BufferGetPage(buf); + TransactionId latestRemovedXid; + OffsetNumber postingidxoffnum = InvalidOffsetNumber; + int ndeletable = 0, + nupdatable = 0; + OffsetNumber deletable[MaxIndexTuplesPerPage]; + BTVacuumPosting updatable[MaxIndexTuplesPerPage]; + + /* Use tableam interface to determine which tuples to delete first */ + latestRemovedXid = table_index_delete_tuples(heapRel, delstate); + + /* Should not WAL-log latestRemovedXid unless it's required */ + if (!XLogStandbyInfoActive() || !RelationNeedsWAL(rel)) + latestRemovedXid = InvalidTransactionId; + + /* + * Construct a leaf-page-wise description of what _bt_delitems_delete() + * needs to do to physically delete index tuples from the page. + * + * Must sort deltids array to restore leaf-page-wise order (original order + * before call to tableam). This is the order that the loop expects. + * + * Note that deltids array might be a lot smaller now. It might even have + * no entries at all (with bottom-up deletion caller), in which case there + * is nothing left to do. 
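The "sort deltids back on id" step relies only on the id field having captured the original leaf-page-wise position. A small standalone sketch of that round trip (DemoDelete and the demo_cmp_* comparators are invented; the real comparator is _bt_delitems_cmp() above):

#include <stdio.h>
#include <stdlib.h>

typedef struct DemoDelete
{
	int			id;				/* position in the original, page-wise order */
	int			tableblock;		/* key some other component sorts by */
} DemoDelete;

static int
demo_cmp_block(const void *a, const void *b)
{
	const DemoDelete *d1 = (const DemoDelete *) a;
	const DemoDelete *d2 = (const DemoDelete *) b;

	return (d1->tableblock > d2->tableblock) - (d1->tableblock < d2->tableblock);
}

static int
demo_cmp_id(const void *a, const void *b)
{
	const DemoDelete *d1 = (const DemoDelete *) a;
	const DemoDelete *d2 = (const DemoDelete *) b;

	return (d1->id > d2->id) - (d1->id < d2->id);
}

int
main(void)
{
	DemoDelete	deltids[] = {{0, 7}, {1, 3}, {2, 9}, {3, 3}};
	int			n = 4;

	/* The "tableam" reorders the array for its own purposes... */
	qsort(deltids, n, sizeof(DemoDelete), demo_cmp_block);

	/* ...and the caller restores page-wise order by sorting on id */
	qsort(deltids, n, sizeof(DemoDelete), demo_cmp_id);

	for (int i = 0; i < n; i++)
		printf("id=%d block=%d\n", deltids[i].id, deltids[i].tableblock);
	return 0;
}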
+ */ + qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete), + _bt_delitems_cmp); + if (delstate->ndeltids == 0) + { + Assert(delstate->bottomup); + return; + } + + /* We definitely have to delete at least one index tuple (or one TID) */ + for (int i = 0; i < delstate->ndeltids; i++) + { + TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id; + OffsetNumber idxoffnum = dstatus->idxoffnum; + ItemId itemid = PageGetItemId(page, idxoffnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + int nestedi, + nitem; + BTVacuumPosting vacposting; + + Assert(OffsetNumberIsValid(idxoffnum)); + + if (idxoffnum == postingidxoffnum) + { + /* + * This deltid entry is a TID from a posting list tuple that has + * already been completely processed + */ + Assert(BTreeTupleIsPosting(itup)); + Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup), + &delstate->deltids[i].tid) < 0); + Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(itup), + &delstate->deltids[i].tid) >= 0); + continue; + } + + if (!BTreeTupleIsPosting(itup)) + { + /* Plain non-pivot tuple */ + Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid)); + if (dstatus->knowndeletable) + deletable[ndeletable++] = idxoffnum; + continue; + } + + /* + * itup is a posting list tuple whose lowest deltids entry (which may + * or may not be for the first TID from itup) is considered here now. + * We should process all of the deltids entries for the posting list + * together now, though (not just the lowest). Remember to skip over + * later itup-related entries during later iterations of outermost + * loop. + */ + postingidxoffnum = idxoffnum; /* Remember work in outermost loop */ + nestedi = i; /* Initialize for first itup deltids entry */ + vacposting = NULL; /* Describes final action for itup */ + nitem = BTreeTupleGetNPosting(itup); + for (int p = 0; p < nitem; p++) + { + ItemPointer ptid = BTreeTupleGetPostingN(itup, p); + int ptidcmp = -1; + + /* + * This nested loop reuses work across ptid TIDs taken from itup. + * We take advantage of the fact that both itup's TIDs and deltids + * entries (within a single itup/posting list grouping) must both + * be in ascending TID order. + */ + for (; nestedi < delstate->ndeltids; nestedi++) + { + TM_IndexDelete *tcdeltid = &delstate->deltids[nestedi]; + TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id); + + /* Stop once we get past all itup related deltids entries */ + Assert(tdstatus->idxoffnum >= idxoffnum); + if (tdstatus->idxoffnum != idxoffnum) + break; + + /* Skip past non-deletable itup related entries up front */ + if (!tdstatus->knowndeletable) + continue; + + /* Entry is first partial ptid match (or an exact match)? */ + ptidcmp = ItemPointerCompare(&tcdeltid->tid, ptid); + if (ptidcmp >= 0) + { + /* Greater than or equal (partial or exact) match... */ + break; + } + } + + /* ...exact ptid match to a deletable deltids entry? 
*/ + if (ptidcmp != 0) + continue; + + /* Exact match for deletable deltids entry -- ptid gets deleted */ + if (vacposting == NULL) + { + vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) + + nitem * sizeof(uint16)); + vacposting->itup = itup; + vacposting->updatedoffset = idxoffnum; + vacposting->ndeletedtids = 0; + } + vacposting->deletetids[vacposting->ndeletedtids++] = p; + } + + /* Final decision on itup, a posting list tuple */ + + if (vacposting == NULL) + { + /* No TIDs to delete from itup -- do nothing */ + } + else if (vacposting->ndeletedtids == nitem) + { + /* Straight delete of itup (to delete all TIDs) */ + deletable[ndeletable++] = idxoffnum; + /* Turns out we won't need granular information */ + pfree(vacposting); + } + else + { + /* Delete some (but not all) TIDs from itup */ + Assert(vacposting->ndeletedtids > 0 && + vacposting->ndeletedtids < nitem); + updatable[nupdatable++] = vacposting; + } + } + + /* Physically delete tuples (or TIDs) using deletable (or updatable) */ + _bt_delitems_delete(rel, buf, latestRemovedXid, deletable, ndeletable, + updatable, nupdatable); + + /* be tidy */ + for (int i = 0; i < nupdatable; i++) + pfree(updatable[i]); +} + +/* + * Check that leftsib page (the btpo_prev of target page) is not marked with + * INCOMPLETE_SPLIT flag. Used during page deletion. + * + * Returning true indicates that page flag is set in leftsib (which is + * definitely still the left sibling of target). When that happens, the + * target doesn't have a downlink in parent, and the page deletion algorithm + * isn't prepared to handle that. Deletion of the target page (or the whole + * subtree that contains the target page) cannot take place. + * + * Caller should not have a lock on the target page itself, since pages on the + * same level must always be locked left to right to avoid deadlocks. + */ +static bool +_bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target) +{ + Buffer buf; + Page page; + BTPageOpaque opaque; + bool result; + + /* Easy case: No left sibling */ + if (leftsib == P_NONE) + return false; + + buf = _bt_getbuf(rel, leftsib, BT_READ); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* + * If the left sibling was concurrently split, so that its next-pointer + * doesn't point to the current page anymore, the split that created + * target must be completed. Caller can reasonably expect that there will + * be a downlink to the target page that it can relocate using its stack. + * (We don't allow splitting an incompletely split page again until the + * previous split has been completed.) + */ + result = (opaque->btpo_next == target && P_INCOMPLETE_SPLIT(opaque)); + _bt_relbuf(rel, buf); + + return result; +} + +/* + * Check that leafrightsib page (the btpo_next of target leaf page) is not + * marked with ISHALFDEAD flag. Used during page deletion. + * + * Returning true indicates that page flag is set in leafrightsib, so page + * deletion cannot go ahead. Our caller is not prepared to deal with the case + * where the parent page does not have a pivot tuples whose downlink points to + * leafrightsib (due to an earlier interrupted VACUUM operation). It doesn't + * seem worth going to the trouble of teaching our caller to deal with it. + * The situation will be resolved after VACUUM finishes the deletion of the + * half-dead page (when a future VACUUM operation reaches the target page + * again). + * + * _bt_leftsib_splitflag() is called for both leaf pages and internal pages. 
+ * _bt_rightsib_halfdeadflag() is only called for leaf pages, though. This is + * okay because of the restriction on deleting pages that are the rightmost + * page of their parent (i.e. that such deletions can only take place when the + * entire subtree must be deleted). The leaf level check made here will apply + * to a right "cousin" leaf page rather than a simple right sibling leaf page + * in cases where caller actually goes on to attempt deleting pages that are + * above the leaf page. The right cousin leaf page is representative of the + * left edge of the subtree to the right of the to-be-deleted subtree as a + * whole, which is exactly the condition that our caller cares about. + * (Besides, internal pages are never marked half-dead, so it isn't even + * possible to _directly_ assess if an internal page is part of some other + * to-be-deleted subtree.) + */ +static bool +_bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib) +{ + Buffer buf; + Page page; + BTPageOpaque opaque; + bool result; + + Assert(leafrightsib != P_NONE); + + buf = _bt_getbuf(rel, leafrightsib, BT_READ); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque)); + result = P_ISHALFDEAD(opaque); + _bt_relbuf(rel, buf); + + return result; +} + +/* + * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so. + * + * This action unlinks the leaf page from the b-tree structure, removing all + * pointers leading to it --- but not touching its own left and right links. + * The page cannot be physically reclaimed right away, since other processes + * may currently be trying to follow links leading to the page; they have to + * be allowed to use its right-link to recover. See nbtree/README. + * + * On entry, the target buffer must be pinned and locked (either read or write + * lock is OK). The page must be an empty leaf page, which may be half-dead + * already (a half-dead page should only be passed to us when an earlier + * VACUUM operation was interrupted, though). Note in particular that caller + * should never pass a buffer containing an existing deleted page here. The + * lock and pin on caller's buffer will be dropped before we return. + * + * Maintains bulk delete stats for caller, which are taken from vstate. We + * need to cooperate closely with caller here so that whole VACUUM operation + * reliably avoids any double counting of subsidiary-to-leafbuf pages that we + * delete in passing. If such pages happen to be from a block number that is + * ahead of the current scanblkno position, then caller is expected to count + * them directly later on. It's simpler for us to understand caller's + * requirements than it would be for caller to understand when or how a + * deleted page became deleted after the fact. + * + * NOTE: this leaks memory. Rather than trying to clean up everything + * carefully, it's better to run it in a temp context that can be reset + * frequently. + */ +void +_bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate) +{ + BlockNumber rightsib; + bool rightsib_empty; + Page page; + BTPageOpaque opaque; + + /* + * Save original leafbuf block number from caller. Only deleted blocks + * that are <= scanblkno are added to bulk delete stat's pages_deleted + * count. + */ + BlockNumber scanblkno = BufferGetBlockNumber(leafbuf); + + /* + * "stack" is a search stack leading (approximately) to the target page. 
+ * It is initially NULL, but when iterating, we keep it to avoid + * duplicated search effort. + * + * Also, when "stack" is not NULL, we have already checked that the + * current page is not the right half of an incomplete split, i.e. the + * left sibling does not have its INCOMPLETE_SPLIT flag set, including + * when the current target page is to the right of caller's initial page + * (the scanblkno page). + */ + BTStack stack = NULL; + + for (;;) + { + page = BufferGetPage(leafbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* + * Internal pages are never deleted directly, only as part of deleting + * the whole subtree all the way down to leaf level. + * + * Also check for deleted pages here. Caller never passes us a fully + * deleted page. Only VACUUM can delete pages, so there can't have + * been a concurrent deletion. Assume that we reached any deleted + * page encountered here by following a sibling link, and that the + * index is corrupt. + */ + Assert(!P_ISDELETED(opaque)); + if (!P_ISLEAF(opaque) || P_ISDELETED(opaque)) + { + /* + * Pre-9.4 page deletion only marked internal pages as half-dead, + * but now we only use that flag on leaf pages. The old algorithm + * was never supposed to leave half-dead pages in the tree, it was + * just a transient state, but it was nevertheless possible in + * error scenarios. We don't know how to deal with them here. They + * are harmless as far as searches are considered, but inserts + * into the deleted keyspace could add out-of-order downlinks in + * the upper levels. Log a notice, hopefully the admin will notice + * and reindex. + */ + if (P_ISHALFDEAD(opaque)) + ereport(LOG, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" contains a half-dead internal page", + RelationGetRelationName(rel)), + errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it."))); + + if (P_ISDELETED(opaque)) + ereport(LOG, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"", + BufferGetBlockNumber(leafbuf), + scanblkno, + RelationGetRelationName(rel)))); + + _bt_relbuf(rel, leafbuf); + return; + } + + /* + * We can never delete rightmost pages nor root pages. While at it, + * check that page is empty, since it's possible that the leafbuf page + * was empty a moment ago, but has since had some inserts. + * + * To keep the algorithm simple, we also never delete an incompletely + * split page (they should be rare enough that this doesn't make any + * meaningful difference to disk usage): + * + * The INCOMPLETE_SPLIT flag on the page tells us if the page is the + * left half of an incomplete split, but ensuring that it's not the + * right half is more complicated. For that, we have to check that + * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using + * _bt_leftsib_splitflag(). On the first iteration, we temporarily + * release the lock on scanblkno/leafbuf, check the left sibling, and + * construct a search stack to scanblkno. On subsequent iterations, + * we know we stepped right from a page that passed these tests, so + * it's OK. 
+ */ + if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || + P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) || + P_INCOMPLETE_SPLIT(opaque)) + { + /* Should never fail to delete a half-dead page */ + Assert(!P_ISHALFDEAD(opaque)); + + _bt_relbuf(rel, leafbuf); + return; + } + + /* + * First, remove downlink pointing to the page (or a parent of the + * page, if we are going to delete a taller subtree), and mark the + * leafbuf page half-dead + */ + if (!P_ISHALFDEAD(opaque)) + { + /* + * We need an approximate pointer to the page's parent page. We + * use a variant of the standard search mechanism to search for + * the page's high key; this will give us a link to either the + * current parent or someplace to its left (if there are multiple + * equal high keys, which is possible with !heapkeyspace indexes). + * + * Also check if this is the right-half of an incomplete split + * (see comment above). + */ + if (!stack) + { + BTScanInsert itup_key; + ItemId itemid; + IndexTuple targetkey; + BlockNumber leftsib, + leafblkno; + Buffer sleafbuf; + + itemid = PageGetItemId(page, P_HIKEY); + targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid)); + + leftsib = opaque->btpo_prev; + leafblkno = BufferGetBlockNumber(leafbuf); + + /* + * To avoid deadlocks, we'd better drop the leaf page lock + * before going further. + */ + _bt_unlockbuf(rel, leafbuf); + + /* + * Check that the left sibling of leafbuf (if any) is not + * marked with INCOMPLETE_SPLIT flag before proceeding + */ + Assert(leafblkno == scanblkno); + if (_bt_leftsib_splitflag(rel, leftsib, leafblkno)) + { + ReleaseBuffer(leafbuf); + return; + } + + /* we need an insertion scan key for the search, so build one */ + itup_key = _bt_mkscankey(rel, targetkey); + /* find the leftmost leaf page with matching pivot/high key */ + itup_key->pivotsearch = true; + stack = _bt_search(rel, itup_key, &sleafbuf, BT_READ, NULL); + /* won't need a second lock or pin on leafbuf */ + _bt_relbuf(rel, sleafbuf); + + /* + * Re-lock the leaf page, and start over to use our stack + * within _bt_mark_page_halfdead. We must do it that way + * because it's possible that leafbuf can no longer be + * deleted. We need to recheck. + * + * Note: We can't simply hold on to the sleafbuf lock instead, + * because it's barely possible that sleafbuf is not the same + * page as leafbuf. This happens when leafbuf split after our + * original lock was dropped, but before _bt_search finished + * its descent. We rely on the assumption that we'll find + * leafbuf isn't safe to delete anymore in this scenario. + * (Page deletion can cope with the stack being to the left of + * leafbuf, but not to the right of leafbuf.) + */ + _bt_lockbuf(rel, leafbuf, BT_WRITE); + continue; + } + + /* + * See if it's safe to delete the leaf page, and determine how + * many parent/internal pages above the leaf level will be + * deleted. If it's safe then _bt_mark_page_halfdead will also + * perform the first phase of deletion, which includes marking the + * leafbuf page half-dead. + */ + Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque)); + if (!_bt_mark_page_halfdead(rel, leafbuf, stack)) + { + _bt_relbuf(rel, leafbuf); + return; + } + } + + /* + * Then unlink it from its siblings. Each call to + * _bt_unlink_halfdead_page unlinks the topmost page from the subtree, + * making it shallower. Iterate until the leafbuf page is deleted. 
+ */ + rightsib_empty = false; + Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque)); + while (P_ISHALFDEAD(opaque)) + { + /* Check for interrupts in _bt_unlink_halfdead_page */ + if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno, + &rightsib_empty, vstate)) + { + /* + * _bt_unlink_halfdead_page should never fail, since we + * established that deletion is generally safe in + * _bt_mark_page_halfdead -- index must be corrupt. + * + * Note that _bt_unlink_halfdead_page already released the + * lock and pin on leafbuf for us. + */ + Assert(false); + return; + } + } + + Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque)); + + rightsib = opaque->btpo_next; + + _bt_relbuf(rel, leafbuf); + + /* + * Check here, as calling loops will have locks held, preventing + * interrupts from being processed. + */ + CHECK_FOR_INTERRUPTS(); + + /* + * The page has now been deleted. If its right sibling is completely + * empty, it's possible that the reason we haven't deleted it earlier + * is that it was the rightmost child of the parent. Now that we + * removed the downlink for this page, the right sibling might now be + * the only child of the parent, and could be removed. It would be + * picked up by the next vacuum anyway, but might as well try to + * remove it now, so loop back to process the right sibling. + * + * Note: This relies on the assumption that _bt_getstackbuf() will be + * able to reuse our original descent stack with a different child + * block (provided that the child block is to the right of the + * original leaf page reached by _bt_search()). It will even update + * the descent stack each time we loop around, avoiding repeated work. + */ + if (!rightsib_empty) + break; + + leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE); + } +} + +/* + * First stage of page deletion. + * + * Establish the height of the to-be-deleted subtree with leafbuf at its + * lowest level, remove the downlink to the subtree, and mark leafbuf + * half-dead. The final to-be-deleted subtree is usually just leafbuf itself, + * but may include additional internal pages (at most one per level of the + * tree below the root). + * + * Returns 'false' if leafbuf is unsafe to delete, usually because leafbuf is + * the rightmost child of its parent (and parent has more than one downlink). + * Returns 'true' when the first stage of page deletion completed + * successfully. + */ +static bool +_bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack) +{ + BlockNumber leafblkno; + BlockNumber leafrightsib; + BlockNumber topparent; + BlockNumber topparentrightsib; + ItemId itemid; + Page page; + BTPageOpaque opaque; + Buffer subtreeparent; + OffsetNumber poffset; + OffsetNumber nextoffset; + IndexTuple itup; + IndexTupleData trunctuple; + + page = BufferGetPage(leafbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) && + P_ISLEAF(opaque) && !P_IGNORE(opaque) && + P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page)); + + /* + * Save info about the leaf page. + */ + leafblkno = BufferGetBlockNumber(leafbuf); + leafrightsib = opaque->btpo_next; + + /* + * Before attempting to lock the parent page, check that the right sibling + * is not in half-dead state. A half-dead right sibling would have no + * downlink in the parent, which would be highly confusing later when we + * delete the downlink. It would fail the "right sibling of target page + * is also the next child in parent page" cross-check below. 
+ */ + if (_bt_rightsib_halfdeadflag(rel, leafrightsib)) + { + elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead", + leafblkno, leafrightsib); + return false; + } + + /* + * We cannot delete a page that is the rightmost child of its immediate + * parent, unless it is the only child --- in which case the parent has to + * be deleted too, and the same condition applies recursively to it. We + * have to check this condition all the way up before trying to delete, + * and lock the parent of the root of the to-be-deleted subtree (the + * "subtree parent"). _bt_lock_subtree_parent() locks the subtree parent + * for us. We remove the downlink to the "top parent" page (subtree root + * page) from the subtree parent page below. + * + * Initialize topparent to be leafbuf page now. The final to-be-deleted + * subtree is often a degenerate one page subtree consisting only of the + * leafbuf page. When that happens, the leafbuf page is the final subtree + * root page/top parent page. + */ + topparent = leafblkno; + topparentrightsib = leafrightsib; + if (!_bt_lock_subtree_parent(rel, leafblkno, stack, + &subtreeparent, &poffset, + &topparent, &topparentrightsib)) + return false; + + /* + * Check that the parent-page index items we're about to delete/overwrite + * in subtree parent page contain what we expect. This can fail if the + * index has become corrupt for some reason. We want to throw any error + * before entering the critical section --- otherwise it'd be a PANIC. + */ + page = BufferGetPage(subtreeparent); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + +#ifdef USE_ASSERT_CHECKING + + /* + * This is just an assertion because _bt_lock_subtree_parent should have + * guaranteed tuple has the expected contents + */ + itemid = PageGetItemId(page, poffset); + itup = (IndexTuple) PageGetItem(page, itemid); + Assert(BTreeTupleGetDownLink(itup) == topparent); +#endif + + nextoffset = OffsetNumberNext(poffset); + itemid = PageGetItemId(page, nextoffset); + itup = (IndexTuple) PageGetItem(page, itemid); + if (BTreeTupleGetDownLink(itup) != topparentrightsib) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"", + topparentrightsib, topparent, + BTreeTupleGetDownLink(itup), + BufferGetBlockNumber(subtreeparent), + RelationGetRelationName(rel)))); + + /* + * Any insert which would have gone on the leaf block will now go to its + * right sibling. In other words, the key space moves right. + */ + PredicateLockPageCombine(rel, leafblkno, leafrightsib); + + /* No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + /* + * Update parent of subtree. We want to delete the downlink to the top + * parent page/root of the subtree, and the *following* key. Easiest way + * is to copy the right sibling's downlink over the downlink that points + * to top parent page, and then delete the right sibling's original pivot + * tuple. + * + * Lanin and Shasha make the key space move left when deleting a page, + * whereas the key space moves right here. That's why we cannot simply + * delete the pivot tuple with the downlink to the top parent page. See + * nbtree/README. 
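The parent-page update can be pictured with a toy array of pivots: overwrite the doomed child's downlink with its right sibling's downlink, then drop the right sibling's pivot, so the page to the right absorbs the deleted keyspace. A standalone sketch (DemoPivot and the hard-coded block numbers are invented):

#include <stdio.h>

typedef struct DemoPivot
{
	int			sepkey;			/* separator key */
	int			downlink;		/* child block number */
} DemoPivot;

int
main(void)
{
	/* pivots on the parent: children 100, 200 (the target), 300 */
	DemoPivot	parent[] = {{10, 100}, {20, 200}, {30, 300}};
	int			npivots = 3;
	int			poffset = 1;	/* position of the downlink to the target */

	/* copy the right sibling's downlink over the target's downlink... */
	parent[poffset].downlink = parent[poffset + 1].downlink;

	/* ...then delete the right sibling's original pivot tuple */
	for (int i = poffset + 1; i < npivots - 1; i++)
		parent[i] = parent[i + 1];
	npivots--;

	for (int i = 0; i < npivots; i++)
		printf("pivot key %d -> child %d\n",
			   parent[i].sepkey, parent[i].downlink);
	/* prints: pivot key 10 -> child 100, pivot key 20 -> child 300 */
	return 0;
}

Child 200 no longer has a downlink, and the separator that used to bound it now leads to child 300, which is the "key space moves right" behaviour described above.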
+ */ + page = BufferGetPage(subtreeparent); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + itemid = PageGetItemId(page, poffset); + itup = (IndexTuple) PageGetItem(page, itemid); + BTreeTupleSetDownLink(itup, topparentrightsib); + + nextoffset = OffsetNumberNext(poffset); + PageIndexTupleDelete(page, nextoffset); + + /* + * Mark the leaf page as half-dead, and stamp it with a link to the top + * parent page. When the leaf page is also the top parent page, the link + * is set to InvalidBlockNumber. + */ + page = BufferGetPage(leafbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + opaque->btpo_flags |= BTP_HALF_DEAD; + + Assert(PageGetMaxOffsetNumber(page) == P_HIKEY); + MemSet(&trunctuple, 0, sizeof(IndexTupleData)); + trunctuple.t_info = sizeof(IndexTupleData); + if (topparent != leafblkno) + BTreeTupleSetTopParent(&trunctuple, topparent); + else + BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber); + + if (!PageIndexTupleOverwrite(page, P_HIKEY, (Item) &trunctuple, + IndexTupleSize(&trunctuple))) + elog(ERROR, "could not overwrite high key in half-dead page"); + + /* Must mark buffers dirty before XLogInsert */ + MarkBufferDirty(subtreeparent); + MarkBufferDirty(leafbuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_btree_mark_page_halfdead xlrec; + XLogRecPtr recptr; + + xlrec.poffset = poffset; + xlrec.leafblk = leafblkno; + if (topparent != leafblkno) + xlrec.topparent = topparent; + else + xlrec.topparent = InvalidBlockNumber; + + XLogBeginInsert(); + XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT); + XLogRegisterBuffer(1, subtreeparent, REGBUF_STANDARD); + + page = BufferGetPage(leafbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + xlrec.leftblk = opaque->btpo_prev; + xlrec.rightblk = opaque->btpo_next; + + XLogRegisterData((char *) &xlrec, SizeOfBtreeMarkPageHalfDead); + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD); + + page = BufferGetPage(subtreeparent); + PageSetLSN(page, recptr); + page = BufferGetPage(leafbuf); + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + _bt_relbuf(rel, subtreeparent); + return true; +} + +/* + * Second stage of page deletion. + * + * Unlinks a single page (in the subtree undergoing deletion) from its + * siblings. Also marks the page deleted. + * + * To get rid of the whole subtree, including the leaf page itself, call here + * until the leaf page is deleted. The original "top parent" established in + * the first stage of deletion is deleted in the first call here, while the + * leaf page is deleted in the last call here. Note that the leaf page itself + * is often the initial top parent page. + * + * Returns 'false' if the page could not be unlinked (shouldn't happen). If + * the right sibling of the current target page is empty, *rightsib_empty is + * set to true, allowing caller to delete the target's right sibling page in + * passing. Note that *rightsib_empty is only actually used by caller when + * target page is leafbuf, following last call here for leafbuf/the subtree + * containing leafbuf. (We always set *rightsib_empty for caller, just to be + * consistent.) + * + * Must hold pin and lock on leafbuf at entry (read or write doesn't matter). + * On success exit, we'll be holding pin and write lock. On failure exit, + * we'll release both pin and lock before returning (we define it that way + * to avoid having to reacquire a lock we already released). 
+ */ +static bool +_bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, + bool *rightsib_empty, BTVacState *vstate) +{ + BlockNumber leafblkno = BufferGetBlockNumber(leafbuf); + IndexBulkDeleteResult *stats = vstate->stats; + BlockNumber leafleftsib; + BlockNumber leafrightsib; + BlockNumber target; + BlockNumber leftsib; + BlockNumber rightsib; + Buffer lbuf = InvalidBuffer; + Buffer buf; + Buffer rbuf; + Buffer metabuf = InvalidBuffer; + Page metapg = NULL; + BTMetaPageData *metad = NULL; + ItemId itemid; + Page page; + BTPageOpaque opaque; + FullTransactionId safexid; + bool rightsib_is_rightmost; + uint32 targetlevel; + IndexTuple leafhikey; + BlockNumber leaftopparent; + + page = BufferGetPage(leafbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque) && P_ISHALFDEAD(opaque)); + + /* + * Remember some information about the leaf page. + */ + itemid = PageGetItemId(page, P_HIKEY); + leafhikey = (IndexTuple) PageGetItem(page, itemid); + target = BTreeTupleGetTopParent(leafhikey); + leafleftsib = opaque->btpo_prev; + leafrightsib = opaque->btpo_next; + + _bt_unlockbuf(rel, leafbuf); + + /* + * Check here, as calling loops will have locks held, preventing + * interrupts from being processed. + */ + CHECK_FOR_INTERRUPTS(); + + /* Unlink the current top parent of the subtree */ + if (!BlockNumberIsValid(target)) + { + /* Target is leaf page (or leaf page is top parent, if you prefer) */ + target = leafblkno; + + buf = leafbuf; + leftsib = leafleftsib; + targetlevel = 0; + } + else + { + /* Target is the internal page taken from leaf's top parent link */ + Assert(target != leafblkno); + + /* Fetch the block number of the target's left sibling */ + buf = _bt_getbuf(rel, target, BT_READ); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + leftsib = opaque->btpo_prev; + targetlevel = opaque->btpo_level; + Assert(targetlevel > 0); + + /* + * To avoid deadlocks, we'd better drop the target page lock before + * going further. + */ + _bt_unlockbuf(rel, buf); + } + + /* + * We have to lock the pages we need to modify in the standard order: + * moving right, then up. Else we will deadlock against other writers. + * + * So, first lock the leaf page, if it's not the target. Then find and + * write-lock the current left sibling of the target page. The sibling + * that was current a moment ago could have split, so we may have to move + * right. + */ + if (target != leafblkno) + _bt_lockbuf(rel, leafbuf, BT_WRITE); + if (leftsib != P_NONE) + { + lbuf = _bt_getbuf(rel, leftsib, BT_WRITE); + page = BufferGetPage(lbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + while (P_ISDELETED(opaque) || opaque->btpo_next != target) + { + bool leftsibvalid = true; + + /* + * Before we follow the link from the page that was the left + * sibling mere moments ago, validate its right link. This + * reduces the opportunities for loop to fail to ever make any + * progress in the presence of index corruption. + * + * Note: we rely on the assumption that there can only be one + * vacuum process running at a time (against the same index). 
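The left-sibling search above is essentially a guarded walk along right-links: validate each link before following it, so that corruption (a self-link, a deleted page, a missing target) ends the walk instead of looping forever. A standalone sketch over a toy page array (DemoPage and demo_find_left_sibling are invented names):

#include <stdio.h>

#define NPAGES 5

typedef struct DemoPage
{
	int			next;			/* right sibling, or -1 for rightmost */
	int			deleted;
} DemoPage;

/* Return the block whose right-link points at target, or -1 on corruption */
static int
demo_find_left_sibling(DemoPage *pages, int start, int target)
{
	int			cur = start;

	while (pages[cur].deleted || pages[cur].next != target)
	{
		int			next = pages[cur].next;

		/* rightmost, deleted, or self-linked: no valid left sibling */
		if (next == -1 || pages[cur].deleted || next == cur)
			return -1;
		cur = next;				/* step right one page */
	}
	return cur;
}

int
main(void)
{
	/* chain 0 -> 1 -> 2 -> 3 (rightmost); page 4 is not on the chain */
	DemoPage	pages[NPAGES] = {{1, 0}, {2, 0}, {3, 0}, {-1, 0}, {-1, 0}};

	printf("left sibling of 3 is %d\n", demo_find_left_sibling(pages, 0, 3));
	printf("left sibling of 4 is %d\n", demo_find_left_sibling(pages, 0, 4));
	return 0;
}

The toy gives up as soon as a link cannot be valid, which mirrors the leftsibvalid bail-out path above (log, release buffers, and return false).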
+ */ + if (P_RIGHTMOST(opaque) || P_ISDELETED(opaque) || + leftsib == opaque->btpo_next) + leftsibvalid = false; + + leftsib = opaque->btpo_next; + _bt_relbuf(rel, lbuf); + + if (!leftsibvalid) + { + if (target != leafblkno) + { + /* we have only a pin on target, but pin+lock on leafbuf */ + ReleaseBuffer(buf); + _bt_relbuf(rel, leafbuf); + } + else + { + /* we have only a pin on leafbuf */ + ReleaseBuffer(leafbuf); + } + + ereport(LOG, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("valid left sibling for deletion target could not be located: " + "left sibling %u of target %u with leafblkno %u and scanblkno %u in index \"%s\"", + leftsib, target, leafblkno, scanblkno, + RelationGetRelationName(rel)))); + + return false; + } + + CHECK_FOR_INTERRUPTS(); + + /* step right one page */ + lbuf = _bt_getbuf(rel, leftsib, BT_WRITE); + page = BufferGetPage(lbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + } + } + else + lbuf = InvalidBuffer; + + /* Next write-lock the target page itself */ + _bt_lockbuf(rel, buf, BT_WRITE); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* + * Check page is still empty etc, else abandon deletion. This is just for + * paranoia's sake; a half-dead page cannot resurrect because there can be + * only one vacuum process running at a time. + */ + if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque)) + elog(ERROR, "target page changed status unexpectedly in block %u of index \"%s\"", + target, RelationGetRelationName(rel)); + + if (opaque->btpo_prev != leftsib) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("target page left link unexpectedly changed from %u to %u in block %u of index \"%s\"", + leftsib, opaque->btpo_prev, target, + RelationGetRelationName(rel)))); + + if (target == leafblkno) + { + if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) || + !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque)) + elog(ERROR, "target leaf page changed status unexpectedly in block %u of index \"%s\"", + target, RelationGetRelationName(rel)); + + /* Leaf page is also target page: don't set leaftopparent */ + leaftopparent = InvalidBlockNumber; + } + else + { + IndexTuple finaldataitem; + + if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) || + P_ISLEAF(opaque)) + elog(ERROR, "target internal page on level %u changed status unexpectedly in block %u of index \"%s\"", + targetlevel, target, RelationGetRelationName(rel)); + + /* Target is internal: set leaftopparent for next call here... */ + itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque)); + finaldataitem = (IndexTuple) PageGetItem(page, itemid); + leaftopparent = BTreeTupleGetDownLink(finaldataitem); + /* ...except when it would be a redundant pointer-to-self */ + if (leaftopparent == leafblkno) + leaftopparent = InvalidBlockNumber; + } + + /* No leaftopparent for level 0 (leaf page) or level 1 target */ + Assert(!BlockNumberIsValid(leaftopparent) || targetlevel > 1); + + /* + * And next write-lock the (current) right sibling. 
+ */ + rightsib = opaque->btpo_next; + rbuf = _bt_getbuf(rel, rightsib, BT_WRITE); + page = BufferGetPage(rbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (opaque->btpo_prev != target) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("right sibling's left-link doesn't match: " + "block %u links to %u instead of expected %u in index \"%s\"", + rightsib, opaque->btpo_prev, target, + RelationGetRelationName(rel)))); + rightsib_is_rightmost = P_RIGHTMOST(opaque); + *rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page)); + + /* + * If we are deleting the next-to-last page on the target's level, then + * the rightsib is a candidate to become the new fast root. (In theory, it + * might be possible to push the fast root even further down, but the odds + * of doing so are slim, and the locking considerations daunting.) + * + * We can safely acquire a lock on the metapage here --- see comments for + * _bt_newroot(). + */ + if (leftsib == P_NONE && rightsib_is_rightmost) + { + page = BufferGetPage(rbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (P_RIGHTMOST(opaque)) + { + /* rightsib will be the only one left on the level */ + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + metapg = BufferGetPage(metabuf); + metad = BTPageGetMeta(metapg); + + /* + * The expected case here is btm_fastlevel == targetlevel+1; if + * the fastlevel is <= targetlevel, something is wrong, and we + * choose to overwrite it to fix it. + */ + if (metad->btm_fastlevel > targetlevel + 1) + { + /* no update wanted */ + _bt_relbuf(rel, metabuf); + metabuf = InvalidBuffer; + } + } + } + + /* + * Here we begin doing the deletion. + */ + + /* No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + /* + * Update siblings' side-links. Note the target page's side-links will + * continue to point to the siblings. Asserts here are just rechecking + * things we already verified above. + */ + if (BufferIsValid(lbuf)) + { + page = BufferGetPage(lbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + Assert(opaque->btpo_next == target); + opaque->btpo_next = rightsib; + } + page = BufferGetPage(rbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + Assert(opaque->btpo_prev == target); + opaque->btpo_prev = leftsib; + + /* + * If we deleted a parent of the targeted leaf page, instead of the leaf + * itself, update the leaf to point to the next remaining child in the + * subtree. + * + * Note: We rely on the fact that a buffer pin on the leaf page has been + * held since leafhikey was initialized. This is safe, though only + * because the page was already half-dead at that point. The leaf page + * cannot have been modified by any other backend during the period when + * no lock was held. + */ + if (target != leafblkno) + BTreeTupleSetTopParent(leafhikey, leaftopparent); + + /* + * Mark the page itself deleted. It can be recycled when all current + * transactions are gone. Storing GetTopTransactionId() would work, but + * we're in VACUUM and would not otherwise have an XID. Having already + * updated links to the target, ReadNextFullTransactionId() suffices as an + * upper bound. Any scan having retained a now-stale link is advertising + * in its PGPROC an xmin less than or equal to the value we read here. It + * will continue to do so, holding back the xmin horizon, for the duration + * of that scan. 
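+ *
+ * Roughly speaking, the page becomes recyclable once the oldest xmin
+ * advertised by any backend has moved past the XID we read here, since by
+ * then no scan can still be holding a stale link to it.  That is the
+ * condition BTPageIsRecyclable() and _bt_pendingfsm_finalize() later test
+ * against this safexid.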
+ */ + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque)); + + /* + * Store upper bound XID that's used to determine when deleted page is no + * longer needed as a tombstone + */ + safexid = ReadNextFullTransactionId(); + BTPageSetDeleted(page, safexid); + opaque->btpo_cycleid = 0; + + /* And update the metapage, if needed */ + if (BufferIsValid(metabuf)) + { + /* upgrade metapage if needed */ + if (metad->btm_version < BTREE_NOVAC_VERSION) + _bt_upgrademetapage(metapg); + metad->btm_fastroot = rightsib; + metad->btm_fastlevel = targetlevel; + MarkBufferDirty(metabuf); + } + + /* Must mark buffers dirty before XLogInsert */ + MarkBufferDirty(rbuf); + MarkBufferDirty(buf); + if (BufferIsValid(lbuf)) + MarkBufferDirty(lbuf); + if (target != leafblkno) + MarkBufferDirty(leafbuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_btree_unlink_page xlrec; + xl_btree_metadata xlmeta; + uint8 xlinfo; + XLogRecPtr recptr; + + XLogBeginInsert(); + + XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT); + if (BufferIsValid(lbuf)) + XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD); + XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD); + if (target != leafblkno) + XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT); + + /* information stored on the target/to-be-unlinked block */ + xlrec.leftsib = leftsib; + xlrec.rightsib = rightsib; + xlrec.level = targetlevel; + xlrec.safexid = safexid; + + /* information needed to recreate the leaf block (if not the target) */ + xlrec.leafleftsib = leafleftsib; + xlrec.leafrightsib = leafrightsib; + xlrec.leaftopparent = leaftopparent; + + XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage); + + if (BufferIsValid(metabuf)) + { + XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); + + Assert(metad->btm_version >= BTREE_NOVAC_VERSION); + xlmeta.version = metad->btm_version; + xlmeta.root = metad->btm_root; + xlmeta.level = metad->btm_level; + xlmeta.fastroot = metad->btm_fastroot; + xlmeta.fastlevel = metad->btm_fastlevel; + xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages; + xlmeta.allequalimage = metad->btm_allequalimage; + + XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata)); + xlinfo = XLOG_BTREE_UNLINK_PAGE_META; + } + else + xlinfo = XLOG_BTREE_UNLINK_PAGE; + + recptr = XLogInsert(RM_BTREE_ID, xlinfo); + + if (BufferIsValid(metabuf)) + { + PageSetLSN(metapg, recptr); + } + page = BufferGetPage(rbuf); + PageSetLSN(page, recptr); + page = BufferGetPage(buf); + PageSetLSN(page, recptr); + if (BufferIsValid(lbuf)) + { + page = BufferGetPage(lbuf); + PageSetLSN(page, recptr); + } + if (target != leafblkno) + { + page = BufferGetPage(leafbuf); + PageSetLSN(page, recptr); + } + } + + END_CRIT_SECTION(); + + /* release metapage */ + if (BufferIsValid(metabuf)) + _bt_relbuf(rel, metabuf); + + /* release siblings */ + if (BufferIsValid(lbuf)) + _bt_relbuf(rel, lbuf); + _bt_relbuf(rel, rbuf); + + /* If the target is not leafbuf, we're done with it now -- release it */ + if (target != leafblkno) + _bt_relbuf(rel, buf); + + /* + * Maintain pages_newly_deleted, which is simply the number of pages + * deleted by the ongoing VACUUM operation. + * + * Maintain pages_deleted in a way that takes into account how + * btvacuumpage() will count deleted pages that have yet to become + * scanblkno -- only count page when it's not going to get that treatment + * later on. 
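+ *
+ * For example (block numbers illustrative): with scanblkno 15, a deleted
+ * target at block 10 will never come up as scanblkno again, so it must be
+ * counted here; a deleted target at block 20 will be counted by the later
+ * btvacuumpage() call that reaches block 20, so counting it here as well
+ * would double-count it.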
+ */ + stats->pages_newly_deleted++; + if (target <= scanblkno) + stats->pages_deleted++; + + /* + * Remember information about the target page (now a newly deleted page) + * in dedicated vstate space for later. The page will be considered as a + * candidate to place in the FSM at the end of the current btvacuumscan() + * call. + */ + _bt_pendingfsm_add(vstate, target, safexid); + + return true; +} + +/* + * Establish how tall the to-be-deleted subtree will be during the first stage + * of page deletion. + * + * Caller's child argument is the block number of the page caller wants to + * delete (this is leafbuf's block number, except when we're called + * recursively). stack is a search stack leading to it. Note that we will + * update the stack entry(s) to reflect current downlink positions --- this is + * similar to the corresponding point in page split handling. + * + * If "first stage" caller cannot go ahead with deleting _any_ pages, returns + * false. Returns true on success, in which case caller can use certain + * details established here to perform the first stage of deletion. This + * function is the last point at which page deletion may be deemed unsafe + * (barring index corruption, or unexpected concurrent page deletions). + * + * We write lock the parent of the root of the to-be-deleted subtree for + * caller on success (i.e. we leave our lock on the *subtreeparent buffer for + * caller). Caller will have to remove a downlink from *subtreeparent. We + * also set a *subtreeparent offset number in *poffset, to indicate the + * location of the pivot tuple that contains the relevant downlink. + * + * The root of the to-be-deleted subtree is called the "top parent". Note + * that the leafbuf page is often the final "top parent" page (you can think + * of the leafbuf page as a degenerate single page subtree when that happens). + * Caller should initialize *topparent to the target leafbuf page block number + * (while *topparentrightsib should be set to leafbuf's right sibling block + * number). We will update *topparent (and *topparentrightsib) for caller + * here, though only when it turns out that caller will delete at least one + * internal page (i.e. only when caller needs to store a valid link to the top + * parent block in the leafbuf page using BTreeTupleSetTopParent()). + */ +static bool +_bt_lock_subtree_parent(Relation rel, BlockNumber child, BTStack stack, + Buffer *subtreeparent, OffsetNumber *poffset, + BlockNumber *topparent, BlockNumber *topparentrightsib) +{ + BlockNumber parent, + leftsibparent; + OffsetNumber parentoffset, + maxoff; + Buffer pbuf; + Page page; + BTPageOpaque opaque; + + /* + * Locate the pivot tuple whose downlink points to "child". Write lock + * the parent page itself. + */ + pbuf = _bt_getstackbuf(rel, stack, child); + if (pbuf == InvalidBuffer) + { + /* + * Failed to "re-find" a pivot tuple whose downlink matched our child + * block number on the parent level -- the index must be corrupt. + * Don't even try to delete the leafbuf subtree. Just report the + * issue and press on with vacuuming the index. + * + * Note: _bt_getstackbuf() recovers from concurrent page splits that + * take place on the parent level. Its approach is a near-exhaustive + * linear search. This also gives it a surprisingly good chance of + * recovering in the event of a buggy or inconsistent opclass. But we + * don't rely on that here. 
+ */ + ereport(LOG, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u", + RelationGetRelationName(rel), child))); + return false; + } + + parent = stack->bts_blkno; + parentoffset = stack->bts_offset; + + page = BufferGetPage(pbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + maxoff = PageGetMaxOffsetNumber(page); + leftsibparent = opaque->btpo_prev; + + /* + * _bt_getstackbuf() completes page splits on returned parent buffer when + * required. + * + * In general it's a bad idea for VACUUM to use up more disk space, which + * is why page deletion does not finish incomplete page splits most of the + * time. We allow this limited exception because the risk is much lower, + * and the potential downside of not proceeding is much higher: A single + * internal page with the INCOMPLETE_SPLIT flag set might otherwise + * prevent us from deleting hundreds of empty leaf pages from one level + * down. + */ + Assert(!P_INCOMPLETE_SPLIT(opaque)); + + if (parentoffset < maxoff) + { + /* + * Child is not the rightmost child in parent, so it's safe to delete + * the subtree whose root/topparent is child page + */ + *subtreeparent = pbuf; + *poffset = parentoffset; + return true; + } + + /* + * Child is the rightmost child of parent. + * + * Since it's the rightmost child of parent, deleting the child (or + * deleting the subtree whose root/topparent is the child page) is only + * safe when it's also possible to delete the parent. + */ + Assert(parentoffset == maxoff); + if (parentoffset != P_FIRSTDATAKEY(opaque) || P_RIGHTMOST(opaque)) + { + /* + * Child isn't parent's only child, or parent is rightmost on its + * entire level. Definitely cannot delete any pages. + */ + _bt_relbuf(rel, pbuf); + return false; + } + + /* + * Now make sure that the parent deletion is itself safe by examining the + * child's grandparent page. Recurse, passing the parent page as the + * child page (child's grandparent is the parent on the next level up). If + * parent deletion is unsafe, then child deletion must also be unsafe (in + * which case caller cannot delete any pages at all). + */ + *topparent = parent; + *topparentrightsib = opaque->btpo_next; + + /* + * Release lock on parent before recursing. + * + * It's OK to release page locks on parent before recursive call locks + * grandparent. An internal page can only acquire an entry if the child + * is split, but that cannot happen as long as we still hold a lock on the + * leafbuf page. + */ + _bt_relbuf(rel, pbuf); + + /* + * Before recursing, check that the left sibling of parent (if any) is not + * marked with INCOMPLETE_SPLIT flag first (must do so after we drop the + * parent lock). + * + * Note: We deliberately avoid completing incomplete splits here. + */ + if (_bt_leftsib_splitflag(rel, leftsibparent, parent)) + return false; + + /* Recurse to examine child page's grandparent page */ + return _bt_lock_subtree_parent(rel, parent, stack->bts_parent, + subtreeparent, poffset, + topparent, topparentrightsib); +} + +/* + * Initialize local memory state used by VACUUM for _bt_pendingfsm_finalize + * optimization. + * + * Called at the start of a btvacuumscan(). Caller's cleanuponly argument + * indicates if ongoing VACUUM has not (and will not) call btbulkdelete(). + * + * We expect to allocate memory inside VACUUM's top-level memory context here. + * The working buffer is subject to a limit based on work_mem. 
Our strategy + * when the array can no longer grow within the bounds of that limit is to + * stop saving additional newly deleted pages, while proceeding as usual with + * the pages that we can fit. + */ +void +_bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly) +{ + int64 maxbufsize; + + /* + * Don't bother with optimization in cleanup-only case -- we don't expect + * any newly deleted pages. Besides, cleanup-only calls to btvacuumscan() + * can only take place because this optimization didn't work out during + * the last VACUUM. + */ + if (cleanuponly) + return; + + /* + * Cap maximum size of array so that we always respect work_mem. Avoid + * int overflow here. + */ + vstate->bufsize = 256; + maxbufsize = (work_mem * 1024L) / sizeof(BTPendingFSM); + maxbufsize = Min(maxbufsize, INT_MAX); + maxbufsize = Min(maxbufsize, MaxAllocSize / sizeof(BTPendingFSM)); + /* Stay sane with small work_mem */ + maxbufsize = Max(maxbufsize, vstate->bufsize); + vstate->maxbufsize = maxbufsize; + + /* Allocate buffer, indicate that there are currently 0 pending pages */ + vstate->pendingpages = palloc(sizeof(BTPendingFSM) * vstate->bufsize); + vstate->npendingpages = 0; +} + +/* + * Place any newly deleted pages (i.e. pages that _bt_pagedel() deleted during + * the ongoing VACUUM operation) into the free space map -- though only when + * it is actually safe to do so by now. + * + * Called at the end of a btvacuumscan(), just before free space map vacuuming + * takes place. + * + * Frees memory allocated by _bt_pendingfsm_init(), if any. + */ +void +_bt_pendingfsm_finalize(Relation rel, BTVacState *vstate) +{ + IndexBulkDeleteResult *stats = vstate->stats; + + Assert(stats->pages_newly_deleted >= vstate->npendingpages); + + if (vstate->npendingpages == 0) + { + /* Just free memory when nothing to do */ + if (vstate->pendingpages) + pfree(vstate->pendingpages); + + return; + } + +#ifdef DEBUG_BTREE_PENDING_FSM + + /* + * Debugging aid: Sleep for 5 seconds to greatly increase the chances of + * placing pending pages in the FSM. Note that the optimization will + * never be effective without some other backend concurrently consuming an + * XID. + */ + pg_usleep(5000000L); +#endif + + /* + * Recompute VACUUM XID boundaries. + * + * We don't actually care about the oldest non-removable XID. Computing + * the oldest such XID has a useful side-effect that we rely on: it + * forcibly updates the XID horizon state for this backend. This step is + * essential; GlobalVisCheckRemovableFullXid() will not reliably recognize + * that it is now safe to recycle newly deleted pages without this step. + */ + GetOldestNonRemovableTransactionId(NULL); + + for (int i = 0; i < vstate->npendingpages; i++) + { + BlockNumber target = vstate->pendingpages[i].target; + FullTransactionId safexid = vstate->pendingpages[i].safexid; + + /* + * Do the equivalent of checking BTPageIsRecyclable(), but without + * accessing the page again a second time. + * + * Give up on finding the first non-recyclable page -- all later pages + * must be non-recyclable too, since _bt_pendingfsm_add() adds pages + * to the array in safexid order. 
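+ *
+ * For example, with pending entries whose safexid values are (in array
+ * order) 100, 205 and 310, a failed check for 205 means 310 cannot pass
+ * either: an XID that is not yet old enough to be safely looked past
+ * cannot be followed in the array by one that is.  (XID values here are
+ * illustrative only.)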
+ */ + if (!GlobalVisCheckRemovableFullXid(NULL, safexid)) + break; + + RecordFreeIndexPage(rel, target); + stats->pages_free++; + } + + pfree(vstate->pendingpages); +} + +/* + * Maintain array of pages that were deleted during current btvacuumscan() + * call, for use in _bt_pendingfsm_finalize() + */ +static void +_bt_pendingfsm_add(BTVacState *vstate, + BlockNumber target, + FullTransactionId safexid) +{ + Assert(vstate->npendingpages <= vstate->bufsize); + Assert(vstate->bufsize <= vstate->maxbufsize); + +#ifdef USE_ASSERT_CHECKING + + /* + * Verify an assumption made by _bt_pendingfsm_finalize(): pages from the + * array will always be in safexid order (since that is the order that we + * save them in here) + */ + if (vstate->npendingpages > 0) + { + FullTransactionId lastsafexid = + vstate->pendingpages[vstate->npendingpages - 1].safexid; + + Assert(FullTransactionIdFollowsOrEquals(safexid, lastsafexid)); + } +#endif + + /* + * If temp buffer reaches maxbufsize/work_mem capacity then we discard + * information about this page. + * + * Note that this also covers the case where we opted to not use the + * optimization in _bt_pendingfsm_init(). + */ + if (vstate->npendingpages == vstate->maxbufsize) + return; + + /* Consider enlarging buffer */ + if (vstate->npendingpages == vstate->bufsize) + { + int newbufsize = vstate->bufsize * 2; + + /* Respect work_mem */ + if (newbufsize > vstate->maxbufsize) + newbufsize = vstate->maxbufsize; + + vstate->bufsize = newbufsize; + vstate->pendingpages = + repalloc(vstate->pendingpages, + sizeof(BTPendingFSM) * vstate->bufsize); + } + + /* Save metadata for newly deleted page */ + vstate->pendingpages[vstate->npendingpages].target = target; + vstate->pendingpages[vstate->npendingpages].safexid = safexid; + vstate->npendingpages++; +} diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c new file mode 100644 index 0000000..1360ab8 --- /dev/null +++ b/src/backend/access/nbtree/nbtree.c @@ -0,0 +1,1446 @@ +/*------------------------------------------------------------------------- + * + * nbtree.c + * Implementation of Lehman and Yao's btree management algorithm for + * Postgres. + * + * NOTES + * This file contains only the public interface routines. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtree.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/nbtxlog.h" +#include "access/relscan.h" +#include "access/xlog.h" +#include "commands/progress.h" +#include "commands/vacuum.h" +#include "miscadmin.h" +#include "nodes/execnodes.h" +#include "pgstat.h" +#include "postmaster/autovacuum.h" +#include "storage/condition_variable.h" +#include "storage/indexfsm.h" +#include "storage/ipc.h" +#include "storage/lmgr.h" +#include "storage/smgr.h" +#include "utils/builtins.h" +#include "utils/index_selfuncs.h" +#include "utils/memutils.h" + + +/* + * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started. + * + * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to + * a new page; others must wait. + * + * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan + * to a new page; some process can start doing that. + * + * BTPARALLEL_DONE indicates that the scan is complete (including error exit). 
+ * We reach this state once for every distinct combination of array keys. + */ +typedef enum +{ + BTPARALLEL_NOT_INITIALIZED, + BTPARALLEL_ADVANCING, + BTPARALLEL_IDLE, + BTPARALLEL_DONE +} BTPS_State; + +/* + * BTParallelScanDescData contains btree specific shared information required + * for parallel scan. + */ +typedef struct BTParallelScanDescData +{ + BlockNumber btps_scanPage; /* latest or next page to be scanned */ + BTPS_State btps_pageStatus; /* indicates whether next page is + * available for scan. see above for + * possible states of parallel scan. */ + int btps_arrayKeyCount; /* count indicating number of array scan + * keys processed by parallel scan */ + slock_t btps_mutex; /* protects above variables */ + ConditionVariable btps_cv; /* used to synchronize parallel scan */ +} BTParallelScanDescData; + +typedef struct BTParallelScanDescData *BTParallelScanDesc; + + +static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, void *callback_state, + BTCycleId cycleid); +static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno); +static BTVacuumPosting btreevacuumposting(BTVacState *vstate, + IndexTuple posting, + OffsetNumber updatedoffset, + int *nremaining); + + +/* + * Btree handler function: return IndexAmRoutine with access method parameters + * and callbacks. + */ +Datum +bthandler(PG_FUNCTION_ARGS) +{ + IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); + + amroutine->amstrategies = BTMaxStrategyNumber; + amroutine->amsupport = BTNProcs; + amroutine->amoptsprocnum = BTOPTIONS_PROC; + amroutine->amcanorder = true; + amroutine->amcanorderbyop = false; + amroutine->amcanbackward = true; + amroutine->amcanunique = true; + amroutine->amcanmulticol = true; + amroutine->amoptionalkey = true; + amroutine->amsearcharray = true; + amroutine->amsearchnulls = true; + amroutine->amstorage = false; + amroutine->amclusterable = true; + amroutine->ampredlocks = true; + amroutine->amcanparallel = true; + amroutine->amcaninclude = true; + amroutine->amusemaintenanceworkmem = false; + amroutine->amparallelvacuumoptions = + VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP; + amroutine->amkeytype = InvalidOid; + + amroutine->ambuild = btbuild; + amroutine->ambuildempty = btbuildempty; + amroutine->aminsert = btinsert; + amroutine->ambulkdelete = btbulkdelete; + amroutine->amvacuumcleanup = btvacuumcleanup; + amroutine->amcanreturn = btcanreturn; + amroutine->amcostestimate = btcostestimate; + amroutine->amoptions = btoptions; + amroutine->amproperty = btproperty; + amroutine->ambuildphasename = btbuildphasename; + amroutine->amvalidate = btvalidate; + amroutine->amadjustmembers = btadjustmembers; + amroutine->ambeginscan = btbeginscan; + amroutine->amrescan = btrescan; + amroutine->amgettuple = btgettuple; + amroutine->amgetbitmap = btgetbitmap; + amroutine->amendscan = btendscan; + amroutine->ammarkpos = btmarkpos; + amroutine->amrestrpos = btrestrpos; + amroutine->amestimateparallelscan = btestimateparallelscan; + amroutine->aminitparallelscan = btinitparallelscan; + amroutine->amparallelrescan = btparallelrescan; + + PG_RETURN_POINTER(amroutine); +} + +/* + * btbuildempty() -- build an empty btree index in the initialization fork + */ +void +btbuildempty(Relation index) +{ + Page metapage; + + /* Construct metapage. */ + metapage = (Page) palloc(BLCKSZ); + _bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false)); + + /* + * Write the page and log it. 
It might seem that an immediate sync would + * be sufficient to guarantee that the file exists on disk, but recovery + * itself might remove it while replaying, for example, an + * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record. Therefore, we need + * this even when wal_level=minimal. + */ + PageSetChecksumInplace(metapage, BTREE_METAPAGE); + smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE, + (char *) metapage, true); + log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, + BTREE_METAPAGE, metapage, true); + + /* + * An immediate sync is required even if we xlog'd the page, because the + * write did not go through shared_buffers and therefore a concurrent + * checkpoint may have moved the redo pointer past our xlog record. + */ + smgrimmedsync(index->rd_smgr, INIT_FORKNUM); +} + +/* + * btinsert() -- insert an index tuple into a btree. + * + * Descend the tree recursively, find the appropriate location for our + * new tuple, and put it there. + */ +bool +btinsert(Relation rel, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + IndexInfo *indexInfo) +{ + bool result; + IndexTuple itup; + + /* generate an index tuple */ + itup = index_form_tuple(RelationGetDescr(rel), values, isnull); + itup->t_tid = *ht_ctid; + + result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel); + + pfree(itup); + + return result; +} + +/* + * btgettuple() -- Get the next tuple in the scan. + */ +bool +btgettuple(IndexScanDesc scan, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + bool res; + + /* btree indexes are never lossy */ + scan->xs_recheck = false; + + /* + * If we have any array keys, initialize them during first call for a + * scan. We can't do this in btrescan because we don't know the scan + * direction at that time. + */ + if (so->numArrayKeys && !BTScanPosIsValid(so->currPos)) + { + /* punt if we have any unsatisfiable array keys */ + if (so->numArrayKeys < 0) + return false; + + _bt_start_array_keys(scan, dir); + } + + /* This loop handles advancing to the next array elements, if any */ + do + { + /* + * If we've already initialized this scan, we can just advance it in + * the appropriate direction. If we haven't done so yet, we call + * _bt_first() to get the first item in the scan. + */ + if (!BTScanPosIsValid(so->currPos)) + res = _bt_first(scan, dir); + else + { + /* + * Check to see if we should kill the previously-fetched tuple. + */ + if (scan->kill_prior_tuple) + { + /* + * Yes, remember it for later. (We'll deal with all such + * tuples at once right before leaving the index page.) The + * test for numKilled overrun is not just paranoia: if the + * caller reverses direction in the indexscan then the same + * item might get entered multiple times. It's not worth + * trying to optimize that, so we don't detect it, but instead + * just forget any excess entries. + */ + if (so->killedItems == NULL) + so->killedItems = (int *) + palloc(MaxTIDsPerBTreePage * sizeof(int)); + if (so->numKilled < MaxTIDsPerBTreePage) + so->killedItems[so->numKilled++] = so->currPos.itemIndex; + } + + /* + * Now continue the scan. + */ + res = _bt_next(scan, dir); + } + + /* If we have a tuple, return it ... */ + if (res) + break; + /* ... 
otherwise see if we have more array keys to deal with */ + } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir)); + + return res; +} + +/* + * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap + */ +int64 +btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + int64 ntids = 0; + ItemPointer heapTid; + + /* + * If we have any array keys, initialize them. + */ + if (so->numArrayKeys) + { + /* punt if we have any unsatisfiable array keys */ + if (so->numArrayKeys < 0) + return ntids; + + _bt_start_array_keys(scan, ForwardScanDirection); + } + + /* This loop handles advancing to the next array elements, if any */ + do + { + /* Fetch the first page & tuple */ + if (_bt_first(scan, ForwardScanDirection)) + { + /* Save tuple ID, and continue scanning */ + heapTid = &scan->xs_heaptid; + tbm_add_tuples(tbm, heapTid, 1, false); + ntids++; + + for (;;) + { + /* + * Advance to next tuple within page. This is the same as the + * easy case in _bt_next(). + */ + if (++so->currPos.itemIndex > so->currPos.lastItem) + { + /* let _bt_next do the heavy lifting */ + if (!_bt_next(scan, ForwardScanDirection)) + break; + } + + /* Save tuple ID, and continue scanning */ + heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid; + tbm_add_tuples(tbm, heapTid, 1, false); + ntids++; + } + } + /* Now see if we have more array keys to deal with */ + } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection)); + + return ntids; +} + +/* + * btbeginscan() -- start a scan on a btree index + */ +IndexScanDesc +btbeginscan(Relation rel, int nkeys, int norderbys) +{ + IndexScanDesc scan; + BTScanOpaque so; + + /* no order by operators allowed */ + Assert(norderbys == 0); + + /* get the scan */ + scan = RelationGetIndexScan(rel, nkeys, norderbys); + + /* allocate private workspace */ + so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData)); + BTScanPosInvalidate(so->currPos); + BTScanPosInvalidate(so->markPos); + if (scan->numberOfKeys > 0) + so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData)); + else + so->keyData = NULL; + + so->arrayKeyData = NULL; /* assume no array keys for now */ + so->numArrayKeys = 0; + so->arrayKeys = NULL; + so->arrayContext = NULL; + + so->killedItems = NULL; /* until needed */ + so->numKilled = 0; + + /* + * We don't know yet whether the scan will be index-only, so we do not + * allocate the tuple workspace arrays until btrescan. However, we set up + * scan->xs_itupdesc whether we'll need it or not, since that's so cheap. + */ + so->currTuples = so->markTuples = NULL; + + scan->xs_itupdesc = RelationGetDescr(rel); + + scan->opaque = so; + + return scan; +} + +/* + * btrescan() -- rescan an index relation + */ +void +btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, + ScanKey orderbys, int norderbys) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + /* we aren't holding any read locks, but gotta drop the pins */ + if (BTScanPosIsValid(so->currPos)) + { + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0) + _bt_killitems(scan); + BTScanPosUnpinIfPinned(so->currPos); + BTScanPosInvalidate(so->currPos); + } + + so->markItemIndex = -1; + so->arrayKeyCount = 0; + BTScanPosUnpinIfPinned(so->markPos); + BTScanPosInvalidate(so->markPos); + + /* + * Allocate tuple workspace arrays, if needed for an index-only scan and + * not already done in a previous rescan call. 
To save on palloc + * overhead, both workspaces are allocated as one palloc block; only this + * function and btendscan know that. + * + * NOTE: this data structure also makes it safe to return data from a + * "name" column, even though btree name_ops uses an underlying storage + * datatype of cstring. The risk there is that "name" is supposed to be + * padded to NAMEDATALEN, but the actual index tuple is probably shorter. + * However, since we only return data out of tuples sitting in the + * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some + * data out of the markTuples array --- running off the end of memory for + * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats + * adding special-case treatment for name_ops elsewhere. + */ + if (scan->xs_want_itup && so->currTuples == NULL) + { + so->currTuples = (char *) palloc(BLCKSZ * 2); + so->markTuples = so->currTuples + BLCKSZ; + } + + /* + * Reset the scan keys + */ + if (scankey && scan->numberOfKeys > 0) + memmove(scan->keyData, + scankey, + scan->numberOfKeys * sizeof(ScanKeyData)); + so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */ + + /* If any keys are SK_SEARCHARRAY type, set up array-key info */ + _bt_preprocess_array_keys(scan); +} + +/* + * btendscan() -- close down a scan + */ +void +btendscan(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + /* we aren't holding any read locks, but gotta drop the pins */ + if (BTScanPosIsValid(so->currPos)) + { + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0) + _bt_killitems(scan); + BTScanPosUnpinIfPinned(so->currPos); + } + + so->markItemIndex = -1; + BTScanPosUnpinIfPinned(so->markPos); + + /* No need to invalidate positions, the RAM is about to be freed. */ + + /* Release storage */ + if (so->keyData != NULL) + pfree(so->keyData); + /* so->arrayKeyData and so->arrayKeys are in arrayContext */ + if (so->arrayContext != NULL) + MemoryContextDelete(so->arrayContext); + if (so->killedItems != NULL) + pfree(so->killedItems); + if (so->currTuples != NULL) + pfree(so->currTuples); + /* so->markTuples should not be pfree'd, see btrescan */ + pfree(so); +} + +/* + * btmarkpos() -- save current scan position + */ +void +btmarkpos(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + /* There may be an old mark with a pin (but no lock). */ + BTScanPosUnpinIfPinned(so->markPos); + + /* + * Just record the current itemIndex. If we later step to next page + * before releasing the marked position, _bt_steppage makes a full copy of + * the currPos struct in markPos. If (as often happens) the mark is moved + * before we leave the page, we don't have to do that work. + */ + if (BTScanPosIsValid(so->currPos)) + so->markItemIndex = so->currPos.itemIndex; + else + { + BTScanPosInvalidate(so->markPos); + so->markItemIndex = -1; + } + + /* Also record the current positions of any array keys */ + if (so->numArrayKeys) + _bt_mark_array_keys(scan); +} + +/* + * btrestrpos() -- restore scan to last saved position + */ +void +btrestrpos(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + /* Restore the marked positions of any array keys */ + if (so->numArrayKeys) + _bt_restore_array_keys(scan); + + if (so->markItemIndex >= 0) + { + /* + * The scan has never moved to a new page since the last mark. Just + * restore the itemIndex. + * + * NB: In this case we can't count on anything in so->markPos to be + * accurate. 
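+ *
+ * (btmarkpos() deliberately skips copying currPos into markPos while the
+ * scan stays on the same page, so markPos may be stale or never filled in
+ * at all in this branch -- hence the restriction.)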
+ */ + so->currPos.itemIndex = so->markItemIndex; + } + else + { + /* + * The scan moved to a new page after last mark or restore, and we are + * now restoring to the marked page. We aren't holding any read + * locks, but if we're still holding the pin for the current position, + * we must drop it. + */ + if (BTScanPosIsValid(so->currPos)) + { + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0) + _bt_killitems(scan); + BTScanPosUnpinIfPinned(so->currPos); + } + + if (BTScanPosIsValid(so->markPos)) + { + /* bump pin on mark buffer for assignment to current buffer */ + if (BTScanPosIsPinned(so->markPos)) + IncrBufferRefCount(so->markPos.buf); + memcpy(&so->currPos, &so->markPos, + offsetof(BTScanPosData, items[1]) + + so->markPos.lastItem * sizeof(BTScanPosItem)); + if (so->currTuples) + memcpy(so->currTuples, so->markTuples, + so->markPos.nextTupleOffset); + } + else + BTScanPosInvalidate(so->currPos); + } +} + +/* + * btestimateparallelscan -- estimate storage for BTParallelScanDescData + */ +Size +btestimateparallelscan(void) +{ + return sizeof(BTParallelScanDescData); +} + +/* + * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan + */ +void +btinitparallelscan(void *target) +{ + BTParallelScanDesc bt_target = (BTParallelScanDesc) target; + + SpinLockInit(&bt_target->btps_mutex); + bt_target->btps_scanPage = InvalidBlockNumber; + bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; + bt_target->btps_arrayKeyCount = 0; + ConditionVariableInit(&bt_target->btps_cv); +} + +/* + * btparallelrescan() -- reset parallel scan + */ +void +btparallelrescan(IndexScanDesc scan) +{ + BTParallelScanDesc btscan; + ParallelIndexScanDesc parallel_scan = scan->parallel_scan; + + Assert(parallel_scan); + + btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, + parallel_scan->ps_offset); + + /* + * In theory, we don't need to acquire the spinlock here, because there + * shouldn't be any other workers running at this point, but we do so for + * consistency. + */ + SpinLockAcquire(&btscan->btps_mutex); + btscan->btps_scanPage = InvalidBlockNumber; + btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; + btscan->btps_arrayKeyCount = 0; + SpinLockRelease(&btscan->btps_mutex); +} + +/* + * _bt_parallel_seize() -- Begin the process of advancing the scan to a new + * page. Other scans must wait until we call _bt_parallel_release() + * or _bt_parallel_done(). + * + * The return value is true if we successfully seized the scan and false + * if we did not. The latter case occurs if no pages remain for the current + * set of scankeys. + * + * If the return value is true, *pageno returns the next or current page + * of the scan (depending on the scan direction). An invalid block number + * means the scan hasn't yet started, and P_NONE means we've reached the end. + * The first time a participating process reaches the last page, it will return + * true and set *pageno to P_NONE; after that, further attempts to seize the + * scan will return false. + * + * Callers should ignore the value of pageno if the return value is false. 
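+ *
+ * A rough sketch of the expected calling pattern (simplified; the real
+ * callers in the search code also deal with scan direction and with
+ * restarting for a new set of array keys):
+ *
+ *		if (!_bt_parallel_seize(scan, &blkno))
+ *			give up -- nothing left for this set of scan keys
+ *		else if (blkno == P_NONE)
+ *			_bt_parallel_done(scan);	-- reached the end; wake waiters
+ *		else
+ *			read the page for blkno (or start the scan, if blkno is
+ *			invalid), then call _bt_parallel_release() with the next page
+ *			to hand the scan over to some other participant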
+ */ +bool +_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTPS_State pageStatus; + bool exit_loop = false; + bool status = true; + ParallelIndexScanDesc parallel_scan = scan->parallel_scan; + BTParallelScanDesc btscan; + + *pageno = P_NONE; + + btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, + parallel_scan->ps_offset); + + while (1) + { + SpinLockAcquire(&btscan->btps_mutex); + pageStatus = btscan->btps_pageStatus; + + if (so->arrayKeyCount < btscan->btps_arrayKeyCount) + { + /* Parallel scan has already advanced to a new set of scankeys. */ + status = false; + } + else if (pageStatus == BTPARALLEL_DONE) + { + /* + * We're done with this set of scankeys. This may be the end, or + * there could be more sets to try. + */ + status = false; + } + else if (pageStatus != BTPARALLEL_ADVANCING) + { + /* + * We have successfully seized control of the scan for the purpose + * of advancing it to a new page! + */ + btscan->btps_pageStatus = BTPARALLEL_ADVANCING; + *pageno = btscan->btps_scanPage; + exit_loop = true; + } + SpinLockRelease(&btscan->btps_mutex); + if (exit_loop || !status) + break; + ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE); + } + ConditionVariableCancelSleep(); + + return status; +} + +/* + * _bt_parallel_release() -- Complete the process of advancing the scan to a + * new page. We now have the new value btps_scanPage; some other backend + * can now begin advancing the scan. + */ +void +_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page) +{ + ParallelIndexScanDesc parallel_scan = scan->parallel_scan; + BTParallelScanDesc btscan; + + btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, + parallel_scan->ps_offset); + + SpinLockAcquire(&btscan->btps_mutex); + btscan->btps_scanPage = scan_page; + btscan->btps_pageStatus = BTPARALLEL_IDLE; + SpinLockRelease(&btscan->btps_mutex); + ConditionVariableSignal(&btscan->btps_cv); +} + +/* + * _bt_parallel_done() -- Mark the parallel scan as complete. + * + * When there are no pages left to scan, this function should be called to + * notify other workers. Otherwise, they might wait forever for the scan to + * advance to the next page. + */ +void +_bt_parallel_done(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + ParallelIndexScanDesc parallel_scan = scan->parallel_scan; + BTParallelScanDesc btscan; + bool status_changed = false; + + /* Do nothing, for non-parallel scans */ + if (parallel_scan == NULL) + return; + + btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, + parallel_scan->ps_offset); + + /* + * Mark the parallel scan as done for this combination of scan keys, + * unless some other process already did so. See also + * _bt_advance_array_keys. + */ + SpinLockAcquire(&btscan->btps_mutex); + if (so->arrayKeyCount >= btscan->btps_arrayKeyCount && + btscan->btps_pageStatus != BTPARALLEL_DONE) + { + btscan->btps_pageStatus = BTPARALLEL_DONE; + status_changed = true; + } + SpinLockRelease(&btscan->btps_mutex); + + /* wake up all the workers associated with this parallel scan */ + if (status_changed) + ConditionVariableBroadcast(&btscan->btps_cv); +} + +/* + * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array + * keys. + * + * Updates the count of array keys processed for both local and parallel + * scans. 
+ */ +void +_bt_parallel_advance_array_keys(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + ParallelIndexScanDesc parallel_scan = scan->parallel_scan; + BTParallelScanDesc btscan; + + btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, + parallel_scan->ps_offset); + + so->arrayKeyCount++; + SpinLockAcquire(&btscan->btps_mutex); + if (btscan->btps_pageStatus == BTPARALLEL_DONE) + { + btscan->btps_scanPage = InvalidBlockNumber; + btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; + btscan->btps_arrayKeyCount++; + } + SpinLockRelease(&btscan->btps_mutex); +} + +/* + * Bulk deletion of all index entries pointing to a set of heap tuples. + * The set of target tuples is specified via a callback routine that tells + * whether any given heap tuple (identified by ItemPointer) is being deleted. + * + * Result: a palloc'd struct containing statistical info for VACUUM displays. + */ +IndexBulkDeleteResult * +btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, void *callback_state) +{ + Relation rel = info->index; + BTCycleId cycleid; + + /* allocate stats if first time through, else re-use existing struct */ + if (stats == NULL) + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + + /* Establish the vacuum cycle ID to use for this scan */ + /* The ENSURE stuff ensures we clean up shared memory on failure */ + PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel)); + { + cycleid = _bt_start_vacuum(rel); + + btvacuumscan(info, stats, callback, callback_state, cycleid); + } + PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel)); + _bt_end_vacuum(rel); + + return stats; +} + +/* + * Post-VACUUM cleanup. + * + * Result: a palloc'd struct containing statistical info for VACUUM displays. + */ +IndexBulkDeleteResult * +btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) +{ + BlockNumber num_delpages; + + /* No-op in ANALYZE ONLY mode */ + if (info->analyze_only) + return stats; + + /* + * If btbulkdelete was called, we need not do anything (we just maintain + * the information used within _bt_vacuum_needs_cleanup() by calling + * _bt_set_cleanup_info() below). + * + * If btbulkdelete was _not_ called, then we have a choice to make: we + * must decide whether or not a btvacuumscan() call is needed now (i.e. + * whether the ongoing VACUUM operation can entirely avoid a physical scan + * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us + * now. + */ + if (stats == NULL) + { + /* Check if VACUUM operation can entirely avoid btvacuumscan() call */ + if (!_bt_vacuum_needs_cleanup(info->index)) + return NULL; + + /* + * Since we aren't going to actually delete any leaf items, there's no + * need to go through all the vacuum-cycle-ID pushups here. + * + * Posting list tuples are a source of inaccuracy for cleanup-only + * scans. btvacuumscan() will assume that the number of index tuples + * from each page can be used as num_index_tuples, even though + * num_index_tuples is supposed to represent the number of TIDs in the + * index. This naive approach can underestimate the number of tuples + * in the index significantly. + * + * We handle the problem by making num_index_tuples an estimate in + * cleanup-only case. 
+ */ + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + btvacuumscan(info, stats, NULL, NULL, 0); + stats->estimated_count = true; + } + + /* + * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup(). + * + * num_delpages is the number of deleted pages now in the index that were + * not safe to place in the FSM to be recycled just yet. num_delpages is + * greater than 0 only when _bt_pagedel() actually deleted pages during + * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must + * have failed to place any newly deleted pages in the FSM just moments + * ago. (Actually, there are edge cases where recycling of the current + * VACUUM's newly deleted pages does not even become safe by the time the + * next VACUUM comes around. See nbtree/README.) + */ + Assert(stats->pages_deleted >= stats->pages_free); + num_delpages = stats->pages_deleted - stats->pages_free; + _bt_set_cleanup_info(info->index, num_delpages); + + /* + * It's quite possible for us to be fooled by concurrent page splits into + * double-counting some index tuples, so disbelieve any total that exceeds + * the underlying heap's count ... if we know that accurately. Otherwise + * this might just make matters worse. + */ + if (!info->estimated_count) + { + if (stats->num_index_tuples > info->num_heap_tuples) + stats->num_index_tuples = info->num_heap_tuples; + } + + return stats; +} + +/* + * btvacuumscan --- scan the index for VACUUMing purposes + * + * This combines the functions of looking for leaf tuples that are deletable + * according to the vacuum callback, looking for empty pages that can be + * deleted, and looking for old deleted pages that can be recycled. Both + * btbulkdelete and btvacuumcleanup invoke this (the latter only if no + * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true). + * + * The caller is responsible for initially allocating/zeroing a stats struct + * and for obtaining a vacuum cycle ID if necessary. + */ +static void +btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, void *callback_state, + BTCycleId cycleid) +{ + Relation rel = info->index; + BTVacState vstate; + BlockNumber num_pages; + BlockNumber scanblkno; + bool needLock; + + /* + * Reset fields that track information about the entire index now. This + * avoids double-counting in the case where a single VACUUM command + * requires multiple scans of the index. + * + * Avoid resetting the tuples_removed and pages_newly_deleted fields here, + * since they track information about the VACUUM command, and so must last + * across each call to btvacuumscan(). + * + * (Note that pages_free is treated as state about the whole index, not + * the current VACUUM. This is appropriate because RecordFreeIndexPage() + * calls are idempotent, and get repeated for the same deleted pages in + * some scenarios. The point for us is to track the number of recyclable + * pages in the index at the end of the VACUUM command.) 
+ */ + stats->num_pages = 0; + stats->num_index_tuples = 0; + stats->pages_deleted = 0; + stats->pages_free = 0; + + /* Set up info to pass down to btvacuumpage */ + vstate.info = info; + vstate.stats = stats; + vstate.callback = callback; + vstate.callback_state = callback_state; + vstate.cycleid = cycleid; + + /* Create a temporary memory context to run _bt_pagedel in */ + vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext, + "_bt_pagedel", + ALLOCSET_DEFAULT_SIZES); + + /* Initialize vstate fields used by _bt_pendingfsm_finalize */ + vstate.bufsize = 0; + vstate.maxbufsize = 0; + vstate.pendingpages = NULL; + vstate.npendingpages = 0; + /* Consider applying _bt_pendingfsm_finalize optimization */ + _bt_pendingfsm_init(rel, &vstate, (callback == NULL)); + + /* + * The outer loop iterates over all index pages except the metapage, in + * physical order (we hope the kernel will cooperate in providing + * read-ahead for speed). It is critical that we visit all leaf pages, + * including ones added after we start the scan, else we might fail to + * delete some deletable tuples. Hence, we must repeatedly check the + * relation length. We must acquire the relation-extension lock while + * doing so to avoid a race condition: if someone else is extending the + * relation, there is a window where bufmgr/smgr have created a new + * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If + * we manage to scan such a page here, we'll improperly assume it can be + * recycled. Taking the lock synchronizes things enough to prevent a + * problem: either num_pages won't include the new page, or _bt_getbuf + * already has write lock on the buffer and it will be fully initialized + * before we can examine it. (See also vacuumlazy.c, which has the same + * issue.) Also, we need not worry if a page is added immediately after + * we look; the page splitting code already has write-lock on the left + * page before it adds a right page, so we must already have processed any + * tuples due to be moved into such a page. + * + * We can skip locking for new or temp relations, however, since no one + * else could be accessing them. + */ + needLock = !RELATION_IS_LOCAL(rel); + + scanblkno = BTREE_METAPAGE + 1; + for (;;) + { + /* Get the current relation length */ + if (needLock) + LockRelationForExtension(rel, ExclusiveLock); + num_pages = RelationGetNumberOfBlocks(rel); + if (needLock) + UnlockRelationForExtension(rel, ExclusiveLock); + + if (info->report_progress) + pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL, + num_pages); + + /* Quit if we've scanned the whole relation */ + if (scanblkno >= num_pages) + break; + /* Iterate over pages, then loop back to recheck length */ + for (; scanblkno < num_pages; scanblkno++) + { + btvacuumpage(&vstate, scanblkno); + if (info->report_progress) + pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, + scanblkno); + } + } + + /* Set statistics num_pages field to final size of index */ + stats->num_pages = num_pages; + + MemoryContextDelete(vstate.pagedelcontext); + + /* + * If there were any calls to _bt_pagedel() during scan of the index then + * see if any of the resulting pages can be placed in the FSM now. When + * it's not safe we'll have to leave it up to a future VACUUM operation. + * + * Finally, if we placed any pages in the FSM (either just now or during + * the scan), forcibly update the upper-level FSM pages to ensure that + * searchers can find them. 
+ */ + _bt_pendingfsm_finalize(rel, &vstate); + if (stats->pages_free > 0) + IndexFreeSpaceMapVacuum(rel); +} + +/* + * btvacuumpage --- VACUUM one page + * + * This processes a single page for btvacuumscan(). In some cases we must + * backtrack to re-examine and VACUUM pages that were the scanblkno during + * a previous call here. This is how we handle page splits (that happened + * after our cycleid was acquired) whose right half page happened to reuse + * a block that we might have processed at some point before it was + * recycled (i.e. before the page split). + */ +static void +btvacuumpage(BTVacState *vstate, BlockNumber scanblkno) +{ + IndexVacuumInfo *info = vstate->info; + IndexBulkDeleteResult *stats = vstate->stats; + IndexBulkDeleteCallback callback = vstate->callback; + void *callback_state = vstate->callback_state; + Relation rel = info->index; + bool attempt_pagedel; + BlockNumber blkno, + backtrack_to; + Buffer buf; + Page page; + BTPageOpaque opaque; + + blkno = scanblkno; + +backtrack: + + attempt_pagedel = false; + backtrack_to = P_NONE; + + /* call vacuum_delay_point while not holding any buffer lock */ + vacuum_delay_point(); + + /* + * We can't use _bt_getbuf() here because it always applies + * _bt_checkpage(), which will barf on an all-zero page. We want to + * recycle all-zero pages, not fail. Also, we want to use a nondefault + * buffer access strategy. + */ + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, + info->strategy); + _bt_lockbuf(rel, buf, BT_READ); + page = BufferGetPage(buf); + opaque = NULL; + if (!PageIsNew(page)) + { + _bt_checkpage(rel, buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + } + + Assert(blkno <= scanblkno); + if (blkno != scanblkno) + { + /* + * We're backtracking. + * + * We followed a right link to a sibling leaf page (a page that + * happens to be from a block located before scanblkno). The only + * case we want to do anything with is a live leaf page having the + * current vacuum cycle ID. + * + * The page had better be in a state that's consistent with what we + * expect. Check for conditions that imply corruption in passing. It + * can't be half-dead because only an interrupted VACUUM process can + * leave pages in that state, so we'd definitely have dealt with it + * back when the page was the scanblkno page (half-dead pages are + * always marked fully deleted by _bt_pagedel()). This assumes that + * there can be only one vacuum process running at a time. + */ + if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque)) + { + Assert(false); + ereport(LOG, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"", + blkno, scanblkno, RelationGetRelationName(rel)))); + _bt_relbuf(rel, buf); + return; + } + + /* + * We may have already processed the page in an earlier call, when the + * page was scanblkno. This happens when the leaf page split occurred + * after the scan began, but before the right sibling page became the + * scanblkno. + * + * Page may also have been deleted by current btvacuumpage() call, + * since _bt_pagedel() sometimes deletes the right sibling page of + * scanblkno in passing (it does so after we decided where to + * backtrack to). We don't need to process this page as a deleted + * page a second time now (in fact, it would be wrong to count it as a + * deleted page in the bulk delete statistics a second time). 
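+ *
+ * In either of these cases the page either no longer carries our vacuum
+ * cycle ID or is marked deleted, which is exactly the condition tested
+ * just below before abandoning the backtrack.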
+ */ + if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque)) + { + /* Done with current scanblkno (and all lower split pages) */ + _bt_relbuf(rel, buf); + return; + } + } + + if (!opaque || BTPageIsRecyclable(page)) + { + /* Okay to recycle this page (which could be leaf or internal) */ + RecordFreeIndexPage(rel, blkno); + stats->pages_deleted++; + stats->pages_free++; + } + else if (P_ISDELETED(opaque)) + { + /* + * Already deleted page (which could be leaf or internal). Can't + * recycle yet. + */ + stats->pages_deleted++; + } + else if (P_ISHALFDEAD(opaque)) + { + /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */ + attempt_pagedel = true; + + /* + * _bt_pagedel() will increment both pages_newly_deleted and + * pages_deleted stats in all cases (barring corruption) + */ + } + else if (P_ISLEAF(opaque)) + { + OffsetNumber deletable[MaxIndexTuplesPerPage]; + int ndeletable; + BTVacuumPosting updatable[MaxIndexTuplesPerPage]; + int nupdatable; + OffsetNumber offnum, + minoff, + maxoff; + int nhtidsdead, + nhtidslive; + + /* + * Trade in the initial read lock for a super-exclusive write lock on + * this page. We must get such a lock on every leaf page over the + * course of the vacuum scan, whether or not it actually contains any + * deletable tuples --- see nbtree/README. + */ + _bt_upgradelockbufcleanup(rel, buf); + + /* + * Check whether we need to backtrack to earlier pages. What we are + * concerned about is a page split that happened since we started the + * vacuum scan. If the split moved tuples on the right half of the + * split (i.e. the tuples that sort high) to a block that we already + * passed over, then we might have missed the tuples. We need to + * backtrack now. (Must do this before possibly clearing btpo_cycleid + * or deleting scanblkno page below!) + */ + if (vstate->cycleid != 0 && + opaque->btpo_cycleid == vstate->cycleid && + !(opaque->btpo_flags & BTP_SPLIT_END) && + !P_RIGHTMOST(opaque) && + opaque->btpo_next < scanblkno) + backtrack_to = opaque->btpo_next; + + /* + * When each VACUUM begins, it determines an OldestXmin cutoff value. + * Tuples before the cutoff are removed by VACUUM. Scan over all + * items to see which ones need to be deleted according to cutoff + * point using callback. + */ + ndeletable = 0; + nupdatable = 0; + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + nhtidsdead = 0; + nhtidslive = 0; + if (callback) + { + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + IndexTuple itup; + + itup = (IndexTuple) PageGetItem(page, + PageGetItemId(page, offnum)); + + /* + * Hot Standby assumes that it's okay that XLOG_BTREE_VACUUM + * records do not produce their own conflicts. This is safe + * as long as the callback function only considers whether the + * index tuple refers to pre-cutoff heap tuples that were + * certainly already pruned away during VACUUM's initial heap + * scan by the time we get here. (heapam's XLOG_HEAP2_PRUNE + * records produce conflicts using a latestRemovedXid value + * for the pointed-to heap tuples, so there is no need to + * produce our own conflict now.) + * + * Backends with snapshots acquired after a VACUUM starts but + * before it finishes could have visibility cutoff with a + * later xid than VACUUM's OldestXmin cutoff. These backends + * might happen to opportunistically mark some index tuples + * LP_DEAD before we reach them, even though they may be after + * our cutoff. 
We don't try to kill these "extra" index + * tuples in _bt_delitems_vacuum(). This keep things simple, + * and allows us to always avoid generating our own conflicts. + */ + Assert(!BTreeTupleIsPivot(itup)); + if (!BTreeTupleIsPosting(itup)) + { + /* Regular tuple, standard table TID representation */ + if (callback(&itup->t_tid, callback_state)) + { + deletable[ndeletable++] = offnum; + nhtidsdead++; + } + else + nhtidslive++; + } + else + { + BTVacuumPosting vacposting; + int nremaining; + + /* Posting list tuple */ + vacposting = btreevacuumposting(vstate, itup, offnum, + &nremaining); + if (vacposting == NULL) + { + /* + * All table TIDs from the posting tuple remain, so no + * delete or update required + */ + Assert(nremaining == BTreeTupleGetNPosting(itup)); + } + else if (nremaining > 0) + { + + /* + * Store metadata about posting list tuple in + * updatable array for entire page. Existing tuple + * will be updated during the later call to + * _bt_delitems_vacuum(). + */ + Assert(nremaining < BTreeTupleGetNPosting(itup)); + updatable[nupdatable++] = vacposting; + nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining; + } + else + { + /* + * All table TIDs from the posting list must be + * deleted. We'll delete the index tuple completely + * (no update required). + */ + Assert(nremaining == 0); + deletable[ndeletable++] = offnum; + nhtidsdead += BTreeTupleGetNPosting(itup); + pfree(vacposting); + } + + nhtidslive += nremaining; + } + } + } + + /* + * Apply any needed deletes or updates. We issue just one + * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic. + */ + if (ndeletable > 0 || nupdatable > 0) + { + Assert(nhtidsdead >= ndeletable + nupdatable); + _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable, + nupdatable); + + stats->tuples_removed += nhtidsdead; + /* must recompute maxoff */ + maxoff = PageGetMaxOffsetNumber(page); + + /* can't leak memory here */ + for (int i = 0; i < nupdatable; i++) + pfree(updatable[i]); + } + else + { + /* + * If the leaf page has been split during this vacuum cycle, it + * seems worth expending a write to clear btpo_cycleid even if we + * don't have any deletions to do. (If we do, _bt_delitems_vacuum + * takes care of this.) This ensures we won't process the page + * again. + * + * We treat this like a hint-bit update because there's no need to + * WAL-log it. + */ + Assert(nhtidsdead == 0); + if (vstate->cycleid != 0 && + opaque->btpo_cycleid == vstate->cycleid) + { + opaque->btpo_cycleid = 0; + MarkBufferDirtyHint(buf, true); + } + } + + /* + * If the leaf page is now empty, try to delete it; else count the + * live tuples (live table TIDs in posting lists are counted as + * separate live tuples). We don't delete when backtracking, though, + * since that would require teaching _bt_pagedel() about backtracking + * (doesn't seem worth adding more complexity to deal with that). + * + * We don't count the number of live TIDs during cleanup-only calls to + * btvacuumscan (i.e. when callback is not set). We count the number + * of index tuples directly instead. This avoids the expense of + * directly examining all of the tuples on each page. VACUUM will + * treat num_index_tuples as an estimate in cleanup-only case, so it + * doesn't matter that this underestimates num_index_tuples + * significantly in some cases. 
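+ *
+ * (Editor's note, following from the arithmetic below: the underestimate
+ * comes from posting list tuples.  The cleanup-only path counts each
+ * item just once via maxoff - minoff + 1, so a deduplicated leaf page
+ * whose tuples each carry many heap TIDs contributes far fewer "index
+ * tuples" than the callback path, which counts every live TID.)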
+ */ + if (minoff > maxoff) + attempt_pagedel = (blkno == scanblkno); + else if (callback) + stats->num_index_tuples += nhtidslive; + else + stats->num_index_tuples += maxoff - minoff + 1; + + Assert(!attempt_pagedel || nhtidslive == 0); + } + + if (attempt_pagedel) + { + MemoryContext oldcontext; + + /* Run pagedel in a temp context to avoid memory leakage */ + MemoryContextReset(vstate->pagedelcontext); + oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext); + + /* + * _bt_pagedel maintains the bulk delete stats on our behalf; + * pages_newly_deleted and pages_deleted are likely to be incremented + * during call + */ + Assert(blkno == scanblkno); + _bt_pagedel(rel, buf, vstate); + + MemoryContextSwitchTo(oldcontext); + /* pagedel released buffer, so we shouldn't */ + } + else + _bt_relbuf(rel, buf); + + if (backtrack_to != P_NONE) + { + blkno = backtrack_to; + goto backtrack; + } +} + +/* + * btreevacuumposting --- determine TIDs still needed in posting list + * + * Returns metadata describing how to build replacement tuple without the TIDs + * that VACUUM needs to delete. Returned value is NULL in the common case + * where no changes are needed to caller's posting list tuple (we avoid + * allocating memory here as an optimization). + * + * The number of TIDs that should remain in the posting list tuple is set for + * caller in *nremaining. + */ +static BTVacuumPosting +btreevacuumposting(BTVacState *vstate, IndexTuple posting, + OffsetNumber updatedoffset, int *nremaining) +{ + int live = 0; + int nitem = BTreeTupleGetNPosting(posting); + ItemPointer items = BTreeTupleGetPosting(posting); + BTVacuumPosting vacposting = NULL; + + for (int i = 0; i < nitem; i++) + { + if (!vstate->callback(items + i, vstate->callback_state)) + { + /* Live table TID */ + live++; + } + else if (vacposting == NULL) + { + /* + * First dead table TID encountered. + * + * It's now clear that we need to delete one or more dead table + * TIDs, so start maintaining metadata describing how to update + * existing posting list tuple. + */ + vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) + + nitem * sizeof(uint16)); + + vacposting->itup = posting; + vacposting->updatedoffset = updatedoffset; + vacposting->ndeletedtids = 0; + vacposting->deletetids[vacposting->ndeletedtids++] = i; + } + else + { + /* Second or subsequent dead table TID */ + vacposting->deletetids[vacposting->ndeletedtids++] = i; + } + } + + *nremaining = live; + return vacposting; +} + +/* + * btcanreturn() -- Check whether btree indexes support index-only scans. + * + * btrees always do, so this is trivial. + */ +bool +btcanreturn(Relation index, int attno) +{ + return true; +} diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c new file mode 100644 index 0000000..fdf0e56 --- /dev/null +++ b/src/backend/access/nbtree/nbtsearch.c @@ -0,0 +1,2501 @@ +/*------------------------------------------------------------------------- + * + * nbtsearch.c + * Search code for postgres btrees. 
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtsearch.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/relscan.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/predicate.h" +#include "utils/lsyscache.h" +#include "utils/rel.h" + + +static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); +static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf); +static int _bt_binsrch_posting(BTScanInsert key, Page page, + OffsetNumber offnum); +static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, + OffsetNumber offnum); +static void _bt_saveitem(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, IndexTuple itup); +static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, ItemPointer heapTid, + IndexTuple itup); +static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, + ItemPointer heapTid, int tupleOffset); +static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); +static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir); +static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, + ScanDirection dir); +static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot); +static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); +static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir); + + +/* + * _bt_drop_lock_and_maybe_pin() + * + * Unlock the buffer; and if it is safe to release the pin, do that, too. It + * is safe if the scan is using an MVCC snapshot and the index is WAL-logged. + * This will prevent vacuum from stalling in a blocked state trying to read a + * page when a cursor is sitting on it -- at least in many important cases. + * + * Set the buffer to invalid if the pin is released, since the buffer may be + * re-used. If we need to go back to this block (for example, to apply + * LP_DEAD hints) we must get a fresh reference to the buffer. Hopefully it + * will remain in shared memory for as long as it takes to scan the index + * buffer page. + */ +static void +_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp) +{ + _bt_unlockbuf(scan->indexRelation, sp->buf); + + if (IsMVCCSnapshot(scan->xs_snapshot) && + RelationNeedsWAL(scan->indexRelation) && + !scan->xs_want_itup) + { + ReleaseBuffer(sp->buf); + sp->buf = InvalidBuffer; + } +} + +/* + * _bt_search() -- Search the tree for a particular scankey, + * or more precisely for the first leaf page it could be on. + * + * The passed scankey is an insertion-type scankey (see nbtree/README), + * but it can omit the rightmost column(s) of the index. + * + * Return value is a stack of parent-page pointers (i.e. there is no entry for + * the leaf level/page). *bufP is set to the address of the leaf-page buffer, + * which is locked and pinned. No locks are held on the parent pages, + * however! + * + * If the snapshot parameter is not NULL, "old snapshot" checking will take + * place during the descent through the tree. This is not needed when + * positioning for an insert or delete, so NULL is used for those cases. + * + * The returned buffer is locked according to access parameter. 
Additionally, + * access = BT_WRITE will allow an empty root page to be created and returned. + * When access = BT_READ, an empty index will result in *bufP being set to + * InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered + * during the search will be finished. + */ +BTStack +_bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, + Snapshot snapshot) +{ + BTStack stack_in = NULL; + int page_access = BT_READ; + + /* Get the root page to start with */ + *bufP = _bt_getroot(rel, access); + + /* If index is empty and access = BT_READ, no root page is created. */ + if (!BufferIsValid(*bufP)) + return (BTStack) NULL; + + /* Loop iterates once per level descended in the tree */ + for (;;) + { + Page page; + BTPageOpaque opaque; + OffsetNumber offnum; + ItemId itemid; + IndexTuple itup; + BlockNumber child; + BTStack new_stack; + + /* + * Race -- the page we just grabbed may have split since we read its + * downlink in its parent page (or the metapage). If it has, we may + * need to move right to its new sibling. Do that. + * + * In write-mode, allow _bt_moveright to finish any incomplete splits + * along the way. Strictly speaking, we'd only need to finish an + * incomplete split on the leaf page we're about to insert to, not on + * any of the upper levels (internal pages with incomplete splits are + * also taken care of in _bt_getstackbuf). But this is a good + * opportunity to finish splits of internal pages too. + */ + *bufP = _bt_moveright(rel, key, *bufP, (access == BT_WRITE), stack_in, + page_access, snapshot); + + /* if this is a leaf page, we're done */ + page = BufferGetPage(*bufP); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (P_ISLEAF(opaque)) + break; + + /* + * Find the appropriate pivot tuple on this page. Its downlink points + * to the child page that we're about to descend to. + */ + offnum = _bt_binsrch(rel, key, *bufP); + itemid = PageGetItemId(page, offnum); + itup = (IndexTuple) PageGetItem(page, itemid); + Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace); + child = BTreeTupleGetDownLink(itup); + + /* + * We need to save the location of the pivot tuple we chose in a new + * stack entry for this page/level. If caller ends up splitting a + * page one level down, it usually ends up inserting a new pivot + * tuple/downlink immediately after the location recorded here. + */ + new_stack = (BTStack) palloc(sizeof(BTStackData)); + new_stack->bts_blkno = BufferGetBlockNumber(*bufP); + new_stack->bts_offset = offnum; + new_stack->bts_parent = stack_in; + + /* + * Page level 1 is lowest non-leaf page level prior to leaves. So, if + * we're on the level 1 and asked to lock leaf page in write mode, + * then lock next page in write mode, because it must be a leaf. + */ + if (opaque->btpo_level == 1 && access == BT_WRITE) + page_access = BT_WRITE; + + /* drop the read lock on the page, then acquire one on its child */ + *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access); + + /* okay, all set to move down a level */ + stack_in = new_stack; + } + + /* + * If we're asked to lock leaf in write mode, but didn't manage to, then + * relock. This should only happen when the root page is a leaf page (and + * the only page in the index other than the metapage). 
+ */ + if (access == BT_WRITE && page_access == BT_READ) + { + /* trade in our read lock for a write lock */ + _bt_unlockbuf(rel, *bufP); + _bt_lockbuf(rel, *bufP, BT_WRITE); + + /* + * Race -- the leaf page may have split after we dropped the read lock + * but before we acquired a write lock. If it has, we may need to + * move right to its new sibling. Do that. + */ + *bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE, + snapshot); + } + + return stack_in; +} + +/* + * _bt_moveright() -- move right in the btree if necessary. + * + * When we follow a pointer to reach a page, it is possible that + * the page has changed in the meanwhile. If this happens, we're + * guaranteed that the page has "split right" -- that is, that any + * data that appeared on the page originally is either on the page + * or strictly to the right of it. + * + * This routine decides whether or not we need to move right in the + * tree by examining the high key entry on the page. If that entry is + * strictly less than the scankey, or <= the scankey in the + * key.nextkey=true case, then we followed the wrong link and we need + * to move right. + * + * The passed insertion-type scankey can omit the rightmost column(s) of the + * index. (see nbtree/README) + * + * When key.nextkey is false (the usual case), we are looking for the first + * item >= key. When key.nextkey is true, we are looking for the first item + * strictly greater than key. + * + * If forupdate is true, we will attempt to finish any incomplete splits + * that we encounter. This is required when locking a target page for an + * insertion, because we don't allow inserting on a page before the split + * is completed. 'stack' is only used if forupdate is true. + * + * On entry, we have the buffer pinned and a lock of the type specified by + * 'access'. If we move right, we release the buffer and lock and acquire + * the same on the right sibling. Return value is the buffer we stop at. + * + * If the snapshot parameter is not NULL, "old snapshot" checking will take + * place during the descent through the tree. This is not needed when + * positioning for an insert or delete, so NULL is used for those cases. + */ +Buffer +_bt_moveright(Relation rel, + BTScanInsert key, + Buffer buf, + bool forupdate, + BTStack stack, + int access, + Snapshot snapshot) +{ + Page page; + BTPageOpaque opaque; + int32 cmpval; + + /* + * When nextkey = false (normal case): if the scan key that brought us to + * this page is > the high key stored on the page, then the page has split + * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could + * have some duplicates to the right as well as the left, but that's + * something that's only ever dealt with on the leaf level, after + * _bt_search has found an initial leaf page.) + * + * When nextkey = true: move right if the scan key is >= page's high key. + * (Note that key.scantid cannot be set in this case.) + * + * The page could even have split more than once, so scan as far as + * needed. + * + * We also have to move right if we followed a link that brought us to a + * dead page. + */ + cmpval = key->nextkey ? 0 : 1; + + for (;;) + { + page = BufferGetPage(buf); + TestForOldSnapshot(snapshot, rel, page); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + if (P_RIGHTMOST(opaque)) + break; + + /* + * Finish any incomplete splits we encounter along the way. 
+ */ + if (forupdate && P_INCOMPLETE_SPLIT(opaque)) + { + BlockNumber blkno = BufferGetBlockNumber(buf); + + /* upgrade our lock if necessary */ + if (access == BT_READ) + { + _bt_unlockbuf(rel, buf); + _bt_lockbuf(rel, buf, BT_WRITE); + } + + if (P_INCOMPLETE_SPLIT(opaque)) + _bt_finish_split(rel, buf, stack); + else + _bt_relbuf(rel, buf); + + /* re-acquire the lock in the right mode, and re-check */ + buf = _bt_getbuf(rel, blkno, access); + continue; + } + + if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval) + { + /* step right one page */ + buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access); + continue; + } + else + break; + } + + if (P_IGNORE(opaque)) + elog(ERROR, "fell off the end of index \"%s\"", + RelationGetRelationName(rel)); + + return buf; +} + +/* + * _bt_binsrch() -- Do a binary search for a key on a particular page. + * + * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first + * key >= given scankey, or > scankey if nextkey is true. (NOTE: in + * particular, this means it is possible to return a value 1 greater than the + * number of keys on the page, if the scankey is > all keys on the page.) + * + * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber + * of the last key < given scankey, or last key <= given scankey if nextkey + * is true. (Since _bt_compare treats the first data key of such a page as + * minus infinity, there will be at least one key < scankey, so the result + * always points at one of the keys on the page.) This key indicates the + * right place to descend to be sure we find all leaf keys >= given scankey + * (or leaf keys > given scankey when nextkey is true). + * + * This procedure is not responsible for walking right, it just examines + * the given page. _bt_binsrch() has no lock or refcount side effects + * on the buffer. + */ +static OffsetNumber +_bt_binsrch(Relation rel, + BTScanInsert key, + Buffer buf) +{ + Page page; + BTPageOpaque opaque; + OffsetNumber low, + high; + int32 result, + cmpval; + + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* Requesting nextkey semantics while using scantid seems nonsensical */ + Assert(!key->nextkey || key->scantid == NULL); + /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */ + Assert(!P_ISLEAF(opaque) || key->scantid == NULL); + + low = P_FIRSTDATAKEY(opaque); + high = PageGetMaxOffsetNumber(page); + + /* + * If there are no keys on the page, return the first available slot. Note + * this covers two cases: the page is really empty (no keys), or it + * contains only a high key. The latter case is possible after vacuuming. + * This can never happen on an internal page, however, since they are + * never empty (an internal page must have children). + */ + if (unlikely(high < low)) + return low; + + /* + * Binary search to find the first key on the page >= scan key, or first + * key > scankey when nextkey is true. + * + * For nextkey=false (cmpval=1), the loop invariant is: all slots before + * 'low' are < scan key, all slots at or after 'high' are >= scan key. + * + * For nextkey=true (cmpval=0), the loop invariant is: all slots before + * 'low' are <= scan key, all slots at or after 'high' are > scan key. + * + * We can fall out when high == low. + */ + high++; /* establish the loop invariant for high */ + + cmpval = key->nextkey ? 
0 : 1; /* select comparison value */ + + while (high > low) + { + OffsetNumber mid = low + ((high - low) / 2); + + /* We have low <= mid < high, so mid points at a real slot */ + + result = _bt_compare(rel, key, page, mid); + + if (result >= cmpval) + low = mid + 1; + else + high = mid; + } + + /* + * At this point we have high == low, but be careful: they could point + * past the last slot on the page. + * + * On a leaf page, we always return the first key >= scan key (resp. > + * scan key), which could be the last slot + 1. + */ + if (P_ISLEAF(opaque)) + return low; + + /* + * On a non-leaf page, return the last key < scan key (resp. <= scan key). + * There must be one if _bt_compare() is playing by the rules. + */ + Assert(low > P_FIRSTDATAKEY(opaque)); + + return OffsetNumberPrev(low); +} + +/* + * + * _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search. + * + * Like _bt_binsrch(), but with support for caching the binary search + * bounds. Only used during insertion, and only on the leaf page that it + * looks like caller will insert tuple on. Exclusive-locked and pinned + * leaf page is contained within insertstate. + * + * Caches the bounds fields in insertstate so that a subsequent call can + * reuse the low and strict high bounds of original binary search. Callers + * that use these fields directly must be prepared for the case where low + * and/or stricthigh are not on the same page (one or both exceed maxoff + * for the page). The case where there are no items on the page (high < + * low) makes bounds invalid. + * + * Caller is responsible for invalidating bounds when it modifies the page + * before calling here a second time, and for dealing with posting list + * tuple matches (callers can use insertstate's postingoff field to + * determine which existing heap TID will need to be replaced by a posting + * list split). + */ +OffsetNumber +_bt_binsrch_insert(Relation rel, BTInsertState insertstate) +{ + BTScanInsert key = insertstate->itup_key; + Page page; + BTPageOpaque opaque; + OffsetNumber low, + high, + stricthigh; + int32 result, + cmpval; + + page = BufferGetPage(insertstate->buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + Assert(P_ISLEAF(opaque)); + Assert(!key->nextkey); + Assert(insertstate->postingoff == 0); + + if (!insertstate->bounds_valid) + { + /* Start new binary search */ + low = P_FIRSTDATAKEY(opaque); + high = PageGetMaxOffsetNumber(page); + } + else + { + /* Restore result of previous binary search against same page */ + low = insertstate->low; + high = insertstate->stricthigh; + } + + /* If there are no keys on the page, return the first available slot */ + if (unlikely(high < low)) + { + /* Caller can't reuse bounds */ + insertstate->low = InvalidOffsetNumber; + insertstate->stricthigh = InvalidOffsetNumber; + insertstate->bounds_valid = false; + return low; + } + + /* + * Binary search to find the first key on the page >= scan key. (nextkey + * is always false when inserting). + * + * The loop invariant is: all slots before 'low' are < scan key, all slots + * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is + * maintained to save additional search effort for caller. + * + * We can fall out when high == low. 
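+ *
+ * Editor's illustration (hypothetical names, not part of this file): the
+ * same invariant-preserving "lower bound" search, stripped to its core.
+ * "high" starts one past the last slot, and each step preserves the
+ * invariant that slots before low are < key while slots at or after
+ * high are >= key:
+ *
+ *     static int
+ *     sketch_lower_bound(const int *arr, int nelem, int key)
+ *     {
+ *         int low = 0;
+ *         int high = nelem;
+ *
+ *         while (high > low)
+ *         {
+ *             int mid = low + (high - low) / 2;
+ *
+ *             if (arr[mid] < key)
+ *                 low = mid + 1;
+ *             else
+ *                 high = mid;
+ *         }
+ *
+ *         return low;
+ *     }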
+ */ + if (!insertstate->bounds_valid) + high++; /* establish the loop invariant for high */ + stricthigh = high; /* high initially strictly higher */ + + cmpval = 1; /* !nextkey comparison value */ + + while (high > low) + { + OffsetNumber mid = low + ((high - low) / 2); + + /* We have low <= mid < high, so mid points at a real slot */ + + result = _bt_compare(rel, key, page, mid); + + if (result >= cmpval) + low = mid + 1; + else + { + high = mid; + if (result != 0) + stricthigh = high; + } + + /* + * If tuple at offset located by binary search is a posting list whose + * TID range overlaps with caller's scantid, perform posting list + * binary search to set postingoff for caller. Caller must split the + * posting list when postingoff is set. This should happen + * infrequently. + */ + if (unlikely(result == 0 && key->scantid != NULL)) + { + /* + * postingoff should never be set more than once per leaf page + * binary search. That would mean that there are duplicate table + * TIDs in the index, which is never okay. Check for that here. + */ + if (insertstate->postingoff != 0) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"", + ItemPointerGetBlockNumber(key->scantid), + ItemPointerGetOffsetNumber(key->scantid), + low, stricthigh, + BufferGetBlockNumber(insertstate->buf), + RelationGetRelationName(rel)))); + + insertstate->postingoff = _bt_binsrch_posting(key, page, mid); + } + } + + /* + * On a leaf page, a binary search always returns the first key >= scan + * key (at least in !nextkey case), which could be the last slot + 1. This + * is also the lower bound of cached search. + * + * stricthigh may also be the last slot + 1, which prevents caller from + * using bounds directly, but is still useful to us if we're called a + * second time with cached bounds (cached low will be < stricthigh when + * that happens). + */ + insertstate->low = low; + insertstate->stricthigh = stricthigh; + insertstate->bounds_valid = true; + + return low; +} + +/*---------- + * _bt_binsrch_posting() -- posting list binary search. + * + * Helper routine for _bt_binsrch_insert(). + * + * Returns offset into posting list where caller's scantid belongs. + *---------- + */ +static int +_bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum) +{ + IndexTuple itup; + ItemId itemid; + int low, + high, + mid, + res; + + /* + * If this isn't a posting tuple, then the index must be corrupt (if it is + * an ordinary non-pivot tuple then there must be an existing tuple with a + * heap TID that equals inserter's new heap TID/scantid). Defensively + * check that tuple is a posting list tuple whose posting list range + * includes caller's scantid. + * + * (This is also needed because contrib/amcheck's rootdescend option needs + * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().) + */ + itemid = PageGetItemId(page, offnum); + itup = (IndexTuple) PageGetItem(page, itemid); + if (!BTreeTupleIsPosting(itup)) + return 0; + + Assert(key->heapkeyspace && key->allequalimage); + + /* + * In the event that posting list tuple has LP_DEAD bit set, indicate this + * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A + * second call to _bt_binsrch_insert() can take place when its caller has + * removed the dead item. 
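+ *
+ * Worked example (editor's note; the TIDs are made up): for a posting
+ * list holding (1,1), (1,5), (2,3) and a scantid of (1,7), the loop
+ * below converges on offset 2, the position (1,7) would occupy, and
+ * that return value becomes the caller's postingoff for the subsequent
+ * posting list split.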
+ */ + if (ItemIdIsDead(itemid)) + return -1; + + /* "high" is past end of posting list for loop invariant */ + low = 0; + high = BTreeTupleGetNPosting(itup); + Assert(high >= 2); + + while (high > low) + { + mid = low + ((high - low) / 2); + res = ItemPointerCompare(key->scantid, + BTreeTupleGetPostingN(itup, mid)); + + if (res > 0) + low = mid + 1; + else if (res < 0) + high = mid; + else + return mid; + } + + /* Exact match not found */ + return low; +} + +/*---------- + * _bt_compare() -- Compare insertion-type scankey to tuple on a page. + * + * page/offnum: location of btree item to be compared to. + * + * This routine returns: + * <0 if scankey < tuple at offnum; + * 0 if scankey == tuple at offnum; + * >0 if scankey > tuple at offnum. + * + * NULLs in the keys are treated as sortable values. Therefore + * "equality" does not necessarily mean that the item should be returned + * to the caller as a matching key. Similarly, an insertion scankey + * with its scantid set is treated as equal to a posting tuple whose TID + * range overlaps with their scantid. There generally won't be a + * matching TID in the posting tuple, which caller must handle + * themselves (e.g., by splitting the posting list tuple). + * + * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be + * "minus infinity": this routine will always claim it is less than the + * scankey. The actual key value stored is explicitly truncated to 0 + * attributes (explicitly minus infinity) with version 3+ indexes, but + * that isn't relied upon. This allows us to implement the Lehman and + * Yao convention that the first down-link pointer is before the first + * key. See backend/access/nbtree/README for details. + *---------- + */ +int32 +_bt_compare(Relation rel, + BTScanInsert key, + Page page, + OffsetNumber offnum) +{ + TupleDesc itupdesc = RelationGetDescr(rel); + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + IndexTuple itup; + ItemPointer heapTid; + ScanKey scankey; + int ncmpkey; + int ntupatts; + int32 result; + + Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum)); + Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel)); + Assert(key->heapkeyspace || key->scantid == NULL); + + /* + * Force result ">" if target item is first data item on an internal page + * --- see NOTE above. + */ + if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque)) + return 1; + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + ntupatts = BTreeTupleGetNAtts(itup, rel); + + /* + * The scan key is set up with the attribute number associated with each + * term in the key. It is important that, if the index is multi-key, the + * scan contain the first k key attributes, and that they be in order. If + * you think about how multi-key ordering works, you'll understand why + * this is. + * + * We don't test for violation of this condition here, however. The + * initial setup for the index scan had better have gotten it right (see + * _bt_first). 
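+ *
+ * Editor's example of the sign convention used below, assuming an
+ * ascending int4 column: if the index tuple holds 7 and the scankey
+ * argument is 5, the support function is handed (7, 5) and reports a
+ * positive result ("index value is larger").  After inversion we return
+ * a negative result, i.e. scankey < tuple, which is the ordering this
+ * function is documented to report.  On a DESC column the sign is left
+ * alone, matching the reversed sort order.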
+ */ + + ncmpkey = Min(ntupatts, key->keysz); + Assert(key->heapkeyspace || ncmpkey == key->keysz); + Assert(!BTreeTupleIsPosting(itup) || key->allequalimage); + scankey = key->scankeys; + for (int i = 1; i <= ncmpkey; i++) + { + Datum datum; + bool isNull; + + datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull); + + if (scankey->sk_flags & SK_ISNULL) /* key is NULL */ + { + if (isNull) + result = 0; /* NULL "=" NULL */ + else if (scankey->sk_flags & SK_BT_NULLS_FIRST) + result = -1; /* NULL "<" NOT_NULL */ + else + result = 1; /* NULL ">" NOT_NULL */ + } + else if (isNull) /* key is NOT_NULL and item is NULL */ + { + if (scankey->sk_flags & SK_BT_NULLS_FIRST) + result = 1; /* NOT_NULL ">" NULL */ + else + result = -1; /* NOT_NULL "<" NULL */ + } + else + { + /* + * The sk_func needs to be passed the index value as left arg and + * the sk_argument as right arg (they might be of different + * types). Since it is convenient for callers to think of + * _bt_compare as comparing the scankey to the index item, we have + * to flip the sign of the comparison result. (Unless it's a DESC + * column, in which case we *don't* flip the sign.) + */ + result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func, + scankey->sk_collation, + datum, + scankey->sk_argument)); + + if (!(scankey->sk_flags & SK_BT_DESC)) + INVERT_COMPARE_RESULT(result); + } + + /* if the keys are unequal, return the difference */ + if (result != 0) + return result; + + scankey++; + } + + /* + * All non-truncated attributes (other than heap TID) were found to be + * equal. Treat truncated attributes as minus infinity when scankey has a + * key attribute value that would otherwise be compared directly. + * + * Note: it doesn't matter if ntupatts includes non-key attributes; + * scankey won't, so explicitly excluding non-key attributes isn't + * necessary. + */ + if (key->keysz > ntupatts) + return 1; + + /* + * Use the heap TID attribute and scantid to try to break the tie. The + * rules are the same as any other key attribute -- only the + * representation differs. + */ + heapTid = BTreeTupleGetHeapTID(itup); + if (key->scantid == NULL) + { + /* + * Most searches have a scankey that is considered greater than a + * truncated pivot tuple if and when the scankey has equal values for + * attributes up to and including the least significant untruncated + * attribute in tuple. + * + * For example, if an index has the minimum two attributes (single + * user key attribute, plus heap TID attribute), and a page's high key + * is ('foo', -inf), and scankey is ('foo', <omitted>), the search + * will not descend to the page to the left. The search will descend + * right instead. The truncated attribute in pivot tuple means that + * all non-pivot tuples on the page to the left are strictly < 'foo', + * so it isn't necessary to descend left. In other words, search + * doesn't have to descend left because it isn't interested in a match + * that has a heap TID value of -inf. + * + * However, some searches (pivotsearch searches) actually require that + * we descend left when this happens. -inf is treated as a possible + * match for omitted scankey attribute(s). This is needed by page + * deletion, which must re-find leaf pages that are targets for + * deletion using their high keys. + * + * Note: the heap TID part of the test ensures that scankey is being + * compared to a pivot tuple with one or more truncated key + * attributes. 
+ * + * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the + * left here, since they have no heap TID attribute (and cannot have + * any -inf key values in any case, since truncation can only remove + * non-key attributes). !heapkeyspace searches must always be + * prepared to deal with matches on both sides of the pivot once the + * leaf level is reached. + */ + if (key->heapkeyspace && !key->pivotsearch && + key->keysz == ntupatts && heapTid == NULL) + return 1; + + /* All provided scankey arguments found to be equal */ + return 0; + } + + /* + * Treat truncated heap TID as minus infinity, since scankey has a key + * attribute value (scantid) that would otherwise be compared directly + */ + Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel)); + if (heapTid == NULL) + return 1; + + /* + * Scankey must be treated as equal to a posting list tuple if its scantid + * value falls within the range of the posting list. In all other cases + * there can only be a single heap TID value, which is compared directly + * with scantid. + */ + Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel)); + result = ItemPointerCompare(key->scantid, heapTid); + if (result <= 0 || !BTreeTupleIsPosting(itup)) + return result; + else + { + result = ItemPointerCompare(key->scantid, + BTreeTupleGetMaxHeapTID(itup)); + if (result > 0) + return 1; + } + + return 0; +} + +/* + * _bt_first() -- Find the first item in a scan. + * + * We need to be clever about the direction of scan, the search + * conditions, and the tree ordering. We find the first item (or, + * if backwards scan, the last item) in the tree that satisfies the + * qualifications in the scan key. On success exit, the page containing + * the current index tuple is pinned but not locked, and data about + * the matching tuple(s) on the page has been loaded into so->currPos. + * scan->xs_ctup.t_self is set to the heap TID of the current tuple, + * and if requested, scan->xs_itup points to a copy of the index tuple. + * + * If there are no matching items in the index, we return false, with no + * pins or locks held. + * + * Note that scan->keyData[], and the so->keyData[] scankey built from it, + * are both search-type scankeys (see nbtree/README for more about this). + * Within this routine, we build a temporary insertion-type scankey to use + * in locating the scan start position. + */ +bool +_bt_first(IndexScanDesc scan, ScanDirection dir) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Buffer buf; + BTStack stack; + OffsetNumber offnum; + StrategyNumber strat; + bool nextkey; + bool goback; + BTScanInsertData inskey; + ScanKey startKeys[INDEX_MAX_KEYS]; + ScanKeyData notnullkeys[INDEX_MAX_KEYS]; + int keysCount = 0; + int i; + bool status; + StrategyNumber strat_total; + BTScanPosItem *currItem; + BlockNumber blkno; + + Assert(!BTScanPosIsValid(so->currPos)); + + pgstat_count_index_scan(rel); + + /* + * Examine the scan keys and eliminate any redundant keys; also mark the + * keys that must be matched to continue the scan. + */ + _bt_preprocess_keys(scan); + + /* + * Quit now if _bt_preprocess_keys() discovered that the scan keys can + * never be satisfied (eg, x == 1 AND x > 2). + */ + if (!so->qual_ok) + { + /* Notify any other workers that we're done with this scan key. */ + _bt_parallel_done(scan); + return false; + } + + /* + * For parallel scans, get the starting page from shared state. 
If the + * scan has not started, proceed to find out first leaf page in the usual + * way while keeping other participating processes waiting. If the scan + * has already begun, use the page number from the shared structure. + */ + if (scan->parallel_scan != NULL) + { + status = _bt_parallel_seize(scan, &blkno); + if (!status) + return false; + else if (blkno == P_NONE) + { + _bt_parallel_done(scan); + return false; + } + else if (blkno != InvalidBlockNumber) + { + if (!_bt_parallel_readpage(scan, blkno, dir)) + return false; + goto readcomplete; + } + } + + /*---------- + * Examine the scan keys to discover where we need to start the scan. + * + * We want to identify the keys that can be used as starting boundaries; + * these are =, >, or >= keys for a forward scan or =, <, <= keys for + * a backwards scan. We can use keys for multiple attributes so long as + * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept + * a > or < boundary or find an attribute with no boundary (which can be + * thought of as the same as "> -infinity"), we can't use keys for any + * attributes to its right, because it would break our simplistic notion + * of what initial positioning strategy to use. + * + * When the scan keys include cross-type operators, _bt_preprocess_keys + * may not be able to eliminate redundant keys; in such cases we will + * arbitrarily pick a usable one for each attribute. This is correct + * but possibly not optimal behavior. (For example, with keys like + * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when + * x=5 would be more efficient.) Since the situation only arises given + * a poorly-worded query plus an incomplete opfamily, live with it. + * + * When both equality and inequality keys appear for a single attribute + * (again, only possible when cross-type operators appear), we *must* + * select one of the equality keys for the starting point, because + * _bt_checkkeys() will stop the scan as soon as an equality qual fails. + * For example, if we have keys like "x >= 4 AND x = 10" and we elect to + * start at x=4, we will fail and stop before reaching x=10. If multiple + * equality quals survive preprocessing, however, it doesn't matter which + * one we use --- by definition, they are either redundant or + * contradictory. + * + * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier. + * If the index stores nulls at the end of the index we'll be starting + * from, and we have no boundary key for the column (which means the key + * we deduced NOT NULL from is an inequality key that constrains the other + * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to + * use as a boundary key. If we didn't do this, we might find ourselves + * traversing a lot of null entries at the start of the scan. + * + * In this loop, row-comparison keys are treated the same as keys on their + * first (leftmost) columns. We'll add on lower-order columns of the row + * comparison below, if possible. + * + * The selected scan keys (at most one per index column) are remembered by + * storing their addresses into the local startKeys[] array. + *---------- + */ + strat_total = BTEqualStrategyNumber; + if (so->numberOfKeys > 0) + { + AttrNumber curattr; + ScanKey chosen; + ScanKey impliesNN; + ScanKey cur; + + /* + * chosen is the so-far-chosen key for the current attribute, if any. + * We don't cast the decision in stone until we reach keys for the + * next attribute. 
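+ *
+ * Editor's example with hypothetical quals, forward scan on a
+ * four-column index: given "a = 1 AND b >= 2 AND c > 3 AND d = 4", the
+ * loop below accepts a = 1 and b >= 2, then accepts c > 3 and stops,
+ * because a strict inequality ends the usable prefix.  d = 4 plays no
+ * part in initial positioning (though it remains an ordinary scan
+ * qual), and strat_total ends up as the > strategy.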
+ */ + curattr = 1; + chosen = NULL; + /* Also remember any scankey that implies a NOT NULL constraint */ + impliesNN = NULL; + + /* + * Loop iterates from 0 to numberOfKeys inclusive; we use the last + * pass to handle after-last-key processing. Actual exit from the + * loop is at one of the "break" statements below. + */ + for (cur = so->keyData, i = 0;; cur++, i++) + { + if (i >= so->numberOfKeys || cur->sk_attno != curattr) + { + /* + * Done looking at keys for curattr. If we didn't find a + * usable boundary key, see if we can deduce a NOT NULL key. + */ + if (chosen == NULL && impliesNN != NULL && + ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ? + ScanDirectionIsForward(dir) : + ScanDirectionIsBackward(dir))) + { + /* Yes, so build the key in notnullkeys[keysCount] */ + chosen = ¬nullkeys[keysCount]; + ScanKeyEntryInitialize(chosen, + (SK_SEARCHNOTNULL | SK_ISNULL | + (impliesNN->sk_flags & + (SK_BT_DESC | SK_BT_NULLS_FIRST))), + curattr, + ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ? + BTGreaterStrategyNumber : + BTLessStrategyNumber), + InvalidOid, + InvalidOid, + InvalidOid, + (Datum) 0); + } + + /* + * If we still didn't find a usable boundary key, quit; else + * save the boundary key pointer in startKeys. + */ + if (chosen == NULL) + break; + startKeys[keysCount++] = chosen; + + /* + * Adjust strat_total, and quit if we have stored a > or < + * key. + */ + strat = chosen->sk_strategy; + if (strat != BTEqualStrategyNumber) + { + strat_total = strat; + if (strat == BTGreaterStrategyNumber || + strat == BTLessStrategyNumber) + break; + } + + /* + * Done if that was the last attribute, or if next key is not + * in sequence (implying no boundary key is available for the + * next attribute). + */ + if (i >= so->numberOfKeys || + cur->sk_attno != curattr + 1) + break; + + /* + * Reset for next attr. + */ + curattr = cur->sk_attno; + chosen = NULL; + impliesNN = NULL; + } + + /* + * Can we use this key as a starting boundary for this attr? + * + * If not, does it imply a NOT NULL constraint? (Because + * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber, + * *any* inequality key works for that; we need not test.) + */ + switch (cur->sk_strategy) + { + case BTLessStrategyNumber: + case BTLessEqualStrategyNumber: + if (chosen == NULL) + { + if (ScanDirectionIsBackward(dir)) + chosen = cur; + else + impliesNN = cur; + } + break; + case BTEqualStrategyNumber: + /* override any non-equality choice */ + chosen = cur; + break; + case BTGreaterEqualStrategyNumber: + case BTGreaterStrategyNumber: + if (chosen == NULL) + { + if (ScanDirectionIsForward(dir)) + chosen = cur; + else + impliesNN = cur; + } + break; + } + } + } + + /* + * If we found no usable boundary keys, we have to start from one end of + * the tree. Walk down that edge to the first or last key, and scan from + * there. + */ + if (keysCount == 0) + { + bool match; + + match = _bt_endpoint(scan, dir); + + if (!match) + { + /* No match, so mark (parallel) scan finished */ + _bt_parallel_done(scan); + } + + return match; + } + + /* + * We want to start the scan somewhere within the index. Set up an + * insertion scankey we can use to search for the boundary point we + * identified above. The insertion scankey is built using the keys + * identified by startKeys[]. (Remaining insertion scankey fields are + * initialized after initial-positioning strategy is finalized.) 
+ */ + Assert(keysCount <= INDEX_MAX_KEYS); + for (i = 0; i < keysCount; i++) + { + ScanKey cur = startKeys[i]; + + Assert(cur->sk_attno == i + 1); + + if (cur->sk_flags & SK_ROW_HEADER) + { + /* + * Row comparison header: look to the first row member instead. + * + * The member scankeys are already in insertion format (ie, they + * have sk_func = 3-way-comparison function), but we have to watch + * out for nulls, which _bt_preprocess_keys didn't check. A null + * in the first row member makes the condition unmatchable, just + * like qual_ok = false. + */ + ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument); + + Assert(subkey->sk_flags & SK_ROW_MEMBER); + if (subkey->sk_flags & SK_ISNULL) + { + _bt_parallel_done(scan); + return false; + } + memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData)); + + /* + * If the row comparison is the last positioning key we accepted, + * try to add additional keys from the lower-order row members. + * (If we accepted independent conditions on additional index + * columns, we use those instead --- doesn't seem worth trying to + * determine which is more restrictive.) Note that this is OK + * even if the row comparison is of ">" or "<" type, because the + * condition applied to all but the last row member is effectively + * ">=" or "<=", and so the extra keys don't break the positioning + * scheme. But, by the same token, if we aren't able to use all + * the row members, then the part of the row comparison that we + * did use has to be treated as just a ">=" or "<=" condition, and + * so we'd better adjust strat_total accordingly. + */ + if (i == keysCount - 1) + { + bool used_all_subkeys = false; + + Assert(!(subkey->sk_flags & SK_ROW_END)); + for (;;) + { + subkey++; + Assert(subkey->sk_flags & SK_ROW_MEMBER); + if (subkey->sk_attno != keysCount + 1) + break; /* out-of-sequence, can't use it */ + if (subkey->sk_strategy != cur->sk_strategy) + break; /* wrong direction, can't use it */ + if (subkey->sk_flags & SK_ISNULL) + break; /* can't use null keys */ + Assert(keysCount < INDEX_MAX_KEYS); + memcpy(inskey.scankeys + keysCount, subkey, + sizeof(ScanKeyData)); + keysCount++; + if (subkey->sk_flags & SK_ROW_END) + { + used_all_subkeys = true; + break; + } + } + if (!used_all_subkeys) + { + switch (strat_total) + { + case BTLessStrategyNumber: + strat_total = BTLessEqualStrategyNumber; + break; + case BTGreaterStrategyNumber: + strat_total = BTGreaterEqualStrategyNumber; + break; + } + } + break; /* done with outer loop */ + } + } + else + { + /* + * Ordinary comparison key. Transform the search-style scan key + * to an insertion scan key by replacing the sk_func with the + * appropriate btree comparison function. + * + * If scankey operator is not a cross-type comparison, we can use + * the cached comparison function; otherwise gotta look it up in + * the catalogs. (That can't lead to infinite recursion, since no + * indexscan initiated by syscache lookup will use cross-data-type + * operators.) + * + * We support the convention that sk_subtype == InvalidOid means + * the opclass input type; this is a hack to simplify life for + * ScanKeyInit(). 
+ */ + if (cur->sk_subtype == rel->rd_opcintype[i] || + cur->sk_subtype == InvalidOid) + { + FmgrInfo *procinfo; + + procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC); + ScanKeyEntryInitializeWithInfo(inskey.scankeys + i, + cur->sk_flags, + cur->sk_attno, + InvalidStrategy, + cur->sk_subtype, + cur->sk_collation, + procinfo, + cur->sk_argument); + } + else + { + RegProcedure cmp_proc; + + cmp_proc = get_opfamily_proc(rel->rd_opfamily[i], + rel->rd_opcintype[i], + cur->sk_subtype, + BTORDER_PROC); + if (!RegProcedureIsValid(cmp_proc)) + elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"", + BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype, + cur->sk_attno, RelationGetRelationName(rel)); + ScanKeyEntryInitialize(inskey.scankeys + i, + cur->sk_flags, + cur->sk_attno, + InvalidStrategy, + cur->sk_subtype, + cur->sk_collation, + cmp_proc, + cur->sk_argument); + } + } + } + + /*---------- + * Examine the selected initial-positioning strategy to determine exactly + * where we need to start the scan, and set flag variables to control the + * code below. + * + * If nextkey = false, _bt_search and _bt_binsrch will locate the first + * item >= scan key. If nextkey = true, they will locate the first + * item > scan key. + * + * If goback = true, we will then step back one item, while if + * goback = false, we will start the scan on the located item. + *---------- + */ + switch (strat_total) + { + case BTLessStrategyNumber: + + /* + * Find first item >= scankey, then back up one to arrive at last + * item < scankey. (Note: this positioning strategy is only used + * for a backward scan, so that is always the correct starting + * position.) + */ + nextkey = false; + goback = true; + break; + + case BTLessEqualStrategyNumber: + + /* + * Find first item > scankey, then back up one to arrive at last + * item <= scankey. (Note: this positioning strategy is only used + * for a backward scan, so that is always the correct starting + * position.) + */ + nextkey = true; + goback = true; + break; + + case BTEqualStrategyNumber: + + /* + * If a backward scan was specified, need to start with last equal + * item not first one. + */ + if (ScanDirectionIsBackward(dir)) + { + /* + * This is the same as the <= strategy. We will check at the + * end whether the found item is actually =. + */ + nextkey = true; + goback = true; + } + else + { + /* + * This is the same as the >= strategy. We will check at the + * end whether the found item is actually =. + */ + nextkey = false; + goback = false; + } + break; + + case BTGreaterEqualStrategyNumber: + + /* + * Find first item >= scankey. (This is only used for forward + * scans.) + */ + nextkey = false; + goback = false; + break; + + case BTGreaterStrategyNumber: + + /* + * Find first item > scankey. (This is only used for forward + * scans.) + */ + nextkey = true; + goback = false; + break; + + default: + /* can't get here, but keep compiler quiet */ + elog(ERROR, "unrecognized strat_total: %d", (int) strat_total); + return false; + } + + /* Initialize remaining insertion scan key fields */ + _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage); + inskey.anynullkeys = false; /* unused */ + inskey.nextkey = nextkey; + inskey.pivotsearch = false; + inskey.scantid = NULL; + inskey.keysz = keysCount; + + /* + * Use the manufactured insertion scan key to descend the tree and + * position ourselves on the target leaf page. 
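+ *
+ * Recap of the switch above (editor's summary):
+ *
+ *     strat_total             nextkey   goback
+ *     <                       false     true
+ *     <=                      true      true
+ *     =  (backward scan)      true      true
+ *     =  (forward scan)       false     false
+ *     >=                      false     false
+ *     >                       true      false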
+ */ + stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot); + + /* don't need to keep the stack around... */ + _bt_freestack(stack); + + if (!BufferIsValid(buf)) + { + /* + * We only get here if the index is completely empty. Lock relation + * because nothing finer to lock exists. + */ + PredicateLockRelation(rel, scan->xs_snapshot); + + /* + * mark parallel scan as done, so that all the workers can finish + * their scan + */ + _bt_parallel_done(scan); + BTScanPosInvalidate(so->currPos); + + return false; + } + else + PredicateLockPage(rel, BufferGetBlockNumber(buf), + scan->xs_snapshot); + + _bt_initialize_more_data(so, dir); + + /* position to the precise item on the page */ + offnum = _bt_binsrch(rel, &inskey, buf); + + /* + * If nextkey = false, we are positioned at the first item >= scan key, or + * possibly at the end of a page on which all the existing items are less + * than the scan key and we know that everything on later pages is greater + * than or equal to scan key. + * + * If nextkey = true, we are positioned at the first item > scan key, or + * possibly at the end of a page on which all the existing items are less + * than or equal to the scan key and we know that everything on later + * pages is greater than scan key. + * + * The actually desired starting point is either this item or the prior + * one, or in the end-of-page case it's the first item on the next page or + * the last item on this page. Adjust the starting offset if needed. (If + * this results in an offset before the first item or after the last one, + * _bt_readpage will report no items found, and then we'll step to the + * next page as needed.) + */ + if (goback) + offnum = OffsetNumberPrev(offnum); + + /* remember which buffer we have pinned, if any */ + Assert(!BTScanPosIsValid(so->currPos)); + so->currPos.buf = buf; + + /* + * Now load data from the first page of the scan. + */ + if (!_bt_readpage(scan, dir, offnum)) + { + /* + * There's no actually-matching data on this page. Try to advance to + * the next page. Return false if there's no matching data at all. + */ + _bt_unlockbuf(scan->indexRelation, so->currPos.buf); + if (!_bt_steppage(scan, dir)) + return false; + } + else + { + /* Drop the lock, and maybe the pin, on the current page */ + _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + } + +readcomplete: + /* OK, itemIndex says what to return */ + currItem = &so->currPos.items[so->currPos.itemIndex]; + scan->xs_heaptid = currItem->heapTid; + if (scan->xs_want_itup) + scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + + return true; +} + +/* + * _bt_next() -- Get the next item in a scan. + * + * On entry, so->currPos describes the current page, which may be pinned + * but is not locked, and so->currPos.itemIndex identifies which item was + * previously returned. + * + * On successful exit, scan->xs_ctup.t_self is set to the TID of the + * next heap tuple, and if requested, scan->xs_itup points to a copy of + * the index tuple. so->currPos is updated as needed. + * + * On failure exit (no more tuples), we release pin and set + * so->currPos.buf to InvalidBuffer. + */ +bool +_bt_next(IndexScanDesc scan, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTScanPosItem *currItem; + + /* + * Advance to next tuple on current page; or if there's no more, try to + * step to the next page with data. 
+ */ + if (ScanDirectionIsForward(dir)) + { + if (++so->currPos.itemIndex > so->currPos.lastItem) + { + if (!_bt_steppage(scan, dir)) + return false; + } + } + else + { + if (--so->currPos.itemIndex < so->currPos.firstItem) + { + if (!_bt_steppage(scan, dir)) + return false; + } + } + + /* OK, itemIndex says what to return */ + currItem = &so->currPos.items[so->currPos.itemIndex]; + scan->xs_heaptid = currItem->heapTid; + if (scan->xs_want_itup) + scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + + return true; +} + +/* + * _bt_readpage() -- Load data from current index page into so->currPos + * + * Caller must have pinned and read-locked so->currPos.buf; the buffer's state + * is not changed here. Also, currPos.moreLeft and moreRight must be valid; + * they are updated as appropriate. All other fields of so->currPos are + * initialized from scratch here. + * + * We scan the current page starting at offnum and moving in the indicated + * direction. All items matching the scan keys are loaded into currPos.items. + * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports + * that there can be no more matching tuples in the current scan direction. + * + * In the case of a parallel scan, caller must have called _bt_parallel_seize + * prior to calling this function; this function will invoke + * _bt_parallel_release before returning. + * + * Returns true if any matching items found on the page, false if none. + */ +static bool +_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Page page; + BTPageOpaque opaque; + OffsetNumber minoff; + OffsetNumber maxoff; + int itemIndex; + bool continuescan; + int indnatts; + + /* + * We must have the buffer pinned and locked, but the usual macro can't be + * used here; this function is what makes it good for currPos. + */ + Assert(BufferIsValid(so->currPos.buf)); + + page = BufferGetPage(so->currPos.buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* allow next page be processed by parallel worker */ + if (scan->parallel_scan) + { + if (ScanDirectionIsForward(dir)) + _bt_parallel_release(scan, opaque->btpo_next); + else + _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf)); + } + + continuescan = true; /* default assumption */ + indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation); + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + + /* + * We note the buffer's block number so that we can release the pin later. + * This allows us to re-read the buffer if it is needed again for hinting. + */ + so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf); + + /* + * We save the LSN of the page as we read it, so that we know whether it + * safe to apply LP_DEAD hints to the page later. This allows us to drop + * the pin for MVCC scans, which allows vacuum to avoid blocking. + */ + so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf); + + /* + * we must save the page's right-link while scanning it; this tells us + * where to step right to after we're done with these items. There is no + * corresponding need for the left-link, since splits always go right. + */ + so->currPos.nextPage = opaque->btpo_next; + + /* initialize tuple workspace to empty */ + so->currPos.nextTupleOffset = 0; + + /* + * Now that the current page has been made consistent, the macro should be + * good. 
+ */ + Assert(BTScanPosIsPinned(so->currPos)); + + if (ScanDirectionIsForward(dir)) + { + /* load items[] in ascending order */ + itemIndex = 0; + + offnum = Max(offnum, minoff); + + while (offnum <= maxoff) + { + ItemId iid = PageGetItemId(page, offnum); + IndexTuple itup; + + /* + * If the scan specifies not to return killed tuples, then we + * treat a killed tuple as not passing the qual + */ + if (scan->ignore_killed_tuples && ItemIdIsDead(iid)) + { + offnum = OffsetNumberNext(offnum); + continue; + } + + itup = (IndexTuple) PageGetItem(page, iid); + + if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan)) + { + /* tuple passes all scan key conditions */ + if (!BTreeTupleIsPosting(itup)) + { + /* Remember it */ + _bt_saveitem(so, itemIndex, offnum, itup); + itemIndex++; + } + else + { + int tupleOffset; + + /* + * Set up state to return posting list, and remember first + * TID + */ + tupleOffset = + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, 0), + itup); + itemIndex++; + /* Remember additional TIDs */ + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i), + tupleOffset); + itemIndex++; + } + } + } + /* When !continuescan, there can't be any more matches, so stop */ + if (!continuescan) + break; + + offnum = OffsetNumberNext(offnum); + } + + /* + * We don't need to visit page to the right when the high key + * indicates that no more matches will be found there. + * + * Checking the high key like this works out more often than you might + * think. Leaf page splits pick a split point between the two most + * dissimilar tuples (this is weighed against the need to evenly share + * free space). Leaf pages with high key attribute values that can + * only appear on non-pivot tuples on the right sibling page are + * common. + */ + if (continuescan && !P_RIGHTMOST(opaque)) + { + ItemId iid = PageGetItemId(page, P_HIKEY); + IndexTuple itup = (IndexTuple) PageGetItem(page, iid); + int truncatt; + + truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation); + _bt_checkkeys(scan, itup, truncatt, dir, &continuescan); + } + + if (!continuescan) + so->currPos.moreRight = false; + + Assert(itemIndex <= MaxTIDsPerBTreePage); + so->currPos.firstItem = 0; + so->currPos.lastItem = itemIndex - 1; + so->currPos.itemIndex = 0; + } + else + { + /* load items[] in descending order */ + itemIndex = MaxTIDsPerBTreePage; + + offnum = Min(offnum, maxoff); + + while (offnum >= minoff) + { + ItemId iid = PageGetItemId(page, offnum); + IndexTuple itup; + bool tuple_alive; + bool passes_quals; + + /* + * If the scan specifies not to return killed tuples, then we + * treat a killed tuple as not passing the qual. Most of the + * time, it's a win to not bother examining the tuple's index + * keys, but just skip to the next tuple (previous, actually, + * since we're scanning backwards). However, if this is the first + * tuple on the page, we do check the index keys, to prevent + * uselessly advancing to the page to the left. This is similar + * to the high key optimization used by forward scans. 
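+			 *
+			 * For example, even if every remaining tuple on the page has
+			 * been killed, evaluating the keys of the leftmost tuple can
+			 * still clear continuescan, ending the scan without a useless
+			 * visit to the left sibling.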
+ */ + if (scan->ignore_killed_tuples && ItemIdIsDead(iid)) + { + Assert(offnum >= P_FIRSTDATAKEY(opaque)); + if (offnum > P_FIRSTDATAKEY(opaque)) + { + offnum = OffsetNumberPrev(offnum); + continue; + } + + tuple_alive = false; + } + else + tuple_alive = true; + + itup = (IndexTuple) PageGetItem(page, iid); + + passes_quals = _bt_checkkeys(scan, itup, indnatts, dir, + &continuescan); + if (passes_quals && tuple_alive) + { + /* tuple passes all scan key conditions */ + if (!BTreeTupleIsPosting(itup)) + { + /* Remember it */ + itemIndex--; + _bt_saveitem(so, itemIndex, offnum, itup); + } + else + { + int tupleOffset; + + /* + * Set up state to return posting list, and remember first + * TID. + * + * Note that we deliberately save/return items from + * posting lists in ascending heap TID order for backwards + * scans. This allows _bt_killitems() to make a + * consistent assumption about the order of items + * associated with the same posting list tuple. + */ + itemIndex--; + tupleOffset = + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, 0), + itup); + /* Remember additional TIDs */ + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + itemIndex--; + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i), + tupleOffset); + } + } + } + if (!continuescan) + { + /* there can't be any more matches, so stop */ + so->currPos.moreLeft = false; + break; + } + + offnum = OffsetNumberPrev(offnum); + } + + Assert(itemIndex >= 0); + so->currPos.firstItem = itemIndex; + so->currPos.lastItem = MaxTIDsPerBTreePage - 1; + so->currPos.itemIndex = MaxTIDsPerBTreePage - 1; + } + + return (so->currPos.firstItem <= so->currPos.lastItem); +} + +/* Save an index item into so->currPos.items[itemIndex] */ +static void +_bt_saveitem(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, IndexTuple itup) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup)); + + currItem->heapTid = itup->t_tid; + currItem->indexOffset = offnum; + if (so->currTuples) + { + Size itupsz = IndexTupleSize(itup); + + currItem->tupleOffset = so->currPos.nextTupleOffset; + memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz); + so->currPos.nextTupleOffset += MAXALIGN(itupsz); + } +} + +/* + * Setup state to save TIDs/items from a single posting list tuple. + * + * Saves an index item into so->currPos.items[itemIndex] for TID that is + * returned to scan first. Second or subsequent TIDs for posting list should + * be saved by calling _bt_savepostingitem(). + * + * Returns an offset into tuple storage space that main tuple is stored at if + * needed. 
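+ *
+ * For example, a posting list tuple with three heap TIDs produces three
+ * consecutive entries in so->currPos.items[]; all three share the
+ * tupleOffset returned here, so an index-only scan hands back the same
+ * base tuple (with its posting list truncated away) for each of those TIDs.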
+ */ +static int +_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid, IndexTuple itup) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + Assert(BTreeTupleIsPosting(itup)); + + currItem->heapTid = *heapTid; + currItem->indexOffset = offnum; + if (so->currTuples) + { + /* Save base IndexTuple (truncate posting list) */ + IndexTuple base; + Size itupsz = BTreeTupleGetPostingOffset(itup); + + itupsz = MAXALIGN(itupsz); + currItem->tupleOffset = so->currPos.nextTupleOffset; + base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset); + memcpy(base, itup, itupsz); + /* Defensively reduce work area index tuple header size */ + base->t_info &= ~INDEX_SIZE_MASK; + base->t_info |= itupsz; + so->currPos.nextTupleOffset += itupsz; + + return currItem->tupleOffset; + } + + return 0; +} + +/* + * Save an index item into so->currPos.items[itemIndex] for current posting + * tuple. + * + * Assumes that _bt_setuppostingitems() has already been called for current + * posting list tuple. Caller passes its return value as tupleOffset. + */ +static inline void +_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid, int tupleOffset) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + currItem->heapTid = *heapTid; + currItem->indexOffset = offnum; + + /* + * Have index-only scans return the same base IndexTuple for every TID + * that originates from the same posting list + */ + if (so->currTuples) + currItem->tupleOffset = tupleOffset; +} + +/* + * _bt_steppage() -- Step to next page containing valid data for scan + * + * On entry, if so->currPos.buf is valid the buffer is pinned but not locked; + * if pinned, we'll drop the pin before moving to next page. The buffer is + * not locked on entry. + * + * For success on a scan using a non-MVCC snapshot we hold a pin, but not a + * read lock, on that page. If we do not hold the pin, we set so->currPos.buf + * to InvalidBuffer. We return true to indicate success. + */ +static bool +_bt_steppage(IndexScanDesc scan, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BlockNumber blkno = InvalidBlockNumber; + bool status; + + Assert(BTScanPosIsValid(so->currPos)); + + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0) + _bt_killitems(scan); + + /* + * Before we modify currPos, make a copy of the page data if there was a + * mark position that needs it. + */ + if (so->markItemIndex >= 0) + { + /* bump pin on current buffer for assignment to mark buffer */ + if (BTScanPosIsPinned(so->currPos)) + IncrBufferRefCount(so->currPos.buf); + memcpy(&so->markPos, &so->currPos, + offsetof(BTScanPosData, items[1]) + + so->currPos.lastItem * sizeof(BTScanPosItem)); + if (so->markTuples) + memcpy(so->markTuples, so->currTuples, + so->currPos.nextTupleOffset); + so->markPos.itemIndex = so->markItemIndex; + so->markItemIndex = -1; + } + + if (ScanDirectionIsForward(dir)) + { + /* Walk right to the next page with data */ + if (scan->parallel_scan != NULL) + { + /* + * Seize the scan to get the next block number; if the scan has + * ended already, bail out. + */ + status = _bt_parallel_seize(scan, &blkno); + if (!status) + { + /* release the previous buffer, if pinned */ + BTScanPosUnpinIfPinned(so->currPos); + BTScanPosInvalidate(so->currPos); + return false; + } + } + else + { + /* Not parallel, so use the previously-saved nextPage link. 
*/ + blkno = so->currPos.nextPage; + } + + /* Remember we left a page with data */ + so->currPos.moreLeft = true; + + /* release the previous buffer, if pinned */ + BTScanPosUnpinIfPinned(so->currPos); + } + else + { + /* Remember we left a page with data */ + so->currPos.moreRight = true; + + if (scan->parallel_scan != NULL) + { + /* + * Seize the scan to get the current block number; if the scan has + * ended already, bail out. + */ + status = _bt_parallel_seize(scan, &blkno); + BTScanPosUnpinIfPinned(so->currPos); + if (!status) + { + BTScanPosInvalidate(so->currPos); + return false; + } + } + else + { + /* Not parallel, so just use our own notion of the current page */ + blkno = so->currPos.currPage; + } + } + + if (!_bt_readnextpage(scan, blkno, dir)) + return false; + + /* Drop the lock, and maybe the pin, on the current page */ + _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + + return true; +} + +/* + * _bt_readnextpage() -- Read next page containing valid data for scan + * + * On success exit, so->currPos is updated to contain data from the next + * interesting page. Caller is responsible to release lock and pin on + * buffer on success. We return true to indicate success. + * + * If there are no more matching records in the given direction, we drop all + * locks and pins, set so->currPos.buf to InvalidBuffer, and return false. + */ +static bool +_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Relation rel; + Page page; + BTPageOpaque opaque; + bool status; + + rel = scan->indexRelation; + + if (ScanDirectionIsForward(dir)) + { + for (;;) + { + /* + * if we're at end of scan, give up and mark parallel scan as + * done, so that all the workers can finish their scan + */ + if (blkno == P_NONE || !so->currPos.moreRight) + { + _bt_parallel_done(scan); + BTScanPosInvalidate(so->currPos); + return false; + } + /* check for interrupts while we're not holding any buffer lock */ + CHECK_FOR_INTERRUPTS(); + /* step right one page */ + so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(so->currPos.buf); + TestForOldSnapshot(scan->xs_snapshot, rel, page); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + /* check for deleted page */ + if (!P_IGNORE(opaque)) + { + PredicateLockPage(rel, blkno, scan->xs_snapshot); + /* see if there are any matches on this page */ + /* note that this will clear moreRight if we can stop */ + if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque))) + break; + } + else if (scan->parallel_scan != NULL) + { + /* allow next page be processed by parallel worker */ + _bt_parallel_release(scan, opaque->btpo_next); + } + + /* nope, keep going */ + if (scan->parallel_scan != NULL) + { + _bt_relbuf(rel, so->currPos.buf); + status = _bt_parallel_seize(scan, &blkno); + if (!status) + { + BTScanPosInvalidate(so->currPos); + return false; + } + } + else + { + blkno = opaque->btpo_next; + _bt_relbuf(rel, so->currPos.buf); + } + } + } + else + { + /* + * Should only happen in parallel cases, when some other backend + * advanced the scan. + */ + if (so->currPos.currPage != blkno) + { + BTScanPosUnpinIfPinned(so->currPos); + so->currPos.currPage = blkno; + } + + /* + * Walk left to the next page with data. This is much more complex + * than the walk-right case because of the possibility that the page + * to our left splits while we are in flight to it, plus the + * possibility that the page we were on gets deleted after we leave + * it. See nbtree/README for details. 
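+		 *
+		 * For example, if the page immediately to our left splits while we
+		 * are moving to it, _bt_walk_left() must step right from the left
+		 * sibling it reaches until it finds the page whose btpo_next points
+		 * back at the page we started from.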
+ * + * It might be possible to rearrange this code to have less overhead + * in pinning and locking, but that would require capturing the left + * pointer when the page is initially read, and using it here, along + * with big changes to _bt_walk_left() and the code below. It is not + * clear whether this would be a win, since if the page immediately to + * the left splits after we read this page and before we step left, we + * would need to visit more pages than with the current code. + * + * Note that if we change the code so that we drop the pin for a scan + * which uses a non-MVCC snapshot, we will need to modify the code for + * walking left, to allow for the possibility that a referenced page + * has been deleted. As long as the buffer is pinned or the snapshot + * is MVCC the page cannot move past the half-dead state to fully + * deleted. + */ + if (BTScanPosIsPinned(so->currPos)) + _bt_lockbuf(rel, so->currPos.buf, BT_READ); + else + so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ); + + for (;;) + { + /* Done if we know there are no matching keys to the left */ + if (!so->currPos.moreLeft) + { + _bt_relbuf(rel, so->currPos.buf); + _bt_parallel_done(scan); + BTScanPosInvalidate(so->currPos); + return false; + } + + /* Step to next physical page */ + so->currPos.buf = _bt_walk_left(rel, so->currPos.buf, + scan->xs_snapshot); + + /* if we're physically at end of index, return failure */ + if (so->currPos.buf == InvalidBuffer) + { + _bt_parallel_done(scan); + BTScanPosInvalidate(so->currPos); + return false; + } + + /* + * Okay, we managed to move left to a non-deleted page. Done if + * it's not half-dead and contains matching tuples. Else loop back + * and do it all again. + */ + page = BufferGetPage(so->currPos.buf); + TestForOldSnapshot(scan->xs_snapshot, rel, page); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (!P_IGNORE(opaque)) + { + PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot); + /* see if there are any matches on this page */ + /* note that this will clear moreLeft if we can stop */ + if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page))) + break; + } + else if (scan->parallel_scan != NULL) + { + /* allow next page be processed by parallel worker */ + _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf)); + } + + /* + * For parallel scans, get the last page scanned as it is quite + * possible that by the time we try to seize the scan, some other + * worker has already advanced the scan to a different page. We + * must continue based on the latest page scanned by any worker. + */ + if (scan->parallel_scan != NULL) + { + _bt_relbuf(rel, so->currPos.buf); + status = _bt_parallel_seize(scan, &blkno); + if (!status) + { + BTScanPosInvalidate(so->currPos); + return false; + } + so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ); + } + } + } + + return true; +} + +/* + * _bt_parallel_readpage() -- Read current page containing valid data for scan + * + * On success, release lock and maybe pin on buffer. We return true to + * indicate success. 
+ */ +static bool +_bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + _bt_initialize_more_data(so, dir); + + if (!_bt_readnextpage(scan, blkno, dir)) + return false; + + /* Drop the lock, and maybe the pin, on the current page */ + _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + + return true; +} + +/* + * _bt_walk_left() -- step left one page, if possible + * + * The given buffer must be pinned and read-locked. This will be dropped + * before stepping left. On return, we have pin and read lock on the + * returned page, instead. + * + * Returns InvalidBuffer if there is no page to the left (no lock is held + * in that case). + * + * When working on a non-leaf level, it is possible for the returned page + * to be half-dead; the caller should check that condition and step left + * again if it's important. + */ +static Buffer +_bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot) +{ + Page page; + BTPageOpaque opaque; + + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + for (;;) + { + BlockNumber obknum; + BlockNumber lblkno; + BlockNumber blkno; + int tries; + + /* if we're at end of tree, release buf and return failure */ + if (P_LEFTMOST(opaque)) + { + _bt_relbuf(rel, buf); + break; + } + /* remember original page we are stepping left from */ + obknum = BufferGetBlockNumber(buf); + /* step left */ + blkno = lblkno = opaque->btpo_prev; + _bt_relbuf(rel, buf); + /* check for interrupts while we're not holding any buffer lock */ + CHECK_FOR_INTERRUPTS(); + buf = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(buf); + TestForOldSnapshot(snapshot, rel, page); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* + * If this isn't the page we want, walk right till we find what we + * want --- but go no more than four hops (an arbitrary limit). If we + * don't find the correct page by then, the most likely bet is that + * the original page got deleted and isn't in the sibling chain at all + * anymore, not that its left sibling got split more than four times. + * + * Note that it is correct to test P_ISDELETED not P_IGNORE here, + * because half-dead pages are still in the sibling chain. Caller + * must reject half-dead pages if wanted. + */ + tries = 0; + for (;;) + { + if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum) + { + /* Found desired page, return it */ + return buf; + } + if (P_RIGHTMOST(opaque) || ++tries > 4) + break; + blkno = opaque->btpo_next; + buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ); + page = BufferGetPage(buf); + TestForOldSnapshot(snapshot, rel, page); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + } + + /* Return to the original page to see what's up */ + buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ); + page = BufferGetPage(buf); + TestForOldSnapshot(snapshot, rel, page); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (P_ISDELETED(opaque)) + { + /* + * It was deleted. Move right to first nondeleted page (there + * must be one); that is the page that has acquired the deleted + * one's keyspace, so stepping left from it will take us where we + * want to be. 
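+			 *
+			 * (When a page is deleted its keyspace is folded into its right
+			 * sibling, so that sibling is typically the first nondeleted
+			 * page we reach here; restarting the walk from it therefore
+			 * covers the keys the deleted page used to hold.)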
+ */ + for (;;) + { + if (P_RIGHTMOST(opaque)) + elog(ERROR, "fell off the end of index \"%s\"", + RelationGetRelationName(rel)); + blkno = opaque->btpo_next; + buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ); + page = BufferGetPage(buf); + TestForOldSnapshot(snapshot, rel, page); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (!P_ISDELETED(opaque)) + break; + } + + /* + * Now return to top of loop, resetting obknum to point to this + * nondeleted page, and try again. + */ + } + else + { + /* + * It wasn't deleted; the explanation had better be that the page + * to the left got split or deleted. Without this check, we'd go + * into an infinite loop if there's anything wrong. + */ + if (opaque->btpo_prev == lblkno) + elog(ERROR, "could not find left sibling of block %u in index \"%s\"", + obknum, RelationGetRelationName(rel)); + /* Okay to try again with new lblkno value */ + } + } + + return InvalidBuffer; +} + +/* + * _bt_get_endpoint() -- Find the first or last page on a given tree level + * + * If the index is empty, we will return InvalidBuffer; any other failure + * condition causes ereport(). We will not return a dead page. + * + * The returned buffer is pinned and read-locked. + */ +Buffer +_bt_get_endpoint(Relation rel, uint32 level, bool rightmost, + Snapshot snapshot) +{ + Buffer buf; + Page page; + BTPageOpaque opaque; + OffsetNumber offnum; + BlockNumber blkno; + IndexTuple itup; + + /* + * If we are looking for a leaf page, okay to descend from fast root; + * otherwise better descend from true root. (There is no point in being + * smarter about intermediate levels.) + */ + if (level == 0) + buf = _bt_getroot(rel, BT_READ); + else + buf = _bt_gettrueroot(rel); + + if (!BufferIsValid(buf)) + return InvalidBuffer; + + page = BufferGetPage(buf); + TestForOldSnapshot(snapshot, rel, page); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + for (;;) + { + /* + * If we landed on a deleted page, step right to find a live page + * (there must be one). Also, if we want the rightmost page, step + * right if needed to get to it (this could happen if the page split + * since we obtained a pointer to it). + */ + while (P_IGNORE(opaque) || + (rightmost && !P_RIGHTMOST(opaque))) + { + blkno = opaque->btpo_next; + if (blkno == P_NONE) + elog(ERROR, "fell off the end of index \"%s\"", + RelationGetRelationName(rel)); + buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ); + page = BufferGetPage(buf); + TestForOldSnapshot(snapshot, rel, page); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + } + + /* Done? */ + if (opaque->btpo_level == level) + break; + if (opaque->btpo_level < level) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("btree level %u not found in index \"%s\"", + level, RelationGetRelationName(rel)))); + + /* Descend to leftmost or rightmost child page */ + if (rightmost) + offnum = PageGetMaxOffsetNumber(page); + else + offnum = P_FIRSTDATAKEY(opaque); + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + blkno = BTreeTupleGetDownLink(itup); + + buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + } + + return buf; +} + +/* + * _bt_endpoint() -- Find the first or last page in the index, and scan + * from there to the first key satisfying all the quals. 
+ * + * This is used by _bt_first() to set up a scan when we've determined + * that the scan must start at the beginning or end of the index (for + * a forward or backward scan respectively). Exit conditions are the + * same as for _bt_first(). + */ +static bool +_bt_endpoint(IndexScanDesc scan, ScanDirection dir) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Buffer buf; + Page page; + BTPageOpaque opaque; + OffsetNumber start; + BTScanPosItem *currItem; + + /* + * Scan down to the leftmost or rightmost leaf page. This is a simplified + * version of _bt_search(). We don't maintain a stack since we know we + * won't need it. + */ + buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot); + + if (!BufferIsValid(buf)) + { + /* + * Empty index. Lock the whole relation, as nothing finer to lock + * exists. + */ + PredicateLockRelation(rel, scan->xs_snapshot); + BTScanPosInvalidate(so->currPos); + return false; + } + + PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + Assert(P_ISLEAF(opaque)); + + if (ScanDirectionIsForward(dir)) + { + /* There could be dead pages to the left, so not this: */ + /* Assert(P_LEFTMOST(opaque)); */ + + start = P_FIRSTDATAKEY(opaque); + } + else if (ScanDirectionIsBackward(dir)) + { + Assert(P_RIGHTMOST(opaque)); + + start = PageGetMaxOffsetNumber(page); + } + else + { + elog(ERROR, "invalid scan direction: %d", (int) dir); + start = 0; /* keep compiler quiet */ + } + + /* remember which buffer we have pinned */ + so->currPos.buf = buf; + + _bt_initialize_more_data(so, dir); + + /* + * Now load data from the first page of the scan. + */ + if (!_bt_readpage(scan, dir, start)) + { + /* + * There's no actually-matching data on this page. Try to advance to + * the next page. Return false if there's no matching data at all. + */ + _bt_unlockbuf(scan->indexRelation, so->currPos.buf); + if (!_bt_steppage(scan, dir)) + return false; + } + else + { + /* Drop the lock, and maybe the pin, on the current page */ + _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + } + + /* OK, itemIndex says what to return */ + currItem = &so->currPos.items[so->currPos.itemIndex]; + scan->xs_heaptid = currItem->heapTid; + if (scan->xs_want_itup) + scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + + return true; +} + +/* + * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately + * for scan direction + */ +static inline void +_bt_initialize_more_data(BTScanOpaque so, ScanDirection dir) +{ + /* initialize moreLeft/moreRight appropriately for scan direction */ + if (ScanDirectionIsForward(dir)) + { + so->currPos.moreLeft = false; + so->currPos.moreRight = true; + } + else + { + so->currPos.moreLeft = true; + so->currPos.moreRight = false; + } + so->numKilled = 0; /* just paranoia */ + so->markItemIndex = -1; /* ditto */ +} diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c new file mode 100644 index 0000000..78f78e7 --- /dev/null +++ b/src/backend/access/nbtree/nbtsort.c @@ -0,0 +1,2016 @@ +/*------------------------------------------------------------------------- + * + * nbtsort.c + * Build a btree from sorted input by loading leaf pages sequentially. + * + * NOTES + * + * We use tuplesort.c to sort the given index tuples into order. + * Then we scan the index tuples in order and build the btree pages + * for each level. 
We load source tuples into leaf-level pages. + * Whenever we fill a page at one level, we add a link to it to its + * parent level (starting a new parent level if necessary). When + * done, we write out each final page on each level, adding it to + * its parent level. When we have only one page on a level, it must be + * the root -- it can be attached to the btree metapage and we are done. + * + * It is not wise to pack the pages entirely full, since then *any* + * insertion would cause a split (and not only of the leaf page; the need + * for a split would cascade right up the tree). The steady-state load + * factor for btrees is usually estimated at 70%. We choose to pack leaf + * pages to the user-controllable fill factor (default 90%) while upper pages + * are always packed to 70%. This gives us reasonable density (there aren't + * many upper pages if the keys are reasonable-size) without risking a lot of + * cascading splits during early insertions. + * + * Formerly the index pages being built were kept in shared buffers, but + * that is of no value (since other backends have no interest in them yet) + * and it created locking problems for CHECKPOINT, because the upper-level + * pages were held exclusive-locked for long periods. Now we just build + * the pages in local memory and smgrwrite or smgrextend them as we finish + * them. They will need to be re-read into shared buffers on first use after + * the build finishes. + * + * This code isn't concerned about the FSM at all. The caller is responsible + * for initializing that. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtsort.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/parallel.h" +#include "access/relscan.h" +#include "access/table.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "catalog/index.h" +#include "commands/progress.h" +#include "executor/instrument.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/smgr.h" +#include "tcop/tcopprot.h" /* pgrminclude ignore */ +#include "utils/rel.h" +#include "utils/sortsupport.h" +#include "utils/tuplesort.h" + + +/* Magic numbers for parallel state sharing */ +#define PARALLEL_KEY_BTREE_SHARED UINT64CONST(0xA000000000000001) +#define PARALLEL_KEY_TUPLESORT UINT64CONST(0xA000000000000002) +#define PARALLEL_KEY_TUPLESORT_SPOOL2 UINT64CONST(0xA000000000000003) +#define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xA000000000000004) +#define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xA000000000000005) +#define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xA000000000000006) + +/* + * DISABLE_LEADER_PARTICIPATION disables the leader's participation in + * parallel index builds. This may be useful as a debugging aid. +#undef DISABLE_LEADER_PARTICIPATION + */ + +/* + * Status record for spooling/sorting phase. (Note we may have two of + * these due to the special requirements for uniqueness-checking with + * dead tuples.) + */ +typedef struct BTSpool +{ + Tuplesortstate *sortstate; /* state data for tuplesort.c */ + Relation heap; + Relation index; + bool isunique; +} BTSpool; + +/* + * Status for index builds performed in parallel. This is allocated in a + * dynamic shared memory segment. 
Note that there is a separate tuplesort TOC + * entry, private to tuplesort.c but allocated by this module on its behalf. + */ +typedef struct BTShared +{ + /* + * These fields are not modified during the sort. They primarily exist + * for the benefit of worker processes that need to create BTSpool state + * corresponding to that used by the leader. + */ + Oid heaprelid; + Oid indexrelid; + bool isunique; + bool isconcurrent; + int scantuplesortstates; + + /* + * workersdonecv is used to monitor the progress of workers. All parallel + * participants must indicate that they are done before leader can use + * mutable state that workers maintain during scan (and before leader can + * proceed to tuplesort_performsort()). + */ + ConditionVariable workersdonecv; + + /* + * mutex protects all fields before heapdesc. + * + * These fields contain status information of interest to B-Tree index + * builds that must work just the same when an index is built in parallel. + */ + slock_t mutex; + + /* + * Mutable state that is maintained by workers, and reported back to + * leader at end of parallel scan. + * + * nparticipantsdone is number of worker processes finished. + * + * reltuples is the total number of input heap tuples. + * + * havedead indicates if RECENTLY_DEAD tuples were encountered during + * build. + * + * indtuples is the total number of tuples that made it into the index. + * + * brokenhotchain indicates if any worker detected a broken HOT chain + * during build. + */ + int nparticipantsdone; + double reltuples; + bool havedead; + double indtuples; + bool brokenhotchain; + + /* + * ParallelTableScanDescData data follows. Can't directly embed here, as + * implementations of the parallel table scan desc interface might need + * stronger alignment. + */ +} BTShared; + +/* + * Return pointer to a BTShared's parallel table scan. + * + * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just + * MAXALIGN. + */ +#define ParallelTableScanFromBTShared(shared) \ + (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared))) + +/* + * Status for leader in parallel index build. + */ +typedef struct BTLeader +{ + /* parallel context itself */ + ParallelContext *pcxt; + + /* + * nparticipanttuplesorts is the exact number of worker processes + * successfully launched, plus one leader process if it participates as a + * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader + * participating as a worker). + */ + int nparticipanttuplesorts; + + /* + * Leader process convenience pointers to shared state (leader avoids TOC + * lookups). + * + * btshared is the shared state for entire build. sharedsort is the + * shared, tuplesort-managed state passed to each process tuplesort. + * sharedsort2 is the corresponding btspool2 shared state, used only when + * building unique indexes. snapshot is the snapshot used by the scan iff + * an MVCC snapshot is required. + */ + BTShared *btshared; + Sharedsort *sharedsort; + Sharedsort *sharedsort2; + Snapshot snapshot; + WalUsage *walusage; + BufferUsage *bufferusage; +} BTLeader; + +/* + * Working state for btbuild and its callback. + * + * When parallel CREATE INDEX is used, there is a BTBuildState for each + * participant. + */ +typedef struct BTBuildState +{ + bool isunique; + bool havedead; + Relation heap; + BTSpool *spool; + + /* + * spool2 is needed only when the index is a unique index. Dead tuples are + * put into spool2 instead of spool in order to avoid uniqueness check. 
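+	 * Tuples in spool2 are still written into the index by _bt_load(),
+	 * which merges the two sorted streams; they are merely exempt from the
+	 * uniqueness check.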
+ */ + BTSpool *spool2; + double indtuples; + + /* + * btleader is only present when a parallel index build is performed, and + * only in the leader process. (Actually, only the leader has a + * BTBuildState. Workers have their own spool and spool2, though.) + */ + BTLeader *btleader; +} BTBuildState; + +/* + * Status record for a btree page being built. We have one of these + * for each active tree level. + */ +typedef struct BTPageState +{ + Page btps_page; /* workspace for page building */ + BlockNumber btps_blkno; /* block # to write this page at */ + IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */ + OffsetNumber btps_lastoff; /* last item offset loaded */ + Size btps_lastextra; /* last item's extra posting list space */ + uint32 btps_level; /* tree level (0 = leaf) */ + Size btps_full; /* "full" if less than this much free space */ + struct BTPageState *btps_next; /* link to parent level, if any */ +} BTPageState; + +/* + * Overall status record for index writing phase. + */ +typedef struct BTWriteState +{ + Relation heap; + Relation index; + BTScanInsert inskey; /* generic insertion scankey */ + bool btws_use_wal; /* dump pages to WAL? */ + BlockNumber btws_pages_alloced; /* # pages allocated */ + BlockNumber btws_pages_written; /* # pages written out */ + Page btws_zeropage; /* workspace for filling zeroes */ +} BTWriteState; + + +static double _bt_spools_heapscan(Relation heap, Relation index, + BTBuildState *buildstate, IndexInfo *indexInfo); +static void _bt_spooldestroy(BTSpool *btspool); +static void _bt_spool(BTSpool *btspool, ItemPointer self, + Datum *values, bool *isnull); +static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2); +static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values, + bool *isnull, bool tupleIsAlive, void *state); +static Page _bt_blnewpage(uint32 level); +static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level); +static void _bt_slideleft(Page rightmostpage); +static void _bt_sortaddtup(Page page, Size itemsize, + IndexTuple itup, OffsetNumber itup_off, + bool newfirstdataitem); +static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, + IndexTuple itup, Size truncextra); +static void _bt_sort_dedup_finish_pending(BTWriteState *wstate, + BTPageState *state, + BTDedupState dstate); +static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state); +static void _bt_load(BTWriteState *wstate, + BTSpool *btspool, BTSpool *btspool2); +static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, + int request); +static void _bt_end_parallel(BTLeader *btleader); +static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot); +static double _bt_parallel_heapscan(BTBuildState *buildstate, + bool *brokenhotchain); +static void _bt_leader_participate_as_worker(BTBuildState *buildstate); +static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, + BTShared *btshared, Sharedsort *sharedsort, + Sharedsort *sharedsort2, int sortmem, + bool progress); + + +/* + * btbuild() -- build a new btree index. 
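+ *
+ *		The build runs in two main steps: _bt_spools_heapscan() scans the
+ *		heap (serially or in parallel) and feeds the tuplesort(s), and
+ *		_bt_leafbuild() then sorts the spooled tuples and writes out the
+ *		finished tree via _bt_load().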
+ */ +IndexBuildResult * +btbuild(Relation heap, Relation index, IndexInfo *indexInfo) +{ + IndexBuildResult *result; + BTBuildState buildstate; + double reltuples; + +#ifdef BTREE_BUILD_STATS + if (log_btree_build_stats) + ResetUsage(); +#endif /* BTREE_BUILD_STATS */ + + buildstate.isunique = indexInfo->ii_Unique; + buildstate.havedead = false; + buildstate.heap = heap; + buildstate.spool = NULL; + buildstate.spool2 = NULL; + buildstate.indtuples = 0; + buildstate.btleader = NULL; + + /* + * We expect to be called exactly once for any index relation. If that's + * not the case, big trouble's what we have. + */ + if (RelationGetNumberOfBlocks(index) != 0) + elog(ERROR, "index \"%s\" already contains data", + RelationGetRelationName(index)); + + reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo); + + /* + * Finish the build by (1) completing the sort of the spool file, (2) + * inserting the sorted tuples into btree pages and (3) building the upper + * levels. Finally, it may also be necessary to end use of parallelism. + */ + _bt_leafbuild(buildstate.spool, buildstate.spool2); + _bt_spooldestroy(buildstate.spool); + if (buildstate.spool2) + _bt_spooldestroy(buildstate.spool2); + if (buildstate.btleader) + _bt_end_parallel(buildstate.btleader); + + result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); + + result->heap_tuples = reltuples; + result->index_tuples = buildstate.indtuples; + +#ifdef BTREE_BUILD_STATS + if (log_btree_build_stats) + { + ShowUsage("BTREE BUILD STATS"); + ResetUsage(); + } +#endif /* BTREE_BUILD_STATS */ + + return result; +} + +/* + * Create and initialize one or two spool structures, and save them in caller's + * buildstate argument. May also fill-in fields within indexInfo used by index + * builds. + * + * Scans the heap, possibly in parallel, filling spools with IndexTuples. This + * routine encapsulates all aspects of managing parallelism. Caller need only + * call _bt_end_parallel() in parallel case after it is done with spool/spool2. + * + * Returns the total number of heap tuples scanned. + */ +static double +_bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, + IndexInfo *indexInfo) +{ + BTSpool *btspool = (BTSpool *) palloc0(sizeof(BTSpool)); + SortCoordinate coordinate = NULL; + double reltuples = 0; + + /* + * We size the sort area as maintenance_work_mem rather than work_mem to + * speed index creation. This should be OK since a single backend can't + * run multiple index creations in parallel (see also: notes on + * parallelism and maintenance_work_mem below). + */ + btspool->heap = heap; + btspool->index = index; + btspool->isunique = indexInfo->ii_Unique; + + /* Save as primary spool */ + buildstate->spool = btspool; + + /* Report table scan phase started */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, + PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN); + + /* Attempt to launch parallel worker scan when required */ + if (indexInfo->ii_ParallelWorkers > 0) + _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent, + indexInfo->ii_ParallelWorkers); + + /* + * If parallel build requested and at least one worker process was + * successfully launched, set up coordination state + */ + if (buildstate->btleader) + { + coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData)); + coordinate->isWorker = false; + coordinate->nParticipants = + buildstate->btleader->nparticipanttuplesorts; + coordinate->sharedsort = buildstate->btleader->sharedsort; + } + + /* + * Begin serial/leader tuplesort. 
+ * + * In cases where parallelism is involved, the leader receives the same + * share of maintenance_work_mem as a serial sort (it is generally treated + * in the same way as a serial sort once we return). Parallel worker + * Tuplesortstates will have received only a fraction of + * maintenance_work_mem, though. + * + * We rely on the lifetime of the Leader Tuplesortstate almost not + * overlapping with any worker Tuplesortstate's lifetime. There may be + * some small overlap, but that's okay because we rely on leader + * Tuplesortstate only allocating a small, fixed amount of memory here. + * When its tuplesort_performsort() is called (by our caller), and + * significant amounts of memory are likely to be used, all workers must + * have already freed almost all memory held by their Tuplesortstates + * (they are about to go away completely, too). The overall effect is + * that maintenance_work_mem always represents an absolute high watermark + * on the amount of memory used by a CREATE INDEX operation, regardless of + * the use of parallelism or any other factor. + */ + buildstate->spool->sortstate = + tuplesort_begin_index_btree(heap, index, buildstate->isunique, + maintenance_work_mem, coordinate, + false); + + /* + * If building a unique index, put dead tuples in a second spool to keep + * them out of the uniqueness check. We expect that the second spool (for + * dead tuples) won't get very full, so we give it only work_mem. + */ + if (indexInfo->ii_Unique) + { + BTSpool *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool)); + SortCoordinate coordinate2 = NULL; + + /* Initialize secondary spool */ + btspool2->heap = heap; + btspool2->index = index; + btspool2->isunique = false; + /* Save as secondary spool */ + buildstate->spool2 = btspool2; + + if (buildstate->btleader) + { + /* + * Set up non-private state that is passed to + * tuplesort_begin_index_btree() about the basic high level + * coordination of a parallel sort. + */ + coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData)); + coordinate2->isWorker = false; + coordinate2->nParticipants = + buildstate->btleader->nparticipanttuplesorts; + coordinate2->sharedsort = buildstate->btleader->sharedsort2; + } + + /* + * We expect that the second one (for dead tuples) won't get very + * full, so we give it only work_mem + */ + buildstate->spool2->sortstate = + tuplesort_begin_index_btree(heap, index, false, work_mem, + coordinate2, false); + } + + /* Fill spool using either serial or parallel heap scan */ + if (!buildstate->btleader) + reltuples = table_index_build_scan(heap, index, indexInfo, true, true, + _bt_build_callback, (void *) buildstate, + NULL); + else + reltuples = _bt_parallel_heapscan(buildstate, + &indexInfo->ii_BrokenHotChain); + + /* + * Set the progress target for the next phase. Reset the block number + * values set by table_index_build_scan + */ + { + const int progress_index[] = { + PROGRESS_CREATEIDX_TUPLES_TOTAL, + PROGRESS_SCAN_BLOCKS_TOTAL, + PROGRESS_SCAN_BLOCKS_DONE + }; + const int64 progress_vals[] = { + buildstate->indtuples, + 0, 0 + }; + + pgstat_progress_update_multi_param(3, progress_index, progress_vals); + } + + /* okay, all heap tuples are spooled */ + if (buildstate->spool2 && !buildstate->havedead) + { + /* spool2 turns out to be unnecessary */ + _bt_spooldestroy(buildstate->spool2); + buildstate->spool2 = NULL; + } + + return reltuples; +} + +/* + * clean up a spool structure and its substructures. 
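+ *
+ * Called once the sorted tuples have been consumed (or when a spool turns
+ * out to be unnecessary); tuplesort_end() releases the Tuplesortstate and
+ * any temporary files behind it.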
+ */ +static void +_bt_spooldestroy(BTSpool *btspool) +{ + tuplesort_end(btspool->sortstate); + pfree(btspool); +} + +/* + * spool an index entry into the sort file. + */ +static void +_bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull) +{ + tuplesort_putindextuplevalues(btspool->sortstate, btspool->index, + self, values, isnull); +} + +/* + * given a spool loaded by successive calls to _bt_spool, + * create an entire btree. + */ +static void +_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) +{ + BTWriteState wstate; + +#ifdef BTREE_BUILD_STATS + if (log_btree_build_stats) + { + ShowUsage("BTREE BUILD (Spool) STATISTICS"); + ResetUsage(); + } +#endif /* BTREE_BUILD_STATS */ + + /* Execute the sort */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, + PROGRESS_BTREE_PHASE_PERFORMSORT_1); + tuplesort_performsort(btspool->sortstate); + if (btspool2) + { + pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, + PROGRESS_BTREE_PHASE_PERFORMSORT_2); + tuplesort_performsort(btspool2->sortstate); + } + + wstate.heap = btspool->heap; + wstate.index = btspool->index; + wstate.inskey = _bt_mkscankey(wstate.index, NULL); + /* _bt_mkscankey() won't set allequalimage without metapage */ + wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true); + wstate.btws_use_wal = RelationNeedsWAL(wstate.index); + + /* reserve the metapage */ + wstate.btws_pages_alloced = BTREE_METAPAGE + 1; + wstate.btws_pages_written = 0; + wstate.btws_zeropage = NULL; /* until needed */ + + pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, + PROGRESS_BTREE_PHASE_LEAF_LOAD); + _bt_load(&wstate, btspool, btspool2); +} + +/* + * Per-tuple callback for table_index_build_scan + */ +static void +_bt_build_callback(Relation index, + ItemPointer tid, + Datum *values, + bool *isnull, + bool tupleIsAlive, + void *state) +{ + BTBuildState *buildstate = (BTBuildState *) state; + + /* + * insert the index tuple into the appropriate spool file for subsequent + * processing + */ + if (tupleIsAlive || buildstate->spool2 == NULL) + _bt_spool(buildstate->spool, tid, values, isnull); + else + { + /* dead tuples are put into spool2 */ + buildstate->havedead = true; + _bt_spool(buildstate->spool2, tid, values, isnull); + } + + buildstate->indtuples += 1; +} + +/* + * allocate workspace for a new, clean btree page, not linked to any siblings. + */ +static Page +_bt_blnewpage(uint32 level) +{ + Page page; + BTPageOpaque opaque; + + page = (Page) palloc(BLCKSZ); + + /* Zero the page and set up standard page header info */ + _bt_pageinit(page, BLCKSZ); + + /* Initialize BT opaque state */ + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + opaque->btpo_prev = opaque->btpo_next = P_NONE; + opaque->btpo_level = level; + opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF; + opaque->btpo_cycleid = 0; + + /* Make the P_HIKEY line pointer appear allocated */ + ((PageHeader) page)->pd_lower += sizeof(ItemIdData); + + return page; +} + +/* + * emit a completed btree page, and release the working storage. + */ +static void +_bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno) +{ + /* Ensure rd_smgr is open (could have been closed by relcache flush!) */ + RelationOpenSmgr(wstate->index); + + /* XLOG stuff */ + if (wstate->btws_use_wal) + { + /* We use the XLOG_FPI record type for this */ + log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page, true); + } + + /* + * If we have to write pages nonsequentially, fill in the space with + * zeroes until we come back and overwrite. 
This is not logically + * necessary on standard Unix filesystems (unwritten space will read as + * zeroes anyway), but it should help to avoid fragmentation. The dummy + * pages aren't WAL-logged though. + */ + while (blkno > wstate->btws_pages_written) + { + if (!wstate->btws_zeropage) + wstate->btws_zeropage = (Page) palloc0(BLCKSZ); + /* don't set checksum for all-zero page */ + smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, + wstate->btws_pages_written++, + (char *) wstate->btws_zeropage, + true); + } + + PageSetChecksumInplace(page, blkno); + + /* + * Now write the page. There's no need for smgr to schedule an fsync for + * this write; we'll do it ourselves before ending the build. + */ + if (blkno == wstate->btws_pages_written) + { + /* extending the file... */ + smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, blkno, + (char *) page, true); + wstate->btws_pages_written++; + } + else + { + /* overwriting a block we zero-filled before */ + smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno, + (char *) page, true); + } + + pfree(page); +} + +/* + * allocate and initialize a new BTPageState. the returned structure + * is suitable for immediate use by _bt_buildadd. + */ +static BTPageState * +_bt_pagestate(BTWriteState *wstate, uint32 level) +{ + BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState)); + + /* create initial page for level */ + state->btps_page = _bt_blnewpage(level); + + /* and assign it a page position */ + state->btps_blkno = wstate->btws_pages_alloced++; + + state->btps_lowkey = NULL; + /* initialize lastoff so first item goes into P_FIRSTKEY */ + state->btps_lastoff = P_HIKEY; + state->btps_lastextra = 0; + state->btps_level = level; + /* set "full" threshold based on level. See notes at head of file. */ + if (level > 0) + state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100); + else + state->btps_full = BTGetTargetPageFreeSpace(wstate->index); + + /* no parent level, yet */ + state->btps_next = NULL; + + return state; +} + +/* + * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to + * P_HIKEY, overwriting P_HIKEY). + * + * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the + * rightmost page on its level is not supposed to get a high key. Now that + * it's clear that this page is a rightmost page, remove the unneeded empty + * P_HIKEY line pointer space. + */ +static void +_bt_slideleft(Page rightmostpage) +{ + OffsetNumber off; + OffsetNumber maxoff; + ItemId previi; + + maxoff = PageGetMaxOffsetNumber(rightmostpage); + Assert(maxoff >= P_FIRSTKEY); + previi = PageGetItemId(rightmostpage, P_HIKEY); + for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off)) + { + ItemId thisii = PageGetItemId(rightmostpage, off); + + *previi = *thisii; + previi = thisii; + } + ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData); +} + +/* + * Add an item to a page being built. + * + * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant + * raises an error directly. + * + * Note that our nbtsort.c caller does not know yet if the page will be + * rightmost. Offset P_FIRSTKEY is always assumed to be the first data key by + * caller. Page that turns out to be the rightmost on its level is fixed by + * calling _bt_slideleft(). 
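+ *
+ * For example, the first data item added to a non-leaf page (offset
+ * P_FIRSTKEY, with newfirstdataitem true) has its key truncated away to
+ * become the "minus infinity" item, just as _bt_pgaddtup() arranges for
+ * retail inserts.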
+ */ +static void +_bt_sortaddtup(Page page, + Size itemsize, + IndexTuple itup, + OffsetNumber itup_off, + bool newfirstdataitem) +{ + IndexTupleData trunctuple; + + if (newfirstdataitem) + { + trunctuple = *itup; + trunctuple.t_info = sizeof(IndexTupleData); + BTreeTupleSetNAtts(&trunctuple, 0, false); + itup = &trunctuple; + itemsize = sizeof(IndexTupleData); + } + + if (PageAddItem(page, (Item) itup, itemsize, itup_off, + false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add item to the index page"); +} + +/*---------- + * Add an item to a disk page from the sort output (or add a posting list + * item formed from the sort output). + * + * We must be careful to observe the page layout conventions of nbtsearch.c: + * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY. + * - on non-leaf pages, the key portion of the first item need not be + * stored, we should store only the link. + * + * A leaf page being built looks like: + * + * +----------------+---------------------------------+ + * | PageHeaderData | linp0 linp1 linp2 ... | + * +-----------+----+---------------------------------+ + * | ... linpN | | + * +-----------+--------------------------------------+ + * | ^ last | + * | | + * +-------------+------------------------------------+ + * | | itemN ... | + * +-------------+------------------+-----------------+ + * | ... item3 item2 item1 | "special space" | + * +--------------------------------+-----------------+ + * + * Contrast this with the diagram in bufpage.h; note the mismatch + * between linps and items. This is because we reserve linp0 as a + * placeholder for the pointer to the "high key" item; when we have + * filled up the page, we will set linp0 to point to itemN and clear + * linpN. On the other hand, if we find this is the last (rightmost) + * page, we leave the items alone and slide the linp array over. If + * the high key is to be truncated, offset 1 is deleted, and we insert + * the truncated high key at offset 1. + * + * 'last' pointer indicates the last offset added to the page. + * + * 'truncextra' is the size of the posting list in itup, if any. This + * information is stashed for the next call here, when we may benefit + * from considering the impact of truncating away the posting list on + * the page before deciding to finish the page off. Posting lists are + * often relatively large, so it is worth going to the trouble of + * accounting for the saving from truncating away the posting list of + * the tuple that becomes the high key (that may be the only way to + * get close to target free space on the page). Note that this is + * only used for the soft fillfactor-wise limit, not the critical hard + * limit. + *---------- + */ +static void +_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, + Size truncextra) +{ + Page npage; + BlockNumber nblkno; + OffsetNumber last_off; + Size last_truncextra; + Size pgspc; + Size itupsz; + bool isleaf; + + /* + * This is a handy place to check for cancel interrupts during the btree + * load phase of index creation. 
+ */ + CHECK_FOR_INTERRUPTS(); + + npage = state->btps_page; + nblkno = state->btps_blkno; + last_off = state->btps_lastoff; + last_truncextra = state->btps_lastextra; + state->btps_lastextra = truncextra; + + pgspc = PageGetFreeSpace(npage); + itupsz = IndexTupleSize(itup); + itupsz = MAXALIGN(itupsz); + /* Leaf case has slightly different rules due to suffix truncation */ + isleaf = (state->btps_level == 0); + + /* + * Check whether the new item can fit on a btree page on current level at + * all. + * + * Every newly built index will treat heap TID as part of the keyspace, + * which imposes the requirement that new high keys must occasionally have + * a heap TID appended within _bt_truncate(). That may leave a new pivot + * tuple one or two MAXALIGN() quantums larger than the original + * firstright tuple it's derived from. v4 deals with the problem by + * decreasing the limit on the size of tuples inserted on the leaf level + * by the same small amount. Enforce the new v4+ limit on the leaf level, + * and the old limit on internal levels, since pivot tuples may need to + * make use of the reserved space. This should never fail on internal + * pages. + */ + if (unlikely(itupsz > BTMaxItemSize(npage))) + _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage, + itup); + + /* + * Check to see if current page will fit new item, with space left over to + * append a heap TID during suffix truncation when page is a leaf page. + * + * It is guaranteed that we can fit at least 2 non-pivot tuples plus a + * high key with heap TID when finishing off a leaf page, since we rely on + * _bt_check_third_page() rejecting oversized non-pivot tuples. On + * internal pages we can always fit 3 pivot tuples with larger internal + * page tuple limit (includes page high key). + * + * Most of the time, a page is only "full" in the sense that the soft + * fillfactor-wise limit has been exceeded. However, we must always leave + * at least two items plus a high key on each page before starting a new + * page. Disregard fillfactor and insert on "full" current page if we + * don't have the minimum number of items yet. (Note that we deliberately + * assume that suffix truncation neither enlarges nor shrinks new high key + * when applying soft limit, except when last tuple has a posting list.) + */ + Assert(last_truncextra == 0 || isleaf); + if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) || + (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY)) + { + /* + * Finish off the page and write it out. + */ + Page opage = npage; + BlockNumber oblkno = nblkno; + ItemId ii; + ItemId hii; + IndexTuple oitup; + + /* Create new page of same level */ + npage = _bt_blnewpage(state->btps_level); + + /* and assign it a page position */ + nblkno = wstate->btws_pages_alloced++; + + /* + * We copy the last item on the page into the new page, and then + * rearrange the old page so that the 'last item' becomes its high key + * rather than a true data item. There had better be at least two + * items on the page already, else the page would be empty of useful + * data. + */ + Assert(last_off > P_FIRSTKEY); + ii = PageGetItemId(opage, last_off); + oitup = (IndexTuple) PageGetItem(opage, ii); + _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY, + !isleaf); + + /* + * Move 'last' into the high key position on opage. 
_bt_blnewpage() + * allocated empty space for a line pointer when opage was first + * created, so this is a matter of rearranging already-allocated space + * on page, and initializing high key line pointer. (Actually, leaf + * pages must also swap oitup with a truncated version of oitup, which + * is sometimes larger than oitup, though never by more than the space + * needed to append a heap TID.) + */ + hii = PageGetItemId(opage, P_HIKEY); + *hii = *ii; + ItemIdSetUnused(ii); /* redundant */ + ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData); + + if (isleaf) + { + IndexTuple lastleft; + IndexTuple truncated; + + /* + * Truncate away any unneeded attributes from high key on leaf + * level. This is only done at the leaf level because downlinks + * in internal pages are either negative infinity items, or get + * their contents from copying from one level down. See also: + * _bt_split(). + * + * We don't try to bias our choice of split point to make it more + * likely that _bt_truncate() can truncate away more attributes, + * whereas the split point used within _bt_split() is chosen much + * more delicately. Even still, the lastleft and firstright + * tuples passed to _bt_truncate() here are at least not fully + * equal to each other when deduplication is used, unless there is + * a large group of duplicates (also, unique index builds usually + * have few or no spool2 duplicates). When the split point is + * between two unequal tuples, _bt_truncate() will avoid including + * a heap TID in the new high key, which is the most important + * benefit of suffix truncation. + * + * Overwrite the old item with new truncated high key directly. + * oitup is already located at the physical beginning of tuple + * space, so this should directly reuse the existing tuple space. + */ + ii = PageGetItemId(opage, OffsetNumberPrev(last_off)); + lastleft = (IndexTuple) PageGetItem(opage, ii); + + Assert(IndexTupleSize(oitup) > last_truncextra); + truncated = _bt_truncate(wstate->index, lastleft, oitup, + wstate->inskey); + if (!PageIndexTupleOverwrite(opage, P_HIKEY, (Item) truncated, + IndexTupleSize(truncated))) + elog(ERROR, "failed to add high key to the index page"); + pfree(truncated); + + /* oitup should continue to point to the page's high key */ + hii = PageGetItemId(opage, P_HIKEY); + oitup = (IndexTuple) PageGetItem(opage, hii); + } + + /* + * Link the old page into its parent, using its low key. If we don't + * have a parent, we have to create one; this adds a new btree level. + */ + if (state->btps_next == NULL) + state->btps_next = _bt_pagestate(wstate, state->btps_level + 1); + + Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <= + IndexRelationGetNumberOfKeyAttributes(wstate->index) && + BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) || + P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage))); + Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 || + !P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage))); + BTreeTupleSetDownLink(state->btps_lowkey, oblkno); + _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0); + pfree(state->btps_lowkey); + + /* + * Save a copy of the high key from the old page. It is also the low + * key for the new page. + */ + state->btps_lowkey = CopyIndexTuple(oitup); + + /* + * Set the sibling links for both pages. 
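+ *
+ * For instance, if opage is block 3 and the freshly allocated npage is
+ * block 7, the code below sets block 3's btpo_next to 7 and block 7's
+ * btpo_prev back to 3, while block 7's btpo_next stays P_NONE until that
+ * page is finished off in its own right.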
+ */ + { + BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage); + BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage); + + oopaque->btpo_next = nblkno; + nopaque->btpo_prev = oblkno; + nopaque->btpo_next = P_NONE; /* redundant */ + } + + /* + * Write out the old page. We never need to touch it again, so we can + * free the opage workspace too. + */ + _bt_blwritepage(wstate, opage, oblkno); + + /* + * Reset last_off to point to new page + */ + last_off = P_FIRSTKEY; + } + + /* + * By here, either original page is still the current page, or a new page + * was created that became the current page. Either way, the current page + * definitely has space for new item. + * + * If the new item is the first for its page, it must also be the first + * item on its entire level. On later same-level pages, a low key for a + * page will be copied from the prior page in the code above. Generate a + * minus infinity low key here instead. + */ + if (last_off == P_HIKEY) + { + Assert(state->btps_lowkey == NULL); + state->btps_lowkey = palloc0(sizeof(IndexTupleData)); + state->btps_lowkey->t_info = sizeof(IndexTupleData); + BTreeTupleSetNAtts(state->btps_lowkey, 0, false); + } + + /* + * Add the new item into the current page. + */ + last_off = OffsetNumberNext(last_off); + _bt_sortaddtup(npage, itupsz, itup, last_off, + !isleaf && last_off == P_FIRSTKEY); + + state->btps_page = npage; + state->btps_blkno = nblkno; + state->btps_lastoff = last_off; +} + +/* + * Finalize pending posting list tuple, and add it to the index. Final tuple + * is based on saved base tuple, and saved list of heap TIDs. + * + * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple + * using _bt_buildadd(). + */ +static void +_bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state, + BTDedupState dstate) +{ + Assert(dstate->nitems > 0); + + if (dstate->nitems == 1) + _bt_buildadd(wstate, state, dstate->base, 0); + else + { + IndexTuple postingtuple; + Size truncextra; + + /* form a tuple with a posting list */ + postingtuple = _bt_form_posting(dstate->base, + dstate->htids, + dstate->nhtids); + /* Calculate posting list overhead */ + truncextra = IndexTupleSize(postingtuple) - + BTreeTupleGetPostingOffset(postingtuple); + + _bt_buildadd(wstate, state, postingtuple, truncextra); + pfree(postingtuple); + } + + dstate->nmaxitems = 0; + dstate->nhtids = 0; + dstate->nitems = 0; + dstate->phystupsize = 0; +} + +/* + * Finish writing out the completed btree. + */ +static void +_bt_uppershutdown(BTWriteState *wstate, BTPageState *state) +{ + BTPageState *s; + BlockNumber rootblkno = P_NONE; + uint32 rootlevel = 0; + Page metapage; + + /* + * Each iteration of this loop completes one more level of the tree. + */ + for (s = state; s != NULL; s = s->btps_next) + { + BlockNumber blkno; + BTPageOpaque opaque; + + blkno = s->btps_blkno; + opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page); + + /* + * We have to link the last page on this level to somewhere. + * + * If we're at the top, it's the root, so attach it to the metapage. + * Otherwise, add an entry for it to its parent using its low key. + * This may cause the last page of the parent level to split, but + * that's not a problem -- we haven't gotten to it yet. 
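+ *
+ * For example, in a build that finishes with three levels, this loop
+ * handles the last leaf page (level 0) first, then the last level-1
+ * page, and finally the single level-2 page, which is flagged BTP_ROOT
+ * below and remembered for the metapage.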
+ */ + if (s->btps_next == NULL) + { + opaque->btpo_flags |= BTP_ROOT; + rootblkno = blkno; + rootlevel = s->btps_level; + } + else + { + Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <= + IndexRelationGetNumberOfKeyAttributes(wstate->index) && + BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) || + P_LEFTMOST(opaque)); + Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 || + !P_LEFTMOST(opaque)); + BTreeTupleSetDownLink(s->btps_lowkey, blkno); + _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0); + pfree(s->btps_lowkey); + s->btps_lowkey = NULL; + } + + /* + * This is the rightmost page, so the ItemId array needs to be slid + * back one slot. Then we can dump out the page. + */ + _bt_slideleft(s->btps_page); + _bt_blwritepage(wstate, s->btps_page, s->btps_blkno); + s->btps_page = NULL; /* writepage freed the workspace */ + } + + /* + * As the last step in the process, construct the metapage and make it + * point to the new root (unless we had no data at all, in which case it's + * set to point to "P_NONE"). This changes the index to the "valid" state + * by filling in a valid magic number in the metapage. + */ + metapage = (Page) palloc(BLCKSZ); + _bt_initmetapage(metapage, rootblkno, rootlevel, + wstate->inskey->allequalimage); + _bt_blwritepage(wstate, metapage, BTREE_METAPAGE); +} + +/* + * Read tuples in correct sort order from tuplesort, and load them into + * btree leaves. + */ +static void +_bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) +{ + BTPageState *state = NULL; + bool merge = (btspool2 != NULL); + IndexTuple itup, + itup2 = NULL; + bool load1; + TupleDesc tupdes = RelationGetDescr(wstate->index); + int i, + keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index); + SortSupport sortKeys; + int64 tuples_done = 0; + bool deduplicate; + + deduplicate = wstate->inskey->allequalimage && !btspool->isunique && + BTGetDeduplicateItems(wstate->index); + + if (merge) + { + /* + * Another BTSpool for dead tuples exists. Now we have to merge + * btspool and btspool2. + */ + + /* the preparation of merge */ + itup = tuplesort_getindextuple(btspool->sortstate, true); + itup2 = tuplesort_getindextuple(btspool2->sortstate, true); + + /* Prepare SortSupport data for each column */ + sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData)); + + for (i = 0; i < keysz; i++) + { + SortSupport sortKey = sortKeys + i; + ScanKey scanKey = wstate->inskey->scankeys + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Abbreviation is not supported here */ + sortKey->abbreviate = false; + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey); + } + + for (;;) + { + load1 = true; /* load BTSpool next ? 
*/ + if (itup2 == NULL) + { + if (itup == NULL) + break; + } + else if (itup != NULL) + { + int32 compare = 0; + + for (i = 1; i <= keysz; i++) + { + SortSupport entry; + Datum attrDatum1, + attrDatum2; + bool isNull1, + isNull2; + + entry = sortKeys + i - 1; + attrDatum1 = index_getattr(itup, i, tupdes, &isNull1); + attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2); + + compare = ApplySortComparator(attrDatum1, isNull1, + attrDatum2, isNull2, + entry); + if (compare > 0) + { + load1 = false; + break; + } + else if (compare < 0) + break; + } + + /* + * If key values are equal, we sort on ItemPointer. This is + * required for btree indexes, since heap TID is treated as an + * implicit last key attribute in order to ensure that all + * keys in the index are physically unique. + */ + if (compare == 0) + { + compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid); + Assert(compare != 0); + if (compare > 0) + load1 = false; + } + } + else + load1 = false; + + /* When we see first tuple, create first index page */ + if (state == NULL) + state = _bt_pagestate(wstate, 0); + + if (load1) + { + _bt_buildadd(wstate, state, itup, 0); + itup = tuplesort_getindextuple(btspool->sortstate, true); + } + else + { + _bt_buildadd(wstate, state, itup2, 0); + itup2 = tuplesort_getindextuple(btspool2->sortstate, true); + } + + /* Report progress */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, + ++tuples_done); + } + pfree(sortKeys); + } + else if (deduplicate) + { + /* merge is unnecessary, deduplicate into posting lists */ + BTDedupState dstate; + + dstate = (BTDedupState) palloc(sizeof(BTDedupStateData)); + dstate->deduplicate = true; /* unused */ + dstate->nmaxitems = 0; /* unused */ + dstate->maxpostingsize = 0; /* set later */ + /* Metadata about base tuple of current pending posting list */ + dstate->base = NULL; + dstate->baseoff = InvalidOffsetNumber; /* unused */ + dstate->basetupsize = 0; + /* Metadata about current pending posting list TIDs */ + dstate->htids = NULL; + dstate->nhtids = 0; + dstate->nitems = 0; + dstate->phystupsize = 0; /* unused */ + dstate->nintervals = 0; /* unused */ + + while ((itup = tuplesort_getindextuple(btspool->sortstate, + true)) != NULL) + { + /* When we see first tuple, create first index page */ + if (state == NULL) + { + state = _bt_pagestate(wstate, 0); + + /* + * Limit size of posting list tuples to 1/10 space we want to + * leave behind on the page, plus space for final item's line + * pointer. This is equal to the space that we'd like to + * leave behind on each leaf page when fillfactor is 90, + * allowing us to get close to fillfactor% space utilization + * when there happen to be a great many duplicates. (This + * makes higher leaf fillfactor settings ineffective when + * building indexes that have many duplicates, but packing + * leaf pages full with few very large tuples doesn't seem + * like a useful goal.) + */ + dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) - + sizeof(ItemIdData); + Assert(dstate->maxpostingsize <= BTMaxItemSize(state->btps_page) && + dstate->maxpostingsize <= INDEX_SIZE_MASK); + dstate->htids = palloc(dstate->maxpostingsize); + + /* start new pending posting list with itup copy */ + _bt_dedup_start_pending(dstate, CopyIndexTuple(itup), + InvalidOffsetNumber); + } + else if (_bt_keep_natts_fast(wstate->index, dstate->base, + itup) > keysz && + _bt_dedup_save_htid(dstate, itup)) + { + /* + * Tuple is equal to base tuple of pending posting list. Heap + * TID from itup has been saved in state. 
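+ *
+ * For example, ten successive sorted tuples with equal key values all
+ * end up in one pending posting list: the first becomes the base tuple
+ * and the other nine only contribute their heap TIDs, provided each
+ * _bt_dedup_save_htid() call finds that the result still fits within
+ * maxpostingsize.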
+ */ + } + else + { + /* + * Tuple is not equal to pending posting list tuple, or + * _bt_dedup_save_htid() opted to not merge current item into + * pending posting list. + */ + _bt_sort_dedup_finish_pending(wstate, state, dstate); + pfree(dstate->base); + + /* start new pending posting list with itup copy */ + _bt_dedup_start_pending(dstate, CopyIndexTuple(itup), + InvalidOffsetNumber); + } + + /* Report progress */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, + ++tuples_done); + } + + if (state) + { + /* + * Handle the last item (there must be a last item when the + * tuplesort returned one or more tuples) + */ + _bt_sort_dedup_finish_pending(wstate, state, dstate); + pfree(dstate->base); + pfree(dstate->htids); + } + + pfree(dstate); + } + else + { + /* merging and deduplication are both unnecessary */ + while ((itup = tuplesort_getindextuple(btspool->sortstate, + true)) != NULL) + { + /* When we see first tuple, create first index page */ + if (state == NULL) + state = _bt_pagestate(wstate, 0); + + _bt_buildadd(wstate, state, itup, 0); + + /* Report progress */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, + ++tuples_done); + } + } + + /* Close down final pages and write the metapage */ + _bt_uppershutdown(wstate, state); + + /* + * When we WAL-logged index pages, we must nonetheless fsync index files. + * Since we're building outside shared buffers, a CHECKPOINT occurring + * during the build has no way to flush the previously written data to + * disk (indeed it won't know the index even exists). A crash later on + * would replay WAL from the checkpoint, therefore it wouldn't replay our + * earlier WAL entries. If we do not fsync those pages here, they might + * still not be on disk when the crash occurs. + */ + if (wstate->btws_use_wal) + { + RelationOpenSmgr(wstate->index); + smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM); + } +} + +/* + * Create parallel context, and launch workers for leader. + * + * buildstate argument should be initialized (with the exception of the + * tuplesort state in spools, which may later be created based on shared + * state initially set up here). + * + * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY. + * + * request is the target number of parallel worker processes to launch. + * + * Sets buildstate's BTLeader, which caller must use to shut down parallel + * mode by passing it to _bt_end_parallel() at the very end of its index + * build. If not even a single worker process can be launched, this is + * never set, and caller should proceed with a serial index build. + */ +static void +_bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) +{ + ParallelContext *pcxt; + int scantuplesortstates; + Snapshot snapshot; + Size estbtshared; + Size estsort; + BTShared *btshared; + Sharedsort *sharedsort; + Sharedsort *sharedsort2; + BTSpool *btspool = buildstate->spool; + BTLeader *btleader = (BTLeader *) palloc0(sizeof(BTLeader)); + WalUsage *walusage; + BufferUsage *bufferusage; + bool leaderparticipates = true; + int querylen; + +#ifdef DISABLE_LEADER_PARTICIPATION + leaderparticipates = false; +#endif + + /* + * Enter parallel mode, and create context for parallel build of btree + * index + */ + EnterParallelMode(); + Assert(request > 0); + pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main", + request); + + scantuplesortstates = leaderparticipates ? request + 1 : request; + + /* + * Prepare for scan of the base relation. 
In a normal index build, we use + * SnapshotAny because we must retrieve all tuples and do our own time + * qual checks (because we have to index RECENTLY_DEAD tuples). In a + * concurrent build, we take a regular MVCC snapshot and index whatever's + * live according to that. + */ + if (!isconcurrent) + snapshot = SnapshotAny; + else + snapshot = RegisterSnapshot(GetTransactionSnapshot()); + + /* + * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and + * PARALLEL_KEY_TUPLESORT tuplesort workspace + */ + estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot); + shm_toc_estimate_chunk(&pcxt->estimator, estbtshared); + estsort = tuplesort_estimate_shared(scantuplesortstates); + shm_toc_estimate_chunk(&pcxt->estimator, estsort); + + /* + * Unique case requires a second spool, and so we may have to account for + * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2 + */ + if (!btspool->isunique) + shm_toc_estimate_keys(&pcxt->estimator, 2); + else + { + shm_toc_estimate_chunk(&pcxt->estimator, estsort); + shm_toc_estimate_keys(&pcxt->estimator, 3); + } + + /* + * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE + * and PARALLEL_KEY_BUFFER_USAGE. + * + * If there are no extensions loaded that care, we could skip this. We + * have no way of knowing whether anyone's looking at pgWalUsage or + * pgBufferUsage, so do it unconditionally. + */ + shm_toc_estimate_chunk(&pcxt->estimator, + mul_size(sizeof(WalUsage), pcxt->nworkers)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + shm_toc_estimate_chunk(&pcxt->estimator, + mul_size(sizeof(BufferUsage), pcxt->nworkers)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */ + if (debug_query_string) + { + querylen = strlen(debug_query_string); + shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1); + shm_toc_estimate_keys(&pcxt->estimator, 1); + } + else + querylen = 0; /* keep compiler quiet */ + + /* Everyone's had a chance to ask for space, so now create the DSM */ + InitializeParallelDSM(pcxt); + + /* If no DSM segment was available, back out (do serial build) */ + if (pcxt->seg == NULL) + { + if (IsMVCCSnapshot(snapshot)) + UnregisterSnapshot(snapshot); + DestroyParallelContext(pcxt); + ExitParallelMode(); + return; + } + + /* Store shared build state, for which we reserved space */ + btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared); + /* Initialize immutable state */ + btshared->heaprelid = RelationGetRelid(btspool->heap); + btshared->indexrelid = RelationGetRelid(btspool->index); + btshared->isunique = btspool->isunique; + btshared->isconcurrent = isconcurrent; + btshared->scantuplesortstates = scantuplesortstates; + ConditionVariableInit(&btshared->workersdonecv); + SpinLockInit(&btshared->mutex); + /* Initialize mutable state */ + btshared->nparticipantsdone = 0; + btshared->reltuples = 0.0; + btshared->havedead = false; + btshared->indtuples = 0.0; + btshared->brokenhotchain = false; + table_parallelscan_initialize(btspool->heap, + ParallelTableScanFromBTShared(btshared), + snapshot); + + /* + * Store shared tuplesort-private state, for which we reserved space. + * Then, initialize opaque state using tuplesort routine. 
+ */ + sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort); + tuplesort_initialize_shared(sharedsort, scantuplesortstates, + pcxt->seg); + + shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort); + + /* Unique case requires a second spool, and associated shared state */ + if (!btspool->isunique) + sharedsort2 = NULL; + else + { + /* + * Store additional shared tuplesort-private state, for which we + * reserved space. Then, initialize opaque state using tuplesort + * routine. + */ + sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort); + tuplesort_initialize_shared(sharedsort2, scantuplesortstates, + pcxt->seg); + + shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2); + } + + /* Store query string for workers */ + if (debug_query_string) + { + char *sharedquery; + + sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1); + memcpy(sharedquery, debug_query_string, querylen + 1); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery); + } + + /* + * Allocate space for each worker's WalUsage and BufferUsage; no need to + * initialize. + */ + walusage = shm_toc_allocate(pcxt->toc, + mul_size(sizeof(WalUsage), pcxt->nworkers)); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage); + bufferusage = shm_toc_allocate(pcxt->toc, + mul_size(sizeof(BufferUsage), pcxt->nworkers)); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage); + + /* Launch workers, saving status for leader/caller */ + LaunchParallelWorkers(pcxt); + btleader->pcxt = pcxt; + btleader->nparticipanttuplesorts = pcxt->nworkers_launched; + if (leaderparticipates) + btleader->nparticipanttuplesorts++; + btleader->btshared = btshared; + btleader->sharedsort = sharedsort; + btleader->sharedsort2 = sharedsort2; + btleader->snapshot = snapshot; + btleader->walusage = walusage; + btleader->bufferusage = bufferusage; + + /* If no workers were successfully launched, back out (do serial build) */ + if (pcxt->nworkers_launched == 0) + { + _bt_end_parallel(btleader); + return; + } + + /* Save leader state now that it's clear build will be parallel */ + buildstate->btleader = btleader; + + /* Join heap scan ourselves */ + if (leaderparticipates) + _bt_leader_participate_as_worker(buildstate); + + /* + * Caller needs to wait for all launched workers when we return. Make + * sure that the failure-to-start case will not hang forever. + */ + WaitForParallelWorkersToAttach(pcxt); +} + +/* + * Shut down workers, destroy parallel context, and end parallel mode. + */ +static void +_bt_end_parallel(BTLeader *btleader) +{ + int i; + + /* Shutdown worker processes */ + WaitForParallelWorkersToFinish(btleader->pcxt); + + /* + * Next, accumulate WAL usage. (This must wait for the workers to finish, + * or we might get incomplete data.) + */ + for (i = 0; i < btleader->pcxt->nworkers_launched; i++) + InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]); + + /* Free last reference to MVCC snapshot, if one was used */ + if (IsMVCCSnapshot(btleader->snapshot)) + UnregisterSnapshot(btleader->snapshot); + DestroyParallelContext(btleader->pcxt); + ExitParallelMode(); +} + +/* + * Returns size of shared memory required to store state for a parallel + * btree index build based on the snapshot its parallel scan will use. + */ +static Size +_bt_parallel_estimate_shared(Relation heap, Snapshot snapshot) +{ + /* c.f. 
shm_toc_allocate as to why BUFFERALIGN is used */ + return add_size(BUFFERALIGN(sizeof(BTShared)), + table_parallelscan_estimate(heap, snapshot)); +} + +/* + * Within leader, wait for end of heap scan. + * + * When called, parallel heap scan started by _bt_begin_parallel() will + * already be underway within worker processes (when leader participates + * as a worker, we should end up here just as workers are finishing). + * + * Fills in fields needed for ambuild statistics, and lets caller set + * field indicating that some worker encountered a broken HOT chain. + * + * Returns the total number of heap tuples scanned. + */ +static double +_bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain) +{ + BTShared *btshared = buildstate->btleader->btshared; + int nparticipanttuplesorts; + double reltuples; + + nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts; + for (;;) + { + SpinLockAcquire(&btshared->mutex); + if (btshared->nparticipantsdone == nparticipanttuplesorts) + { + buildstate->havedead = btshared->havedead; + buildstate->indtuples = btshared->indtuples; + *brokenhotchain = btshared->brokenhotchain; + reltuples = btshared->reltuples; + SpinLockRelease(&btshared->mutex); + break; + } + SpinLockRelease(&btshared->mutex); + + ConditionVariableSleep(&btshared->workersdonecv, + WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN); + } + + ConditionVariableCancelSleep(); + + return reltuples; +} + +/* + * Within leader, participate as a parallel worker. + */ +static void +_bt_leader_participate_as_worker(BTBuildState *buildstate) +{ + BTLeader *btleader = buildstate->btleader; + BTSpool *leaderworker; + BTSpool *leaderworker2; + int sortmem; + + /* Allocate memory and initialize private spool */ + leaderworker = (BTSpool *) palloc0(sizeof(BTSpool)); + leaderworker->heap = buildstate->spool->heap; + leaderworker->index = buildstate->spool->index; + leaderworker->isunique = buildstate->spool->isunique; + + /* Initialize second spool, if required */ + if (!btleader->btshared->isunique) + leaderworker2 = NULL; + else + { + /* Allocate memory for worker's own private secondary spool */ + leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool)); + + /* Initialize worker's own secondary spool */ + leaderworker2->heap = leaderworker->heap; + leaderworker2->index = leaderworker->index; + leaderworker2->isunique = false; + } + + /* + * Might as well use reliable figure when doling out maintenance_work_mem + * (when requested number of workers were not launched, this will be + * somewhat higher than it is for other workers). + */ + sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts; + + /* Perform work common to all participants */ + _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared, + btleader->sharedsort, btleader->sharedsort2, + sortmem, true); + +#ifdef BTREE_BUILD_STATS + if (log_btree_build_stats) + { + ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS"); + ResetUsage(); + } +#endif /* BTREE_BUILD_STATS */ +} + +/* + * Perform work within a launched parallel process. 
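+ *
+ * (This is the entry point that _bt_begin_parallel() registers with
+ * CreateParallelContext() by name; each worker reattaches to the shared
+ * state stored in the TOC and then runs the same
+ * _bt_parallel_scan_and_sort() path as the leader participant.)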
+ */ +void +_bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) +{ + char *sharedquery; + BTSpool *btspool; + BTSpool *btspool2; + BTShared *btshared; + Sharedsort *sharedsort; + Sharedsort *sharedsort2; + Relation heapRel; + Relation indexRel; + LOCKMODE heapLockmode; + LOCKMODE indexLockmode; + WalUsage *walusage; + BufferUsage *bufferusage; + int sortmem; + +#ifdef BTREE_BUILD_STATS + if (log_btree_build_stats) + ResetUsage(); +#endif /* BTREE_BUILD_STATS */ + + /* + * The only possible status flag that can be set to the parallel worker is + * PROC_IN_SAFE_IC. + */ + Assert((MyProc->statusFlags == 0) || + (MyProc->statusFlags == PROC_IN_SAFE_IC)); + + /* Set debug_query_string for individual workers first */ + sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true); + debug_query_string = sharedquery; + + /* Report the query string from leader */ + pgstat_report_activity(STATE_RUNNING, debug_query_string); + + /* Look up nbtree shared state */ + btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false); + + /* Open relations using lock modes known to be obtained by index.c */ + if (!btshared->isconcurrent) + { + heapLockmode = ShareLock; + indexLockmode = AccessExclusiveLock; + } + else + { + heapLockmode = ShareUpdateExclusiveLock; + indexLockmode = RowExclusiveLock; + } + + /* Open relations within worker */ + heapRel = table_open(btshared->heaprelid, heapLockmode); + indexRel = index_open(btshared->indexrelid, indexLockmode); + + /* Initialize worker's own spool */ + btspool = (BTSpool *) palloc0(sizeof(BTSpool)); + btspool->heap = heapRel; + btspool->index = indexRel; + btspool->isunique = btshared->isunique; + + /* Look up shared state private to tuplesort.c */ + sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false); + tuplesort_attach_shared(sharedsort, seg); + if (!btshared->isunique) + { + btspool2 = NULL; + sharedsort2 = NULL; + } + else + { + /* Allocate memory for worker's own private secondary spool */ + btspool2 = (BTSpool *) palloc0(sizeof(BTSpool)); + + /* Initialize worker's own secondary spool */ + btspool2->heap = btspool->heap; + btspool2->index = btspool->index; + btspool2->isunique = false; + /* Look up shared state private to tuplesort.c */ + sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false); + tuplesort_attach_shared(sharedsort2, seg); + } + + /* Prepare to track buffer usage during parallel execution */ + InstrStartParallelQuery(); + + /* Perform sorting of spool, and possibly a spool2 */ + sortmem = maintenance_work_mem / btshared->scantuplesortstates; + _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort, + sharedsort2, sortmem, false); + + /* Report WAL/buffer usage during parallel execution */ + bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); + walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); + InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], + &walusage[ParallelWorkerNumber]); + +#ifdef BTREE_BUILD_STATS + if (log_btree_build_stats) + { + ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS"); + ResetUsage(); + } +#endif /* BTREE_BUILD_STATS */ + + index_close(indexRel, indexLockmode); + table_close(heapRel, heapLockmode); +} + +/* + * Perform a worker's portion of a parallel sort. + * + * This generates a tuplesort for passed btspool, and a second tuplesort + * state if a second btspool is need (i.e. for unique index builds). All + * other spool fields should already be set when this is called. 
+ * + * sortmem is the amount of working memory to use within each worker, + * expressed in KBs. + * + * When this returns, workers are done, and need only release resources. + */ +static void +_bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, + BTShared *btshared, Sharedsort *sharedsort, + Sharedsort *sharedsort2, int sortmem, bool progress) +{ + SortCoordinate coordinate; + BTBuildState buildstate; + TableScanDesc scan; + double reltuples; + IndexInfo *indexInfo; + + /* Initialize local tuplesort coordination state */ + coordinate = palloc0(sizeof(SortCoordinateData)); + coordinate->isWorker = true; + coordinate->nParticipants = -1; + coordinate->sharedsort = sharedsort; + + /* Begin "partial" tuplesort */ + btspool->sortstate = tuplesort_begin_index_btree(btspool->heap, + btspool->index, + btspool->isunique, + sortmem, coordinate, + false); + + /* + * Just as with serial case, there may be a second spool. If so, a + * second, dedicated spool2 partial tuplesort is required. + */ + if (btspool2) + { + SortCoordinate coordinate2; + + /* + * We expect that the second one (for dead tuples) won't get very + * full, so we give it only work_mem (unless sortmem is less for + * worker). Worker processes are generally permitted to allocate + * work_mem independently. + */ + coordinate2 = palloc0(sizeof(SortCoordinateData)); + coordinate2->isWorker = true; + coordinate2->nParticipants = -1; + coordinate2->sharedsort = sharedsort2; + btspool2->sortstate = + tuplesort_begin_index_btree(btspool->heap, btspool->index, false, + Min(sortmem, work_mem), coordinate2, + false); + } + + /* Fill in buildstate for _bt_build_callback() */ + buildstate.isunique = btshared->isunique; + buildstate.havedead = false; + buildstate.heap = btspool->heap; + buildstate.spool = btspool; + buildstate.spool2 = btspool2; + buildstate.indtuples = 0; + buildstate.btleader = NULL; + + /* Join parallel scan */ + indexInfo = BuildIndexInfo(btspool->index); + indexInfo->ii_Concurrent = btshared->isconcurrent; + scan = table_beginscan_parallel(btspool->heap, + ParallelTableScanFromBTShared(btshared)); + reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo, + true, progress, _bt_build_callback, + (void *) &buildstate, scan); + + /* Execute this worker's part of the sort */ + if (progress) + pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, + PROGRESS_BTREE_PHASE_PERFORMSORT_1); + tuplesort_performsort(btspool->sortstate); + if (btspool2) + { + if (progress) + pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, + PROGRESS_BTREE_PHASE_PERFORMSORT_2); + tuplesort_performsort(btspool2->sortstate); + } + + /* + * Done. Record ambuild statistics, and whether we encountered a broken + * HOT chain. 
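+ *
+ * Each participant adds its own totals into the shared counters under
+ * btshared->mutex; the leader picks up the accumulated values in
+ * _bt_parallel_heapscan() once nparticipantsdone reaches the number of
+ * participating tuplesorts.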
+ */ + SpinLockAcquire(&btshared->mutex); + btshared->nparticipantsdone++; + btshared->reltuples += reltuples; + if (buildstate.havedead) + btshared->havedead = true; + btshared->indtuples += buildstate.indtuples; + if (indexInfo->ii_BrokenHotChain) + btshared->brokenhotchain = true; + SpinLockRelease(&btshared->mutex); + + /* Notify leader */ + ConditionVariableSignal(&btshared->workersdonecv); + + /* We can end tuplesorts immediately */ + tuplesort_end(btspool->sortstate); + if (btspool2) + tuplesort_end(btspool2->sortstate); +} diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c new file mode 100644 index 0000000..3485e93 --- /dev/null +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -0,0 +1,1190 @@ +/*------------------------------------------------------------------------- + * + * nbtsplitloc.c + * Choose split point code for Postgres btree implementation. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtsplitloc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/nbtree.h" +#include "storage/lmgr.h" + +typedef enum +{ + /* strategy for searching through materialized list of split points */ + SPLIT_DEFAULT, /* give some weight to truncation */ + SPLIT_MANY_DUPLICATES, /* find minimally distinguishing point */ + SPLIT_SINGLE_VALUE /* leave left page almost full */ +} FindSplitStrat; + +typedef struct +{ + /* details of free space left by split */ + int16 curdelta; /* current leftfree/rightfree delta */ + int16 leftfree; /* space left on left page post-split */ + int16 rightfree; /* space left on right page post-split */ + + /* split point identifying fields (returned by _bt_findsplitloc) */ + OffsetNumber firstrightoff; /* first origpage item on rightpage */ + bool newitemonleft; /* new item goes on left, or right? 
*/ + +} SplitPoint; + +typedef struct +{ + /* context data for _bt_recsplitloc */ + Relation rel; /* index relation */ + Page origpage; /* page undergoing split */ + IndexTuple newitem; /* new item (cause of page split) */ + Size newitemsz; /* size of newitem (includes line pointer) */ + bool is_leaf; /* T if splitting a leaf page */ + bool is_rightmost; /* T if splitting rightmost page on level */ + OffsetNumber newitemoff; /* where the new item is to be inserted */ + int leftspace; /* space available for items on left page */ + int rightspace; /* space available for items on right page */ + int olddataitemstotal; /* space taken by old items */ + Size minfirstrightsz; /* smallest firstright size */ + + /* candidate split point data */ + int maxsplits; /* maximum number of splits */ + int nsplits; /* current number of splits */ + SplitPoint *splits; /* all candidate split points for page */ + int interval; /* current range of acceptable split points */ +} FindSplitData; + +static void _bt_recsplitloc(FindSplitData *state, + OffsetNumber firstrightoff, bool newitemonleft, + int olddataitemstoleft, + Size firstrightofforigpagetuplesz); +static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult, + bool usemult); +static int _bt_splitcmp(const void *arg1, const void *arg2); +static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, + int leaffillfactor, bool *usemult); +static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid); +static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, + bool *newitemonleft, FindSplitStrat strategy); +static int _bt_defaultinterval(FindSplitData *state); +static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage, + SplitPoint *rightpage, FindSplitStrat *strategy); +static void _bt_interval_edges(FindSplitData *state, + SplitPoint **leftinterval, SplitPoint **rightinterval); +static inline int _bt_split_penalty(FindSplitData *state, SplitPoint *split); +static inline IndexTuple _bt_split_lastleft(FindSplitData *state, + SplitPoint *split); +static inline IndexTuple _bt_split_firstright(FindSplitData *state, + SplitPoint *split); + + +/* + * _bt_findsplitloc() -- find an appropriate place to split a page. + * + * The main goal here is to equalize the free space that will be on each + * split page, *after accounting for the inserted tuple*. (If we fail to + * account for it, we might find ourselves with too little room on the page + * that it needs to go into!) + * + * If the page is the rightmost page on its level, we instead try to arrange + * to leave the left split page fillfactor% full. In this way, when we are + * inserting successively increasing keys (consider sequences, timestamps, + * etc) we will end up with a tree whose pages are about fillfactor% full, + * instead of the 50% full result that we'd get without this special case. + * This is the same as nbtsort.c produces for a newly-created tree. Note + * that leaf and nonleaf pages use different fillfactors. Note also that + * there are a number of further special cases where fillfactor is not + * applied in the standard way. + * + * We are passed the intended insert position of the new tuple, expressed as + * the offsetnumber of the tuple it must go in front of (this could be + * maxoff+1 if the tuple is to go at the end). The new tuple itself is also + * passed, since it's needed to give some weight to how effective suffix + * truncation will be. 
The implementation picks the split point that
+ * maximizes the effectiveness of suffix truncation from a small list of
+ * alternative candidate split points that leave each side of the split with
+ * about the same share of free space. Suffix truncation is secondary to
+ * equalizing free space, except in cases with large numbers of duplicates.
+ * Note that it is always assumed that caller goes on to perform truncation,
+ * even with pg_upgrade'd indexes where that isn't actually the case
+ * (!heapkeyspace indexes). See nbtree/README for more information about
+ * suffix truncation.
+ *
+ * We return the index of the first existing tuple that should go on the
+ * righthand page (which is called firstrightoff), plus a boolean
+ * indicating whether the new tuple goes on the left or right page. You
+ * can think of the returned state as a point _between_ two adjacent data
+ * items (lastleft and firstright data items) on an imaginary version of
+ * origpage that already includes newitem. The bool is necessary to
+ * disambiguate the case where firstrightoff == newitemoff (i.e. it is
+ * sometimes needed to determine if the firstright tuple for the split is
+ * newitem rather than the tuple from origpage at offset firstrightoff).
+ */
+OffsetNumber
+_bt_findsplitloc(Relation rel,
+ Page origpage,
+ OffsetNumber newitemoff,
+ Size newitemsz,
+ IndexTuple newitem,
+ bool *newitemonleft)
+{
+ BTPageOpaque opaque;
+ int leftspace,
+ rightspace,
+ olddataitemstotal,
+ olddataitemstoleft,
+ perfectpenalty,
+ leaffillfactor;
+ FindSplitData state;
+ FindSplitStrat strategy;
+ ItemId itemid;
+ OffsetNumber offnum,
+ maxoff,
+ firstrightoff;
+ double fillfactormult;
+ bool usemult;
+ SplitPoint leftpage,
+ rightpage;
+
+ opaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
+ maxoff = PageGetMaxOffsetNumber(origpage);
+
+ /* Total free space available on a btree page, after fixed overhead */
+ leftspace = rightspace =
+ PageGetPageSize(origpage) - SizeOfPageHeaderData -
+ MAXALIGN(sizeof(BTPageOpaqueData));
+
+ /* The right page will have the same high key as the old page */
+ if (!P_RIGHTMOST(opaque))
+ {
+ itemid = PageGetItemId(origpage, P_HIKEY);
+ rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
+ sizeof(ItemIdData));
+ }
+
+ /* Count up total space in data items before actually scanning 'em */
+ olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(origpage);
+ leaffillfactor = BTGetFillFactor(rel);
+
+ /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
+ newitemsz += sizeof(ItemIdData);
+ state.rel = rel;
+ state.origpage = origpage;
+ state.newitem = newitem;
+ state.newitemsz = newitemsz;
+ state.is_leaf = P_ISLEAF(opaque);
+ state.is_rightmost = P_RIGHTMOST(opaque);
+ state.leftspace = leftspace;
+ state.rightspace = rightspace;
+ state.olddataitemstotal = olddataitemstotal;
+ state.minfirstrightsz = SIZE_MAX;
+ state.newitemoff = newitemoff;
+
+ /* newitem cannot be a posting list item */
+ Assert(!BTreeTupleIsPosting(newitem));
+
+ /*
+ * nsplits should never exceed maxoff because there will be at most as
+ * many candidate split points as there are points _between_ tuples, once
+ * you imagine that the new item is already on the original page (the
+ * final number of splits may be slightly lower because not all points
+ * between tuples will be legal).
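+ *
+ * For example, with maxoff = 100 existing items plus the incoming new
+ * item there are 101 imaginary tuples on the page, and therefore 100
+ * points between adjacent tuples, so at most 100 candidate split points
+ * get recorded below.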
+ */ + state.maxsplits = maxoff; + state.splits = palloc(sizeof(SplitPoint) * state.maxsplits); + state.nsplits = 0; + + /* + * Scan through the data items and calculate space usage for a split at + * each possible position + */ + olddataitemstoleft = 0; + + for (offnum = P_FIRSTDATAKEY(opaque); + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + Size itemsz; + + itemid = PageGetItemId(origpage, offnum); + itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData); + + /* + * When item offset number is not newitemoff, neither side of the + * split can be newitem. Record a split after the previous data item + * from original page, but before the current data item from original + * page. (_bt_recsplitloc() will reject the split when there are no + * previous items, which we rely on.) + */ + if (offnum < newitemoff) + _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz); + else if (offnum > newitemoff) + _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz); + else + { + /* + * Record a split after all "offnum < newitemoff" original page + * data items, but before newitem + */ + _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz); + + /* + * Record a split after newitem, but before data item from + * original page at offset newitemoff/current offset + */ + _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz); + } + + olddataitemstoleft += itemsz; + } + + /* + * Record a split after all original page data items, but before newitem. + * (Though only when it's possible that newitem will end up alone on new + * right page.) + */ + Assert(olddataitemstoleft == olddataitemstotal); + if (newitemoff > maxoff) + _bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0); + + /* + * I believe it is not possible to fail to find a feasible split, but just + * in case ... + */ + if (state.nsplits == 0) + elog(ERROR, "could not find a feasible split point for index \"%s\"", + RelationGetRelationName(rel)); + + /* + * Start search for a split point among list of legal split points. Give + * primary consideration to equalizing available free space in each half + * of the split initially (start with default strategy), while applying + * rightmost and split-after-new-item optimizations where appropriate. + * Either of the two other fallback strategies may be required for cases + * with a large number of duplicates around the original/space-optimal + * split point. + * + * Default strategy gives some weight to suffix truncation in deciding a + * split point on leaf pages. It attempts to select a split point where a + * distinguishing attribute appears earlier in the new high key for the + * left side of the split, in order to maximize the number of trailing + * attributes that can be truncated away. Only candidate split points + * that imply an acceptable balance of free space on each side are + * considered. See _bt_defaultinterval(). 
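+ *
+ * To give a rough illustration of the fillfactor handling below
+ * (assuming the default leaf fillfactor of 90): a rightmost leaf page
+ * split uses fillfactormult 0.90 and so leaves the left half roughly 90%
+ * full, whereas a non-rightmost leaf split that doesn't qualify for the
+ * split-after-new-item optimization just aims for an even split of free
+ * space.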
+ */
+ if (!state.is_leaf)
+ {
+ /* fillfactormult only used on rightmost page */
+ usemult = state.is_rightmost;
+ fillfactormult = BTREE_NONLEAF_FILLFACTOR / 100.0;
+ }
+ else if (state.is_rightmost)
+ {
+ /* Rightmost leaf page -- fillfactormult always used */
+ usemult = true;
+ fillfactormult = leaffillfactor / 100.0;
+ }
+ else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult))
+ {
+ /*
+ * New item inserted at rightmost point among a localized grouping on
+ * a leaf page -- apply "split after new item" optimization, either by
+ * applying leaf fillfactor multiplier, or by choosing the exact split
+ * point that leaves newitem as lastleft. (usemult is set for us.)
+ */
+ if (usemult)
+ {
+ /* fillfactormult should be set based on leaf fillfactor */
+ fillfactormult = leaffillfactor / 100.0;
+ }
+ else
+ {
+ /* find precise split point after newitemoff */
+ for (int i = 0; i < state.nsplits; i++)
+ {
+ SplitPoint *split = state.splits + i;
+
+ if (split->newitemonleft &&
+ newitemoff == split->firstrightoff)
+ {
+ pfree(state.splits);
+ *newitemonleft = true;
+ return newitemoff;
+ }
+ }
+
+ /*
+ * Cannot legally split after newitemoff; proceed with split
+ * without using fillfactor multiplier. This is defensive, and
+ * should never be needed in practice.
+ */
+ fillfactormult = 0.50;
+ }
+ }
+ else
+ {
+ /* Other leaf page. 50:50 page split. */
+ usemult = false;
+ /* fillfactormult not used, but be tidy */
+ fillfactormult = 0.50;
+ }
+
+ /*
+ * Save leftmost and rightmost splits for page before original ordinal
+ * sort order is lost by delta/fillfactormult sort
+ */
+ leftpage = state.splits[0];
+ rightpage = state.splits[state.nsplits - 1];
+
+ /* Give split points a fillfactormult-wise delta, and sort on deltas */
+ _bt_deltasortsplits(&state, fillfactormult, usemult);
+
+ /* Determine split interval for default strategy */
+ state.interval = _bt_defaultinterval(&state);
+
+ /*
+ * Determine if default strategy/split interval will produce a
+ * sufficiently distinguishing split, or if we should change strategies.
+ * Alternative strategies change the range of split points that are
+ * considered acceptable (split interval), and possibly change
+ * fillfactormult, in order to deal with pages with a large number of
+ * duplicates gracefully.
+ *
+ * Pass low and high splits for the entire page (actually, they're for an
+ * imaginary version of the page that includes newitem). These are used
+ * when the initial split interval encloses split points that are full of
+ * duplicates, and we need to consider if it's even possible to avoid
+ * appending a heap TID.
+ */
+ perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy);
+
+ if (strategy == SPLIT_DEFAULT)
+ {
+ /*
+ * Default strategy worked out (always works out with internal page).
+ * Original split interval still stands.
+ */
+ }
+
+ /*
+ * Many duplicates strategy is used when a heap TID would otherwise be
+ * appended, but the page isn't completely full of logical duplicates.
+ *
+ * The split interval is widened to include all legal candidate split
+ * points. There might be as few as two distinct values in the whole-page
+ * split interval, though it's also possible that most of the values on
+ * the page are unique.
The final split point will either be to the + * immediate left or to the immediate right of the group of duplicate + * tuples that enclose the first/delta-optimal split point (perfect + * penalty was set so that the lowest delta split point that avoids + * appending a heap TID will be chosen). Maximizing the number of + * attributes that can be truncated away is not a goal of the many + * duplicates strategy. + * + * Single value strategy is used when it is impossible to avoid appending + * a heap TID. It arranges to leave the left page very full. This + * maximizes space utilization in cases where tuples with the same + * attribute values span many pages. Newly inserted duplicates will tend + * to have higher heap TID values, so we'll end up splitting to the right + * consistently. (Single value strategy is harmless though not + * particularly useful with !heapkeyspace indexes.) + */ + else if (strategy == SPLIT_MANY_DUPLICATES) + { + Assert(state.is_leaf); + /* Shouldn't try to truncate away extra user attributes */ + Assert(perfectpenalty == + IndexRelationGetNumberOfKeyAttributes(state.rel)); + /* No need to resort splits -- no change in fillfactormult/deltas */ + state.interval = state.nsplits; + } + else if (strategy == SPLIT_SINGLE_VALUE) + { + Assert(state.is_leaf); + /* Split near the end of the page */ + usemult = true; + fillfactormult = BTREE_SINGLEVAL_FILLFACTOR / 100.0; + /* Resort split points with new delta */ + _bt_deltasortsplits(&state, fillfactormult, usemult); + /* Appending a heap TID is unavoidable, so interval of 1 is fine */ + state.interval = 1; + } + + /* + * Search among acceptable split points (using final split interval) for + * the entry that has the lowest penalty, and is therefore expected to + * maximize fan-out. Sets *newitemonleft for us. + */ + firstrightoff = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft, + strategy); + pfree(state.splits); + + return firstrightoff; +} + +/* + * Subroutine to record a particular point between two tuples (possibly the + * new item) on page (ie, combination of firstrightoff and newitemonleft + * settings) in *state for later analysis. This is also a convenient point to + * check if the split is legal (if it isn't, it won't be recorded). + * + * firstrightoff is the offset of the first item on the original page that + * goes to the right page, and firstrightofforigpagetuplesz is the size of + * that tuple. firstrightoff can be > max offset, which means that all the + * old items go to the left page and only the new item goes to the right page. + * We don't actually use firstrightofforigpagetuplesz in that case (actually, + * we don't use it for _any_ split where the firstright tuple happens to be + * newitem). + * + * olddataitemstoleft is the total size of all old items to the left of the + * split point that is recorded here when legal. Should not include + * newitemsz, since that is handled here. + */ +static void +_bt_recsplitloc(FindSplitData *state, + OffsetNumber firstrightoff, + bool newitemonleft, + int olddataitemstoleft, + Size firstrightofforigpagetuplesz) +{ + int16 leftfree, + rightfree; + Size firstrightsz; + Size postingsz = 0; + bool newitemisfirstright; + + /* Is the new item going to be split point's firstright tuple? 
*/ + newitemisfirstright = (firstrightoff == state->newitemoff && + !newitemonleft); + + if (newitemisfirstright) + firstrightsz = state->newitemsz; + else + { + firstrightsz = firstrightofforigpagetuplesz; + + /* + * Calculate suffix truncation space saving when firstright tuple is a + * posting list tuple, though only when the tuple is over 64 bytes + * including line pointer overhead (arbitrary). This avoids accessing + * the tuple in cases where its posting list must be very small (if + * tuple has one at all). + * + * Note: We don't do this in the case where firstright tuple is + * newitem, since newitem cannot have a posting list. + */ + if (state->is_leaf && firstrightsz > 64) + { + ItemId itemid; + IndexTuple newhighkey; + + itemid = PageGetItemId(state->origpage, firstrightoff); + newhighkey = (IndexTuple) PageGetItem(state->origpage, itemid); + + if (BTreeTupleIsPosting(newhighkey)) + postingsz = IndexTupleSize(newhighkey) - + BTreeTupleGetPostingOffset(newhighkey); + } + } + + /* Account for all the old tuples */ + leftfree = state->leftspace - olddataitemstoleft; + rightfree = state->rightspace - + (state->olddataitemstotal - olddataitemstoleft); + + /* + * The first item on the right page becomes the high key of the left page; + * therefore it counts against left space as well as right space (we + * cannot assume that suffix truncation will make it any smaller). When + * index has included attributes, then those attributes of left page high + * key will be truncated leaving that page with slightly more free space. + * However, that shouldn't affect our ability to find valid split + * location, since we err in the direction of being pessimistic about free + * space on the left half. Besides, even when suffix truncation of + * non-TID attributes occurs, the new high key often won't even be a + * single MAXALIGN() quantum smaller than the firstright tuple it's based + * on. + * + * If we are on the leaf level, assume that suffix truncation cannot avoid + * adding a heap TID to the left half's new high key when splitting at the + * leaf level. In practice the new high key will often be smaller and + * will rarely be larger, but conservatively assume the worst case. We do + * go to the trouble of subtracting away posting list overhead, though + * only when it looks like it will make an appreciable difference. + * (Posting lists are the only case where truncation will typically make + * the final high key far smaller than firstright, so being a bit more + * precise there noticeably improves the balance of free space.) + */ + if (state->is_leaf) + leftfree -= (int16) (firstrightsz + + MAXALIGN(sizeof(ItemPointerData)) - + postingsz); + else + leftfree -= (int16) firstrightsz; + + /* account for the new item */ + if (newitemonleft) + leftfree -= (int16) state->newitemsz; + else + rightfree -= (int16) state->newitemsz; + + /* + * If we are not on the leaf level, we will be able to discard the key + * data from the first item that winds up on the right page. 
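+ * Its replacement is a downlink-only ("negative infinity") pivot tuple,
+ * so the right page is ultimately charged only for a bare tuple header,
+ * MAXALIGN(sizeof(IndexTupleData)), plus a line pointer -- which is what
+ * the adjustment below credits back to rightfree.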
+ */ + if (!state->is_leaf) + rightfree += (int16) firstrightsz - + (int16) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData)); + + /* Record split if legal */ + if (leftfree >= 0 && rightfree >= 0) + { + Assert(state->nsplits < state->maxsplits); + + /* Determine smallest firstright tuple size among legal splits */ + state->minfirstrightsz = Min(state->minfirstrightsz, firstrightsz); + + state->splits[state->nsplits].curdelta = 0; + state->splits[state->nsplits].leftfree = leftfree; + state->splits[state->nsplits].rightfree = rightfree; + state->splits[state->nsplits].firstrightoff = firstrightoff; + state->splits[state->nsplits].newitemonleft = newitemonleft; + state->nsplits++; + } +} + +/* + * Subroutine to assign space deltas to materialized array of candidate split + * points based on current fillfactor, and to sort array using that fillfactor + */ +static void +_bt_deltasortsplits(FindSplitData *state, double fillfactormult, + bool usemult) +{ + for (int i = 0; i < state->nsplits; i++) + { + SplitPoint *split = state->splits + i; + int16 delta; + + if (usemult) + delta = fillfactormult * split->leftfree - + (1.0 - fillfactormult) * split->rightfree; + else + delta = split->leftfree - split->rightfree; + + if (delta < 0) + delta = -delta; + + /* Save delta */ + split->curdelta = delta; + } + + qsort(state->splits, state->nsplits, sizeof(SplitPoint), _bt_splitcmp); +} + +/* + * qsort-style comparator used by _bt_deltasortsplits() + */ +static int +_bt_splitcmp(const void *arg1, const void *arg2) +{ + SplitPoint *split1 = (SplitPoint *) arg1; + SplitPoint *split2 = (SplitPoint *) arg2; + + if (split1->curdelta > split2->curdelta) + return 1; + if (split1->curdelta < split2->curdelta) + return -1; + + return 0; +} + +/* + * Subroutine to determine whether or not a non-rightmost leaf page should be + * split immediately after the would-be original page offset for the + * new/incoming tuple (or should have leaf fillfactor applied when new item is + * to the right on original page). This is appropriate when there is a + * pattern of localized monotonically increasing insertions into a composite + * index, where leading attribute values form local groupings, and we + * anticipate further insertions of the same/current grouping (new item's + * grouping) in the near future. This can be thought of as a variation on + * applying leaf fillfactor during rightmost leaf page splits, since cases + * that benefit will converge on packing leaf pages leaffillfactor% full over + * time. + * + * We may leave extra free space remaining on the rightmost page of a "most + * significant column" grouping of tuples if that grouping never ends up + * having future insertions that use the free space. That effect is + * self-limiting; a future grouping that becomes the "nearest on the right" + * grouping of the affected grouping usually puts the extra free space to good + * use. + * + * Caller uses optimization when routine returns true, though the exact action + * taken by caller varies. Caller uses original leaf page fillfactor in + * standard way rather than using the new item offset directly when *usemult + * was also set to true here. Otherwise, caller applies optimization by + * locating the legal split point that makes the new tuple the lastleft tuple + * for the split. 
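+ *
+ * As a purely hypothetical illustration (not a schema referenced by the
+ * code): an index on (customer_id, order_ts) that receives a burst of
+ * orders for one customer at a time fits this pattern -- each customer's
+ * tuples form a local grouping, and new tuples keep arriving at the
+ * rightmost point of the current grouping.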
+ */ +static bool +_bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, + int leaffillfactor, bool *usemult) +{ + int16 nkeyatts; + ItemId itemid; + IndexTuple tup; + int keepnatts; + + Assert(state->is_leaf && !state->is_rightmost); + + nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel); + + /* Single key indexes not considered here */ + if (nkeyatts == 1) + return false; + + /* Ascending insertion pattern never inferred when new item is first */ + if (state->newitemoff == P_FIRSTKEY) + return false; + + /* + * Only apply optimization on pages with equisized tuples, since ordinal + * keys are likely to be fixed-width. Testing if the new tuple is + * variable width directly might also work, but that fails to apply the + * optimization to indexes with a numeric_ops attribute. + * + * Conclude that page has equisized tuples when the new item is the same + * width as the smallest item observed during pass over page, and other + * non-pivot tuples must be the same width as well. (Note that the + * possibly-truncated existing high key isn't counted in + * olddataitemstotal, and must be subtracted from maxoff.) + */ + if (state->newitemsz != state->minfirstrightsz) + return false; + if (state->newitemsz * (maxoff - 1) != state->olddataitemstotal) + return false; + + /* + * Avoid applying optimization when tuples are wider than a tuple + * consisting of two non-NULL int8/int64 attributes (or four non-NULL + * int4/int32 attributes) + */ + if (state->newitemsz > + MAXALIGN(sizeof(IndexTupleData) + sizeof(int64) * 2) + + sizeof(ItemIdData)) + return false; + + /* + * At least the first attribute's value must be equal to the corresponding + * value in previous tuple to apply optimization. New item cannot be a + * duplicate, either. + * + * Handle case where new item is to the right of all items on the existing + * page. This is suggestive of monotonically increasing insertions in + * itself, so the "heap TID adjacency" test is not applied here. + */ + if (state->newitemoff > maxoff) + { + itemid = PageGetItemId(state->origpage, maxoff); + tup = (IndexTuple) PageGetItem(state->origpage, itemid); + keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); + + if (keepnatts > 1 && keepnatts <= nkeyatts) + { + *usemult = true; + return true; + } + + return false; + } + + /* + * "Low cardinality leading column, high cardinality suffix column" + * indexes with a random insertion pattern (e.g., an index with a boolean + * column, such as an index on '(book_is_in_print, book_isbn)') present us + * with a risk of consistently misapplying the optimization. We're + * willing to accept very occasional misapplication of the optimization, + * provided the cases where we get it wrong are rare and self-limiting. + * + * Heap TID adjacency strongly suggests that the item just to the left was + * inserted very recently, which limits overapplication of the + * optimization. Besides, all inappropriate cases triggered here will + * still split in the middle of the page on average. 
+ */ + itemid = PageGetItemId(state->origpage, OffsetNumberPrev(state->newitemoff)); + tup = (IndexTuple) PageGetItem(state->origpage, itemid); + /* Do cheaper test first */ + if (BTreeTupleIsPosting(tup) || + !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) + return false; + /* Check same conditions as rightmost item case, too */ + keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); + + if (keepnatts > 1 && keepnatts <= nkeyatts) + { + double interp = (double) state->newitemoff / ((double) maxoff + 1); + double leaffillfactormult = (double) leaffillfactor / 100.0; + + /* + * Don't allow caller to split after a new item when it will result in + * a split point to the right of the point that a leaf fillfactor + * split would use -- have caller apply leaf fillfactor instead + */ + *usemult = interp > leaffillfactormult; + + return true; + } + + return false; +} + +/* + * Subroutine for determining if two heap TIDS are "adjacent". + * + * Adjacent means that the high TID is very likely to have been inserted into + * heap relation immediately after the low TID, probably during the current + * transaction. + */ +static bool +_bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid) +{ + BlockNumber lowblk, + highblk; + + lowblk = ItemPointerGetBlockNumber(lowhtid); + highblk = ItemPointerGetBlockNumber(highhtid); + + /* Make optimistic assumption of adjacency when heap blocks match */ + if (lowblk == highblk) + return true; + + /* When heap block one up, second offset should be FirstOffsetNumber */ + if (lowblk + 1 == highblk && + ItemPointerGetOffsetNumber(highhtid) == FirstOffsetNumber) + return true; + + return false; +} + +/* + * Subroutine to find the "best" split point among candidate split points. + * The best split point is the split point with the lowest penalty among split + * points that fall within current/final split interval. Penalty is an + * abstract score, with a definition that varies depending on whether we're + * splitting a leaf page or an internal page. See _bt_split_penalty() for + * details. + * + * "perfectpenalty" is assumed to be the lowest possible penalty among + * candidate split points. This allows us to return early without wasting + * cycles on calculating the first differing attribute for all candidate + * splits when that clearly cannot improve our choice (or when we only want a + * minimally distinguishing split point, and don't want to make the split any + * more unbalanced than is necessary). + * + * We return the index of the first existing tuple that should go on the right + * page, plus a boolean indicating if new item is on left of split point. + */ +static OffsetNumber +_bt_bestsplitloc(FindSplitData *state, int perfectpenalty, + bool *newitemonleft, FindSplitStrat strategy) +{ + int bestpenalty, + lowsplit; + int highsplit = Min(state->interval, state->nsplits); + SplitPoint *final; + + bestpenalty = INT_MAX; + lowsplit = 0; + for (int i = lowsplit; i < highsplit; i++) + { + int penalty; + + penalty = _bt_split_penalty(state, state->splits + i); + + if (penalty < bestpenalty) + { + bestpenalty = penalty; + lowsplit = i; + } + + if (penalty <= perfectpenalty) + break; + } + + final = &state->splits[lowsplit]; + + /* + * There is a risk that the "many duplicates" strategy will repeatedly do + * the wrong thing when there are monotonically decreasing insertions to + * the right of a large group of duplicates. Repeated splits could leave + * a succession of right half pages with free space that can never be + * used. 
This must be avoided. + * + * Consider the example of the leftmost page in a single integer attribute + * NULLS FIRST index which is almost filled with NULLs. Monotonically + * decreasing integer insertions might cause the same leftmost page to + * split repeatedly at the same point. Each split derives its new high + * key from the lowest current value to the immediate right of the large + * group of NULLs, which will always be higher than all future integer + * insertions, directing all future integer insertions to the same + * leftmost page. + */ + if (strategy == SPLIT_MANY_DUPLICATES && !state->is_rightmost && + !final->newitemonleft && final->firstrightoff >= state->newitemoff && + final->firstrightoff < state->newitemoff + 9) + { + /* + * Avoid the problem by performing a 50:50 split when the new item is + * just to the right of the would-be "many duplicates" split point. + * (Note that the test used for an insert that is "just to the right" + * of the split point is conservative.) + */ + final = &state->splits[0]; + } + + *newitemonleft = final->newitemonleft; + return final->firstrightoff; +} + +#define LEAF_SPLIT_DISTANCE 0.050 +#define INTERNAL_SPLIT_DISTANCE 0.075 + +/* + * Return a split interval to use for the default strategy. This is a limit + * on the number of candidate split points to give further consideration to. + * Only a fraction of all candidate splits points (those located at the start + * of the now-sorted splits array) fall within the split interval. Split + * interval is applied within _bt_bestsplitloc(). + * + * Split interval represents an acceptable range of split points -- those that + * have leftfree and rightfree values that are acceptably balanced. The final + * split point chosen is the split point with the lowest "penalty" among split + * points in this split interval (unless we change our entire strategy, in + * which case the interval also changes -- see _bt_strategy()). + * + * The "Prefix B-Trees" paper calls split interval sigma l for leaf splits, + * and sigma b for internal ("branch") splits. It's hard to provide a + * theoretical justification for the size of the split interval, though it's + * clear that a small split interval can make tuples on level L+1 much smaller + * on average, without noticeably affecting space utilization on level L. + * (Note that the way that we calculate split interval might need to change if + * suffix truncation is taught to truncate tuples "within" the last + * attribute/datum for data types like text, which is more or less how it is + * assumed to work in the paper.) + */ +static int +_bt_defaultinterval(FindSplitData *state) +{ + SplitPoint *spaceoptimal; + int16 tolerance, + lowleftfree, + lowrightfree, + highleftfree, + highrightfree; + + /* + * Determine leftfree and rightfree values that are higher and lower than + * we're willing to tolerate. Note that the final split interval will be + * about 10% of nsplits in the common case where all non-pivot tuples + * (data items) from a leaf page are uniformly sized. We're a bit more + * aggressive when splitting internal pages. 
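+	 *
+	 * (Illustrative note added for this listing: with LEAF_SPLIT_DISTANCE
+	 * of 0.050 and, say, 8,000 bytes of existing data items, tolerance
+	 * works out to 400 bytes.  Any candidate split whose leftfree or
+	 * rightfree strays more than 400 bytes from the most balanced split's
+	 * values is cut off from the default interval by the loop below.)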
+ */ + if (state->is_leaf) + tolerance = state->olddataitemstotal * LEAF_SPLIT_DISTANCE; + else + tolerance = state->olddataitemstotal * INTERNAL_SPLIT_DISTANCE; + + /* First candidate split point is the most evenly balanced */ + spaceoptimal = state->splits; + lowleftfree = spaceoptimal->leftfree - tolerance; + lowrightfree = spaceoptimal->rightfree - tolerance; + highleftfree = spaceoptimal->leftfree + tolerance; + highrightfree = spaceoptimal->rightfree + tolerance; + + /* + * Iterate through split points, starting from the split immediately after + * 'spaceoptimal'. Find the first split point that divides free space so + * unevenly that including it in the split interval would be unacceptable. + */ + for (int i = 1; i < state->nsplits; i++) + { + SplitPoint *split = state->splits + i; + + /* Cannot use curdelta here, since its value is often weighted */ + if (split->leftfree < lowleftfree || split->rightfree < lowrightfree || + split->leftfree > highleftfree || split->rightfree > highrightfree) + return i; + } + + return state->nsplits; +} + +/* + * Subroutine to decide whether split should use default strategy/initial + * split interval, or whether it should finish splitting the page using + * alternative strategies (this is only possible with leaf pages). + * + * Caller uses alternative strategy (or sticks with default strategy) based + * on how *strategy is set here. Return value is "perfect penalty", which is + * passed to _bt_bestsplitloc() as a final constraint on how far caller is + * willing to go to avoid appending a heap TID when using the many duplicates + * strategy (it also saves _bt_bestsplitloc() useless cycles). + */ +static int +_bt_strategy(FindSplitData *state, SplitPoint *leftpage, + SplitPoint *rightpage, FindSplitStrat *strategy) +{ + IndexTuple leftmost, + rightmost; + SplitPoint *leftinterval, + *rightinterval; + int perfectpenalty; + int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel); + + /* Assume that alternative strategy won't be used for now */ + *strategy = SPLIT_DEFAULT; + + /* + * Use smallest observed firstright item size for entire page (actually, + * entire imaginary version of page that includes newitem) as perfect + * penalty on internal pages. This can save cycles in the common case + * where most or all splits (not just splits within interval) have + * firstright tuples that are the same size. + */ + if (!state->is_leaf) + return state->minfirstrightsz; + + /* + * Use leftmost and rightmost tuples from leftmost and rightmost splits in + * current split interval + */ + _bt_interval_edges(state, &leftinterval, &rightinterval); + leftmost = _bt_split_lastleft(state, leftinterval); + rightmost = _bt_split_firstright(state, rightinterval); + + /* + * If initial split interval can produce a split point that will at least + * avoid appending a heap TID in new high key, we're done. Finish split + * with default strategy and initial split interval. + */ + perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); + if (perfectpenalty <= indnkeyatts) + return perfectpenalty; + + /* + * Work out how caller should finish split when even their "perfect" + * penalty for initial/default split interval indicates that the interval + * does not contain even a single split that avoids appending a heap TID. + * + * Use the leftmost split's lastleft tuple and the rightmost split's + * firstright tuple to assess every possible split. 
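+	 *
+	 * (Illustrative note added for this listing: with two key columns,
+	 * indnkeyatts is 2, and _bt_keep_natts_fast() returns 3 whenever two
+	 * tuples agree on both key attributes.  Reaching this point means the
+	 * interval's own extremes already compared that way; the checks below
+	 * repeat the test with the whole page's extremes to decide between
+	 * the many duplicates and single value strategies.)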
+ */ + leftmost = _bt_split_lastleft(state, leftpage); + rightmost = _bt_split_firstright(state, rightpage); + + /* + * If page (including new item) has many duplicates but is not entirely + * full of duplicates, a many duplicates strategy split will be performed. + * If page is entirely full of duplicates, a single value strategy split + * will be performed. + */ + perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); + if (perfectpenalty <= indnkeyatts) + { + *strategy = SPLIT_MANY_DUPLICATES; + + /* + * Many duplicates strategy should split at either side the group of + * duplicates that enclose the delta-optimal split point. Return + * indnkeyatts rather than the true perfect penalty to make that + * happen. (If perfectpenalty was returned here then low cardinality + * composite indexes could have continual unbalanced splits.) + * + * Note that caller won't go through with a many duplicates split in + * rare cases where it looks like there are ever-decreasing insertions + * to the immediate right of the split point. This must happen just + * before a final decision is made, within _bt_bestsplitloc(). + */ + return indnkeyatts; + } + + /* + * Single value strategy is only appropriate with ever-increasing heap + * TIDs; otherwise, original default strategy split should proceed to + * avoid pathological performance. Use page high key to infer if this is + * the rightmost page among pages that store the same duplicate value. + * This should not prevent insertions of heap TIDs that are slightly out + * of order from using single value strategy, since that's expected with + * concurrent inserters of the same duplicate value. + */ + else if (state->is_rightmost) + *strategy = SPLIT_SINGLE_VALUE; + else + { + ItemId itemid; + IndexTuple hikey; + + itemid = PageGetItemId(state->origpage, P_HIKEY); + hikey = (IndexTuple) PageGetItem(state->origpage, itemid); + perfectpenalty = _bt_keep_natts_fast(state->rel, hikey, + state->newitem); + if (perfectpenalty <= indnkeyatts) + *strategy = SPLIT_SINGLE_VALUE; + else + { + /* + * Have caller finish split using default strategy, since page + * does not appear to be the rightmost page for duplicates of the + * value the page is filled with + */ + } + } + + return perfectpenalty; +} + +/* + * Subroutine to locate leftmost and rightmost splits for current/default + * split interval. Note that it will be the same split iff there is only one + * split in interval. 
+ */ +static void +_bt_interval_edges(FindSplitData *state, SplitPoint **leftinterval, + SplitPoint **rightinterval) +{ + int highsplit = Min(state->interval, state->nsplits); + SplitPoint *deltaoptimal; + + deltaoptimal = state->splits; + *leftinterval = NULL; + *rightinterval = NULL; + + /* + * Delta is an absolute distance to optimal split point, so both the + * leftmost and rightmost split point will usually be at the end of the + * array + */ + for (int i = highsplit - 1; i >= 0; i--) + { + SplitPoint *distant = state->splits + i; + + if (distant->firstrightoff < deltaoptimal->firstrightoff) + { + if (*leftinterval == NULL) + *leftinterval = distant; + } + else if (distant->firstrightoff > deltaoptimal->firstrightoff) + { + if (*rightinterval == NULL) + *rightinterval = distant; + } + else if (!distant->newitemonleft && deltaoptimal->newitemonleft) + { + /* + * "incoming tuple will become firstright" (distant) is to the + * left of "incoming tuple will become lastleft" (delta-optimal) + */ + Assert(distant->firstrightoff == state->newitemoff); + if (*leftinterval == NULL) + *leftinterval = distant; + } + else if (distant->newitemonleft && !deltaoptimal->newitemonleft) + { + /* + * "incoming tuple will become lastleft" (distant) is to the right + * of "incoming tuple will become firstright" (delta-optimal) + */ + Assert(distant->firstrightoff == state->newitemoff); + if (*rightinterval == NULL) + *rightinterval = distant; + } + else + { + /* There was only one or two splits in initial split interval */ + Assert(distant == deltaoptimal); + if (*leftinterval == NULL) + *leftinterval = distant; + if (*rightinterval == NULL) + *rightinterval = distant; + } + + if (*leftinterval && *rightinterval) + return; + } + + Assert(false); +} + +/* + * Subroutine to find penalty for caller's candidate split point. + * + * On leaf pages, penalty is the attribute number that distinguishes each side + * of a split. It's the last attribute that needs to be included in new high + * key for left page. It can be greater than the number of key attributes in + * cases where a heap TID will need to be appended during truncation. + * + * On internal pages, penalty is simply the size of the firstright tuple for + * the split (including line pointer overhead). This tuple will become the + * new high key for the left page. 
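+ *
+ * (Illustrative note added for this listing: a leaf-level example for an
+ * index on (a, b, c) -- a split with lastleft (5, 7, 'x') and firstright
+ * (5, 8, 'a') has penalty 2, since attribute b already tells the halves
+ * apart and suffix truncation can drop c from the new high key.  Were the
+ * two tuples equal on all three attributes, the penalty would be 4,
+ * i.e. greater than the number of key columns, signalling that a heap TID
+ * would have to be appended.)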
+ */ +static inline int +_bt_split_penalty(FindSplitData *state, SplitPoint *split) +{ + IndexTuple lastleft; + IndexTuple firstright; + + if (!state->is_leaf) + { + ItemId itemid; + + if (!split->newitemonleft && + split->firstrightoff == state->newitemoff) + return state->newitemsz; + + itemid = PageGetItemId(state->origpage, split->firstrightoff); + + return MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData); + } + + lastleft = _bt_split_lastleft(state, split); + firstright = _bt_split_firstright(state, split); + + return _bt_keep_natts_fast(state->rel, lastleft, firstright); +} + +/* + * Subroutine to get a lastleft IndexTuple for a split point + */ +static inline IndexTuple +_bt_split_lastleft(FindSplitData *state, SplitPoint *split) +{ + ItemId itemid; + + if (split->newitemonleft && split->firstrightoff == state->newitemoff) + return state->newitem; + + itemid = PageGetItemId(state->origpage, + OffsetNumberPrev(split->firstrightoff)); + return (IndexTuple) PageGetItem(state->origpage, itemid); +} + +/* + * Subroutine to get a firstright IndexTuple for a split point + */ +static inline IndexTuple +_bt_split_firstright(FindSplitData *state, SplitPoint *split) +{ + ItemId itemid; + + if (!split->newitemonleft && split->firstrightoff == state->newitemoff) + return state->newitem; + + itemid = PageGetItemId(state->origpage, split->firstrightoff); + return (IndexTuple) PageGetItem(state->origpage, itemid); +} diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c new file mode 100644 index 0000000..d524310 --- /dev/null +++ b/src/backend/access/nbtree/nbtutils.c @@ -0,0 +1,2751 @@ +/*------------------------------------------------------------------------- + * + * nbtutils.c + * Utility code for Postgres btree implementation. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtutils.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <time.h> + +#include "access/nbtree.h" +#include "access/reloptions.h" +#include "access/relscan.h" +#include "catalog/catalog.h" +#include "commands/progress.h" +#include "lib/qunique.h" +#include "miscadmin.h" +#include "utils/array.h" +#include "utils/datum.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/rel.h" + + +typedef struct BTSortArrayContext +{ + FmgrInfo flinfo; + Oid collation; + bool reverse; +} BTSortArrayContext; + +static Datum _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey, + StrategyNumber strat, + Datum *elems, int nelems); +static int _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey, + bool reverse, + Datum *elems, int nelems); +static int _bt_compare_array_elements(const void *a, const void *b, void *arg); +static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, + ScanKey leftarg, ScanKey rightarg, + bool *result); +static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption); +static void _bt_mark_scankey_required(ScanKey skey); +static bool _bt_check_rowcompare(ScanKey skey, + IndexTuple tuple, int tupnatts, TupleDesc tupdesc, + ScanDirection dir, bool *continuescan); +static int _bt_keep_natts(Relation rel, IndexTuple lastleft, + IndexTuple firstright, BTScanInsert itup_key); + + +/* + * _bt_mkscankey + * Build an insertion scan key that contains comparison data from itup + * as well as comparator routines appropriate to the key datatypes. + * + * When itup is a non-pivot tuple, the returned insertion scan key is + * suitable for finding a place for it to go on the leaf level. Pivot + * tuples can be used to re-find leaf page with matching high key, but + * then caller needs to set scan key's pivotsearch field to true. This + * allows caller to search for a leaf page with a matching high key, + * which is usually to the left of the first leaf page a non-pivot match + * might appear on. + * + * The result is intended for use with _bt_compare() and _bt_truncate(). + * Callers that don't need to fill out the insertion scankey arguments + * (e.g. they use an ad-hoc comparison routine, or only need a scankey + * for _bt_truncate()) can pass a NULL index tuple. The scankey will + * be initialized as if an "all truncated" pivot tuple was passed + * instead. + * + * Note that we may occasionally have to share lock the metapage to + * determine whether or not the keys in the index are expected to be + * unique (i.e. if this is a "heapkeyspace" index). We assume a + * heapkeyspace index when caller passes a NULL tuple, allowing index + * build callers to avoid accessing the non-existent metapage. We + * also assume that the index is _not_ allequalimage when a NULL tuple + * is passed; CREATE INDEX callers call _bt_allequalimage() to set the + * field themselves. + */ +BTScanInsert +_bt_mkscankey(Relation rel, IndexTuple itup) +{ + BTScanInsert key; + ScanKey skey; + TupleDesc itupdesc; + int indnkeyatts; + int16 *indoption; + int tupnatts; + int i; + + itupdesc = RelationGetDescr(rel); + indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + indoption = rel->rd_indoption; + tupnatts = itup ? 
BTreeTupleGetNAtts(itup, rel) : 0; + + Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel)); + + /* + * We'll execute search using scan key constructed on key columns. + * Truncated attributes and non-key attributes are omitted from the final + * scan key. + */ + key = palloc(offsetof(BTScanInsertData, scankeys) + + sizeof(ScanKeyData) * indnkeyatts); + if (itup) + _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage); + else + { + /* Utility statement callers can set these fields themselves */ + key->heapkeyspace = true; + key->allequalimage = false; + } + key->anynullkeys = false; /* initial assumption */ + key->nextkey = false; + key->pivotsearch = false; + key->keysz = Min(indnkeyatts, tupnatts); + key->scantid = key->heapkeyspace && itup ? + BTreeTupleGetHeapTID(itup) : NULL; + skey = key->scankeys; + for (i = 0; i < indnkeyatts; i++) + { + FmgrInfo *procinfo; + Datum arg; + bool null; + int flags; + + /* + * We can use the cached (default) support procs since no cross-type + * comparison can be needed. + */ + procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC); + + /* + * Key arguments built from truncated attributes (or when caller + * provides no tuple) are defensively represented as NULL values. They + * should never be used. + */ + if (i < tupnatts) + arg = index_getattr(itup, i + 1, itupdesc, &null); + else + { + arg = (Datum) 0; + null = true; + } + flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT); + ScanKeyEntryInitializeWithInfo(&skey[i], + flags, + (AttrNumber) (i + 1), + InvalidStrategy, + InvalidOid, + rel->rd_indcollation[i], + procinfo, + arg); + /* Record if any key attribute is NULL (or truncated) */ + if (null) + key->anynullkeys = true; + } + + return key; +} + +/* + * free a retracement stack made by _bt_search. + */ +void +_bt_freestack(BTStack stack) +{ + BTStack ostack; + + while (stack != NULL) + { + ostack = stack; + stack = stack->bts_parent; + pfree(ostack); + } +} + + +/* + * _bt_preprocess_array_keys() -- Preprocess SK_SEARCHARRAY scan keys + * + * If there are any SK_SEARCHARRAY scan keys, deconstruct the array(s) and + * set up BTArrayKeyInfo info for each one that is an equality-type key. + * Prepare modified scan keys in so->arrayKeyData, which will hold the current + * array elements during each primitive indexscan operation. For inequality + * array keys, it's sufficient to find the extreme element value and replace + * the whole array with that scalar value. + * + * Note: the reason we need so->arrayKeyData, rather than just scribbling + * on scan->keyData, is that callers are permitted to call btrescan without + * supplying a new set of scankey data. + */ +void +_bt_preprocess_array_keys(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + int numberOfKeys = scan->numberOfKeys; + int16 *indoption = scan->indexRelation->rd_indoption; + int numArrayKeys; + ScanKey cur; + int i; + MemoryContext oldContext; + + /* Quick check to see if there are any array keys */ + numArrayKeys = 0; + for (i = 0; i < numberOfKeys; i++) + { + cur = &scan->keyData[i]; + if (cur->sk_flags & SK_SEARCHARRAY) + { + numArrayKeys++; + Assert(!(cur->sk_flags & (SK_ROW_HEADER | SK_SEARCHNULL | SK_SEARCHNOTNULL))); + /* If any arrays are null as a whole, we can quit right now. */ + if (cur->sk_flags & SK_ISNULL) + { + so->numArrayKeys = -1; + so->arrayKeyData = NULL; + return; + } + } + } + + /* Quit if nothing to do. 
*/ + if (numArrayKeys == 0) + { + so->numArrayKeys = 0; + so->arrayKeyData = NULL; + return; + } + + /* + * Make a scan-lifespan context to hold array-associated data, or reset it + * if we already have one from a previous rescan cycle. + */ + if (so->arrayContext == NULL) + so->arrayContext = AllocSetContextCreate(CurrentMemoryContext, + "BTree array context", + ALLOCSET_SMALL_SIZES); + else + MemoryContextReset(so->arrayContext); + + oldContext = MemoryContextSwitchTo(so->arrayContext); + + /* Create modifiable copy of scan->keyData in the workspace context */ + so->arrayKeyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData)); + memcpy(so->arrayKeyData, + scan->keyData, + scan->numberOfKeys * sizeof(ScanKeyData)); + + /* Allocate space for per-array data in the workspace context */ + so->arrayKeys = (BTArrayKeyInfo *) palloc0(numArrayKeys * sizeof(BTArrayKeyInfo)); + + /* Now process each array key */ + numArrayKeys = 0; + for (i = 0; i < numberOfKeys; i++) + { + ArrayType *arrayval; + int16 elmlen; + bool elmbyval; + char elmalign; + int num_elems; + Datum *elem_values; + bool *elem_nulls; + int num_nonnulls; + int j; + + cur = &so->arrayKeyData[i]; + if (!(cur->sk_flags & SK_SEARCHARRAY)) + continue; + + /* + * First, deconstruct the array into elements. Anything allocated + * here (including a possibly detoasted array value) is in the + * workspace context. + */ + arrayval = DatumGetArrayTypeP(cur->sk_argument); + /* We could cache this data, but not clear it's worth it */ + get_typlenbyvalalign(ARR_ELEMTYPE(arrayval), + &elmlen, &elmbyval, &elmalign); + deconstruct_array(arrayval, + ARR_ELEMTYPE(arrayval), + elmlen, elmbyval, elmalign, + &elem_values, &elem_nulls, &num_elems); + + /* + * Compress out any null elements. We can ignore them since we assume + * all btree operators are strict. + */ + num_nonnulls = 0; + for (j = 0; j < num_elems; j++) + { + if (!elem_nulls[j]) + elem_values[num_nonnulls++] = elem_values[j]; + } + + /* We could pfree(elem_nulls) now, but not worth the cycles */ + + /* If there's no non-nulls, the scan qual is unsatisfiable */ + if (num_nonnulls == 0) + { + numArrayKeys = -1; + break; + } + + /* + * If the comparison operator is not equality, then the array qual + * degenerates to a simple comparison against the smallest or largest + * non-null array element, as appropriate. + */ + switch (cur->sk_strategy) + { + case BTLessStrategyNumber: + case BTLessEqualStrategyNumber: + cur->sk_argument = + _bt_find_extreme_element(scan, cur, + BTGreaterStrategyNumber, + elem_values, num_nonnulls); + continue; + case BTEqualStrategyNumber: + /* proceed with rest of loop */ + break; + case BTGreaterEqualStrategyNumber: + case BTGreaterStrategyNumber: + cur->sk_argument = + _bt_find_extreme_element(scan, cur, + BTLessStrategyNumber, + elem_values, num_nonnulls); + continue; + default: + elog(ERROR, "unrecognized StrategyNumber: %d", + (int) cur->sk_strategy); + break; + } + + /* + * Sort the non-null elements and eliminate any duplicates. We must + * sort in the same ordering used by the index column, so that the + * successive primitive indexscans produce data in index order. + */ + num_elems = _bt_sort_array_elements(scan, cur, + (indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0, + elem_values, num_nonnulls); + + /* + * And set up the BTArrayKeyInfo data. 
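+		 *
+		 * (Illustrative note added for this listing: a qual written as
+		 * "x = ANY ('{5,3,5,1}')" on an ascending column arrives here
+		 * with elem_values sorted and de-duplicated to {1, 3, 5} and
+		 * num_elems set to 3, while something like "x < ANY ('{5,3,1}')"
+		 * never reaches this point because it was collapsed above into
+		 * the scalar condition x < 5.)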
+ */ + so->arrayKeys[numArrayKeys].scan_key = i; + so->arrayKeys[numArrayKeys].num_elems = num_elems; + so->arrayKeys[numArrayKeys].elem_values = elem_values; + numArrayKeys++; + } + + so->numArrayKeys = numArrayKeys; + + MemoryContextSwitchTo(oldContext); +} + +/* + * _bt_find_extreme_element() -- get least or greatest array element + * + * scan and skey identify the index column, whose opfamily determines the + * comparison semantics. strat should be BTLessStrategyNumber to get the + * least element, or BTGreaterStrategyNumber to get the greatest. + */ +static Datum +_bt_find_extreme_element(IndexScanDesc scan, ScanKey skey, + StrategyNumber strat, + Datum *elems, int nelems) +{ + Relation rel = scan->indexRelation; + Oid elemtype, + cmp_op; + RegProcedure cmp_proc; + FmgrInfo flinfo; + Datum result; + int i; + + /* + * Determine the nominal datatype of the array elements. We have to + * support the convention that sk_subtype == InvalidOid means the opclass + * input type; this is a hack to simplify life for ScanKeyInit(). + */ + elemtype = skey->sk_subtype; + if (elemtype == InvalidOid) + elemtype = rel->rd_opcintype[skey->sk_attno - 1]; + + /* + * Look up the appropriate comparison operator in the opfamily. + * + * Note: it's possible that this would fail, if the opfamily is + * incomplete, but it seems quite unlikely that an opfamily would omit + * non-cross-type comparison operators for any datatype that it supports + * at all. + */ + cmp_op = get_opfamily_member(rel->rd_opfamily[skey->sk_attno - 1], + elemtype, + elemtype, + strat); + if (!OidIsValid(cmp_op)) + elog(ERROR, "missing operator %d(%u,%u) in opfamily %u", + strat, elemtype, elemtype, + rel->rd_opfamily[skey->sk_attno - 1]); + cmp_proc = get_opcode(cmp_op); + if (!RegProcedureIsValid(cmp_proc)) + elog(ERROR, "missing oprcode for operator %u", cmp_op); + + fmgr_info(cmp_proc, &flinfo); + + Assert(nelems > 0); + result = elems[0]; + for (i = 1; i < nelems; i++) + { + if (DatumGetBool(FunctionCall2Coll(&flinfo, + skey->sk_collation, + elems[i], + result))) + result = elems[i]; + } + + return result; +} + +/* + * _bt_sort_array_elements() -- sort and de-dup array elements + * + * The array elements are sorted in-place, and the new number of elements + * after duplicate removal is returned. + * + * scan and skey identify the index column, whose opfamily determines the + * comparison semantics. If reverse is true, we sort in descending order. + */ +static int +_bt_sort_array_elements(IndexScanDesc scan, ScanKey skey, + bool reverse, + Datum *elems, int nelems) +{ + Relation rel = scan->indexRelation; + Oid elemtype; + RegProcedure cmp_proc; + BTSortArrayContext cxt; + + if (nelems <= 1) + return nelems; /* no work to do */ + + /* + * Determine the nominal datatype of the array elements. We have to + * support the convention that sk_subtype == InvalidOid means the opclass + * input type; this is a hack to simplify life for ScanKeyInit(). + */ + elemtype = skey->sk_subtype; + if (elemtype == InvalidOid) + elemtype = rel->rd_opcintype[skey->sk_attno - 1]; + + /* + * Look up the appropriate comparison function in the opfamily. + * + * Note: it's possible that this would fail, if the opfamily is + * incomplete, but it seems quite unlikely that an opfamily would omit + * non-cross-type support functions for any datatype that it supports at + * all. 
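+	 *
+	 * (Illustrative note added for this listing: for an int4 column the
+	 * lookup below ordinarily resolves to btint4cmp, support function 1
+	 * of the integer_ops opfamily.)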
+ */ + cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1], + elemtype, + elemtype, + BTORDER_PROC); + if (!RegProcedureIsValid(cmp_proc)) + elog(ERROR, "missing support function %d(%u,%u) in opfamily %u", + BTORDER_PROC, elemtype, elemtype, + rel->rd_opfamily[skey->sk_attno - 1]); + + /* Sort the array elements */ + fmgr_info(cmp_proc, &cxt.flinfo); + cxt.collation = skey->sk_collation; + cxt.reverse = reverse; + qsort_arg((void *) elems, nelems, sizeof(Datum), + _bt_compare_array_elements, (void *) &cxt); + + /* Now scan the sorted elements and remove duplicates */ + return qunique_arg(elems, nelems, sizeof(Datum), + _bt_compare_array_elements, &cxt); +} + +/* + * qsort_arg comparator for sorting array elements + */ +static int +_bt_compare_array_elements(const void *a, const void *b, void *arg) +{ + Datum da = *((const Datum *) a); + Datum db = *((const Datum *) b); + BTSortArrayContext *cxt = (BTSortArrayContext *) arg; + int32 compare; + + compare = DatumGetInt32(FunctionCall2Coll(&cxt->flinfo, + cxt->collation, + da, db)); + if (cxt->reverse) + INVERT_COMPARE_RESULT(compare); + return compare; +} + +/* + * _bt_start_array_keys() -- Initialize array keys at start of a scan + * + * Set up the cur_elem counters and fill in the first sk_argument value for + * each array scankey. We can't do this until we know the scan direction. + */ +void +_bt_start_array_keys(IndexScanDesc scan, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + int i; + + for (i = 0; i < so->numArrayKeys; i++) + { + BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; + ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key]; + + Assert(curArrayKey->num_elems > 0); + if (ScanDirectionIsBackward(dir)) + curArrayKey->cur_elem = curArrayKey->num_elems - 1; + else + curArrayKey->cur_elem = 0; + skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem]; + } +} + +/* + * _bt_advance_array_keys() -- Advance to next set of array elements + * + * Returns true if there is another set of values to consider, false if not. + * On true result, the scankeys are initialized with the next set of values. + */ +bool +_bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + bool found = false; + int i; + + /* + * We must advance the last array key most quickly, since it will + * correspond to the lowest-order index column among the available + * qualifications. This is necessary to ensure correct ordering of output + * when there are multiple array keys. + */ + for (i = so->numArrayKeys - 1; i >= 0; i--) + { + BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; + ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key]; + int cur_elem = curArrayKey->cur_elem; + int num_elems = curArrayKey->num_elems; + + if (ScanDirectionIsBackward(dir)) + { + if (--cur_elem < 0) + { + cur_elem = num_elems - 1; + found = false; /* need to advance next array key */ + } + else + found = true; + } + else + { + if (++cur_elem >= num_elems) + { + cur_elem = 0; + found = false; /* need to advance next array key */ + } + else + found = true; + } + + curArrayKey->cur_elem = cur_elem; + skey->sk_argument = curArrayKey->elem_values[cur_elem]; + if (found) + break; + } + + /* advance parallel scan */ + if (scan->parallel_scan != NULL) + _bt_parallel_advance_array_keys(scan); + + return found; +} + +/* + * _bt_mark_array_keys() -- Handle array keys during btmarkpos + * + * Save the current state of the array keys as the "mark" position. 
+ */ +void +_bt_mark_array_keys(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + int i; + + for (i = 0; i < so->numArrayKeys; i++) + { + BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; + + curArrayKey->mark_elem = curArrayKey->cur_elem; + } +} + +/* + * _bt_restore_array_keys() -- Handle array keys during btrestrpos + * + * Restore the array keys to where they were when the mark was set. + */ +void +_bt_restore_array_keys(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + bool changed = false; + int i; + + /* Restore each array key to its position when the mark was set */ + for (i = 0; i < so->numArrayKeys; i++) + { + BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; + ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key]; + int mark_elem = curArrayKey->mark_elem; + + if (curArrayKey->cur_elem != mark_elem) + { + curArrayKey->cur_elem = mark_elem; + skey->sk_argument = curArrayKey->elem_values[mark_elem]; + changed = true; + } + } + + /* + * If we changed any keys, we must redo _bt_preprocess_keys. That might + * sound like overkill, but in cases with multiple keys per index column + * it seems necessary to do the full set of pushups. + */ + if (changed) + { + _bt_preprocess_keys(scan); + /* The mark should have been set on a consistent set of keys... */ + Assert(so->qual_ok); + } +} + + +/* + * _bt_preprocess_keys() -- Preprocess scan keys + * + * The given search-type keys (in scan->keyData[] or so->arrayKeyData[]) + * are copied to so->keyData[] with possible transformation. + * scan->numberOfKeys is the number of input keys, so->numberOfKeys gets + * the number of output keys (possibly less, never greater). + * + * The output keys are marked with additional sk_flags bits beyond the + * system-standard bits supplied by the caller. The DESC and NULLS_FIRST + * indoption bits for the relevant index attribute are copied into the flags. + * Also, for a DESC column, we commute (flip) all the sk_strategy numbers + * so that the index sorts in the desired direction. + * + * One key purpose of this routine is to discover which scan keys must be + * satisfied to continue the scan. It also attempts to eliminate redundant + * keys and detect contradictory keys. (If the index opfamily provides + * incomplete sets of cross-type operators, we may fail to detect redundant + * or contradictory keys, but we can survive that.) + * + * The output keys must be sorted by index attribute. Presently we expect + * (but verify) that the input keys are already so sorted --- this is done + * by match_clauses_to_index() in indxpath.c. Some reordering of the keys + * within each attribute may be done as a byproduct of the processing here, + * but no other code depends on that. + * + * The output keys are marked with flags SK_BT_REQFWD and/or SK_BT_REQBKWD + * if they must be satisfied in order to continue the scan forward or backward + * respectively. _bt_checkkeys uses these flags. For example, if the quals + * are "x = 1 AND y < 4 AND z < 5", then _bt_checkkeys will reject a tuple + * (1,2,7), but we must continue the scan in case there are tuples (1,3,z). + * But once we reach tuples like (1,4,z) we can stop scanning because no + * later tuples could match. This is reflected by marking the x and y keys, + * but not the z key, with SK_BT_REQFWD. In general, the keys for leading + * attributes with "=" keys are marked both SK_BT_REQFWD and SK_BT_REQBKWD. 
+ * For the first attribute without an "=" key, any "<" and "<=" keys are + * marked SK_BT_REQFWD while any ">" and ">=" keys are marked SK_BT_REQBKWD. + * This can be seen to be correct by considering the above example. Note + * in particular that if there are no keys for a given attribute, the keys for + * subsequent attributes can never be required; for instance "WHERE y = 4" + * requires a full-index scan. + * + * If possible, redundant keys are eliminated: we keep only the tightest + * >/>= bound and the tightest </<= bound, and if there's an = key then + * that's the only one returned. (So, we return either a single = key, + * or one or two boundary-condition keys for each attr.) However, if we + * cannot compare two keys for lack of a suitable cross-type operator, + * we cannot eliminate either. If there are two such keys of the same + * operator strategy, the second one is just pushed into the output array + * without further processing here. We may also emit both >/>= or both + * </<= keys if we can't compare them. The logic about required keys still + * works if we don't eliminate redundant keys. + * + * Note that one reason we need direction-sensitive required-key flags is + * precisely that we may not be able to eliminate redundant keys. Suppose + * we have "x > 4::int AND x > 10::bigint", and we are unable to determine + * which key is more restrictive for lack of a suitable cross-type operator. + * _bt_first will arbitrarily pick one of the keys to do the initial + * positioning with. If it picks x > 4, then the x > 10 condition will fail + * until we reach index entries > 10; but we can't stop the scan just because + * x > 10 is failing. On the other hand, if we are scanning backwards, then + * failure of either key is indeed enough to stop the scan. (In general, when + * inequality keys are present, the initial-positioning code only promises to + * position before the first possible match, not exactly at the first match, + * for a forward scan; or after the last match for a backward scan.) + * + * As a byproduct of this work, we can detect contradictory quals such + * as "x = 1 AND x > 2". If we see that, we return so->qual_ok = false, + * indicating the scan need not be run at all since no tuples can match. + * (In this case we do not bother completing the output key array!) + * Again, missing cross-type operators might cause us to fail to prove the + * quals contradictory when they really are, but the scan will work correctly. + * + * Row comparison keys are currently also treated without any smarts: + * we just transfer them into the preprocessed array without any + * editorialization. We can treat them the same as an ordinary inequality + * comparison on the row's first index column, for the purposes of the logic + * about required keys. + * + * Note: the reason we have to copy the preprocessed scan keys into private + * storage is that we are modifying the array based on comparisons of the + * key argument values, which could change on a rescan or after moving to + * new elements of array keys. Therefore we can't overwrite the source data. 
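+ *
+ * (Illustrative note added for this listing -- a compact example of the
+ * overall transformation: given "x = 5 AND y > 3 AND y > 7 AND y < 100"
+ * on an index over (x, y), the output array normally ends up as
+ *
+ *		x = 5			marked SK_BT_REQFWD | SK_BT_REQBKWD
+ *		y > 7			marked SK_BT_REQBKWD
+ *		y < 100			marked SK_BT_REQFWD
+ *
+ * with the redundant "y > 3" eliminated, assuming the opfamily can compare
+ * the two ">" constants.)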
+ */ +void +_bt_preprocess_keys(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + int numberOfKeys = scan->numberOfKeys; + int16 *indoption = scan->indexRelation->rd_indoption; + int new_numberOfKeys; + int numberOfEqualCols; + ScanKey inkeys; + ScanKey outkeys; + ScanKey cur; + ScanKey xform[BTMaxStrategyNumber]; + bool test_result; + int i, + j; + AttrNumber attno; + + /* initialize result variables */ + so->qual_ok = true; + so->numberOfKeys = 0; + + if (numberOfKeys < 1) + return; /* done if qual-less scan */ + + /* + * Read so->arrayKeyData if array keys are present, else scan->keyData + */ + if (so->arrayKeyData != NULL) + inkeys = so->arrayKeyData; + else + inkeys = scan->keyData; + + outkeys = so->keyData; + cur = &inkeys[0]; + /* we check that input keys are correctly ordered */ + if (cur->sk_attno < 1) + elog(ERROR, "btree index keys must be ordered by attribute"); + + /* We can short-circuit most of the work if there's just one key */ + if (numberOfKeys == 1) + { + /* Apply indoption to scankey (might change sk_strategy!) */ + if (!_bt_fix_scankey_strategy(cur, indoption)) + so->qual_ok = false; + memcpy(outkeys, cur, sizeof(ScanKeyData)); + so->numberOfKeys = 1; + /* We can mark the qual as required if it's for first index col */ + if (cur->sk_attno == 1) + _bt_mark_scankey_required(outkeys); + return; + } + + /* + * Otherwise, do the full set of pushups. + */ + new_numberOfKeys = 0; + numberOfEqualCols = 0; + + /* + * Initialize for processing of keys for attr 1. + * + * xform[i] points to the currently best scan key of strategy type i+1; it + * is NULL if we haven't yet found such a key for this attr. + */ + attno = 1; + memset(xform, 0, sizeof(xform)); + + /* + * Loop iterates from 0 to numberOfKeys inclusive; we use the last pass to + * handle after-last-key processing. Actual exit from the loop is at the + * "break" statement below. + */ + for (i = 0;; cur++, i++) + { + if (i < numberOfKeys) + { + /* Apply indoption to scankey (might change sk_strategy!) */ + if (!_bt_fix_scankey_strategy(cur, indoption)) + { + /* NULL can't be matched, so give up */ + so->qual_ok = false; + return; + } + } + + /* + * If we are at the end of the keys for a particular attr, finish up + * processing and emit the cleaned-up keys. + */ + if (i == numberOfKeys || cur->sk_attno != attno) + { + int priorNumberOfEqualCols = numberOfEqualCols; + + /* check input keys are correctly ordered */ + if (i < numberOfKeys && cur->sk_attno < attno) + elog(ERROR, "btree index keys must be ordered by attribute"); + + /* + * If = has been specified, all other keys can be eliminated as + * redundant. If we have a case like key = 1 AND key > 2, we can + * set qual_ok to false and abandon further processing. + * + * We also have to deal with the case of "key IS NULL", which is + * unsatisfiable in combination with any other index condition. By + * the time we get here, that's been classified as an equality + * check, and we've rejected any combination of it with a regular + * equality condition; but not with other types of conditions. 
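+			 *
+			 * (Illustrative note added for this listing: e.g. a qual of
+			 * "x IS NULL AND x > 5" reaches this point with the IS NULL
+			 * key sitting in the equality slot of xform[]; the
+			 * SK_SEARCHNULL test in the loop below then declares the
+			 * whole qual unsatisfiable by setting qual_ok to false.)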
+ */ + if (xform[BTEqualStrategyNumber - 1]) + { + ScanKey eq = xform[BTEqualStrategyNumber - 1]; + + for (j = BTMaxStrategyNumber; --j >= 0;) + { + ScanKey chk = xform[j]; + + if (!chk || j == (BTEqualStrategyNumber - 1)) + continue; + + if (eq->sk_flags & SK_SEARCHNULL) + { + /* IS NULL is contradictory to anything else */ + so->qual_ok = false; + return; + } + + if (_bt_compare_scankey_args(scan, chk, eq, chk, + &test_result)) + { + if (!test_result) + { + /* keys proven mutually contradictory */ + so->qual_ok = false; + return; + } + /* else discard the redundant non-equality key */ + xform[j] = NULL; + } + /* else, cannot determine redundancy, keep both keys */ + } + /* track number of attrs for which we have "=" keys */ + numberOfEqualCols++; + } + + /* try to keep only one of <, <= */ + if (xform[BTLessStrategyNumber - 1] + && xform[BTLessEqualStrategyNumber - 1]) + { + ScanKey lt = xform[BTLessStrategyNumber - 1]; + ScanKey le = xform[BTLessEqualStrategyNumber - 1]; + + if (_bt_compare_scankey_args(scan, le, lt, le, + &test_result)) + { + if (test_result) + xform[BTLessEqualStrategyNumber - 1] = NULL; + else + xform[BTLessStrategyNumber - 1] = NULL; + } + } + + /* try to keep only one of >, >= */ + if (xform[BTGreaterStrategyNumber - 1] + && xform[BTGreaterEqualStrategyNumber - 1]) + { + ScanKey gt = xform[BTGreaterStrategyNumber - 1]; + ScanKey ge = xform[BTGreaterEqualStrategyNumber - 1]; + + if (_bt_compare_scankey_args(scan, ge, gt, ge, + &test_result)) + { + if (test_result) + xform[BTGreaterEqualStrategyNumber - 1] = NULL; + else + xform[BTGreaterStrategyNumber - 1] = NULL; + } + } + + /* + * Emit the cleaned-up keys into the outkeys[] array, and then + * mark them if they are required. They are required (possibly + * only in one direction) if all attrs before this one had "=". + */ + for (j = BTMaxStrategyNumber; --j >= 0;) + { + if (xform[j]) + { + ScanKey outkey = &outkeys[new_numberOfKeys++]; + + memcpy(outkey, xform[j], sizeof(ScanKeyData)); + if (priorNumberOfEqualCols == attno - 1) + _bt_mark_scankey_required(outkey); + } + } + + /* + * Exit loop here if done. + */ + if (i == numberOfKeys) + break; + + /* Re-initialize for new attno */ + attno = cur->sk_attno; + memset(xform, 0, sizeof(xform)); + } + + /* check strategy this key's operator corresponds to */ + j = cur->sk_strategy - 1; + + /* if row comparison, push it directly to the output array */ + if (cur->sk_flags & SK_ROW_HEADER) + { + ScanKey outkey = &outkeys[new_numberOfKeys++]; + + memcpy(outkey, cur, sizeof(ScanKeyData)); + if (numberOfEqualCols == attno - 1) + _bt_mark_scankey_required(outkey); + + /* + * We don't support RowCompare using equality; such a qual would + * mess up the numberOfEqualCols tracking. + */ + Assert(j != (BTEqualStrategyNumber - 1)); + continue; + } + + /* have we seen one of these before? */ + if (xform[j] == NULL) + { + /* nope, so remember this scankey */ + xform[j] = cur; + } + else + { + /* yup, keep only the more restrictive key */ + if (_bt_compare_scankey_args(scan, cur, cur, xform[j], + &test_result)) + { + if (test_result) + xform[j] = cur; + else if (j == (BTEqualStrategyNumber - 1)) + { + /* key == a && key == b, but a != b */ + so->qual_ok = false; + return; + } + /* else old key is more restrictive, keep it */ + } + else + { + /* + * We can't determine which key is more restrictive. Keep the + * previous one in xform[j] and push this one directly to the + * output array. 
+ */ + ScanKey outkey = &outkeys[new_numberOfKeys++]; + + memcpy(outkey, cur, sizeof(ScanKeyData)); + if (numberOfEqualCols == attno - 1) + _bt_mark_scankey_required(outkey); + } + } + } + + so->numberOfKeys = new_numberOfKeys; +} + +/* + * Compare two scankey values using a specified operator. + * + * The test we want to perform is logically "leftarg op rightarg", where + * leftarg and rightarg are the sk_argument values in those ScanKeys, and + * the comparison operator is the one in the op ScanKey. However, in + * cross-data-type situations we may need to look up the correct operator in + * the index's opfamily: it is the one having amopstrategy = op->sk_strategy + * and amoplefttype/amoprighttype equal to the two argument datatypes. + * + * If the opfamily doesn't supply a complete set of cross-type operators we + * may not be able to make the comparison. If we can make the comparison + * we store the operator result in *result and return true. We return false + * if the comparison could not be made. + * + * Note: op always points at the same ScanKey as either leftarg or rightarg. + * Since we don't scribble on the scankeys, this aliasing should cause no + * trouble. + * + * Note: this routine needs to be insensitive to any DESC option applied + * to the index column. For example, "x < 4" is a tighter constraint than + * "x < 5" regardless of which way the index is sorted. + */ +static bool +_bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, + ScanKey leftarg, ScanKey rightarg, + bool *result) +{ + Relation rel = scan->indexRelation; + Oid lefttype, + righttype, + optype, + opcintype, + cmp_op; + StrategyNumber strat; + + /* + * First, deal with cases where one or both args are NULL. This should + * only happen when the scankeys represent IS NULL/NOT NULL conditions. + */ + if ((leftarg->sk_flags | rightarg->sk_flags) & SK_ISNULL) + { + bool leftnull, + rightnull; + + if (leftarg->sk_flags & SK_ISNULL) + { + Assert(leftarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL)); + leftnull = true; + } + else + leftnull = false; + if (rightarg->sk_flags & SK_ISNULL) + { + Assert(rightarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL)); + rightnull = true; + } + else + rightnull = false; + + /* + * We treat NULL as either greater than or less than all other values. + * Since true > false, the tests below work correctly for NULLS LAST + * logic. If the index is NULLS FIRST, we need to flip the strategy. + */ + strat = op->sk_strategy; + if (op->sk_flags & SK_BT_NULLS_FIRST) + strat = BTCommuteStrategyNumber(strat); + + switch (strat) + { + case BTLessStrategyNumber: + *result = (leftnull < rightnull); + break; + case BTLessEqualStrategyNumber: + *result = (leftnull <= rightnull); + break; + case BTEqualStrategyNumber: + *result = (leftnull == rightnull); + break; + case BTGreaterEqualStrategyNumber: + *result = (leftnull >= rightnull); + break; + case BTGreaterStrategyNumber: + *result = (leftnull > rightnull); + break; + default: + elog(ERROR, "unrecognized StrategyNumber: %d", (int) strat); + *result = false; /* keep compiler quiet */ + break; + } + return true; + } + + /* + * The opfamily we need to worry about is identified by the index column. + */ + Assert(leftarg->sk_attno == rightarg->sk_attno); + + opcintype = rel->rd_opcintype[leftarg->sk_attno - 1]; + + /* + * Determine the actual datatypes of the ScanKey arguments. We have to + * support the convention that sk_subtype == InvalidOid means the opclass + * input type; this is a hack to simplify life for ScanKeyInit(). 
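+	 *
+	 * (Illustrative note added for this listing: e.g. when comparing the
+	 * constants of "x > 4" and "x > 10::int8" on an int4 column, one
+	 * argument's type resolves to int4 via opcintype while the other is
+	 * int8, so the fast path below is skipped and a cross-type
+	 * int4-versus-int8 operator is looked up in the opfamily instead.)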
+ */ + lefttype = leftarg->sk_subtype; + if (lefttype == InvalidOid) + lefttype = opcintype; + righttype = rightarg->sk_subtype; + if (righttype == InvalidOid) + righttype = opcintype; + optype = op->sk_subtype; + if (optype == InvalidOid) + optype = opcintype; + + /* + * If leftarg and rightarg match the types expected for the "op" scankey, + * we can use its already-looked-up comparison function. + */ + if (lefttype == opcintype && righttype == optype) + { + *result = DatumGetBool(FunctionCall2Coll(&op->sk_func, + op->sk_collation, + leftarg->sk_argument, + rightarg->sk_argument)); + return true; + } + + /* + * Otherwise, we need to go to the syscache to find the appropriate + * operator. (This cannot result in infinite recursion, since no + * indexscan initiated by syscache lookup will use cross-data-type + * operators.) + * + * If the sk_strategy was flipped by _bt_fix_scankey_strategy, we have to + * un-flip it to get the correct opfamily member. + */ + strat = op->sk_strategy; + if (op->sk_flags & SK_BT_DESC) + strat = BTCommuteStrategyNumber(strat); + + cmp_op = get_opfamily_member(rel->rd_opfamily[leftarg->sk_attno - 1], + lefttype, + righttype, + strat); + if (OidIsValid(cmp_op)) + { + RegProcedure cmp_proc = get_opcode(cmp_op); + + if (RegProcedureIsValid(cmp_proc)) + { + *result = DatumGetBool(OidFunctionCall2Coll(cmp_proc, + op->sk_collation, + leftarg->sk_argument, + rightarg->sk_argument)); + return true; + } + } + + /* Can't make the comparison */ + *result = false; /* suppress compiler warnings */ + return false; +} + +/* + * Adjust a scankey's strategy and flags setting as needed for indoptions. + * + * We copy the appropriate indoption value into the scankey sk_flags + * (shifting to avoid clobbering system-defined flag bits). Also, if + * the DESC option is set, commute (flip) the operator strategy number. + * + * A secondary purpose is to check for IS NULL/NOT NULL scankeys and set up + * the strategy field correctly for them. + * + * Lastly, for ordinary scankeys (not IS NULL/NOT NULL), we check for a + * NULL comparison value. Since all btree operators are assumed strict, + * a NULL means that the qual cannot be satisfied. We return true if the + * comparison value isn't NULL, or false if the scan should be abandoned. + * + * This function is applied to the *input* scankey structure; therefore + * on a rescan we will be looking at already-processed scankeys. Hence + * we have to be careful not to re-commute the strategy if we already did it. + * It's a bit ugly to modify the caller's copy of the scankey but in practice + * there shouldn't be any problem, since the index's indoptions are certainly + * not going to change while the scankey survives. + */ +static bool +_bt_fix_scankey_strategy(ScanKey skey, int16 *indoption) +{ + int addflags; + + addflags = indoption[skey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT; + + /* + * We treat all btree operators as strict (even if they're not so marked + * in pg_proc). This means that it is impossible for an operator condition + * with a NULL comparison constant to succeed, and we can reject it right + * away. + * + * However, we now also support "x IS NULL" clauses as search conditions, + * so in that case keep going. The planner has not filled in any + * particular strategy in this case, so set it to BTEqualStrategyNumber + * --- we can treat IS NULL as an equality operator for purposes of search + * strategy. + * + * Likewise, "x IS NOT NULL" is supported. 
We treat that as either "less + * than NULL" in a NULLS LAST index, or "greater than NULL" in a NULLS + * FIRST index. + * + * Note: someday we might have to fill in sk_collation from the index + * column's collation. At the moment this is a non-issue because we'll + * never actually call the comparison operator on a NULL. + */ + if (skey->sk_flags & SK_ISNULL) + { + /* SK_ISNULL shouldn't be set in a row header scankey */ + Assert(!(skey->sk_flags & SK_ROW_HEADER)); + + /* Set indoption flags in scankey (might be done already) */ + skey->sk_flags |= addflags; + + /* Set correct strategy for IS NULL or NOT NULL search */ + if (skey->sk_flags & SK_SEARCHNULL) + { + skey->sk_strategy = BTEqualStrategyNumber; + skey->sk_subtype = InvalidOid; + skey->sk_collation = InvalidOid; + } + else if (skey->sk_flags & SK_SEARCHNOTNULL) + { + if (skey->sk_flags & SK_BT_NULLS_FIRST) + skey->sk_strategy = BTGreaterStrategyNumber; + else + skey->sk_strategy = BTLessStrategyNumber; + skey->sk_subtype = InvalidOid; + skey->sk_collation = InvalidOid; + } + else + { + /* regular qual, so it cannot be satisfied */ + return false; + } + + /* Needn't do the rest */ + return true; + } + + /* Adjust strategy for DESC, if we didn't already */ + if ((addflags & SK_BT_DESC) && !(skey->sk_flags & SK_BT_DESC)) + skey->sk_strategy = BTCommuteStrategyNumber(skey->sk_strategy); + skey->sk_flags |= addflags; + + /* If it's a row header, fix row member flags and strategies similarly */ + if (skey->sk_flags & SK_ROW_HEADER) + { + ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument); + + for (;;) + { + Assert(subkey->sk_flags & SK_ROW_MEMBER); + addflags = indoption[subkey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT; + if ((addflags & SK_BT_DESC) && !(subkey->sk_flags & SK_BT_DESC)) + subkey->sk_strategy = BTCommuteStrategyNumber(subkey->sk_strategy); + subkey->sk_flags |= addflags; + if (subkey->sk_flags & SK_ROW_END) + break; + subkey++; + } + } + + return true; +} + +/* + * Mark a scankey as "required to continue the scan". + * + * Depending on the operator type, the key may be required for both scan + * directions or just one. Also, if the key is a row comparison header, + * we have to mark its first subsidiary ScanKey as required. (Subsequent + * subsidiary ScanKeys are normally for lower-order columns, and thus + * cannot be required, since they're after the first non-equality scankey.) + * + * Note: when we set required-key flag bits in a subsidiary scankey, we are + * scribbling on a data structure belonging to the index AM's caller, not on + * our private copy. This should be OK because the marking will not change + * from scan to scan within a query, and so we'd just re-mark the same way + * anyway on a rescan. Something to keep an eye on though. 
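+ *
+ * (Illustrative example: with quals "x > 5 AND x < 10" on the first index
+ * column, the "<" key is marked SK_BT_REQFWD and the ">" key SK_BT_REQBKWD,
+ * so a forward scan can stop at the first tuple with x >= 10 and a backward
+ * scan at the first tuple with x <= 5.)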
+ */ +static void +_bt_mark_scankey_required(ScanKey skey) +{ + int addflags; + + switch (skey->sk_strategy) + { + case BTLessStrategyNumber: + case BTLessEqualStrategyNumber: + addflags = SK_BT_REQFWD; + break; + case BTEqualStrategyNumber: + addflags = SK_BT_REQFWD | SK_BT_REQBKWD; + break; + case BTGreaterEqualStrategyNumber: + case BTGreaterStrategyNumber: + addflags = SK_BT_REQBKWD; + break; + default: + elog(ERROR, "unrecognized StrategyNumber: %d", + (int) skey->sk_strategy); + addflags = 0; /* keep compiler quiet */ + break; + } + + skey->sk_flags |= addflags; + + if (skey->sk_flags & SK_ROW_HEADER) + { + ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument); + + /* First subkey should be same column/operator as the header */ + Assert(subkey->sk_flags & SK_ROW_MEMBER); + Assert(subkey->sk_attno == skey->sk_attno); + Assert(subkey->sk_strategy == skey->sk_strategy); + subkey->sk_flags |= addflags; + } +} + +/* + * Test whether an indextuple satisfies all the scankey conditions. + * + * Return true if so, false if not. If the tuple fails to pass the qual, + * we also determine whether there's any need to continue the scan beyond + * this tuple, and set *continuescan accordingly. See comments for + * _bt_preprocess_keys(), above, about how this is done. + * + * Forward scan callers can pass a high key tuple in the hopes of having + * us set *continuescan to false, and avoiding an unnecessary visit to + * the page to the right. + * + * scan: index scan descriptor (containing a search-type scankey) + * tuple: index tuple to test + * tupnatts: number of attributes in tupnatts (high key may be truncated) + * dir: direction we are scanning in + * continuescan: output parameter (will be set correctly in all cases) + */ +bool +_bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, + ScanDirection dir, bool *continuescan) +{ + TupleDesc tupdesc; + BTScanOpaque so; + int keysz; + int ikey; + ScanKey key; + + Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts); + + *continuescan = true; /* default assumption */ + + tupdesc = RelationGetDescr(scan->indexRelation); + so = (BTScanOpaque) scan->opaque; + keysz = so->numberOfKeys; + + for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++) + { + Datum datum; + bool isNull; + Datum test; + + if (key->sk_attno > tupnatts) + { + /* + * This attribute is truncated (must be high key). The value for + * this attribute in the first non-pivot tuple on the page to the + * right could be any possible value. Assume that truncated + * attribute passes the qual. + */ + Assert(ScanDirectionIsForward(dir)); + Assert(BTreeTupleIsPivot(tuple)); + continue; + } + + /* row-comparison keys need special processing */ + if (key->sk_flags & SK_ROW_HEADER) + { + if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir, + continuescan)) + continue; + return false; + } + + datum = index_getattr(tuple, + key->sk_attno, + tupdesc, + &isNull); + + if (key->sk_flags & SK_ISNULL) + { + /* Handle IS NULL/NOT NULL tests */ + if (key->sk_flags & SK_SEARCHNULL) + { + if (isNull) + continue; /* tuple satisfies this qual */ + } + else + { + Assert(key->sk_flags & SK_SEARCHNOTNULL); + if (!isNull) + continue; /* tuple satisfies this qual */ + } + + /* + * Tuple fails this qual. If it's a required qual for the current + * scan direction, then we can conclude no further tuples will + * pass, either. 
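+ * (Whether this key is required, and for which scan direction, was decided
+ * up front by _bt_preprocess_keys/_bt_mark_scankey_required.)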
+ */ + if ((key->sk_flags & SK_BT_REQFWD) && + ScanDirectionIsForward(dir)) + *continuescan = false; + else if ((key->sk_flags & SK_BT_REQBKWD) && + ScanDirectionIsBackward(dir)) + *continuescan = false; + + /* + * In any case, this indextuple doesn't match the qual. + */ + return false; + } + + if (isNull) + { + if (key->sk_flags & SK_BT_NULLS_FIRST) + { + /* + * Since NULLs are sorted before non-NULLs, we know we have + * reached the lower limit of the range of values for this + * index attr. On a backward scan, we can stop if this qual + * is one of the "must match" subset. We can stop regardless + * of whether the qual is > or <, so long as it's required, + * because it's not possible for any future tuples to pass. On + * a forward scan, however, we must keep going, because we may + * have initially positioned to the start of the index. + */ + if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && + ScanDirectionIsBackward(dir)) + *continuescan = false; + } + else + { + /* + * Since NULLs are sorted after non-NULLs, we know we have + * reached the upper limit of the range of values for this + * index attr. On a forward scan, we can stop if this qual is + * one of the "must match" subset. We can stop regardless of + * whether the qual is > or <, so long as it's required, + * because it's not possible for any future tuples to pass. On + * a backward scan, however, we must keep going, because we + * may have initially positioned to the end of the index. + */ + if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && + ScanDirectionIsForward(dir)) + *continuescan = false; + } + + /* + * In any case, this indextuple doesn't match the qual. + */ + return false; + } + + test = FunctionCall2Coll(&key->sk_func, key->sk_collation, + datum, key->sk_argument); + + if (!DatumGetBool(test)) + { + /* + * Tuple fails this qual. If it's a required qual for the current + * scan direction, then we can conclude no further tuples will + * pass, either. + * + * Note: because we stop the scan as soon as any required equality + * qual fails, it is critical that equality quals be used for the + * initial positioning in _bt_first() when they are available. See + * comments in _bt_first(). + */ + if ((key->sk_flags & SK_BT_REQFWD) && + ScanDirectionIsForward(dir)) + *continuescan = false; + else if ((key->sk_flags & SK_BT_REQBKWD) && + ScanDirectionIsBackward(dir)) + *continuescan = false; + + /* + * In any case, this indextuple doesn't match the qual. + */ + return false; + } + } + + /* If we get here, the tuple passes all index quals. */ + return true; +} + +/* + * Test whether an indextuple satisfies a row-comparison scan condition. + * + * Return true if so, false if not. If not, also clear *continuescan if + * it's not possible for any future tuples in the current scan direction + * to pass the qual. + * + * This is a subroutine for _bt_checkkeys, which see for more info. + */ +static bool +_bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, + TupleDesc tupdesc, ScanDirection dir, bool *continuescan) +{ + ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument); + int32 cmpresult = 0; + bool result; + + /* First subkey should be same as the header says */ + Assert(subkey->sk_attno == skey->sk_attno); + + /* Loop over columns of the row condition */ + for (;;) + { + Datum datum; + bool isNull; + + Assert(subkey->sk_flags & SK_ROW_MEMBER); + + if (subkey->sk_attno > tupnatts) + { + /* + * This attribute is truncated (must be high key). 
The value for + * this attribute in the first non-pivot tuple on the page to the + * right could be any possible value. Assume that truncated + * attribute passes the qual. + */ + Assert(ScanDirectionIsForward(dir)); + Assert(BTreeTupleIsPivot(tuple)); + cmpresult = 0; + if (subkey->sk_flags & SK_ROW_END) + break; + subkey++; + continue; + } + + datum = index_getattr(tuple, + subkey->sk_attno, + tupdesc, + &isNull); + + if (isNull) + { + if (subkey->sk_flags & SK_BT_NULLS_FIRST) + { + /* + * Since NULLs are sorted before non-NULLs, we know we have + * reached the lower limit of the range of values for this + * index attr. On a backward scan, we can stop if this qual + * is one of the "must match" subset. We can stop regardless + * of whether the qual is > or <, so long as it's required, + * because it's not possible for any future tuples to pass. On + * a forward scan, however, we must keep going, because we may + * have initially positioned to the start of the index. + */ + if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && + ScanDirectionIsBackward(dir)) + *continuescan = false; + } + else + { + /* + * Since NULLs are sorted after non-NULLs, we know we have + * reached the upper limit of the range of values for this + * index attr. On a forward scan, we can stop if this qual is + * one of the "must match" subset. We can stop regardless of + * whether the qual is > or <, so long as it's required, + * because it's not possible for any future tuples to pass. On + * a backward scan, however, we must keep going, because we + * may have initially positioned to the end of the index. + */ + if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && + ScanDirectionIsForward(dir)) + *continuescan = false; + } + + /* + * In any case, this indextuple doesn't match the qual. + */ + return false; + } + + if (subkey->sk_flags & SK_ISNULL) + { + /* + * Unlike the simple-scankey case, this isn't a disallowed case. + * But it can never match. If all the earlier row comparison + * columns are required for the scan direction, we can stop the + * scan, because there can't be another tuple that will succeed. + */ + if (subkey != (ScanKey) DatumGetPointer(skey->sk_argument)) + subkey--; + if ((subkey->sk_flags & SK_BT_REQFWD) && + ScanDirectionIsForward(dir)) + *continuescan = false; + else if ((subkey->sk_flags & SK_BT_REQBKWD) && + ScanDirectionIsBackward(dir)) + *continuescan = false; + return false; + } + + /* Perform the test --- three-way comparison not bool operator */ + cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func, + subkey->sk_collation, + datum, + subkey->sk_argument)); + + if (subkey->sk_flags & SK_BT_DESC) + INVERT_COMPARE_RESULT(cmpresult); + + /* Done comparing if unequal, else advance to next column */ + if (cmpresult != 0) + break; + + if (subkey->sk_flags & SK_ROW_END) + break; + subkey++; + } + + /* + * At this point cmpresult indicates the overall result of the row + * comparison, and subkey points to the deciding column (or the last + * column if the result is "="). 
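+ * (Illustrative example: evaluating the row condition (a, b) <= (5, 3)
+ * against a tuple with a = 5 and b = 2 passes through the "a" column with a
+ * zero comparison and is decided by the "b" column, whose negative cmpresult
+ * satisfies the "<=" test below.)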
+ */ + switch (subkey->sk_strategy) + { + /* EQ and NE cases aren't allowed here */ + case BTLessStrategyNumber: + result = (cmpresult < 0); + break; + case BTLessEqualStrategyNumber: + result = (cmpresult <= 0); + break; + case BTGreaterEqualStrategyNumber: + result = (cmpresult >= 0); + break; + case BTGreaterStrategyNumber: + result = (cmpresult > 0); + break; + default: + elog(ERROR, "unrecognized RowCompareType: %d", + (int) subkey->sk_strategy); + result = 0; /* keep compiler quiet */ + break; + } + + if (!result) + { + /* + * Tuple fails this qual. If it's a required qual for the current + * scan direction, then we can conclude no further tuples will pass, + * either. Note we have to look at the deciding column, not + * necessarily the first or last column of the row condition. + */ + if ((subkey->sk_flags & SK_BT_REQFWD) && + ScanDirectionIsForward(dir)) + *continuescan = false; + else if ((subkey->sk_flags & SK_BT_REQBKWD) && + ScanDirectionIsBackward(dir)) + *continuescan = false; + } + + return result; +} + +/* + * _bt_killitems - set LP_DEAD state for items an indexscan caller has + * told us were killed + * + * scan->opaque, referenced locally through so, contains information about the + * current page and killed tuples thereon (generally, this should only be + * called if so->numKilled > 0). + * + * The caller does not have a lock on the page and may or may not have the + * page pinned in a buffer. Note that read-lock is sufficient for setting + * LP_DEAD status (which is only a hint). + * + * We match items by heap TID before assuming they are the right ones to + * delete. We cope with cases where items have moved right due to insertions. + * If an item has moved off the current page due to a split, we'll fail to + * find it and do nothing (this is not an error case --- we assume the item + * will eventually get marked in a future indexscan). + * + * Note that if we hold a pin on the target page continuously from initially + * reading the items until applying this function, VACUUM cannot have deleted + * any items from the page, and so there is no need to search left from the + * recorded offset. (This observation also guarantees that the item is still + * the right one to delete, which might otherwise be questionable since heap + * TIDs can get recycled.) This holds true even if the page has been modified + * by inserts and page splits, so there is no need to consult the LSN. + * + * If the pin was released after reading the page, then we re-read it. If it + * has been modified since we read it (as determined by the LSN), we dare not + * flag any entries because it is possible that the old entry was vacuumed + * away and the TID was re-used by a completely different heap tuple. + */ +void +_bt_killitems(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Page page; + BTPageOpaque opaque; + OffsetNumber minoff; + OffsetNumber maxoff; + int i; + int numKilled = so->numKilled; + bool killedsomething = false; + bool droppedpin PG_USED_FOR_ASSERTS_ONLY; + + Assert(BTScanPosIsValid(so->currPos)); + + /* + * Always reset the scan state, so we don't look for same items on other + * pages. + */ + so->numKilled = 0; + + if (BTScanPosIsPinned(so->currPos)) + { + /* + * We have held the pin on this page since we read the index tuples, + * so all we need to do is lock it. The pin will have prevented + * re-use of any TID on the page, so there is no need to check the + * LSN. 
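+ * (Holding the pin is enough because btree VACUUM only removes items from a
+ * leaf page after obtaining a cleanup lock, which it cannot acquire while
+ * our pin remains.)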
+ */ + droppedpin = false; + _bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ); + + page = BufferGetPage(so->currPos.buf); + } + else + { + Buffer buf; + + droppedpin = true; + /* Attempt to re-read the buffer, getting pin and lock. */ + buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ); + + page = BufferGetPage(buf); + if (BufferGetLSNAtomic(buf) == so->currPos.lsn) + so->currPos.buf = buf; + else + { + /* Modified while not pinned means hinting is not safe. */ + _bt_relbuf(scan->indexRelation, buf); + return; + } + } + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + + for (i = 0; i < numKilled; i++) + { + int itemIndex = so->killedItems[i]; + BTScanPosItem *kitem = &so->currPos.items[itemIndex]; + OffsetNumber offnum = kitem->indexOffset; + + Assert(itemIndex >= so->currPos.firstItem && + itemIndex <= so->currPos.lastItem); + if (offnum < minoff) + continue; /* pure paranoia */ + while (offnum <= maxoff) + { + ItemId iid = PageGetItemId(page, offnum); + IndexTuple ituple = (IndexTuple) PageGetItem(page, iid); + bool killtuple = false; + + if (BTreeTupleIsPosting(ituple)) + { + int pi = i + 1; + int nposting = BTreeTupleGetNPosting(ituple); + int j; + + /* + * We rely on the convention that heap TIDs in the scanpos + * items array are stored in ascending heap TID order for a + * group of TIDs that originally came from a posting list + * tuple. This convention even applies during backwards + * scans, where returning the TIDs in descending order might + * seem more natural. This is about effectiveness, not + * correctness. + * + * Note that the page may have been modified in almost any way + * since we first read it (in the !droppedpin case), so it's + * possible that this posting list tuple wasn't a posting list + * tuple when we first encountered its heap TIDs. + */ + for (j = 0; j < nposting; j++) + { + ItemPointer item = BTreeTupleGetPostingN(ituple, j); + + if (!ItemPointerEquals(item, &kitem->heapTid)) + break; /* out of posting list loop */ + + /* + * kitem must have matching offnum when heap TIDs match, + * though only in the common case where the page can't + * have been concurrently modified + */ + Assert(kitem->indexOffset == offnum || !droppedpin); + + /* + * Read-ahead to later kitems here. + * + * We rely on the assumption that not advancing kitem here + * will prevent us from considering the posting list tuple + * fully dead by not matching its next heap TID in next + * loop iteration. + * + * If, on the other hand, this is the final heap TID in + * the posting list tuple, then tuple gets killed + * regardless (i.e. we handle the case where the last + * kitem is also the last heap TID in the last index tuple + * correctly -- posting tuple still gets killed). + */ + if (pi < numKilled) + kitem = &so->currPos.items[so->killedItems[pi++]]; + } + + /* + * Don't bother advancing the outermost loop's int iterator to + * avoid processing killed items that relate to the same + * offnum/posting list tuple. This micro-optimization hardly + * seems worth it. (Further iterations of the outermost loop + * will fail to match on this same posting list's first heap + * TID instead, so we'll advance to the next offnum/index + * tuple pretty quickly.) + */ + if (j == nposting) + killtuple = true; + } + else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid)) + killtuple = true; + + /* + * Mark index item as dead, if it isn't already. 
Since this + * happens while holding a buffer lock possibly in shared mode, + * it's possible that multiple processes attempt to do this + * simultaneously, leading to multiple full-page images being sent + * to WAL (if wal_log_hints or data checksums are enabled), which + * is undesirable. + */ + if (killtuple && !ItemIdIsDead(iid)) + { + /* found the item/all posting list items */ + ItemIdMarkDead(iid); + killedsomething = true; + break; /* out of inner search loop */ + } + offnum = OffsetNumberNext(offnum); + } + } + + /* + * Since this can be redone later if needed, mark as dirty hint. + * + * Whenever we mark anything LP_DEAD, we also set the page's + * BTP_HAS_GARBAGE flag, which is likewise just a hint. (Note that we + * only rely on the page-level flag in !heapkeyspace indexes.) + */ + if (killedsomething) + { + opaque->btpo_flags |= BTP_HAS_GARBAGE; + MarkBufferDirtyHint(so->currPos.buf, true); + } + + _bt_unlockbuf(scan->indexRelation, so->currPos.buf); +} + + +/* + * The following routines manage a shared-memory area in which we track + * assignment of "vacuum cycle IDs" to currently-active btree vacuuming + * operations. There is a single counter which increments each time we + * start a vacuum to assign it a cycle ID. Since multiple vacuums could + * be active concurrently, we have to track the cycle ID for each active + * vacuum; this requires at most MaxBackends entries (usually far fewer). + * We assume at most one vacuum can be active for a given index. + * + * Access to the shared memory area is controlled by BtreeVacuumLock. + * In principle we could use a separate lmgr locktag for each index, + * but a single LWLock is much cheaper, and given the short time that + * the lock is ever held, the concurrency hit should be minimal. + */ + +typedef struct BTOneVacInfo +{ + LockRelId relid; /* global identifier of an index */ + BTCycleId cycleid; /* cycle ID for its active VACUUM */ +} BTOneVacInfo; + +typedef struct BTVacInfo +{ + BTCycleId cycle_ctr; /* cycle ID most recently assigned */ + int num_vacuums; /* number of currently active VACUUMs */ + int max_vacuums; /* allocated length of vacuums[] array */ + BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER]; +} BTVacInfo; + +static BTVacInfo *btvacinfo; + + +/* + * _bt_vacuum_cycleid --- get the active vacuum cycle ID for an index, + * or zero if there is no active VACUUM + * + * Note: for correct interlocking, the caller must already hold pin and + * exclusive lock on each buffer it will store the cycle ID into. This + * ensures that even if a VACUUM starts immediately afterwards, it cannot + * process those pages until the page split is complete. + */ +BTCycleId +_bt_vacuum_cycleid(Relation rel) +{ + BTCycleId result = 0; + int i; + + /* Share lock is enough since this is a read-only operation */ + LWLockAcquire(BtreeVacuumLock, LW_SHARED); + + for (i = 0; i < btvacinfo->num_vacuums; i++) + { + BTOneVacInfo *vac = &btvacinfo->vacuums[i]; + + if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId && + vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId) + { + result = vac->cycleid; + break; + } + } + + LWLockRelease(BtreeVacuumLock); + return result; +} + +/* + * _bt_start_vacuum --- assign a cycle ID to a just-starting VACUUM operation + * + * Note: the caller must guarantee that it will eventually call + * _bt_end_vacuum, else we'll permanently leak an array slot. 
To ensure + * that this happens even in elog(FATAL) scenarios, the appropriate coding + * is not just a PG_TRY, but + * PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel)) + */ +BTCycleId +_bt_start_vacuum(Relation rel) +{ + BTCycleId result; + int i; + BTOneVacInfo *vac; + + LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE); + + /* + * Assign the next cycle ID, being careful to avoid zero as well as the + * reserved high values. + */ + result = ++(btvacinfo->cycle_ctr); + if (result == 0 || result > MAX_BT_CYCLE_ID) + result = btvacinfo->cycle_ctr = 1; + + /* Let's just make sure there's no entry already for this index */ + for (i = 0; i < btvacinfo->num_vacuums; i++) + { + vac = &btvacinfo->vacuums[i]; + if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId && + vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId) + { + /* + * Unlike most places in the backend, we have to explicitly + * release our LWLock before throwing an error. This is because + * we expect _bt_end_vacuum() to be called before transaction + * abort cleanup can run to release LWLocks. + */ + LWLockRelease(BtreeVacuumLock); + elog(ERROR, "multiple active vacuums for index \"%s\"", + RelationGetRelationName(rel)); + } + } + + /* OK, add an entry */ + if (btvacinfo->num_vacuums >= btvacinfo->max_vacuums) + { + LWLockRelease(BtreeVacuumLock); + elog(ERROR, "out of btvacinfo slots"); + } + vac = &btvacinfo->vacuums[btvacinfo->num_vacuums]; + vac->relid = rel->rd_lockInfo.lockRelId; + vac->cycleid = result; + btvacinfo->num_vacuums++; + + LWLockRelease(BtreeVacuumLock); + return result; +} + +/* + * _bt_end_vacuum --- mark a btree VACUUM operation as done + * + * Note: this is deliberately coded not to complain if no entry is found; + * this allows the caller to put PG_TRY around the start_vacuum operation. + */ +void +_bt_end_vacuum(Relation rel) +{ + int i; + + LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE); + + /* Find the array entry */ + for (i = 0; i < btvacinfo->num_vacuums; i++) + { + BTOneVacInfo *vac = &btvacinfo->vacuums[i]; + + if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId && + vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId) + { + /* Remove it by shifting down the last entry */ + *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1]; + btvacinfo->num_vacuums--; + break; + } + } + + LWLockRelease(BtreeVacuumLock); +} + +/* + * _bt_end_vacuum wrapped as an on_shmem_exit callback function + */ +void +_bt_end_vacuum_callback(int code, Datum arg) +{ + _bt_end_vacuum((Relation) DatumGetPointer(arg)); +} + +/* + * BTreeShmemSize --- report amount of shared memory space needed + */ +Size +BTreeShmemSize(void) +{ + Size size; + + size = offsetof(BTVacInfo, vacuums); + size = add_size(size, mul_size(MaxBackends, sizeof(BTOneVacInfo))); + return size; +} + +/* + * BTreeShmemInit --- initialize this module's shared memory + */ +void +BTreeShmemInit(void) +{ + bool found; + + btvacinfo = (BTVacInfo *) ShmemInitStruct("BTree Vacuum State", + BTreeShmemSize(), + &found); + + if (!IsUnderPostmaster) + { + /* Initialize shared memory area */ + Assert(!found); + + /* + * It doesn't really matter what the cycle counter starts at, but + * having it always start the same doesn't seem good. Seed with + * low-order bits of time() instead. 
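+ * (A seed that happens to exceed MAX_BT_CYCLE_ID is harmless, since
+ * _bt_start_vacuum wraps the counter back to 1 the next time it assigns a
+ * cycle ID.)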
+ */ + btvacinfo->cycle_ctr = (BTCycleId) time(NULL); + + btvacinfo->num_vacuums = 0; + btvacinfo->max_vacuums = MaxBackends; + } + else + Assert(found); +} + +bytea * +btoptions(Datum reloptions, bool validate) +{ + static const relopt_parse_elt tab[] = { + {"fillfactor", RELOPT_TYPE_INT, offsetof(BTOptions, fillfactor)}, + {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL, + offsetof(BTOptions, vacuum_cleanup_index_scale_factor)}, + {"deduplicate_items", RELOPT_TYPE_BOOL, + offsetof(BTOptions, deduplicate_items)} + + }; + + return (bytea *) build_reloptions(reloptions, validate, + RELOPT_KIND_BTREE, + sizeof(BTOptions), + tab, lengthof(tab)); + +} + +/* + * btproperty() -- Check boolean properties of indexes. + * + * This is optional, but handling AMPROP_RETURNABLE here saves opening the rel + * to call btcanreturn. + */ +bool +btproperty(Oid index_oid, int attno, + IndexAMProperty prop, const char *propname, + bool *res, bool *isnull) +{ + switch (prop) + { + case AMPROP_RETURNABLE: + /* answer only for columns, not AM or whole index */ + if (attno == 0) + return false; + /* otherwise, btree can always return data */ + *res = true; + return true; + + default: + return false; /* punt to generic code */ + } +} + +/* + * btbuildphasename() -- Return name of index build phase. + */ +char * +btbuildphasename(int64 phasenum) +{ + switch (phasenum) + { + case PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE: + return "initializing"; + case PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN: + return "scanning table"; + case PROGRESS_BTREE_PHASE_PERFORMSORT_1: + return "sorting live tuples"; + case PROGRESS_BTREE_PHASE_PERFORMSORT_2: + return "sorting dead tuples"; + case PROGRESS_BTREE_PHASE_LEAF_LOAD: + return "loading tuples in tree"; + default: + return NULL; + } +} + +/* + * _bt_truncate() -- create tuple without unneeded suffix attributes. + * + * Returns truncated pivot index tuple allocated in caller's memory context, + * with key attributes copied from caller's firstright argument. If rel is + * an INCLUDE index, non-key attributes will definitely be truncated away, + * since they're not part of the key space. More aggressive suffix + * truncation can take place when it's clear that the returned tuple does not + * need one or more suffix key attributes. We only need to keep firstright + * attributes up to and including the first non-lastleft-equal attribute. + * Caller's insertion scankey is used to compare the tuples; the scankey's + * argument values are not considered here. + * + * Note that returned tuple's t_tid offset will hold the number of attributes + * present, so the original item pointer offset is not represented. Caller + * should only change truncated tuple's downlink. Note also that truncated + * key attributes are treated as containing "minus infinity" values by + * _bt_compare(). + * + * In the worst case (when a heap TID must be appended to distinguish lastleft + * from firstright), the size of the returned tuple is the size of firstright + * plus the size of an additional MAXALIGN()'d item pointer. This guarantee + * is important, since callers need to stay under the 1/3 of a page + * restriction on tuple size. If this routine is ever taught to truncate + * within an attribute/datum, it will need to avoid returning an enlarged + * tuple to caller when truncation + TOAST compression ends up enlarging the + * final datum. 
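+ *
+ * (Illustrative example: in an index on (a int4, b int4), splitting between
+ * lastleft = (5, 100) and firstright = (7, 2) needs only the first attribute
+ * to distinguish the halves, so the new pivot is just "(7)" with "b"
+ * truncated away; the heap TID tiebreaker described above is appended only
+ * when every key attribute is equal.)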
+ */ +IndexTuple +_bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, + BTScanInsert itup_key) +{ + TupleDesc itupdesc = RelationGetDescr(rel); + int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + int keepnatts; + IndexTuple pivot; + IndexTuple tidpivot; + ItemPointer pivotheaptid; + Size newsize; + + /* + * We should only ever truncate non-pivot tuples from leaf pages. It's + * never okay to truncate when splitting an internal page. + */ + Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright)); + + /* Determine how many attributes must be kept in truncated tuple */ + keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key); + +#ifdef DEBUG_NO_TRUNCATE + /* Force truncation to be ineffective for testing purposes */ + keepnatts = nkeyatts + 1; +#endif + + pivot = index_truncate_tuple(itupdesc, firstright, + Min(keepnatts, nkeyatts)); + + if (BTreeTupleIsPosting(pivot)) + { + /* + * index_truncate_tuple() just returns a straight copy of firstright + * when it has no attributes to truncate. When that happens, we may + * need to truncate away a posting list here instead. + */ + Assert(keepnatts == nkeyatts || keepnatts == nkeyatts + 1); + Assert(IndexRelationGetNumberOfAttributes(rel) == nkeyatts); + pivot->t_info &= ~INDEX_SIZE_MASK; + pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright)); + } + + /* + * If there is a distinguishing key attribute within pivot tuple, we're + * done + */ + if (keepnatts <= nkeyatts) + { + BTreeTupleSetNAtts(pivot, keepnatts, false); + return pivot; + } + + /* + * We have to store a heap TID in the new pivot tuple, since no non-TID + * key attribute value in firstright distinguishes the right side of the + * split from the left side. nbtree conceptualizes this case as an + * inability to truncate away any key attributes, since heap TID is + * treated as just another key attribute (despite lacking a pg_attribute + * entry). + * + * Use enlarged space that holds a copy of pivot. We need the extra space + * to store a heap TID at the end (using the special pivot tuple + * representation). Note that the original pivot already has firstright's + * possible posting list/non-key attribute values removed at this point. + */ + newsize = MAXALIGN(IndexTupleSize(pivot)) + MAXALIGN(sizeof(ItemPointerData)); + tidpivot = palloc0(newsize); + memcpy(tidpivot, pivot, MAXALIGN(IndexTupleSize(pivot))); + /* Cannot leak memory here */ + pfree(pivot); + + /* + * Store all of firstright's key attribute values plus a tiebreaker heap + * TID value in enlarged pivot tuple + */ + tidpivot->t_info &= ~INDEX_SIZE_MASK; + tidpivot->t_info |= newsize; + BTreeTupleSetNAtts(tidpivot, nkeyatts, true); + pivotheaptid = BTreeTupleGetHeapTID(tidpivot); + + /* + * Lehman & Yao use lastleft as the leaf high key in all cases, but don't + * consider suffix truncation. It seems like a good idea to follow that + * example in cases where no truncation takes place -- use lastleft's heap + * TID. (This is also the closest value to negative infinity that's + * legally usable.) + */ + ItemPointerCopy(BTreeTupleGetMaxHeapTID(lastleft), pivotheaptid); + + /* + * We're done. Assert() that heap TID invariants hold before returning. + * + * Lehman and Yao require that the downlink to the right page, which is to + * be inserted into the parent page in the second phase of a page split be + * a strict lower bound on items on the right page, and a non-strict upper + * bound for items on the left page. 
Assert that heap TIDs follow these + * invariants, since a heap TID value is apparently needed as a + * tiebreaker. + */ +#ifndef DEBUG_NO_TRUNCATE + Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(lastleft), + BTreeTupleGetHeapTID(firstright)) < 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(lastleft)) >= 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(firstright)) < 0); +#else + + /* + * Those invariants aren't guaranteed to hold for lastleft + firstright + * heap TID attribute values when they're considered here only because + * DEBUG_NO_TRUNCATE is defined (a heap TID is probably not actually + * needed as a tiebreaker). DEBUG_NO_TRUNCATE must therefore use a heap + * TID value that always works as a strict lower bound for items to the + * right. In particular, it must avoid using firstright's leading key + * attribute values along with lastleft's heap TID value when lastleft's + * TID happens to be greater than firstright's TID. + */ + ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid); + + /* + * Pivot heap TID should never be fully equal to firstright. Note that + * the pivot heap TID will still end up equal to lastleft's heap TID when + * that's the only usable value. + */ + ItemPointerSetOffsetNumber(pivotheaptid, + OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid))); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(firstright)) < 0); +#endif + + return tidpivot; +} + +/* + * _bt_keep_natts - how many key attributes to keep when truncating. + * + * Caller provides two tuples that enclose a split point. Caller's insertion + * scankey is used to compare the tuples; the scankey's argument values are + * not considered here. + * + * This can return a number of attributes that is one greater than the + * number of key attributes for the index relation. This indicates that the + * caller must use a heap TID as a unique-ifier in new pivot tuple. + */ +static int +_bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, + BTScanInsert itup_key) +{ + int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + TupleDesc itupdesc = RelationGetDescr(rel); + int keepnatts; + ScanKey scankey; + + /* + * _bt_compare() treats truncated key attributes as having the value minus + * infinity, which would break searches within !heapkeyspace indexes. We + * must still truncate away non-key attribute values, though. + */ + if (!itup_key->heapkeyspace) + return nkeyatts; + + scankey = itup_key->scankeys; + keepnatts = 1; + for (int attnum = 1; attnum <= nkeyatts; attnum++, scankey++) + { + Datum datum1, + datum2; + bool isNull1, + isNull2; + + datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); + datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); + + if (isNull1 != isNull2) + break; + + if (!isNull1 && + DatumGetInt32(FunctionCall2Coll(&scankey->sk_func, + scankey->sk_collation, + datum1, + datum2)) != 0) + break; + + keepnatts++; + } + + /* + * Assert that _bt_keep_natts_fast() agrees with us in passing. This is + * expected in an allequalimage index. + */ + Assert(!itup_key->allequalimage || + keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright)); + + return keepnatts; +} + +/* + * _bt_keep_natts_fast - fast bitwise variant of _bt_keep_natts. + * + * This is exported so that a candidate split point can have its effect on + * suffix truncation inexpensively evaluated ahead of time when finding a + * split location. 
A naive bitwise approach to datum comparisons is used to + * save cycles. + * + * The approach taken here usually provides the same answer as _bt_keep_natts + * will (for the same pair of tuples from a heapkeyspace index), since the + * majority of btree opclasses can never indicate that two datums are equal + * unless they're bitwise equal after detoasting. When an index only has + * "equal image" columns, routine is guaranteed to give the same result as + * _bt_keep_natts would. + * + * Callers can rely on the fact that attributes considered equal here are + * definitely also equal according to _bt_keep_natts, even when the index uses + * an opclass or collation that is not "allequalimage"/deduplication-safe. + * This weaker guarantee is good enough for nbtsplitloc.c caller, since false + * negatives generally only have the effect of making leaf page splits use a + * more balanced split point. + */ +int +_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) +{ + TupleDesc itupdesc = RelationGetDescr(rel); + int keysz = IndexRelationGetNumberOfKeyAttributes(rel); + int keepnatts; + + keepnatts = 1; + for (int attnum = 1; attnum <= keysz; attnum++) + { + Datum datum1, + datum2; + bool isNull1, + isNull2; + Form_pg_attribute att; + + datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); + datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); + att = TupleDescAttr(itupdesc, attnum - 1); + + if (isNull1 != isNull2) + break; + + if (!isNull1 && + !datum_image_eq(datum1, datum2, att->attbyval, att->attlen)) + break; + + keepnatts++; + } + + return keepnatts; +} + +/* + * _bt_check_natts() -- Verify tuple has expected number of attributes. + * + * Returns value indicating if the expected number of attributes were found + * for a particular offset on page. This can be used as a general purpose + * sanity check. + * + * Testing a tuple directly with BTreeTupleGetNAtts() should generally be + * preferred to calling here. That's usually more convenient, and is always + * more explicit. Call here instead when offnum's tuple may be a negative + * infinity tuple that uses the pre-v11 on-disk representation, or when a low + * context check is appropriate. This routine is as strict as possible about + * what is expected on each version of btree. 
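+ * (contrib/amcheck's verification code is one low-context caller that relies
+ * on this.)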
+ */ +bool +_bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum) +{ + int16 natts = IndexRelationGetNumberOfAttributes(rel); + int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + IndexTuple itup; + int tupnatts; + + /* + * We cannot reliably test a deleted or half-dead page, since they have + * dummy high keys + */ + if (P_IGNORE(opaque)) + return true; + + Assert(offnum >= FirstOffsetNumber && + offnum <= PageGetMaxOffsetNumber(page)); + + /* + * Mask allocated for number of keys in index tuple must be able to fit + * maximum possible number of index attributes + */ + StaticAssertStmt(BT_OFFSET_MASK >= INDEX_MAX_KEYS, + "BT_OFFSET_MASK can't fit INDEX_MAX_KEYS"); + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + tupnatts = BTreeTupleGetNAtts(itup, rel); + + /* !heapkeyspace indexes do not support deduplication */ + if (!heapkeyspace && BTreeTupleIsPosting(itup)) + return false; + + /* Posting list tuples should never have "pivot heap TID" bit set */ + if (BTreeTupleIsPosting(itup) && + (ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & + BT_PIVOT_HEAP_TID_ATTR) != 0) + return false; + + /* INCLUDE indexes do not support deduplication */ + if (natts != nkeyatts && BTreeTupleIsPosting(itup)) + return false; + + if (P_ISLEAF(opaque)) + { + if (offnum >= P_FIRSTDATAKEY(opaque)) + { + /* + * Non-pivot tuple should never be explicitly marked as a pivot + * tuple + */ + if (BTreeTupleIsPivot(itup)) + return false; + + /* + * Leaf tuples that are not the page high key (non-pivot tuples) + * should never be truncated. (Note that tupnatts must have been + * inferred, even with a posting list tuple, because only pivot + * tuples store tupnatts directly.) + */ + return tupnatts == natts; + } + else + { + /* + * Rightmost page doesn't contain a page high key, so tuple was + * checked above as ordinary leaf tuple + */ + Assert(!P_RIGHTMOST(opaque)); + + /* + * !heapkeyspace high key tuple contains only key attributes. Note + * that tupnatts will only have been explicitly represented in + * !heapkeyspace indexes that happen to have non-key attributes. + */ + if (!heapkeyspace) + return tupnatts == nkeyatts; + + /* Use generic heapkeyspace pivot tuple handling */ + } + } + else /* !P_ISLEAF(opaque) */ + { + if (offnum == P_FIRSTDATAKEY(opaque)) + { + /* + * The first tuple on any internal page (possibly the first after + * its high key) is its negative infinity tuple. Negative + * infinity tuples are always truncated to zero attributes. They + * are a particular kind of pivot tuple. + */ + if (heapkeyspace) + return tupnatts == 0; + + /* + * The number of attributes won't be explicitly represented if the + * negative infinity tuple was generated during a page split that + * occurred with a version of Postgres before v11. There must be + * a problem when there is an explicit representation that is + * non-zero, or when there is no explicit representation and the + * tuple is evidently not a pre-pg_upgrade tuple. + * + * Prior to v11, downlinks always had P_HIKEY as their offset. + * Accept that as an alternative indication of a valid + * !heapkeyspace negative infinity tuple. + */ + return tupnatts == 0 || + ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY; + } + else + { + /* + * !heapkeyspace downlink tuple with separator key contains only + * key attributes. 
Note that tupnatts will only have been + * explicitly represented in !heapkeyspace indexes that happen to + * have non-key attributes. + */ + if (!heapkeyspace) + return tupnatts == nkeyatts; + + /* Use generic heapkeyspace pivot tuple handling */ + } + + } + + /* Handle heapkeyspace pivot tuples (excluding minus infinity items) */ + Assert(heapkeyspace); + + /* + * Explicit representation of the number of attributes is mandatory with + * heapkeyspace index pivot tuples, regardless of whether or not there are + * non-key attributes. + */ + if (!BTreeTupleIsPivot(itup)) + return false; + + /* Pivot tuple should not use posting list representation (redundant) */ + if (BTreeTupleIsPosting(itup)) + return false; + + /* + * Heap TID is a tiebreaker key attribute, so it cannot be untruncated + * when any other key attribute is truncated + */ + if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts) + return false; + + /* + * Pivot tuple must have at least one untruncated key attribute (minus + * infinity pivot tuples are the only exception). Pivot tuples can never + * represent that there is a value present for a key attribute that + * exceeds pg_index.indnkeyatts for the index. + */ + return tupnatts > 0 && tupnatts <= nkeyatts; +} + +/* + * + * _bt_check_third_page() -- check whether tuple fits on a btree page at all. + * + * We actually need to be able to fit three items on every page, so restrict + * any one item to 1/3 the per-page available space. Note that itemsz should + * not include the ItemId overhead. + * + * It might be useful to apply TOAST methods rather than throw an error here. + * Using out of line storage would break assumptions made by suffix truncation + * and by contrib/amcheck, though. + */ +void +_bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, + Page page, IndexTuple newtup) +{ + Size itemsz; + BTPageOpaque opaque; + + itemsz = MAXALIGN(IndexTupleSize(newtup)); + + /* Double check item size against limit */ + if (itemsz <= BTMaxItemSize(page)) + return; + + /* + * Tuple is probably too large to fit on page, but it's possible that the + * index uses version 2 or version 3, or that page is an internal page, in + * which case a slightly higher limit applies. + */ + if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid(page)) + return; + + /* + * Internal page insertions cannot fail here, because that would mean that + * an earlier leaf level insertion that should have failed didn't + */ + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (!P_ISLEAF(opaque)) + elog(ERROR, "cannot insert oversized tuple of size %zu on internal page of index \"%s\"", + itemsz, RelationGetRelationName(rel)); + + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"", + itemsz, + needheaptidspace ? BTREE_VERSION : BTREE_NOVAC_VERSION, + needheaptidspace ? BTMaxItemSize(page) : + BTMaxItemSizeNoHeapTid(page), + RelationGetRelationName(rel)), + errdetail("Index row references tuple (%u,%u) in relation \"%s\".", + ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)), + ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)), + RelationGetRelationName(heap)), + errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n" + "Consider a function index of an MD5 hash of the value, " + "or use full text indexing."), + errtableconstraint(heap, RelationGetRelationName(rel)))); +} + +/* + * Are all attributes in rel "equality is image equality" attributes? 
+ * + * We use each attribute's BTEQUALIMAGE_PROC opclass procedure. If any + * opclass either lacks a BTEQUALIMAGE_PROC procedure or returns false, we + * return false; otherwise we return true. + * + * Returned boolean value is stored in index metapage during index builds. + * Deduplication can only be used when we return true. + */ +bool +_bt_allequalimage(Relation rel, bool debugmessage) +{ + bool allequalimage = true; + + /* INCLUDE indexes don't support deduplication */ + if (IndexRelationGetNumberOfAttributes(rel) != + IndexRelationGetNumberOfKeyAttributes(rel)) + return false; + + /* + * There is no special reason why deduplication cannot work with system + * relations (i.e. with system catalog indexes and TOAST indexes). We + * deem deduplication unsafe for these indexes all the same, since the + * alternative is to force users to always use deduplication, without + * being able to opt out. (ALTER INDEX is not supported with system + * indexes, so users would have no way to set the deduplicate_items + * storage parameter to 'off'.) + */ + if (IsSystemRelation(rel)) + return false; + + for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(rel); i++) + { + Oid opfamily = rel->rd_opfamily[i]; + Oid opcintype = rel->rd_opcintype[i]; + Oid collation = rel->rd_indcollation[i]; + Oid equalimageproc; + + equalimageproc = get_opfamily_proc(opfamily, opcintype, opcintype, + BTEQUALIMAGE_PROC); + + /* + * If there is no BTEQUALIMAGE_PROC then deduplication is assumed to + * be unsafe. Otherwise, actually call proc and see what it says. + */ + if (!OidIsValid(equalimageproc) || + !DatumGetBool(OidFunctionCall1Coll(equalimageproc, collation, + ObjectIdGetDatum(opcintype)))) + { + allequalimage = false; + break; + } + } + + /* + * Don't elog() until here to avoid reporting on a system relation index + * or an INCLUDE index + */ + if (debugmessage) + { + if (allequalimage) + elog(DEBUG1, "index \"%s\" can safely use deduplication", + RelationGetRelationName(rel)); + else + elog(DEBUG1, "index \"%s\" cannot use deduplication", + RelationGetRelationName(rel)); + } + + return allequalimage; +} diff --git a/src/backend/access/nbtree/nbtvalidate.c b/src/backend/access/nbtree/nbtvalidate.c new file mode 100644 index 0000000..7acb64e --- /dev/null +++ b/src/backend/access/nbtree/nbtvalidate.c @@ -0,0 +1,380 @@ +/*------------------------------------------------------------------------- + * + * nbtvalidate.c + * Opclass validator for btree. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtvalidate.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/amvalidate.h" +#include "access/htup_details.h" +#include "access/nbtree.h" +#include "access/xact.h" +#include "catalog/pg_am.h" +#include "catalog/pg_amop.h" +#include "catalog/pg_amproc.h" +#include "catalog/pg_opclass.h" +#include "catalog/pg_opfamily.h" +#include "catalog/pg_type.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/regproc.h" +#include "utils/syscache.h" + + +/* + * Validator for a btree opclass. + * + * Some of the checks done here cover the whole opfamily, and therefore are + * redundant when checking each opclass in a family. But they don't run long + * enough to be much of a problem, so we accept the duplication rather than + * complicate the amvalidate API. 
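+ *
+ * (The usual entry point is the SQL-callable amvalidate() function, as in
+ * "SELECT amvalidate(oid) FROM pg_opclass".)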
+ */ +bool +btvalidate(Oid opclassoid) +{ + bool result = true; + HeapTuple classtup; + Form_pg_opclass classform; + Oid opfamilyoid; + Oid opcintype; + char *opclassname; + HeapTuple familytup; + Form_pg_opfamily familyform; + char *opfamilyname; + CatCList *proclist, + *oprlist; + List *grouplist; + OpFamilyOpFuncGroup *opclassgroup; + List *familytypes; + int usefulgroups; + int i; + ListCell *lc; + + /* Fetch opclass information */ + classtup = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclassoid)); + if (!HeapTupleIsValid(classtup)) + elog(ERROR, "cache lookup failed for operator class %u", opclassoid); + classform = (Form_pg_opclass) GETSTRUCT(classtup); + + opfamilyoid = classform->opcfamily; + opcintype = classform->opcintype; + opclassname = NameStr(classform->opcname); + + /* Fetch opfamily information */ + familytup = SearchSysCache1(OPFAMILYOID, ObjectIdGetDatum(opfamilyoid)); + if (!HeapTupleIsValid(familytup)) + elog(ERROR, "cache lookup failed for operator family %u", opfamilyoid); + familyform = (Form_pg_opfamily) GETSTRUCT(familytup); + + opfamilyname = NameStr(familyform->opfname); + + /* Fetch all operators and support functions of the opfamily */ + oprlist = SearchSysCacheList1(AMOPSTRATEGY, ObjectIdGetDatum(opfamilyoid)); + proclist = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(opfamilyoid)); + + /* Check individual support functions */ + for (i = 0; i < proclist->n_members; i++) + { + HeapTuple proctup = &proclist->members[i]->tuple; + Form_pg_amproc procform = (Form_pg_amproc) GETSTRUCT(proctup); + bool ok; + + /* Check procedure numbers and function signatures */ + switch (procform->amprocnum) + { + case BTORDER_PROC: + ok = check_amproc_signature(procform->amproc, INT4OID, true, + 2, 2, procform->amproclefttype, + procform->amprocrighttype); + break; + case BTSORTSUPPORT_PROC: + ok = check_amproc_signature(procform->amproc, VOIDOID, true, + 1, 1, INTERNALOID); + break; + case BTINRANGE_PROC: + ok = check_amproc_signature(procform->amproc, BOOLOID, true, + 5, 5, + procform->amproclefttype, + procform->amproclefttype, + procform->amprocrighttype, + BOOLOID, BOOLOID); + break; + case BTEQUALIMAGE_PROC: + ok = check_amproc_signature(procform->amproc, BOOLOID, true, + 1, 1, OIDOID); + break; + case BTOPTIONS_PROC: + ok = check_amoptsproc_signature(procform->amproc); + break; + default: + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains function %s with invalid support number %d", + opfamilyname, "btree", + format_procedure(procform->amproc), + procform->amprocnum))); + result = false; + continue; /* don't want additional message */ + } + + if (!ok) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains function %s with wrong signature for support number %d", + opfamilyname, "btree", + format_procedure(procform->amproc), + procform->amprocnum))); + result = false; + } + } + + /* Check individual operators */ + for (i = 0; i < oprlist->n_members; i++) + { + HeapTuple oprtup = &oprlist->members[i]->tuple; + Form_pg_amop oprform = (Form_pg_amop) GETSTRUCT(oprtup); + + /* Check that only allowed strategy numbers exist */ + if (oprform->amopstrategy < 1 || + oprform->amopstrategy > BTMaxStrategyNumber) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains operator %s with invalid strategy number %d", + opfamilyname, "btree", + 
format_operator(oprform->amopopr), + oprform->amopstrategy))); + result = false; + } + + /* btree doesn't support ORDER BY operators */ + if (oprform->amoppurpose != AMOP_SEARCH || + OidIsValid(oprform->amopsortfamily)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains invalid ORDER BY specification for operator %s", + opfamilyname, "btree", + format_operator(oprform->amopopr)))); + result = false; + } + + /* Check operator signature --- same for all btree strategies */ + if (!check_amop_signature(oprform->amopopr, BOOLOID, + oprform->amoplefttype, + oprform->amoprighttype)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains operator %s with wrong signature", + opfamilyname, "btree", + format_operator(oprform->amopopr)))); + result = false; + } + } + + /* Now check for inconsistent groups of operators/functions */ + grouplist = identify_opfamily_groups(oprlist, proclist); + usefulgroups = 0; + opclassgroup = NULL; + familytypes = NIL; + foreach(lc, grouplist) + { + OpFamilyOpFuncGroup *thisgroup = (OpFamilyOpFuncGroup *) lfirst(lc); + + /* + * It is possible for an in_range support function to have a RHS type + * that is otherwise irrelevant to the opfamily --- for instance, SQL + * requires the datetime_ops opclass to have range support with an + * interval offset. So, if this group appears to contain only an + * in_range function, ignore it: it doesn't represent a pair of + * supported types. + */ + if (thisgroup->operatorset == 0 && + thisgroup->functionset == (1 << BTINRANGE_PROC)) + continue; + + /* Else count it as a relevant group */ + usefulgroups++; + + /* Remember the group exactly matching the test opclass */ + if (thisgroup->lefttype == opcintype && + thisgroup->righttype == opcintype) + opclassgroup = thisgroup; + + /* + * Identify all distinct data types handled in this opfamily. This + * implementation is O(N^2), but there aren't likely to be enough + * types in the family for it to matter. + */ + familytypes = list_append_unique_oid(familytypes, thisgroup->lefttype); + familytypes = list_append_unique_oid(familytypes, thisgroup->righttype); + + /* + * Complain if there seems to be an incomplete set of either operators + * or support functions for this datatype pair. The sortsupport, + * in_range, and equalimage functions are considered optional. 
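+ * (The options support function is likewise optional; only the five
+ * comparison strategies and BTORDER_PROC are demanded below.)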
+ */ + if (thisgroup->operatorset != + ((1 << BTLessStrategyNumber) | + (1 << BTLessEqualStrategyNumber) | + (1 << BTEqualStrategyNumber) | + (1 << BTGreaterEqualStrategyNumber) | + (1 << BTGreaterStrategyNumber))) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s is missing operator(s) for types %s and %s", + opfamilyname, "btree", + format_type_be(thisgroup->lefttype), + format_type_be(thisgroup->righttype)))); + result = false; + } + if ((thisgroup->functionset & (1 << BTORDER_PROC)) == 0) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s is missing support function for types %s and %s", + opfamilyname, "btree", + format_type_be(thisgroup->lefttype), + format_type_be(thisgroup->righttype)))); + result = false; + } + } + + /* Check that the originally-named opclass is supported */ + /* (if group is there, we already checked it adequately above) */ + if (!opclassgroup) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator class \"%s\" of access method %s is missing operator(s)", + opclassname, "btree"))); + result = false; + } + + /* + * Complain if the opfamily doesn't have entries for all possible + * combinations of its supported datatypes. While missing cross-type + * operators are not fatal, they do limit the planner's ability to derive + * additional qual clauses from equivalence classes, so it seems + * reasonable to insist that all built-in btree opfamilies be complete. + */ + if (usefulgroups != (list_length(familytypes) * list_length(familytypes))) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s is missing cross-type operator(s)", + opfamilyname, "btree"))); + result = false; + } + + ReleaseCatCacheList(proclist); + ReleaseCatCacheList(oprlist); + ReleaseSysCache(familytup); + ReleaseSysCache(classtup); + + return result; +} + +/* + * Prechecking function for adding operators/functions to a btree opfamily. + */ +void +btadjustmembers(Oid opfamilyoid, + Oid opclassoid, + List *operators, + List *functions) +{ + Oid opcintype; + ListCell *lc; + + /* + * Btree operators and comparison support functions are always "loose" + * members of the opfamily if they are cross-type. If they are not + * cross-type, we prefer to tie them to the appropriate opclass ... but if + * the user hasn't created one, we can't do that, and must fall back to + * using the opfamily dependency. (We mustn't force creation of an + * opclass in such a case, as leaving an incomplete opclass laying about + * would be bad. Throwing an error is another undesirable alternative.) + * + * This behavior results in a bit of a dump/reload hazard, in that the + * order of restoring objects could affect what dependencies we end up + * with. pg_dump's existing behavior will preserve the dependency choices + * in most cases, but not if a cross-type operator has been bound tightly + * into an opclass. That's a mistake anyway, so silently "fixing" it + * isn't awful. + * + * Optional support functions are always "loose" family members. + * + * To avoid repeated lookups, we remember the most recently used opclass's + * input type. 
+ */ + if (OidIsValid(opclassoid)) + { + /* During CREATE OPERATOR CLASS, need CCI to see the pg_opclass row */ + CommandCounterIncrement(); + opcintype = get_opclass_input_type(opclassoid); + } + else + opcintype = InvalidOid; + + /* + * We handle operators and support functions almost identically, so rather + * than duplicate this code block, just join the lists. + */ + foreach(lc, list_concat_copy(operators, functions)) + { + OpFamilyMember *op = (OpFamilyMember *) lfirst(lc); + + if (op->is_func && op->number != BTORDER_PROC) + { + /* Optional support proc, so always a soft family dependency */ + op->ref_is_hard = false; + op->ref_is_family = true; + op->refobjid = opfamilyoid; + } + else if (op->lefttype != op->righttype) + { + /* Cross-type, so always a soft family dependency */ + op->ref_is_hard = false; + op->ref_is_family = true; + op->refobjid = opfamilyoid; + } + else + { + /* Not cross-type; is there a suitable opclass? */ + if (op->lefttype != opcintype) + { + /* Avoid repeating this expensive lookup, even if it fails */ + opcintype = op->lefttype; + opclassoid = opclass_for_family_datatype(BTREE_AM_OID, + opfamilyoid, + opcintype); + } + if (OidIsValid(opclassoid)) + { + /* Hard dependency on opclass */ + op->ref_is_hard = true; + op->ref_is_family = false; + op->refobjid = opclassoid; + } + else + { + /* We're stuck, so make a soft dependency on the opfamily */ + op->ref_is_hard = false; + op->ref_is_family = true; + op->refobjid = opfamilyoid; + } + } + } +} diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c new file mode 100644 index 0000000..786c08c --- /dev/null +++ b/src/backend/access/nbtree/nbtxlog.c @@ -0,0 +1,1126 @@ +/*------------------------------------------------------------------------- + * + * nbtxlog.c + * WAL replay logic for btrees. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtxlog.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/bufmask.h" +#include "access/nbtree.h" +#include "access/nbtxlog.h" +#include "access/transam.h" +#include "access/xlog.h" +#include "access/xlogutils.h" +#include "miscadmin.h" +#include "storage/procarray.h" +#include "utils/memutils.h" + +static MemoryContext opCtx; /* working memory for operations */ + +/* + * _bt_restore_page -- re-enter all the index tuples on a page + * + * The page is freshly init'd, and *from (length len) is a copy of what + * had been its upper part (pd_upper to pd_special). We assume that the + * tuples had been added to the page in item-number order, and therefore + * the one with highest item number appears first (lowest on the page). + */ +static void +_bt_restore_page(Page page, char *from, int len) +{ + IndexTupleData itupdata; + Size itemsz; + char *end = from + len; + Item items[MaxIndexTuplesPerPage]; + uint16 itemsizes[MaxIndexTuplesPerPage]; + int i; + int nitems; + + /* + * To get the items back in the original order, we add them to the page in + * reverse. To figure out where one tuple ends and another begins, we + * have to scan them in forward order first. + */ + i = 0; + while (from < end) + { + /* + * As we step through the items, 'from' won't always be properly + * aligned, so we need to use memcpy(). 
Further, we use Item (which + * is just a char*) here for our items array for the same reason; + * wouldn't want the compiler or anyone thinking that an item is + * aligned when it isn't. + */ + memcpy(&itupdata, from, sizeof(IndexTupleData)); + itemsz = IndexTupleSize(&itupdata); + itemsz = MAXALIGN(itemsz); + + items[i] = (Item) from; + itemsizes[i] = itemsz; + i++; + + from += itemsz; + } + nitems = i; + + for (i = nitems - 1; i >= 0; i--) + { + if (PageAddItem(page, items[i], itemsizes[i], nitems - i, + false, false) == InvalidOffsetNumber) + elog(PANIC, "_bt_restore_page: cannot add item to page"); + } +} + +static void +_bt_restore_meta(XLogReaderState *record, uint8 block_id) +{ + XLogRecPtr lsn = record->EndRecPtr; + Buffer metabuf; + Page metapg; + BTMetaPageData *md; + BTPageOpaque pageop; + xl_btree_metadata *xlrec; + char *ptr; + Size len; + + metabuf = XLogInitBufferForRedo(record, block_id); + ptr = XLogRecGetBlockData(record, block_id, &len); + + Assert(len == sizeof(xl_btree_metadata)); + Assert(BufferGetBlockNumber(metabuf) == BTREE_METAPAGE); + xlrec = (xl_btree_metadata *) ptr; + metapg = BufferGetPage(metabuf); + + _bt_pageinit(metapg, BufferGetPageSize(metabuf)); + + md = BTPageGetMeta(metapg); + md->btm_magic = BTREE_MAGIC; + md->btm_version = xlrec->version; + md->btm_root = xlrec->root; + md->btm_level = xlrec->level; + md->btm_fastroot = xlrec->fastroot; + md->btm_fastlevel = xlrec->fastlevel; + /* Cannot log BTREE_MIN_VERSION index metapage without upgrade */ + Assert(md->btm_version >= BTREE_NOVAC_VERSION); + md->btm_last_cleanup_num_delpages = xlrec->last_cleanup_num_delpages; + md->btm_last_cleanup_num_heap_tuples = -1.0; + md->btm_allequalimage = xlrec->allequalimage; + + pageop = (BTPageOpaque) PageGetSpecialPointer(metapg); + pageop->btpo_flags = BTP_META; + + /* + * Set pd_lower just past the end of the metadata. This is essential, + * because without doing so, metadata will be lost if xlog.c compresses + * the page. + */ + ((PageHeader) metapg)->pd_lower = + ((char *) md + sizeof(BTMetaPageData)) - (char *) metapg; + + PageSetLSN(metapg, lsn); + MarkBufferDirty(metabuf); + UnlockReleaseBuffer(metabuf); +} + +/* + * _bt_clear_incomplete_split -- clear INCOMPLETE_SPLIT flag on a page + * + * This is a common subroutine of the redo functions of all the WAL record + * types that can insert a downlink: insert, split, and newroot. + */ +static void +_bt_clear_incomplete_split(XLogReaderState *record, uint8 block_id) +{ + XLogRecPtr lsn = record->EndRecPtr; + Buffer buf; + + if (XLogReadBufferForRedo(record, block_id, &buf) == BLK_NEEDS_REDO) + { + Page page = (Page) BufferGetPage(buf); + BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page); + + Assert(P_INCOMPLETE_SPLIT(pageop)); + pageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; + + PageSetLSN(page, lsn); + MarkBufferDirty(buf); + } + if (BufferIsValid(buf)) + UnlockReleaseBuffer(buf); +} + +static void +btree_xlog_insert(bool isleaf, bool ismeta, bool posting, + XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record); + Buffer buffer; + Page page; + + /* + * Insertion to an internal page finishes an incomplete split at the child + * level. Clear the incomplete-split flag in the child. Note: during + * normal operation, the child and parent pages are locked at the same + * time (the locks are coupled), so that clearing the flag and inserting + * the downlink appear atomic to other backends. 
We don't bother with + * that during replay, because readers don't care about the + * incomplete-split flag and there cannot be updates happening. + */ + if (!isleaf) + _bt_clear_incomplete_split(record, 1); + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + Size datalen; + char *datapos = XLogRecGetBlockData(record, 0, &datalen); + + page = BufferGetPage(buffer); + + if (!posting) + { + /* Simple retail insertion */ + if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum, + false, false) == InvalidOffsetNumber) + elog(PANIC, "failed to add new item"); + } + else + { + ItemId itemid; + IndexTuple oposting, + newitem, + nposting; + uint16 postingoff; + + /* + * A posting list split occurred during leaf page insertion. WAL + * record data will start with an offset number representing the + * point in an existing posting list that a split occurs at. + * + * Use _bt_swap_posting() to repeat posting list split steps from + * primary. Note that newitem from WAL record is 'orignewitem', + * not the final version of newitem that is actually inserted on + * page. + */ + postingoff = *((uint16 *) datapos); + datapos += sizeof(uint16); + datalen -= sizeof(uint16); + + itemid = PageGetItemId(page, OffsetNumberPrev(xlrec->offnum)); + oposting = (IndexTuple) PageGetItem(page, itemid); + + /* Use mutable, aligned newitem copy in _bt_swap_posting() */ + Assert(isleaf && postingoff > 0); + newitem = CopyIndexTuple((IndexTuple) datapos); + nposting = _bt_swap_posting(newitem, oposting, postingoff); + + /* Replace existing posting list with post-split version */ + memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting))); + + /* Insert "final" new item (not orignewitem from WAL stream) */ + Assert(IndexTupleSize(newitem) == datalen); + if (PageAddItem(page, (Item) newitem, datalen, xlrec->offnum, + false, false) == InvalidOffsetNumber) + elog(PANIC, "failed to add posting split new item"); + } + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* + * Note: in normal operation, we'd update the metapage while still holding + * lock on the page we inserted into. But during replay it's not + * necessary to hold that lock, since no other index updates can be + * happening concurrently, and readers will cope fine with following an + * obsolete link from the metapage. + */ + if (ismeta) + _bt_restore_meta(record, 2); +} + +static void +btree_xlog_split(bool newitemonleft, XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record); + bool isleaf = (xlrec->level == 0); + Buffer buf; + Buffer rbuf; + Page rpage; + BTPageOpaque ropaque; + char *datapos; + Size datalen; + BlockNumber origpagenumber; + BlockNumber rightpagenumber; + BlockNumber spagenumber; + + XLogRecGetBlockTag(record, 0, NULL, NULL, &origpagenumber); + XLogRecGetBlockTag(record, 1, NULL, NULL, &rightpagenumber); + if (!XLogRecGetBlockTag(record, 2, NULL, NULL, &spagenumber)) + spagenumber = P_NONE; + + /* + * Clear the incomplete split flag on the appropriate child page one level + * down when origpage/buf is an internal page (there must have been + * cascading page splits during original execution in the event of an + * internal page split). This is like the corresponding btree_xlog_insert + * call for internal pages. 
We're not clearing the incomplete split flag + * for the current page split here (you can think of this as part of the + * insert of newitem that the page split action needs to perform in + * passing). + * + * Like in btree_xlog_insert, this can be done before locking other pages. + * We never need to couple cross-level locks in REDO routines. + */ + if (!isleaf) + _bt_clear_incomplete_split(record, 3); + + /* Reconstruct right (new) sibling page from scratch */ + rbuf = XLogInitBufferForRedo(record, 1); + datapos = XLogRecGetBlockData(record, 1, &datalen); + rpage = (Page) BufferGetPage(rbuf); + + _bt_pageinit(rpage, BufferGetPageSize(rbuf)); + ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); + + ropaque->btpo_prev = origpagenumber; + ropaque->btpo_next = spagenumber; + ropaque->btpo_level = xlrec->level; + ropaque->btpo_flags = isleaf ? BTP_LEAF : 0; + ropaque->btpo_cycleid = 0; + + _bt_restore_page(rpage, datapos, datalen); + + PageSetLSN(rpage, lsn); + MarkBufferDirty(rbuf); + + /* Now reconstruct original page (left half of split) */ + if (XLogReadBufferForRedo(record, 0, &buf) == BLK_NEEDS_REDO) + { + /* + * To retain the same physical order of the tuples that they had, we + * initialize a temporary empty page for the left page and add all the + * items to that in item number order. This mirrors how _bt_split() + * works. Retaining the same physical order makes WAL consistency + * checking possible. See also _bt_restore_page(), which does the + * same for the right page. + */ + Page origpage = (Page) BufferGetPage(buf); + BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage); + OffsetNumber off; + IndexTuple newitem = NULL, + left_hikey = NULL, + nposting = NULL; + Size newitemsz = 0, + left_hikeysz = 0; + Page leftpage; + OffsetNumber leftoff, + replacepostingoff = InvalidOffsetNumber; + + datapos = XLogRecGetBlockData(record, 0, &datalen); + + if (newitemonleft || xlrec->postingoff != 0) + { + newitem = (IndexTuple) datapos; + newitemsz = MAXALIGN(IndexTupleSize(newitem)); + datapos += newitemsz; + datalen -= newitemsz; + + if (xlrec->postingoff != 0) + { + ItemId itemid; + IndexTuple oposting; + + /* Posting list must be at offset number before new item's */ + replacepostingoff = OffsetNumberPrev(xlrec->newitemoff); + + /* Use mutable, aligned newitem copy in _bt_swap_posting() */ + newitem = CopyIndexTuple(newitem); + itemid = PageGetItemId(origpage, replacepostingoff); + oposting = (IndexTuple) PageGetItem(origpage, itemid); + nposting = _bt_swap_posting(newitem, oposting, + xlrec->postingoff); + } + } + + /* + * Extract left hikey and its size. We assume that 16-bit alignment + * is enough to apply IndexTupleSize (since it's fetching from a + * uint16 field). 
+        */
+        left_hikey = (IndexTuple) datapos;
+        left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey));
+        datapos += left_hikeysz;
+        datalen -= left_hikeysz;
+
+        Assert(datalen == 0);
+
+        leftpage = PageGetTempPageCopySpecial(origpage);
+
+        /* Add high key tuple from WAL record to temp page */
+        leftoff = P_HIKEY;
+        if (PageAddItem(leftpage, (Item) left_hikey, left_hikeysz, P_HIKEY,
+                        false, false) == InvalidOffsetNumber)
+            elog(ERROR, "failed to add high key to left page after split");
+        leftoff = OffsetNumberNext(leftoff);
+
+        for (off = P_FIRSTDATAKEY(oopaque); off < xlrec->firstrightoff; off++)
+        {
+            ItemId      itemid;
+            Size        itemsz;
+            IndexTuple  item;
+
+            /* Add replacement posting list when required */
+            if (off == replacepostingoff)
+            {
+                Assert(newitemonleft ||
+                       xlrec->firstrightoff == xlrec->newitemoff);
+                if (PageAddItem(leftpage, (Item) nposting,
+                                MAXALIGN(IndexTupleSize(nposting)), leftoff,
+                                false, false) == InvalidOffsetNumber)
+                    elog(ERROR, "failed to add new posting list item to left page after split");
+                leftoff = OffsetNumberNext(leftoff);
+                continue;       /* don't insert oposting */
+            }
+
+            /* add the new item if it was inserted on left page */
+            else if (newitemonleft && off == xlrec->newitemoff)
+            {
+                if (PageAddItem(leftpage, (Item) newitem, newitemsz, leftoff,
+                                false, false) == InvalidOffsetNumber)
+                    elog(ERROR, "failed to add new item to left page after split");
+                leftoff = OffsetNumberNext(leftoff);
+            }
+
+            itemid = PageGetItemId(origpage, off);
+            itemsz = ItemIdGetLength(itemid);
+            item = (IndexTuple) PageGetItem(origpage, itemid);
+            if (PageAddItem(leftpage, (Item) item, itemsz, leftoff,
+                            false, false) == InvalidOffsetNumber)
+                elog(ERROR, "failed to add old item to left page after split");
+            leftoff = OffsetNumberNext(leftoff);
+        }
+
+        /* cope with possibility that newitem goes at the end */
+        if (newitemonleft && off == xlrec->newitemoff)
+        {
+            if (PageAddItem(leftpage, (Item) newitem, newitemsz, leftoff,
+                            false, false) == InvalidOffsetNumber)
+                elog(ERROR, "failed to add new item to left page after split");
+            leftoff = OffsetNumberNext(leftoff);
+        }
+
+        PageRestoreTempPage(leftpage, origpage);
+
+        /* Fix opaque fields */
+        oopaque->btpo_flags = BTP_INCOMPLETE_SPLIT;
+        if (isleaf)
+            oopaque->btpo_flags |= BTP_LEAF;
+        oopaque->btpo_next = rightpagenumber;
+        oopaque->btpo_cycleid = 0;
+
+        PageSetLSN(origpage, lsn);
+        MarkBufferDirty(buf);
+    }
+
+    /* Fix left-link of the page to the right of the new right sibling */
+    if (spagenumber != P_NONE)
+    {
+        Buffer      sbuf;
+
+        if (XLogReadBufferForRedo(record, 2, &sbuf) == BLK_NEEDS_REDO)
+        {
+            Page        spage = (Page) BufferGetPage(sbuf);
+            BTPageOpaque spageop = (BTPageOpaque) PageGetSpecialPointer(spage);
+
+            spageop->btpo_prev = rightpagenumber;
+
+            PageSetLSN(spage, lsn);
+            MarkBufferDirty(sbuf);
+        }
+        if (BufferIsValid(sbuf))
+            UnlockReleaseBuffer(sbuf);
+    }
+
+    /*
+     * Finally, release the remaining buffers.  sbuf, rbuf, and buf can be
+     * released in any order.
+ */ + UnlockReleaseBuffer(rbuf); + if (BufferIsValid(buf)) + UnlockReleaseBuffer(buf); +} + +static void +btree_xlog_dedup(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_btree_dedup *xlrec = (xl_btree_dedup *) XLogRecGetData(record); + Buffer buf; + + if (XLogReadBufferForRedo(record, 0, &buf) == BLK_NEEDS_REDO) + { + char *ptr = XLogRecGetBlockData(record, 0, NULL); + Page page = (Page) BufferGetPage(buf); + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + OffsetNumber offnum, + minoff, + maxoff; + BTDedupState state; + BTDedupInterval *intervals; + Page newpage; + + state = (BTDedupState) palloc(sizeof(BTDedupStateData)); + state->deduplicate = true; /* unused */ + state->nmaxitems = 0; /* unused */ + /* Conservatively use larger maxpostingsize than primary */ + state->maxpostingsize = BTMaxItemSize(page); + state->base = NULL; + state->baseoff = InvalidOffsetNumber; + state->basetupsize = 0; + state->htids = palloc(state->maxpostingsize); + state->nhtids = 0; + state->nitems = 0; + state->phystupsize = 0; + state->nintervals = 0; + + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + newpage = PageGetTempPageCopySpecial(page); + + if (!P_RIGHTMOST(opaque)) + { + ItemId itemid = PageGetItemId(page, P_HIKEY); + Size itemsz = ItemIdGetLength(itemid); + IndexTuple item = (IndexTuple) PageGetItem(page, itemid); + + if (PageAddItem(newpage, (Item) item, itemsz, P_HIKEY, + false, false) == InvalidOffsetNumber) + elog(ERROR, "deduplication failed to add highkey"); + } + + intervals = (BTDedupInterval *) ptr; + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + + if (offnum == minoff) + _bt_dedup_start_pending(state, itup, offnum); + else if (state->nintervals < xlrec->nintervals && + state->baseoff == intervals[state->nintervals].baseoff && + state->nitems < intervals[state->nintervals].nitems) + { + if (!_bt_dedup_save_htid(state, itup)) + elog(ERROR, "deduplication failed to add heap tid to pending posting list"); + } + else + { + _bt_dedup_finish_pending(newpage, state); + _bt_dedup_start_pending(state, itup, offnum); + } + } + + _bt_dedup_finish_pending(newpage, state); + Assert(state->nintervals == xlrec->nintervals); + Assert(memcmp(state->intervals, intervals, + state->nintervals * sizeof(BTDedupInterval)) == 0); + + if (P_HAS_GARBAGE(opaque)) + { + BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(newpage); + + nopaque->btpo_flags &= ~BTP_HAS_GARBAGE; + } + + PageRestoreTempPage(newpage, page); + PageSetLSN(page, lsn); + MarkBufferDirty(buf); + } + + if (BufferIsValid(buf)) + UnlockReleaseBuffer(buf); +} + +static void +btree_xlog_updates(Page page, OffsetNumber *updatedoffsets, + xl_btree_update *updates, int nupdated) +{ + BTVacuumPosting vacposting; + IndexTuple origtuple; + ItemId itemid; + Size itemsz; + + for (int i = 0; i < nupdated; i++) + { + itemid = PageGetItemId(page, updatedoffsets[i]); + origtuple = (IndexTuple) PageGetItem(page, itemid); + + vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) + + updates->ndeletedtids * sizeof(uint16)); + vacposting->updatedoffset = updatedoffsets[i]; + vacposting->itup = origtuple; + vacposting->ndeletedtids = updates->ndeletedtids; + memcpy(vacposting->deletetids, + (char *) updates + SizeOfBtreeUpdate, + updates->ndeletedtids * sizeof(uint16)); + + _bt_update_posting(vacposting); + + /* Overwrite updated 
version of tuple */ + itemsz = MAXALIGN(IndexTupleSize(vacposting->itup)); + if (!PageIndexTupleOverwrite(page, updatedoffsets[i], + (Item) vacposting->itup, itemsz)) + elog(PANIC, "failed to update partially dead item"); + + pfree(vacposting->itup); + pfree(vacposting); + + /* advance to next xl_btree_update from array */ + updates = (xl_btree_update *) + ((char *) updates + SizeOfBtreeUpdate + + updates->ndeletedtids * sizeof(uint16)); + } +} + +static void +btree_xlog_vacuum(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record); + Buffer buffer; + Page page; + BTPageOpaque opaque; + + /* + * We need to take a cleanup lock here, just like btvacuumpage(). However, + * it isn't necessary to exhaustively get a cleanup lock on every block in + * the index during recovery (just getting a cleanup lock on pages with + * items to kill suffices). See nbtree/README for details. + */ + if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer) + == BLK_NEEDS_REDO) + { + char *ptr = XLogRecGetBlockData(record, 0, NULL); + + page = (Page) BufferGetPage(buffer); + + if (xlrec->nupdated > 0) + { + OffsetNumber *updatedoffsets; + xl_btree_update *updates; + + updatedoffsets = (OffsetNumber *) + (ptr + xlrec->ndeleted * sizeof(OffsetNumber)); + updates = (xl_btree_update *) ((char *) updatedoffsets + + xlrec->nupdated * + sizeof(OffsetNumber)); + + btree_xlog_updates(page, updatedoffsets, updates, xlrec->nupdated); + } + + if (xlrec->ndeleted > 0) + PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted); + + /* + * Mark the page as not containing any LP_DEAD items --- see comments + * in _bt_delitems_vacuum(). + */ + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + opaque->btpo_flags &= ~BTP_HAS_GARBAGE; + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +static void +btree_xlog_delete(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record); + Buffer buffer; + Page page; + BTPageOpaque opaque; + + /* + * If we have any conflict processing to do, it must happen before we + * update the page + */ + if (InHotStandby) + { + RelFileNode rnode; + + XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL); + + ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, rnode); + } + + /* + * We don't need to take a cleanup lock to apply these changes. See + * nbtree/README for details. 
+ */ + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + char *ptr = XLogRecGetBlockData(record, 0, NULL); + + page = (Page) BufferGetPage(buffer); + + if (xlrec->nupdated > 0) + { + OffsetNumber *updatedoffsets; + xl_btree_update *updates; + + updatedoffsets = (OffsetNumber *) + (ptr + xlrec->ndeleted * sizeof(OffsetNumber)); + updates = (xl_btree_update *) ((char *) updatedoffsets + + xlrec->nupdated * + sizeof(OffsetNumber)); + + btree_xlog_updates(page, updatedoffsets, updates, xlrec->nupdated); + } + + if (xlrec->ndeleted > 0) + PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted); + + /* Mark the page as not containing any LP_DEAD items */ + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + opaque->btpo_flags &= ~BTP_HAS_GARBAGE; + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +static void +btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_btree_mark_page_halfdead *xlrec = (xl_btree_mark_page_halfdead *) XLogRecGetData(record); + Buffer buffer; + Page page; + BTPageOpaque pageop; + IndexTupleData trunctuple; + + /* + * In normal operation, we would lock all the pages this WAL record + * touches before changing any of them. In WAL replay, it should be okay + * to lock just one page at a time, since no concurrent index updates can + * be happening, and readers should not care whether they arrive at the + * target page or not (since it's surely empty). + */ + + /* to-be-deleted subtree's parent page */ + if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) + { + OffsetNumber poffset; + ItemId itemid; + IndexTuple itup; + OffsetNumber nextoffset; + BlockNumber rightsib; + + page = (Page) BufferGetPage(buffer); + pageop = (BTPageOpaque) PageGetSpecialPointer(page); + + poffset = xlrec->poffset; + + nextoffset = OffsetNumberNext(poffset); + itemid = PageGetItemId(page, nextoffset); + itup = (IndexTuple) PageGetItem(page, itemid); + rightsib = BTreeTupleGetDownLink(itup); + + itemid = PageGetItemId(page, poffset); + itup = (IndexTuple) PageGetItem(page, itemid); + BTreeTupleSetDownLink(itup, rightsib); + nextoffset = OffsetNumberNext(poffset); + PageIndexTupleDelete(page, nextoffset); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + + /* + * Don't need to couple cross-level locks in REDO routines, so release + * lock on internal page immediately + */ + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* Rewrite the leaf page as a halfdead page */ + buffer = XLogInitBufferForRedo(record, 0); + page = (Page) BufferGetPage(buffer); + + _bt_pageinit(page, BufferGetPageSize(buffer)); + pageop = (BTPageOpaque) PageGetSpecialPointer(page); + + pageop->btpo_prev = xlrec->leftblk; + pageop->btpo_next = xlrec->rightblk; + pageop->btpo_level = 0; + pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF; + pageop->btpo_cycleid = 0; + + /* + * Construct a dummy high key item that points to top parent page (value + * is InvalidBlockNumber when the top parent page is the leaf page itself) + */ + MemSet(&trunctuple, 0, sizeof(IndexTupleData)); + trunctuple.t_info = sizeof(IndexTupleData); + BTreeTupleSetTopParent(&trunctuple, xlrec->topparent); + + if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY, + false, false) == InvalidOffsetNumber) + elog(ERROR, "could not add dummy high key to half-dead page"); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + 
UnlockReleaseBuffer(buffer); +} + + +static void +btree_xlog_unlink_page(uint8 info, XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) XLogRecGetData(record); + BlockNumber leftsib; + BlockNumber rightsib; + uint32 level; + bool isleaf; + FullTransactionId safexid; + Buffer leftbuf; + Buffer target; + Buffer rightbuf; + Page page; + BTPageOpaque pageop; + + leftsib = xlrec->leftsib; + rightsib = xlrec->rightsib; + level = xlrec->level; + isleaf = (level == 0); + safexid = xlrec->safexid; + + /* No leaftopparent for level 0 (leaf page) or level 1 target */ + Assert(!BlockNumberIsValid(xlrec->leaftopparent) || level > 1); + + /* + * In normal operation, we would lock all the pages this WAL record + * touches before changing any of them. In WAL replay, we at least lock + * the pages in the same standard left-to-right order (leftsib, target, + * rightsib), and don't release the sibling locks until the target is + * marked deleted. + */ + + /* Fix right-link of left sibling, if any */ + if (leftsib != P_NONE) + { + if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO) + { + page = (Page) BufferGetPage(leftbuf); + pageop = (BTPageOpaque) PageGetSpecialPointer(page); + pageop->btpo_next = rightsib; + + PageSetLSN(page, lsn); + MarkBufferDirty(leftbuf); + } + } + else + leftbuf = InvalidBuffer; + + /* Rewrite target page as empty deleted page */ + target = XLogInitBufferForRedo(record, 0); + page = (Page) BufferGetPage(target); + + _bt_pageinit(page, BufferGetPageSize(target)); + pageop = (BTPageOpaque) PageGetSpecialPointer(page); + + pageop->btpo_prev = leftsib; + pageop->btpo_next = rightsib; + pageop->btpo_level = level; + BTPageSetDeleted(page, safexid); + if (isleaf) + pageop->btpo_flags |= BTP_LEAF; + pageop->btpo_cycleid = 0; + + PageSetLSN(page, lsn); + MarkBufferDirty(target); + + /* Fix left-link of right sibling */ + if (XLogReadBufferForRedo(record, 2, &rightbuf) == BLK_NEEDS_REDO) + { + page = (Page) BufferGetPage(rightbuf); + pageop = (BTPageOpaque) PageGetSpecialPointer(page); + pageop->btpo_prev = leftsib; + + PageSetLSN(page, lsn); + MarkBufferDirty(rightbuf); + } + + /* Release siblings */ + if (BufferIsValid(leftbuf)) + UnlockReleaseBuffer(leftbuf); + if (BufferIsValid(rightbuf)) + UnlockReleaseBuffer(rightbuf); + + /* Release target */ + UnlockReleaseBuffer(target); + + /* + * If we deleted a parent of the targeted leaf page, instead of the leaf + * itself, update the leaf to point to the next remaining child in the + * to-be-deleted subtree + */ + if (XLogRecHasBlockRef(record, 3)) + { + /* + * There is no real data on the page, so we just re-create it from + * scratch using the information from the WAL record. + * + * Note that we don't end up here when the target page is also the + * leafbuf page. There is no need to add a dummy hikey item with a + * top parent link when deleting leafbuf because it's the last page + * we'll delete in the subtree undergoing deletion. 
+ */ + Buffer leafbuf; + IndexTupleData trunctuple; + + Assert(!isleaf); + + leafbuf = XLogInitBufferForRedo(record, 3); + page = (Page) BufferGetPage(leafbuf); + + _bt_pageinit(page, BufferGetPageSize(leafbuf)); + pageop = (BTPageOpaque) PageGetSpecialPointer(page); + + pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF; + pageop->btpo_prev = xlrec->leafleftsib; + pageop->btpo_next = xlrec->leafrightsib; + pageop->btpo_level = 0; + pageop->btpo_cycleid = 0; + + /* Add a dummy hikey item */ + MemSet(&trunctuple, 0, sizeof(IndexTupleData)); + trunctuple.t_info = sizeof(IndexTupleData); + BTreeTupleSetTopParent(&trunctuple, xlrec->leaftopparent); + + if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY, + false, false) == InvalidOffsetNumber) + elog(ERROR, "could not add dummy high key to half-dead page"); + + PageSetLSN(page, lsn); + MarkBufferDirty(leafbuf); + UnlockReleaseBuffer(leafbuf); + } + + /* Update metapage if needed */ + if (info == XLOG_BTREE_UNLINK_PAGE_META) + _bt_restore_meta(record, 4); +} + +static void +btree_xlog_newroot(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_btree_newroot *xlrec = (xl_btree_newroot *) XLogRecGetData(record); + Buffer buffer; + Page page; + BTPageOpaque pageop; + char *ptr; + Size len; + + buffer = XLogInitBufferForRedo(record, 0); + page = (Page) BufferGetPage(buffer); + + _bt_pageinit(page, BufferGetPageSize(buffer)); + pageop = (BTPageOpaque) PageGetSpecialPointer(page); + + pageop->btpo_flags = BTP_ROOT; + pageop->btpo_prev = pageop->btpo_next = P_NONE; + pageop->btpo_level = xlrec->level; + if (xlrec->level == 0) + pageop->btpo_flags |= BTP_LEAF; + pageop->btpo_cycleid = 0; + + if (xlrec->level > 0) + { + ptr = XLogRecGetBlockData(record, 0, &len); + _bt_restore_page(page, ptr, len); + + /* Clear the incomplete-split flag in left child */ + _bt_clear_incomplete_split(record, 1); + } + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + UnlockReleaseBuffer(buffer); + + _bt_restore_meta(record, 2); +} + +/* + * In general VACUUM must defer recycling as a way of avoiding certain race + * conditions. Deleted pages contain a safexid value that is used by VACUUM + * to determine whether or not it's safe to place a page that was deleted by + * VACUUM earlier into the FSM now. See nbtree/README. + * + * As far as any backend operating during original execution is concerned, the + * FSM is a cache of recycle-safe pages; the mere presence of the page in the + * FSM indicates that the page must already be safe to recycle (actually, + * _bt_getbuf() verifies it's safe using BTPageIsRecyclable(), but that's just + * because it would be unwise to completely trust the FSM, given its current + * limitations). + * + * This isn't sufficient to prevent similar concurrent recycling race + * conditions during Hot Standby, though. For that we need to log a + * xl_btree_reuse_page record at the point that a page is actually recycled + * and reused for an entirely unrelated page inside _bt_split(). These + * records include the same safexid value from the original deleted page, + * stored in the record's latestRemovedFullXid field. + * + * The GlobalVisCheckRemovableFullXid() test in BTPageIsRecyclable() is used + * to determine if it's safe to recycle a page. This mirrors our own test: + * the PGPROC->xmin > limitXmin test inside GetConflictingVirtualXIDs(). + * Consequently, one XID value achieves the same exclusion effect on primary + * and standby. 
+ */ +static void +btree_xlog_reuse_page(XLogReaderState *record) +{ + xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) XLogRecGetData(record); + + if (InHotStandby) + ResolveRecoveryConflictWithSnapshotFullXid(xlrec->latestRemovedFullXid, + xlrec->node); +} + +void +btree_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + MemoryContext oldCtx; + + oldCtx = MemoryContextSwitchTo(opCtx); + switch (info) + { + case XLOG_BTREE_INSERT_LEAF: + btree_xlog_insert(true, false, false, record); + break; + case XLOG_BTREE_INSERT_UPPER: + btree_xlog_insert(false, false, false, record); + break; + case XLOG_BTREE_INSERT_META: + btree_xlog_insert(false, true, false, record); + break; + case XLOG_BTREE_SPLIT_L: + btree_xlog_split(true, record); + break; + case XLOG_BTREE_SPLIT_R: + btree_xlog_split(false, record); + break; + case XLOG_BTREE_INSERT_POST: + btree_xlog_insert(true, false, true, record); + break; + case XLOG_BTREE_DEDUP: + btree_xlog_dedup(record); + break; + case XLOG_BTREE_VACUUM: + btree_xlog_vacuum(record); + break; + case XLOG_BTREE_DELETE: + btree_xlog_delete(record); + break; + case XLOG_BTREE_MARK_PAGE_HALFDEAD: + btree_xlog_mark_page_halfdead(info, record); + break; + case XLOG_BTREE_UNLINK_PAGE: + case XLOG_BTREE_UNLINK_PAGE_META: + btree_xlog_unlink_page(info, record); + break; + case XLOG_BTREE_NEWROOT: + btree_xlog_newroot(record); + break; + case XLOG_BTREE_REUSE_PAGE: + btree_xlog_reuse_page(record); + break; + case XLOG_BTREE_META_CLEANUP: + _bt_restore_meta(record, 0); + break; + default: + elog(PANIC, "btree_redo: unknown op code %u", info); + } + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(opCtx); +} + +void +btree_xlog_startup(void) +{ + opCtx = AllocSetContextCreate(CurrentMemoryContext, + "Btree recovery temporary context", + ALLOCSET_DEFAULT_SIZES); +} + +void +btree_xlog_cleanup(void) +{ + MemoryContextDelete(opCtx); + opCtx = NULL; +} + +/* + * Mask a btree page before performing consistency checks on it. + */ +void +btree_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + BTPageOpaque maskopaq; + + mask_page_lsn_and_checksum(page); + + mask_page_hint_bits(page); + mask_unused_space(page); + + maskopaq = (BTPageOpaque) PageGetSpecialPointer(page); + + if (P_ISLEAF(maskopaq)) + { + /* + * In btree leaf pages, it is possible to modify the LP_FLAGS without + * emitting any WAL record. Hence, mask the line pointer flags. See + * _bt_killitems(), _bt_check_unique() for details. + */ + mask_lp_flags(page); + } + + /* + * BTP_HAS_GARBAGE is just an un-logged hint bit. So, mask it. See + * _bt_delete_or_dedup_one_page(), _bt_killitems(), and _bt_check_unique() + * for details. + */ + maskopaq->btpo_flags &= ~BTP_HAS_GARBAGE; + + /* + * During replay of a btree page split, we don't set the BTP_SPLIT_END + * flag of the right sibling and initialize the cycle_id to 0 for the same + * page. See btree_xlog_split() for details. + */ + maskopaq->btpo_flags &= ~BTP_SPLIT_END; + maskopaq->btpo_cycleid = 0; +} |
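
The sketches below are standalone illustrations, not part of the patch; every identifier in them is invented unless noted.

The reverse-order trick in _bt_restore_page() is easier to see in miniature: the packed buffer stores records with the highest item number first, each record carries its own length, so the boundaries can only be discovered by a forward scan, after which the remembered records are replayed in reverse to recover the original order. A minimal sketch, assuming nothing about the real index tuple format (length-prefixed strings stand in for tuples):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAX_RECS 16

int
main(void)
{
    /* three length-prefixed records packed in reverse logical order */
    uint8_t     buf[64];
    size_t      len = 0;
    const char *reversed[] = {"third", "second", "first"};
    const uint8_t *items[MAX_RECS];
    uint16_t    itemsz[MAX_RECS];
    size_t      off = 0;
    int         nitems = 0;

    for (int i = 0; i < 3; i++)
    {
        uint16_t    recsz = (uint16_t) (strlen(reversed[i]) + 1);

        memcpy(buf + len, &recsz, sizeof(recsz));   /* length header */
        memcpy(buf + len + sizeof(recsz), reversed[i], recsz);
        len += sizeof(recsz) + recsz;
    }

    /* forward scan: the only way to learn where each record starts */
    while (off < len)
    {
        uint16_t    recsz;

        memcpy(&recsz, buf + off, sizeof(recsz));   /* may be unaligned */
        items[nitems] = buf + off + sizeof(recsz);
        itemsz[nitems] = recsz;
        nitems++;
        off += sizeof(recsz) + recsz;
    }

    /* walk the remembered boundaries in reverse to recover logical order */
    for (int i = nitems - 1; i >= 0; i--)
        printf("item %d (%u bytes): %s\n",
               nitems - i, (unsigned) itemsz[i], (const char *) items[i]);

    return 0;
}

Any C99 compiler will build it; the program prints the records back in their logical order, mirroring how the replay routine re-adds tuples at decreasing positions.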
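
btree_xlog_vacuum() and btree_xlog_delete() both hand btree_xlog_updates() a packed array of variable-length records, and the only way to reach record i+1 is to add the header size plus ndeletedtids * sizeof(uint16) to record i. A minimal sketch of that walk, using an invented toy_update struct rather than the real xl_btree_update layout:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct toy_update
{
    uint16_t    ndeletedtids;   /* number of uint16 entries that follow */
    /* uint16 deletetids[] follows, packed with no padding */
} toy_update;

#define SizeOfToyUpdate (offsetof(toy_update, ndeletedtids) + sizeof(uint16_t))

int
main(void)
{
    uint16_t    walwords[64];   /* backing store keeps 2-byte alignment */
    uint8_t    *wal = (uint8_t *) walwords;
    size_t      len = 0;
    uint16_t    counts[] = {2, 1, 3};
    toy_update *updates;

    /* build three packed records with 2, 1 and 3 trailing uint16s */
    for (int i = 0; i < 3; i++)
    {
        memcpy(wal + len, &counts[i], sizeof(uint16_t));
        len += SizeOfToyUpdate;
        for (uint16_t j = 0; j < counts[i]; j++)
        {
            uint16_t    tid = (uint16_t) (100 * (i + 1) + j);

            memcpy(wal + len, &tid, sizeof(tid));
            len += sizeof(tid);
        }
    }

    /* replay-style walk: advance by header plus the trailing array */
    updates = (toy_update *) wal;
    for (int i = 0; i < 3; i++)
    {
        uint16_t   *tids = (uint16_t *) ((char *) updates + SizeOfToyUpdate);

        printf("record %d deletes %d TIDs:", i, (int) updates->ndeletedtids);
        for (uint16_t j = 0; j < updates->ndeletedtids; j++)
            printf(" %d", (int) tids[j]);
        printf("\n");

        updates = (toy_update *)
            ((char *) updates + SizeOfToyUpdate +
             updates->ndeletedtids * sizeof(uint16_t));
    }

    return 0;
}

The real replay code also receives a separate array of target offsets (updatedoffsets[]), but the stepping rule for the packed update records is the same.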
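
The long comment above btree_xlog_reuse_page() reduces to a single ordering test that both primary and standby apply to the same stored value. A deliberately tiny sketch, with a plain 64-bit counter standing in for FullTransactionId and an invented helper name:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t toy_fullxid;

static bool
toy_page_is_recyclable(toy_fullxid safexid, toy_fullxid oldest_running_xid)
{
    /* safe once every transaction that could see the old page has ended */
    return safexid < oldest_running_xid;
}

int
main(void)
{
    toy_fullxid safexid = 1000;     /* recorded when the page was deleted */

    printf("oldest running 900:  %s\n",
           toy_page_is_recyclable(safexid, 900) ? "recycle" : "defer");
    printf("oldest running 1001: %s\n",
           toy_page_is_recyclable(safexid, 1001) ? "recycle" : "defer");
    return 0;
}

Because the standby replays the same safexid value from the xl_btree_reuse_page record, its conflict check excludes exactly the same set of snapshots that the primary's recyclability test does.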
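
btree_mask() makes WAL consistency checking possible by clearing everything that may legitimately differ between the primary's page image and the replayed one (unlogged hint bits, the vacuum cycle ID) before the two images are compared byte for byte. A minimal sketch of that principle, with an invented page layout and flag values rather than the real BTPageOpaque:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TOY_IS_LEAF     (1 << 0)    /* real state, must match after replay */
#define TOY_HAS_GARBAGE (1 << 6)    /* unlogged hint, allowed to differ */

typedef struct toy_page
{
    uint16_t    flags;
    uint16_t    cycleid;
    char        payload[32];
} toy_page;

static void
toy_mask(toy_page *page)
{
    /* clear everything that may differ between primary and replayed image */
    page->flags &= ~TOY_HAS_GARBAGE;
    page->cycleid = 0;
}

int
main(void)
{
    toy_page    primary = {0};
    toy_page    replayed = {0};

    strcpy(primary.payload, "same logical contents");
    strcpy(replayed.payload, "same logical contents");

    primary.flags = TOY_IS_LEAF | TOY_HAS_GARBAGE;  /* hint set on primary only */
    primary.cycleid = 42;
    replayed.flags = TOY_IS_LEAF;

    printf("raw compare:    %s\n",
           memcmp(&primary, &replayed, sizeof(toy_page)) == 0 ? "equal" : "different");

    toy_mask(&primary);
    toy_mask(&replayed);

    printf("masked compare: %s\n",
           memcmp(&primary, &replayed, sizeof(toy_page)) == 0 ? "equal" : "different");

    return 0;
}

The raw comparison reports a difference solely because of the hint bit and cycle ID; after both sides run the mask, the images compare equal, which is exactly the property the consistency checker relies on.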