-rw-r--r--  src/backend/access/nbtree/Makefile       |   28
-rw-r--r--  src/backend/access/nbtree/README         | 1056
-rw-r--r--  src/backend/access/nbtree/nbtcompare.c   |  335
-rw-r--r--  src/backend/access/nbtree/nbtdedup.c     | 1098
-rw-r--r--  src/backend/access/nbtree/nbtinsert.c    | 3009
-rw-r--r--  src/backend/access/nbtree/nbtpage.c      | 3073
-rw-r--r--  src/backend/access/nbtree/nbtree.c       | 1446
-rw-r--r--  src/backend/access/nbtree/nbtsearch.c    | 2501
-rw-r--r--  src/backend/access/nbtree/nbtsort.c      | 2016
-rw-r--r--  src/backend/access/nbtree/nbtsplitloc.c  | 1190
-rw-r--r--  src/backend/access/nbtree/nbtutils.c     | 2751
-rw-r--r--  src/backend/access/nbtree/nbtvalidate.c  |  380
-rw-r--r--  src/backend/access/nbtree/nbtxlog.c      | 1126
13 files changed, 20009 insertions, 0 deletions
diff --git a/src/backend/access/nbtree/Makefile b/src/backend/access/nbtree/Makefile
new file mode 100644
index 0000000..d69808e
--- /dev/null
+++ b/src/backend/access/nbtree/Makefile
@@ -0,0 +1,28 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for access/nbtree
+#
+# IDENTIFICATION
+# src/backend/access/nbtree/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/access/nbtree
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ nbtcompare.o \
+ nbtdedup.o \
+ nbtinsert.o \
+ nbtpage.o \
+ nbtree.o \
+ nbtsearch.o \
+ nbtsort.o \
+ nbtsplitloc.o \
+ nbtutils.o \
+ nbtvalidate.o \
+ nbtxlog.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
new file mode 100644
index 0000000..bfe33b6
--- /dev/null
+++ b/src/backend/access/nbtree/README
@@ -0,0 +1,1056 @@
+src/backend/access/nbtree/README
+
+Btree Indexing
+==============
+
+This directory contains a correct implementation of Lehman and Yao's
+high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
+Efficient Locking for Concurrent Operations on B-Trees, ACM Transactions
+on Database Systems, Vol 6, No. 4, December 1981, pp 650-670). We also
+use a simplified version of the deletion logic described in Lanin and
+Shasha (V. Lanin and D. Shasha, A Symmetric Concurrent B-Tree Algorithm,
+Proceedings of 1986 Fall Joint Computer Conference, pp 380-389).
+
+The basic Lehman & Yao Algorithm
+--------------------------------
+
+Compared to a classic B-tree, L&Y adds a right-link pointer to each page,
+to the page's right sibling. It also adds a "high key" to each page, which
+is an upper bound on the keys that are allowed on that page. These two
+additions make it possible to detect a concurrent page split, which allows the
+tree to be searched without holding any read locks (except to keep a single
+page from being modified while reading it).
+
+When a search follows a downlink to a child page, it compares the page's
+high key with the search key. If the search key is greater than the high
+key, the page must've been split concurrently, and you must follow the
+right-link to find the new page containing the key range you're looking
+for. This might need to be repeated, if the page has been split more than
+once.
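+
+To make the shape of this check concrete, here is a minimal standalone C
+sketch of the move-right loop. The type and function names here are
+invented for illustration; they are not the actual nbtree data structures:
+
+    #include <stdbool.h>
+
+    typedef struct ToyPage
+    {
+        int     high_key;       /* upper bound on keys allowed on this page */
+        bool    is_rightmost;   /* rightmost page on its level has no high key */
+        struct ToyPage *right;  /* right sibling (the L&Y right-link) */
+    } ToyPage;
+
+    /*
+     * After following a downlink, keep moving right for as long as the
+     * search key exceeds the page's high key, which indicates that the
+     * page was split concurrently.
+     */
+    static ToyPage *
+    toy_moveright(ToyPage *page, int search_key)
+    {
+        while (!page->is_rightmost && search_key > page->high_key)
+            page = page->right;     /* the real code locks/unlocks pages here */
+        return page;
+    }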
+
+Lehman and Yao talk about alternating "separator" keys and downlinks in
+internal pages rather than tuples or records. We use the term "pivot"
+tuple to refer to tuples that don't point to heap tuples and are used
+only for tree navigation. All tuples on non-leaf pages and high keys on
+leaf pages are pivot tuples. Since pivot tuples are only used to represent
+which part of the key space belongs on each page, they can have attribute
+values copied from non-pivot tuples that were deleted and killed by VACUUM
+some time ago. A pivot tuple may contain a "separator" key and downlink,
+just a separator key (i.e. the downlink value is implicitly undefined), or
+just a downlink (i.e. all attributes are truncated away).
+
+The requirement that all btree keys be unique is satisfied by treating heap
+TID as a tiebreaker attribute. Logical duplicates are sorted in heap TID
+order. This is necessary because Lehman and Yao also require that the key
+range for a subtree S is described by Ki < v <= Ki+1 where Ki and Ki+1 are
+the adjacent keys in the parent page (Ki must be _strictly_ less than v,
+which is assured by having reliably unique keys). Keys are always unique
+on their level, with the exception of a leaf page's high key, which can be
+fully equal to the last item on the page.
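+
+A toy 3-way comparison illustrates the tiebreaker rule. The struct and
+function below are invented for the example; the real code compares index
+tuples attribute by attribute using per-datatype support functions:
+
+    #include <stdint.h>
+
+    /* Toy key: one user-visible key plus the heap TID used as tiebreaker */
+    typedef struct ToyIndexKey
+    {
+        int32_t  user_key;
+        uint32_t heap_block;    /* table block number */
+        uint16_t heap_offset;   /* line pointer offset within that block */
+    } ToyIndexKey;
+
+    /* 3-way comparison: user key first, then heap TID, so keys are unique */
+    static int
+    toy_key_cmp(const ToyIndexKey *a, const ToyIndexKey *b)
+    {
+        if (a->user_key != b->user_key)
+            return (a->user_key < b->user_key) ? -1 : 1;
+        if (a->heap_block != b->heap_block)
+            return (a->heap_block < b->heap_block) ? -1 : 1;
+        if (a->heap_offset != b->heap_offset)
+            return (a->heap_offset < b->heap_offset) ? -1 : 1;
+        return 0;
+    }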
+
+The Postgres implementation of suffix truncation must make sure that the
+Lehman and Yao invariants hold, and represents that absent/truncated
+attributes in pivot tuples have the sentinel value "minus infinity". The
+later section on suffix truncation will be helpful if it's unclear how the
+Lehman & Yao invariants work with a real world example.
+
+Differences to the Lehman & Yao algorithm
+-----------------------------------------
+
+We have made the following changes in order to incorporate the L&Y algorithm
+into Postgres:
+
+Lehman and Yao don't require read locks, but assume that in-memory
+copies of tree pages are unshared. Postgres shares in-memory buffers
+among backends. As a result, we do page-level read locking on btree
+pages in order to guarantee that no record is modified while we are
+examining it. This reduces concurrency but guarantees correct
+behavior.
+
+We support the notion of an ordered "scan" of an index as well as
+insertions, deletions, and simple lookups. A scan in the forward
+direction is no problem; we just use the right-sibling pointers that
+L&Y require anyway. (Thus, once we have descended the tree to the
+correct start point for the scan, the scan looks only at leaf pages
+and never at higher tree levels.) To support scans in the backward
+direction, we also store a "left sibling" link much like the "right
+sibling". (This adds an extra step to the L&Y split algorithm: while
+holding the write lock on the page being split, we also lock its former
+right sibling to update that page's left-link. This is safe since no
+writer of that page can be interested in acquiring a write lock on our
+page.) A backwards scan has one additional bit of complexity: after
+following the left-link we must account for the possibility that the
+left sibling page got split before we could read it. So, we have to
+move right until we find a page whose right-link matches the page we
+came from. (Actually, it's even harder than that; see page deletion
+discussion below.)
+
+Page read locks are held only for as long as a scan is examining a page.
+To minimize lock/unlock traffic, an index scan always searches a leaf page
+to identify all the matching items at once, copying their heap tuple IDs
+into backend-local storage. The heap tuple IDs are then processed while
+not holding any page lock within the index. We do continue to hold a pin
+on the leaf page in some circumstances, to protect against concurrent
+deletions (see below). In this state the scan is effectively stopped
+"between" pages, either before or after the page it has pinned. This is
+safe in the presence of concurrent insertions and even page splits, because
+items are never moved across pre-existing page boundaries --- so the scan
+cannot miss any items it should have seen, nor accidentally return the same
+item twice. The scan must remember the page's right-link at the time it
+was scanned, since that is the page to move right to; if we move right to
+the current right-link then we'd re-scan any items moved by a page split.
+We don't similarly remember the left-link, since it's best to use the most
+up-to-date left-link when trying to move left (see detailed move-left
+algorithm below).
+
+In most cases we release our lock and pin on a page before attempting
+to acquire pin and lock on the page we are moving to. In a few places
+it is necessary to lock the next page before releasing the current one.
+This is safe when moving right or up, but not when moving left or down
+(else we'd create the possibility of deadlocks).
+
+Lehman and Yao fail to discuss what must happen when the root page
+becomes full and must be split. Our implementation is to split the
+root in the same way that any other page would be split, then construct
+a new root page holding pointers to both of the resulting pages (which
+now become siblings on the next level of the tree). The new root page
+is then installed by altering the root pointer in the meta-data page (see
+below). This works because the root is not treated specially in any
+other way --- in particular, searches will move right using its link
+pointer if the link is set. Therefore, searches will find the data
+that's been moved into the right sibling even if they read the meta-data
+page before it got updated. This is the same reasoning that makes a
+split of a non-root page safe. The locking considerations are similar too.
+
+When an inserter recurses up the tree, splitting internal pages to insert
+links to pages inserted on the level below, it is possible that it will
+need to access a page above the level that was the root when it began its
+descent (or more accurately, the level that was the root when it read the
+meta-data page). In this case the stack it made while descending does not
+help for finding the correct page. When this happens, we find the correct
+place by re-descending the tree until we reach the level one above the
+level we need to insert a link to, and then moving right as necessary.
+(Typically this will take only two fetches, the meta-data page and the new
+root, but in principle there could have been more than one root split
+since we saw the root. We can identify the correct tree level by means of
+the level numbers stored in each page. The situation is rare enough that
+we do not need a more efficient solution.)
+
+Lehman and Yao must couple/chain locks as part of moving right when
+relocating a child page's downlink during an ascent of the tree. This is
+the only point where Lehman and Yao have to simultaneously hold three
+locks (a lock on the child, the original parent, and the original parent's
+right sibling). We don't need to couple internal page locks for pages on
+the same level, though. We match a child's block number to a downlink
+from a pivot tuple one level up, whereas Lehman and Yao match on the
+separator key associated with the downlink that was followed during the
+initial descent. We can release the lock on the original parent page
+before acquiring a lock on its right sibling, since there is never any
+need to deal with the case where the separator key that we must relocate
+becomes the original parent's high key. Lanin and Shasha don't couple
+locks here either, though they also don't couple locks between levels
+during ascents. They are willing to "wait and try again" to avoid races.
+Their algorithm is optimistic, which means that "an insertion holds no
+more than one write lock at a time during its ascent". We more or less
+stick with Lehman and Yao's approach of conservatively coupling parent and
+child locks when ascending the tree, since it's far simpler.
+
+Lehman and Yao assume fixed-size keys, but we must deal with
+variable-size keys. Therefore there is not a fixed maximum number of
+keys per page; we just stuff in as many as will fit. When we split a
+page, we try to equalize the number of bytes, not items, assigned to
+pages (though suffix truncation is also considered). Note we must include
+the incoming item in this calculation, otherwise it is possible to find
+that the incoming item doesn't fit on the split page where it needs to go!
+
+Deleting index tuples during VACUUM
+-----------------------------------
+
+Before deleting a leaf item, we get a super-exclusive lock on the target
+page, so that no other backend has a pin on the page when the deletion
+starts. This is not necessary for correctness in terms of the btree index
+operations themselves; as explained above, index scans logically stop
+"between" pages and so can't lose their place. The reason we do it is to
+provide an interlock between VACUUM and indexscans. Since VACUUM deletes
+index entries before reclaiming heap tuple line pointers, the
+super-exclusive lock guarantees that VACUUM can't reclaim for re-use a
+line pointer that an indexscanning process might be about to visit. This
+guarantee works only for simple indexscans that visit the heap in sync
+with the index scan, not for bitmap scans. We only need the guarantee
+when using non-MVCC snapshot rules; when using an MVCC snapshot, it
+doesn't matter if the heap tuple is replaced with an unrelated tuple at
+the same TID, because the new tuple won't be visible to our scan anyway.
+Therefore, a scan using an MVCC snapshot which has no other confounding
+factors will not hold the pin after the page contents are read. The
+current reasons for exceptions, where a pin is still needed, are if the
+index is not WAL-logged or if the scan is an index-only scan. If later
+work allows the pin to be dropped for all cases we will be able to
+simplify the vacuum code, since the concept of a super-exclusive lock
+for btree indexes will no longer be needed.
+
+Because a pin is not always held, and a page can be split even while
+someone does hold a pin on it, it is possible that an indexscan will
+return items that are no longer stored on the page it has a pin on, but
+rather somewhere to the right of that page. To ensure that VACUUM can't
+prematurely remove such heap tuples, we require btbulkdelete to obtain a
+super-exclusive lock on every leaf page in the index, even pages that
+don't contain any deletable tuples. Any scan which could yield incorrect
+results if the tuple at a TID matching the scan's range and filter
+conditions were replaced by a different tuple while the scan is in
+progress must hold the pin on each index page until all index entries read
+from the page have been processed. This guarantees that the btbulkdelete
+call cannot return while any indexscan is still holding a copy of a
+deleted index tuple if the scan could be confused by that. Note that this
+requirement does not say that btbulkdelete must visit the pages in any
+particular order. (See also simple deletion and bottom-up deletion,
+below.)
+
+There is no such interlocking for deletion of items in internal pages,
+since backends keep no lock nor pin on a page they have descended past.
+Hence, when a backend is ascending the tree using its stack, it must
+be prepared for the possibility that the item it wants is to the left of
+the recorded position (but it can't have moved left out of the recorded
+page). Since we hold a lock on the lower page (per L&Y) until we have
+re-found the parent item that links to it, we can be assured that the
+parent item does still exist and can't have been deleted.
+
+VACUUM's linear scan, concurrent page splits
+--------------------------------------------
+
+VACUUM accesses the index by doing a linear scan to search for deletable
+TIDs, while considering the possibility of deleting empty pages in
+passing. This is in physical/block order, not logical/keyspace order.
+The tricky part of this is avoiding missing any deletable tuples in the
+presence of concurrent page splits: a page split could easily move some
+tuples from a page not yet passed over by the sequential scan to a
+lower-numbered page already passed over.
+
+To implement this, we provide a "vacuum cycle ID" mechanism that makes it
+possible to determine whether a page has been split since the current
+btbulkdelete cycle started. If btbulkdelete finds a page that has been
+split since it started, and has a right-link pointing to a lower page
+number, then it temporarily suspends its sequential scan and visits that
+page instead. It must continue to follow right-links and vacuum dead
+tuples until reaching a page that either hasn't been split since
+btbulkdelete started, or is above the location of the outer sequential
+scan. Then it can resume the sequential scan. This ensures that all
+tuples are visited. It may be that some tuples are visited twice, but
+that has no worse effect than an inaccurate index tuple count (and we
+can't guarantee an accurate count anyway in the face of concurrent
+activity). Note that this still works if the has-been-recently-split test
+has a small probability of false positives, so long as it never gives a
+false negative. This makes it possible to implement the test with a small
+counter value stored on each index page.
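+
+The check itself is cheap. A standalone sketch, with invented names
+standing in for the real page fields, might look like this:
+
+    #include <stdbool.h>
+    #include <stdint.h>
+
+    typedef uint32_t ToyBlockNumber;
+
+    typedef struct ToyLeafPage
+    {
+        uint16_t       split_cycleid;   /* stamped by a recent split, 0 if none */
+        ToyBlockNumber right_link;      /* block number of the right sibling */
+    } ToyLeafPage;
+
+    /*
+     * During btbulkdelete's block-order scan, temporarily chase right-links
+     * from this page if it was split during the current cycle and the split
+     * moved tuples to a lower-numbered block that the scan already passed.
+     */
+    static bool
+    toy_must_follow_right_link(const ToyLeafPage *page, uint16_t current_cycleid,
+                               ToyBlockNumber current_scan_block)
+    {
+        return page->split_cycleid == current_cycleid &&
+               page->right_link < current_scan_block;
+    }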
+
+Deleting entire pages during VACUUM
+-----------------------------------
+
+We consider deleting an entire page from the btree only when it's become
+completely empty of items. (Merging partly-full pages would allow better
+space reuse, but it seems impractical to move existing data items left or
+right to make this happen --- a scan moving in the opposite direction
+might miss the items if so.) Also, we *never* delete the rightmost page
+on a tree level (this restriction simplifies the traversal algorithms, as
+explained below). Page deletion always begins from an empty leaf page. An
+internal page can only be deleted as part of deleting an entire subtree.
+This is always a "skinny" subtree consisting of a "chain" of internal pages
+plus a single leaf page. There is one page on each level of the subtree,
+and each level/page covers the same key space.
+
+Deleting a leaf page is a two-stage process. In the first stage, the page
+is unlinked from its parent, and marked as half-dead. The parent page must
+be found using the same type of search as used to find the parent during an
+insertion split. We lock the target and the parent pages, change the
+target's downlink to point to the right sibling, and remove its old
+downlink. This causes the target page's key space to effectively belong to
+its right sibling. (Neither the left nor right sibling pages need to
+change their "high key" if any; so there is no problem with possibly not
+having enough space to replace a high key.) At the same time, we mark the
+target page as half-dead, which causes any subsequent searches to ignore it
+and move right (or left, in a backwards scan). This leaves the tree in a
+similar state as during a page split: the page has no downlink pointing to
+it, but it's still linked to its siblings.
+
+(Note: Lanin and Shasha prefer to make the key space move left, but their
+argument for doing so hinges on not having left-links, which we have
+anyway. So we simplify the algorithm by moving the key space right. This
+is only possible because we don't match on a separator key when ascending
+the tree during a page split, unlike Lehman and Yao/Lanin and Shasha -- it
+doesn't matter if the downlink is re-found in a pivot tuple whose separator
+key does not match the one encountered when the inserter initially descended
+the tree.)
+
+To preserve consistency on the parent level, we cannot merge the key space
+of a page into its right sibling unless the right sibling is a child of
+the same parent --- otherwise, the parent's key space assignment changes
+too, meaning we'd have to make bounding-key updates in its parent, and
+perhaps all the way up the tree. Since we can't possibly do that
+atomically, we forbid this case. That means that the rightmost child of a
+parent node can't be deleted unless it's the only remaining child, in which
+case we will delete the parent too (see below).
+
+In the second-stage, the half-dead leaf page is unlinked from its siblings.
+We first lock the left sibling (if any) of the target, the target page
+itself, and its right sibling (there must be one) in that order. Then we
+update the side-links in the siblings, and mark the target page deleted.
+
+When we're about to delete the last remaining child of a parent page, things
+are slightly more complicated. In the first stage, we leave the immediate
+parent of the leaf page alone, and remove the downlink to the parent page
+instead, from the grandparent. If it's the last child of the grandparent
+too, we recurse up until we find a parent with more than one child, and
+remove the downlink of that page. The leaf page is marked as half-dead, and
+the block number of the page whose downlink was removed is stashed in the
+half-dead leaf page. This leaves us with a chain of internal pages, with
+one downlink each, leading to the half-dead leaf page, and no downlink
+pointing to the topmost page in the chain.
+
+While we recurse up to find the topmost parent in the chain, we keep the
+leaf page locked, but don't need to hold locks on the intermediate pages
+between the leaf and the topmost parent -- insertions into upper tree levels
+happen only as a result of splits of child pages, and that can't happen as
+long as we're keeping the leaf locked. The internal pages in the chain
+cannot acquire new children afterwards either, because the leaf page is
+marked as half-dead and won't be split.
+
+Removing the downlink to the top of the to-be-deleted subtree/chain
+effectively transfers the key space to the right sibling for all the
+intermediate levels too, in one atomic operation. A concurrent search might
+still visit the intermediate pages, but it will move right when it reaches
+the half-dead page at the leaf level. In particular, the search will move to
+the subtree to the right of the half-dead leaf page/to-be-deleted subtree,
+since the half-dead leaf page's right sibling must be a "cousin" page, not a
+"true" sibling page (or a second cousin page when the to-be-deleted chain
+starts at leaf page's grandparent page, and so on).
+
+In the second stage, the topmost page in the chain is unlinked from its
+siblings, and the half-dead leaf page is updated to point to the next page
+down in the chain. This is repeated until there are no internal pages left
+in the chain. Finally, the half-dead leaf page itself is unlinked from its
+siblings.
+
+A deleted page cannot be recycled immediately, since there may be other
+processes waiting to reference it (ie, search processes that just left the
+parent, or scans moving right or left from one of the siblings). These
+processes must be able to observe a deleted page for some time after the
+deletion operation, in order to be able to at least recover from it (they
+recover by moving right, as with concurrent page splits). Searchers never
+have to worry about concurrent page recycling.
+
+See "Placing deleted pages in the FSM" section below for a description of
+when and how deleted pages become safe for VACUUM to make recyclable.
+
+Page deletion and backwards scans
+---------------------------------
+
+Moving left in a backward scan is complicated because we must consider
+the possibility that the left sibling was just split (meaning we must find
+the rightmost page derived from the left sibling), plus the possibility
+that the page we were just on has now been deleted and hence isn't in the
+sibling chain at all anymore. So the move-left algorithm becomes:
+
+0. Remember the page we are on as the "original page".
+1. Follow the original page's left-link (we're done if this is zero).
+2. If the current page is live and its right-link matches the "original
+ page", we are done.
+3. Otherwise, move right one or more times looking for a live page whose
+ right-link matches the "original page". If found, we are done. (In
+ principle we could scan all the way to the right end of the index, but
+ in practice it seems better to give up after a small number of tries.
+ It's unlikely the original page's sibling split more than a few times
+ while we were in flight to it; if we do not find a matching link in a
+ few tries, then most likely the original page is deleted.)
+4. Return to the "original page". If it is still live, return to step 1
+ (we guessed wrong about it being deleted, and should restart with its
+ current left-link). If it is dead, move right until a non-dead page
+ is found (there must be one, since rightmost pages are never deleted),
+ mark that as the new "original page", and return to step 1.
+
+This algorithm is correct because the live page found by step 4 will have
+the same left keyspace boundary as the page we started from. Therefore,
+when we ultimately exit, it must be on a page whose right keyspace
+boundary matches the left boundary of where we started --- which is what
+we need to be sure we don't miss or re-scan any items.
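+
+The same steps, written as a standalone C sketch over a toy in-memory page
+structure (the names are invented; the real code works with buffers, locks,
+and block numbers rather than pointers):
+
+    #include <stdbool.h>
+    #include <stddef.h>
+
+    typedef struct ToyLinkedPage
+    {
+        bool   live;                    /* false once the page is (half-)dead */
+        struct ToyLinkedPage *left_link;
+        struct ToyLinkedPage *right_link;
+    } ToyLinkedPage;
+
+    #define TOY_MAX_RIGHT_TRIES 4       /* give up after a few tries (step 3) */
+
+    /* Returns the left sibling of 'orig', or NULL if 'orig' is leftmost. */
+    static ToyLinkedPage *
+    toy_move_left(ToyLinkedPage *orig)
+    {
+        for (;;)
+        {
+            ToyLinkedPage *cur = orig->left_link;       /* step 1 */
+
+            if (cur == NULL)
+                return NULL;                            /* no left sibling */
+
+            for (int tries = 0;
+                 cur != NULL && tries < TOY_MAX_RIGHT_TRIES;
+                 tries++)
+            {
+                if (cur->live && cur->right_link == orig)
+                    return cur;                         /* steps 2 and 3 */
+                cur = cur->right_link;
+            }
+
+            /* step 4: maybe we guessed wrong about 'orig' being deleted */
+            while (!orig->live)
+                orig = orig->right_link;    /* rightmost pages never die */
+            /* ... and restart from the (possibly new) original page */
+        }
+    }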
+
+Page deletion and tree height
+-----------------------------
+
+Because we never delete the rightmost page of any level (and in particular
+never delete the root), it's impossible for the height of the tree to
+decrease. After massive deletions we might have a scenario in which the
+tree is "skinny", with several single-page levels below the root.
+Operations will still be correct in this case, but we'd waste cycles
+descending through the single-page levels. To handle this we use an idea
+from Lanin and Shasha: we keep track of the "fast root" level, which is
+the lowest single-page level. The meta-data page keeps a pointer to this
+level as well as the true root. All ordinary operations initiate their
+searches at the fast root not the true root. When we split a page that is
+alone on its level or delete the next-to-last page on a level (both cases
+are easily detected), we have to make sure that the fast root pointer is
+adjusted appropriately. In the split case, we do this work as part of the
+atomic update for the insertion into the parent level; in the delete case
+as part of the atomic update for the delete (either way, the metapage has
+to be the last page locked in the update to avoid deadlock risks). This
+avoids race conditions if two such operations are executing concurrently.
+
+Placing deleted pages in the FSM
+--------------------------------
+
+Recycling a page is decoupled from page deletion. A deleted page can only
+be put in the FSM to be recycled once there is no possible scan or search
+that has a reference to it; until then, it must stay in place with its
+sibling links undisturbed, as a tombstone that allows concurrent searches
+to detect and then recover from concurrent deletions (which are rather
+like concurrent page splits to searchers). This design is an
+implementation of what Lanin and Shasha call "the drain technique".
+
+We implement the technique by waiting until all active snapshots and
+registered snapshots as of the page deletion are gone, which is overly
+strong, but is simple to implement within Postgres. When marked fully
+dead, a deleted page is labeled with the next-transaction counter value.
+VACUUM can reclaim the page for re-use when the stored XID is guaranteed
+to be "visible to everyone". As collateral damage, we wait for snapshots
+taken until the next transaction to allocate an XID commits. We also wait
+for running XIDs with no snapshots.
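+
+Expressed as a standalone sketch (invented names; in the real code the
+stored value is a 64-bit transaction ID, and the horizon comes from the
+usual notion of the oldest transaction that could still be running or
+appear in a snapshot):
+
+    #include <stdbool.h>
+    #include <stdint.h>
+
+    typedef uint64_t ToyFullXid;
+
+    typedef struct ToyDeletedPage
+    {
+        bool       deleted;
+        ToyFullXid safexid;     /* XID counter value stamped at deletion time */
+    } ToyDeletedPage;
+
+    /*
+     * A deleted page may be placed in the FSM only once the XID stamped at
+     * deletion time is older than every XID that could still be running or
+     * could still appear in any snapshot.
+     */
+    static bool
+    toy_page_recyclable(const ToyDeletedPage *page, ToyFullXid oldest_unremovable)
+    {
+        return page->deleted && page->safexid < oldest_unremovable;
+    }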
+
+Prior to PostgreSQL 14, VACUUM would only place _old_ deleted pages that
+it encounters during its linear scan (pages deleted by a previous VACUUM
+operation) in the FSM. Newly deleted pages were never placed in the FSM,
+because that was assumed to _always_ be unsafe. That assumption was
+unnecessarily pessimistic in practice, though -- it often doesn't take
+very long for newly deleted pages to become safe to place in the FSM.
+There is no truly principled way to predict when deleted pages will become
+safe to place in the FSM for recycling -- it might become safe almost
+immediately (long before the current VACUUM completes), or it might not
+even be safe by the time the next VACUUM takes place. Recycle safety is
+purely a question of maintaining the consistency (or at least the apparent
+consistency) of a physical data structure. The state within the backend
+running VACUUM is simply not relevant.
+
+PostgreSQL 14 added the ability for VACUUM to consider if it's possible to
+recycle newly deleted pages at the end of the full index scan where the
+page deletion took place. It is convenient to check if it's safe at that
+point. This does require that VACUUM keep around a little bookkeeping
+information about newly deleted pages, but that's very cheap. Using
+in-memory state for this avoids the need to revisit newly deleted pages a
+second time later on -- we can just use safexid values from the local
+bookkeeping state to determine recycle safety in a deferred fashion.
+
+The need for additional FSM indirection after a page deletion operation
+takes place is a natural consequence of the highly permissive rules for
+index scans with Lehman and Yao's design. In general an index scan
+doesn't have to hold a lock or even a pin on any page when it descends the
+tree (nothing that you'd usually think of as an interlock is held "between
+levels"). At the same time, index scans cannot be allowed to land on a
+truly unrelated page due to concurrent recycling (not to be confused with
+concurrent deletion), because that results in wrong answers to queries.
+Simpler approaches to page deletion that don't need to defer recycling are
+possible, but none seem compatible with Lehman and Yao's design.
+
+Placing an already-deleted page in the FSM to be recycled when needed
+doesn't actually change the state of the page. The page will be changed
+whenever it is subsequently taken from the FSM for reuse. The deleted
+page's contents will be overwritten by the split operation (it will become
+the new right sibling page).
+
+Fastpath For Index Insertion
+----------------------------
+
+We optimize for a common case of insertion of increasing index key
+values by caching the last page to which this backend inserted the last
+value, if this page was the rightmost leaf page. For the next insert, we
+can then quickly check if the cached page is still the rightmost leaf
+page and also the correct place to hold the current value. We can avoid
+the cost of walking down the tree in such common cases.
+
+The optimization works on the assumption that there can only be one
+non-ignorable leaf rightmost page, and so not even a visible-to-everyone
+style interlock is required. We cannot fail to detect that our hint was
+invalidated, because there can only be one such page in the B-Tree at
+any time. It's possible that the page will be deleted and recycled
+without a backend's cached page also being detected as invalidated, but
+only when we happen to recycle a block that once again gets recycled as the
+rightmost leaf page.
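+
+A toy sketch of the check, with invented names (the real code caches a
+block number per backend and re-verifies everything only after locking
+that block):
+
+    #include <stdbool.h>
+    #include <stdint.h>
+
+    typedef uint32_t ToyBlockNumber;
+    #define TOY_INVALID_BLOCK ((ToyBlockNumber) 0xFFFFFFFF)
+
+    typedef struct ToyCachedTarget
+    {
+        ToyBlockNumber cached_block;    /* per backend: last rightmost leaf used */
+    } ToyCachedTarget;
+
+    typedef struct ToyLeafState        /* as observed after locking the block */
+    {
+        bool rightmost;                /* still the rightmost leaf page? */
+        bool has_free_space;           /* room for the incoming tuple? */
+        int  last_key;                 /* largest key currently on the page */
+    } ToyLeafState;
+
+    /*
+     * Use the cached page only if, after locking it, it is still the
+     * rightmost leaf, has room, and the new key sorts after everything
+     * already on the page; otherwise descend from the (fast) root as usual.
+     */
+    static bool
+    toy_fastpath_ok(const ToyCachedTarget *cache, const ToyLeafState *page,
+                    int new_key)
+    {
+        if (cache->cached_block == TOY_INVALID_BLOCK)
+            return false;
+        return page->rightmost && page->has_free_space &&
+               new_key > page->last_key;
+    }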
+
+Simple deletion
+---------------
+
+If a process visits a heap tuple and finds that it's dead and removable
+(ie, dead to all open transactions, not only that process), then we can
+return to the index and mark the corresponding index entry "known dead",
+allowing subsequent index scans to skip visiting the heap tuple. The
+"known dead" marking works by setting the index item's lp_flags state
+to LP_DEAD. This is currently only done in plain indexscans, not bitmap
+scans, because only plain scans visit the heap and index "in sync" and so
+there's not a convenient way to do it for bitmap scans. Note also that
+LP_DEAD bits are often set when checking a unique index for conflicts on
+insert (this is simpler because it takes place when we hold an exclusive
+lock on the leaf page).
+
+Once an index tuple has been marked LP_DEAD it can actually be deleted
+from the index immediately; since index scans only stop "between" pages,
+no scan can lose its place from such a deletion. We separate the steps
+because we allow LP_DEAD to be set with only a share lock (it's exactly
+like a hint bit for a heap tuple), but physically removing tuples requires
+exclusive lock. Also, delaying the deletion often allows us to pick up
+extra index tuples that weren't initially safe for index scans to mark
+LP_DEAD. We do this with index tuples whose TIDs point to the same table
+blocks as an LP_DEAD-marked tuple. They're practically free to check in
+passing, and have a pretty good chance of being safe to delete due to
+various locality effects.
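+
+A toy sketch of how the candidate list might be built (invented names; the
+real code works with item pointers on the leaf page and hands the
+candidates to the table AM to decide what is actually deletable):
+
+    #include <stdbool.h>
+    #include <stddef.h>
+    #include <stdint.h>
+
+    typedef struct ToyLeafItem
+    {
+        uint32_t heap_block;    /* table block the item's TID points into */
+        bool     lp_dead;       /* LP_DEAD hint bit already set? */
+    } ToyLeafItem;
+
+    /*
+     * Build the candidate list for simple deletion: every LP_DEAD item, plus
+     * any other item whose TID points into a table block that some LP_DEAD
+     * item also points into.
+     */
+    static size_t
+    toy_collect_candidates(const ToyLeafItem *items, size_t nitems,
+                           size_t *candidates)      /* out: item indexes */
+    {
+        size_t ncandidates = 0;
+
+        for (size_t i = 0; i < nitems; i++)
+        {
+            bool promising = items[i].lp_dead;
+
+            for (size_t j = 0; !promising && j < nitems; j++)
+            {
+                if (items[j].lp_dead && items[j].heap_block == items[i].heap_block)
+                    promising = true;
+            }
+
+            if (promising)
+                candidates[ncandidates++] = i;
+        }
+
+        return ncandidates;
+    }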
+
+We only try to delete LP_DEAD tuples (and nearby tuples) when we are
+otherwise faced with having to split a page to do an insertion (and hence
+have exclusive lock on it already). Deduplication and bottom-up index
+deletion can also prevent a page split, but simple deletion is always our
+preferred approach. (Note that posting list tuples can only have their
+LP_DEAD bit set when every table TID within the posting list is known
+dead. This isn't much of a problem in practice because LP_DEAD bits are
+just a starting point for simple deletion -- we still manage to perform
+granular deletes of posting list TIDs quite often.)
+
+It's sufficient to have an exclusive lock on the index page, not a
+super-exclusive lock, to do deletion of LP_DEAD items. It might seem
+that this breaks the interlock between VACUUM and indexscans, but that is
+not so: as long as an indexscanning process has a pin on the page where
+the index item used to be, VACUUM cannot complete its btbulkdelete scan
+and so cannot remove the heap tuple. This is another reason why
+btbulkdelete has to get a super-exclusive lock on every leaf page, not only
+the ones where it actually sees items to delete.
+
+LP_DEAD setting by index scans cannot be sure that a TID whose index tuple
+it had planned on LP_DEAD-setting has not been recycled by VACUUM if it
+drops its pin in the meantime. It must conservatively also remember the
+LSN of the page, and only act to set LP_DEAD bits when the LSN has not
+changed at all. (Avoiding dropping the pin entirely also makes it safe, of
+course.)
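+
+In outline (invented names; the real code remembers the page's LSN
+alongside the TIDs that it intends to mark):
+
+    #include <stdbool.h>
+    #include <stdint.h>
+
+    typedef uint64_t ToyLSN;
+
+    /*
+     * If the scan dropped its pin after reading the page, it may set LP_DEAD
+     * bits later only if the page's LSN is unchanged; in a WAL-logged index
+     * any modification advances the LSN, so an unchanged LSN means VACUUM
+     * cannot have removed and recycled the TIDs we remembered.
+     */
+    static bool
+    toy_safe_to_set_lp_dead(bool dropped_pin, ToyLSN lsn_at_read, ToyLSN lsn_now)
+    {
+        if (!dropped_pin)
+            return true;        /* pin held throughout: btbulkdelete was blocked */
+        return lsn_at_read == lsn_now;
+    }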
+
+Bottom-Up deletion
+------------------
+
+We attempt to delete whatever duplicates happen to be present on the page
+when the duplicates are suspected to be caused by version churn from
+successive UPDATEs. This only happens when we receive an executor hint
+indicating that optimizations like heapam's HOT have not worked out for
+the index -- the incoming tuple must be a logically unchanged duplicate
+which is needed for MVCC purposes, suggesting that version churn might well be the
+dominant source of new index tuples on the leaf page in question. (Also,
+bottom-up deletion is triggered within unique indexes in cases with
+continual INSERT and DELETE related churn, since that is easy to detect
+without any external hint.)
+
+Simple deletion will already have failed to prevent a page split when a
+bottom-up deletion pass takes place (often because no LP_DEAD bits were
+ever set on the page). The two mechanisms have closely related
+implementations. The same WAL records are used for each operation, and
+the same tableam infrastructure is used to determine what TIDs/tuples are
+actually safe to delete. The implementations only differ in how they pick
+TIDs to consider for deletion, and whether or not the tableam will give up
+before accessing all table blocks (bottom-up deletion lives with the
+uncertainty of its success by keeping the cost of failure low). Even
+still, the two mechanisms are clearly distinct at the conceptual level.
+
+Bottom-up index deletion is driven entirely by heuristics (whereas simple
+deletion is guaranteed to delete at least those index tuples that are
+already LP_DEAD marked -- there must be at least one). We have no
+certainty that we'll find even one index tuple to delete. That's why we
+closely cooperate with the tableam to keep the costs it pays in balance
+with the benefits we receive. The interface that we use for this is
+described in detail in access/tableam.h.
+
+Bottom-up index deletion can be thought of as a backstop mechanism against
+unnecessary version-driven page splits. It is based in part on an idea
+from generational garbage collection: the "generational hypothesis". This
+is the empirical observation that "most objects die young". Within
+nbtree, new index tuples often quickly appear in the same place, and then
+quickly become garbage. There can be intense concentrations of garbage in
+relatively few leaf pages with certain workloads (or there could be in
+earlier versions of PostgreSQL without bottom-up index deletion, at
+least). See doc/src/sgml/btree.sgml for a high-level description of the
+design principles behind bottom-up index deletion in nbtree, including
+details of how it complements VACUUM.
+
+We expect to find a reasonably large number of tuples that are safe to
+delete within each bottom-up pass. If we don't then we won't need to
+consider the question of bottom-up deletion for the same leaf page for
+quite a while (usually because the page splits, which resolves the
+situation for the time being). We expect to perform regular bottom-up
+deletion operations against pages that are at constant risk of unnecessary
+page splits caused only by version churn. When the mechanism works well
+we'll constantly be "on the verge" of having version-churn-driven page
+splits, but never actually have even one.
+
+Our duplicate heuristics work well despite being fairly simple.
+Unnecessary page splits only occur when there are truly pathological
+levels of version churn (in theory a small amount of version churn could
+make a page split occur earlier than strictly necessary, but that's pretty
+harmless). We don't have to understand the underlying workload; we only
+have to understand the general nature of the pathology that we target.
+Version churn is easy to spot when it is truly pathological. Affected
+leaf pages are fairly homogeneous.
+
+WAL Considerations
+------------------
+
+The insertion and deletion algorithms in themselves don't guarantee btree
+consistency after a crash. To provide robustness, we depend on WAL
+replay. A single WAL entry is effectively an atomic action --- we can
+redo it from the log if it fails to complete.
+
+Ordinary item insertions (that don't force a page split) are of course
+single WAL entries, since they only affect one page. The same for
+leaf-item deletions (if the deletion brings the leaf page to zero items,
+it is now a candidate to be deleted, but that is a separate action).
+
+An insertion that causes a page split is logged as a single WAL entry for
+the changes occurring on the insertion's level --- including update of the
+right sibling's left-link --- followed by a second WAL entry for the
+insertion on the parent level (which might itself be a page split, requiring
+an additional insertion above that, etc).
+
+For a root split, the follow-on WAL entry is a "new root" entry rather than
+an "insertion" entry, but details are otherwise much the same.
+
+Because splitting involves multiple atomic actions, it's possible that the
+system crashes between splitting a page and inserting the downlink for the
+new half to the parent. After recovery, the downlink for the new page will
+be missing. The search algorithm works correctly, as the page will be found
+by following the right-link from its left sibling, although if a lot of
+downlinks in the tree are missing, performance will suffer. A more serious
+consequence is that if the page without a downlink gets split again, the
+insertion algorithm will fail to find the location in the parent level to
+insert the downlink.
+
+Our approach is to create any missing downlinks on-the-fly, when searching
+the tree for a new insertion. It could be done during searches, too, but
+it seems best not to put any extra updates in what would otherwise be a
+read-only operation (updating is not possible in hot standby mode anyway).
+It would seem natural to add the missing downlinks in VACUUM, but since
+inserting a downlink might require splitting a page, it might fail if you
+run out of disk space. That would be bad during VACUUM - the reason for
+running VACUUM in the first place might be that you run out of disk space,
+and now VACUUM won't finish because you're out of disk space. In contrast,
+an insertion can require enlarging the physical file anyway. There is one
+minor exception: VACUUM finishes interrupted splits of internal pages when
+deleting their children. This allows the code for re-finding parent items
+to be used by both page splits and page deletion.
+
+To identify missing downlinks, when a page is split, the left page is
+flagged to indicate that the split is not yet complete (INCOMPLETE_SPLIT).
+When the downlink is inserted to the parent, the flag is cleared atomically
+with the insertion. The child page is kept locked until the insertion in
+the parent is finished and the flag in the child cleared, but can be
+released immediately after that, before recursing up the tree if the parent
+also needs to be split. This ensures that incompletely split pages should
+not be seen under normal circumstances; only if insertion to the parent
+has failed for some reason. (It's also possible for a reader to observe
+a page with the incomplete split flag set during recovery; see later
+section on "Scans during Recovery" for details.)
+
+We flag the left page, even though it's the right page that's missing the
+downlink, because it's more convenient to know already when following the
+right-link from the left page to the right page that it will need to have
+its downlink inserted to the parent.
+
+When splitting a non-root page that is alone on its level, the required
+metapage update (of the "fast root" link) is performed and logged as part
+of the insertion into the parent level. When splitting the root page, the
+metapage update is handled as part of the "new root" action.
+
+Each step in page deletion is logged as a separate WAL entry: marking the
+leaf as half-dead and removing the downlink is one record, and unlinking a
+page is a second record. If vacuum is interrupted for some reason, or the
+system crashes, the tree is consistent for searches and insertions. The
+next VACUUM will find the half-dead leaf page and continue the deletion.
+
+Before 9.4, we used to keep track of incomplete splits and page deletions
+during recovery and finish them immediately at end of recovery, instead of
+doing it lazily at the next insertion or vacuum. However, that made the
+recovery much more complicated, and only fixed the problem when crash
+recovery was performed. An incomplete split can also occur if an otherwise
+recoverable error, like out-of-memory or out-of-disk-space, happens while
+inserting the downlink to the parent.
+
+Scans during Recovery
+---------------------
+
+nbtree indexes support read queries in Hot Standby mode. Every atomic
+action/WAL record makes isolated changes that leave the tree in a
+consistent state for readers. Readers lock pages according to the same
+rules that readers follow on the primary. (Readers may have to move
+right to recover from a "concurrent" page split or page deletion, just
+like on the primary.)
+
+However, there are a couple of differences in how pages are locked by
+replay/the startup process as compared to the original write operation
+on the primary. The exceptions involve page splits and page deletions.
+The first phase and second phase of a page split are processed
+independently during replay, since they are independent atomic actions.
+We do not attempt to recreate the coupling of parent and child page
+write locks that took place on the primary. This is safe because readers
+never care about the incomplete split flag anyway. Holding on to an
+extra write lock on the primary is only necessary so that a second
+writer cannot observe the incomplete split flag before the first writer
+finishes the split. If we let concurrent writers on the primary observe
+an incomplete split flag on the same page, each writer would attempt to
+complete the unfinished split, corrupting the parent page. (Similarly,
+replay of page deletion records does not hold a write lock on the target
+leaf page throughout; only the primary needs to block out concurrent
+writers that insert on to the page being deleted.)
+
+WAL replay holds same-level locks in a way that matches the approach
+taken during original execution, though. This prevents readers from
+observing same-level inconsistencies. It's probably possible to be more
+lax about how same-level locks are acquired during recovery (most kinds
+of readers could still move right to recover if we didn't couple
+same-level locks), but we prefer to be conservative here.
+
+During recovery all index scans start with ignore_killed_tuples = false
+and we never set kill_prior_tuple. We do this because the oldest xmin
+on the standby server can be older than the oldest xmin on the primary
+server, which means tuples can be marked LP_DEAD even when they are
+still visible on the standby. We don't WAL log tuple LP_DEAD bits, but
+they can still appear in the standby because of full page writes. So
+we must always ignore them in standby, and that means it's not worth
+setting them either. (When LP_DEAD-marked tuples are eventually deleted
+on the primary, the deletion is WAL-logged. Queries that run on a
+standby therefore get much of the benefit of any LP_DEAD setting that
+takes place on the primary.)
+
+Note that we talk about scans that are started during recovery. We go to
+a little trouble to allow a scan to start during recovery and end during
+normal running after recovery has completed. This is a key capability
+because it allows running applications to continue while the standby
+changes state into a normally running server.
+
+The interlocking required to avoid returning incorrect results from
+non-MVCC scans is not required on standby nodes. We still get a
+super-exclusive lock ("cleanup lock") when replaying VACUUM records
+during recovery, but recovery does not need to lock every leaf page
+(only those leaf pages that have items to delete). That is safe because
+HeapTupleSatisfiesUpdate(), HeapTupleSatisfiesSelf(),
+HeapTupleSatisfiesDirty() and HeapTupleSatisfiesVacuum() are only ever
+used during write transactions, which cannot exist on the standby. MVCC
+scans are already protected by definition, so HeapTupleSatisfiesMVCC()
+is not a problem. The optimizer looks at the boundaries of value ranges
+using HeapTupleSatisfiesNonVacuumable() with an index-only scan, which
+is also safe. That leaves concern only for HeapTupleSatisfiesToast().
+
+HeapTupleSatisfiesToast() doesn't use MVCC semantics, though that's
+because it doesn't need to - if the main heap row is visible then the
+toast rows will also be visible. So as long as we follow a toast
+pointer from a visible (live) tuple the corresponding toast rows
+will also be visible, so we do not need to recheck MVCC on them.
+
+Other Things That Are Handy to Know
+-----------------------------------
+
+Page zero of every btree is a meta-data page. This page stores the
+location of the root page --- both the true root and the current effective
+root ("fast" root). To avoid fetching the metapage for every single index
+search, we cache a copy of the meta-data information in the index's
+relcache entry (rd_amcache). This is a bit ticklish since using the cache
+implies following a root page pointer that could be stale. However, a
+backend following a cached pointer can sufficiently verify whether it
+reached the intended page; either by checking the is-root flag when it
+is going to the true root, or by checking that the page has no siblings
+when going to the fast root. At worst, this could result in descending
+some extra tree levels if we have a cached pointer to a fast root that is
+now above the real fast root. Such cases shouldn't arise often enough to
+be worth optimizing; and in any case we can expect a relcache flush will
+discard the cached metapage before long, since a VACUUM that's moved the
+fast root pointer can be expected to issue a statistics update for the
+index.
+
+The algorithm assumes we can fit at least three items per page
+(a "high key" and two real data items). Therefore it's unsafe
+to accept items larger than 1/3rd page size. Larger items would
+work sometimes, but could cause failures later on depending on
+what else gets put on their page.
+
+"ScanKey" data structures are used in two fundamentally different ways
+in this code, which we describe as "search" scankeys and "insertion"
+scankeys. A search scankey is the kind passed to btbeginscan() or
+btrescan() from outside the btree code. The sk_func pointers in a search
+scankey point to comparison functions that return boolean, such as int4lt.
+There might be more than one scankey entry for a given index column, or
+none at all. (We require the keys to appear in index column order, but
+the order of multiple keys for a given column is unspecified.) An
+insertion scankey ("BTScanInsert" data structure) uses a similar
+array-of-ScanKey data structure, but the sk_func pointers point to btree
+comparison support functions (ie, 3-way comparators that return int4 values
+interpreted as <0, =0, >0). In an insertion scankey there is at most one
+entry per index column. There is also other data about the rules used to
+locate where to begin the scan, such as whether or not the scan is a
+"nextkey" scan. Insertion scankeys are built within the btree code (eg, by
+_bt_mkscankey()) and are used to locate the starting point of a scan, as
+well as for locating the place to insert a new index tuple. (Note: in the
+case of an insertion scankey built from a search scankey or built from a
+truncated pivot tuple, there might be fewer keys than index columns,
+indicating that we have no constraints for the remaining index columns.)
+After we have located the starting point of a scan, the original search
+scankey is consulted as each index entry is sequentially scanned to decide
+whether to return the entry and whether the scan can stop (see
+_bt_checkkeys()).
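+
+The difference in comparator style can be shown with toy standalone
+functions; these are not the actual catalog functions, just illustrations
+of the two calling conventions:
+
+    #include <stdbool.h>
+    #include <stdint.h>
+
+    /* Search-scankey style: a boolean operator, in the manner of int4lt */
+    static bool
+    toy_int4lt(int32_t a, int32_t b)
+    {
+        return a < b;
+    }
+
+    /* Insertion-scankey style: a 3-way comparison support function */
+    static int32_t
+    toy_int4cmp(int32_t a, int32_t b)
+    {
+        if (a < b)
+            return -1;
+        if (a > b)
+            return 1;
+        return 0;
+    }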
+
+Notes about suffix truncation
+-----------------------------
+
+We truncate away suffix key attributes that are not needed for a page high
+key during a leaf page split. The remaining attributes must distinguish
+the last index tuple on the post-split left page as belonging on the left
+page, and the first index tuple on the post-split right page as belonging
+on the right page. Tuples logically retain truncated key attributes,
+though they implicitly have "negative infinity" as their value, and have no
+storage overhead. Since the high key is subsequently reused as the
+downlink in the parent page for the new right page, suffix truncation makes
+pivot tuples short. INCLUDE indexes are guaranteed to have non-key
+attributes truncated at the time of a leaf page split, but may also have
+some key attributes truncated away, based on the usual criteria for key
+attributes. They are not a special case, since non-key attributes are
+merely payload to B-Tree searches.
+
+The goal of suffix truncation of key attributes is to improve index
+fan-out. The technique was first described by Bayer and Unterauer (R. Bayer
+and K. Unterauer, Prefix B-Trees, ACM Transactions on Database Systems, Vol
+2, No. 1, March 1977, pp 11-26). The Postgres implementation is loosely
+based on their paper. Note that Postgres only implements what the paper
+refers to as simple prefix B-Trees. Note also that the paper assumes that
+the tree has keys that consist of single strings that maintain the "prefix
+property", much like strings that are stored in a suffix tree (comparisons
+of earlier bytes must always be more significant than comparisons of later
+bytes, and, in general, the strings must compare in a way that doesn't
+break transitive consistency as they're split into pieces). Suffix
+truncation in Postgres currently only works at the whole-attribute
+granularity, but it would be straightforward to invent opclass
+infrastructure that manufactures a smaller attribute value in the case of
+variable-length types, such as text. An opclass support function could
+manufacture the shortest possible key value that still correctly separates
+each half of a leaf page split.
+
+There are sophisticated criteria for choosing a leaf page split point. The
+general idea is to make suffix truncation effective without unduly
+influencing the balance of space for each half of the page split. The
+choice of leaf split point can be thought of as a choice among points
+*between* items on the page to be split, at least if you pretend that the
+incoming tuple was placed on the page already (you have to pretend because
+there won't actually be enough space for it on the page). Choosing the
+split point between two index tuples where the first non-equal attribute
+appears as early as possible results in truncating away as many suffix
+attributes as possible. Evenly balancing space among each half of the
+split is usually the first concern, but even small adjustments in the
+precise split point can allow truncation to be far more effective.
+
+Suffix truncation is primarily valuable because it makes pivot tuples
+smaller, which delays splits of internal pages, but that isn't the only
+reason why it's effective. Even truncation that doesn't make pivot tuples
+smaller due to alignment still prevents pivot tuples from being more
+restrictive than truly necessary in how they describe which values belong
+on which pages.
+
+While it's not possible to correctly perform suffix truncation during
+internal page splits, it's still useful to be discriminating when splitting
+an internal page. We choose the split point whose implied downlink (to be
+inserted in the parent) is the smallest one available within an acceptable
+range of the fillfactor-wise optimal split point. This idea also comes
+from the Prefix B-Tree paper. This process has much in common with what
+happens at the leaf level to make suffix truncation effective. The overall
+effect is that suffix truncation tends to produce smaller, more
+discriminating pivot tuples, especially early in the lifetime of the index,
+while biasing internal page splits makes the earlier, smaller pivot tuples
+end up in the root page, delaying root page splits.
+
+Logical duplicates are given special consideration. The logic for
+selecting a split point goes to great lengths to avoid having duplicates
+span more than one page, and almost always manages to pick a split point
+between two user-key-distinct tuples, accepting a completely lopsided split
+if it must. When a page that's already full of duplicates must be split,
+the fallback strategy assumes that duplicates are mostly inserted in
+ascending heap TID order. The page is split in a way that leaves the left
+half of the page mostly full, and the right half of the page mostly empty.
+The overall effect is that leaf page splits gracefully adapt to inserts of
+large groups of duplicates, maximizing space utilization. Note also that
+"trapping" large groups of duplicates on the same leaf page like this makes
+deduplication more efficient. Deduplication can be performed infrequently,
+without merging together existing posting list tuples too often.
+
+Notes about deduplication
+-------------------------
+
+We deduplicate non-pivot tuples in non-unique indexes to reduce storage
+overhead, and to avoid (or at least delay) page splits. Note that the
+goals for deduplication in unique indexes are rather different; see later
+section for details. Deduplication alters the physical representation of
+tuples without changing the logical contents of the index, and without
+adding overhead to read queries. Non-pivot tuples are merged together
+into a single physical tuple with a posting list (a simple array of heap
+TIDs with the standard item pointer format). Deduplication is always
+applied lazily, at the point where it would otherwise be necessary to
+perform a page split. It occurs only when LP_DEAD items have been
+removed, as our last line of defense against splitting a leaf page
+(bottom-up index deletion may be attempted first, as our second-to-last line
+of defense). We can set the LP_DEAD bit with posting list tuples, though
+only when all TIDs are known dead.
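+
+A toy sketch of the posting list representation (invented names; real
+posting lists are arrays of ItemPointerData stored within an IndexTuple):
+
+    #include <stdint.h>
+    #include <stdlib.h>
+    #include <string.h>
+
+    /* Toy item pointer: table block plus line pointer offset */
+    typedef struct ToyTid
+    {
+        uint32_t block;
+        uint16_t offset;
+    } ToyTid;
+
+    /* Toy posting list tuple: one key value, many heap TIDs in sorted order */
+    typedef struct ToyPostingTuple
+    {
+        int32_t key;
+        int     ntids;
+        ToyTid  tids[];         /* flexible array member, ascending TID order */
+    } ToyPostingTuple;
+
+    /* Merge duplicates with the same key into a single posting list tuple. */
+    static ToyPostingTuple *
+    toy_make_posting_tuple(int32_t key, const ToyTid *tids, int ntids)
+    {
+        ToyPostingTuple *tup;
+
+        tup = malloc(sizeof(ToyPostingTuple) + ntids * sizeof(ToyTid));
+        if (tup == NULL)
+            return NULL;
+        tup->key = key;
+        tup->ntids = ntids;
+        memcpy(tup->tids, tids, ntids * sizeof(ToyTid));
+        return tup;
+    }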
+
+Our lazy approach to deduplication allows the page space accounting used
+during page splits to have absolutely minimal special case logic for
+posting lists. Posting lists can be thought of as extra payload that
+suffix truncation will reliably truncate away as needed during page
+splits, just like non-key columns from an INCLUDE index tuple.
+Incoming/new tuples can generally be treated as non-overlapping plain
+items (though see section on posting list splits for information about how
+overlapping new/incoming items are really handled).
+
+The representation of posting lists is almost identical to the posting
+lists used by GIN, so it would be straightforward to apply GIN's varbyte
+encoding compression scheme to individual posting lists. Posting list
+compression would break the assumptions made by posting list splits about
+page space accounting (see later section), so it's not clear how
+compression could be integrated with nbtree. Besides, posting list
+compression does not offer a compelling trade-off for nbtree, since in
+general nbtree is optimized for consistent performance with many
+concurrent readers and writers. Compression would also make the deletion
+of a subset of TIDs from a posting list slow and complicated, which would
+be a big problem for workloads that depend heavily on bottom-up index
+deletion.
+
+A major goal of our lazy approach to deduplication is to limit the
+performance impact of deduplication with random updates. Even concurrent
+append-only inserts of the same key value will tend to have inserts of
+individual index tuples in an order that doesn't quite match heap TID
+order. Delaying deduplication minimizes page level fragmentation.
+
+Deduplication in unique indexes
+-------------------------------
+
+Very often, the number of distinct values that can ever be placed on
+almost any given leaf page in a unique index is fixed and permanent. For
+example, a primary key on an identity column will usually only have leaf
+page splits caused by the insertion of new logical rows within the
+rightmost leaf page. If there is a split of a non-rightmost leaf page,
+then the split must have been triggered by inserts associated with UPDATEs
+of existing logical rows. Splitting a leaf page purely to store multiple
+versions is a false economy. In effect, we're permanently degrading the
+index structure just to absorb a temporary burst of duplicates.
+
+Deduplication in unique indexes helps to prevent these pathological page
+splits. Storing duplicates in a space efficient manner is not the goal,
+since in the long run there won't be any duplicates anyway. Rather, we're
+buying time for standard garbage collection mechanisms to run before a
+page split is needed.
+
+Unique index leaf pages only get a deduplication pass when an insertion
+(that might have to split the page) observed an existing duplicate on the
+page in passing. This is based on the assumption that deduplication will
+only work out when _all_ new insertions are duplicates from UPDATEs. This
+may mean that we miss an opportunity to delay a page split, but that's
+okay because our ultimate goal is to delay leaf page splits _indefinitely_
+(i.e. to prevent them altogether). There is little point in trying to
+delay a split that is probably inevitable anyway. This allows us to avoid
+the overhead of attempting to deduplicate with unique indexes that always
+have few or no duplicates.
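+
+A condensed sketch of the gating rule (illustrative control flow only; the
+real decision is made in nbtinsert.c's _bt_delete_or_dedup_one_page(), whose
+checkingunique and uniquedup flags are borrowed here):
+
+    if (checkingunique && !uniquedup)
+        return;                 /* no duplicate was seen on page: skip */
+
+    /* otherwise a deduplication pass may be worth the cost */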
+
+Note: Avoiding "unnecessary" page splits driven by version churn is also
+the goal of bottom-up index deletion, which was added to PostgreSQL 14.
+Bottom-up index deletion is now the preferred way to deal with this
+problem (with all kinds of indexes, though especially with unique
+indexes). Still, deduplication can sometimes augment bottom-up index
+deletion. When deletion cannot free tuples (due to an old snapshot
+holding up cleanup), falling back on deduplication provides additional
+capacity. Delaying the page split by deduplicating can allow a future
+bottom-up deletion pass of the same page to succeed.
+
+Posting list splits
+-------------------
+
+When the incoming tuple happens to overlap with an existing posting list,
+a posting list split is performed. Like a page split, a posting list
+split resolves a situation where a new/incoming item "won't fit", while
+inserting the incoming item in passing (i.e. as part of the same atomic
+action). It's possible (though not particularly likely) that an insert of
+a new item on to an almost-full page will overlap with a posting list,
+resulting in both a posting list split and a page split. Even then, the
+atomic action that splits the posting list also inserts the new item
+(since page splits always insert the new item in passing). Including the
+posting list split in the same atomic action as the insert avoids problems
+caused by concurrent inserts into the same posting list -- the exact
+details of how we change the posting list depend upon the new item, and
+vice-versa. A single atomic action also minimizes the volume of extra
+WAL required for a posting list split, since we don't have to explicitly
+WAL-log the original posting list tuple.
+
+Despite piggy-backing on the same atomic action that inserts a new tuple,
+posting list splits can be thought of as a separate, extra action to the
+insert itself (or to the page split itself). Posting list splits
+conceptually "rewrite" an insert that overlaps with an existing posting
+list into an insert that adds its final new item just to the right of the
+posting list instead. The size of the posting list won't change, and so
+page space accounting code does not need to care about posting list splits
+at all. This is an important upside of our design; the page split point
+choice logic is very subtle even without it needing to deal with posting
+list splits.
+
+Only a few isolated extra steps are required to preserve the illusion that
+the new item never overlapped with an existing posting list in the first
+place: the incoming tuple has its heap TID replaced with the rightmost/max
+heap TID from the existing/originally overlapping posting list. Similarly,
+the original incoming item's TID is relocated to the
+appropriate offset in the posting list (we usually shift TIDs out of the
+way to make a hole for it). Finally, the posting-split-with-page-split
+case must generate a new high key based on an imaginary version of the
+original page that has both the final new item and the after-list-split
+posting tuple (page splits usually just operate against an imaginary
+version that contains the new item/item that won't fit).
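+
+As a schematic example (the actual code is _bt_swap_posting() in
+nbtdedup.c): suppose the overlapping posting list holds TIDs t0..t4 in
+ascending order, and the incoming tuple's heap TID tn falls between t1 and
+t2.  After the posting list split:
+
+    before:  posting list = [t0 t1 t2 t3 t4]    incoming tuple TID = tn
+    after:   posting list = [t0 t1 tn t2 t3]    incoming tuple TID = t4
+
+The rewritten posting list stays exactly the same size, and the item that is
+actually inserted (now carrying t4, the old maximum) sorts just to the right
+of the posting list, as described above.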
+
+This approach avoids inventing an "eager" atomic posting split operation
+that splits the posting list without simultaneously finishing the insert
+of the incoming item. This alternative design might seem cleaner, but it
+creates subtle problems for page space accounting. In general, there
+might not be enough free space on the page to split a posting list such
+that the incoming/new item no longer overlaps with either posting list
+half --- the operation could fail before the actual retail insert of the
+new item even begins. We'd end up having to handle posting list splits
+that need a page split anyway. Besides, supporting variable "split points"
+while splitting posting lists won't actually improve overall space
+utilization.
+
+Notes About Data Representation
+-------------------------------
+
+The right-sibling link required by L&Y is kept in the page "opaque
+data" area, as is the left-sibling link, the page level, and some flags.
+The page level counts upwards from zero at the leaf level, to the tree
+depth minus 1 at the root. (Counting up from the leaves ensures that we
+don't need to renumber any existing pages when splitting the root.)
+
+The Postgres disk block data format (an array of items) doesn't fit
+Lehman and Yao's alternating-keys-and-pointers notion of a disk page,
+so we have to play some games. (The alternating-keys-and-pointers
+notion is important for internal page splits, which conceptually split
+at the middle of an existing pivot tuple -- the tuple's "separator" key
+goes on the left side of the split as the left side's new high key,
+while the tuple's pointer/downlink goes on the right side as the
+first/minus infinity downlink.)
+
+On a page that is not rightmost in its tree level, the "high key" is
+kept in the page's first item, and real data items start at item 2.
+The link portion of the "high key" item goes unused. A page that is
+rightmost has no "high key" (it's implicitly positive infinity), so
+data items start with the first item. Putting the high key at the
+left, rather than the right, may seem odd, but it avoids moving the
+high key as we add data items.
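+
+In code terms, routines in this directory find the first data item with the
+P_FIRSTDATAKEY() macro, which (per access/nbtree.h) amounts to roughly the
+following:
+
+    OffsetNumber    off;
+    OffsetNumber    firstdata;
+
+    /* offset 1 holds the high key, except on the rightmost page */
+    firstdata = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+    for (off = firstdata; off <= PageGetMaxOffsetNumber(page); off++)
+    {
+        /* each 'off' here is a real data item (never the high key) */
+    }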
+
+On a leaf page, the data items are simply links to (TIDs of) tuples
+in the relation being indexed, with the associated key values.
+
+On a non-leaf page, the data items are down-links to child pages with
+bounding keys. The key in each data item is a strict lower bound for
+keys on that child page, so logically the key is to the left of that
+downlink. The high key (if present) is the upper bound for the last
+downlink. The first data item on each such page has no lower bound
+--- or lower bound of minus infinity, if you prefer. The comparison
+routines must treat it accordingly. The actual key stored in the
+item is irrelevant, and need not be stored at all. This arrangement
+corresponds to the fact that an L&Y non-leaf page has one more pointer
+than key. Suffix truncation's negative infinity attributes behave in
+the same way.
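+
+A sketch of how that special case can be handled by a comparison routine (in
+the spirit of the logic in _bt_compare() in nbtsearch.c):
+
+    /* First data item on an internal page compares as minus infinity */
+    if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
+        return 1;       /* every scan key is greater than minus infinity */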
diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c
new file mode 100644
index 0000000..7ac73cb
--- /dev/null
+++ b/src/backend/access/nbtree/nbtcompare.c
@@ -0,0 +1,335 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtcompare.c
+ * Comparison functions for btree access method.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtcompare.c
+ *
+ * NOTES
+ *
+ * These functions are stored in pg_amproc. For each operator class
+ * defined on btrees, they compute
+ *
+ * compare(a, b):
+ * < 0 if a < b,
+ * = 0 if a == b,
+ * > 0 if a > b.
+ *
+ * The result is always an int32 regardless of the input datatype.
+ *
+ * Although any negative int32 is acceptable for reporting "<",
+ * and any positive int32 is acceptable for reporting ">", routines
+ * that work on 32-bit or wider datatypes can't just return "a - b".
+ * That could overflow and give the wrong answer.
+ *
+ * NOTE: it is critical that the comparison function impose a total order
+ * on all non-NULL values of the data type, and that the datatype's
+ * boolean comparison operators (= < >= etc) yield results consistent
+ * with the comparison routine. Otherwise bad behavior may ensue.
+ * (For example, the comparison operators must NOT punt when faced with
+ * NAN or other funny values; you must devise some collation sequence for
+ * all such values.) If the datatype is not trivial, this is most
+ * reliably done by having the boolean operators invoke the same
+ * three-way comparison code that the btree function does. Therefore,
+ * this file contains only btree support for "trivial" datatypes ---
+ * all others are in the /utils/adt/ files that implement their datatypes.
+ *
+ * NOTE: these routines must not leak memory, since memory allocated
+ * during an index access won't be recovered till end of query. This
+ * primarily affects comparison routines for toastable datatypes;
+ * they have to be careful to free any detoasted copy of an input datum.
+ *
+ * NOTE: we used to forbid comparison functions from returning INT_MIN,
+ * but that proves to be too error-prone because some platforms' versions
+ * of memcmp() etc can return INT_MIN. As a means of stress-testing
+ * callers, this file can be compiled with STRESS_SORT_INT_MIN defined
+ * to cause many of these functions to return INT_MIN or INT_MAX instead of
+ * their customary -1/+1. For production, though, that's not a good idea
+ * since users or third-party code might expect the traditional results.
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "utils/builtins.h"
+#include "utils/sortsupport.h"
+
+#ifdef STRESS_SORT_INT_MIN
+#define A_LESS_THAN_B INT_MIN
+#define A_GREATER_THAN_B INT_MAX
+#else
+#define A_LESS_THAN_B (-1)
+#define A_GREATER_THAN_B 1
+#endif
+
+
+Datum
+btboolcmp(PG_FUNCTION_ARGS)
+{
+ bool a = PG_GETARG_BOOL(0);
+ bool b = PG_GETARG_BOOL(1);
+
+ PG_RETURN_INT32((int32) a - (int32) b);
+}
+
+Datum
+btint2cmp(PG_FUNCTION_ARGS)
+{
+ int16 a = PG_GETARG_INT16(0);
+ int16 b = PG_GETARG_INT16(1);
+
+ PG_RETURN_INT32((int32) a - (int32) b);
+}
+
+static int
+btint2fastcmp(Datum x, Datum y, SortSupport ssup)
+{
+ int16 a = DatumGetInt16(x);
+ int16 b = DatumGetInt16(y);
+
+ return (int) a - (int) b;
+}
+
+Datum
+btint2sortsupport(PG_FUNCTION_ARGS)
+{
+ SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
+
+ ssup->comparator = btint2fastcmp;
+ PG_RETURN_VOID();
+}
+
+Datum
+btint4cmp(PG_FUNCTION_ARGS)
+{
+ int32 a = PG_GETARG_INT32(0);
+ int32 b = PG_GETARG_INT32(1);
+
+ if (a > b)
+ PG_RETURN_INT32(A_GREATER_THAN_B);
+ else if (a == b)
+ PG_RETURN_INT32(0);
+ else
+ PG_RETURN_INT32(A_LESS_THAN_B);
+}
+
+static int
+btint4fastcmp(Datum x, Datum y, SortSupport ssup)
+{
+ int32 a = DatumGetInt32(x);
+ int32 b = DatumGetInt32(y);
+
+ if (a > b)
+ return A_GREATER_THAN_B;
+ else if (a == b)
+ return 0;
+ else
+ return A_LESS_THAN_B;
+}
+
+Datum
+btint4sortsupport(PG_FUNCTION_ARGS)
+{
+ SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
+
+ ssup->comparator = btint4fastcmp;
+ PG_RETURN_VOID();
+}
+
+Datum
+btint8cmp(PG_FUNCTION_ARGS)
+{
+ int64 a = PG_GETARG_INT64(0);
+ int64 b = PG_GETARG_INT64(1);
+
+ if (a > b)
+ PG_RETURN_INT32(A_GREATER_THAN_B);
+ else if (a == b)
+ PG_RETURN_INT32(0);
+ else
+ PG_RETURN_INT32(A_LESS_THAN_B);
+}
+
+static int
+btint8fastcmp(Datum x, Datum y, SortSupport ssup)
+{
+ int64 a = DatumGetInt64(x);
+ int64 b = DatumGetInt64(y);
+
+ if (a > b)
+ return A_GREATER_THAN_B;
+ else if (a == b)
+ return 0;
+ else
+ return A_LESS_THAN_B;
+}
+
+Datum
+btint8sortsupport(PG_FUNCTION_ARGS)
+{
+ SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
+
+ ssup->comparator = btint8fastcmp;
+ PG_RETURN_VOID();
+}
+
+Datum
+btint48cmp(PG_FUNCTION_ARGS)
+{
+ int32 a = PG_GETARG_INT32(0);
+ int64 b = PG_GETARG_INT64(1);
+
+ if (a > b)
+ PG_RETURN_INT32(A_GREATER_THAN_B);
+ else if (a == b)
+ PG_RETURN_INT32(0);
+ else
+ PG_RETURN_INT32(A_LESS_THAN_B);
+}
+
+Datum
+btint84cmp(PG_FUNCTION_ARGS)
+{
+ int64 a = PG_GETARG_INT64(0);
+ int32 b = PG_GETARG_INT32(1);
+
+ if (a > b)
+ PG_RETURN_INT32(A_GREATER_THAN_B);
+ else if (a == b)
+ PG_RETURN_INT32(0);
+ else
+ PG_RETURN_INT32(A_LESS_THAN_B);
+}
+
+Datum
+btint24cmp(PG_FUNCTION_ARGS)
+{
+ int16 a = PG_GETARG_INT16(0);
+ int32 b = PG_GETARG_INT32(1);
+
+ if (a > b)
+ PG_RETURN_INT32(A_GREATER_THAN_B);
+ else if (a == b)
+ PG_RETURN_INT32(0);
+ else
+ PG_RETURN_INT32(A_LESS_THAN_B);
+}
+
+Datum
+btint42cmp(PG_FUNCTION_ARGS)
+{
+ int32 a = PG_GETARG_INT32(0);
+ int16 b = PG_GETARG_INT16(1);
+
+ if (a > b)
+ PG_RETURN_INT32(A_GREATER_THAN_B);
+ else if (a == b)
+ PG_RETURN_INT32(0);
+ else
+ PG_RETURN_INT32(A_LESS_THAN_B);
+}
+
+Datum
+btint28cmp(PG_FUNCTION_ARGS)
+{
+ int16 a = PG_GETARG_INT16(0);
+ int64 b = PG_GETARG_INT64(1);
+
+ if (a > b)
+ PG_RETURN_INT32(A_GREATER_THAN_B);
+ else if (a == b)
+ PG_RETURN_INT32(0);
+ else
+ PG_RETURN_INT32(A_LESS_THAN_B);
+}
+
+Datum
+btint82cmp(PG_FUNCTION_ARGS)
+{
+ int64 a = PG_GETARG_INT64(0);
+ int16 b = PG_GETARG_INT16(1);
+
+ if (a > b)
+ PG_RETURN_INT32(A_GREATER_THAN_B);
+ else if (a == b)
+ PG_RETURN_INT32(0);
+ else
+ PG_RETURN_INT32(A_LESS_THAN_B);
+}
+
+Datum
+btoidcmp(PG_FUNCTION_ARGS)
+{
+ Oid a = PG_GETARG_OID(0);
+ Oid b = PG_GETARG_OID(1);
+
+ if (a > b)
+ PG_RETURN_INT32(A_GREATER_THAN_B);
+ else if (a == b)
+ PG_RETURN_INT32(0);
+ else
+ PG_RETURN_INT32(A_LESS_THAN_B);
+}
+
+static int
+btoidfastcmp(Datum x, Datum y, SortSupport ssup)
+{
+ Oid a = DatumGetObjectId(x);
+ Oid b = DatumGetObjectId(y);
+
+ if (a > b)
+ return A_GREATER_THAN_B;
+ else if (a == b)
+ return 0;
+ else
+ return A_LESS_THAN_B;
+}
+
+Datum
+btoidsortsupport(PG_FUNCTION_ARGS)
+{
+ SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
+
+ ssup->comparator = btoidfastcmp;
+ PG_RETURN_VOID();
+}
+
+Datum
+btoidvectorcmp(PG_FUNCTION_ARGS)
+{
+ oidvector *a = (oidvector *) PG_GETARG_POINTER(0);
+ oidvector *b = (oidvector *) PG_GETARG_POINTER(1);
+ int i;
+
+ /* We arbitrarily choose to sort first by vector length */
+ if (a->dim1 != b->dim1)
+ PG_RETURN_INT32(a->dim1 - b->dim1);
+
+ for (i = 0; i < a->dim1; i++)
+ {
+ if (a->values[i] != b->values[i])
+ {
+ if (a->values[i] > b->values[i])
+ PG_RETURN_INT32(A_GREATER_THAN_B);
+ else
+ PG_RETURN_INT32(A_LESS_THAN_B);
+ }
+ }
+ PG_RETURN_INT32(0);
+}
+
+Datum
+btcharcmp(PG_FUNCTION_ARGS)
+{
+ char a = PG_GETARG_CHAR(0);
+ char b = PG_GETARG_CHAR(1);
+
+ /* Be careful to compare chars as unsigned */
+ PG_RETURN_INT32((int32) ((uint8) a) - (int32) ((uint8) b));
+}
diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c
new file mode 100644
index 0000000..1cd1b59
--- /dev/null
+++ b/src/backend/access/nbtree/nbtdedup.c
@@ -0,0 +1,1098 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtdedup.c
+ * Deduplicate or bottom-up delete items in Postgres btrees.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtdedup.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "access/nbtxlog.h"
+#include "miscadmin.h"
+#include "utils/rel.h"
+
+static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state,
+ TM_IndexDeleteOp *delstate);
+static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state,
+ OffsetNumber minoff, IndexTuple newitem);
+static void _bt_singleval_fillfactor(Page page, BTDedupState state,
+ Size newitemsz);
+#ifdef USE_ASSERT_CHECKING
+static bool _bt_posting_valid(IndexTuple posting);
+#endif
+
+/*
+ * Perform a deduplication pass.
+ *
+ * The general approach taken here is to perform as much deduplication as
+ * possible to free as much space as possible. Note, however, that "single
+ * value" strategy is used for !bottomupdedup callers when the page is full of
+ * tuples of a single value. Deduplication passes that apply the strategy
+ * will leave behind a few untouched tuples at the end of the page, preparing
+ * the page for an anticipated page split that uses nbtsplitloc.c's own single
+ * value strategy. Our high level goal is to delay merging the untouched
+ * tuples until after the page splits.
+ *
+ * When a call to _bt_bottomupdel_pass() just took place (and failed), our
+ * high level goal is to prevent a page split entirely by buying more time.
+ * We still hope that a page split can be avoided altogether. That's why
+ * single value strategy is not even considered for bottomupdedup callers.
+ *
+ * The page will have to be split if we cannot successfully free at least
+ * newitemsz (we also need space for newitem's line pointer, which isn't
+ * included in caller's newitemsz).
+ *
+ * Note: Caller should have already deleted all existing items with their
+ * LP_DEAD bits set.
+ */
+void
+_bt_dedup_pass(Relation rel, Buffer buf, Relation heapRel, IndexTuple newitem,
+ Size newitemsz, bool bottomupdedup)
+{
+ OffsetNumber offnum,
+ minoff,
+ maxoff;
+ Page page = BufferGetPage(buf);
+ BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ Page newpage;
+ BTDedupState state;
+ Size pagesaving PG_USED_FOR_ASSERTS_ONLY = 0;
+ bool singlevalstrat = false;
+ int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+
+ /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
+ newitemsz += sizeof(ItemIdData);
+
+ /*
+ * Initialize deduplication state.
+ *
+ * It would be possible for maxpostingsize (limit on posting list tuple
+ * size) to be set to one third of the page. However, it seems like a
+ * good idea to limit the size of posting lists to one sixth of a page.
+ * That ought to leave us with a good split point when pages full of
+ * duplicates can be split several times.
+ */
+ state = (BTDedupState) palloc(sizeof(BTDedupStateData));
+ state->deduplicate = true;
+ state->nmaxitems = 0;
+ state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK);
+ /* Metadata about base tuple of current pending posting list */
+ state->base = NULL;
+ state->baseoff = InvalidOffsetNumber;
+ state->basetupsize = 0;
+ /* Metadata about current pending posting list TIDs */
+ state->htids = palloc(state->maxpostingsize);
+ state->nhtids = 0;
+ state->nitems = 0;
+ /* Size of all physical tuples to be replaced by pending posting list */
+ state->phystupsize = 0;
+ /* nintervals should be initialized to zero */
+ state->nintervals = 0;
+
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /*
+ * Consider applying "single value" strategy, though only if the page
+ * seems likely to be split in the near future
+ */
+ if (!bottomupdedup)
+ singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
+
+ /*
+ * Deduplicate items from page, and write them to newpage.
+ *
+ * Copy the original page's LSN into newpage copy. This will become the
+ * updated version of the page. We need this because XLogInsert will
+ * examine the LSN and possibly dump it in a page image.
+ */
+ newpage = PageGetTempPageCopySpecial(page);
+ PageSetLSN(newpage, PageGetLSN(page));
+
+ /* Copy high key, if any */
+ if (!P_RIGHTMOST(opaque))
+ {
+ ItemId hitemid = PageGetItemId(page, P_HIKEY);
+ Size hitemsz = ItemIdGetLength(hitemid);
+ IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid);
+
+ if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "deduplication failed to add highkey");
+ }
+
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemid = PageGetItemId(page, offnum);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
+
+ Assert(!ItemIdIsDead(itemid));
+
+ if (offnum == minoff)
+ {
+ /*
+ * No previous/base tuple for the data item -- use the data item
+ * as base tuple of pending posting list
+ */
+ _bt_dedup_start_pending(state, itup, offnum);
+ }
+ else if (state->deduplicate &&
+ _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
+ _bt_dedup_save_htid(state, itup))
+ {
+ /*
+ * Tuple is equal to base tuple of pending posting list. Heap
+ * TID(s) for itup have been saved in state.
+ */
+ }
+ else
+ {
+ /*
+ * Tuple is not equal to pending posting list tuple, or
+ * _bt_dedup_save_htid() opted to not merge current item into
+ * pending posting list for some other reason (e.g., adding more
+ * TIDs would have caused posting list to exceed current
+ * maxpostingsize).
+ *
+ * If state contains pending posting list with more than one item,
+ * form new posting tuple, and actually update the page. Else
+ * reset the state and move on without modifying the page.
+ */
+ pagesaving += _bt_dedup_finish_pending(newpage, state);
+
+ if (singlevalstrat)
+ {
+ /*
+ * Single value strategy's extra steps.
+ *
+ * Lower maxpostingsize for sixth and final large posting list
+ * tuple at the point where 5 maxpostingsize-capped tuples
+ * have either been formed or observed.
+ *
+ * When a sixth maxpostingsize-capped item is formed/observed,
+ * stop merging together tuples altogether. The few tuples
+ * that remain at the end of the page won't be merged together
+ * at all (at least not until after a future page split takes
+ * place).
+ */
+ if (state->nmaxitems == 5)
+ _bt_singleval_fillfactor(page, state, newitemsz);
+ else if (state->nmaxitems == 6)
+ {
+ state->deduplicate = false;
+ singlevalstrat = false; /* won't be back here */
+ }
+ }
+
+ /* itup starts new pending posting list */
+ _bt_dedup_start_pending(state, itup, offnum);
+ }
+ }
+
+ /* Handle the last item */
+ pagesaving += _bt_dedup_finish_pending(newpage, state);
+
+ /*
+ * If no items suitable for deduplication were found, newpage must be
+ * exactly the same as the original page, so just return from function.
+ *
+ * We could determine whether or not to proceed on the basis the space
+ * savings being sufficient to avoid an immediate page split instead. We
+ * don't do that because there is some small value in nbtsplitloc.c always
+ * operating against a page that is fully deduplicated (apart from
+ * newitem). Besides, most of the cost has already been paid.
+ */
+ if (state->nintervals == 0)
+ {
+ /* cannot leak memory here */
+ pfree(newpage);
+ pfree(state->htids);
+ pfree(state);
+ return;
+ }
+
+ /*
+ * By here, it's clear that deduplication will definitely go ahead.
+ *
+ * Clear the BTP_HAS_GARBAGE page flag. The index must be a heapkeyspace
+ * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
+ * But keep things tidy.
+ */
+ if (P_HAS_GARBAGE(opaque))
+ {
+ BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(newpage);
+
+ nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+ }
+
+ START_CRIT_SECTION();
+
+ PageRestoreTempPage(newpage, page);
+ MarkBufferDirty(buf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ XLogRecPtr recptr;
+ xl_btree_dedup xlrec_dedup;
+
+ xlrec_dedup.nintervals = state->nintervals;
+
+ XLogBeginInsert();
+ XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
+ XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup);
+
+ /*
+ * The intervals array is not in the buffer, but pretend that it is.
+ * When XLogInsert stores the whole buffer, the array need not be
+ * stored too.
+ */
+ XLogRegisterBufData(0, (char *) state->intervals,
+ state->nintervals * sizeof(BTDedupInterval));
+
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
+
+ PageSetLSN(page, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /* Local space accounting should agree with page accounting */
+ Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
+
+ /* cannot leak memory here */
+ pfree(state->htids);
+ pfree(state);
+}
+
+/*
+ * Perform bottom-up index deletion pass.
+ *
+ * See if duplicate index tuples (plus certain nearby tuples) are eligible to
+ * be deleted via bottom-up index deletion. The high level goal here is to
+ * entirely prevent "unnecessary" page splits caused by MVCC version churn
+ * from UPDATEs (when the UPDATEs don't logically modify any of the columns
+ * covered by the 'rel' index). This is qualitative, not quantitative -- we
+ * do not particularly care about once-off opportunities to delete many index
+ * tuples together.
+ *
+ * See nbtree/README for details on the design of nbtree bottom-up deletion.
+ * See access/tableam.h for a description of how we're expected to cooperate
+ * with the tableam.
+ *
+ * Returns true on success, in which case caller can assume page split will be
+ * avoided for a reasonable amount of time. Returns false when caller should
+ * deduplicate the page (if possible at all).
+ *
+ * Note: Occasionally we return true despite failing to delete enough items to
+ * avoid a split. This makes caller skip deduplication and go split the page
+ * right away. Our return value is always just advisory information.
+ *
+ * Note: Caller should have already deleted all existing items with their
+ * LP_DEAD bits set.
+ */
+bool
+_bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel,
+ Size newitemsz)
+{
+ OffsetNumber offnum,
+ minoff,
+ maxoff;
+ Page page = BufferGetPage(buf);
+ BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ BTDedupState state;
+ TM_IndexDeleteOp delstate;
+ bool neverdedup;
+ int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+
+ /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
+ newitemsz += sizeof(ItemIdData);
+
+ /* Initialize deduplication state */
+ state = (BTDedupState) palloc(sizeof(BTDedupStateData));
+ state->deduplicate = true;
+ state->nmaxitems = 0;
+ state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */
+ state->base = NULL;
+ state->baseoff = InvalidOffsetNumber;
+ state->basetupsize = 0;
+ state->htids = palloc(state->maxpostingsize);
+ state->nhtids = 0;
+ state->nitems = 0;
+ state->phystupsize = 0;
+ state->nintervals = 0;
+
+ /*
+ * Initialize tableam state that describes bottom-up index deletion
+ * operation.
+ *
+ * We'll go on to ask the tableam to search for TIDs whose index tuples we
+ * can safely delete. The tableam will search until our leaf page space
+ * target is satisfied, or until the cost of continuing with the tableam
+ * operation seems too high. It focuses its efforts on TIDs associated
+ * with duplicate index tuples that we mark "promising".
+ *
+ * This space target is a little arbitrary. The tableam must be able to
+ * keep the costs and benefits in balance. We provide the tableam with
+ * exhaustive information about what might work, without directly
+ * concerning ourselves with avoiding work during the tableam call. Our
+ * role in costing the bottom-up deletion process is strictly advisory.
+ */
+ delstate.bottomup = true;
+ delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz);
+ delstate.ndeltids = 0;
+ delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
+ delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
+
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemid = PageGetItemId(page, offnum);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
+
+ Assert(!ItemIdIsDead(itemid));
+
+ if (offnum == minoff)
+ {
+ /* itup starts first pending interval */
+ _bt_dedup_start_pending(state, itup, offnum);
+ }
+ else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
+ _bt_dedup_save_htid(state, itup))
+ {
+ /* Tuple is equal; just added its TIDs to pending interval */
+ }
+ else
+ {
+ /* Finalize interval -- move its TIDs to delete state */
+ _bt_bottomupdel_finish_pending(page, state, &delstate);
+
+ /* itup starts new pending interval */
+ _bt_dedup_start_pending(state, itup, offnum);
+ }
+ }
+ /* Finalize final interval -- move its TIDs to delete state */
+ _bt_bottomupdel_finish_pending(page, state, &delstate);
+
+ /*
+ * We don't give up now in the event of having few (or even zero)
+ * promising tuples for the tableam because it's not up to us as the index
+ * AM to manage costs (note that the tableam might have heuristics of its
+ * own that work out what to do). We should at least avoid having our
+ * caller do a useless deduplication pass after we return in the event of
+ * zero promising tuples, though.
+ */
+ neverdedup = false;
+ if (state->nintervals == 0)
+ neverdedup = true;
+
+ pfree(state->htids);
+ pfree(state);
+
+ /* Ask tableam which TIDs are deletable, then physically delete them */
+ _bt_delitems_delete_check(rel, buf, heapRel, &delstate);
+
+ pfree(delstate.deltids);
+ pfree(delstate.status);
+
+ /* Report "success" to caller unconditionally to avoid deduplication */
+ if (neverdedup)
+ return true;
+
+ /* Don't dedup when we won't end up back here any time soon anyway */
+ return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz);
+}
+
+/*
+ * Create a new pending posting list tuple based on caller's base tuple.
+ *
+ * Every tuple processed by deduplication either becomes the base tuple for a
+ * posting list, or gets its heap TID(s) accepted into a pending posting list.
+ * A tuple that starts out as the base tuple for a posting list will only
+ * actually be rewritten within _bt_dedup_finish_pending() when it turns out
+ * that there are duplicates that can be merged into the base tuple.
+ */
+void
+_bt_dedup_start_pending(BTDedupState state, IndexTuple base,
+ OffsetNumber baseoff)
+{
+ Assert(state->nhtids == 0);
+ Assert(state->nitems == 0);
+ Assert(!BTreeTupleIsPivot(base));
+
+ /*
+ * Copy heap TID(s) from new base tuple for new candidate posting list
+ * into working state's array
+ */
+ if (!BTreeTupleIsPosting(base))
+ {
+ memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
+ state->nhtids = 1;
+ state->basetupsize = IndexTupleSize(base);
+ }
+ else
+ {
+ int nposting;
+
+ nposting = BTreeTupleGetNPosting(base);
+ memcpy(state->htids, BTreeTupleGetPosting(base),
+ sizeof(ItemPointerData) * nposting);
+ state->nhtids = nposting;
+ /* basetupsize should not include existing posting list */
+ state->basetupsize = BTreeTupleGetPostingOffset(base);
+ }
+
+ /*
+ * Save new base tuple itself -- it'll be needed if we actually create a
+ * new posting list from new pending posting list.
+ *
+ * Must maintain physical size of all existing tuples (including line
+ * pointer overhead) so that we can calculate space savings on page.
+ */
+ state->nitems = 1;
+ state->base = base;
+ state->baseoff = baseoff;
+ state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
+ /* Also save baseoff in pending state for interval */
+ state->intervals[state->nintervals].baseoff = state->baseoff;
+}
+
+/*
+ * Save itup heap TID(s) into pending posting list where possible.
+ *
+ * Returns bool indicating if the pending posting list managed by state now
+ * includes itup's heap TID(s).
+ */
+bool
+_bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
+{
+ int nhtids;
+ ItemPointer htids;
+ Size mergedtupsz;
+
+ Assert(!BTreeTupleIsPivot(itup));
+
+ if (!BTreeTupleIsPosting(itup))
+ {
+ nhtids = 1;
+ htids = &itup->t_tid;
+ }
+ else
+ {
+ nhtids = BTreeTupleGetNPosting(itup);
+ htids = BTreeTupleGetPosting(itup);
+ }
+
+ /*
+ * Don't append (have caller finish pending posting list as-is) if
+ * appending heap TID(s) from itup would put us over maxpostingsize limit.
+ *
+ * This calculation needs to match the code used within _bt_form_posting()
+ * for new posting list tuples.
+ */
+ mergedtupsz = MAXALIGN(state->basetupsize +
+ (state->nhtids + nhtids) * sizeof(ItemPointerData));
+
+ if (mergedtupsz > state->maxpostingsize)
+ {
+ /*
+ * Count this as an oversized item for single value strategy, though
+ * only when there are 50 TIDs in the final posting list tuple. This
+ * limit (which is fairly arbitrary) avoids confusion about how many
+ * 1/6 of a page tuples have been encountered/created by the current
+ * deduplication pass.
+ *
+ * Note: We deliberately don't consider which deduplication pass
+ * merged together tuples to create this item (could be a previous
+ * deduplication pass, or current pass). See _bt_do_singleval()
+ * comments.
+ */
+ if (state->nhtids > 50)
+ state->nmaxitems++;
+
+ return false;
+ }
+
+ /*
+ * Save heap TIDs to pending posting list tuple -- itup can be merged into
+ * pending posting list
+ */
+ state->nitems++;
+ memcpy(state->htids + state->nhtids, htids,
+ sizeof(ItemPointerData) * nhtids);
+ state->nhtids += nhtids;
+ state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
+
+ return true;
+}
+
+/*
+ * Finalize pending posting list tuple, and add it to the page. Final tuple
+ * is based on saved base tuple, and saved list of heap TIDs.
+ *
+ * Returns space saving from deduplicating to make a new posting list tuple.
+ * Note that this includes line pointer overhead. This is zero in the case
+ * where no deduplication was possible.
+ */
+Size
+_bt_dedup_finish_pending(Page newpage, BTDedupState state)
+{
+ OffsetNumber tupoff;
+ Size tuplesz;
+ Size spacesaving;
+
+ Assert(state->nitems > 0);
+ Assert(state->nitems <= state->nhtids);
+ Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
+
+ tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
+ if (state->nitems == 1)
+ {
+ /* Use original, unchanged base tuple */
+ tuplesz = IndexTupleSize(state->base);
+ if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "deduplication failed to add tuple to page");
+
+ spacesaving = 0;
+ }
+ else
+ {
+ IndexTuple final;
+
+ /* Form a tuple with a posting list */
+ final = _bt_form_posting(state->base, state->htids, state->nhtids);
+ tuplesz = IndexTupleSize(final);
+ Assert(tuplesz <= state->maxpostingsize);
+
+ /* Save final number of items for posting list */
+ state->intervals[state->nintervals].nitems = state->nitems;
+
+ Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
+ if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false,
+ false) == InvalidOffsetNumber)
+ elog(ERROR, "deduplication failed to add tuple to page");
+
+ pfree(final);
+ spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
+ /* Increment nintervals, since we wrote a new posting list tuple */
+ state->nintervals++;
+ Assert(spacesaving > 0 && spacesaving < BLCKSZ);
+ }
+
+ /* Reset state for next pending posting list */
+ state->nhtids = 0;
+ state->nitems = 0;
+ state->phystupsize = 0;
+
+ return spacesaving;
+}
+
+/*
+ * Finalize interval during bottom-up index deletion.
+ *
+ * During a bottom-up pass we expect that TIDs will be recorded in dedup state
+ * first, and then get moved over to delstate (in variable-sized batches) by
+ * calling here. Call here happens when the number of TIDs in a dedup
+ * interval is known, and interval gets finalized (i.e. when caller sees next
+ * tuple on the page is not a duplicate, or when caller runs out of tuples to
+ * process from leaf page).
+ *
+ * This is where bottom-up deletion determines and remembers which entries are
+ * duplicates. This will be important information to the tableam delete
+ * infrastructure later on. Plain index tuple duplicates are marked
+ * "promising" here, per tableam contract.
+ *
+ * Our approach to marking entries whose TIDs come from posting lists is more
+ * complicated. Posting lists can only be formed by a deduplication pass (or
+ * during an index build), so recent version churn affecting the pointed-to
+ * logical rows is not particularly likely. We may still give a weak signal
+ * about posting list tuples' entries (by marking just one of its TIDs/entries
+ * promising), though this is only a possibility in the event of further
+ * duplicate index tuples in final interval that covers posting list tuple (as
+ * in the plain tuple case). A weak signal/hint will be useful to the tableam
+ * when it has no stronger signal to go with for the deletion operation as a
+ * whole.
+ *
+ * The heuristics we use work well in practice because we only need to give
+ * the tableam the right _general_ idea about where to look. Garbage tends to
+ * naturally get concentrated in relatively few table blocks with workloads
+ * that bottom-up deletion targets. The tableam cannot possibly rank all
+ * available table blocks sensibly based on the hints we provide, but that's
+ * okay -- only the extremes matter. The tableam just needs to be able to
+ * predict which few table blocks will have the most tuples that are safe to
+ * delete for each deletion operation, with low variance across related
+ * deletion operations.
+ */
+static void
+_bt_bottomupdel_finish_pending(Page page, BTDedupState state,
+ TM_IndexDeleteOp *delstate)
+{
+ bool dupinterval = (state->nitems > 1);
+
+ Assert(state->nitems > 0);
+ Assert(state->nitems <= state->nhtids);
+ Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
+
+ for (int i = 0; i < state->nitems; i++)
+ {
+ OffsetNumber offnum = state->baseoff + i;
+ ItemId itemid = PageGetItemId(page, offnum);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
+ TM_IndexDelete *ideltid = &delstate->deltids[delstate->ndeltids];
+ TM_IndexStatus *istatus = &delstate->status[delstate->ndeltids];
+
+ if (!BTreeTupleIsPosting(itup))
+ {
+ /* Simple case: A plain non-pivot tuple */
+ ideltid->tid = itup->t_tid;
+ ideltid->id = delstate->ndeltids;
+ istatus->idxoffnum = offnum;
+ istatus->knowndeletable = false; /* for now */
+ istatus->promising = dupinterval; /* simple rule */
+ istatus->freespace = ItemIdGetLength(itemid) + sizeof(ItemIdData);
+
+ delstate->ndeltids++;
+ }
+ else
+ {
+ /*
+ * Complicated case: A posting list tuple.
+ *
+ * We make the conservative assumption that there can only be at
+ * most one affected logical row per posting list tuple. There
+ * will be at most one promising entry in deltids to represent
+ * this presumed lone logical row. Note that this isn't even
+ * considered unless the posting list tuple is also in an interval
+ * of duplicates -- this complicated rule is just a variant of the
+ * simple rule used to decide if plain index tuples are promising.
+ */
+ int nitem = BTreeTupleGetNPosting(itup);
+ bool firstpromising = false;
+ bool lastpromising = false;
+
+ Assert(_bt_posting_valid(itup));
+
+ if (dupinterval)
+ {
+ /*
+ * Complicated rule: either the first or last TID in the
+ * posting list gets marked promising (if any at all)
+ */
+ BlockNumber minblocklist,
+ midblocklist,
+ maxblocklist;
+ ItemPointer mintid,
+ midtid,
+ maxtid;
+
+ mintid = BTreeTupleGetHeapTID(itup);
+ midtid = BTreeTupleGetPostingN(itup, nitem / 2);
+ maxtid = BTreeTupleGetMaxHeapTID(itup);
+ minblocklist = ItemPointerGetBlockNumber(mintid);
+ midblocklist = ItemPointerGetBlockNumber(midtid);
+ maxblocklist = ItemPointerGetBlockNumber(maxtid);
+
+ /* Only entry with predominant table block can be promising */
+ firstpromising = (minblocklist == midblocklist);
+ lastpromising = (!firstpromising &&
+ midblocklist == maxblocklist);
+ }
+
+ for (int p = 0; p < nitem; p++)
+ {
+ ItemPointer htid = BTreeTupleGetPostingN(itup, p);
+
+ ideltid->tid = *htid;
+ ideltid->id = delstate->ndeltids;
+ istatus->idxoffnum = offnum;
+ istatus->knowndeletable = false; /* for now */
+ istatus->promising = false;
+ if ((firstpromising && p == 0) ||
+ (lastpromising && p == nitem - 1))
+ istatus->promising = true;
+ istatus->freespace = sizeof(ItemPointerData); /* at worst */
+
+ ideltid++;
+ istatus++;
+ delstate->ndeltids++;
+ }
+ }
+ }
+
+ if (dupinterval)
+ {
+ state->intervals[state->nintervals].nitems = state->nitems;
+ state->nintervals++;
+ }
+
+ /* Reset state for next interval */
+ state->nhtids = 0;
+ state->nitems = 0;
+ state->phystupsize = 0;
+}
+
+/*
+ * Determine if page non-pivot tuples (data items) are all duplicates of the
+ * same value -- if they are, deduplication's "single value" strategy should
+ * be applied. The general goal of this strategy is to ensure that
+ * nbtsplitloc.c (which uses its own single value strategy) will find a useful
+ * split point as further duplicates are inserted, and successive rightmost
+ * page splits occur among pages that store the same duplicate value. When
+ * the page finally splits, it should end up BTREE_SINGLEVAL_FILLFACTOR% full,
+ * just like it would if deduplication were disabled.
+ *
+ * We expect that affected workloads will require _several_ single value
+ * strategy deduplication passes (over a page that only stores duplicates)
+ * before the page is finally split. The first deduplication pass should only
+ * find regular non-pivot tuples. Later deduplication passes will find
+ * existing maxpostingsize-capped posting list tuples, which must be skipped
+ * over. The penultimate pass is generally the first pass that actually
+ * reaches _bt_singleval_fillfactor(), and so will deliberately leave behind a
+ * few untouched non-pivot tuples. The final deduplication pass won't free
+ * any space -- it will skip over everything without merging anything (it
+ * retraces the steps of the penultimate pass).
+ *
+ * Fortunately, having several passes isn't too expensive. Each pass (after
+ * the first pass) won't spend many cycles on the large posting list tuples
+ * left by previous passes. Each pass will find a large contiguous group of
+ * smaller duplicate tuples to merge together at the end of the page.
+ */
+static bool
+_bt_do_singleval(Relation rel, Page page, BTDedupState state,
+ OffsetNumber minoff, IndexTuple newitem)
+{
+ int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+ ItemId itemid;
+ IndexTuple itup;
+
+ itemid = PageGetItemId(page, minoff);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+
+ if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
+ {
+ itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
+ itup = (IndexTuple) PageGetItem(page, itemid);
+
+ if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Lower maxpostingsize when using "single value" strategy, to avoid a sixth
+ * and final maxpostingsize-capped tuple. The sixth and final posting list
+ * tuple will end up somewhat smaller than the first five. (Note: The first
+ * five tuples could actually just be very large duplicate tuples that
+ * couldn't be merged together at all. Deduplication will simply not modify
+ * the page when that happens.)
+ *
+ * When there are six posting lists on the page (after current deduplication
+ * pass goes on to create/observe a sixth very large tuple), caller should end
+ * its deduplication pass. It isn't useful to try to deduplicate items that
+ * are supposed to end up on the new right sibling page following the
+ * anticipated page split. A future deduplication pass of future right
+ * sibling page might take care of it. (This is why the first single value
+ * strategy deduplication pass for a given leaf page will generally find only
+ * plain non-pivot tuples -- see _bt_do_singleval() comments.)
+ */
+static void
+_bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
+{
+ Size leftfree;
+ int reduction;
+
+ /* This calculation needs to match nbtsplitloc.c */
+ leftfree = PageGetPageSize(page) - SizeOfPageHeaderData -
+ MAXALIGN(sizeof(BTPageOpaqueData));
+ /* Subtract size of new high key (includes pivot heap TID space) */
+ leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData));
+
+ /*
+ * Reduce maxpostingsize by an amount equal to target free space on left
+ * half of page
+ */
+ reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0);
+ if (state->maxpostingsize > reduction)
+ state->maxpostingsize -= reduction;
+ else
+ state->maxpostingsize = 0;
+}
+
+/*
+ * Build a posting list tuple based on caller's "base" index tuple and list of
+ * heap TIDs. When nhtids == 1, builds a standard non-pivot tuple without a
+ * posting list. (Posting list tuples can never have a single heap TID, partly
+ * because that ensures that deduplication always reduces final MAXALIGN()'d
+ * size of entire tuple.)
+ *
+ * Convention is that posting list starts at a MAXALIGN()'d offset (rather
+ * than a SHORTALIGN()'d offset), in line with the approach taken when
+ * appending a heap TID to new pivot tuple/high key during suffix truncation.
+ * This sometimes wastes a little space that was only needed as alignment
+ * padding in the original tuple. Following this convention simplifies the
+ * space accounting used when deduplicating a page (the same convention
+ * simplifies the accounting for choosing a point to split a page at).
+ *
+ * Note: Caller's "htids" array must be unique and already in ascending TID
+ * order. Any existing heap TIDs from "base" won't automatically appear in
+ * returned posting list tuple (they must be included in htids array.)
+ */
+IndexTuple
+_bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
+{
+ uint32 keysize,
+ newsize;
+ IndexTuple itup;
+
+ if (BTreeTupleIsPosting(base))
+ keysize = BTreeTupleGetPostingOffset(base);
+ else
+ keysize = IndexTupleSize(base);
+
+ Assert(!BTreeTupleIsPivot(base));
+ Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
+ Assert(keysize == MAXALIGN(keysize));
+
+ /* Determine final size of new tuple */
+ if (nhtids > 1)
+ newsize = MAXALIGN(keysize +
+ nhtids * sizeof(ItemPointerData));
+ else
+ newsize = keysize;
+
+ Assert(newsize <= INDEX_SIZE_MASK);
+ Assert(newsize == MAXALIGN(newsize));
+
+ /* Allocate memory using palloc0() (matches index_form_tuple()) */
+ itup = palloc0(newsize);
+ memcpy(itup, base, keysize);
+ itup->t_info &= ~INDEX_SIZE_MASK;
+ itup->t_info |= newsize;
+ if (nhtids > 1)
+ {
+ /* Form posting list tuple */
+ BTreeTupleSetPosting(itup, nhtids, keysize);
+ memcpy(BTreeTupleGetPosting(itup), htids,
+ sizeof(ItemPointerData) * nhtids);
+ Assert(_bt_posting_valid(itup));
+ }
+ else
+ {
+ /* Form standard non-pivot tuple */
+ itup->t_info &= ~INDEX_ALT_TID_MASK;
+ ItemPointerCopy(htids, &itup->t_tid);
+ Assert(ItemPointerIsValid(&itup->t_tid));
+ }
+
+ return itup;
+}
+
+/*
+ * Generate a replacement tuple by "updating" a posting list tuple so that it
+ * no longer has TIDs that need to be deleted.
+ *
+ * Used by both VACUUM and index deletion. Caller's vacposting argument
+ * points to the existing posting list tuple to be updated.
+ *
+ * On return, caller's vacposting argument will point to final "updated"
+ * tuple, which will be palloc()'d in caller's memory context.
+ */
+void
+_bt_update_posting(BTVacuumPosting vacposting)
+{
+ IndexTuple origtuple = vacposting->itup;
+ uint32 keysize,
+ newsize;
+ IndexTuple itup;
+ int nhtids;
+ int ui,
+ d;
+ ItemPointer htids;
+
+ nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;
+
+ Assert(_bt_posting_valid(origtuple));
+ Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));
+
+ /*
+ * Determine final size of new tuple.
+ *
+ * This calculation needs to match the code used within _bt_form_posting()
+ * for new posting list tuples. We avoid calling _bt_form_posting() here
+ * to save ourselves a second memory allocation for a htids workspace.
+ */
+ keysize = BTreeTupleGetPostingOffset(origtuple);
+ if (nhtids > 1)
+ newsize = MAXALIGN(keysize +
+ nhtids * sizeof(ItemPointerData));
+ else
+ newsize = keysize;
+
+ Assert(newsize <= INDEX_SIZE_MASK);
+ Assert(newsize == MAXALIGN(newsize));
+
+ /* Allocate memory using palloc0() (matches index_form_tuple()) */
+ itup = palloc0(newsize);
+ memcpy(itup, origtuple, keysize);
+ itup->t_info &= ~INDEX_SIZE_MASK;
+ itup->t_info |= newsize;
+
+ if (nhtids > 1)
+ {
+ /* Form posting list tuple */
+ BTreeTupleSetPosting(itup, nhtids, keysize);
+ htids = BTreeTupleGetPosting(itup);
+ }
+ else
+ {
+ /* Form standard non-pivot tuple */
+ itup->t_info &= ~INDEX_ALT_TID_MASK;
+ htids = &itup->t_tid;
+ }
+
+ ui = 0;
+ d = 0;
+ for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
+ {
+ if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
+ {
+ d++;
+ continue;
+ }
+ htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
+ }
+ Assert(ui == nhtids);
+ Assert(d == vacposting->ndeletedtids);
+ Assert(nhtids == 1 || _bt_posting_valid(itup));
+ Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid));
+
+ /* vacposting arg's itup will now point to updated version */
+ vacposting->itup = itup;
+}
+
+/*
+ * Prepare for a posting list split by swapping heap TID in newitem with heap
+ * TID from original posting list (the 'oposting' heap TID located at offset
+ * 'postingoff'). Modifies newitem, so caller should pass their own private
+ * copy that can safely be modified.
+ *
+ * Returns new posting list tuple, which is palloc()'d in caller's context.
+ * This is guaranteed to be the same size as 'oposting'. Modified newitem is
+ * what caller actually inserts. (This happens inside the same critical
+ * section that performs an in-place update of old posting list using new
+ * posting list returned here.)
+ *
+ * While the keys from newitem and oposting must be opclass equal, and must
+ * generate identical output when run through the underlying type's output
+ * function, it doesn't follow that their representations match exactly.
+ * Caller must avoid assuming that there can't be representational differences
+ * that make datums from oposting bigger or smaller than the corresponding
+ * datums from newitem. For example, differences in TOAST input state might
+ * break a faulty assumption about tuple size (the executor is entitled to
+ * apply TOAST compression based on its own criteria). It also seems possible
+ * that further representational variation will be introduced in the future,
+ * in order to support nbtree features like page-level prefix compression.
+ *
+ * See nbtree/README for details on the design of posting list splits.
+ */
+IndexTuple
+_bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
+{
+ int nhtids;
+ char *replacepos;
+ char *replaceposright;
+ Size nmovebytes;
+ IndexTuple nposting;
+
+ nhtids = BTreeTupleGetNPosting(oposting);
+ Assert(_bt_posting_valid(oposting));
+
+ /*
+ * The postingoff argument originated as a _bt_binsrch_posting() return
+ * value. It will be 0 in the event of corruption that makes a leaf page
+ * contain a non-pivot tuple that's somehow identical to newitem (no two
+ * non-pivot tuples should ever have the same TID). This has been known
+ * to happen in the field from time to time.
+ *
+ * Perform a basic sanity check to catch this case now.
+ */
+ if (!(postingoff > 0 && postingoff < nhtids))
+ elog(ERROR, "posting list tuple with %d items cannot be split at offset %d",
+ nhtids, postingoff);
+
+ /*
+ * Move item pointers in posting list to make a gap for the new item's
+ * heap TID. We shift TIDs one place to the right, losing original
+ * rightmost TID. (nmovebytes must not include TIDs to the left of
+ * postingoff, nor the existing rightmost/max TID that gets overwritten.)
+ */
+ nposting = CopyIndexTuple(oposting);
+ replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
+ replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
+ nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
+ memmove(replaceposright, replacepos, nmovebytes);
+
+ /* Fill the gap at postingoff with TID of new item (original new TID) */
+ Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
+ ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);
+
+ /* Now copy oposting's rightmost/max TID into new item (final new TID) */
+ ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid);
+
+ Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
+ BTreeTupleGetHeapTID(newitem)) < 0);
+ Assert(_bt_posting_valid(nposting));
+
+ return nposting;
+}
+
+/*
+ * Verify posting list invariants for "posting", which must be a posting list
+ * tuple. Used within assertions.
+ */
+#ifdef USE_ASSERT_CHECKING
+static bool
+_bt_posting_valid(IndexTuple posting)
+{
+ ItemPointerData last;
+ ItemPointer htid;
+
+ if (!BTreeTupleIsPosting(posting) || BTreeTupleGetNPosting(posting) < 2)
+ return false;
+
+ /* Remember first heap TID for loop */
+ ItemPointerCopy(BTreeTupleGetHeapTID(posting), &last);
+ if (!ItemPointerIsValid(&last))
+ return false;
+
+ /* Iterate, starting from second TID */
+ for (int i = 1; i < BTreeTupleGetNPosting(posting); i++)
+ {
+ htid = BTreeTupleGetPostingN(posting, i);
+
+ if (!ItemPointerIsValid(htid))
+ return false;
+ if (ItemPointerCompare(htid, &last) <= 0)
+ return false;
+ ItemPointerCopy(htid, &last);
+ }
+
+ return true;
+}
+#endif
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
new file mode 100644
index 0000000..1241c56
--- /dev/null
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -0,0 +1,3009 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtinsert.c
+ * Item insertion in Lehman and Yao btrees for Postgres.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtinsert.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "access/nbtxlog.h"
+#include "access/transam.h"
+#include "access/xloginsert.h"
+#include "lib/qunique.h"
+#include "miscadmin.h"
+#include "storage/lmgr.h"
+#include "storage/predicate.h"
+#include "storage/smgr.h"
+
+/* Minimum tree height for application of fastpath optimization */
+#define BTREE_FASTPATH_MIN_LEVEL 2
+
+
+static BTStack _bt_search_insert(Relation rel, BTInsertState insertstate);
+static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
+ Relation heapRel,
+ IndexUniqueCheck checkUnique, bool *is_unique,
+ uint32 *speculativeToken);
+static OffsetNumber _bt_findinsertloc(Relation rel,
+ BTInsertState insertstate,
+ bool checkingunique,
+ bool indexUnchanged,
+ BTStack stack,
+ Relation heapRel);
+static void _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack);
+static void _bt_insertonpg(Relation rel, BTScanInsert itup_key,
+ Buffer buf,
+ Buffer cbuf,
+ BTStack stack,
+ IndexTuple itup,
+ Size itemsz,
+ OffsetNumber newitemoff,
+ int postingoff,
+ bool split_only_page);
+static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf,
+ Buffer cbuf, OffsetNumber newitemoff, Size newitemsz,
+ IndexTuple newitem, IndexTuple orignewitem,
+ IndexTuple nposting, uint16 postingoff);
+static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
+ BTStack stack, bool isroot, bool isonly);
+static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
+static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
+ OffsetNumber itup_off, bool newfirstdataitem);
+static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
+ BTInsertState insertstate,
+ bool simpleonly, bool checkingunique,
+ bool uniquedup, bool indexUnchanged);
+static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
+ OffsetNumber *deletable, int ndeletable,
+ IndexTuple newitem, OffsetNumber minoff,
+ OffsetNumber maxoff);
+static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable,
+ int ndeletable, IndexTuple newitem,
+ int *nblocks);
+static inline int _bt_blk_cmp(const void *arg1, const void *arg2);
+
+/*
+ * _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
+ *
+ * This routine is called by the public interface routine, btinsert.
+ * By here, itup is filled in, including the TID.
+ *
+ * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
+ * will allow duplicates. Otherwise (UNIQUE_CHECK_YES or
+ * UNIQUE_CHECK_EXISTING) it will throw an error for a duplicate.
+ * For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
+ * don't actually insert.
+ *
+ * The indexUnchanged executor hint indicates whether itup is from an
+ * UPDATE that didn't logically change the indexed value, but
+ * must nevertheless have a new entry to point to a successor
+ * version.
+ *
+ * The result value is only significant for UNIQUE_CHECK_PARTIAL:
+ * it must be true if the entry is known unique, else false.
+ * (In the current implementation we'll also return true after a
+ * successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
+ * that's just a coding artifact.)
+ */
+bool
+_bt_doinsert(Relation rel, IndexTuple itup,
+ IndexUniqueCheck checkUnique, bool indexUnchanged,
+ Relation heapRel)
+{
+ bool is_unique = false;
+ BTInsertStateData insertstate;
+ BTScanInsert itup_key;
+ BTStack stack;
+ bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
+
+ /* we need an insertion scan key to do our search, so build one */
+ itup_key = _bt_mkscankey(rel, itup);
+
+ if (checkingunique)
+ {
+ if (!itup_key->anynullkeys)
+ {
+ /* No (heapkeyspace) scantid until uniqueness established */
+ itup_key->scantid = NULL;
+ }
+ else
+ {
+ /*
+ * Scan key for new tuple contains NULL key values. Bypass
+ * checkingunique steps. They are unnecessary because core code
+ * considers NULL unequal to every value, including NULL.
+ *
+ * This optimization avoids O(N^2) behavior within the
+ * _bt_findinsertloc() heapkeyspace path when a unique index has a
+ * large number of "duplicates" with NULL key values.
+ */
+ checkingunique = false;
+ /* Tuple is unique in the sense that core code cares about */
+ Assert(checkUnique != UNIQUE_CHECK_EXISTING);
+ is_unique = true;
+ }
+ }
+
+ /*
+ * Fill in the BTInsertState working area, to track the current page and
+ * position within the page to insert on.
+ *
+ * Note that itemsz is passed down to lower level code that deals with
+ * inserting the item. It must be MAXALIGN()'d. This ensures that space
+ * accounting code consistently considers the alignment overhead that we
+ * expect PageAddItem() will add later. (Actually, index_form_tuple() is
+ * already conservative about alignment, but we don't rely on that from
+ * this distance. Besides, preserving the "true" tuple size in index
+ * tuple headers for the benefit of nbtsplitloc.c might happen someday.
+ * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
+ */
+ insertstate.itup = itup;
+ insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
+ insertstate.itup_key = itup_key;
+ insertstate.bounds_valid = false;
+ insertstate.buf = InvalidBuffer;
+ insertstate.postingoff = 0;
+
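+ /* We jump back here to retry after waiting out a conflicting transaction */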
+search:
+
+ /*
+ * Find and lock the leaf page that the tuple should be added to by
+ * searching from the root page. insertstate.buf will hold a buffer that
+ * is locked in exclusive mode afterwards.
+ */
+ stack = _bt_search_insert(rel, &insertstate);
+
+ /*
+ * checkingunique inserts are not allowed to go ahead when two tuples with
+ * equal key attribute values would be visible to new MVCC snapshots once
+ * the xact commits. Check for conflicts in the locked page/buffer (if
+ * needed) here.
+ *
+ * It might be necessary to check a page to the right in _bt_check_unique,
+ * though that should be very rare. In practice the first page the value
+ * could be on (with scantid omitted) is almost always also the only page
+ * that a matching tuple might be found on. This is due to the behavior
+ * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
+ * only be allowed to cross a page boundary when there is no candidate
+ * leaf page split point that avoids it. Also, _bt_check_unique can use
+ * the leaf page high key to determine that there will be no duplicates on
+ * the right sibling without actually visiting it (it uses the high key in
+ * cases where the new item happens to belong at the far right of the leaf
+ * page).
+ *
+ * NOTE: obviously, _bt_check_unique can only detect keys that are already
+ * in the index; so it cannot defend against concurrent insertions of the
+ * same key. We protect against that by means of holding a write lock on
+ * the first page the value could be on, with omitted/-inf value for the
+ * implicit heap TID tiebreaker attribute. Any other would-be inserter of
+ * the same key must acquire a write lock on the same page, so only one
+ * would-be inserter can be making the check at one time. Furthermore,
+ * once we are past the check we hold write locks continuously until we
+ * have performed our insertion, so no later inserter can fail to see our
+ * insertion. (This requires some care in _bt_findinsertloc.)
+ *
+ * If we must wait for another xact, we release the lock while waiting,
+ * and then must perform a new search.
+ *
+ * For a partial uniqueness check, we don't wait for the other xact. Just
+ * let the tuple in and return false for possibly non-unique, or true for
+ * definitely unique.
+ */
+ if (checkingunique)
+ {
+ TransactionId xwait;
+ uint32 speculativeToken;
+
+ xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
+ &is_unique, &speculativeToken);
+
+ if (unlikely(TransactionIdIsValid(xwait)))
+ {
+ /* Have to wait for the other guy ... */
+ _bt_relbuf(rel, insertstate.buf);
+ insertstate.buf = InvalidBuffer;
+
+ /*
+ * If it's a speculative insertion, wait for it to finish (ie. to
+ * go ahead with the insertion, or kill the tuple). Otherwise
+ * wait for the transaction to finish as usual.
+ */
+ if (speculativeToken)
+ SpeculativeInsertionWait(xwait, speculativeToken);
+ else
+ XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
+
+ /* start over... */
+ if (stack)
+ _bt_freestack(stack);
+ goto search;
+ }
+
+ /* Uniqueness is established -- restore heap tid as scantid */
+ if (itup_key->heapkeyspace)
+ itup_key->scantid = &itup->t_tid;
+ }
+
+ if (checkUnique != UNIQUE_CHECK_EXISTING)
+ {
+ OffsetNumber newitemoff;
+
+ /*
+ * The only conflict predicate locking cares about for indexes is when
+ * an index tuple insert conflicts with an existing lock. We don't
+ * know the actual page we're going to insert on for sure just yet in
+ * checkingunique and !heapkeyspace cases, but it's okay to use the
+ * first page the value could be on (with scantid omitted) instead.
+ */
+ CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
+
+ /*
+ * Do the insertion. Note that insertstate contains cached binary
+ * search bounds established within _bt_check_unique when insertion is
+ * checkingunique.
+ */
+ newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
+ indexUnchanged, stack, heapRel);
+ _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
+ itup, insertstate.itemsz, newitemoff,
+ insertstate.postingoff, false);
+ }
+ else
+ {
+ /* just release the buffer */
+ _bt_relbuf(rel, insertstate.buf);
+ }
+
+ /* be tidy */
+ if (stack)
+ _bt_freestack(stack);
+ pfree(itup_key);
+
+ return is_unique;
+}
+
+/*
+ * _bt_search_insert() -- _bt_search() wrapper for inserts
+ *
+ * Search the tree for a particular scankey, or more precisely for the first
+ * leaf page it could be on. Try to make use of the fastpath optimization's
+ * rightmost leaf page cache before actually searching the tree from the root
+ * page, though.
+ *
+ * Return value is a stack of parent-page pointers (though see notes about
+ * fastpath optimization and page splits below). insertstate->buf is set to
+ * the address of the leaf-page buffer, which is write-locked and pinned in
+ * all cases (if necessary by creating a new empty root page for caller).
+ *
+ * The fastpath optimization avoids most of the work of searching the tree
+ * repeatedly when a single backend inserts successive new tuples on the
+ * rightmost leaf page of an index. A backend cache of the rightmost leaf
+ * page is maintained within _bt_insertonpg(), and used here. The cache is
+ * invalidated here when an insert of a non-pivot tuple must take place on a
+ * non-rightmost leaf page.
+ *
+ * The optimization helps with indexes on an auto-incremented field. It also
+ * helps with indexes on datetime columns, as well as indexes with lots of
+ * NULL values. (NULLs usually get inserted in the rightmost page for single
+ * column indexes, since they usually get treated as coming after everything
+ * else in the key space. Individual NULL tuples will generally be placed on
+ * the rightmost leaf page due to the influence of the heap TID column.)
+ *
+ * Note that we avoid applying the optimization when there is insufficient
+ * space on the rightmost page to fit caller's new item. This is necessary
+ * because we'll need to return a real descent stack when a page split is
+ * expected (actually, caller can cope with a leaf page split that uses a NULL
+ * stack, but that's very slow and so must be avoided). Note also that the
+ * fastpath optimization acquires the lock on the page conditionally as a way
+ * of reducing extra contention when there are concurrent insertions into the
+ * rightmost page (we give up if we'd have to wait for the lock). We assume
+ * that it isn't useful to apply the optimization when there is contention,
+ * since each per-backend cache won't stay valid for long.
+ */
+static BTStack
+_bt_search_insert(Relation rel, BTInsertState insertstate)
+{
+ Assert(insertstate->buf == InvalidBuffer);
+ Assert(!insertstate->bounds_valid);
+ Assert(insertstate->postingoff == 0);
+
+ if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
+ {
+ /* Simulate a _bt_getbuf() call with conditional locking */
+ insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
+ if (_bt_conditionallockbuf(rel, insertstate->buf))
+ {
+ Page page;
+ BTPageOpaque opaque;
+
+ _bt_checkpage(rel, insertstate->buf);
+ page = BufferGetPage(insertstate->buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * Check if the page is still the rightmost leaf page and has
+ * enough free space to accommodate the new tuple. Also check
+ * that the insertion scan key is strictly greater than the first
+ * non-pivot tuple on the page. (Note that we expect itup_key's
+ * scantid to be unset when our caller is a checkingunique
+ * inserter.)
+ */
+ if (P_RIGHTMOST(opaque) &&
+ P_ISLEAF(opaque) &&
+ !P_IGNORE(opaque) &&
+ PageGetFreeSpace(page) > insertstate->itemsz &&
+ PageGetMaxOffsetNumber(page) >= P_HIKEY &&
+ _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
+ {
+ /*
+ * Caller can use the fastpath optimization because cached
+ * block is still rightmost leaf page, which can fit caller's
+ * new tuple without splitting. Keep block in local cache for
+ * next insert, and have caller use NULL stack.
+ *
+ * Note that _bt_insert_parent() has an assertion that catches
+ * leaf page splits that somehow follow from a fastpath insert
+ * (it should only be passed a NULL stack when it must deal
+ * with a concurrent root page split, and never because a NULL
+ * stack was returned here).
+ */
+ return NULL;
+ }
+
+ /* Page unsuitable for caller, drop lock and pin */
+ _bt_relbuf(rel, insertstate->buf);
+ }
+ else
+ {
+ /* Lock unavailable, drop pin */
+ ReleaseBuffer(insertstate->buf);
+ }
+
+ /* Forget block, since cache doesn't appear to be useful */
+ RelationSetTargetBlock(rel, InvalidBlockNumber);
+ }
+
+ /* Cannot use optimization -- descend tree, return proper descent stack */
+ return _bt_search(rel, insertstate->itup_key, &insertstate->buf, BT_WRITE,
+ NULL);
+}
+
+/*
+ * _bt_check_unique() -- Check for violation of unique index constraint
+ *
+ * Returns InvalidTransactionId if there is no conflict, else an xact ID
+ * we must wait for to see if it commits a conflicting tuple. If an actual
+ * conflict is detected, no return --- just ereport(). If an xact ID is
+ * returned, and the conflicting tuple still has a speculative insertion in
+ * progress, *speculativeToken is set to non-zero, and the caller can wait for
+ * the verdict on the insertion using SpeculativeInsertionWait().
+ *
+ * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
+ * InvalidTransactionId because we don't want to wait. In this case we
+ * set *is_unique to false if there is a potential conflict, and the
+ * core code must redo the uniqueness check later.
+ *
+ * As a side-effect, sets state in insertstate that can later be used by
+ * _bt_findinsertloc() to reuse most of the binary search work we do
+ * here.
+ *
+ * Do not call here when there are NULL values in scan key. NULL should be
+ * considered unequal to NULL when checking for duplicates, but we are not
+ * prepared to handle that correctly.
+ */
+static TransactionId
+_bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
+ IndexUniqueCheck checkUnique, bool *is_unique,
+ uint32 *speculativeToken)
+{
+ IndexTuple itup = insertstate->itup;
+ IndexTuple curitup = NULL;
+ ItemId curitemid = NULL;
+ BTScanInsert itup_key = insertstate->itup_key;
+ SnapshotData SnapshotDirty;
+ OffsetNumber offset;
+ OffsetNumber maxoff;
+ Page page;
+ BTPageOpaque opaque;
+ Buffer nbuf = InvalidBuffer;
+ bool found = false;
+ bool inposting = false;
+ bool prevalldead = true;
+ int curposti = 0;
+
+ /* Assume unique until we find a duplicate */
+ *is_unique = true;
+
+ InitDirtySnapshot(SnapshotDirty);
+
+ page = BufferGetPage(insertstate->buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /*
+ * Find the first tuple with the same key.
+ *
+ * This also saves the binary search bounds in insertstate. We use them
+ * in the fastpath below, but also in the _bt_findinsertloc() call later.
+ */
+ Assert(!insertstate->bounds_valid);
+ offset = _bt_binsrch_insert(rel, insertstate);
+
+ /*
+ * Scan over all equal tuples, looking for live conflicts.
+ */
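+ /* (_bt_doinsert() already ruled out NULL key columns and unset scantid) */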
+ Assert(!insertstate->bounds_valid || insertstate->low == offset);
+ Assert(!itup_key->anynullkeys);
+ Assert(itup_key->scantid == NULL);
+ for (;;)
+ {
+ /*
+ * Each iteration of the loop processes one heap TID, not one index
+ * tuple. Current offset number for page isn't usually advanced on
+ * iterations that process heap TIDs from posting list tuples.
+ *
+ * "inposting" state is set when _inside_ a posting list --- not when
+ * we're at the start (or end) of a posting list. We advance curposti
+ * at the end of the iteration when inside a posting list tuple. In
+ * general, every loop iteration either advances the page offset or
+ * advances curposti --- an iteration that handles the rightmost/max
+ * heap TID in a posting list finally advances the page offset (and
+ * unsets "inposting").
+ *
+ * Make sure the offset points to an actual index tuple before trying
+ * to examine it...
+ */
+ if (offset <= maxoff)
+ {
+ /*
+ * Fastpath: In most cases, we can use cached search bounds to
+ * limit our consideration to items that are definitely
+ * duplicates. This fastpath doesn't apply when the original page
+ * is empty, or when initial offset is past the end of the
+ * original page, which may indicate that we need to examine a
+ * second or subsequent page.
+ *
+ * Note that this optimization allows us to avoid calling
+ * _bt_compare() directly when there are no duplicates, as long as
+ * the offset where the key will go is not at the end of the page.
+ */
+ if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
+ {
+ Assert(insertstate->bounds_valid);
+ Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
+ Assert(insertstate->low <= insertstate->stricthigh);
+ Assert(_bt_compare(rel, itup_key, page, offset) < 0);
+ break;
+ }
+
+ /*
+ * We can skip items that are already marked killed.
+ *
+ * In the presence of heavy update activity an index may contain
+ * many killed items with the same key; running _bt_compare() on
+ * each killed item gets expensive. Just advance over killed
+ * items as quickly as we can. We only apply _bt_compare() when
+ * we get to a non-killed item. We could reuse the bounds to
+ * avoid _bt_compare() calls for known equal tuples, but it
+ * doesn't seem worth it.
+ */
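+ /* (When already inside a posting list, curitemid/curitup are unchanged) */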
+ if (!inposting)
+ curitemid = PageGetItemId(page, offset);
+ if (inposting || !ItemIdIsDead(curitemid))
+ {
+ ItemPointerData htid;
+ bool all_dead = false;
+
+ if (!inposting)
+ {
+ /* Plain tuple, or first TID in posting list tuple */
+ if (_bt_compare(rel, itup_key, page, offset) != 0)
+ break; /* we're past all the equal tuples */
+
+ /* Advance curitup */
+ curitup = (IndexTuple) PageGetItem(page, curitemid);
+ Assert(!BTreeTupleIsPivot(curitup));
+ }
+
+ /* okay, we gotta fetch the heap tuple using htid ... */
+ if (!BTreeTupleIsPosting(curitup))
+ {
+ /* ... htid is from simple non-pivot tuple */
+ Assert(!inposting);
+ htid = curitup->t_tid;
+ }
+ else if (!inposting)
+ {
+ /* ... htid is first TID in new posting list */
+ inposting = true;
+ prevalldead = true;
+ curposti = 0;
+ htid = *BTreeTupleGetPostingN(curitup, 0);
+ }
+ else
+ {
+ /* ... htid is second or subsequent TID in posting list */
+ Assert(curposti > 0);
+ htid = *BTreeTupleGetPostingN(curitup, curposti);
+ }
+
+ /*
+ * If we are doing a recheck, we expect to find the tuple we
+ * are rechecking. It's not a duplicate, but we have to keep
+ * scanning.
+ */
+ if (checkUnique == UNIQUE_CHECK_EXISTING &&
+ ItemPointerCompare(&htid, &itup->t_tid) == 0)
+ {
+ found = true;
+ }
+
+ /*
+ * Check if there are any table tuples for this index entry
+ * satisfying SnapshotDirty. This is necessary because for AMs
+ * with optimizations like heap's HOT, we have just a single
+ * index entry for the entire chain.
+ */
+ else if (table_index_fetch_tuple_check(heapRel, &htid,
+ &SnapshotDirty,
+ &all_dead))
+ {
+ TransactionId xwait;
+
+ /*
+ * It is a duplicate. If we are only doing a partial
+ * check, then don't bother checking if the tuple is being
+ * updated in another transaction. Just return the fact
+ * that it is a potential conflict and leave the full
+ * check till later. Don't invalidate binary search
+ * bounds.
+ */
+ if (checkUnique == UNIQUE_CHECK_PARTIAL)
+ {
+ if (nbuf != InvalidBuffer)
+ _bt_relbuf(rel, nbuf);
+ *is_unique = false;
+ return InvalidTransactionId;
+ }
+
+ /*
+ * If this tuple is being updated by another transaction
+ * then we have to wait for its commit/abort.
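+ * (A valid xmin means the inserting transaction is still in progress;
+ * otherwise a valid xmax identifies an in-progress deleter.)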
+ */
+ xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
+ SnapshotDirty.xmin : SnapshotDirty.xmax;
+
+ if (TransactionIdIsValid(xwait))
+ {
+ if (nbuf != InvalidBuffer)
+ _bt_relbuf(rel, nbuf);
+ /* Tell _bt_doinsert to wait... */
+ *speculativeToken = SnapshotDirty.speculativeToken;
+ /* Caller releases lock on buf immediately */
+ insertstate->bounds_valid = false;
+ return xwait;
+ }
+
+ /*
+ * Otherwise we have a definite conflict. But before
+ * complaining, look to see if the tuple we want to insert
+ * is itself now committed dead --- if so, don't complain.
+ * This is a waste of time in normal scenarios but we must
+ * do it to support CREATE INDEX CONCURRENTLY.
+ *
+ * We must follow HOT-chains here because during
+ * concurrent index build, we insert the root TID though
+ * the actual tuple may be somewhere in the HOT-chain.
+ * While following the chain we might not stop at the
+ * exact tuple which triggered the insert, but that's OK
+ * because if we find a live tuple anywhere in this chain,
+ * we have a unique key conflict. The other live tuple is
+ * not part of this chain because it had a different index
+ * entry.
+ */
+ htid = itup->t_tid;
+ if (table_index_fetch_tuple_check(heapRel, &htid,
+ SnapshotSelf, NULL))
+ {
+ /* Normal case --- it's still live */
+ }
+ else
+ {
+ /*
+ * It's been deleted, so no error, and no need to
+ * continue searching
+ */
+ break;
+ }
+
+ /*
+ * Check for a conflict-in as we would if we were going to
+ * write to this page. We aren't actually going to write,
+ * but we want a chance to report SSI conflicts that would
+ * otherwise be masked by this unique constraint
+ * violation.
+ */
+ CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));
+
+ /*
+ * This is a definite conflict. Break the tuple down into
+ * datums and report the error. But first, make sure we
+ * release the buffer locks we're holding ---
+ * BuildIndexValueDescription could make catalog accesses,
+ * which in the worst case might touch this same index and
+ * cause deadlocks.
+ */
+ if (nbuf != InvalidBuffer)
+ _bt_relbuf(rel, nbuf);
+ _bt_relbuf(rel, insertstate->buf);
+ insertstate->buf = InvalidBuffer;
+ insertstate->bounds_valid = false;
+
+ {
+ Datum values[INDEX_MAX_KEYS];
+ bool isnull[INDEX_MAX_KEYS];
+ char *key_desc;
+
+ index_deform_tuple(itup, RelationGetDescr(rel),
+ values, isnull);
+
+ key_desc = BuildIndexValueDescription(rel, values,
+ isnull);
+
+ ereport(ERROR,
+ (errcode(ERRCODE_UNIQUE_VIOLATION),
+ errmsg("duplicate key value violates unique constraint \"%s\"",
+ RelationGetRelationName(rel)),
+ key_desc ? errdetail("Key %s already exists.",
+ key_desc) : 0,
+ errtableconstraint(heapRel,
+ RelationGetRelationName(rel))));
+ }
+ }
+ else if (all_dead && (!inposting ||
+ (prevalldead &&
+ curposti == BTreeTupleGetNPosting(curitup) - 1)))
+ {
+ /*
+ * The conflicting tuple (or all HOT chains pointed to by
+ * all posting list TIDs) is dead to everyone, so mark the
+ * index entry killed.
+ */
+ ItemIdMarkDead(curitemid);
+ opaque->btpo_flags |= BTP_HAS_GARBAGE;
+
+ /*
+ * Mark buffer with a dirty hint, since state is not
+ * crucial. Be sure to mark the proper buffer dirty.
+ */
+ if (nbuf != InvalidBuffer)
+ MarkBufferDirtyHint(nbuf, true);
+ else
+ MarkBufferDirtyHint(insertstate->buf, true);
+ }
+
+ /*
+ * Remember if posting list tuple has even a single HOT chain
+ * whose members are not all dead
+ */
+ if (!all_dead && inposting)
+ prevalldead = false;
+ }
+ }
+
+ if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
+ {
+ /* Advance to next TID in same posting list */
+ curposti++;
+ continue;
+ }
+ else if (offset < maxoff)
+ {
+ /* Advance to next tuple */
+ curposti = 0;
+ inposting = false;
+ offset = OffsetNumberNext(offset);
+ }
+ else
+ {
+ int highkeycmp;
+
+ /* If scankey == hikey we gotta check the next page too */
+ if (P_RIGHTMOST(opaque))
+ break;
+ highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
+ Assert(highkeycmp <= 0);
+ if (highkeycmp != 0)
+ break;
+ /* Advance to next non-dead page --- there must be one */
+ for (;;)
+ {
+ BlockNumber nblkno = opaque->btpo_next;
+
+ nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
+ page = BufferGetPage(nbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (!P_IGNORE(opaque))
+ break;
+ if (P_RIGHTMOST(opaque))
+ elog(ERROR, "fell off the end of index \"%s\"",
+ RelationGetRelationName(rel));
+ }
+ /* Will also advance to next tuple */
+ curposti = 0;
+ inposting = false;
+ maxoff = PageGetMaxOffsetNumber(page);
+ offset = P_FIRSTDATAKEY(opaque);
+ /* Don't invalidate binary search bounds */
+ }
+ }
+
+ /*
+ * If we are doing a recheck then we should have found the tuple we are
+ * checking. Otherwise there's something very wrong --- probably, the
+ * index is on a non-immutable expression.
+ */
+ if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to re-find tuple within index \"%s\"",
+ RelationGetRelationName(rel)),
+ errhint("This may be because of a non-immutable index expression."),
+ errtableconstraint(heapRel,
+ RelationGetRelationName(rel))));
+
+ if (nbuf != InvalidBuffer)
+ _bt_relbuf(rel, nbuf);
+
+ return InvalidTransactionId;
+}
+
+
+/*
+ * _bt_findinsertloc() -- Finds an insert location for a tuple
+ *
+ * On entry, insertstate buffer contains the page the new tuple belongs
+ * on. It is exclusive-locked and pinned by the caller.
+ *
+ * If 'checkingunique' is true, the buffer on entry is the first page
+ * that contains duplicates of the new key. If there are duplicates on
+ * multiple pages, the correct insertion position might be some page to
+ * the right, rather than the first page. In that case, this function
+ * moves right to the correct target page.
+ *
+ * (In a !heapkeyspace index, there can be multiple pages with the same
+ * high key on which the new tuple could legitimately be placed. In
+ * that case, the caller passes the first page containing duplicates,
+ * just like when checkingunique=true. If that page doesn't have enough
+ * room for the new tuple, this function moves right, trying to find a
+ * legal page that does.)
+ *
+ * If 'indexUnchanged' is true, this is for an UPDATE that didn't
+ * logically change the indexed value, but must nevertheless have a new
+ * entry to point to a successor version. This hint from the executor
+ * will influence our behavior when the page might have to be split and
+ * we must consider our options. Bottom-up index deletion can avoid
+ * pathological version-driven page splits, but we only want to go to the
+ * trouble of trying it when we already have moderate confidence that
+ * it's appropriate. The hint should not significantly affect our
+ * behavior over time unless practically all inserts onto the leaf page
+ * get the hint.
+ *
+ * On exit, insertstate buffer contains the chosen insertion page, and
+ * the offset within that page is returned. If _bt_findinsertloc needed
+ * to move right, the lock and pin on the original page are released, and
+ * the new buffer is exclusively locked and pinned instead.
+ *
+ * If insertstate contains cached binary search bounds, we will take
+ * advantage of them. This avoids repeating comparisons that we made in
+ * _bt_check_unique() already.
+ *
+ * If there is not enough room on the page for the new tuple, we try to
+ * make room by removing any LP_DEAD tuples.
+ */
+static OffsetNumber
+_bt_findinsertloc(Relation rel,
+ BTInsertState insertstate,
+ bool checkingunique,
+ bool indexUnchanged,
+ BTStack stack,
+ Relation heapRel)
+{
+ BTScanInsert itup_key = insertstate->itup_key;
+ Page page = BufferGetPage(insertstate->buf);
+ BTPageOpaque opaque;
+ OffsetNumber newitemoff;
+
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /* Check 1/3 of a page restriction */
+ if (unlikely(insertstate->itemsz > BTMaxItemSize(page)))
+ _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
+ insertstate->itup);
+
+ Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque));
+ Assert(!insertstate->bounds_valid || checkingunique);
+ Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
+ Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
+ Assert(!itup_key->allequalimage || itup_key->heapkeyspace);
+
+ if (itup_key->heapkeyspace)
+ {
+ /* Keep track of whether checkingunique duplicate seen */
+ bool uniquedup = indexUnchanged;
+
+ /*
+ * If we're inserting into a unique index, we may have to walk right
+ * through leaf pages to find the one leaf page that we must insert
+ * onto.
+ *
+ * This is needed for checkingunique callers because a scantid was not
+ * used when we called _bt_search(). scantid can only be set after
+ * _bt_check_unique() has checked for duplicates. The buffer
+ * initially stored in insertstate->buf has the page where the first
+ * duplicate key might be found, which isn't always the page that new
+ * tuple belongs on. The heap TID attribute for new tuple (scantid)
+ * could force us to insert on a sibling page, though that should be
+ * very rare in practice.
+ */
+ if (checkingunique)
+ {
+ if (insertstate->low < insertstate->stricthigh)
+ {
+ /* Encountered a duplicate in _bt_check_unique() */
+ Assert(insertstate->bounds_valid);
+ uniquedup = true;
+ }
+
+ for (;;)
+ {
+ /*
+ * Does the new tuple belong on this page?
+ *
+ * The earlier _bt_check_unique() call may well have
+ * established a strict upper bound on the offset for the new
+ * item. If it's not the last item of the page (i.e. if there
+ * is at least one tuple on the page that goes after the tuple
+ * we're inserting) then we know that the tuple belongs on
+ * this page. We can skip the high key check.
+ */
+ if (insertstate->bounds_valid &&
+ insertstate->low <= insertstate->stricthigh &&
+ insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
+ break;
+
+ /* Test '<=', not '!=', since scantid is set now */
+ if (P_RIGHTMOST(opaque) ||
+ _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
+ break;
+
+ _bt_stepright(rel, insertstate, stack);
+ /* Update local state after stepping right */
+ page = BufferGetPage(insertstate->buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ /* Assume duplicates (if checkingunique) */
+ uniquedup = true;
+ }
+ }
+
+ /*
+ * If the target page cannot fit newitem, try to avoid splitting the
+ * page on insert by performing deletion or deduplication now
+ */
+ if (PageGetFreeSpace(page) < insertstate->itemsz)
+ _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
+ checkingunique, uniquedup,
+ indexUnchanged);
+ }
+ else
+ {
+ /*----------
+ * This is a !heapkeyspace (version 2 or 3) index. The current page
+ * is the first page that we could insert the new tuple to, but there
+ * may be other pages to the right that we could opt to use instead.
+ *
+ * If the new key is equal to one or more existing keys, we can
+ * legitimately place it anywhere in the series of equal keys. In
+ * fact, if the new key is equal to the page's "high key" we can place
+ * it on the next page. If it is equal to the high key, and there's
+ * not room to insert the new tuple on the current page without
+ * splitting, then we move right hoping to find more free space and
+ * avoid a split.
+ *
+ * Keep scanning right until we
+ * (a) find a page with enough free space,
+ * (b) reach the last page where the tuple can legally go, or
+ * (c) get tired of searching.
+ * (c) is not flippant; it is important because if there are many
+ * pages' worth of equal keys, it's better to split one of the early
+ * pages than to scan all the way to the end of the run of equal keys
+ * on every insert. We implement "get tired" as a random choice,
+ * since stopping after scanning a fixed number of pages wouldn't work
+ * well (we'd never reach the right-hand side of previously split
+ * pages). The probability of moving right is set at 0.99, which may
+ * seem too high to change the behavior much, but it does an excellent
+ * job of preventing O(N^2) behavior with many equal keys.
+ *----------
+ */
+ while (PageGetFreeSpace(page) < insertstate->itemsz)
+ {
+ /*
+ * Before considering moving right, see if we can obtain enough
+ * space by erasing LP_DEAD items
+ */
+ if (P_HAS_GARBAGE(opaque))
+ {
+ /* Perform simple deletion */
+ _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
+ false, false, false);
+
+ if (PageGetFreeSpace(page) >= insertstate->itemsz)
+ break; /* OK, now we have enough space */
+ }
+
+ /*
+ * Nope, so check conditions (b) and (c) enumerated above
+ *
+ * The earlier _bt_check_unique() call may well have established a
+ * strict upper bound on the offset for the new item. If it's not
+ * the last item of the page (i.e. if there is at least one tuple
+ * on the page that's greater than the tuple we're inserting to)
+ * then we know that the tuple belongs on this page. We can skip
+ * the high key check.
+ */
+ if (insertstate->bounds_valid &&
+ insertstate->low <= insertstate->stricthigh &&
+ insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
+ break;
+
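+ /*
+ * Stop if this is the last page the new tuple can legally go on (b),
+ * or if we "get tired" -- roughly a 1% chance per page visited (c)
+ */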
+ if (P_RIGHTMOST(opaque) ||
+ _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
+ random() <= (MAX_RANDOM_VALUE / 100))
+ break;
+
+ _bt_stepright(rel, insertstate, stack);
+ /* Update local state after stepping right */
+ page = BufferGetPage(insertstate->buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ }
+ }
+
+ /*
+ * We should now be on the correct page. Find the offset within the page
+ * for the new tuple. (Possibly reusing earlier search bounds.)
+ */
+ Assert(P_RIGHTMOST(opaque) ||
+ _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
+
+ newitemoff = _bt_binsrch_insert(rel, insertstate);
+
+ if (insertstate->postingoff == -1)
+ {
+ /*
+ * There is an overlapping posting list tuple with its LP_DEAD bit
+ * set. We don't want to unnecessarily unset its LP_DEAD bit while
+ * performing a posting list split, so perform simple index tuple
+ * deletion early.
+ */
+ _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
+ false, false, false);
+
+ /*
+ * Do new binary search. New insert location cannot overlap with any
+ * posting list now.
+ */
+ Assert(!insertstate->bounds_valid);
+ insertstate->postingoff = 0;
+ newitemoff = _bt_binsrch_insert(rel, insertstate);
+ Assert(insertstate->postingoff == 0);
+ }
+
+ return newitemoff;
+}
+
+/*
+ * Step right to next non-dead page, during insertion.
+ *
+ * This is a bit more complicated than moving right in a search. We must
+ * write-lock the target page before releasing write lock on current page;
+ * else someone else's _bt_check_unique scan could fail to see our insertion.
+ * Write locks on intermediate dead pages won't do because we don't know when
+ * they will get de-linked from the tree.
+ *
+ * This is more aggressive than it needs to be for non-unique !heapkeyspace
+ * indexes.
+ */
+static void
+_bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack)
+{
+ Page page;
+ BTPageOpaque opaque;
+ Buffer rbuf;
+ BlockNumber rblkno;
+
+ page = BufferGetPage(insertstate->buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ rbuf = InvalidBuffer;
+ rblkno = opaque->btpo_next;
+ for (;;)
+ {
+ rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
+ page = BufferGetPage(rbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * If this page was incompletely split, finish the split now. We do
+ * this while holding a lock on the left sibling, which is not good
+ * because finishing the split could be a fairly lengthy operation.
+ * But this should happen very seldom.
+ */
+ if (P_INCOMPLETE_SPLIT(opaque))
+ {
+ _bt_finish_split(rel, rbuf, stack);
+ rbuf = InvalidBuffer;
+ continue;
+ }
+
+ if (!P_IGNORE(opaque))
+ break;
+ if (P_RIGHTMOST(opaque))
+ elog(ERROR, "fell off the end of index \"%s\"",
+ RelationGetRelationName(rel));
+
+ rblkno = opaque->btpo_next;
+ }
+ /* rbuf locked; unlock buf, update state for caller */
+ _bt_relbuf(rel, insertstate->buf);
+ insertstate->buf = rbuf;
+ insertstate->bounds_valid = false;
+}
+
+/*----------
+ * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
+ *
+ * This recursive procedure does the following things:
+ *
+ * + if postingoff != 0, splits existing posting list tuple
+ * (since it overlaps with new 'itup' tuple).
+ * + if necessary, splits the target page, using 'itup_key' for
+ * suffix truncation on leaf pages (caller passes NULL for
+ * non-leaf pages).
+ * + inserts the new tuple (might be split from posting list).
+ * + if the page was split, pops the parent stack, and finds the
+ * right place to insert the new child pointer (by walking
+ * right using information stored in the parent stack).
+ * + invokes itself with the appropriate tuple for the right
+ * child page on the parent.
+ * + updates the metapage if a true root or fast root is split.
+ *
+ * On entry, we must have the correct buffer in which to do the
+ * insertion, and the buffer must be pinned and write-locked. On return,
+ * we will have dropped both the pin and the lock on the buffer.
+ *
+ * This routine only performs retail tuple insertions. 'itup' should
+ * always be either a non-highkey leaf item, or a downlink (new high
+ * key items are created indirectly, when a page is split). When
+ * inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
+ * we're inserting the downlink for. This function will clear the
+ * INCOMPLETE_SPLIT flag on it, and release the buffer.
+ *----------
+ */
+static void
+_bt_insertonpg(Relation rel,
+ BTScanInsert itup_key,
+ Buffer buf,
+ Buffer cbuf,
+ BTStack stack,
+ IndexTuple itup,
+ Size itemsz,
+ OffsetNumber newitemoff,
+ int postingoff,
+ bool split_only_page)
+{
+ Page page;
+ BTPageOpaque opaque;
+ bool isleaf,
+ isroot,
+ isrightmost,
+ isonly;
+ IndexTuple oposting = NULL;
+ IndexTuple origitup = NULL;
+ IndexTuple nposting = NULL;
+
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ isleaf = P_ISLEAF(opaque);
+ isroot = P_ISROOT(opaque);
+ isrightmost = P_RIGHTMOST(opaque);
+ isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque);
+
+ /* child buffer must be given iff inserting on an internal page */
+ Assert(isleaf == !BufferIsValid(cbuf));
+ /* tuple must have appropriate number of attributes */
+ Assert(!isleaf ||
+ BTreeTupleGetNAtts(itup, rel) ==
+ IndexRelationGetNumberOfAttributes(rel));
+ Assert(isleaf ||
+ BTreeTupleGetNAtts(itup, rel) <=
+ IndexRelationGetNumberOfKeyAttributes(rel));
+ Assert(!BTreeTupleIsPosting(itup));
+ Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
+ /* Caller must always finish incomplete split for us */
+ Assert(!P_INCOMPLETE_SPLIT(opaque));
+
+ /*
+ * Every internal page should have exactly one negative infinity item at
+ * all times. Only _bt_split() and _bt_newroot() should add items that
+ * become negative infinity items through truncation, since they're the
+ * only routines that allocate new internal pages.
+ */
+ Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque));
+
+ /*
+ * Do we need to split an existing posting list item?
+ */
+ if (postingoff != 0)
+ {
+ ItemId itemid = PageGetItemId(page, newitemoff);
+
+ /*
+ * The new tuple is a duplicate with a heap TID that falls inside the
+ * range of an existing posting list tuple on a leaf page. Prepare to
+ * split an existing posting list. Overwriting the posting list with
+ * its post-split version is treated as an extra step in either the
+ * insert or page split critical section.
+ */
+ Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage);
+ oposting = (IndexTuple) PageGetItem(page, itemid);
+
+ /*
+ * postingoff value comes from earlier call to _bt_binsrch_posting().
+ * Its binary search might think that a plain tuple must be a posting
+ * list tuple that needs to be split. This can happen with corruption
+ * involving an existing plain tuple that is a duplicate of the new
+ * item, up to and including its table TID. Check for that here in
+ * passing.
+ *
+ * Also verify that our caller has made sure that the existing posting
+ * list tuple does not have its LP_DEAD bit set.
+ */
+ if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid))
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"",
+ ItemPointerGetBlockNumber(&itup->t_tid),
+ ItemPointerGetOffsetNumber(&itup->t_tid),
+ newitemoff, BufferGetBlockNumber(buf),
+ RelationGetRelationName(rel))));
+
+ /* use a mutable copy of itup as our itup from here on */
+ origitup = itup;
+ itup = CopyIndexTuple(origitup);
+ nposting = _bt_swap_posting(itup, oposting, postingoff);
+ /* itup now contains rightmost/max TID from oposting */
+
+ /* Alter offset so that newitem goes after posting list */
+ newitemoff = OffsetNumberNext(newitemoff);
+ }
+
+ /*
+ * Do we need to split the page to fit the item on it?
+ *
+ * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
+ * so this comparison is correct even though we appear to be accounting
+ * only for the item and not for its line pointer.
+ */
+ if (PageGetFreeSpace(page) < itemsz)
+ {
+ Buffer rbuf;
+
+ Assert(!split_only_page);
+
+ /* split the buffer into left and right halves */
+ rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup,
+ origitup, nposting, postingoff);
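+ /* Propagate any predicate locks (SSI) to the new right sibling page */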
+ PredicateLockPageSplit(rel,
+ BufferGetBlockNumber(buf),
+ BufferGetBlockNumber(rbuf));
+
+ /*----------
+ * By here,
+ *
+ * + our target page has been split;
+ * + the original tuple has been inserted;
+ * + we have write locks on both the old (left half)
+ * and new (right half) buffers, after the split; and
+ * + we know the key we want to insert into the parent
+ * (it's the "high key" on the left child page).
+ *
+ * We're ready to do the parent insertion. We need to hold onto the
+ * locks for the child pages until we locate the parent, but we can
+ * at least release the lock on the right child before doing the
+ * actual insertion. The lock on the left child will be released
+ * last of all by parent insertion, where it is the 'cbuf' of parent
+ * page.
+ *----------
+ */
+ _bt_insert_parent(rel, buf, rbuf, stack, isroot, isonly);
+ }
+ else
+ {
+ Buffer metabuf = InvalidBuffer;
+ Page metapg = NULL;
+ BTMetaPageData *metad = NULL;
+ BlockNumber blockcache;
+
+ /*
+ * If we are doing this insert because we split a page that was the
+ * only one on its tree level, but was not the root, it may have been
+ * the "fast root". We need to ensure that the fast root link points
+ * at or above the current page. We can safely acquire a lock on the
+ * metapage here --- see comments for _bt_newroot().
+ */
+ if (unlikely(split_only_page))
+ {
+ Assert(!isleaf);
+ Assert(BufferIsValid(cbuf));
+
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
+ metapg = BufferGetPage(metabuf);
+ metad = BTPageGetMeta(metapg);
+
+ if (metad->btm_fastlevel >= opaque->btpo_level)
+ {
+ /* no update wanted */
+ _bt_relbuf(rel, metabuf);
+ metabuf = InvalidBuffer;
+ }
+ }
+
+ /* Do the update. No ereport(ERROR) until changes are logged */
+ START_CRIT_SECTION();
+
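+ /*
+ * Posting list split: replace the original posting list in place with
+ * its post-split version (same size, so an overwrite works)
+ */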
+ if (postingoff != 0)
+ memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
+
+ if (PageAddItem(page, (Item) itup, itemsz, newitemoff, false,
+ false) == InvalidOffsetNumber)
+ elog(PANIC, "failed to add new item to block %u in index \"%s\"",
+ BufferGetBlockNumber(buf), RelationGetRelationName(rel));
+
+ MarkBufferDirty(buf);
+
+ if (BufferIsValid(metabuf))
+ {
+ /* upgrade meta-page if needed */
+ if (metad->btm_version < BTREE_NOVAC_VERSION)
+ _bt_upgrademetapage(metapg);
+ metad->btm_fastroot = BufferGetBlockNumber(buf);
+ metad->btm_fastlevel = opaque->btpo_level;
+ MarkBufferDirty(metabuf);
+ }
+
+ /*
+ * Clear INCOMPLETE_SPLIT flag on child if inserting the new item
+ * finishes a split
+ */
+ if (!isleaf)
+ {
+ Page cpage = BufferGetPage(cbuf);
+ BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
+
+ Assert(P_INCOMPLETE_SPLIT(cpageop));
+ cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
+ MarkBufferDirty(cbuf);
+ }
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_btree_insert xlrec;
+ xl_btree_metadata xlmeta;
+ uint8 xlinfo;
+ XLogRecPtr recptr;
+ uint16 upostingoff;
+
+ xlrec.offnum = newitemoff;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
+
+ if (isleaf && postingoff == 0)
+ {
+ /* Simple leaf insert */
+ xlinfo = XLOG_BTREE_INSERT_LEAF;
+ }
+ else if (postingoff != 0)
+ {
+ /*
+ * Leaf insert with posting list split. Must include
+ * postingoff field before newitem/orignewitem.
+ */
+ Assert(isleaf);
+ xlinfo = XLOG_BTREE_INSERT_POST;
+ }
+ else
+ {
+ /* Internal page insert, which finishes a split on cbuf */
+ xlinfo = XLOG_BTREE_INSERT_UPPER;
+ XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
+
+ if (BufferIsValid(metabuf))
+ {
+ /* Actually, it's an internal page insert + meta update */
+ xlinfo = XLOG_BTREE_INSERT_META;
+
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ xlmeta.version = metad->btm_version;
+ xlmeta.root = metad->btm_root;
+ xlmeta.level = metad->btm_level;
+ xlmeta.fastroot = metad->btm_fastroot;
+ xlmeta.fastlevel = metad->btm_fastlevel;
+ xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
+ xlmeta.allequalimage = metad->btm_allequalimage;
+
+ XLogRegisterBuffer(2, metabuf,
+ REGBUF_WILL_INIT | REGBUF_STANDARD);
+ XLogRegisterBufData(2, (char *) &xlmeta,
+ sizeof(xl_btree_metadata));
+ }
+ }
+
+ XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
+ if (postingoff == 0)
+ {
+ /* Just log itup from caller */
+ XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
+ }
+ else
+ {
+ /*
+ * Insert with posting list split (XLOG_BTREE_INSERT_POST
+ * record) case.
+ *
+ * Log postingoff. Also log origitup, not itup. REDO routine
+ * must reconstruct final itup (as well as nposting) using
+ * _bt_swap_posting().
+ */
+ upostingoff = postingoff;
+
+ XLogRegisterBufData(0, (char *) &upostingoff, sizeof(uint16));
+ XLogRegisterBufData(0, (char *) origitup,
+ IndexTupleSize(origitup));
+ }
+
+ recptr = XLogInsert(RM_BTREE_ID, xlinfo);
+
+ if (BufferIsValid(metabuf))
+ PageSetLSN(metapg, recptr);
+ if (!isleaf)
+ PageSetLSN(BufferGetPage(cbuf), recptr);
+
+ PageSetLSN(page, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /* Release subsidiary buffers */
+ if (BufferIsValid(metabuf))
+ _bt_relbuf(rel, metabuf);
+ if (!isleaf)
+ _bt_relbuf(rel, cbuf);
+
+ /*
+ * Cache the block number if this is the rightmost leaf page. Cache
+ * may be used by a future inserter within _bt_search_insert().
+ */
+ blockcache = InvalidBlockNumber;
+ if (isrightmost && isleaf && !isroot)
+ blockcache = BufferGetBlockNumber(buf);
+
+ /* Release buffer for insertion target block */
+ _bt_relbuf(rel, buf);
+
+ /*
+ * If we decided to cache the insertion target block before releasing
+ * its buffer lock, then cache it now. Check the height of the tree
+ * first, though. We don't go for the optimization with small
+ * indexes. Defer final check to this point to ensure that we don't
+ * call _bt_getrootheight while holding a buffer lock.
+ */
+ if (BlockNumberIsValid(blockcache) &&
+ _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
+ RelationSetTargetBlock(rel, blockcache);
+ }
+
+ /* be tidy */
+ if (postingoff != 0)
+ {
+ /* itup is actually a modified copy of caller's original */
+ pfree(nposting);
+ pfree(itup);
+ }
+}
+
+/*
+ * _bt_split() -- split a page in the btree.
+ *
+ * On entry, buf is the page to split, and is pinned and write-locked.
+ * newitemoff etc. tell us about the new item that must be inserted
+ * along with the data from the original page.
+ *
+ * itup_key is used for suffix truncation on leaf pages (internal
+ * page callers pass NULL). When splitting a non-leaf page, 'cbuf'
+ * is the left-sibling of the page we're inserting the downlink for.
+ * This function will clear the INCOMPLETE_SPLIT flag on it, and
+ * release the buffer.
+ *
+ * orignewitem, nposting, and postingoff are needed when an insert of
+ * orignewitem results in both a posting list split and a page split.
+ * These extra posting list split details are used here in the same
+ * way as they are used in the more common case where a posting list
+ * split does not coincide with a page split. We need to deal with
+ * posting list splits directly in order to ensure that everything
+ * that follows from the insert of orignewitem is handled as a single
+ * atomic operation (though caller's insert of a new pivot/downlink
+ * into parent page will still be a separate operation). See
+ * nbtree/README for details on the design of posting list splits.
+ *
+ * Returns the new right sibling of buf, pinned and write-locked.
+ * The pin and lock on buf are maintained.
+ */
+static Buffer
+_bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
+ OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
+ IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
+{
+ Buffer rbuf;
+ Page origpage;
+ Page leftpage,
+ rightpage;
+ BlockNumber origpagenumber,
+ rightpagenumber;
+ BTPageOpaque ropaque,
+ lopaque,
+ oopaque;
+ Buffer sbuf = InvalidBuffer;
+ Page spage = NULL;
+ BTPageOpaque sopaque = NULL;
+ Size itemsz;
+ ItemId itemid;
+ IndexTuple firstright,
+ lefthighkey;
+ OffsetNumber firstrightoff;
+ OffsetNumber afterleftoff,
+ afterrightoff,
+ minusinfoff;
+ OffsetNumber origpagepostingoff;
+ OffsetNumber maxoff;
+ OffsetNumber i;
+ bool newitemonleft,
+ isleaf,
+ isrightmost;
+
+ /*
+ * origpage is the original page to be split. leftpage is a temporary
+ * buffer that receives the left-sibling data, which will be copied back
+ * into origpage on success. rightpage is the new page that will receive
+ * the right-sibling data.
+ *
+ * leftpage is allocated after choosing a split point. rightpage's new
+ * buffer isn't acquired until after leftpage is initialized and has new
+ * high key, the last point where splitting the page may fail (barring
+ * corruption). Failing before acquiring new buffer won't have lasting
+ * consequences, since origpage won't have been modified and leftpage is
+ * only workspace.
+ */
+ origpage = BufferGetPage(buf);
+ oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
+ isleaf = P_ISLEAF(oopaque);
+ isrightmost = P_RIGHTMOST(oopaque);
+ maxoff = PageGetMaxOffsetNumber(origpage);
+ origpagenumber = BufferGetBlockNumber(buf);
+
+ /*
+ * Choose a point to split origpage at.
+ *
+ * A split point can be thought of as a point _between_ two existing data
+ * items on origpage (the lastleft and firstright tuples), provided you
+ * pretend that the new item that didn't fit is already on origpage.
+ *
+ * Since origpage does not actually contain newitem, the representation of
+ * split points needs to work with two boundary cases: splits where
+ * newitem is lastleft, and splits where newitem is firstright.
+ * newitemonleft resolves the ambiguity that would otherwise exist when
+ * newitemoff == firstrightoff. In all other cases it's clear which side
+ * of the split every tuple goes on from context. newitemonleft is
+ * usually (but not always) redundant information.
+ *
+ * firstrightoff is supposed to be an origpage offset number, but it's
+ * possible that its value will be maxoff+1, which is "past the end" of
+ * origpage. This happens in the rare case where newitem goes after all
+ * existing items (i.e. newitemoff is maxoff+1) and we end up splitting
+ * origpage at the point that leaves newitem alone on new right page. Any
+ * "!newitemonleft && newitemoff == firstrightoff" split point makes
+ * newitem the firstright tuple, though, so this case isn't a special
+ * case.
+ */
+ firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
+ newitem, &newitemonleft);
+
+ /* Allocate temp buffer for leftpage */
+ leftpage = PageGetTempPage(origpage);
+ _bt_pageinit(leftpage, BufferGetPageSize(buf));
+ lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
+
+ /*
+ * leftpage won't be the root when we're done. Also, clear the SPLIT_END
+ * and HAS_GARBAGE flags.
+ */
+ lopaque->btpo_flags = oopaque->btpo_flags;
+ lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
+ /* set flag in leftpage indicating that rightpage has no downlink yet */
+ lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
+ lopaque->btpo_prev = oopaque->btpo_prev;
+ /* handle btpo_next after rightpage buffer acquired */
+ lopaque->btpo_level = oopaque->btpo_level;
+ /* handle btpo_cycleid after rightpage buffer acquired */
+
+ /*
+ * Copy the original page's LSN into leftpage, which will become the
+ * updated version of the page. We need this because XLogInsert will
+ * examine the LSN and possibly dump it in a page image.
+ */
+ PageSetLSN(leftpage, PageGetLSN(origpage));
+
+ /*
+ * Determine page offset number of existing overlapped-with-orignewitem
+ * posting list when it is necessary to perform a posting list split in
+ * passing. Note that newitem was already changed by caller (newitem no
+ * longer has the orignewitem TID).
+ *
+ * This page offset number (origpagepostingoff) will be used to pretend
+ * that the posting split has already taken place, even though the
+ * required modifications to origpage won't occur until we reach the
+ * critical section. The lastleft and firstright tuples of our page split
+ * point should, in effect, come from an imaginary version of origpage
+ * that has the nposting tuple instead of the original posting list tuple.
+ *
+ * Note: _bt_findsplitloc() should have compensated for coinciding posting
+ * list splits in just the same way, at least in theory. It doesn't
+ * bother with that, though. In practice it won't affect its choice of
+ * split point.
+ */
+ origpagepostingoff = InvalidOffsetNumber;
+ if (postingoff != 0)
+ {
+ Assert(isleaf);
+ Assert(ItemPointerCompare(&orignewitem->t_tid,
+ &newitem->t_tid) < 0);
+ Assert(BTreeTupleIsPosting(nposting));
+ origpagepostingoff = OffsetNumberPrev(newitemoff);
+ }
+
+ /*
+ * The high key for the new left page is a possibly-truncated copy of
+ * firstright on the leaf level (it's "firstright itself" on internal
+ * pages; see !isleaf comments below). This may seem to be contrary to
+ * Lehman & Yao's approach of using a copy of lastleft as the new high key
+ * when splitting on the leaf level. It isn't, though.
+ *
+ * Suffix truncation will leave the left page's high key fully equal to
+ * lastleft when lastleft and firstright are equal prior to heap TID (that
+ * is, the tiebreaker TID value comes from lastleft). It isn't actually
+ * necessary for a new leaf high key to be a copy of lastleft for the L&Y
+ * "subtree" invariant to hold. It's sufficient to make sure that the new
+ * leaf high key is strictly less than firstright, and greater than or
+ * equal to (not necessarily equal to) lastleft. In other words, when
+ * suffix truncation isn't possible during a leaf page split, we take
+ * L&Y's exact approach to generating a new high key for the left page.
+ * (Actually, that is slightly inaccurate. We don't just use a copy of
+ * lastleft. A tuple with all the keys from firstright but the max heap
+ * TID from lastleft is used, to avoid introducing a special case.)
+ */
+ if (!newitemonleft && newitemoff == firstrightoff)
+ {
+ /* incoming tuple becomes firstright */
+ itemsz = newitemsz;
+ firstright = newitem;
+ }
+ else
+ {
+ /* existing item at firstrightoff becomes firstright */
+ itemid = PageGetItemId(origpage, firstrightoff);
+ itemsz = ItemIdGetLength(itemid);
+ firstright = (IndexTuple) PageGetItem(origpage, itemid);
+ if (firstrightoff == origpagepostingoff)
+ firstright = nposting;
+ }
+
+ if (isleaf)
+ {
+ IndexTuple lastleft;
+
+ /* Attempt suffix truncation for leaf page splits */
+ if (newitemonleft && newitemoff == firstrightoff)
+ {
+ /* incoming tuple becomes lastleft */
+ lastleft = newitem;
+ }
+ else
+ {
+ OffsetNumber lastleftoff;
+
+ /* existing item before firstrightoff becomes lastleft */
+ lastleftoff = OffsetNumberPrev(firstrightoff);
+ Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
+ itemid = PageGetItemId(origpage, lastleftoff);
+ lastleft = (IndexTuple) PageGetItem(origpage, itemid);
+ if (lastleftoff == origpagepostingoff)
+ lastleft = nposting;
+ }
+
+ lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
+ itemsz = IndexTupleSize(lefthighkey);
+ }
+ else
+ {
+ /*
+ * Don't perform suffix truncation on a copy of firstright to make
+ * left page high key for internal page splits. Must use firstright
+ * as new high key directly.
+ *
+ * Each distinct separator key value originates as a leaf level high
+ * key; all other separator keys/pivot tuples are copied from one
+ * level down. A separator key in a grandparent page must be
+ * identical to high key in rightmost parent page of the subtree to
+ * its left, which must itself be identical to high key in rightmost
+ * child page of that same subtree (this even applies to separator
+ * from grandparent's high key). There must always be an unbroken
+ * "seam" of identical separator keys that guide index scans at every
+ * level, starting from the grandparent. That's why suffix truncation
+ * is unsafe here.
+ *
+ * Internal page splits will truncate firstright into a "negative
+ * infinity" data item when it gets inserted on the new right page
+ * below, though. This happens during the call to _bt_pgaddtup() for
+ * the new first data item for right page. Do not confuse this
+ * mechanism with suffix truncation. It is just a convenient way of
+ * implementing page splits that split the internal page "inside"
+ * firstright. The lefthighkey separator key cannot appear a second
+ * time in the right page (only firstright's downlink goes in right
+ * page).
+ */
+ lefthighkey = firstright;
+ }
+
+ /*
+ * Add new high key to leftpage
+ */
+ afterleftoff = P_HIKEY;
+
+ Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
+ Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
+ IndexRelationGetNumberOfKeyAttributes(rel));
+ Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
+ if (PageAddItem(leftpage, (Item) lefthighkey, itemsz, afterleftoff, false,
+ false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add high key to the left sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ afterleftoff = OffsetNumberNext(afterleftoff);
+
+ /*
+ * Acquire a new right page to split into, now that left page has a new
+ * high key. From here on, it's not okay to throw an error without
+ * zeroing rightpage first. This coding rule ensures that we won't
+ * confuse future VACUUM operations, which might otherwise try to re-find
+ * a downlink to a leftover junk page as the page undergoes deletion.
+ *
+ * It would be reasonable to start the critical section just after the new
+ * rightpage buffer is acquired instead; that would allow us to avoid
+ * leftover junk pages without bothering to zero rightpage. We do it this
+ * way because it avoids an unnecessary PANIC when either origpage or its
+	 * existing sibling page is corrupt.
+ */
+ rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
+ rightpage = BufferGetPage(rbuf);
+ rightpagenumber = BufferGetBlockNumber(rbuf);
+ /* rightpage was initialized by _bt_getbuf */
+ ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
+
+ /*
+ * Finish off remaining leftpage special area fields. They cannot be set
+ * before both origpage (leftpage) and rightpage buffers are acquired and
+ * locked.
+ *
+ * btpo_cycleid is only used with leaf pages, though we set it here in all
+ * cases just to be consistent.
+ */
+ lopaque->btpo_next = rightpagenumber;
+ lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
+
+ /*
+ * rightpage won't be the root when we're done. Also, clear the SPLIT_END
+ * and HAS_GARBAGE flags.
+ */
+ ropaque->btpo_flags = oopaque->btpo_flags;
+ ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
+ ropaque->btpo_prev = origpagenumber;
+ ropaque->btpo_next = oopaque->btpo_next;
+ ropaque->btpo_level = oopaque->btpo_level;
+ ropaque->btpo_cycleid = lopaque->btpo_cycleid;
+
+ /*
+ * Add new high key to rightpage where necessary.
+ *
+ * If the page we're splitting is not the rightmost page at its level in
+ * the tree, then the first entry on the page is the high key from
+ * origpage.
+ */
+ afterrightoff = P_HIKEY;
+
+ if (!isrightmost)
+ {
+ IndexTuple righthighkey;
+
+ itemid = PageGetItemId(origpage, P_HIKEY);
+ itemsz = ItemIdGetLength(itemid);
+ righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
+ Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
+ Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
+ IndexRelationGetNumberOfKeyAttributes(rel));
+ if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff,
+ false, false) == InvalidOffsetNumber)
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add high key to the right sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ }
+ afterrightoff = OffsetNumberNext(afterrightoff);
+ }
+
+ /*
+	 * Internal page splits truncate the first data item on the right page
+	 * -- it becomes the "minus infinity" item for the page. Set this up
+	 * here.
+ */
+ minusinfoff = InvalidOffsetNumber;
+ if (!isleaf)
+ minusinfoff = afterrightoff;
+
+ /*
+ * Now transfer all the data items (non-pivot tuples in isleaf case, or
+ * additional pivot tuples in !isleaf case) to the appropriate page.
+ *
+ * Note: we *must* insert at least the right page's items in item-number
+ * order, for the benefit of _bt_restore_page().
+ */
+ for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
+ {
+ IndexTuple dataitem;
+
+ itemid = PageGetItemId(origpage, i);
+ itemsz = ItemIdGetLength(itemid);
+ dataitem = (IndexTuple) PageGetItem(origpage, itemid);
+
+ /* replace original item with nposting due to posting split? */
+ if (i == origpagepostingoff)
+ {
+ Assert(BTreeTupleIsPosting(dataitem));
+ Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
+ dataitem = nposting;
+ }
+
+ /* does new item belong before this one? */
+ else if (i == newitemoff)
+ {
+ if (newitemonleft)
+ {
+ Assert(newitemoff <= firstrightoff);
+ if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
+ false))
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add new item to the left sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ }
+ afterleftoff = OffsetNumberNext(afterleftoff);
+ }
+ else
+ {
+ Assert(newitemoff >= firstrightoff);
+ if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
+ afterrightoff == minusinfoff))
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add new item to the right sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ }
+ afterrightoff = OffsetNumberNext(afterrightoff);
+ }
+ }
+
+ /* decide which page to put it on */
+ if (i < firstrightoff)
+ {
+ if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add old item to the left sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ }
+ afterleftoff = OffsetNumberNext(afterleftoff);
+ }
+ else
+ {
+ if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
+ afterrightoff == minusinfoff))
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add old item to the right sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ }
+ afterrightoff = OffsetNumberNext(afterrightoff);
+ }
+ }
+
+ /* Handle case where newitem goes at the end of rightpage */
+ if (i <= newitemoff)
+ {
+ /*
+ * Can't have newitemonleft here; that would imply we were told to put
+ * *everything* on the left page, which cannot fit (if it could, we'd
+ * not be splitting the page).
+ */
+ Assert(!newitemonleft && newitemoff == maxoff + 1);
+ if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
+ afterrightoff == minusinfoff))
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add new item to the right sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ }
+ afterrightoff = OffsetNumberNext(afterrightoff);
+ }
+
+ /*
+ * We have to grab the original right sibling (if any) and update its prev
+ * link. We are guaranteed that this is deadlock-free, since we couple
+ * the locks in the standard order: left to right.
+ */
+ if (!isrightmost)
+ {
+ sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
+ spage = BufferGetPage(sbuf);
+ sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
+ if (sopaque->btpo_prev != origpagenumber)
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("right sibling's left-link doesn't match: "
+ "block %u links to %u instead of expected %u in index \"%s\"",
+ oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
+ RelationGetRelationName(rel))));
+ }
+
+ /*
+ * Check to see if we can set the SPLIT_END flag in the right-hand
+ * split page; this can save some I/O for vacuum since it need not
+ * proceed to the right sibling. We can set the flag if the right
+ * sibling has a different cycleid: that means it could not be part of
+ * a group of pages that were all split off from the same ancestor
+ * page. If you're confused, imagine that page A splits to A B and
+ * then again, yielding A C B, while vacuum is in progress. Tuples
+ * originally in A could now be in either B or C, hence vacuum must
+ * examine both pages. But if D, our right sibling, has a different
+ * cycleid then it could not contain any tuples that were in A when
+ * the vacuum started.
+ */
+ if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
+ ropaque->btpo_flags |= BTP_SPLIT_END;
+ }
+
+ /*
+ * Right sibling is locked, new siblings are prepared, but original page
+ * is not updated yet.
+ *
+ * NO EREPORT(ERROR) till right sibling is updated. We can get away with
+ * not starting the critical section till here because we haven't been
+ * scribbling on the original page yet; see comments above.
+ */
+ START_CRIT_SECTION();
+
+ /*
+ * By here, the original data page has been split into two new halves, and
+ * these are correct. The algorithm requires that the left page never
+ * move during a split, so we copy the new left page back on top of the
+ * original. We need to do this before writing the WAL record, so that
+ * XLogInsert can WAL log an image of the page if necessary.
+ */
+ PageRestoreTempPage(leftpage, origpage);
+ /* leftpage, lopaque must not be used below here */
+
+ MarkBufferDirty(buf);
+ MarkBufferDirty(rbuf);
+
+ if (!isrightmost)
+ {
+ sopaque->btpo_prev = rightpagenumber;
+ MarkBufferDirty(sbuf);
+ }
+
+ /*
+ * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
+ * a split
+ */
+ if (!isleaf)
+ {
+ Page cpage = BufferGetPage(cbuf);
+ BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
+
+ cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
+ MarkBufferDirty(cbuf);
+ }
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_btree_split xlrec;
+ uint8 xlinfo;
+ XLogRecPtr recptr;
+
+ xlrec.level = ropaque->btpo_level;
+ /* See comments below on newitem, orignewitem, and posting lists */
+ xlrec.firstrightoff = firstrightoff;
+ xlrec.newitemoff = newitemoff;
+ xlrec.postingoff = 0;
+ if (postingoff != 0 && origpagepostingoff < firstrightoff)
+ xlrec.postingoff = postingoff;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit);
+
+ XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
+ XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
+ /* Log original right sibling, since we've changed its prev-pointer */
+ if (!isrightmost)
+ XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
+ if (!isleaf)
+ XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
+
+ /*
+ * Log the new item, if it was inserted on the left page. (If it was
+ * put on the right page, we don't need to explicitly WAL log it
+ * because it's included with all the other items on the right page.)
+ * Show the new item as belonging to the left page buffer, so that it
+ * is not stored if XLogInsert decides it needs a full-page image of
+ * the left page. We always store newitemoff in the record, though.
+ *
+ * The details are sometimes slightly different for page splits that
+ * coincide with a posting list split. If both the replacement
+ * posting list and newitem go on the right page, then we don't need
+ * to log anything extra, just like the simple !newitemonleft
+ * no-posting-split case (postingoff is set to zero in the WAL record,
+ * so recovery doesn't need to process a posting list split at all).
+ * Otherwise, we set postingoff and log orignewitem instead of
+ * newitem, despite having actually inserted newitem. REDO routine
+ * must reconstruct nposting and newitem using _bt_swap_posting().
+ *
+ * Note: It's possible that our page split point is the point that
+ * makes the posting list lastleft and newitem firstright. This is
+ * the only case where we log orignewitem/newitem despite newitem
+ * going on the right page. If XLogInsert decides that it can omit
+ * orignewitem due to logging a full-page image of the left page,
+ * everything still works out, since recovery only needs to log
+ * orignewitem for items on the left page (just like the regular
+ * newitem-logged case).
+ */
+ if (newitemonleft && xlrec.postingoff == 0)
+ XLogRegisterBufData(0, (char *) newitem, newitemsz);
+ else if (xlrec.postingoff != 0)
+ {
+ Assert(isleaf);
+ Assert(newitemonleft || firstrightoff == newitemoff);
+ Assert(newitemsz == IndexTupleSize(orignewitem));
+ XLogRegisterBufData(0, (char *) orignewitem, newitemsz);
+ }
+
+ /* Log the left page's new high key */
+ if (!isleaf)
+ {
+ /* lefthighkey isn't local copy, get current pointer */
+ itemid = PageGetItemId(origpage, P_HIKEY);
+ lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
+ }
+ XLogRegisterBufData(0, (char *) lefthighkey,
+ MAXALIGN(IndexTupleSize(lefthighkey)));
+
+ /*
+ * Log the contents of the right page in the format understood by
+ * _bt_restore_page(). The whole right page will be recreated.
+ *
+		 * Direct access to the page is not ideal, but it's faster; ideally
+		 * a new page API function would handle this. Note we only store the
+		 * tuples
+ * themselves, knowing that they were inserted in item-number order
+ * and so the line pointers can be reconstructed. See comments for
+ * _bt_restore_page().
+ */
+ XLogRegisterBufData(1,
+ (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
+ ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
+
+ xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
+ recptr = XLogInsert(RM_BTREE_ID, xlinfo);
+
+ PageSetLSN(origpage, recptr);
+ PageSetLSN(rightpage, recptr);
+ if (!isrightmost)
+ PageSetLSN(spage, recptr);
+ if (!isleaf)
+ PageSetLSN(BufferGetPage(cbuf), recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /* release the old right sibling */
+ if (!isrightmost)
+ _bt_relbuf(rel, sbuf);
+
+ /* release the child */
+ if (!isleaf)
+ _bt_relbuf(rel, cbuf);
+
+ /* be tidy */
+ if (isleaf)
+ pfree(lefthighkey);
+
+ /* split's done */
+ return rbuf;
+}
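
A minimal sketch, not part of the patch itself, restating the placement rules that _bt_split() applies above. The helper names are hypothetical; only the relationships they encode are taken from the function:

    /*
     * Sketch only: placement rules from _bt_split()'s distribution loop.
     * 'firstrightoff' is the split point chosen by _bt_findsplitloc().
     */
    static inline bool
    existing_item_goes_left(OffsetNumber off, OffsetNumber firstrightoff)
    {
        /* existing tuples strictly before the split point stay on the left */
        return off < firstrightoff;
    }

    static inline bool
    newitem_becomes_firstright(bool newitemonleft, OffsetNumber newitemoff,
                               OffsetNumber firstrightoff)
    {
        /*
         * The incoming tuple becomes firstright when it lands exactly at
         * the split point and was assigned to the right half.
         */
        return !newitemonleft && newitemoff == firstrightoff;
    }
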
+
+/*
+ * _bt_insert_parent() -- Insert downlink into parent, completing split.
+ *
+ * On entry, buf and rbuf are the left and right split pages, which we
+ * still hold write locks on. Both locks will be released here. We
+ * release the rbuf lock once we have a write lock on the page that we
+ * intend to insert a downlink to rbuf on (i.e. buf's current parent page).
+ * The lock on buf is released at the same point as the lock on the parent
+ * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same
+ * atomic operation that completes the split by inserting a new downlink.
+ *
+ * stack - stack showing how we got here. Will be NULL when splitting the
+ *			true root, or during a concurrent root split, where we can be inefficient
+ * isroot - we split the true root
+ * isonly - we split a page alone on its level (might have been fast root)
+ */
+static void
+_bt_insert_parent(Relation rel,
+ Buffer buf,
+ Buffer rbuf,
+ BTStack stack,
+ bool isroot,
+ bool isonly)
+{
+ /*
+ * Here we have to do something Lehman and Yao don't talk about: deal with
+ * a root split and construction of a new root. If our stack is empty
+ * then we have just split a node on what had been the root level when we
+ * descended the tree. If it was still the root then we perform a
+ * new-root construction. If it *wasn't* the root anymore, search to find
+ * the next higher level that someone constructed meanwhile, and find the
+ * right place to insert as for the normal case.
+ *
+ * If we have to search for the parent level, we do so by re-descending
+ * from the root. This is not super-efficient, but it's rare enough not
+ * to matter.
+ */
+ if (isroot)
+ {
+ Buffer rootbuf;
+
+ Assert(stack == NULL);
+ Assert(isonly);
+ /* create a new root node and update the metapage */
+ rootbuf = _bt_newroot(rel, buf, rbuf);
+ /* release the split buffers */
+ _bt_relbuf(rel, rootbuf);
+ _bt_relbuf(rel, rbuf);
+ _bt_relbuf(rel, buf);
+ }
+ else
+ {
+ BlockNumber bknum = BufferGetBlockNumber(buf);
+ BlockNumber rbknum = BufferGetBlockNumber(rbuf);
+ Page page = BufferGetPage(buf);
+ IndexTuple new_item;
+ BTStackData fakestack;
+ IndexTuple ritem;
+ Buffer pbuf;
+
+ if (stack == NULL)
+ {
+ BTPageOpaque opaque;
+
+ elog(DEBUG2, "concurrent ROOT page split");
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * We should never reach here when a leaf page split takes place
+ * despite the insert of newitem being able to apply the fastpath
+ * optimization. Make sure of that with an assertion.
+ *
+ * This is more of a performance issue than a correctness issue.
+ * The fastpath won't have a descent stack. Using a phony stack
+ * here works, but never rely on that. The fastpath should be
+ * rejected within _bt_search_insert() when the rightmost leaf
+ * page will split, since it's faster to go through _bt_search()
+ * and get a stack in the usual way.
+ */
+ Assert(!(P_ISLEAF(opaque) &&
+ BlockNumberIsValid(RelationGetTargetBlock(rel))));
+
+ /* Find the leftmost page at the next level up */
+ pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL);
+ /* Set up a phony stack entry pointing there */
+ stack = &fakestack;
+ stack->bts_blkno = BufferGetBlockNumber(pbuf);
+ stack->bts_offset = InvalidOffsetNumber;
+ stack->bts_parent = NULL;
+ _bt_relbuf(rel, pbuf);
+ }
+
+ /* get high key from left, a strict lower bound for new right page */
+ ritem = (IndexTuple) PageGetItem(page,
+ PageGetItemId(page, P_HIKEY));
+
+ /* form an index tuple that points at the new right page */
+ new_item = CopyIndexTuple(ritem);
+ BTreeTupleSetDownLink(new_item, rbknum);
+
+ /*
+ * Re-find and write lock the parent of buf.
+ *
+ * It's possible that the location of buf's downlink has changed since
+ * our initial _bt_search() descent. _bt_getstackbuf() will detect
+ * and recover from this, updating the stack, which ensures that the
+ * new downlink will be inserted at the correct offset. Even buf's
+ * parent may have changed.
+ */
+ pbuf = _bt_getstackbuf(rel, stack, bknum);
+
+ /*
+ * Unlock the right child. The left child will be unlocked in
+ * _bt_insertonpg().
+ *
+ * Unlocking the right child must be delayed until here to ensure that
+ * no concurrent VACUUM operation can become confused. Page deletion
+ * cannot be allowed to fail to re-find a downlink for the rbuf page.
+ * (Actually, this is just a vestige of how things used to work. The
+ * page deletion code is expected to check for the INCOMPLETE_SPLIT
+ * flag on the left child. It won't attempt deletion of the right
+ * child until the split is complete. Despite all this, we opt to
+ * conservatively delay unlocking the right child until here.)
+ */
+ _bt_relbuf(rel, rbuf);
+
+ if (pbuf == InvalidBuffer)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
+ RelationGetRelationName(rel), bknum, rbknum)));
+
+ /* Recursively insert into the parent */
+ _bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent,
+ new_item, MAXALIGN(IndexTupleSize(new_item)),
+ stack->bts_offset + 1, 0, isonly);
+
+ /* be tidy */
+ pfree(new_item);
+ }
+}
+
+/*
+ * _bt_finish_split() -- Finish an incomplete split
+ *
+ * A crash or other failure can leave a split incomplete. The insertion
+ * routines won't allow insertion on a page that is incompletely split.
+ * Before inserting on such a page, call _bt_finish_split().
+ *
+ * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked
+ * and unpinned.
+ */
+void
+_bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
+{
+ Page lpage = BufferGetPage(lbuf);
+ BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
+ Buffer rbuf;
+ Page rpage;
+ BTPageOpaque rpageop;
+ bool wasroot;
+ bool wasonly;
+
+ Assert(P_INCOMPLETE_SPLIT(lpageop));
+
+ /* Lock right sibling, the one missing the downlink */
+ rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
+ rpage = BufferGetPage(rbuf);
+ rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
+
+ /* Could this be a root split? */
+ if (!stack)
+ {
+ Buffer metabuf;
+ Page metapg;
+ BTMetaPageData *metad;
+
+ /* acquire lock on the metapage */
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
+ metapg = BufferGetPage(metabuf);
+ metad = BTPageGetMeta(metapg);
+
+ wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
+
+ _bt_relbuf(rel, metabuf);
+ }
+ else
+ wasroot = false;
+
+ /* Was this the only page on the level before split? */
+ wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
+
+ elog(DEBUG1, "finishing incomplete split of %u/%u",
+ BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
+
+ _bt_insert_parent(rel, lbuf, rbuf, stack, wasroot, wasonly);
+}
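
A minimal caller-side sketch, not from the patch, of the pattern described above; it mirrors the loop in _bt_getstackbuf() below, where an incomplete split is finished and the same block is simply revisited:

    /* Sketch only: finish a stray incomplete split before using the page */
    buf = _bt_getbuf(rel, blkno, BT_WRITE);
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    if (P_INCOMPLETE_SPLIT(opaque))
    {
        /* releases the lock and pin on buf; retry the block afterwards */
        _bt_finish_split(rel, buf, stack);
        buf = _bt_getbuf(rel, blkno, BT_WRITE);
    }
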
+
+/*
+ * _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
+ * tuple whose downlink points to child page.
+ *
+ * Caller passes child's block number, which is used to identify
+ * associated pivot tuple in parent page using a linear search that
+ * matches on pivot's downlink/block number. The expected location of
+ * the pivot tuple is taken from the stack one level above the child
+ * page. This is used as a starting point. Insertions into the
+ * parent level could cause the pivot tuple to move right; deletions
+ * could cause it to move left, but not left of the page we previously
+ * found it on.
+ *
+ * Caller can use its stack to relocate the pivot tuple/downlink for
+ * any same-level page to the right of the page found by its initial
+ * descent. This is necessary because of the possibility that caller
+ * moved right to recover from a concurrent page split. It's also
+ * convenient for certain callers to be able to step right when there
+ * wasn't a concurrent page split, while still using their original
+ * stack. For example, the checkingunique _bt_doinsert() case may
+ * have to step right when there are many physical duplicates, and its
+ * scantid forces an insertion to the right of the "first page the
+ * value could be on". (This is also relied on by all of our callers
+ * when dealing with !heapkeyspace indexes.)
+ *
+ * Returns write-locked parent page buffer, or InvalidBuffer if pivot
+ * tuple not found (should not happen). Adjusts bts_blkno &
+ * bts_offset if changed. Page split caller should insert its new
+ * pivot tuple for its new right sibling page on parent page, at the
+ * offset number bts_offset + 1.
+ */
+Buffer
+_bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child)
+{
+ BlockNumber blkno;
+ OffsetNumber start;
+
+ blkno = stack->bts_blkno;
+ start = stack->bts_offset;
+
+ for (;;)
+ {
+ Buffer buf;
+ Page page;
+ BTPageOpaque opaque;
+
+ buf = _bt_getbuf(rel, blkno, BT_WRITE);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ if (P_INCOMPLETE_SPLIT(opaque))
+ {
+ _bt_finish_split(rel, buf, stack->bts_parent);
+ continue;
+ }
+
+ if (!P_IGNORE(opaque))
+ {
+ OffsetNumber offnum,
+ minoff,
+ maxoff;
+ ItemId itemid;
+ IndexTuple item;
+
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /*
+ * start = InvalidOffsetNumber means "search the whole page". We
+ * need this test anyway due to possibility that page has a high
+ * key now when it didn't before.
+ */
+ if (start < minoff)
+ start = minoff;
+
+ /*
+ * Need this check too, to guard against possibility that page
+ * split since we visited it originally.
+ */
+ if (start > maxoff)
+ start = OffsetNumberNext(maxoff);
+
+ /*
+ * These loops will check every item on the page --- but in an
+ * order that's attuned to the probability of where it actually
+ * is. Scan to the right first, then to the left.
+ */
+ for (offnum = start;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ itemid = PageGetItemId(page, offnum);
+ item = (IndexTuple) PageGetItem(page, itemid);
+
+ if (BTreeTupleGetDownLink(item) == child)
+ {
+ /* Return accurate pointer to where link is now */
+ stack->bts_blkno = blkno;
+ stack->bts_offset = offnum;
+ return buf;
+ }
+ }
+
+ for (offnum = OffsetNumberPrev(start);
+ offnum >= minoff;
+ offnum = OffsetNumberPrev(offnum))
+ {
+ itemid = PageGetItemId(page, offnum);
+ item = (IndexTuple) PageGetItem(page, itemid);
+
+ if (BTreeTupleGetDownLink(item) == child)
+ {
+ /* Return accurate pointer to where link is now */
+ stack->bts_blkno = blkno;
+ stack->bts_offset = offnum;
+ return buf;
+ }
+ }
+ }
+
+ /*
+ * The item we're looking for moved right at least one page.
+ *
+ * Lehman and Yao couple/chain locks when moving right here, which we
+ * can avoid. See nbtree/README.
+ */
+ if (P_RIGHTMOST(opaque))
+ {
+ _bt_relbuf(rel, buf);
+ return InvalidBuffer;
+ }
+ blkno = opaque->btpo_next;
+ start = InvalidOffsetNumber;
+ _bt_relbuf(rel, buf);
+ }
+}
+
+/*
+ * _bt_newroot() -- Create a new root page for the index.
+ *
+ * We've just split the old root page and need to create a new one.
+ * In order to do this, we add a new root page to the file, then lock
+ * the metadata page and update it. This is guaranteed to be deadlock-
+ * free, because all readers release their locks on the metadata page
+ * before trying to lock the root, and all writers lock the root before
+ * trying to lock the metadata page. We have a write lock on the old
+ * root page, so we have not introduced any cycles into the waits-for
+ * graph.
+ *
+ * On entry, lbuf (the old root) and rbuf (its new peer) are write-
+ * locked. On exit, a new root page exists with entries for the
+ * two new children, metapage is updated and unlocked/unpinned.
+ * The new root buffer is returned to caller which has to unlock/unpin
+ * lbuf, rbuf & rootbuf.
+ */
+static Buffer
+_bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
+{
+ Buffer rootbuf;
+ Page lpage,
+ rootpage;
+ BlockNumber lbkno,
+ rbkno;
+ BlockNumber rootblknum;
+ BTPageOpaque rootopaque;
+ BTPageOpaque lopaque;
+ ItemId itemid;
+ IndexTuple item;
+ IndexTuple left_item;
+ Size left_item_sz;
+ IndexTuple right_item;
+ Size right_item_sz;
+ Buffer metabuf;
+ Page metapg;
+ BTMetaPageData *metad;
+
+ lbkno = BufferGetBlockNumber(lbuf);
+ rbkno = BufferGetBlockNumber(rbuf);
+ lpage = BufferGetPage(lbuf);
+ lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
+
+ /* get a new root page */
+ rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
+ rootpage = BufferGetPage(rootbuf);
+ rootblknum = BufferGetBlockNumber(rootbuf);
+
+ /* acquire lock on the metapage */
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
+ metapg = BufferGetPage(metabuf);
+ metad = BTPageGetMeta(metapg);
+
+ /*
+ * Create downlink item for left page (old root). The key value used is
+ * "minus infinity", a sentinel value that's reliably less than any real
+ * key value that could appear in the left page.
+ */
+ left_item_sz = sizeof(IndexTupleData);
+ left_item = (IndexTuple) palloc(left_item_sz);
+ left_item->t_info = left_item_sz;
+ BTreeTupleSetDownLink(left_item, lbkno);
+ BTreeTupleSetNAtts(left_item, 0, false);
+
+ /*
+ * Create downlink item for right page. The key for it is obtained from
+ * the "high key" position in the left page.
+ */
+ itemid = PageGetItemId(lpage, P_HIKEY);
+ right_item_sz = ItemIdGetLength(itemid);
+ item = (IndexTuple) PageGetItem(lpage, itemid);
+ right_item = CopyIndexTuple(item);
+ BTreeTupleSetDownLink(right_item, rbkno);
+
+ /* NO EREPORT(ERROR) from here till newroot op is logged */
+ START_CRIT_SECTION();
+
+ /* upgrade metapage if needed */
+ if (metad->btm_version < BTREE_NOVAC_VERSION)
+ _bt_upgrademetapage(metapg);
+
+ /* set btree special data */
+ rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+ rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
+ rootopaque->btpo_flags = BTP_ROOT;
+ rootopaque->btpo_level =
+ ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo_level + 1;
+ rootopaque->btpo_cycleid = 0;
+
+ /* update metapage data */
+ metad->btm_root = rootblknum;
+ metad->btm_level = rootopaque->btpo_level;
+ metad->btm_fastroot = rootblknum;
+ metad->btm_fastlevel = rootopaque->btpo_level;
+
+ /*
+ * Insert the left page pointer into the new root page. The root page is
+ * the rightmost page on its level so there is no "high key" in it; the
+ * two items will go into positions P_HIKEY and P_FIRSTKEY.
+ *
+ * Note: we *must* insert the two items in item-number order, for the
+ * benefit of _bt_restore_page().
+ */
+ Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
+ if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
+ false, false) == InvalidOffsetNumber)
+ elog(PANIC, "failed to add leftkey to new root page"
+ " while splitting block %u of index \"%s\"",
+ BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
+
+ /*
+ * insert the right page pointer into the new root page.
+ */
+ Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
+ Assert(BTreeTupleGetNAtts(right_item, rel) <=
+ IndexRelationGetNumberOfKeyAttributes(rel));
+ if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
+ false, false) == InvalidOffsetNumber)
+ elog(PANIC, "failed to add rightkey to new root page"
+ " while splitting block %u of index \"%s\"",
+ BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
+
+ /* Clear the incomplete-split flag in the left child */
+ Assert(P_INCOMPLETE_SPLIT(lopaque));
+ lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
+ MarkBufferDirty(lbuf);
+
+ MarkBufferDirty(rootbuf);
+ MarkBufferDirty(metabuf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_btree_newroot xlrec;
+ XLogRecPtr recptr;
+ xl_btree_metadata md;
+
+ xlrec.rootblk = rootblknum;
+ xlrec.level = metad->btm_level;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
+
+ XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
+ XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
+ XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
+
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ md.version = metad->btm_version;
+ md.root = rootblknum;
+ md.level = metad->btm_level;
+ md.fastroot = rootblknum;
+ md.fastlevel = metad->btm_level;
+ md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
+ md.allequalimage = metad->btm_allequalimage;
+
+ XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
+
+ /*
+		 * Direct access to the page is not ideal, but it's faster; ideally
+		 * a new page API function would handle this.
+ */
+ XLogRegisterBufData(0,
+ (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
+ ((PageHeader) rootpage)->pd_special -
+ ((PageHeader) rootpage)->pd_upper);
+
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
+
+ PageSetLSN(lpage, recptr);
+ PageSetLSN(rootpage, recptr);
+ PageSetLSN(metapg, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /* done with metapage */
+ _bt_relbuf(rel, metabuf);
+
+ pfree(left_item);
+ pfree(right_item);
+
+ return rootbuf;
+}
+
+/*
+ * _bt_pgaddtup() -- add a data item to a particular page during split.
+ *
+ * The difference between this routine and a bare PageAddItem call is
+ * that this code can deal with the first data item on an internal btree
+ * page in passing. This data item (which is called "firstright" within
+ * _bt_split()) has a key that must be treated as minus infinity after
+ * the split. Therefore, we truncate away all attributes when caller
+ * specifies it's the first data item on page (downlink is not changed,
+ * though). This extra step is only needed for the right page of an
+ * internal page split. There is no need to do this for the first data
+ * item on the existing/left page, since that will already have been
+ * truncated during an earlier page split.
+ *
+ * See _bt_split() for a high level explanation of why we truncate here.
+ * Note that this routine has nothing to do with suffix truncation,
+ * despite using some of the same infrastructure.
+ */
+static inline bool
+_bt_pgaddtup(Page page,
+ Size itemsize,
+ IndexTuple itup,
+ OffsetNumber itup_off,
+ bool newfirstdataitem)
+{
+ IndexTupleData trunctuple;
+
+ if (newfirstdataitem)
+ {
+ trunctuple = *itup;
+ trunctuple.t_info = sizeof(IndexTupleData);
+ BTreeTupleSetNAtts(&trunctuple, 0, false);
+ itup = &trunctuple;
+ itemsize = sizeof(IndexTupleData);
+ }
+
+ if (unlikely(PageAddItem(page, (Item) itup, itemsize, itup_off, false,
+ false) == InvalidOffsetNumber))
+ return false;
+
+ return true;
+}
+
+/*
+ * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split.
+ *
+ * There are three operations performed here: simple index deletion, bottom-up
+ * index deletion, and deduplication. If all three operations fail to free
+ * enough space for the incoming item then caller will go on to split the
+ * page. We always consider simple deletion first. If that doesn't work out
+ * we consider alternatives. Callers that only want us to consider simple
+ * deletion (without any fallback) ask for that using the 'simpleonly'
+ * argument.
+ *
+ * We usually pick only one alternative "complex" operation when simple
+ * deletion alone won't prevent a page split. The 'checkingunique',
+ * 'uniquedup', and 'indexUnchanged' arguments are used for that.
+ *
+ * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
+ * level flag was found set. The flag was useful back when there wasn't
+ * necessarily one single page for a duplicate tuple to go on (before heap TID
+ * became a part of the key space in version 4 indexes). But we don't
+ * actually look at the flag anymore (it's not a gating condition for our
+ * caller). That would cause us to miss tuples that are safe to delete,
+ * without getting any benefit in return. We know that the alternative is to
+ * split the page; scanning the line pointer array in passing won't have
+ * noticeable overhead. (We still maintain the BTP_HAS_GARBAGE flag despite
+ * all this because !heapkeyspace indexes must still do a "getting tired"
+ * linear search, and so are likely to get some benefit from using it as a
+ * gating condition.)
+ */
+static void
+_bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
+ BTInsertState insertstate,
+ bool simpleonly, bool checkingunique,
+ bool uniquedup, bool indexUnchanged)
+{
+ OffsetNumber deletable[MaxIndexTuplesPerPage];
+ int ndeletable = 0;
+ OffsetNumber offnum,
+ minoff,
+ maxoff;
+ Buffer buffer = insertstate->buf;
+ BTScanInsert itup_key = insertstate->itup_key;
+ Page page = BufferGetPage(buffer);
+ BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ Assert(P_ISLEAF(opaque));
+ Assert(simpleonly || itup_key->heapkeyspace);
+ Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));
+
+ /*
+ * Scan over all items to see which ones need to be deleted according to
+ * LP_DEAD flags. We'll usually manage to delete a few extra items that
+ * are not marked LP_DEAD in passing. Often the extra items that actually
+ * end up getting deleted are items that would have had their LP_DEAD bit
+ * set before long anyway (if we opted not to include them as extras).
+ */
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemId = PageGetItemId(page, offnum);
+
+ if (ItemIdIsDead(itemId))
+ deletable[ndeletable++] = offnum;
+ }
+
+ if (ndeletable > 0)
+ {
+ _bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
+ insertstate->itup, minoff, maxoff);
+ insertstate->bounds_valid = false;
+
+ /* Return when a page split has already been avoided */
+ if (PageGetFreeSpace(page) >= insertstate->itemsz)
+ return;
+
+ /* Might as well assume duplicates (if checkingunique) */
+ uniquedup = true;
+ }
+
+ /*
+ * We're done with simple deletion. Return early with callers that only
+ * call here so that simple deletion can be considered. This includes
+ * callers that explicitly ask for this and checkingunique callers that
+ * probably don't have any version churn duplicates on the page.
+ *
+ * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
+	 * return at this point (or when we go on to try either or both of our
+ * other strategies and they also fail). We do not bother expending a
+ * separate write to clear it, however. Caller will definitely clear it
+ * when it goes on to split the page (note also that the deduplication
+ * process will clear the flag in passing, just to keep things tidy).
+ */
+ if (simpleonly || (checkingunique && !uniquedup))
+ {
+ Assert(!indexUnchanged);
+ return;
+ }
+
+ /* Assume bounds about to be invalidated (this is almost certain now) */
+ insertstate->bounds_valid = false;
+
+ /*
+ * Perform bottom-up index deletion pass when executor hint indicated that
+ * incoming item is logically unchanged, or for a unique index that is
+ * known to have physical duplicates for some other reason. (There is a
+ * large overlap between these two cases for a unique index. It's worth
+ * having both triggering conditions in order to apply the optimization in
+ * the event of successive related INSERT and DELETE statements.)
+ *
+ * We'll go on to do a deduplication pass when a bottom-up pass fails to
+ * delete an acceptable amount of free space (a significant fraction of
+ * the page, or space for the new item, whichever is greater).
+ *
+ * Note: Bottom-up index deletion uses the same equality/equivalence
+ * routines as deduplication internally. However, it does not merge
+ * together index tuples, so the same correctness considerations do not
+ * apply. We deliberately omit an index-is-allequalimage test here.
+ */
+ if ((indexUnchanged || uniquedup) &&
+ _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
+ return;
+
+ /* Perform deduplication pass (when enabled and index-is-allequalimage) */
+ if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
+ _bt_dedup_pass(rel, buffer, heapRel, insertstate->itup,
+ insertstate->itemsz, (indexUnchanged || uniquedup));
+}
+
+/*
+ * _bt_simpledel_pass - Simple index tuple deletion pass.
+ *
+ * We delete all LP_DEAD-set index tuples on a leaf page. The offset numbers
+ * of all such tuples are determined by caller (caller passes these to us as
+ * its 'deletable' argument).
+ *
+ * We might also delete extra index tuples that turn out to be safe to delete
+ * in passing (though they must be cheap to check in passing to begin with).
+ * There is no certainty that any extra tuples will be deleted, though. The
+ * high level goal of the approach we take is to get the most out of each call
+ * here (without noticeably increasing the per-call overhead compared to what
+ * we need to do just to be able to delete the page's LP_DEAD-marked index
+ * tuples).
+ *
+ * The number of extra index tuples that turn out to be deletable might
+ * greatly exceed the number of LP_DEAD-marked index tuples due to various
+ * locality related effects. For example, it's possible that the total number
+ * of table blocks (pointed to by all TIDs on the leaf page) is naturally
+ * quite low, in which case we might end up checking if it's possible to
+ * delete _most_ index tuples on the page (without the tableam needing to
+ * access additional table blocks). The tableam will sometimes stumble upon
+ * _many_ extra deletable index tuples in indexes where this pattern is
+ * common.
+ *
+ * See nbtree/README for further details on simple index tuple deletion.
+ */
+static void
+_bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
+ OffsetNumber *deletable, int ndeletable, IndexTuple newitem,
+ OffsetNumber minoff, OffsetNumber maxoff)
+{
+ Page page = BufferGetPage(buffer);
+ BlockNumber *deadblocks;
+ int ndeadblocks;
+ TM_IndexDeleteOp delstate;
+ OffsetNumber offnum;
+
+ /* Get array of table blocks pointed to by LP_DEAD-set tuples */
+ deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem,
+ &ndeadblocks);
+
+ /* Initialize tableam state that describes index deletion operation */
+ delstate.bottomup = false;
+ delstate.bottomupfreespace = 0;
+ delstate.ndeltids = 0;
+ delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
+ delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
+
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemid = PageGetItemId(page, offnum);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
+ TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
+ TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];
+ BlockNumber tidblock;
+ void *match;
+
+ if (!BTreeTupleIsPosting(itup))
+ {
+ tidblock = ItemPointerGetBlockNumber(&itup->t_tid);
+ match = bsearch(&tidblock, deadblocks, ndeadblocks,
+ sizeof(BlockNumber), _bt_blk_cmp);
+
+ if (!match)
+ {
+ Assert(!ItemIdIsDead(itemid));
+ continue;
+ }
+
+ /*
+ * TID's table block is among those pointed to by the TIDs from
+ * LP_DEAD-bit set tuples on page -- add TID to deltids
+ */
+ odeltid->tid = itup->t_tid;
+ odeltid->id = delstate.ndeltids;
+ ostatus->idxoffnum = offnum;
+ ostatus->knowndeletable = ItemIdIsDead(itemid);
+ ostatus->promising = false; /* unused */
+ ostatus->freespace = 0; /* unused */
+
+ delstate.ndeltids++;
+ }
+ else
+ {
+ int nitem = BTreeTupleGetNPosting(itup);
+
+ for (int p = 0; p < nitem; p++)
+ {
+ ItemPointer tid = BTreeTupleGetPostingN(itup, p);
+
+ tidblock = ItemPointerGetBlockNumber(tid);
+ match = bsearch(&tidblock, deadblocks, ndeadblocks,
+ sizeof(BlockNumber), _bt_blk_cmp);
+
+ if (!match)
+ {
+ Assert(!ItemIdIsDead(itemid));
+ continue;
+ }
+
+ /*
+ * TID's table block is among those pointed to by the TIDs
+ * from LP_DEAD-bit set tuples on page -- add TID to deltids
+ */
+ odeltid->tid = *tid;
+ odeltid->id = delstate.ndeltids;
+ ostatus->idxoffnum = offnum;
+ ostatus->knowndeletable = ItemIdIsDead(itemid);
+ ostatus->promising = false; /* unused */
+ ostatus->freespace = 0; /* unused */
+
+ odeltid++;
+ ostatus++;
+ delstate.ndeltids++;
+ }
+ }
+ }
+
+ pfree(deadblocks);
+
+ Assert(delstate.ndeltids >= ndeletable);
+
+ /* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
+ _bt_delitems_delete_check(rel, buffer, heapRel, &delstate);
+
+ pfree(delstate.deltids);
+ pfree(delstate.status);
+}
+
+/*
+ * _bt_deadblocks() -- Get LP_DEAD related table blocks.
+ *
+ * Builds sorted and unique-ified array of table block numbers from index
+ * tuple TIDs whose line pointers are marked LP_DEAD. Also adds the table
+ * block from incoming newitem just in case it isn't among the LP_DEAD-related
+ * table blocks.
+ *
+ * Always counting the newitem's table block as an LP_DEAD related block makes
+ * sense because the cost is consistently low; it is practically certain that
+ * the table block will not incur a buffer miss in tableam. On the other hand
+ * the benefit is often quite high. There is a decent chance that there will
+ * be some deletable items from this block, since in general most garbage
+ * tuples became garbage in the recent past (in many cases this won't be the
+ * first logical row that core code added to/modified in table block
+ * recently).
+ *
+ * Returns final array, and sets *nblocks to its final size for caller.
+ */
+static BlockNumber *
+_bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable,
+ IndexTuple newitem, int *nblocks)
+{
+ int spacentids,
+ ntids;
+ BlockNumber *tidblocks;
+
+ /*
+ * Accumulate each TID's block in array whose initial size has space for
+ * one table block per LP_DEAD-set tuple (plus space for the newitem table
+ * block). Array will only need to grow when there are LP_DEAD-marked
+ * posting list tuples (which is not that common).
+ */
+ spacentids = ndeletable + 1;
+ ntids = 0;
+ tidblocks = (BlockNumber *) palloc(sizeof(BlockNumber) * spacentids);
+
+ /*
+ * First add the table block for the incoming newitem. This is the one
+ * case where simple deletion can visit a table block that doesn't have
+ * any known deletable items.
+ */
+ Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
+ tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);
+
+ for (int i = 0; i < ndeletable; i++)
+ {
+ ItemId itemid = PageGetItemId(page, deletable[i]);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
+
+ Assert(ItemIdIsDead(itemid));
+
+ if (!BTreeTupleIsPosting(itup))
+ {
+ if (ntids + 1 > spacentids)
+ {
+ spacentids *= 2;
+ tidblocks = (BlockNumber *)
+ repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
+ }
+
+ tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
+ }
+ else
+ {
+ int nposting = BTreeTupleGetNPosting(itup);
+
+ if (ntids + nposting > spacentids)
+ {
+ spacentids = Max(spacentids * 2, ntids + nposting);
+ tidblocks = (BlockNumber *)
+ repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
+ }
+
+ for (int j = 0; j < nposting; j++)
+ {
+ ItemPointer tid = BTreeTupleGetPostingN(itup, j);
+
+ tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
+ }
+ }
+ }
+
+ qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
+ *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
+
+ return tidblocks;
+}
+
+/*
+ * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass
+ */
+static inline int
+_bt_blk_cmp(const void *arg1, const void *arg2)
+{
+ BlockNumber b1 = *((BlockNumber *) arg1);
+ BlockNumber b2 = *((BlockNumber *) arg2);
+
+ if (b1 < b2)
+ return -1;
+ else if (b1 > b2)
+ return 1;
+
+ return 0;
+}
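
For illustration only (not part of the patch), this is how _bt_blk_cmp() is used by _bt_deadblocks() to sort and unique-ify its block array; the literal block numbers are invented:

    /* Sketch only: sort then de-duplicate a small block-number array */
    BlockNumber blocks[] = {7, 3, 7, 12, 3};
    int         nblocks = lengthof(blocks);

    qsort(blocks, nblocks, sizeof(BlockNumber), _bt_blk_cmp);
    /* blocks is now {3, 3, 7, 7, 12} */
    nblocks = qunique(blocks, nblocks, sizeof(BlockNumber), _bt_blk_cmp);
    /* the first nblocks == 3 entries are now {3, 7, 12} */
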
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
new file mode 100644
index 0000000..ebec8fa
--- /dev/null
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -0,0 +1,3073 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtpage.c
+ * BTree-specific page management code for the Postgres btree access
+ * method.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtpage.c
+ *
+ * NOTES
+ * Postgres btree pages look like ordinary relation pages. The opaque
+ * data at high addresses includes pointers to left and right siblings
+ * and flag data describing page state. The first page in a btree, page
+ * zero, is special -- it stores meta-information describing the tree.
+ * Pages one and higher store the actual tree data.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "access/nbtxlog.h"
+#include "access/tableam.h"
+#include "access/transam.h"
+#include "access/xlog.h"
+#include "access/xloginsert.h"
+#include "miscadmin.h"
+#include "storage/indexfsm.h"
+#include "storage/lmgr.h"
+#include "storage/predicate.h"
+#include "storage/procarray.h"
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+#include "utils/snapmgr.h"
+
+static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
+static void _bt_log_reuse_page(Relation rel, BlockNumber blkno,
+ FullTransactionId safexid);
+static void _bt_delitems_delete(Relation rel, Buffer buf,
+ TransactionId latestRemovedXid,
+ OffsetNumber *deletable, int ndeletable,
+ BTVacuumPosting *updatable, int nupdatable);
+static char *_bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,
+ OffsetNumber *updatedoffsets,
+ Size *updatedbuflen, bool needswal);
+static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf,
+ BTStack stack);
+static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
+ BlockNumber scanblkno,
+ bool *rightsib_empty,
+ BTVacState *vstate);
+static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child,
+ BTStack stack,
+ Buffer *subtreeparent,
+ OffsetNumber *poffset,
+ BlockNumber *topparent,
+ BlockNumber *topparentrightsib);
+static void _bt_pendingfsm_add(BTVacState *vstate, BlockNumber target,
+ FullTransactionId safexid);
+
+/*
+ * _bt_initmetapage() -- Fill a page buffer with a correct metapage image
+ */
+void
+_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
+ bool allequalimage)
+{
+ BTMetaPageData *metad;
+ BTPageOpaque metaopaque;
+
+ _bt_pageinit(page, BLCKSZ);
+
+ metad = BTPageGetMeta(page);
+ metad->btm_magic = BTREE_MAGIC;
+ metad->btm_version = BTREE_VERSION;
+ metad->btm_root = rootbknum;
+ metad->btm_level = level;
+ metad->btm_fastroot = rootbknum;
+ metad->btm_fastlevel = level;
+ metad->btm_last_cleanup_num_delpages = 0;
+ metad->btm_last_cleanup_num_heap_tuples = -1.0;
+ metad->btm_allequalimage = allequalimage;
+
+ metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ metaopaque->btpo_flags = BTP_META;
+
+ /*
+ * Set pd_lower just past the end of the metadata. This is essential,
+ * because without doing so, metadata will be lost if xlog.c compresses
+ * the page.
+ */
+ ((PageHeader) page)->pd_lower =
+ ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
+}
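
A hedged usage sketch, not from this file: an empty index is created by initializing only the metapage, roughly what btbuildempty() in nbtree.c does. The local variable names here are assumptions of the sketch:

    /* Sketch only: an empty btree is just an initialized metapage */
    Page    metapage = (Page) palloc(BLCKSZ);

    /* P_NONE root and level 0 mean "no root yet"; _bt_getroot() builds it */
    _bt_initmetapage(metapage, P_NONE, 0, allequalimage);
    /* caller then writes out and WAL-logs the page, as btbuildempty() does */
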
+
+/*
+ * _bt_upgrademetapage() -- Upgrade a meta-page from an old format to version
+ * 3, the last version that can be updated without broadly affecting
+ * on-disk compatibility. (A REINDEX is required to upgrade to v4.)
+ *
+ * This routine does purely in-memory image upgrade. Caller is
+ * responsible for locking, WAL-logging etc.
+ */
+void
+_bt_upgrademetapage(Page page)
+{
+ BTMetaPageData *metad;
+ BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY;
+
+ metad = BTPageGetMeta(page);
+ metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /* It must be really a meta page of upgradable version */
+ Assert(metaopaque->btpo_flags & BTP_META);
+ Assert(metad->btm_version < BTREE_NOVAC_VERSION);
+ Assert(metad->btm_version >= BTREE_MIN_VERSION);
+
+ /* Set version number and fill extra fields added into version 3 */
+ metad->btm_version = BTREE_NOVAC_VERSION;
+ metad->btm_last_cleanup_num_delpages = 0;
+ metad->btm_last_cleanup_num_heap_tuples = -1.0;
+ /* Only a REINDEX can set this field */
+ Assert(!metad->btm_allequalimage);
+ metad->btm_allequalimage = false;
+
+ /* Adjust pd_lower (see _bt_initmetapage() for details) */
+ ((PageHeader) page)->pd_lower =
+ ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
+}
+
+/*
+ * Get metadata from share-locked buffer containing metapage, while performing
+ * standard sanity checks.
+ *
+ * Callers that cache data returned here in local cache should note that an
+ * on-the-fly upgrade using _bt_upgrademetapage() can change the version field
+ * and BTREE_NOVAC_VERSION specific fields without invalidating local cache.
+ */
+static BTMetaPageData *
+_bt_getmeta(Relation rel, Buffer metabuf)
+{
+ Page metapg;
+ BTPageOpaque metaopaque;
+ BTMetaPageData *metad;
+
+ metapg = BufferGetPage(metabuf);
+ metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
+ metad = BTPageGetMeta(metapg);
+
+ /* sanity-check the metapage */
+ if (!P_ISMETA(metaopaque) ||
+ metad->btm_magic != BTREE_MAGIC)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("index \"%s\" is not a btree",
+ RelationGetRelationName(rel))));
+
+ if (metad->btm_version < BTREE_MIN_VERSION ||
+ metad->btm_version > BTREE_VERSION)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("version mismatch in index \"%s\": file version %d, "
+ "current version %d, minimal supported version %d",
+ RelationGetRelationName(rel),
+ metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
+
+ return metad;
+}
+
+/*
+ * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup
+ *
+ * Called by btvacuumcleanup when btbulkdelete was never called because no
+ * index tuples needed to be deleted.
+ */
+bool
+_bt_vacuum_needs_cleanup(Relation rel)
+{
+ Buffer metabuf;
+ Page metapg;
+ BTMetaPageData *metad;
+ uint32 btm_version;
+ BlockNumber prev_num_delpages;
+
+ /*
+ * Copy details from metapage to local variables quickly.
+ *
+ * Note that we deliberately avoid using cached version of metapage here.
+ */
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
+ metapg = BufferGetPage(metabuf);
+ metad = BTPageGetMeta(metapg);
+ btm_version = metad->btm_version;
+
+ if (btm_version < BTREE_NOVAC_VERSION)
+ {
+ /*
+ * Metapage needs to be dynamically upgraded to store fields that are
+ * only present when btm_version >= BTREE_NOVAC_VERSION
+ */
+ _bt_relbuf(rel, metabuf);
+ return true;
+ }
+
+ prev_num_delpages = metad->btm_last_cleanup_num_delpages;
+ _bt_relbuf(rel, metabuf);
+
+ /*
+ * Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the
+ * total size of the index. We can reasonably expect (though are not
+ * guaranteed) to be able to recycle this many pages if we decide to do a
+ * btvacuumscan call during the ongoing btvacuumcleanup. For further
+ * details see the nbtree/README section on placing deleted pages in the
+ * FSM.
+ */
+ if (prev_num_delpages > 0 &&
+ prev_num_delpages > RelationGetNumberOfBlocks(rel) / 20)
+ return true;
+
+ return false;
+}
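
Reading the 5% threshold above with concrete (invented) numbers may help; this is only an illustration:

    /* Sketch only: the 5% cleanup threshold in concrete numbers */
    BlockNumber total_blocks = 10000;          /* hypothetical index size */
    BlockNumber threshold = total_blocks / 20; /* 500 pages, i.e. 5% */

    /* cleanup is requested once prev_num_delpages exceeds 'threshold' */
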
+
+/*
+ * _bt_set_cleanup_info() -- Update metapage for btvacuumcleanup.
+ *
+ * Called at the end of btvacuumcleanup, when num_delpages value has been
+ * finalized.
+ */
+void
+_bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)
+{
+ Buffer metabuf;
+ Page metapg;
+ BTMetaPageData *metad;
+
+ /*
+ * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage
+ * field started out as a TransactionId field called btm_oldest_btpo_xact.
+ * Both "versions" are just uint32 fields. It was convenient to repurpose
+ * the field when we began to use 64-bit XIDs in deleted pages.
+ *
+ * It's possible that a pg_upgrade'd database will contain an XID value in
+ * what is now recognized as the metapage's btm_last_cleanup_num_delpages
+ * field. _bt_vacuum_needs_cleanup() may even believe that this value
+ * indicates that there are lots of pages that it needs to recycle, when
+ * in reality there are only one or two. The worst that can happen is
+ * that there will be a call to btvacuumscan a little earlier, which will
+ * set btm_last_cleanup_num_delpages to a sane value when we're called.
+ *
+ * Note also that the metapage's btm_last_cleanup_num_heap_tuples field is
+ * no longer used as of PostgreSQL 14. We set it to -1.0 on rewrite, just
+ * to be consistent.
+ */
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
+ metapg = BufferGetPage(metabuf);
+ metad = BTPageGetMeta(metapg);
+
+	/* Don't miss the chance to upgrade an index still at BTREE_MIN_VERSION */
+ if (metad->btm_version >= BTREE_NOVAC_VERSION &&
+ metad->btm_last_cleanup_num_delpages == num_delpages)
+ {
+ /* Usually means index continues to have num_delpages of 0 */
+ _bt_relbuf(rel, metabuf);
+ return;
+ }
+
+ /* trade in our read lock for a write lock */
+ _bt_unlockbuf(rel, metabuf);
+ _bt_lockbuf(rel, metabuf, BT_WRITE);
+
+ START_CRIT_SECTION();
+
+ /* upgrade meta-page if needed */
+ if (metad->btm_version < BTREE_NOVAC_VERSION)
+ _bt_upgrademetapage(metapg);
+
+ /* update cleanup-related information */
+ metad->btm_last_cleanup_num_delpages = num_delpages;
+ metad->btm_last_cleanup_num_heap_tuples = -1.0;
+ MarkBufferDirty(metabuf);
+
+ /* write wal record if needed */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_btree_metadata md;
+ XLogRecPtr recptr;
+
+ XLogBeginInsert();
+ XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
+
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ md.version = metad->btm_version;
+ md.root = metad->btm_root;
+ md.level = metad->btm_level;
+ md.fastroot = metad->btm_fastroot;
+ md.fastlevel = metad->btm_fastlevel;
+ md.last_cleanup_num_delpages = num_delpages;
+ md.allequalimage = metad->btm_allequalimage;
+
+ XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));
+
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);
+
+ PageSetLSN(metapg, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ _bt_relbuf(rel, metabuf);
+}
+
+/*
+ * _bt_getroot() -- Get the root page of the btree.
+ *
+ * Since the root page can move around the btree file, we have to read
+ * its location from the metadata page, and then read the root page
+ * itself. If no root page exists yet, we have to create one.
+ *
+ * The access type parameter (BT_READ or BT_WRITE) controls whether
+ * a new root page will be created or not. If access = BT_READ,
+ * and no root page exists, we just return InvalidBuffer. For
+ * BT_WRITE, we try to create the root page if it doesn't exist.
+ * NOTE that the returned root page will have only a read lock set
+ * on it even if access = BT_WRITE!
+ *
+ * The returned page is not necessarily the true root --- it could be
+ * a "fast root" (a page that is alone in its level due to deletions).
+ * Also, if the root page is split while we are "in flight" to it,
+ * what we will return is the old root, which is now just the leftmost
+ * page on a probably-not-very-wide level. For most purposes this is
+ * as good as or better than the true root, so we do not bother to
+ * insist on finding the true root. We do, however, guarantee to
+ * return a live (not deleted or half-dead) page.
+ *
+ * On successful return, the root page is pinned and read-locked.
+ * The metadata page is not locked or pinned on exit.
+ */
+Buffer
+_bt_getroot(Relation rel, int access)
+{
+ Buffer metabuf;
+ Buffer rootbuf;
+ Page rootpage;
+ BTPageOpaque rootopaque;
+ BlockNumber rootblkno;
+ uint32 rootlevel;
+ BTMetaPageData *metad;
+
+ /*
+ * Try to use previously-cached metapage data to find the root. This
+ * normally saves one buffer access per index search, which is a very
+ * helpful savings in bufmgr traffic and hence contention.
+ */
+ if (rel->rd_amcache != NULL)
+ {
+ metad = (BTMetaPageData *) rel->rd_amcache;
+ /* We shouldn't have cached it if any of these fail */
+ Assert(metad->btm_magic == BTREE_MAGIC);
+ Assert(metad->btm_version >= BTREE_MIN_VERSION);
+ Assert(metad->btm_version <= BTREE_VERSION);
+ Assert(!metad->btm_allequalimage ||
+ metad->btm_version > BTREE_NOVAC_VERSION);
+ Assert(metad->btm_root != P_NONE);
+
+ rootblkno = metad->btm_fastroot;
+ Assert(rootblkno != P_NONE);
+ rootlevel = metad->btm_fastlevel;
+
+ rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
+ rootpage = BufferGetPage(rootbuf);
+ rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+
+ /*
+ * Since the cache might be stale, we check the page more carefully
+ * here than normal. We *must* check that it's not deleted. If it's
+ * not alone on its level, then we reject too --- this may be overly
+ * paranoid but better safe than sorry. Note we don't check P_ISROOT,
+ * because that's not set in a "fast root".
+ */
+ if (!P_IGNORE(rootopaque) &&
+ rootopaque->btpo_level == rootlevel &&
+ P_LEFTMOST(rootopaque) &&
+ P_RIGHTMOST(rootopaque))
+ {
+ /* OK, accept cached page as the root */
+ return rootbuf;
+ }
+ _bt_relbuf(rel, rootbuf);
+ /* Cache is stale, throw it away */
+ if (rel->rd_amcache)
+ pfree(rel->rd_amcache);
+ rel->rd_amcache = NULL;
+ }
+
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
+ metad = _bt_getmeta(rel, metabuf);
+
+ /* if no root page initialized yet, do it */
+ if (metad->btm_root == P_NONE)
+ {
+ Page metapg;
+
+ /* If access = BT_READ, caller doesn't want us to create root yet */
+ if (access == BT_READ)
+ {
+ _bt_relbuf(rel, metabuf);
+ return InvalidBuffer;
+ }
+
+ /* trade in our read lock for a write lock */
+ _bt_unlockbuf(rel, metabuf);
+ _bt_lockbuf(rel, metabuf, BT_WRITE);
+
+ /*
+ * Race condition: if someone else initialized the metadata between
+ * the time we released the read lock and acquired the write lock, we
+ * must avoid doing it again.
+ */
+ if (metad->btm_root != P_NONE)
+ {
+ /*
+ * Metadata initialized by someone else. In order to guarantee no
+ * deadlocks, we have to release the metadata page and start all
+ * over again. (Is that really true? But it's hardly worth trying
+ * to optimize this case.)
+ */
+ _bt_relbuf(rel, metabuf);
+ return _bt_getroot(rel, access);
+ }
+
+ /*
+ * Get, initialize, write, and leave a lock of the appropriate type on
+ * the new root page. Since this is the first page in the tree, it's
+ * a leaf as well as the root.
+ */
+ rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
+ rootblkno = BufferGetBlockNumber(rootbuf);
+ rootpage = BufferGetPage(rootbuf);
+ rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+ rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
+ rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
+ rootopaque->btpo_level = 0;
+ rootopaque->btpo_cycleid = 0;
+ /* Get raw page pointer for metapage */
+ metapg = BufferGetPage(metabuf);
+
+ /* NO ELOG(ERROR) till meta is updated */
+ START_CRIT_SECTION();
+
+ /* upgrade metapage if needed */
+ if (metad->btm_version < BTREE_NOVAC_VERSION)
+ _bt_upgrademetapage(metapg);
+
+ metad->btm_root = rootblkno;
+ metad->btm_level = 0;
+ metad->btm_fastroot = rootblkno;
+ metad->btm_fastlevel = 0;
+ metad->btm_last_cleanup_num_delpages = 0;
+ metad->btm_last_cleanup_num_heap_tuples = -1.0;
+
+ MarkBufferDirty(rootbuf);
+ MarkBufferDirty(metabuf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_btree_newroot xlrec;
+ XLogRecPtr recptr;
+ xl_btree_metadata md;
+
+ XLogBeginInsert();
+ XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
+ XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
+
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ md.version = metad->btm_version;
+ md.root = rootblkno;
+ md.level = 0;
+ md.fastroot = rootblkno;
+ md.fastlevel = 0;
+ md.last_cleanup_num_delpages = 0;
+ md.allequalimage = metad->btm_allequalimage;
+
+ XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
+
+ xlrec.rootblk = rootblkno;
+ xlrec.level = 0;
+
+ XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
+
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
+
+ PageSetLSN(rootpage, recptr);
+ PageSetLSN(metapg, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /*
+ * swap root write lock for read lock. There is no danger of anyone
+ * else accessing the new root page while it's unlocked, since no one
+ * else knows where it is yet.
+ */
+ _bt_unlockbuf(rel, rootbuf);
+ _bt_lockbuf(rel, rootbuf, BT_READ);
+
+ /* okay, metadata is correct, release lock on it without caching */
+ _bt_relbuf(rel, metabuf);
+ }
+ else
+ {
+ rootblkno = metad->btm_fastroot;
+ Assert(rootblkno != P_NONE);
+ rootlevel = metad->btm_fastlevel;
+
+ /*
+ * Cache the metapage data for next time
+ */
+ rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
+ sizeof(BTMetaPageData));
+ memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
+
+ /*
+ * We are done with the metapage; arrange to release it via first
+ * _bt_relandgetbuf call
+ */
+ rootbuf = metabuf;
+
+ for (;;)
+ {
+ rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
+ rootpage = BufferGetPage(rootbuf);
+ rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+
+ if (!P_IGNORE(rootopaque))
+ break;
+
+ /* it's dead, Jim. step right one page */
+ if (P_RIGHTMOST(rootopaque))
+ elog(ERROR, "no live root page found in index \"%s\"",
+ RelationGetRelationName(rel));
+ rootblkno = rootopaque->btpo_next;
+ }
+
+ if (rootopaque->btpo_level != rootlevel)
+ elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
+ rootblkno, RelationGetRelationName(rel),
+ rootopaque->btpo_level, rootlevel);
+ }
+
+ /*
+ * By here, we have a pin and read lock on the root page, and no lock set
+ * on the metadata page. Return the root page's buffer.
+ */
+ return rootbuf;
+}
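+
+/*
+ * Sketch of how a read-only caller typically consumes _bt_getroot().  This is
+ * purely illustrative (the caller shown here is hypothetical); it only spells
+ * out the InvalidBuffer contract for the BT_READ case described above:
+ *
+ *		rootbuf = _bt_getroot(rel, BT_READ);
+ *		if (!BufferIsValid(rootbuf))
+ *			return;				(empty index: no root page exists yet)
+ *		rootpage = BufferGetPage(rootbuf);
+ *		... descend from the (fast) root, then _bt_relbuf(rel, rootbuf) ...
+ */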
+
+/*
+ * _bt_gettrueroot() -- Get the true root page of the btree.
+ *
+ * This is the same as the BT_READ case of _bt_getroot(), except
+ * we follow the true-root link not the fast-root link.
+ *
+ * By the time we acquire lock on the root page, it might have been split and
+ * not be the true root anymore. This is okay for the present uses of this
+ * routine; we only really need to be able to move up at least one tree level
+ * from whatever non-root page we were at. If we ever do need to lock the
+ * one true root page, we could loop here, re-reading the metapage on each
+ * failure. (Note that it wouldn't do to hold the lock on the metapage while
+ * moving to the root --- that'd deadlock against any concurrent root split.)
+ */
+Buffer
+_bt_gettrueroot(Relation rel)
+{
+ Buffer metabuf;
+ Page metapg;
+ BTPageOpaque metaopaque;
+ Buffer rootbuf;
+ Page rootpage;
+ BTPageOpaque rootopaque;
+ BlockNumber rootblkno;
+ uint32 rootlevel;
+ BTMetaPageData *metad;
+
+ /*
+ * We don't try to use cached metapage data here, since (a) this path is
+ * not performance-critical, and (b) if we are here it suggests our cache
+ * is out-of-date anyway. In light of point (b), it's probably safest to
+ * actively flush any cached metapage info.
+ */
+ if (rel->rd_amcache)
+ pfree(rel->rd_amcache);
+ rel->rd_amcache = NULL;
+
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
+ metapg = BufferGetPage(metabuf);
+ metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
+ metad = BTPageGetMeta(metapg);
+
+ if (!P_ISMETA(metaopaque) ||
+ metad->btm_magic != BTREE_MAGIC)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("index \"%s\" is not a btree",
+ RelationGetRelationName(rel))));
+
+ if (metad->btm_version < BTREE_MIN_VERSION ||
+ metad->btm_version > BTREE_VERSION)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("version mismatch in index \"%s\": file version %d, "
+ "current version %d, minimal supported version %d",
+ RelationGetRelationName(rel),
+ metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
+
+ /* if no root page initialized yet, fail */
+ if (metad->btm_root == P_NONE)
+ {
+ _bt_relbuf(rel, metabuf);
+ return InvalidBuffer;
+ }
+
+ rootblkno = metad->btm_root;
+ rootlevel = metad->btm_level;
+
+ /*
+ * We are done with the metapage; arrange to release it via first
+ * _bt_relandgetbuf call
+ */
+ rootbuf = metabuf;
+
+ for (;;)
+ {
+ rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
+ rootpage = BufferGetPage(rootbuf);
+ rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+
+ if (!P_IGNORE(rootopaque))
+ break;
+
+ /* it's dead, Jim. step right one page */
+ if (P_RIGHTMOST(rootopaque))
+ elog(ERROR, "no live root page found in index \"%s\"",
+ RelationGetRelationName(rel));
+ rootblkno = rootopaque->btpo_next;
+ }
+
+ if (rootopaque->btpo_level != rootlevel)
+ elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
+ rootblkno, RelationGetRelationName(rel),
+ rootopaque->btpo_level, rootlevel);
+
+ return rootbuf;
+}
+
+/*
+ * _bt_getrootheight() -- Get the height of the btree search tree.
+ *
+ * We return the level (counting from zero) of the current fast root.
+ * This represents the number of tree levels we'd have to descend through
+ * to start any btree index search.
+ *
+ * This is used by the planner for cost-estimation purposes. Since it's
+ * only an estimate, slightly-stale data is fine, hence we don't worry
+ * about updating previously cached data.
+ */
+int
+_bt_getrootheight(Relation rel)
+{
+ BTMetaPageData *metad;
+
+ if (rel->rd_amcache == NULL)
+ {
+ Buffer metabuf;
+
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
+ metad = _bt_getmeta(rel, metabuf);
+
+ /*
+ * If there's no root page yet, _bt_getroot() doesn't expect a cache
+ * to be made, so just stop here and report the index height is zero.
+ * (XXX perhaps _bt_getroot() should be changed to allow this case.)
+ */
+ if (metad->btm_root == P_NONE)
+ {
+ _bt_relbuf(rel, metabuf);
+ return 0;
+ }
+
+ /*
+ * Cache the metapage data for next time
+ */
+ rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
+ sizeof(BTMetaPageData));
+ memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
+ _bt_relbuf(rel, metabuf);
+ }
+
+ /* Get cached page */
+ metad = (BTMetaPageData *) rel->rd_amcache;
+ /* We shouldn't have cached it if any of these fail */
+ Assert(metad->btm_magic == BTREE_MAGIC);
+ Assert(metad->btm_version >= BTREE_MIN_VERSION);
+ Assert(metad->btm_version <= BTREE_VERSION);
+ Assert(!metad->btm_allequalimage ||
+ metad->btm_version > BTREE_NOVAC_VERSION);
+ Assert(metad->btm_fastroot != P_NONE);
+
+ return metad->btm_fastlevel;
+}
+
+/*
+ * _bt_metaversion() -- Get version/status info from metapage.
+ *
+ * Sets caller's *heapkeyspace and *allequalimage arguments using data
+ * from the B-Tree metapage (could be locally-cached version). This
+ * information needs to be stashed in insertion scankey, so we provide a
+ * single function that fetches both at once.
+ *
+ * This is used to determine the rules that must be used to descend a
+ * btree. Version 4 indexes treat heap TID as a tiebreaker attribute.
+ * pg_upgrade'd version 3 indexes need extra steps to preserve reasonable
+ * performance when inserting a new BTScanInsert-wise duplicate tuple
+ * among many leaf pages already full of such duplicates.
+ *
+ * Also sets allequalimage field, which indicates whether or not it is
+ * safe to apply deduplication. We rely on the assumption that
+ * btm_allequalimage will be zero'ed on heapkeyspace indexes that were
+ * pg_upgrade'd from Postgres 12.
+ */
+void
+_bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
+{
+ BTMetaPageData *metad;
+
+ if (rel->rd_amcache == NULL)
+ {
+ Buffer metabuf;
+
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
+ metad = _bt_getmeta(rel, metabuf);
+
+ /*
+ * If there's no root page yet, _bt_getroot() doesn't expect a cache
+ * to be made, so just stop here. (XXX perhaps _bt_getroot() should
+ * be changed to allow this case.)
+ */
+ if (metad->btm_root == P_NONE)
+ {
+ *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
+ *allequalimage = metad->btm_allequalimage;
+
+ _bt_relbuf(rel, metabuf);
+ return;
+ }
+
+ /*
+ * Cache the metapage data for next time
+ *
+ * An on-the-fly version upgrade performed by _bt_upgrademetapage()
+ * can change the nbtree version for an index without invalidating any
+ * local cache. This is okay because it can only happen when moving
+ * from version 2 to version 3, both of which are !heapkeyspace
+ * versions.
+ */
+ rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
+ sizeof(BTMetaPageData));
+ memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
+ _bt_relbuf(rel, metabuf);
+ }
+
+ /* Get cached page */
+ metad = (BTMetaPageData *) rel->rd_amcache;
+ /* We shouldn't have cached it if any of these fail */
+ Assert(metad->btm_magic == BTREE_MAGIC);
+ Assert(metad->btm_version >= BTREE_MIN_VERSION);
+ Assert(metad->btm_version <= BTREE_VERSION);
+ Assert(!metad->btm_allequalimage ||
+ metad->btm_version > BTREE_NOVAC_VERSION);
+ Assert(metad->btm_fastroot != P_NONE);
+
+ *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
+ *allequalimage = metad->btm_allequalimage;
+}
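+
+/*
+ * Sketch of the intended calling pattern for _bt_metaversion(), illustrative
+ * only (no specific caller is implied):
+ *
+ *		bool	heapkeyspace,
+ *				allequalimage;
+ *
+ *		_bt_metaversion(rel, &heapkeyspace, &allequalimage);
+ *		... heapkeyspace selects the version 4 descent rules (heap TID as
+ *		tiebreaker), while allequalimage gates deduplication; callers stash
+ *		both in their insertion scankey, as noted above ...
+ */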
+
+/*
+ * _bt_checkpage() -- Verify that a freshly-read page looks sane.
+ */
+void
+_bt_checkpage(Relation rel, Buffer buf)
+{
+ Page page = BufferGetPage(buf);
+
+ /*
+ * ReadBuffer verifies that every newly-read page passes
+ * PageHeaderIsValid, which means it either contains a reasonably sane
+ * page header or is all-zero. We have to defend against the all-zero
+ * case, however.
+ */
+ if (PageIsNew(page))
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("index \"%s\" contains unexpected zero page at block %u",
+ RelationGetRelationName(rel),
+ BufferGetBlockNumber(buf)),
+ errhint("Please REINDEX it.")));
+
+ /*
+ * Additionally check that the special area looks sane.
+ */
+ if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("index \"%s\" contains corrupted page at block %u",
+ RelationGetRelationName(rel),
+ BufferGetBlockNumber(buf)),
+ errhint("Please REINDEX it.")));
+}
+
+/*
+ * Log the reuse of a page from the FSM.
+ */
+static void
+_bt_log_reuse_page(Relation rel, BlockNumber blkno, FullTransactionId safexid)
+{
+ xl_btree_reuse_page xlrec_reuse;
+
+ /*
+ * Note that we don't register the buffer with the record, because this
+ * operation doesn't modify the page. This record only exists to provide a
+ * conflict point for Hot Standby.
+ */
+
+ /* XLOG stuff */
+ xlrec_reuse.node = rel->rd_node;
+ xlrec_reuse.block = blkno;
+ xlrec_reuse.latestRemovedFullXid = safexid;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);
+
+ XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
+}
+
+/*
+ * _bt_getbuf() -- Get a buffer by block number for read or write.
+ *
+ * blkno == P_NEW means to get an unallocated index page. The page
+ * will be initialized before returning it.
+ *
+ * The general rule in nbtree is that it's never okay to access a
+ * page without holding both a buffer pin and a buffer lock on
+ * the page's buffer.
+ *
+ * When this routine returns, the appropriate lock is set on the
+ * requested buffer and its reference count has been incremented
+ * (ie, the buffer is "locked and pinned"). Also, we apply
+ * _bt_checkpage to sanity-check the page (except in P_NEW case),
+ * and perform Valgrind client requests that help Valgrind detect
+ * unsafe page accesses.
+ *
+ * Note: raw LockBuffer() calls are disallowed in nbtree; all
+ * buffer lock requests need to go through wrapper functions such
+ * as _bt_lockbuf().
+ */
+Buffer
+_bt_getbuf(Relation rel, BlockNumber blkno, int access)
+{
+ Buffer buf;
+
+ if (blkno != P_NEW)
+ {
+ /* Read an existing block of the relation */
+ buf = ReadBuffer(rel, blkno);
+ _bt_lockbuf(rel, buf, access);
+ _bt_checkpage(rel, buf);
+ }
+ else
+ {
+ bool needLock;
+ Page page;
+
+ Assert(access == BT_WRITE);
+
+ /*
+ * First see if the FSM knows of any free pages.
+ *
+ * We can't trust the FSM's report unreservedly; we have to check that
+ * the page is still free. (For example, an already-free page could
+ * have been re-used between the time the last VACUUM scanned it and
+ * the time the VACUUM made its FSM updates.)
+ *
+ * In fact, it's worse than that: we can't even assume that it's safe
+ * to take a lock on the reported page. If somebody else has a lock
+ * on it, or even worse our own caller does, we could deadlock. (The
+ * own-caller scenario is actually not improbable. Consider an index
+ * on a serial or timestamp column. Nearly all splits will be at the
+ * rightmost page, so it's entirely likely that _bt_split will call us
+ * while holding a lock on the page most recently acquired from FSM. A
+ * VACUUM running concurrently with the previous split could well have
+ * placed that page back in FSM.)
+ *
+ * To get around that, we ask for only a conditional lock on the
+ * reported page. If we fail, then someone else is using the page,
+ * and we may reasonably assume it's not free. (If we happen to be
+ * wrong, the worst consequence is the page will be lost to use till
+ * the next VACUUM, which is no big problem.)
+ */
+ for (;;)
+ {
+ blkno = GetFreeIndexPage(rel);
+ if (blkno == InvalidBlockNumber)
+ break;
+ buf = ReadBuffer(rel, blkno);
+ if (_bt_conditionallockbuf(rel, buf))
+ {
+ page = BufferGetPage(buf);
+
+ /*
+ * It's possible to find an all-zeroes page in an index. For
+ * example, a backend might successfully extend the relation
+ * one page and then crash before it is able to make a WAL
+ * entry for adding the page. If we find a zeroed page then
+ * reclaim it immediately.
+ */
+ if (PageIsNew(page))
+ {
+ /* Okay to use page. Initialize and return it. */
+ _bt_pageinit(page, BufferGetPageSize(buf));
+ return buf;
+ }
+
+ if (BTPageIsRecyclable(page))
+ {
+ /*
+ * If we are generating WAL for Hot Standby then create a
+ * WAL record that will allow us to conflict with queries
+ * running on standby, in case they have snapshots older
+ * than safexid value
+ */
+ if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
+ _bt_log_reuse_page(rel, blkno,
+ BTPageGetDeleteXid(page));
+
+ /* Okay to use page. Re-initialize and return it. */
+ _bt_pageinit(page, BufferGetPageSize(buf));
+ return buf;
+ }
+ elog(DEBUG2, "FSM returned nonrecyclable page");
+ _bt_relbuf(rel, buf);
+ }
+ else
+ {
+ elog(DEBUG2, "FSM returned nonlockable page");
+ /* couldn't get lock, so just drop pin */
+ ReleaseBuffer(buf);
+ }
+ }
+
+ /*
+ * Extend the relation by one page.
+ *
+ * We have to use a lock to ensure no one else is extending the rel at
+ * the same time, else we will both try to initialize the same new
+ * page. We can skip locking for new or temp relations, however,
+ * since no one else could be accessing them.
+ */
+ needLock = !RELATION_IS_LOCAL(rel);
+
+ if (needLock)
+ LockRelationForExtension(rel, ExclusiveLock);
+
+ buf = ReadBuffer(rel, P_NEW);
+
+ /* Acquire buffer lock on new page */
+ _bt_lockbuf(rel, buf, BT_WRITE);
+
+ /*
+ * Release the file-extension lock; it's now OK for someone else to
+ * extend the relation some more. Note that we cannot release this
+ * lock before we have buffer lock on the new page, or we risk a race
+ * condition against btvacuumscan --- see comments therein.
+ */
+ if (needLock)
+ UnlockRelationForExtension(rel, ExclusiveLock);
+
+ /* Initialize the new page before returning it */
+ page = BufferGetPage(buf);
+ Assert(PageIsNew(page));
+ _bt_pageinit(page, BufferGetPageSize(buf));
+ }
+
+ /* ref count and lock type are correct */
+ return buf;
+}
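+
+/*
+ * The canonical access pattern built on top of _bt_getbuf() and _bt_relbuf(),
+ * shown only as an illustrative sketch (blkno is a placeholder):
+ *
+ *		buf = _bt_getbuf(rel, blkno, BT_READ);
+ *		page = BufferGetPage(buf);
+ *		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ *		... read the page while holding both the pin and the lock ...
+ *		_bt_relbuf(rel, buf);
+ */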
+
+/*
+ * _bt_relandgetbuf() -- release a locked buffer and get another one.
+ *
+ * This is equivalent to _bt_relbuf followed by _bt_getbuf, with the
+ * exception that blkno may not be P_NEW. Also, if obuf is InvalidBuffer
+ * then it reduces to just _bt_getbuf; allowing this case simplifies some
+ * callers.
+ *
+ * The original motivation for using this was to avoid two entries to the
+ * bufmgr when one would do. However, now it's mainly just a notational
+ * convenience. The only case where it saves work over _bt_relbuf/_bt_getbuf
+ * is when the target page is the same one already in the buffer.
+ */
+Buffer
+_bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
+{
+ Buffer buf;
+
+ Assert(blkno != P_NEW);
+ if (BufferIsValid(obuf))
+ _bt_unlockbuf(rel, obuf);
+ buf = ReleaseAndReadBuffer(obuf, rel, blkno);
+ _bt_lockbuf(rel, buf, access);
+
+ _bt_checkpage(rel, buf);
+ return buf;
+}
+
+/*
+ * _bt_relbuf() -- release a locked buffer.
+ *
+ * Lock and pin (refcount) are both dropped.
+ */
+void
+_bt_relbuf(Relation rel, Buffer buf)
+{
+ _bt_unlockbuf(rel, buf);
+ ReleaseBuffer(buf);
+}
+
+/*
+ * _bt_lockbuf() -- lock a pinned buffer.
+ *
+ * Lock is acquired without acquiring another pin. This is like a raw
+ * LockBuffer() call, but performs extra steps needed by Valgrind.
+ *
+ * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
+ * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
+ */
+void
+_bt_lockbuf(Relation rel, Buffer buf, int access)
+{
+ /* LockBuffer() asserts that pin is held by this backend */
+ LockBuffer(buf, access);
+
+ /*
+ * It doesn't matter that _bt_unlockbuf() won't get called in the event of
+ * an nbtree error (e.g. a unique violation error). That won't cause
+ * Valgrind false positives.
+ *
+ * The nbtree client requests are superimposed on top of the bufmgr.c
+ * buffer pin client requests. In the event of an nbtree error the buffer
+ * will certainly get marked as defined when the backend once again
+ * acquires its first pin on the buffer. (Of course, if the backend never
+ * touches the buffer again then it doesn't matter that it remains
+ * non-accessible to Valgrind.)
+ *
+ * Note: When an IndexTuple C pointer gets computed using an ItemId read
+ * from a page while a lock was held, the C pointer becomes unsafe to
+ * dereference forever as soon as the lock is released. Valgrind can only
+ * detect cases where the pointer gets dereferenced with no _current_
+ * lock/pin held, though.
+ */
+ if (!RelationUsesLocalBuffers(rel))
+ VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
+}
+
+/*
+ * _bt_unlockbuf() -- unlock a pinned buffer.
+ */
+void
+_bt_unlockbuf(Relation rel, Buffer buf)
+{
+ /*
+ * Buffer is pinned and locked, which means that it is expected to be
+ * defined and addressable. Check that proactively.
+ */
+ VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
+
+ /* LockBuffer() asserts that pin is held by this backend */
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+ if (!RelationUsesLocalBuffers(rel))
+ VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
+}
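+
+/*
+ * Sketch of the read-to-write "lock trade" idiom built from _bt_unlockbuf()
+ * and _bt_lockbuf(), as used in _bt_getroot() above (illustrative only):
+ *
+ *		_bt_unlockbuf(rel, buf);
+ *		_bt_lockbuf(rel, buf, BT_WRITE);
+ *		... the page may have changed while it was unlocked, so any condition
+ *		checked under the read lock must be rechecked here ...
+ */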
+
+/*
+ * _bt_conditionallockbuf() -- conditionally BT_WRITE lock pinned
+ * buffer.
+ *
+ * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
+ * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
+ */
+bool
+_bt_conditionallockbuf(Relation rel, Buffer buf)
+{
+ /* ConditionalLockBuffer() asserts that pin is held by this backend */
+ if (!ConditionalLockBuffer(buf))
+ return false;
+
+ if (!RelationUsesLocalBuffers(rel))
+ VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
+
+ return true;
+}
+
+/*
+ * _bt_upgradelockbufcleanup() -- upgrade lock to super-exclusive/cleanup
+ * lock.
+ */
+void
+_bt_upgradelockbufcleanup(Relation rel, Buffer buf)
+{
+ /*
+ * Buffer is pinned and locked, which means that it is expected to be
+ * defined and addressable. Check that proactively.
+ */
+ VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
+
+ /* LockBuffer() asserts that pin is held by this backend */
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ LockBufferForCleanup(buf);
+}
+
+/*
+ * _bt_pageinit() -- Initialize a new page.
+ *
+ * On return, the page header is initialized; data space is empty;
+ * special space is zeroed out.
+ */
+void
+_bt_pageinit(Page page, Size size)
+{
+ PageInit(page, size, sizeof(BTPageOpaqueData));
+}
+
+/*
+ * Delete item(s) from a btree leaf page during VACUUM.
+ *
+ * This routine assumes that the caller has a super-exclusive write lock on
+ * the buffer. Also, the given deletable and updatable arrays *must* be
+ * sorted in ascending order.
+ *
+ * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
+ * in an existing posting list item are to be removed. This works by
+ * updating/overwriting an existing item with caller's new version of the item
+ * (a version that lacks the TIDs that are to be deleted).
+ *
+ * We record VACUUMs and b-tree deletes differently in WAL. Deletes must
+ * generate their own latestRemovedXid by accessing the table directly,
+ * whereas VACUUMs rely on the initial VACUUM table scan performing
+ * WAL-logging that takes care of the issue for the table's indexes
+ * indirectly. Also, we remove the VACUUM cycle ID from pages, which b-tree
+ * deletes don't do.
+ */
+void
+_bt_delitems_vacuum(Relation rel, Buffer buf,
+ OffsetNumber *deletable, int ndeletable,
+ BTVacuumPosting *updatable, int nupdatable)
+{
+ Page page = BufferGetPage(buf);
+ BTPageOpaque opaque;
+ bool needswal = RelationNeedsWAL(rel);
+ char *updatedbuf = NULL;
+ Size updatedbuflen = 0;
+ OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
+
+ /* Shouldn't be called unless there's something to do */
+ Assert(ndeletable > 0 || nupdatable > 0);
+
+ /* Generate new version of posting lists without deleted TIDs */
+ if (nupdatable > 0)
+ updatedbuf = _bt_delitems_update(updatable, nupdatable,
+ updatedoffsets, &updatedbuflen,
+ needswal);
+
+ /* No ereport(ERROR) until changes are logged */
+ START_CRIT_SECTION();
+
+ /*
+ * Handle posting tuple updates.
+ *
+ * Deliberately do this before handling simple deletes. If we did it the
+ * other way around (i.e. WAL record order -- simple deletes before
+ * updates) then we'd have to make compensating changes to the 'updatable'
+ * array of offset numbers.
+ *
+ * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
+ * happens to already be set. It's important that we not interfere with
+ * _bt_delitems_delete().
+ */
+ for (int i = 0; i < nupdatable; i++)
+ {
+ OffsetNumber updatedoffset = updatedoffsets[i];
+ IndexTuple itup;
+ Size itemsz;
+
+ itup = updatable[i]->itup;
+ itemsz = MAXALIGN(IndexTupleSize(itup));
+ if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
+ itemsz))
+ elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
+ BufferGetBlockNumber(buf), RelationGetRelationName(rel));
+ }
+
+ /* Now handle simple deletes of entire tuples */
+ if (ndeletable > 0)
+ PageIndexMultiDelete(page, deletable, ndeletable);
+
+ /*
+ * We can clear the vacuum cycle ID since this page has certainly been
+ * processed by the current vacuum scan.
+ */
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ opaque->btpo_cycleid = 0;
+
+ /*
+ * Clear the BTP_HAS_GARBAGE page flag.
+ *
+ * This flag indicates the presence of LP_DEAD items on the page (though
+ * not reliably). Note that we only rely on it with pg_upgrade'd
+ * !heapkeyspace indexes. That's why clearing it here won't usually
+ * interfere with _bt_delitems_delete().
+ */
+ opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+
+ MarkBufferDirty(buf);
+
+ /* XLOG stuff */
+ if (needswal)
+ {
+ XLogRecPtr recptr;
+ xl_btree_vacuum xlrec_vacuum;
+
+ xlrec_vacuum.ndeleted = ndeletable;
+ xlrec_vacuum.nupdated = nupdatable;
+
+ XLogBeginInsert();
+ XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
+ XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);
+
+ if (ndeletable > 0)
+ XLogRegisterBufData(0, (char *) deletable,
+ ndeletable * sizeof(OffsetNumber));
+
+ if (nupdatable > 0)
+ {
+ XLogRegisterBufData(0, (char *) updatedoffsets,
+ nupdatable * sizeof(OffsetNumber));
+ XLogRegisterBufData(0, updatedbuf, updatedbuflen);
+ }
+
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
+
+ PageSetLSN(page, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /* can't leak memory here */
+ if (updatedbuf != NULL)
+ pfree(updatedbuf);
+ /* free tuples allocated within _bt_delitems_update() */
+ for (int i = 0; i < nupdatable; i++)
+ pfree(updatable[i]->itup);
+}
+
+/*
+ * Delete item(s) from a btree leaf page during single-page cleanup.
+ *
+ * This routine assumes that the caller has pinned and write locked the
+ * buffer. Also, the given deletable and updatable arrays *must* be sorted in
+ * ascending order.
+ *
+ * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
+ * in an existing posting list item are to be removed. This works by
+ * updating/overwriting an existing item with caller's new version of the item
+ * (a version that lacks the TIDs that are to be deleted).
+ *
+ * This is nearly the same as _bt_delitems_vacuum as far as what it does to
+ * the page, but it needs its own latestRemovedXid from caller (caller gets
+ * this from tableam). This is used by the REDO routine to generate recovery
+ * conflicts. The other difference is that only _bt_delitems_vacuum will
+ * clear page's VACUUM cycle ID.
+ */
+static void
+_bt_delitems_delete(Relation rel, Buffer buf, TransactionId latestRemovedXid,
+ OffsetNumber *deletable, int ndeletable,
+ BTVacuumPosting *updatable, int nupdatable)
+{
+ Page page = BufferGetPage(buf);
+ BTPageOpaque opaque;
+ bool needswal = RelationNeedsWAL(rel);
+ char *updatedbuf = NULL;
+ Size updatedbuflen = 0;
+ OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
+
+ /* Shouldn't be called unless there's something to do */
+ Assert(ndeletable > 0 || nupdatable > 0);
+
+ /* Generate new versions of posting lists without deleted TIDs */
+ if (nupdatable > 0)
+ updatedbuf = _bt_delitems_update(updatable, nupdatable,
+ updatedoffsets, &updatedbuflen,
+ needswal);
+
+ /* No ereport(ERROR) until changes are logged */
+ START_CRIT_SECTION();
+
+ /* Handle updates and deletes just like _bt_delitems_vacuum */
+ for (int i = 0; i < nupdatable; i++)
+ {
+ OffsetNumber updatedoffset = updatedoffsets[i];
+ IndexTuple itup;
+ Size itemsz;
+
+ itup = updatable[i]->itup;
+ itemsz = MAXALIGN(IndexTupleSize(itup));
+ if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
+ itemsz))
+ elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
+ BufferGetBlockNumber(buf), RelationGetRelationName(rel));
+ }
+
+ if (ndeletable > 0)
+ PageIndexMultiDelete(page, deletable, ndeletable);
+
+ /*
+ * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID at
+ * this point. The VACUUM command alone controls vacuum cycle IDs.
+ */
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * Clear the BTP_HAS_GARBAGE page flag.
+ *
+ * This flag indicates the presence of LP_DEAD items on the page (though
+ * not reliably). Note that we only rely on it with pg_upgrade'd
+ * !heapkeyspace indexes.
+ */
+ opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+
+ MarkBufferDirty(buf);
+
+ /* XLOG stuff */
+ if (needswal)
+ {
+ XLogRecPtr recptr;
+ xl_btree_delete xlrec_delete;
+
+ xlrec_delete.latestRemovedXid = latestRemovedXid;
+ xlrec_delete.ndeleted = ndeletable;
+ xlrec_delete.nupdated = nupdatable;
+
+ XLogBeginInsert();
+ XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
+ XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete);
+
+ if (ndeletable > 0)
+ XLogRegisterBufData(0, (char *) deletable,
+ ndeletable * sizeof(OffsetNumber));
+
+ if (nupdatable > 0)
+ {
+ XLogRegisterBufData(0, (char *) updatedoffsets,
+ nupdatable * sizeof(OffsetNumber));
+ XLogRegisterBufData(0, updatedbuf, updatedbuflen);
+ }
+
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);
+
+ PageSetLSN(page, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /* can't leak memory here */
+ if (updatedbuf != NULL)
+ pfree(updatedbuf);
+ /* free tuples allocated within _bt_delitems_update() */
+ for (int i = 0; i < nupdatable; i++)
+ pfree(updatable[i]->itup);
+}
+
+/*
+ * Set up state needed to delete TIDs from posting list tuples via "updating"
+ * the tuple. Performs steps common to both _bt_delitems_vacuum and
+ * _bt_delitems_delete. These steps must take place before each function's
+ * critical section begins.
+ *
+ * updatable and nupdatable are inputs, though note that we will use
+ * _bt_update_posting() to replace the original itup with a pointer to a final
+ * version in palloc()'d memory. Caller should free the tuples when it's done.
+ *
+ * The first nupdatable entries from updatedoffsets are set to the page offset
+ * number for posting list tuples that caller updates. This is mostly useful
+ * because caller may need to WAL-log the page offsets (though we always do
+ * this for caller out of convenience).
+ *
+ * Returns a buffer consisting of an array of xl_btree_update structs that
+ * describe the steps we perform here for caller (though only when needswal is
+ * true). Also sets *updatedbuflen to the final size of the buffer. This
+ * buffer is used by caller when WAL logging is required.
+ */
+static char *
+_bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,
+ OffsetNumber *updatedoffsets, Size *updatedbuflen,
+ bool needswal)
+{
+ char *updatedbuf = NULL;
+ Size buflen = 0;
+
+ /* Shouldn't be called unless there's something to do */
+ Assert(nupdatable > 0);
+
+ for (int i = 0; i < nupdatable; i++)
+ {
+ BTVacuumPosting vacposting = updatable[i];
+ Size itemsz;
+
+ /* Replace work area IndexTuple with updated version */
+ _bt_update_posting(vacposting);
+
+ /* Keep track of size of xl_btree_update for updatedbuf in passing */
+ itemsz = SizeOfBtreeUpdate + vacposting->ndeletedtids * sizeof(uint16);
+ buflen += itemsz;
+
+ /* Build updatedoffsets buffer in passing */
+ updatedoffsets[i] = vacposting->updatedoffset;
+ }
+
+ /* XLOG stuff */
+ if (needswal)
+ {
+ Size offset = 0;
+
+ /* Allocate, set final size for caller */
+ updatedbuf = palloc(buflen);
+ *updatedbuflen = buflen;
+ for (int i = 0; i < nupdatable; i++)
+ {
+ BTVacuumPosting vacposting = updatable[i];
+ Size itemsz;
+ xl_btree_update update;
+
+ update.ndeletedtids = vacposting->ndeletedtids;
+ memcpy(updatedbuf + offset, &update.ndeletedtids,
+ SizeOfBtreeUpdate);
+ offset += SizeOfBtreeUpdate;
+
+ itemsz = update.ndeletedtids * sizeof(uint16);
+ memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);
+ offset += itemsz;
+ }
+ }
+
+ return updatedbuf;
+}
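+
+/*
+ * For reference, the buffer returned by _bt_delitems_update() (when needswal
+ * is true) is a simple concatenation of variable-length entries, one per
+ * updated posting list tuple, in updatedoffsets order:
+ *
+ *		xl_btree_update (ndeletedtids)			SizeOfBtreeUpdate bytes
+ *		uint16 deletetids[ndeletedtids]			ndeletedtids * sizeof(uint16)
+ *		... repeated nupdatable times ...
+ */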
+
+/*
+ * Comparator used by _bt_delitems_delete_check() to restore deltids array
+ * back to its original leaf-page-wise sort order
+ */
+static int
+_bt_delitems_cmp(const void *a, const void *b)
+{
+ TM_IndexDelete *indexdelete1 = (TM_IndexDelete *) a;
+ TM_IndexDelete *indexdelete2 = (TM_IndexDelete *) b;
+
+ if (indexdelete1->id > indexdelete2->id)
+ return 1;
+ if (indexdelete1->id < indexdelete2->id)
+ return -1;
+
+ Assert(false);
+
+ return 0;
+}
+
+/*
+ * Try to delete item(s) from a btree leaf page during single-page cleanup.
+ *
+ * nbtree interface to table_index_delete_tuples(). Deletes a subset of index
+ * tuples from caller's deltids array: those whose TIDs are found safe to
+ * delete by the tableam (or already marked LP_DEAD in index, and so already
+ * known to be deletable by our simple index deletion caller). We physically
+ * delete index tuples from buf leaf page last of all (for index tuples where
+ * that is known to be safe following our table_index_delete_tuples() call).
+ *
+ * Simple index deletion caller only includes TIDs from index tuples marked
+ * LP_DEAD, as well as extra TIDs it found on the same leaf page that can be
+ * included without increasing the total number of distinct table blocks for
+ * the deletion operation as a whole. This approach often allows us to delete
+ * some extra index tuples that were practically free for tableam to check in
+ * passing (when they actually turn out to be safe to delete). It probably
+ * only makes sense for the tableam to go ahead with these extra checks when
+ * it is block-oriented (otherwise the checks probably won't be practically
+ * free, which we rely on). The tableam interface requires the tableam side
+ * to handle the problem, though, so this is okay (we as an index AM are free
+ * to make the simplifying assumption that all tableams must be block-based).
+ *
+ * Bottom-up index deletion caller provides all the TIDs from the leaf page,
+ * without expecting that tableam will check most of them. The tableam has
+ * considerable discretion around which entries/blocks it checks. Our role in
+ * costing the bottom-up deletion operation is strictly advisory.
+ *
+ * Note: Caller must have added deltids entries (i.e. entries that go in
+ * delstate's main array) in leaf-page-wise order: page offset number order,
+ * TID order among entries taken from the same posting list tuple (tiebreak on
+ * TID). This order is convenient to work with here.
+ *
+ * Note: We also rely on the id field of each deltids element "capturing" this
+ * original leaf-page-wise order. That is, we expect to be able to get back
+ * to the original leaf-page-wise order just by sorting deltids on the id
+ * field (tableam will sort deltids for its own reasons, so we'll need to put
+ * it back in leaf-page-wise order afterwards).
+ */
+void
+_bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel,
+ TM_IndexDeleteOp *delstate)
+{
+ Page page = BufferGetPage(buf);
+ TransactionId latestRemovedXid;
+ OffsetNumber postingidxoffnum = InvalidOffsetNumber;
+ int ndeletable = 0,
+ nupdatable = 0;
+ OffsetNumber deletable[MaxIndexTuplesPerPage];
+ BTVacuumPosting updatable[MaxIndexTuplesPerPage];
+
+ /* Use tableam interface to determine which tuples to delete first */
+ latestRemovedXid = table_index_delete_tuples(heapRel, delstate);
+
+ /* Should not WAL-log latestRemovedXid unless it's required */
+ if (!XLogStandbyInfoActive() || !RelationNeedsWAL(rel))
+ latestRemovedXid = InvalidTransactionId;
+
+ /*
+ * Construct a leaf-page-wise description of what _bt_delitems_delete()
+ * needs to do to physically delete index tuples from the page.
+ *
+ * Must sort deltids array to restore leaf-page-wise order (original order
+ * before call to tableam). This is the order that the loop expects.
+ *
+ * Note that deltids array might be a lot smaller now. It might even have
+ * no entries at all (with bottom-up deletion caller), in which case there
+ * is nothing left to do.
+ */
+ qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete),
+ _bt_delitems_cmp);
+ if (delstate->ndeltids == 0)
+ {
+ Assert(delstate->bottomup);
+ return;
+ }
+
+ /* We definitely have to delete at least one index tuple (or one TID) */
+ for (int i = 0; i < delstate->ndeltids; i++)
+ {
+ TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id;
+ OffsetNumber idxoffnum = dstatus->idxoffnum;
+ ItemId itemid = PageGetItemId(page, idxoffnum);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
+ int nestedi,
+ nitem;
+ BTVacuumPosting vacposting;
+
+ Assert(OffsetNumberIsValid(idxoffnum));
+
+ if (idxoffnum == postingidxoffnum)
+ {
+ /*
+ * This deltid entry is a TID from a posting list tuple that has
+ * already been completely processed
+ */
+ Assert(BTreeTupleIsPosting(itup));
+ Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),
+ &delstate->deltids[i].tid) < 0);
+ Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(itup),
+ &delstate->deltids[i].tid) >= 0);
+ continue;
+ }
+
+ if (!BTreeTupleIsPosting(itup))
+ {
+ /* Plain non-pivot tuple */
+ Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid));
+ if (dstatus->knowndeletable)
+ deletable[ndeletable++] = idxoffnum;
+ continue;
+ }
+
+ /*
+ * itup is a posting list tuple whose lowest deltids entry (which may
+ * or may not be for the first TID from itup) is considered here now.
+ * We should process all of the deltids entries for the posting list
+ * together now, though (not just the lowest). Remember to skip over
+ * later itup-related entries during later iterations of outermost
+ * loop.
+ */
+ postingidxoffnum = idxoffnum; /* Remember work in outermost loop */
+ nestedi = i; /* Initialize for first itup deltids entry */
+ vacposting = NULL; /* Describes final action for itup */
+ nitem = BTreeTupleGetNPosting(itup);
+ for (int p = 0; p < nitem; p++)
+ {
+ ItemPointer ptid = BTreeTupleGetPostingN(itup, p);
+ int ptidcmp = -1;
+
+ /*
+ * This nested loop reuses work across ptid TIDs taken from itup.
+ * We take advantage of the fact that both itup's TIDs and deltids
+ * entries (within a single itup/posting list grouping) must both
+ * be in ascending TID order.
+ */
+ for (; nestedi < delstate->ndeltids; nestedi++)
+ {
+ TM_IndexDelete *tcdeltid = &delstate->deltids[nestedi];
+ TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id);
+
+ /* Stop once we get past all itup related deltids entries */
+ Assert(tdstatus->idxoffnum >= idxoffnum);
+ if (tdstatus->idxoffnum != idxoffnum)
+ break;
+
+ /* Skip past non-deletable itup related entries up front */
+ if (!tdstatus->knowndeletable)
+ continue;
+
+ /* Entry is first partial ptid match (or an exact match)? */
+ ptidcmp = ItemPointerCompare(&tcdeltid->tid, ptid);
+ if (ptidcmp >= 0)
+ {
+ /* Greater than or equal (partial or exact) match... */
+ break;
+ }
+ }
+
+ /* ...exact ptid match to a deletable deltids entry? */
+ if (ptidcmp != 0)
+ continue;
+
+ /* Exact match for deletable deltids entry -- ptid gets deleted */
+ if (vacposting == NULL)
+ {
+ vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
+ nitem * sizeof(uint16));
+ vacposting->itup = itup;
+ vacposting->updatedoffset = idxoffnum;
+ vacposting->ndeletedtids = 0;
+ }
+ vacposting->deletetids[vacposting->ndeletedtids++] = p;
+ }
+
+ /* Final decision on itup, a posting list tuple */
+
+ if (vacposting == NULL)
+ {
+ /* No TIDs to delete from itup -- do nothing */
+ }
+ else if (vacposting->ndeletedtids == nitem)
+ {
+ /* Straight delete of itup (to delete all TIDs) */
+ deletable[ndeletable++] = idxoffnum;
+ /* Turns out we won't need granular information */
+ pfree(vacposting);
+ }
+ else
+ {
+ /* Delete some (but not all) TIDs from itup */
+ Assert(vacposting->ndeletedtids > 0 &&
+ vacposting->ndeletedtids < nitem);
+ updatable[nupdatable++] = vacposting;
+ }
+ }
+
+ /* Physically delete tuples (or TIDs) using deletable (or updatable) */
+ _bt_delitems_delete(rel, buf, latestRemovedXid, deletable, ndeletable,
+ updatable, nupdatable);
+
+ /* be tidy */
+ for (int i = 0; i < nupdatable; i++)
+ pfree(updatable[i]);
+}
+
+/*
+ * Check that leftsib page (the btpo_prev of target page) is not marked with
+ * INCOMPLETE_SPLIT flag. Used during page deletion.
+ *
+ * Returning true indicates that page flag is set in leftsib (which is
+ * definitely still the left sibling of target). When that happens, the
+ * target doesn't have a downlink in parent, and the page deletion algorithm
+ * isn't prepared to handle that. Deletion of the target page (or the whole
+ * subtree that contains the target page) cannot take place.
+ *
+ * Caller should not have a lock on the target page itself, since pages on the
+ * same level must always be locked left to right to avoid deadlocks.
+ */
+static bool
+_bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
+{
+ Buffer buf;
+ Page page;
+ BTPageOpaque opaque;
+ bool result;
+
+ /* Easy case: No left sibling */
+ if (leftsib == P_NONE)
+ return false;
+
+ buf = _bt_getbuf(rel, leftsib, BT_READ);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * If the left sibling was concurrently split, so that its next-pointer
+ * doesn't point to the current page anymore, the split that created
+ * target must be completed. Caller can reasonably expect that there will
+ * be a downlink to the target page that it can relocate using its stack.
+ * (We don't allow splitting an incompletely split page again until the
+ * previous split has been completed.)
+ */
+ result = (opaque->btpo_next == target && P_INCOMPLETE_SPLIT(opaque));
+ _bt_relbuf(rel, buf);
+
+ return result;
+}
+
+/*
+ * Check that leafrightsib page (the btpo_next of target leaf page) is not
+ * marked with ISHALFDEAD flag. Used during page deletion.
+ *
+ * Returning true indicates that the flag is set in leafrightsib, so page
+ * deletion cannot go ahead. Our caller is not prepared to deal with the case
+ * where the parent page does not have a pivot tuple whose downlink points to
+ * leafrightsib (due to an earlier interrupted VACUUM operation). It doesn't
+ * seem worth going to the trouble of teaching our caller to deal with it.
+ * The situation will be resolved after VACUUM finishes the deletion of the
+ * half-dead page (when a future VACUUM operation reaches the target page
+ * again).
+ *
+ * _bt_leftsib_splitflag() is called for both leaf pages and internal pages.
+ * _bt_rightsib_halfdeadflag() is only called for leaf pages, though. This is
+ * okay because of the restriction on deleting pages that are the rightmost
+ * page of their parent (i.e. that such deletions can only take place when the
+ * entire subtree must be deleted). The leaf level check made here will apply
+ * to a right "cousin" leaf page rather than a simple right sibling leaf page
+ * in cases where caller actually goes on to attempt deleting pages that are
+ * above the leaf page. The right cousin leaf page is representative of the
+ * left edge of the subtree to the right of the to-be-deleted subtree as a
+ * whole, which is exactly the condition that our caller cares about.
+ * (Besides, internal pages are never marked half-dead, so it isn't even
+ * possible to _directly_ assess if an internal page is part of some other
+ * to-be-deleted subtree.)
+ */
+static bool
+_bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
+{
+ Buffer buf;
+ Page page;
+ BTPageOpaque opaque;
+ bool result;
+
+ Assert(leafrightsib != P_NONE);
+
+ buf = _bt_getbuf(rel, leafrightsib, BT_READ);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque));
+ result = P_ISHALFDEAD(opaque);
+ _bt_relbuf(rel, buf);
+
+ return result;
+}
+
+/*
+ * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so.
+ *
+ * This action unlinks the leaf page from the b-tree structure, removing all
+ * pointers leading to it --- but not touching its own left and right links.
+ * The page cannot be physically reclaimed right away, since other processes
+ * may currently be trying to follow links leading to the page; they have to
+ * be allowed to use its right-link to recover. See nbtree/README.
+ *
+ * On entry, the target buffer must be pinned and locked (either read or write
+ * lock is OK). The page must be an empty leaf page, which may be half-dead
+ * already (a half-dead page should only be passed to us when an earlier
+ * VACUUM operation was interrupted, though). Note in particular that caller
+ * should never pass a buffer containing an existing deleted page here. The
+ * lock and pin on caller's buffer will be dropped before we return.
+ *
+ * Maintains bulk delete stats for caller, which are taken from vstate. We
+ * need to cooperate closely with caller here so that whole VACUUM operation
+ * reliably avoids any double counting of subsidiary-to-leafbuf pages that we
+ * delete in passing. If such pages happen to be from a block number that is
+ * ahead of the current scanblkno position, then caller is expected to count
+ * them directly later on. It's simpler for us to understand caller's
+ * requirements than it would be for caller to understand when or how a
+ * deleted page became deleted after the fact.
+ *
+ * NOTE: this leaks memory. Rather than trying to clean up everything
+ * carefully, it's better to run it in a temp context that can be reset
+ * frequently.
+ */
+void
+_bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
+{
+ BlockNumber rightsib;
+ bool rightsib_empty;
+ Page page;
+ BTPageOpaque opaque;
+
+ /*
+ * Save original leafbuf block number from caller. Only deleted blocks
+ * that are <= scanblkno are added to bulk delete stat's pages_deleted
+ * count.
+ */
+ BlockNumber scanblkno = BufferGetBlockNumber(leafbuf);
+
+ /*
+ * "stack" is a search stack leading (approximately) to the target page.
+ * It is initially NULL, but when iterating, we keep it to avoid
+ * duplicated search effort.
+ *
+ * Also, when "stack" is not NULL, we have already checked that the
+ * current page is not the right half of an incomplete split, i.e. the
+ * left sibling does not have its INCOMPLETE_SPLIT flag set, including
+ * when the current target page is to the right of caller's initial page
+ * (the scanblkno page).
+ */
+ BTStack stack = NULL;
+
+ for (;;)
+ {
+ page = BufferGetPage(leafbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * Internal pages are never deleted directly, only as part of deleting
+ * the whole subtree all the way down to leaf level.
+ *
+ * Also check for deleted pages here. Caller never passes us a fully
+ * deleted page. Only VACUUM can delete pages, so there can't have
+ * been a concurrent deletion. Assume that we reached any deleted
+ * page encountered here by following a sibling link, and that the
+ * index is corrupt.
+ */
+ Assert(!P_ISDELETED(opaque));
+ if (!P_ISLEAF(opaque) || P_ISDELETED(opaque))
+ {
+ /*
+ * Pre-9.4 page deletion only marked internal pages as half-dead,
+ * but now we only use that flag on leaf pages. The old algorithm
+			 * was never supposed to leave half-dead pages in the tree; that was
+			 * just a transient state, but it was nevertheless possible in
+			 * error scenarios. We don't know how to deal with them here. They
+			 * are harmless as far as searches are concerned, but inserts
+			 * into the deleted keyspace could add out-of-order downlinks in
+			 * the upper levels. Log a notice; hopefully the admin will notice
+ * and reindex.
+ */
+ if (P_ISHALFDEAD(opaque))
+ ereport(LOG,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("index \"%s\" contains a half-dead internal page",
+ RelationGetRelationName(rel)),
+ errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
+
+ if (P_ISDELETED(opaque))
+ ereport(LOG,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"",
+ BufferGetBlockNumber(leafbuf),
+ scanblkno,
+ RelationGetRelationName(rel))));
+
+ _bt_relbuf(rel, leafbuf);
+ return;
+ }
+
+ /*
+ * We can never delete rightmost pages nor root pages. While at it,
+ * check that page is empty, since it's possible that the leafbuf page
+ * was empty a moment ago, but has since had some inserts.
+ *
+ * To keep the algorithm simple, we also never delete an incompletely
+ * split page (they should be rare enough that this doesn't make any
+ * meaningful difference to disk usage):
+ *
+ * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
+ * left half of an incomplete split, but ensuring that it's not the
+ * right half is more complicated. For that, we have to check that
+ * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using
+ * _bt_leftsib_splitflag(). On the first iteration, we temporarily
+ * release the lock on scanblkno/leafbuf, check the left sibling, and
+ * construct a search stack to scanblkno. On subsequent iterations,
+ * we know we stepped right from a page that passed these tests, so
+ * it's OK.
+ */
+ if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
+ P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
+ P_INCOMPLETE_SPLIT(opaque))
+ {
+ /* Should never fail to delete a half-dead page */
+ Assert(!P_ISHALFDEAD(opaque));
+
+ _bt_relbuf(rel, leafbuf);
+ return;
+ }
+
+ /*
+ * First, remove downlink pointing to the page (or a parent of the
+ * page, if we are going to delete a taller subtree), and mark the
+ * leafbuf page half-dead
+ */
+ if (!P_ISHALFDEAD(opaque))
+ {
+ /*
+ * We need an approximate pointer to the page's parent page. We
+ * use a variant of the standard search mechanism to search for
+ * the page's high key; this will give us a link to either the
+ * current parent or someplace to its left (if there are multiple
+ * equal high keys, which is possible with !heapkeyspace indexes).
+ *
+ * Also check if this is the right-half of an incomplete split
+ * (see comment above).
+ */
+ if (!stack)
+ {
+ BTScanInsert itup_key;
+ ItemId itemid;
+ IndexTuple targetkey;
+ BlockNumber leftsib,
+ leafblkno;
+ Buffer sleafbuf;
+
+ itemid = PageGetItemId(page, P_HIKEY);
+ targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
+
+ leftsib = opaque->btpo_prev;
+ leafblkno = BufferGetBlockNumber(leafbuf);
+
+ /*
+ * To avoid deadlocks, we'd better drop the leaf page lock
+ * before going further.
+ */
+ _bt_unlockbuf(rel, leafbuf);
+
+ /*
+ * Check that the left sibling of leafbuf (if any) is not
+ * marked with INCOMPLETE_SPLIT flag before proceeding
+ */
+ Assert(leafblkno == scanblkno);
+ if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))
+ {
+ ReleaseBuffer(leafbuf);
+ return;
+ }
+
+ /* we need an insertion scan key for the search, so build one */
+ itup_key = _bt_mkscankey(rel, targetkey);
+ /* find the leftmost leaf page with matching pivot/high key */
+ itup_key->pivotsearch = true;
+ stack = _bt_search(rel, itup_key, &sleafbuf, BT_READ, NULL);
+ /* won't need a second lock or pin on leafbuf */
+ _bt_relbuf(rel, sleafbuf);
+
+ /*
+ * Re-lock the leaf page, and start over to use our stack
+ * within _bt_mark_page_halfdead. We must do it that way
+ * because it's possible that leafbuf can no longer be
+ * deleted. We need to recheck.
+ *
+ * Note: We can't simply hold on to the sleafbuf lock instead,
+ * because it's barely possible that sleafbuf is not the same
+ * page as leafbuf. This happens when leafbuf split after our
+ * original lock was dropped, but before _bt_search finished
+ * its descent. We rely on the assumption that we'll find
+ * leafbuf isn't safe to delete anymore in this scenario.
+ * (Page deletion can cope with the stack being to the left of
+ * leafbuf, but not to the right of leafbuf.)
+ */
+ _bt_lockbuf(rel, leafbuf, BT_WRITE);
+ continue;
+ }
+
+ /*
+ * See if it's safe to delete the leaf page, and determine how
+ * many parent/internal pages above the leaf level will be
+ * deleted. If it's safe then _bt_mark_page_halfdead will also
+ * perform the first phase of deletion, which includes marking the
+ * leafbuf page half-dead.
+ */
+ Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));
+ if (!_bt_mark_page_halfdead(rel, leafbuf, stack))
+ {
+ _bt_relbuf(rel, leafbuf);
+ return;
+ }
+ }
+
+ /*
+ * Then unlink it from its siblings. Each call to
+ * _bt_unlink_halfdead_page unlinks the topmost page from the subtree,
+ * making it shallower. Iterate until the leafbuf page is deleted.
+ */
+ rightsib_empty = false;
+ Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
+ while (P_ISHALFDEAD(opaque))
+ {
+ /* Check for interrupts in _bt_unlink_halfdead_page */
+ if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
+ &rightsib_empty, vstate))
+ {
+ /*
+ * _bt_unlink_halfdead_page should never fail, since we
+ * established that deletion is generally safe in
+ * _bt_mark_page_halfdead -- index must be corrupt.
+ *
+ * Note that _bt_unlink_halfdead_page already released the
+ * lock and pin on leafbuf for us.
+ */
+ Assert(false);
+ return;
+ }
+ }
+
+ Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
+
+ rightsib = opaque->btpo_next;
+
+ _bt_relbuf(rel, leafbuf);
+
+ /*
+ * Check here, as calling loops will have locks held, preventing
+ * interrupts from being processed.
+ */
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * The page has now been deleted. If its right sibling is completely
+ * empty, it's possible that the reason we haven't deleted it earlier
+ * is that it was the rightmost child of the parent. Now that we
+ * removed the downlink for this page, the right sibling might now be
+ * the only child of the parent, and could be removed. It would be
+ * picked up by the next vacuum anyway, but might as well try to
+ * remove it now, so loop back to process the right sibling.
+ *
+ * Note: This relies on the assumption that _bt_getstackbuf() will be
+ * able to reuse our original descent stack with a different child
+ * block (provided that the child block is to the right of the
+ * original leaf page reached by _bt_search()). It will even update
+ * the descent stack each time we loop around, avoiding repeated work.
+ */
+ if (!rightsib_empty)
+ break;
+
+ leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
+ }
+}
+
+/*
+ * First stage of page deletion.
+ *
+ * Establish the height of the to-be-deleted subtree with leafbuf at its
+ * lowest level, remove the downlink to the subtree, and mark leafbuf
+ * half-dead. The final to-be-deleted subtree is usually just leafbuf itself,
+ * but may include additional internal pages (at most one per level of the
+ * tree below the root).
+ *
+ * Returns 'false' if leafbuf is unsafe to delete, usually because leafbuf is
+ * the rightmost child of its parent (and parent has more than one downlink).
+ * Returns 'true' when the first stage of page deletion completed
+ * successfully.
+ */
+static bool
+_bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
+{
+ BlockNumber leafblkno;
+ BlockNumber leafrightsib;
+ BlockNumber topparent;
+ BlockNumber topparentrightsib;
+ ItemId itemid;
+ Page page;
+ BTPageOpaque opaque;
+ Buffer subtreeparent;
+ OffsetNumber poffset;
+ OffsetNumber nextoffset;
+ IndexTuple itup;
+ IndexTupleData trunctuple;
+
+ page = BufferGetPage(leafbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) &&
+ P_ISLEAF(opaque) && !P_IGNORE(opaque) &&
+ P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
+
+ /*
+ * Save info about the leaf page.
+ */
+ leafblkno = BufferGetBlockNumber(leafbuf);
+ leafrightsib = opaque->btpo_next;
+
+ /*
+ * Before attempting to lock the parent page, check that the right sibling
+ * is not in half-dead state. A half-dead right sibling would have no
+ * downlink in the parent, which would be highly confusing later when we
+ * delete the downlink. It would fail the "right sibling of target page
+ * is also the next child in parent page" cross-check below.
+ */
+ if (_bt_rightsib_halfdeadflag(rel, leafrightsib))
+ {
+ elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",
+ leafblkno, leafrightsib);
+ return false;
+ }
+
+ /*
+ * We cannot delete a page that is the rightmost child of its immediate
+ * parent, unless it is the only child --- in which case the parent has to
+ * be deleted too, and the same condition applies recursively to it. We
+ * have to check this condition all the way up before trying to delete,
+ * and lock the parent of the root of the to-be-deleted subtree (the
+ * "subtree parent"). _bt_lock_subtree_parent() locks the subtree parent
+ * for us. We remove the downlink to the "top parent" page (subtree root
+ * page) from the subtree parent page below.
+ *
+ * Initialize topparent to be leafbuf page now. The final to-be-deleted
+ * subtree is often a degenerate one page subtree consisting only of the
+ * leafbuf page. When that happens, the leafbuf page is the final subtree
+ * root page/top parent page.
+ */
+ topparent = leafblkno;
+ topparentrightsib = leafrightsib;
+ if (!_bt_lock_subtree_parent(rel, leafblkno, stack,
+ &subtreeparent, &poffset,
+ &topparent, &topparentrightsib))
+ return false;
+
+ /*
+ * Check that the parent-page index items we're about to delete/overwrite
+ * in subtree parent page contain what we expect. This can fail if the
+ * index has become corrupt for some reason. We want to throw any error
+ * before entering the critical section --- otherwise it'd be a PANIC.
+ */
+ page = BufferGetPage(subtreeparent);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+#ifdef USE_ASSERT_CHECKING
+
+ /*
+ * This is just an assertion because _bt_lock_subtree_parent should have
+ * guaranteed that the tuple has the expected contents
+ */
+ itemid = PageGetItemId(page, poffset);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+ Assert(BTreeTupleGetDownLink(itup) == topparent);
+#endif
+
+ nextoffset = OffsetNumberNext(poffset);
+ itemid = PageGetItemId(page, nextoffset);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+ if (BTreeTupleGetDownLink(itup) != topparentrightsib)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
+ topparentrightsib, topparent,
+ BTreeTupleGetDownLink(itup),
+ BufferGetBlockNumber(subtreeparent),
+ RelationGetRelationName(rel))));
+
+ /*
+ * Any insert which would have gone on the leaf block will now go to its
+ * right sibling. In other words, the key space moves right.
+ */
+ PredicateLockPageCombine(rel, leafblkno, leafrightsib);
+
+ /* No ereport(ERROR) until changes are logged */
+ START_CRIT_SECTION();
+
+ /*
+ * Update parent of subtree. We want to delete the downlink to the top
+ * parent page/root of the subtree, and the *following* key. Easiest way
+ * is to copy the right sibling's downlink over the downlink that points
+ * to top parent page, and then delete the right sibling's original pivot
+ * tuple.
+ *
+ * Lanin and Shasha make the key space move left when deleting a page,
+ * whereas the key space moves right here. That's why we cannot simply
+ * delete the pivot tuple with the downlink to the top parent page. See
+ * nbtree/README.
+ */
+ page = BufferGetPage(subtreeparent);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ itemid = PageGetItemId(page, poffset);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+ BTreeTupleSetDownLink(itup, topparentrightsib);
+
+ nextoffset = OffsetNumberNext(poffset);
+ PageIndexTupleDelete(page, nextoffset);
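+
+ /*
+ * Schematically: if the subtree parent page held pivot tuples
+ * [..., K1 -> topparent, K2 -> topparentrightsib, ...], it now holds
+ * [..., K1 -> topparentrightsib, ...].  The separator key K1 survives;
+ * only its downlink changed, which is how the key space that used to
+ * belong to topparent moves right.
+ */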
+
+ /*
+ * Mark the leaf page as half-dead, and stamp it with a link to the top
+ * parent page. When the leaf page is also the top parent page, the link
+ * is set to InvalidBlockNumber.
+ */
+ page = BufferGetPage(leafbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ opaque->btpo_flags |= BTP_HALF_DEAD;
+
+ Assert(PageGetMaxOffsetNumber(page) == P_HIKEY);
+ MemSet(&trunctuple, 0, sizeof(IndexTupleData));
+ trunctuple.t_info = sizeof(IndexTupleData);
+ if (topparent != leafblkno)
+ BTreeTupleSetTopParent(&trunctuple, topparent);
+ else
+ BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber);
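+
+ /*
+ * trunctuple is a minimal pivot tuple with no key columns; its block
+ * number field (which would normally hold a downlink) is what
+ * BTreeTupleSetTopParent() uses to record the top parent link consulted
+ * during the second stage of deletion.
+ */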
+
+ if (!PageIndexTupleOverwrite(page, P_HIKEY, (Item) &trunctuple,
+ IndexTupleSize(&trunctuple)))
+ elog(ERROR, "could not overwrite high key in half-dead page");
+
+ /* Must mark buffers dirty before XLogInsert */
+ MarkBufferDirty(subtreeparent);
+ MarkBufferDirty(leafbuf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_btree_mark_page_halfdead xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.poffset = poffset;
+ xlrec.leafblk = leafblkno;
+ if (topparent != leafblkno)
+ xlrec.topparent = topparent;
+ else
+ xlrec.topparent = InvalidBlockNumber;
+
+ XLogBeginInsert();
+ XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT);
+ XLogRegisterBuffer(1, subtreeparent, REGBUF_STANDARD);
+
+ page = BufferGetPage(leafbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ xlrec.leftblk = opaque->btpo_prev;
+ xlrec.rightblk = opaque->btpo_next;
+
+ XLogRegisterData((char *) &xlrec, SizeOfBtreeMarkPageHalfDead);
+
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);
+
+ page = BufferGetPage(subtreeparent);
+ PageSetLSN(page, recptr);
+ page = BufferGetPage(leafbuf);
+ PageSetLSN(page, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ _bt_relbuf(rel, subtreeparent);
+ return true;
+}
+
+/*
+ * Second stage of page deletion.
+ *
+ * Unlinks a single page (in the subtree undergoing deletion) from its
+ * siblings. Also marks the page deleted.
+ *
+ * To get rid of the whole subtree, including the leaf page itself, call here
+ * until the leaf page is deleted. The original "top parent" established in
+ * the first stage of deletion is deleted in the first call here, while the
+ * leaf page is deleted in the last call here. Note that the leaf page itself
+ * is often the initial top parent page.
+ *
+ * Returns 'false' if the page could not be unlinked (shouldn't happen). If
+ * the right sibling of the current target page is empty, *rightsib_empty is
+ * set to true, allowing caller to delete the target's right sibling page in
+ * passing. Note that *rightsib_empty is only actually used by caller when
+ * target page is leafbuf, following last call here for leafbuf/the subtree
+ * containing leafbuf. (We always set *rightsib_empty for caller, just to be
+ * consistent.)
+ *
+ * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
+ * On success exit, we'll be holding pin and write lock. On failure exit,
+ * we'll release both pin and lock before returning (we define it that way
+ * to avoid having to reacquire a lock we already released).
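+ *
+ * Summarizing the locking order used below: leafbuf first (when it is not
+ * itself the target), then the target's left sibling, then the target
+ * itself, then the right sibling, and finally the metapage when the fast
+ * root may need to be updated.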
+ */
+static bool
+_bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
+ bool *rightsib_empty, BTVacState *vstate)
+{
+ BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
+ IndexBulkDeleteResult *stats = vstate->stats;
+ BlockNumber leafleftsib;
+ BlockNumber leafrightsib;
+ BlockNumber target;
+ BlockNumber leftsib;
+ BlockNumber rightsib;
+ Buffer lbuf = InvalidBuffer;
+ Buffer buf;
+ Buffer rbuf;
+ Buffer metabuf = InvalidBuffer;
+ Page metapg = NULL;
+ BTMetaPageData *metad = NULL;
+ ItemId itemid;
+ Page page;
+ BTPageOpaque opaque;
+ FullTransactionId safexid;
+ bool rightsib_is_rightmost;
+ uint32 targetlevel;
+ IndexTuple leafhikey;
+ BlockNumber leaftopparent;
+
+ page = BufferGetPage(leafbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque) && P_ISHALFDEAD(opaque));
+
+ /*
+ * Remember some information about the leaf page.
+ */
+ itemid = PageGetItemId(page, P_HIKEY);
+ leafhikey = (IndexTuple) PageGetItem(page, itemid);
+ target = BTreeTupleGetTopParent(leafhikey);
+ leafleftsib = opaque->btpo_prev;
+ leafrightsib = opaque->btpo_next;
+
+ _bt_unlockbuf(rel, leafbuf);
+
+ /*
+ * Check here, as calling loops will have locks held, preventing
+ * interrupts from being processed.
+ */
+ CHECK_FOR_INTERRUPTS();
+
+ /* Unlink the current top parent of the subtree */
+ if (!BlockNumberIsValid(target))
+ {
+ /* Target is leaf page (or leaf page is top parent, if you prefer) */
+ target = leafblkno;
+
+ buf = leafbuf;
+ leftsib = leafleftsib;
+ targetlevel = 0;
+ }
+ else
+ {
+ /* Target is the internal page taken from leaf's top parent link */
+ Assert(target != leafblkno);
+
+ /* Fetch the block number of the target's left sibling */
+ buf = _bt_getbuf(rel, target, BT_READ);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ leftsib = opaque->btpo_prev;
+ targetlevel = opaque->btpo_level;
+ Assert(targetlevel > 0);
+
+ /*
+ * To avoid deadlocks, we'd better drop the target page lock before
+ * going further.
+ */
+ _bt_unlockbuf(rel, buf);
+ }
+
+ /*
+ * We have to lock the pages we need to modify in the standard order:
+ * moving right, then up. Else we will deadlock against other writers.
+ *
+ * So, first lock the leaf page, if it's not the target. Then find and
+ * write-lock the current left sibling of the target page. The sibling
+ * that was current a moment ago could have split, so we may have to move
+ * right.
+ */
+ if (target != leafblkno)
+ _bt_lockbuf(rel, leafbuf, BT_WRITE);
+ if (leftsib != P_NONE)
+ {
+ lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
+ page = BufferGetPage(lbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ while (P_ISDELETED(opaque) || opaque->btpo_next != target)
+ {
+ bool leftsibvalid = true;
+
+ /*
+ * Before we follow the link from the page that was the left
+ * sibling mere moments ago, validate its right link. This
+ * reduces the opportunities for the loop to fail to ever make any
+ * progress in the presence of index corruption.
+ *
+ * Note: we rely on the assumption that there can only be one
+ * vacuum process running at a time (against the same index).
+ */
+ if (P_RIGHTMOST(opaque) || P_ISDELETED(opaque) ||
+ leftsib == opaque->btpo_next)
+ leftsibvalid = false;
+
+ leftsib = opaque->btpo_next;
+ _bt_relbuf(rel, lbuf);
+
+ if (!leftsibvalid)
+ {
+ if (target != leafblkno)
+ {
+ /* we have only a pin on target, but pin+lock on leafbuf */
+ ReleaseBuffer(buf);
+ _bt_relbuf(rel, leafbuf);
+ }
+ else
+ {
+ /* we have only a pin on leafbuf */
+ ReleaseBuffer(leafbuf);
+ }
+
+ ereport(LOG,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("valid left sibling for deletion target could not be located: "
+ "left sibling %u of target %u with leafblkno %u and scanblkno %u in index \"%s\"",
+ leftsib, target, leafblkno, scanblkno,
+ RelationGetRelationName(rel))));
+
+ return false;
+ }
+
+ CHECK_FOR_INTERRUPTS();
+
+ /* step right one page */
+ lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
+ page = BufferGetPage(lbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ }
+ }
+ else
+ lbuf = InvalidBuffer;
+
+ /* Next write-lock the target page itself */
+ _bt_lockbuf(rel, buf, BT_WRITE);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * Check page is still empty etc, else abandon deletion. This is just for
+ * paranoia's sake; a half-dead page cannot resurrect because there can be
+ * only one vacuum process running at a time.
+ */
+ if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
+ elog(ERROR, "target page changed status unexpectedly in block %u of index \"%s\"",
+ target, RelationGetRelationName(rel));
+
+ if (opaque->btpo_prev != leftsib)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("target page left link unexpectedly changed from %u to %u in block %u of index \"%s\"",
+ leftsib, opaque->btpo_prev, target,
+ RelationGetRelationName(rel))));
+
+ if (target == leafblkno)
+ {
+ if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
+ !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
+ elog(ERROR, "target leaf page changed status unexpectedly in block %u of index \"%s\"",
+ target, RelationGetRelationName(rel));
+
+ /* Leaf page is also target page: don't set leaftopparent */
+ leaftopparent = InvalidBlockNumber;
+ }
+ else
+ {
+ IndexTuple finaldataitem;
+
+ if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
+ P_ISLEAF(opaque))
+ elog(ERROR, "target internal page on level %u changed status unexpectedly in block %u of index \"%s\"",
+ targetlevel, target, RelationGetRelationName(rel));
+
+ /* Target is internal: set leaftopparent for next call here... */
+ itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
+ finaldataitem = (IndexTuple) PageGetItem(page, itemid);
+ leaftopparent = BTreeTupleGetDownLink(finaldataitem);
+ /* ...except when it would be a redundant pointer-to-self */
+ if (leaftopparent == leafblkno)
+ leaftopparent = InvalidBlockNumber;
+ }
+
+ /* No leaftopparent for level 0 (leaf page) or level 1 target */
+ Assert(!BlockNumberIsValid(leaftopparent) || targetlevel > 1);
+
+ /*
+ * And next write-lock the (current) right sibling.
+ */
+ rightsib = opaque->btpo_next;
+ rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
+ page = BufferGetPage(rbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (opaque->btpo_prev != target)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("right sibling's left-link doesn't match: "
+ "block %u links to %u instead of expected %u in index \"%s\"",
+ rightsib, opaque->btpo_prev, target,
+ RelationGetRelationName(rel))));
+ rightsib_is_rightmost = P_RIGHTMOST(opaque);
+ *rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
+
+ /*
+ * If we are deleting the next-to-last page on the target's level, then
+ * the rightsib is a candidate to become the new fast root. (In theory, it
+ * might be possible to push the fast root even further down, but the odds
+ * of doing so are slim, and the locking considerations daunting.)
+ *
+ * We can safely acquire a lock on the metapage here --- see comments for
+ * _bt_newroot().
+ */
+ if (leftsib == P_NONE && rightsib_is_rightmost)
+ {
+ page = BufferGetPage(rbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (P_RIGHTMOST(opaque))
+ {
+ /* rightsib will be the only one left on the level */
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
+ metapg = BufferGetPage(metabuf);
+ metad = BTPageGetMeta(metapg);
+
+ /*
+ * The expected case here is btm_fastlevel == targetlevel+1; if
+ * the fastlevel is <= targetlevel, something is wrong, and we
+ * choose to overwrite it to fix it.
+ */
+ if (metad->btm_fastlevel > targetlevel + 1)
+ {
+ /* no update wanted */
+ _bt_relbuf(rel, metabuf);
+ metabuf = InvalidBuffer;
+ }
+ }
+ }
+
+ /*
+ * Here we begin doing the deletion.
+ */
+
+ /* No ereport(ERROR) until changes are logged */
+ START_CRIT_SECTION();
+
+ /*
+ * Update siblings' side-links. Note the target page's side-links will
+ * continue to point to the siblings. Asserts here are just rechecking
+ * things we already verified above.
+ */
+ if (BufferIsValid(lbuf))
+ {
+ page = BufferGetPage(lbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ Assert(opaque->btpo_next == target);
+ opaque->btpo_next = rightsib;
+ }
+ page = BufferGetPage(rbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ Assert(opaque->btpo_prev == target);
+ opaque->btpo_prev = leftsib;
+
+ /*
+ * If we deleted a parent of the targeted leaf page, instead of the leaf
+ * itself, update the leaf to point to the next remaining child in the
+ * subtree.
+ *
+ * Note: We rely on the fact that a buffer pin on the leaf page has been
+ * held since leafhikey was initialized. This is safe, though only
+ * because the page was already half-dead at that point. The leaf page
+ * cannot have been modified by any other backend during the period when
+ * no lock was held.
+ */
+ if (target != leafblkno)
+ BTreeTupleSetTopParent(leafhikey, leaftopparent);
+
+ /*
+ * Mark the page itself deleted. It can be recycled when all current
+ * transactions are gone. Storing GetTopTransactionId() would work, but
+ * we're in VACUUM and would not otherwise have an XID. Having already
+ * updated links to the target, ReadNextFullTransactionId() suffices as an
+ * upper bound. Any scan having retained a now-stale link is advertising
+ * in its PGPROC an xmin less than or equal to the value we read here. It
+ * will continue to do so, holding back the xmin horizon, for the duration
+ * of that scan.
+ */
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque));
+
+ /*
+ * Store upper bound XID that's used to determine when deleted page is no
+ * longer needed as a tombstone
+ */
+ safexid = ReadNextFullTransactionId();
+ BTPageSetDeleted(page, safexid);
+ opaque->btpo_cycleid = 0;
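+
+ /*
+ * (safexid is later compared against the backend's visibility horizon in
+ * _bt_pendingfsm_finalize() and BTPageIsRecyclable() before the deleted
+ * page is finally placed in the FSM for reuse.)
+ */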
+
+ /* And update the metapage, if needed */
+ if (BufferIsValid(metabuf))
+ {
+ /* upgrade metapage if needed */
+ if (metad->btm_version < BTREE_NOVAC_VERSION)
+ _bt_upgrademetapage(metapg);
+ metad->btm_fastroot = rightsib;
+ metad->btm_fastlevel = targetlevel;
+ MarkBufferDirty(metabuf);
+ }
+
+ /* Must mark buffers dirty before XLogInsert */
+ MarkBufferDirty(rbuf);
+ MarkBufferDirty(buf);
+ if (BufferIsValid(lbuf))
+ MarkBufferDirty(lbuf);
+ if (target != leafblkno)
+ MarkBufferDirty(leafbuf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_btree_unlink_page xlrec;
+ xl_btree_metadata xlmeta;
+ uint8 xlinfo;
+ XLogRecPtr recptr;
+
+ XLogBeginInsert();
+
+ XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
+ if (BufferIsValid(lbuf))
+ XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
+ XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
+ if (target != leafblkno)
+ XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
+
+ /* information stored on the target/to-be-unlinked block */
+ xlrec.leftsib = leftsib;
+ xlrec.rightsib = rightsib;
+ xlrec.level = targetlevel;
+ xlrec.safexid = safexid;
+
+ /* information needed to recreate the leaf block (if not the target) */
+ xlrec.leafleftsib = leafleftsib;
+ xlrec.leafrightsib = leafrightsib;
+ xlrec.leaftopparent = leaftopparent;
+
+ XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage);
+
+ if (BufferIsValid(metabuf))
+ {
+ XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
+
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ xlmeta.version = metad->btm_version;
+ xlmeta.root = metad->btm_root;
+ xlmeta.level = metad->btm_level;
+ xlmeta.fastroot = metad->btm_fastroot;
+ xlmeta.fastlevel = metad->btm_fastlevel;
+ xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
+ xlmeta.allequalimage = metad->btm_allequalimage;
+
+ XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata));
+ xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
+ }
+ else
+ xlinfo = XLOG_BTREE_UNLINK_PAGE;
+
+ recptr = XLogInsert(RM_BTREE_ID, xlinfo);
+
+ if (BufferIsValid(metabuf))
+ {
+ PageSetLSN(metapg, recptr);
+ }
+ page = BufferGetPage(rbuf);
+ PageSetLSN(page, recptr);
+ page = BufferGetPage(buf);
+ PageSetLSN(page, recptr);
+ if (BufferIsValid(lbuf))
+ {
+ page = BufferGetPage(lbuf);
+ PageSetLSN(page, recptr);
+ }
+ if (target != leafblkno)
+ {
+ page = BufferGetPage(leafbuf);
+ PageSetLSN(page, recptr);
+ }
+ }
+
+ END_CRIT_SECTION();
+
+ /* release metapage */
+ if (BufferIsValid(metabuf))
+ _bt_relbuf(rel, metabuf);
+
+ /* release siblings */
+ if (BufferIsValid(lbuf))
+ _bt_relbuf(rel, lbuf);
+ _bt_relbuf(rel, rbuf);
+
+ /* If the target is not leafbuf, we're done with it now -- release it */
+ if (target != leafblkno)
+ _bt_relbuf(rel, buf);
+
+ /*
+ * Maintain pages_newly_deleted, which is simply the number of pages
+ * deleted by the ongoing VACUUM operation.
+ *
+ * Maintain pages_deleted in a way that takes into account how
+ * btvacuumpage() will count deleted pages that have yet to become
+ * scanblkno -- only count the page here when it's not going to get that
+ * treatment later on.
+ */
+ stats->pages_newly_deleted++;
+ if (target <= scanblkno)
+ stats->pages_deleted++;
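+
+ /*
+ * (A target block beyond scanblkno will be visited -- and counted -- by
+ * btvacuumpage() later in the scan, which is why it isn't counted here.)
+ */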
+
+ /*
+ * Remember information about the target page (now a newly deleted page)
+ * in dedicated vstate space for later. The page will be considered as a
+ * candidate to place in the FSM at the end of the current btvacuumscan()
+ * call.
+ */
+ _bt_pendingfsm_add(vstate, target, safexid);
+
+ return true;
+}
+
+/*
+ * Establish how tall the to-be-deleted subtree will be during the first stage
+ * of page deletion.
+ *
+ * Caller's child argument is the block number of the page caller wants to
+ * delete (this is leafbuf's block number, except when we're called
+ * recursively). stack is a search stack leading to it. Note that we will
+ * update the stack entry(s) to reflect current downlink positions --- this is
+ * similar to the corresponding point in page split handling.
+ *
+ * If "first stage" caller cannot go ahead with deleting _any_ pages, returns
+ * false. Returns true on success, in which case caller can use certain
+ * details established here to perform the first stage of deletion. This
+ * function is the last point at which page deletion may be deemed unsafe
+ * (barring index corruption, or unexpected concurrent page deletions).
+ *
+ * We write lock the parent of the root of the to-be-deleted subtree for
+ * caller on success (i.e. we leave our lock on the *subtreeparent buffer for
+ * caller). Caller will have to remove a downlink from *subtreeparent. We
+ * also set a *subtreeparent offset number in *poffset, to indicate the
+ * location of the pivot tuple that contains the relevant downlink.
+ *
+ * The root of the to-be-deleted subtree is called the "top parent". Note
+ * that the leafbuf page is often the final "top parent" page (you can think
+ * of the leafbuf page as a degenerate single page subtree when that happens).
+ * Caller should initialize *topparent to the target leafbuf page block number
+ * (while *topparentrightsib should be set to leafbuf's right sibling block
+ * number). We will update *topparent (and *topparentrightsib) for caller
+ * here, though only when it turns out that caller will delete at least one
+ * internal page (i.e. only when caller needs to store a valid link to the top
+ * parent block in the leafbuf page using BTreeTupleSetTopParent()).
+ */
+static bool
+_bt_lock_subtree_parent(Relation rel, BlockNumber child, BTStack stack,
+ Buffer *subtreeparent, OffsetNumber *poffset,
+ BlockNumber *topparent, BlockNumber *topparentrightsib)
+{
+ BlockNumber parent,
+ leftsibparent;
+ OffsetNumber parentoffset,
+ maxoff;
+ Buffer pbuf;
+ Page page;
+ BTPageOpaque opaque;
+
+ /*
+ * Locate the pivot tuple whose downlink points to "child". Write lock
+ * the parent page itself.
+ */
+ pbuf = _bt_getstackbuf(rel, stack, child);
+ if (pbuf == InvalidBuffer)
+ {
+ /*
+ * Failed to "re-find" a pivot tuple whose downlink matched our child
+ * block number on the parent level -- the index must be corrupt.
+ * Don't even try to delete the leafbuf subtree. Just report the
+ * issue and press on with vacuuming the index.
+ *
+ * Note: _bt_getstackbuf() recovers from concurrent page splits that
+ * take place on the parent level. Its approach is a near-exhaustive
+ * linear search. This also gives it a surprisingly good chance of
+ * recovering in the event of a buggy or inconsistent opclass. But we
+ * don't rely on that here.
+ */
+ ereport(LOG,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u",
+ RelationGetRelationName(rel), child)));
+ return false;
+ }
+
+ parent = stack->bts_blkno;
+ parentoffset = stack->bts_offset;
+
+ page = BufferGetPage(pbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ maxoff = PageGetMaxOffsetNumber(page);
+ leftsibparent = opaque->btpo_prev;
+
+ /*
+ * _bt_getstackbuf() completes page splits on returned parent buffer when
+ * required.
+ *
+ * In general it's a bad idea for VACUUM to use up more disk space, which
+ * is why page deletion does not finish incomplete page splits most of the
+ * time. We allow this limited exception because the risk is much lower,
+ * and the potential downside of not proceeding is much higher: A single
+ * internal page with the INCOMPLETE_SPLIT flag set might otherwise
+ * prevent us from deleting hundreds of empty leaf pages from one level
+ * down.
+ */
+ Assert(!P_INCOMPLETE_SPLIT(opaque));
+
+ if (parentoffset < maxoff)
+ {
+ /*
+ * Child is not the rightmost child in parent, so it's safe to delete
+ * the subtree whose root/topparent is child page
+ */
+ *subtreeparent = pbuf;
+ *poffset = parentoffset;
+ return true;
+ }
+
+ /*
+ * Child is the rightmost child of parent.
+ *
+ * Since it's the rightmost child of parent, deleting the child (or
+ * deleting the subtree whose root/topparent is the child page) is only
+ * safe when it's also possible to delete the parent.
+ */
+ Assert(parentoffset == maxoff);
+ if (parentoffset != P_FIRSTDATAKEY(opaque) || P_RIGHTMOST(opaque))
+ {
+ /*
+ * Child isn't parent's only child, or parent is rightmost on its
+ * entire level. Definitely cannot delete any pages.
+ */
+ _bt_relbuf(rel, pbuf);
+ return false;
+ }
+
+ /*
+ * Now make sure that the parent deletion is itself safe by examining the
+ * child's grandparent page. Recurse, passing the parent page as the
+ * child page (child's grandparent is the parent on the next level up). If
+ * parent deletion is unsafe, then child deletion must also be unsafe (in
+ * which case caller cannot delete any pages at all).
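+ *
+ * For example: if leafbuf is the only child of its parent P, and P is not
+ * the rightmost child of its own parent G, the recursion stops at G.  G
+ * becomes *subtreeparent, P becomes the top parent, and the to-be-deleted
+ * subtree consists of P plus leafbuf.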
+ */
+ *topparent = parent;
+ *topparentrightsib = opaque->btpo_next;
+
+ /*
+ * Release lock on parent before recursing.
+ *
+ * It's OK to release page locks on parent before recursive call locks
+ * grandparent. An internal page can only acquire an entry if the child
+ * is split, but that cannot happen as long as we still hold a lock on the
+ * leafbuf page.
+ */
+ _bt_relbuf(rel, pbuf);
+
+ /*
+ * Before recursing, check that the left sibling of parent (if any) is not
+ * marked with INCOMPLETE_SPLIT flag first (must do so after we drop the
+ * parent lock).
+ *
+ * Note: We deliberately avoid completing incomplete splits here.
+ */
+ if (_bt_leftsib_splitflag(rel, leftsibparent, parent))
+ return false;
+
+ /* Recurse to examine child page's grandparent page */
+ return _bt_lock_subtree_parent(rel, parent, stack->bts_parent,
+ subtreeparent, poffset,
+ topparent, topparentrightsib);
+}
+
+/*
+ * Initialize local memory state used by VACUUM for _bt_pendingfsm_finalize
+ * optimization.
+ *
+ * Called at the start of a btvacuumscan(). Caller's cleanuponly argument
+ * indicates whether the ongoing VACUUM has not (and will not) call
+ * btbulkdelete().
+ *
+ * We expect to allocate memory inside VACUUM's top-level memory context here.
+ * The working buffer is subject to a limit based on work_mem. Our strategy
+ * when the array can no longer grow within the bounds of that limit is to
+ * stop saving additional newly deleted pages, while proceeding as usual with
+ * the pages that we can fit.
+ */
+void
+_bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly)
+{
+ int64 maxbufsize;
+
+ /*
+ * Don't bother with optimization in cleanup-only case -- we don't expect
+ * any newly deleted pages. Besides, cleanup-only calls to btvacuumscan()
+ * can only take place because this optimization didn't work out during
+ * the last VACUUM.
+ */
+ if (cleanuponly)
+ return;
+
+ /*
+ * Cap maximum size of array so that we always respect work_mem. Avoid
+ * int overflow here.
+ */
+ vstate->bufsize = 256;
+ maxbufsize = (work_mem * 1024L) / sizeof(BTPendingFSM);
+ maxbufsize = Min(maxbufsize, INT_MAX);
+ maxbufsize = Min(maxbufsize, MaxAllocSize / sizeof(BTPendingFSM));
+ /* Stay sane with small work_mem */
+ maxbufsize = Max(maxbufsize, vstate->bufsize);
+ vstate->maxbufsize = maxbufsize;
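+
+ /*
+ * For example, with the default work_mem of 4MB, and assuming
+ * sizeof(BTPendingFSM) is 16 bytes, maxbufsize works out to 262,144
+ * pending pages.
+ */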
+
+ /* Allocate buffer, indicate that there are currently 0 pending pages */
+ vstate->pendingpages = palloc(sizeof(BTPendingFSM) * vstate->bufsize);
+ vstate->npendingpages = 0;
+}
+
+/*
+ * Place any newly deleted pages (i.e. pages that _bt_pagedel() deleted during
+ * the ongoing VACUUM operation) into the free space map -- though only when
+ * it is actually safe to do so by now.
+ *
+ * Called at the end of a btvacuumscan(), just before free space map vacuuming
+ * takes place.
+ *
+ * Frees memory allocated by _bt_pendingfsm_init(), if any.
+ */
+void
+_bt_pendingfsm_finalize(Relation rel, BTVacState *vstate)
+{
+ IndexBulkDeleteResult *stats = vstate->stats;
+
+ Assert(stats->pages_newly_deleted >= vstate->npendingpages);
+
+ if (vstate->npendingpages == 0)
+ {
+ /* Just free memory when nothing to do */
+ if (vstate->pendingpages)
+ pfree(vstate->pendingpages);
+
+ return;
+ }
+
+#ifdef DEBUG_BTREE_PENDING_FSM
+
+ /*
+ * Debugging aid: Sleep for 5 seconds to greatly increase the chances of
+ * placing pending pages in the FSM. Note that the optimization will
+ * never be effective without some other backend concurrently consuming an
+ * XID.
+ */
+ pg_usleep(5000000L);
+#endif
+
+ /*
+ * Recompute VACUUM XID boundaries.
+ *
+ * We don't actually care about the oldest non-removable XID. Computing
+ * the oldest such XID has a useful side-effect that we rely on: it
+ * forcibly updates the XID horizon state for this backend. This step is
+ * essential; GlobalVisCheckRemovableFullXid() will not reliably recognize
+ * that it is now safe to recycle newly deleted pages without this step.
+ */
+ GetOldestNonRemovableTransactionId(NULL);
+
+ for (int i = 0; i < vstate->npendingpages; i++)
+ {
+ BlockNumber target = vstate->pendingpages[i].target;
+ FullTransactionId safexid = vstate->pendingpages[i].safexid;
+
+ /*
+ * Do the equivalent of checking BTPageIsRecyclable(), but without
+ * accessing the page again a second time.
+ *
+ * Stop as soon as we see a non-recyclable page -- all later pages
+ * must be non-recyclable too, since _bt_pendingfsm_add() adds pages
+ * to the array in safexid order.
+ */
+ if (!GlobalVisCheckRemovableFullXid(NULL, safexid))
+ break;
+
+ RecordFreeIndexPage(rel, target);
+ stats->pages_free++;
+ }
+
+ pfree(vstate->pendingpages);
+}
+
+/*
+ * Maintain array of pages that were deleted during current btvacuumscan()
+ * call, for use in _bt_pendingfsm_finalize()
+ */
+static void
+_bt_pendingfsm_add(BTVacState *vstate,
+ BlockNumber target,
+ FullTransactionId safexid)
+{
+ Assert(vstate->npendingpages <= vstate->bufsize);
+ Assert(vstate->bufsize <= vstate->maxbufsize);
+
+#ifdef USE_ASSERT_CHECKING
+
+ /*
+ * Verify an assumption made by _bt_pendingfsm_finalize(): pages from the
+ * array will always be in safexid order (since that is the order that we
+ * save them in here)
+ */
+ if (vstate->npendingpages > 0)
+ {
+ FullTransactionId lastsafexid =
+ vstate->pendingpages[vstate->npendingpages - 1].safexid;
+
+ Assert(FullTransactionIdFollowsOrEquals(safexid, lastsafexid));
+ }
+#endif
+
+ /*
+ * If temp buffer reaches maxbufsize/work_mem capacity then we discard
+ * information about this page.
+ *
+ * Note that this also covers the case where we opted to not use the
+ * optimization in _bt_pendingfsm_init().
+ */
+ if (vstate->npendingpages == vstate->maxbufsize)
+ return;
+
+ /* Consider enlarging buffer */
+ if (vstate->npendingpages == vstate->bufsize)
+ {
+ int newbufsize = vstate->bufsize * 2;
+
+ /* Respect work_mem */
+ if (newbufsize > vstate->maxbufsize)
+ newbufsize = vstate->maxbufsize;
+
+ vstate->bufsize = newbufsize;
+ vstate->pendingpages =
+ repalloc(vstate->pendingpages,
+ sizeof(BTPendingFSM) * vstate->bufsize);
+ }
+
+ /* Save metadata for newly deleted page */
+ vstate->pendingpages[vstate->npendingpages].target = target;
+ vstate->pendingpages[vstate->npendingpages].safexid = safexid;
+ vstate->npendingpages++;
+}
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
new file mode 100644
index 0000000..1360ab8
--- /dev/null
+++ b/src/backend/access/nbtree/nbtree.c
@@ -0,0 +1,1446 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtree.c
+ * Implementation of Lehman and Yao's btree management algorithm for
+ * Postgres.
+ *
+ * NOTES
+ * This file contains only the public interface routines.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtree.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "access/nbtxlog.h"
+#include "access/relscan.h"
+#include "access/xlog.h"
+#include "commands/progress.h"
+#include "commands/vacuum.h"
+#include "miscadmin.h"
+#include "nodes/execnodes.h"
+#include "pgstat.h"
+#include "postmaster/autovacuum.h"
+#include "storage/condition_variable.h"
+#include "storage/indexfsm.h"
+#include "storage/ipc.h"
+#include "storage/lmgr.h"
+#include "storage/smgr.h"
+#include "utils/builtins.h"
+#include "utils/index_selfuncs.h"
+#include "utils/memutils.h"
+
+
+/*
+ * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
+ *
+ * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
+ * a new page; others must wait.
+ *
+ * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
+ * to a new page; some process can start doing that.
+ *
+ * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
+ * We reach this state once for every distinct combination of array keys.
+ */
+typedef enum
+{
+ BTPARALLEL_NOT_INITIALIZED,
+ BTPARALLEL_ADVANCING,
+ BTPARALLEL_IDLE,
+ BTPARALLEL_DONE
+} BTPS_State;
+
+/*
+ * BTParallelScanDescData contains btree specific shared information required
+ * for parallel scan.
+ */
+typedef struct BTParallelScanDescData
+{
+ BlockNumber btps_scanPage; /* latest or next page to be scanned */
+ BTPS_State btps_pageStatus; /* indicates whether next page is
+ * available for scan. see above for
+ * possible states of parallel scan. */
+ int btps_arrayKeyCount; /* count indicating number of array scan
+ * keys processed by parallel scan */
+ slock_t btps_mutex; /* protects above variables */
+ ConditionVariable btps_cv; /* used to synchronize parallel scan */
+} BTParallelScanDescData;
+
+typedef struct BTParallelScanDescData *BTParallelScanDesc;
+
+
+static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+ IndexBulkDeleteCallback callback, void *callback_state,
+ BTCycleId cycleid);
+static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
+static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
+ IndexTuple posting,
+ OffsetNumber updatedoffset,
+ int *nremaining);
+
+
+/*
+ * Btree handler function: return IndexAmRoutine with access method parameters
+ * and callbacks.
+ */
+Datum
+bthandler(PG_FUNCTION_ARGS)
+{
+ IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
+
+ amroutine->amstrategies = BTMaxStrategyNumber;
+ amroutine->amsupport = BTNProcs;
+ amroutine->amoptsprocnum = BTOPTIONS_PROC;
+ amroutine->amcanorder = true;
+ amroutine->amcanorderbyop = false;
+ amroutine->amcanbackward = true;
+ amroutine->amcanunique = true;
+ amroutine->amcanmulticol = true;
+ amroutine->amoptionalkey = true;
+ amroutine->amsearcharray = true;
+ amroutine->amsearchnulls = true;
+ amroutine->amstorage = false;
+ amroutine->amclusterable = true;
+ amroutine->ampredlocks = true;
+ amroutine->amcanparallel = true;
+ amroutine->amcaninclude = true;
+ amroutine->amusemaintenanceworkmem = false;
+ amroutine->amparallelvacuumoptions =
+ VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
+ amroutine->amkeytype = InvalidOid;
+
+ amroutine->ambuild = btbuild;
+ amroutine->ambuildempty = btbuildempty;
+ amroutine->aminsert = btinsert;
+ amroutine->ambulkdelete = btbulkdelete;
+ amroutine->amvacuumcleanup = btvacuumcleanup;
+ amroutine->amcanreturn = btcanreturn;
+ amroutine->amcostestimate = btcostestimate;
+ amroutine->amoptions = btoptions;
+ amroutine->amproperty = btproperty;
+ amroutine->ambuildphasename = btbuildphasename;
+ amroutine->amvalidate = btvalidate;
+ amroutine->amadjustmembers = btadjustmembers;
+ amroutine->ambeginscan = btbeginscan;
+ amroutine->amrescan = btrescan;
+ amroutine->amgettuple = btgettuple;
+ amroutine->amgetbitmap = btgetbitmap;
+ amroutine->amendscan = btendscan;
+ amroutine->ammarkpos = btmarkpos;
+ amroutine->amrestrpos = btrestrpos;
+ amroutine->amestimateparallelscan = btestimateparallelscan;
+ amroutine->aminitparallelscan = btinitparallelscan;
+ amroutine->amparallelrescan = btparallelrescan;
+
+ PG_RETURN_POINTER(amroutine);
+}
+
+/*
+ * btbuildempty() -- build an empty btree index in the initialization fork
+ */
+void
+btbuildempty(Relation index)
+{
+ Page metapage;
+
+ /* Construct metapage. */
+ metapage = (Page) palloc(BLCKSZ);
+ _bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false));
+
+ /*
+ * Write the page and log it. It might seem that an immediate sync would
+ * be sufficient to guarantee that the file exists on disk, but recovery
+ * itself might remove it while replaying, for example, an
+ * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record. Therefore, we need
+ * this even when wal_level=minimal.
+ */
+ PageSetChecksumInplace(metapage, BTREE_METAPAGE);
+ smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
+ (char *) metapage, true);
+ log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
+ BTREE_METAPAGE, metapage, true);
+
+ /*
+ * An immediate sync is required even if we xlog'd the page, because the
+ * write did not go through shared_buffers and therefore a concurrent
+ * checkpoint may have moved the redo pointer past our xlog record.
+ */
+ smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
+}
+
+/*
+ * btinsert() -- insert an index tuple into a btree.
+ *
+ * Descend the tree recursively, find the appropriate location for our
+ * new tuple, and put it there.
+ */
+bool
+btinsert(Relation rel, Datum *values, bool *isnull,
+ ItemPointer ht_ctid, Relation heapRel,
+ IndexUniqueCheck checkUnique,
+ bool indexUnchanged,
+ IndexInfo *indexInfo)
+{
+ bool result;
+ IndexTuple itup;
+
+ /* generate an index tuple */
+ itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
+ itup->t_tid = *ht_ctid;
+
+ result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
+
+ pfree(itup);
+
+ return result;
+}
+
+/*
+ * btgettuple() -- Get the next tuple in the scan.
+ */
+bool
+btgettuple(IndexScanDesc scan, ScanDirection dir)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ bool res;
+
+ /* btree indexes are never lossy */
+ scan->xs_recheck = false;
+
+ /*
+ * If we have any array keys, initialize them during first call for a
+ * scan. We can't do this in btrescan because we don't know the scan
+ * direction at that time.
+ */
+ if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
+ {
+ /* punt if we have any unsatisfiable array keys */
+ if (so->numArrayKeys < 0)
+ return false;
+
+ _bt_start_array_keys(scan, dir);
+ }
+
+ /* This loop handles advancing to the next array elements, if any */
+ do
+ {
+ /*
+ * If we've already initialized this scan, we can just advance it in
+ * the appropriate direction. If we haven't done so yet, we call
+ * _bt_first() to get the first item in the scan.
+ */
+ if (!BTScanPosIsValid(so->currPos))
+ res = _bt_first(scan, dir);
+ else
+ {
+ /*
+ * Check to see if we should kill the previously-fetched tuple.
+ */
+ if (scan->kill_prior_tuple)
+ {
+ /*
+ * Yes, remember it for later. (We'll deal with all such
+ * tuples at once right before leaving the index page.) The
+ * test for numKilled overrun is not just paranoia: if the
+ * caller reverses direction in the indexscan then the same
+ * item might get entered multiple times. It's not worth
+ * trying to optimize that, so we don't detect it, but instead
+ * just forget any excess entries.
+ */
+ if (so->killedItems == NULL)
+ so->killedItems = (int *)
+ palloc(MaxTIDsPerBTreePage * sizeof(int));
+ if (so->numKilled < MaxTIDsPerBTreePage)
+ so->killedItems[so->numKilled++] = so->currPos.itemIndex;
+ }
+
+ /*
+ * Now continue the scan.
+ */
+ res = _bt_next(scan, dir);
+ }
+
+ /* If we have a tuple, return it ... */
+ if (res)
+ break;
+ /* ... otherwise see if we have more array keys to deal with */
+ } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
+
+ return res;
+}
+
+/*
+ * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
+ */
+int64
+btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ int64 ntids = 0;
+ ItemPointer heapTid;
+
+ /*
+ * If we have any array keys, initialize them.
+ */
+ if (so->numArrayKeys)
+ {
+ /* punt if we have any unsatisfiable array keys */
+ if (so->numArrayKeys < 0)
+ return ntids;
+
+ _bt_start_array_keys(scan, ForwardScanDirection);
+ }
+
+ /* This loop handles advancing to the next array elements, if any */
+ do
+ {
+ /* Fetch the first page & tuple */
+ if (_bt_first(scan, ForwardScanDirection))
+ {
+ /* Save tuple ID, and continue scanning */
+ heapTid = &scan->xs_heaptid;
+ tbm_add_tuples(tbm, heapTid, 1, false);
+ ntids++;
+
+ for (;;)
+ {
+ /*
+ * Advance to next tuple within page. This is the same as the
+ * easy case in _bt_next().
+ */
+ if (++so->currPos.itemIndex > so->currPos.lastItem)
+ {
+ /* let _bt_next do the heavy lifting */
+ if (!_bt_next(scan, ForwardScanDirection))
+ break;
+ }
+
+ /* Save tuple ID, and continue scanning */
+ heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
+ tbm_add_tuples(tbm, heapTid, 1, false);
+ ntids++;
+ }
+ }
+ /* Now see if we have more array keys to deal with */
+ } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
+
+ return ntids;
+}
+
+/*
+ * btbeginscan() -- start a scan on a btree index
+ */
+IndexScanDesc
+btbeginscan(Relation rel, int nkeys, int norderbys)
+{
+ IndexScanDesc scan;
+ BTScanOpaque so;
+
+ /* no order by operators allowed */
+ Assert(norderbys == 0);
+
+ /* get the scan */
+ scan = RelationGetIndexScan(rel, nkeys, norderbys);
+
+ /* allocate private workspace */
+ so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
+ BTScanPosInvalidate(so->currPos);
+ BTScanPosInvalidate(so->markPos);
+ if (scan->numberOfKeys > 0)
+ so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
+ else
+ so->keyData = NULL;
+
+ so->arrayKeyData = NULL; /* assume no array keys for now */
+ so->numArrayKeys = 0;
+ so->arrayKeys = NULL;
+ so->arrayContext = NULL;
+
+ so->killedItems = NULL; /* until needed */
+ so->numKilled = 0;
+
+ /*
+ * We don't know yet whether the scan will be index-only, so we do not
+ * allocate the tuple workspace arrays until btrescan. However, we set up
+ * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
+ */
+ so->currTuples = so->markTuples = NULL;
+
+ scan->xs_itupdesc = RelationGetDescr(rel);
+
+ scan->opaque = so;
+
+ return scan;
+}
+
+/*
+ * btrescan() -- rescan an index relation
+ */
+void
+btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
+ ScanKey orderbys, int norderbys)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+ /* we aren't holding any read locks, but gotta drop the pins */
+ if (BTScanPosIsValid(so->currPos))
+ {
+ /* Before leaving current page, deal with any killed items */
+ if (so->numKilled > 0)
+ _bt_killitems(scan);
+ BTScanPosUnpinIfPinned(so->currPos);
+ BTScanPosInvalidate(so->currPos);
+ }
+
+ so->markItemIndex = -1;
+ so->arrayKeyCount = 0;
+ BTScanPosUnpinIfPinned(so->markPos);
+ BTScanPosInvalidate(so->markPos);
+
+ /*
+ * Allocate tuple workspace arrays, if needed for an index-only scan and
+ * not already done in a previous rescan call. To save on palloc
+ * overhead, both workspaces are allocated as one palloc block; only this
+ * function and btendscan know that.
+ *
+ * NOTE: this data structure also makes it safe to return data from a
+ * "name" column, even though btree name_ops uses an underlying storage
+ * datatype of cstring. The risk there is that "name" is supposed to be
+ * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
+ * However, since we only return data out of tuples sitting in the
+ * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
+ * data out of the markTuples array --- running off the end of memory for
+ * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
+ * adding special-case treatment for name_ops elsewhere.
+ */
+ if (scan->xs_want_itup && so->currTuples == NULL)
+ {
+ so->currTuples = (char *) palloc(BLCKSZ * 2);
+ so->markTuples = so->currTuples + BLCKSZ;
+ }
+
+ /*
+ * Reset the scan keys
+ */
+ if (scankey && scan->numberOfKeys > 0)
+ memmove(scan->keyData,
+ scankey,
+ scan->numberOfKeys * sizeof(ScanKeyData));
+ so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
+
+ /* If any keys are SK_SEARCHARRAY type, set up array-key info */
+ _bt_preprocess_array_keys(scan);
+}
+
+/*
+ * btendscan() -- close down a scan
+ */
+void
+btendscan(IndexScanDesc scan)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+ /* we aren't holding any read locks, but gotta drop the pins */
+ if (BTScanPosIsValid(so->currPos))
+ {
+ /* Before leaving current page, deal with any killed items */
+ if (so->numKilled > 0)
+ _bt_killitems(scan);
+ BTScanPosUnpinIfPinned(so->currPos);
+ }
+
+ so->markItemIndex = -1;
+ BTScanPosUnpinIfPinned(so->markPos);
+
+ /* No need to invalidate positions, the RAM is about to be freed. */
+
+ /* Release storage */
+ if (so->keyData != NULL)
+ pfree(so->keyData);
+ /* so->arrayKeyData and so->arrayKeys are in arrayContext */
+ if (so->arrayContext != NULL)
+ MemoryContextDelete(so->arrayContext);
+ if (so->killedItems != NULL)
+ pfree(so->killedItems);
+ if (so->currTuples != NULL)
+ pfree(so->currTuples);
+ /* so->markTuples should not be pfree'd, see btrescan */
+ pfree(so);
+}
+
+/*
+ * btmarkpos() -- save current scan position
+ */
+void
+btmarkpos(IndexScanDesc scan)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+ /* There may be an old mark with a pin (but no lock). */
+ BTScanPosUnpinIfPinned(so->markPos);
+
+ /*
+ * Just record the current itemIndex. If we later step to next page
+ * before releasing the marked position, _bt_steppage makes a full copy of
+ * the currPos struct in markPos. If (as often happens) the mark is moved
+ * before we leave the page, we don't have to do that work.
+ */
+ if (BTScanPosIsValid(so->currPos))
+ so->markItemIndex = so->currPos.itemIndex;
+ else
+ {
+ BTScanPosInvalidate(so->markPos);
+ so->markItemIndex = -1;
+ }
+
+ /* Also record the current positions of any array keys */
+ if (so->numArrayKeys)
+ _bt_mark_array_keys(scan);
+}
+
+/*
+ * btrestrpos() -- restore scan to last saved position
+ */
+void
+btrestrpos(IndexScanDesc scan)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+ /* Restore the marked positions of any array keys */
+ if (so->numArrayKeys)
+ _bt_restore_array_keys(scan);
+
+ if (so->markItemIndex >= 0)
+ {
+ /*
+ * The scan has never moved to a new page since the last mark. Just
+ * restore the itemIndex.
+ *
+ * NB: In this case we can't count on anything in so->markPos to be
+ * accurate.
+ */
+ so->currPos.itemIndex = so->markItemIndex;
+ }
+ else
+ {
+ /*
+ * The scan moved to a new page after last mark or restore, and we are
+ * now restoring to the marked page. We aren't holding any read
+ * locks, but if we're still holding the pin for the current position,
+ * we must drop it.
+ */
+ if (BTScanPosIsValid(so->currPos))
+ {
+ /* Before leaving current page, deal with any killed items */
+ if (so->numKilled > 0)
+ _bt_killitems(scan);
+ BTScanPosUnpinIfPinned(so->currPos);
+ }
+
+ if (BTScanPosIsValid(so->markPos))
+ {
+ /* bump pin on mark buffer for assignment to current buffer */
+ if (BTScanPosIsPinned(so->markPos))
+ IncrBufferRefCount(so->markPos.buf);
+ memcpy(&so->currPos, &so->markPos,
+ offsetof(BTScanPosData, items[1]) +
+ so->markPos.lastItem * sizeof(BTScanPosItem));
+ if (so->currTuples)
+ memcpy(so->currTuples, so->markTuples,
+ so->markPos.nextTupleOffset);
+ }
+ else
+ BTScanPosInvalidate(so->currPos);
+ }
+}
+
+/*
+ * btestimateparallelscan -- estimate storage for BTParallelScanDescData
+ */
+Size
+btestimateparallelscan(void)
+{
+ return sizeof(BTParallelScanDescData);
+}
+
+/*
+ * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
+ */
+void
+btinitparallelscan(void *target)
+{
+ BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
+
+ SpinLockInit(&bt_target->btps_mutex);
+ bt_target->btps_scanPage = InvalidBlockNumber;
+ bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
+ bt_target->btps_arrayKeyCount = 0;
+ ConditionVariableInit(&bt_target->btps_cv);
+}
+
+/*
+ * btparallelrescan() -- reset parallel scan
+ */
+void
+btparallelrescan(IndexScanDesc scan)
+{
+ BTParallelScanDesc btscan;
+ ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
+
+ Assert(parallel_scan);
+
+ btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
+ parallel_scan->ps_offset);
+
+ /*
+ * In theory, we don't need to acquire the spinlock here, because there
+ * shouldn't be any other workers running at this point, but we do so for
+ * consistency.
+ */
+ SpinLockAcquire(&btscan->btps_mutex);
+ btscan->btps_scanPage = InvalidBlockNumber;
+ btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
+ btscan->btps_arrayKeyCount = 0;
+ SpinLockRelease(&btscan->btps_mutex);
+}
+
+/*
+ * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
+ * page. Other scans must wait until we call _bt_parallel_release()
+ * or _bt_parallel_done().
+ *
+ * The return value is true if we successfully seized the scan and false
+ * if we did not. The latter case occurs if no pages remain for the current
+ * set of scankeys.
+ *
+ * If the return value is true, *pageno returns the next or current page
+ * of the scan (depending on the scan direction). An invalid block number
+ * means the scan hasn't yet started, and P_NONE means we've reached the end.
+ * The first time a participating process reaches the last page, it will return
+ * true and set *pageno to P_NONE; after that, further attempts to seize the
+ * scan will return false.
+ *
+ * Callers should ignore the value of pageno if the return value is false.
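+ *
+ * To summarize the state transitions driven from here and the related
+ * functions below: a successful seize moves btps_pageStatus from
+ * NOT_INITIALIZED or IDLE to ADVANCING; _bt_parallel_release() moves it
+ * back to IDLE once the next page is known; _bt_parallel_done() moves it
+ * to DONE when no pages remain for the current set of array keys.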
+ */
+bool
+_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ BTPS_State pageStatus;
+ bool exit_loop = false;
+ bool status = true;
+ ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
+ BTParallelScanDesc btscan;
+
+ *pageno = P_NONE;
+
+ btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
+ parallel_scan->ps_offset);
+
+ while (1)
+ {
+ SpinLockAcquire(&btscan->btps_mutex);
+ pageStatus = btscan->btps_pageStatus;
+
+ if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
+ {
+ /* Parallel scan has already advanced to a new set of scankeys. */
+ status = false;
+ }
+ else if (pageStatus == BTPARALLEL_DONE)
+ {
+ /*
+ * We're done with this set of scankeys. This may be the end, or
+ * there could be more sets to try.
+ */
+ status = false;
+ }
+ else if (pageStatus != BTPARALLEL_ADVANCING)
+ {
+ /*
+ * We have successfully seized control of the scan for the purpose
+ * of advancing it to a new page!
+ */
+ btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
+ *pageno = btscan->btps_scanPage;
+ exit_loop = true;
+ }
+ SpinLockRelease(&btscan->btps_mutex);
+ if (exit_loop || !status)
+ break;
+ ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
+ }
+ ConditionVariableCancelSleep();
+
+ return status;
+}
+
+/*
+ * _bt_parallel_release() -- Complete the process of advancing the scan to a
+ * new page. We now have the new value btps_scanPage; some other backend
+ * can now begin advancing the scan.
+ */
+void
+_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
+{
+ ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
+ BTParallelScanDesc btscan;
+
+ btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
+ parallel_scan->ps_offset);
+
+ SpinLockAcquire(&btscan->btps_mutex);
+ btscan->btps_scanPage = scan_page;
+ btscan->btps_pageStatus = BTPARALLEL_IDLE;
+ SpinLockRelease(&btscan->btps_mutex);
+ ConditionVariableSignal(&btscan->btps_cv);
+}
+
+/*
+ * _bt_parallel_done() -- Mark the parallel scan as complete.
+ *
+ * When there are no pages left to scan, this function should be called to
+ * notify other workers. Otherwise, they might wait forever for the scan to
+ * advance to the next page.
+ */
+void
+_bt_parallel_done(IndexScanDesc scan)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
+ BTParallelScanDesc btscan;
+ bool status_changed = false;
+
+ /* Do nothing, for non-parallel scans */
+ if (parallel_scan == NULL)
+ return;
+
+ btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
+ parallel_scan->ps_offset);
+
+ /*
+ * Mark the parallel scan as done for this combination of scan keys,
+ * unless some other process already did so. See also
+ * _bt_advance_array_keys.
+ */
+ SpinLockAcquire(&btscan->btps_mutex);
+ if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
+ btscan->btps_pageStatus != BTPARALLEL_DONE)
+ {
+ btscan->btps_pageStatus = BTPARALLEL_DONE;
+ status_changed = true;
+ }
+ SpinLockRelease(&btscan->btps_mutex);
+
+ /* wake up all the workers associated with this parallel scan */
+ if (status_changed)
+ ConditionVariableBroadcast(&btscan->btps_cv);
+}
+
+/*
+ * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
+ * keys.
+ *
+ * Updates the count of array keys processed for both local and parallel
+ * scans.
+ */
+void
+_bt_parallel_advance_array_keys(IndexScanDesc scan)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
+ BTParallelScanDesc btscan;
+
+ btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
+ parallel_scan->ps_offset);
+
+ so->arrayKeyCount++;
+ SpinLockAcquire(&btscan->btps_mutex);
+ if (btscan->btps_pageStatus == BTPARALLEL_DONE)
+ {
+ btscan->btps_scanPage = InvalidBlockNumber;
+ btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
+ btscan->btps_arrayKeyCount++;
+ }
+ SpinLockRelease(&btscan->btps_mutex);
+}
+
+/*
+ * Bulk deletion of all index entries pointing to a set of heap tuples.
+ * The set of target tuples is specified via a callback routine that tells
+ * whether any given heap tuple (identified by ItemPointer) is being deleted.
+ *
+ * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ */
+IndexBulkDeleteResult *
+btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+ IndexBulkDeleteCallback callback, void *callback_state)
+{
+ Relation rel = info->index;
+ BTCycleId cycleid;
+
+ /* allocate stats if first time through, else re-use existing struct */
+ if (stats == NULL)
+ stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+
+ /* Establish the vacuum cycle ID to use for this scan */
+ /* The ENSURE stuff ensures we clean up shared memory on failure */
+ PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
+ {
+ cycleid = _bt_start_vacuum(rel);
+
+ btvacuumscan(info, stats, callback, callback_state, cycleid);
+ }
+ PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
+ _bt_end_vacuum(rel);
+
+ return stats;
+}
+
+/*
+ * Post-VACUUM cleanup.
+ *
+ * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ */
+IndexBulkDeleteResult *
+btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
+{
+ BlockNumber num_delpages;
+
+ /* No-op in ANALYZE ONLY mode */
+ if (info->analyze_only)
+ return stats;
+
+ /*
+ * If btbulkdelete was called, we need not do anything (we just maintain
+ * the information used within _bt_vacuum_needs_cleanup() by calling
+ * _bt_set_cleanup_info() below).
+ *
+ * If btbulkdelete was _not_ called, then we have a choice to make: we
+ * must decide whether or not a btvacuumscan() call is needed now (i.e.
+ * whether the ongoing VACUUM operation can entirely avoid a physical scan
+ * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
+ * now.
+ */
+ if (stats == NULL)
+ {
+ /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
+ if (!_bt_vacuum_needs_cleanup(info->index))
+ return NULL;
+
+ /*
+ * Since we aren't going to actually delete any leaf items, there's no
+ * need to go through all the vacuum-cycle-ID pushups here.
+ *
+ * Posting list tuples are a source of inaccuracy for cleanup-only
+ * scans. btvacuumscan() will assume that the number of index tuples
+ * from each page can be used as num_index_tuples, even though
+ * num_index_tuples is supposed to represent the number of TIDs in the
+ * index. This naive approach can underestimate the number of tuples
+ * in the index significantly.
+ *
+ * We handle the problem by making num_index_tuples an estimate in
+ * the cleanup-only case.
+ */
+ stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+ btvacuumscan(info, stats, NULL, NULL, 0);
+ stats->estimated_count = true;
+ }
+
+ /*
+ * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
+ *
+ * num_delpages is the number of deleted pages now in the index that were
+ * not safe to place in the FSM to be recycled just yet. num_delpages is
+ * greater than 0 only when _bt_pagedel() actually deleted pages during
+ * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
+ * have failed to place any newly deleted pages in the FSM just moments
+ * ago. (Actually, there are edge cases where recycling of the current
+ * VACUUM's newly deleted pages does not even become safe by the time the
+ * next VACUUM comes around. See nbtree/README.)
+ */
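+ /*
+ * As a hypothetical example: if pages_deleted is 10 and pages_free is 7
+ * at this point, num_delpages is recorded as 3, i.e. three deleted pages
+ * that could not be recycled yet, which the next VACUUM's
+ * _bt_vacuum_needs_cleanup() call will take into account.
+ */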
+ Assert(stats->pages_deleted >= stats->pages_free);
+ num_delpages = stats->pages_deleted - stats->pages_free;
+ _bt_set_cleanup_info(info->index, num_delpages);
+
+ /*
+ * It's quite possible for us to be fooled by concurrent page splits into
+ * double-counting some index tuples, so disbelieve any total that exceeds
+ * the underlying heap's count ... if we know that accurately. Otherwise
+ * this might just make matters worse.
+ */
+ if (!info->estimated_count)
+ {
+ if (stats->num_index_tuples > info->num_heap_tuples)
+ stats->num_index_tuples = info->num_heap_tuples;
+ }
+
+ return stats;
+}
+
+/*
+ * btvacuumscan --- scan the index for VACUUMing purposes
+ *
+ * This combines the functions of looking for leaf tuples that are deletable
+ * according to the vacuum callback, looking for empty pages that can be
+ * deleted, and looking for old deleted pages that can be recycled. Both
+ * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
+ * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
+ *
+ * The caller is responsible for initially allocating/zeroing a stats struct
+ * and for obtaining a vacuum cycle ID if necessary.
+ */
+static void
+btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+ IndexBulkDeleteCallback callback, void *callback_state,
+ BTCycleId cycleid)
+{
+ Relation rel = info->index;
+ BTVacState vstate;
+ BlockNumber num_pages;
+ BlockNumber scanblkno;
+ bool needLock;
+
+ /*
+ * Reset fields that track information about the entire index now. This
+ * avoids double-counting in the case where a single VACUUM command
+ * requires multiple scans of the index.
+ *
+ * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
+ * since they track information about the VACUUM command, and so must last
+ * across each call to btvacuumscan().
+ *
+ * (Note that pages_free is treated as state about the whole index, not
+ * the current VACUUM. This is appropriate because RecordFreeIndexPage()
+ * calls are idempotent, and get repeated for the same deleted pages in
+ * some scenarios. The point for us is to track the number of recyclable
+ * pages in the index at the end of the VACUUM command.)
+ */
+ stats->num_pages = 0;
+ stats->num_index_tuples = 0;
+ stats->pages_deleted = 0;
+ stats->pages_free = 0;
+
+ /* Set up info to pass down to btvacuumpage */
+ vstate.info = info;
+ vstate.stats = stats;
+ vstate.callback = callback;
+ vstate.callback_state = callback_state;
+ vstate.cycleid = cycleid;
+
+ /* Create a temporary memory context to run _bt_pagedel in */
+ vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
+ "_bt_pagedel",
+ ALLOCSET_DEFAULT_SIZES);
+
+ /* Initialize vstate fields used by _bt_pendingfsm_finalize */
+ vstate.bufsize = 0;
+ vstate.maxbufsize = 0;
+ vstate.pendingpages = NULL;
+ vstate.npendingpages = 0;
+ /* Consider applying _bt_pendingfsm_finalize optimization */
+ _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
+
+ /*
+ * The outer loop iterates over all index pages except the metapage, in
+ * physical order (we hope the kernel will cooperate in providing
+ * read-ahead for speed). It is critical that we visit all leaf pages,
+ * including ones added after we start the scan, else we might fail to
+ * delete some deletable tuples. Hence, we must repeatedly check the
+ * relation length. We must acquire the relation-extension lock while
+ * doing so to avoid a race condition: if someone else is extending the
+ * relation, there is a window where bufmgr/smgr have created a new
+ * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
+ * we manage to scan such a page here, we'll improperly assume it can be
+ * recycled. Taking the lock synchronizes things enough to prevent a
+ * problem: either num_pages won't include the new page, or _bt_getbuf
+ * already has write lock on the buffer and it will be fully initialized
+ * before we can examine it. (See also vacuumlazy.c, which has the same
+ * issue.) Also, we need not worry if a page is added immediately after
+ * we look; the page splitting code already has write-lock on the left
+ * page before it adds a right page, so we must already have processed any
+ * tuples due to be moved into such a page.
+ *
+ * We can skip locking for new or temp relations, however, since no one
+ * else could be accessing them.
+ */
+ needLock = !RELATION_IS_LOCAL(rel);
+
+ scanblkno = BTREE_METAPAGE + 1;
+ for (;;)
+ {
+ /* Get the current relation length */
+ if (needLock)
+ LockRelationForExtension(rel, ExclusiveLock);
+ num_pages = RelationGetNumberOfBlocks(rel);
+ if (needLock)
+ UnlockRelationForExtension(rel, ExclusiveLock);
+
+ if (info->report_progress)
+ pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
+ num_pages);
+
+ /* Quit if we've scanned the whole relation */
+ if (scanblkno >= num_pages)
+ break;
+ /* Iterate over pages, then loop back to recheck length */
+ for (; scanblkno < num_pages; scanblkno++)
+ {
+ btvacuumpage(&vstate, scanblkno);
+ if (info->report_progress)
+ pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
+ scanblkno);
+ }
+ }
+
+ /* Set statistics num_pages field to final size of index */
+ stats->num_pages = num_pages;
+
+ MemoryContextDelete(vstate.pagedelcontext);
+
+ /*
+ * If there were any calls to _bt_pagedel() during scan of the index then
+ * see if any of the resulting pages can be placed in the FSM now. When
+ * it's not safe we'll have to leave it up to a future VACUUM operation.
+ *
+ * Finally, if we placed any pages in the FSM (either just now or during
+ * the scan), forcibly update the upper-level FSM pages to ensure that
+ * searchers can find them.
+ */
+ _bt_pendingfsm_finalize(rel, &vstate);
+ if (stats->pages_free > 0)
+ IndexFreeSpaceMapVacuum(rel);
+}
+
+/*
+ * btvacuumpage --- VACUUM one page
+ *
+ * This processes a single page for btvacuumscan(). In some cases we must
+ * backtrack to re-examine and VACUUM pages that were the scanblkno during
+ * a previous call here. This is how we handle page splits (that happened
+ * after our cycleid was acquired) whose right half page happened to reuse
+ * a block that we might have processed at some point before it was
+ * recycled (i.e. before the page split).
+ */
+static void
+btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
+{
+ IndexVacuumInfo *info = vstate->info;
+ IndexBulkDeleteResult *stats = vstate->stats;
+ IndexBulkDeleteCallback callback = vstate->callback;
+ void *callback_state = vstate->callback_state;
+ Relation rel = info->index;
+ bool attempt_pagedel;
+ BlockNumber blkno,
+ backtrack_to;
+ Buffer buf;
+ Page page;
+ BTPageOpaque opaque;
+
+ blkno = scanblkno;
+
+backtrack:
+
+ attempt_pagedel = false;
+ backtrack_to = P_NONE;
+
+ /* call vacuum_delay_point while not holding any buffer lock */
+ vacuum_delay_point();
+
+ /*
+ * We can't use _bt_getbuf() here because it always applies
+ * _bt_checkpage(), which will barf on an all-zero page. We want to
+ * recycle all-zero pages, not fail. Also, we want to use a nondefault
+ * buffer access strategy.
+ */
+ buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
+ info->strategy);
+ _bt_lockbuf(rel, buf, BT_READ);
+ page = BufferGetPage(buf);
+ opaque = NULL;
+ if (!PageIsNew(page))
+ {
+ _bt_checkpage(rel, buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ }
+
+ Assert(blkno <= scanblkno);
+ if (blkno != scanblkno)
+ {
+ /*
+ * We're backtracking.
+ *
+ * We followed a right link to a sibling leaf page (a page that
+ * happens to be from a block located before scanblkno). The only
+ * case we want to do anything with is a live leaf page having the
+ * current vacuum cycle ID.
+ *
+ * The page had better be in a state that's consistent with what we
+ * expect. Check for conditions that imply corruption in passing. It
+ * can't be half-dead because only an interrupted VACUUM process can
+ * leave pages in that state, so we'd definitely have dealt with it
+ * back when the page was the scanblkno page (half-dead pages are
+ * always marked fully deleted by _bt_pagedel()). This assumes that
+ * there can be only one vacuum process running at a time.
+ */
+ if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
+ {
+ Assert(false);
+ ereport(LOG,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
+ blkno, scanblkno, RelationGetRelationName(rel))));
+ _bt_relbuf(rel, buf);
+ return;
+ }
+
+ /*
+ * We may have already processed the page in an earlier call, when the
+ * page was scanblkno. This happens when the leaf page split occurred
+ * after the scan began, but before the right sibling page became the
+ * scanblkno.
+ *
+ * Page may also have been deleted by current btvacuumpage() call,
+ * since _bt_pagedel() sometimes deletes the right sibling page of
+ * scanblkno in passing (it does so after we decided where to
+ * backtrack to). We don't need to process this page as a deleted
+ * page a second time now (in fact, it would be wrong to count it as a
+ * deleted page in the bulk delete statistics a second time).
+ */
+ if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
+ {
+ /* Done with current scanblkno (and all lower split pages) */
+ _bt_relbuf(rel, buf);
+ return;
+ }
+ }
+
+ if (!opaque || BTPageIsRecyclable(page))
+ {
+ /* Okay to recycle this page (which could be leaf or internal) */
+ RecordFreeIndexPage(rel, blkno);
+ stats->pages_deleted++;
+ stats->pages_free++;
+ }
+ else if (P_ISDELETED(opaque))
+ {
+ /*
+ * Already deleted page (which could be leaf or internal). Can't
+ * recycle yet.
+ */
+ stats->pages_deleted++;
+ }
+ else if (P_ISHALFDEAD(opaque))
+ {
+ /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
+ attempt_pagedel = true;
+
+ /*
+ * _bt_pagedel() will increment both pages_newly_deleted and
+ * pages_deleted stats in all cases (barring corruption)
+ */
+ }
+ else if (P_ISLEAF(opaque))
+ {
+ OffsetNumber deletable[MaxIndexTuplesPerPage];
+ int ndeletable;
+ BTVacuumPosting updatable[MaxIndexTuplesPerPage];
+ int nupdatable;
+ OffsetNumber offnum,
+ minoff,
+ maxoff;
+ int nhtidsdead,
+ nhtidslive;
+
+ /*
+ * Trade in the initial read lock for a super-exclusive write lock on
+ * this page. We must get such a lock on every leaf page over the
+ * course of the vacuum scan, whether or not it actually contains any
+ * deletable tuples --- see nbtree/README.
+ */
+ _bt_upgradelockbufcleanup(rel, buf);
+
+ /*
+ * Check whether we need to backtrack to earlier pages. What we are
+ * concerned about is a page split that happened since we started the
+ * vacuum scan. If the split moved tuples on the right half of the
+ * split (i.e. the tuples that sort high) to a block that we already
+ * passed over, then we might have missed the tuples. We need to
+ * backtrack now. (Must do this before possibly clearing btpo_cycleid
+ * or deleting scanblkno page below!)
+ */
+ if (vstate->cycleid != 0 &&
+ opaque->btpo_cycleid == vstate->cycleid &&
+ !(opaque->btpo_flags & BTP_SPLIT_END) &&
+ !P_RIGHTMOST(opaque) &&
+ opaque->btpo_next < scanblkno)
+ backtrack_to = opaque->btpo_next;
+
+ /*
+ * When each VACUUM begins, it determines an OldestXmin cutoff value.
+ * Tuples before the cutoff are removed by VACUUM. Scan over all
+ * items to see which ones need to be deleted according to cutoff
+ * point using callback.
+ */
+ ndeletable = 0;
+ nupdatable = 0;
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+ nhtidsdead = 0;
+ nhtidslive = 0;
+ if (callback)
+ {
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ IndexTuple itup;
+
+ itup = (IndexTuple) PageGetItem(page,
+ PageGetItemId(page, offnum));
+
+ /*
+ * Hot Standby assumes that it's okay that XLOG_BTREE_VACUUM
+ * records do not produce their own conflicts. This is safe
+ * as long as the callback function only considers whether the
+ * index tuple refers to pre-cutoff heap tuples that were
+ * certainly already pruned away during VACUUM's initial heap
+ * scan by the time we get here. (heapam's XLOG_HEAP2_PRUNE
+ * records produce conflicts using a latestRemovedXid value
+ * for the pointed-to heap tuples, so there is no need to
+ * produce our own conflict now.)
+ *
+ * Backends with snapshots acquired after a VACUUM starts but
+ * before it finishes could have a visibility cutoff with a
+ * later xid than VACUUM's OldestXmin cutoff. These backends
+ * might happen to opportunistically mark some index tuples
+ * LP_DEAD before we reach them, even though they may be after
+ * our cutoff. We don't try to kill these "extra" index
+ * tuples in _bt_delitems_vacuum(). This keeps things simple,
+ * and allows us to always avoid generating our own conflicts.
+ */
+ Assert(!BTreeTupleIsPivot(itup));
+ if (!BTreeTupleIsPosting(itup))
+ {
+ /* Regular tuple, standard table TID representation */
+ if (callback(&itup->t_tid, callback_state))
+ {
+ deletable[ndeletable++] = offnum;
+ nhtidsdead++;
+ }
+ else
+ nhtidslive++;
+ }
+ else
+ {
+ BTVacuumPosting vacposting;
+ int nremaining;
+
+ /* Posting list tuple */
+ vacposting = btreevacuumposting(vstate, itup, offnum,
+ &nremaining);
+ if (vacposting == NULL)
+ {
+ /*
+ * All table TIDs from the posting tuple remain, so no
+ * delete or update required
+ */
+ Assert(nremaining == BTreeTupleGetNPosting(itup));
+ }
+ else if (nremaining > 0)
+ {
+
+ /*
+ * Store metadata about posting list tuple in
+ * updatable array for entire page. Existing tuple
+ * will be updated during the later call to
+ * _bt_delitems_vacuum().
+ */
+ Assert(nremaining < BTreeTupleGetNPosting(itup));
+ updatable[nupdatable++] = vacposting;
+ nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
+ }
+ else
+ {
+ /*
+ * All table TIDs from the posting list must be
+ * deleted. We'll delete the index tuple completely
+ * (no update required).
+ */
+ Assert(nremaining == 0);
+ deletable[ndeletable++] = offnum;
+ nhtidsdead += BTreeTupleGetNPosting(itup);
+ pfree(vacposting);
+ }
+
+ nhtidslive += nremaining;
+ }
+ }
+ }
+
+ /*
+ * Apply any needed deletes or updates. We issue just one
+ * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
+ */
+ if (ndeletable > 0 || nupdatable > 0)
+ {
+ Assert(nhtidsdead >= ndeletable + nupdatable);
+ _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
+ nupdatable);
+
+ stats->tuples_removed += nhtidsdead;
+ /* must recompute maxoff */
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /* can't leak memory here */
+ for (int i = 0; i < nupdatable; i++)
+ pfree(updatable[i]);
+ }
+ else
+ {
+ /*
+ * If the leaf page has been split during this vacuum cycle, it
+ * seems worth expending a write to clear btpo_cycleid even if we
+ * don't have any deletions to do. (If we do, _bt_delitems_vacuum
+ * takes care of this.) This ensures we won't process the page
+ * again.
+ *
+ * We treat this like a hint-bit update because there's no need to
+ * WAL-log it.
+ */
+ Assert(nhtidsdead == 0);
+ if (vstate->cycleid != 0 &&
+ opaque->btpo_cycleid == vstate->cycleid)
+ {
+ opaque->btpo_cycleid = 0;
+ MarkBufferDirtyHint(buf, true);
+ }
+ }
+
+ /*
+ * If the leaf page is now empty, try to delete it; else count the
+ * live tuples (live table TIDs in posting lists are counted as
+ * separate live tuples). We don't delete when backtracking, though,
+ * since that would require teaching _bt_pagedel() about backtracking
+ * (doesn't seem worth adding more complexity to deal with that).
+ *
+ * We don't count the number of live TIDs during cleanup-only calls to
+ * btvacuumscan (i.e. when callback is not set). We count the number
+ * of index tuples directly instead. This avoids the expense of
+ * directly examining all of the tuples on each page. VACUUM will
+ * treat num_index_tuples as an estimate in the cleanup-only case, so it
+ * doesn't matter that this underestimates num_index_tuples
+ * significantly in some cases.
+ */
+ if (minoff > maxoff)
+ attempt_pagedel = (blkno == scanblkno);
+ else if (callback)
+ stats->num_index_tuples += nhtidslive;
+ else
+ stats->num_index_tuples += maxoff - minoff + 1;
+
+ Assert(!attempt_pagedel || nhtidslive == 0);
+ }
+
+ if (attempt_pagedel)
+ {
+ MemoryContext oldcontext;
+
+ /* Run pagedel in a temp context to avoid memory leakage */
+ MemoryContextReset(vstate->pagedelcontext);
+ oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
+
+ /*
+ * _bt_pagedel maintains the bulk delete stats on our behalf;
+ * pages_newly_deleted and pages_deleted are likely to be incremented
+ * during call
+ */
+ Assert(blkno == scanblkno);
+ _bt_pagedel(rel, buf, vstate);
+
+ MemoryContextSwitchTo(oldcontext);
+ /* pagedel released buffer, so we shouldn't */
+ }
+ else
+ _bt_relbuf(rel, buf);
+
+ if (backtrack_to != P_NONE)
+ {
+ blkno = backtrack_to;
+ goto backtrack;
+ }
+}
+
+/*
+ * btreevacuumposting --- determine TIDs still needed in posting list
+ *
+ * Returns metadata describing how to build replacement tuple without the TIDs
+ * that VACUUM needs to delete. Returned value is NULL in the common case
+ * where no changes are needed to caller's posting list tuple (we avoid
+ * allocating memory here as an optimization).
+ *
+ * The number of TIDs that should remain in the posting list tuple is set for
+ * caller in *nremaining.
+ */
+static BTVacuumPosting
+btreevacuumposting(BTVacState *vstate, IndexTuple posting,
+ OffsetNumber updatedoffset, int *nremaining)
+{
+ int live = 0;
+ int nitem = BTreeTupleGetNPosting(posting);
+ ItemPointer items = BTreeTupleGetPosting(posting);
+ BTVacuumPosting vacposting = NULL;
+
+ for (int i = 0; i < nitem; i++)
+ {
+ if (!vstate->callback(items + i, vstate->callback_state))
+ {
+ /* Live table TID */
+ live++;
+ }
+ else if (vacposting == NULL)
+ {
+ /*
+ * First dead table TID encountered.
+ *
+ * It's now clear that we need to delete one or more dead table
+ * TIDs, so start maintaining metadata describing how to update
+ * existing posting list tuple.
+ */
+ vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
+ nitem * sizeof(uint16));
+
+ vacposting->itup = posting;
+ vacposting->updatedoffset = updatedoffset;
+ vacposting->ndeletedtids = 0;
+ vacposting->deletetids[vacposting->ndeletedtids++] = i;
+ }
+ else
+ {
+ /* Second or subsequent dead table TID */
+ vacposting->deletetids[vacposting->ndeletedtids++] = i;
+ }
+ }
+
+ *nremaining = live;
+ return vacposting;
+}
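+
+/*
+ * A worked example for btreevacuumposting(), with made-up TIDs: given a
+ * posting list tuple containing {(0,1), (0,2), (0,3), (0,4)} where the
+ * callback reports (0,2) and (0,4) as dead, the returned BTVacuumPosting
+ * has ndeletedtids = 2 and deletetids = {1, 3} (0-based positions within
+ * the posting list), and *nremaining is set to 2.  When every TID is still
+ * live, NULL is returned and no memory is allocated.
+ */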
+
+/*
+ * btcanreturn() -- Check whether btree indexes support index-only scans.
+ *
+ * btrees always do, so this is trivial.
+ */
+bool
+btcanreturn(Relation index, int attno)
+{
+ return true;
+}
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
new file mode 100644
index 0000000..fdf0e56
--- /dev/null
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -0,0 +1,2501 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtsearch.c
+ * Search code for postgres btrees.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtsearch.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "access/relscan.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/predicate.h"
+#include "utils/lsyscache.h"
+#include "utils/rel.h"
+
+
+static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
+static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
+static int _bt_binsrch_posting(BTScanInsert key, Page page,
+ OffsetNumber offnum);
+static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
+ OffsetNumber offnum);
+static void _bt_saveitem(BTScanOpaque so, int itemIndex,
+ OffsetNumber offnum, IndexTuple itup);
+static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex,
+ OffsetNumber offnum, ItemPointer heapTid,
+ IndexTuple itup);
+static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
+ OffsetNumber offnum,
+ ItemPointer heapTid, int tupleOffset);
+static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
+static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
+static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
+ ScanDirection dir);
+static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
+static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
+static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
+
+
+/*
+ * _bt_drop_lock_and_maybe_pin()
+ *
+ * Unlock the buffer; and if it is safe to release the pin, do that, too. It
+ * is safe if the scan is using an MVCC snapshot and the index is WAL-logged.
+ * This will prevent vacuum from stalling in a blocked state trying to read a
+ * page when a cursor is sitting on it -- at least in many important cases.
+ *
+ * Set the buffer to invalid if the pin is released, since the buffer may be
+ * re-used. If we need to go back to this block (for example, to apply
+ * LP_DEAD hints) we must get a fresh reference to the buffer. Hopefully it
+ * will remain in shared memory for as long as it takes to scan the index
+ * buffer page.
+ */
+static void
+_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
+{
+ _bt_unlockbuf(scan->indexRelation, sp->buf);
+
+ if (IsMVCCSnapshot(scan->xs_snapshot) &&
+ RelationNeedsWAL(scan->indexRelation) &&
+ !scan->xs_want_itup)
+ {
+ ReleaseBuffer(sp->buf);
+ sp->buf = InvalidBuffer;
+ }
+}
+
+/*
+ * _bt_search() -- Search the tree for a particular scankey,
+ * or more precisely for the first leaf page it could be on.
+ *
+ * The passed scankey is an insertion-type scankey (see nbtree/README),
+ * but it can omit the rightmost column(s) of the index.
+ *
+ * Return value is a stack of parent-page pointers (i.e. there is no entry for
+ * the leaf level/page). *bufP is set to the address of the leaf-page buffer,
+ * which is locked and pinned. No locks are held on the parent pages,
+ * however!
+ *
+ * If the snapshot parameter is not NULL, "old snapshot" checking will take
+ * place during the descent through the tree. This is not needed when
+ * positioning for an insert or delete, so NULL is used for those cases.
+ *
+ * The returned buffer is locked according to access parameter. Additionally,
+ * access = BT_WRITE will allow an empty root page to be created and returned.
+ * When access = BT_READ, an empty index will result in *bufP being set to
+ * InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered
+ * during the search will be finished.
+ */
+BTStack
+_bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
+ Snapshot snapshot)
+{
+ BTStack stack_in = NULL;
+ int page_access = BT_READ;
+
+ /* Get the root page to start with */
+ *bufP = _bt_getroot(rel, access);
+
+ /* If index is empty and access = BT_READ, no root page is created. */
+ if (!BufferIsValid(*bufP))
+ return (BTStack) NULL;
+
+ /* Loop iterates once per level descended in the tree */
+ for (;;)
+ {
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber offnum;
+ ItemId itemid;
+ IndexTuple itup;
+ BlockNumber child;
+ BTStack new_stack;
+
+ /*
+ * Race -- the page we just grabbed may have split since we read its
+ * downlink in its parent page (or the metapage). If it has, we may
+ * need to move right to its new sibling. Do that.
+ *
+ * In write-mode, allow _bt_moveright to finish any incomplete splits
+ * along the way. Strictly speaking, we'd only need to finish an
+ * incomplete split on the leaf page we're about to insert to, not on
+ * any of the upper levels (internal pages with incomplete splits are
+ * also taken care of in _bt_getstackbuf). But this is a good
+ * opportunity to finish splits of internal pages too.
+ */
+ *bufP = _bt_moveright(rel, key, *bufP, (access == BT_WRITE), stack_in,
+ page_access, snapshot);
+
+ /* if this is a leaf page, we're done */
+ page = BufferGetPage(*bufP);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (P_ISLEAF(opaque))
+ break;
+
+ /*
+ * Find the appropriate pivot tuple on this page. Its downlink points
+ * to the child page that we're about to descend to.
+ */
+ offnum = _bt_binsrch(rel, key, *bufP);
+ itemid = PageGetItemId(page, offnum);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+ Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
+ child = BTreeTupleGetDownLink(itup);
+
+ /*
+ * We need to save the location of the pivot tuple we chose in a new
+ * stack entry for this page/level. If caller ends up splitting a
+ * page one level down, it usually ends up inserting a new pivot
+ * tuple/downlink immediately after the location recorded here.
+ */
+ new_stack = (BTStack) palloc(sizeof(BTStackData));
+ new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
+ new_stack->bts_offset = offnum;
+ new_stack->bts_parent = stack_in;
+
+ /*
+ * Page level 1 is the lowest non-leaf level, just above the leaves.
+ * So, if we're at level 1 and were asked to lock the leaf page in write
+ * mode, then lock the next page down in write mode, because it must be
+ * a leaf.
+ */
+ if (opaque->btpo_level == 1 && access == BT_WRITE)
+ page_access = BT_WRITE;
+
+ /* drop the read lock on the page, then acquire one on its child */
+ *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
+
+ /* okay, all set to move down a level */
+ stack_in = new_stack;
+ }
+
+ /*
+ * If we're asked to lock leaf in write mode, but didn't manage to, then
+ * relock. This should only happen when the root page is a leaf page (and
+ * the only page in the index other than the metapage).
+ */
+ if (access == BT_WRITE && page_access == BT_READ)
+ {
+ /* trade in our read lock for a write lock */
+ _bt_unlockbuf(rel, *bufP);
+ _bt_lockbuf(rel, *bufP, BT_WRITE);
+
+ /*
+ * Race -- the leaf page may have split after we dropped the read lock
+ * but before we acquired a write lock. If it has, we may need to
+ * move right to its new sibling. Do that.
+ */
+ *bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE,
+ snapshot);
+ }
+
+ return stack_in;
+}
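+
+/*
+ * For illustration: in a three-level index (root at level 2, one internal
+ * level at level 1, leaves at level 0), _bt_search() pushes one BTStackData
+ * entry for the root and one for the level-1 page it descends through.  The
+ * returned stack's first entry describes the leaf page's immediate parent
+ * (the level-1 page), and its bts_parent link leads to the root's entry;
+ * there is no entry for the leaf page itself.
+ */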
+
+/*
+ * _bt_moveright() -- move right in the btree if necessary.
+ *
+ * When we follow a pointer to reach a page, it is possible that
+ * the page has changed in the meanwhile. If this happens, we're
+ * guaranteed that the page has "split right" -- that is, that any
+ * data that appeared on the page originally is either on the page
+ * or strictly to the right of it.
+ *
+ * This routine decides whether or not we need to move right in the
+ * tree by examining the high key entry on the page. If that entry is
+ * strictly less than the scankey, or <= the scankey in the
+ * key.nextkey=true case, then we followed the wrong link and we need
+ * to move right.
+ *
+ * The passed insertion-type scankey can omit the rightmost column(s) of the
+ * index. (see nbtree/README)
+ *
+ * When key.nextkey is false (the usual case), we are looking for the first
+ * item >= key. When key.nextkey is true, we are looking for the first item
+ * strictly greater than key.
+ *
+ * If forupdate is true, we will attempt to finish any incomplete splits
+ * that we encounter. This is required when locking a target page for an
+ * insertion, because we don't allow inserting on a page before the split
+ * is completed. 'stack' is only used if forupdate is true.
+ *
+ * On entry, we have the buffer pinned and a lock of the type specified by
+ * 'access'. If we move right, we release the buffer and lock and acquire
+ * the same on the right sibling. Return value is the buffer we stop at.
+ *
+ * If the snapshot parameter is not NULL, "old snapshot" checking will take
+ * place during the descent through the tree. This is not needed when
+ * positioning for an insert or delete, so NULL is used for those cases.
+ */
+Buffer
+_bt_moveright(Relation rel,
+ BTScanInsert key,
+ Buffer buf,
+ bool forupdate,
+ BTStack stack,
+ int access,
+ Snapshot snapshot)
+{
+ Page page;
+ BTPageOpaque opaque;
+ int32 cmpval;
+
+ /*
+ * When nextkey = false (normal case): if the scan key that brought us to
+ * this page is > the high key stored on the page, then the page has split
+ * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
+ * have some duplicates to the right as well as the left, but that's
+ * something that's only ever dealt with on the leaf level, after
+ * _bt_search has found an initial leaf page.)
+ *
+ * When nextkey = true: move right if the scan key is >= page's high key.
+ * (Note that key.scantid cannot be set in this case.)
+ *
+ * The page could even have split more than once, so scan as far as
+ * needed.
+ *
+ * We also have to move right if we followed a link that brought us to a
+ * dead page.
+ */
+ cmpval = key->nextkey ? 0 : 1;
+
+ for (;;)
+ {
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ if (P_RIGHTMOST(opaque))
+ break;
+
+ /*
+ * Finish any incomplete splits we encounter along the way.
+ */
+ if (forupdate && P_INCOMPLETE_SPLIT(opaque))
+ {
+ BlockNumber blkno = BufferGetBlockNumber(buf);
+
+ /* upgrade our lock if necessary */
+ if (access == BT_READ)
+ {
+ _bt_unlockbuf(rel, buf);
+ _bt_lockbuf(rel, buf, BT_WRITE);
+ }
+
+ if (P_INCOMPLETE_SPLIT(opaque))
+ _bt_finish_split(rel, buf, stack);
+ else
+ _bt_relbuf(rel, buf);
+
+ /* re-acquire the lock in the right mode, and re-check */
+ buf = _bt_getbuf(rel, blkno, access);
+ continue;
+ }
+
+ if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
+ {
+ /* step right one page */
+ buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
+ continue;
+ }
+ else
+ break;
+ }
+
+ if (P_IGNORE(opaque))
+ elog(ERROR, "fell off the end of index \"%s\"",
+ RelationGetRelationName(rel));
+
+ return buf;
+}
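+
+/*
+ * Example of the cmpval convention used above (hypothetical keys): with
+ * nextkey = false, cmpval is 1, so we step right only when the scankey
+ * compares strictly greater than the page's high key; with nextkey = true,
+ * cmpval is 0, so we also step right on equality.  If a search for 'm'
+ * lands on a page whose high key is 'k' because of a concurrent split, the
+ * comparison result is >= cmpval and we follow btpo_next.
+ */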
+
+/*
+ * _bt_binsrch() -- Do a binary search for a key on a particular page.
+ *
+ * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
+ * key >= given scankey, or > scankey if nextkey is true. (NOTE: in
+ * particular, this means it is possible to return a value 1 greater than the
+ * number of keys on the page, if the scankey is > all keys on the page.)
+ *
+ * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
+ * of the last key < given scankey, or last key <= given scankey if nextkey
+ * is true. (Since _bt_compare treats the first data key of such a page as
+ * minus infinity, there will be at least one key < scankey, so the result
+ * always points at one of the keys on the page.) This key indicates the
+ * right place to descend to be sure we find all leaf keys >= given scankey
+ * (or leaf keys > given scankey when nextkey is true).
+ *
+ * This procedure is not responsible for walking right, it just examines
+ * the given page. _bt_binsrch() has no lock or refcount side effects
+ * on the buffer.
+ */
+static OffsetNumber
+_bt_binsrch(Relation rel,
+ BTScanInsert key,
+ Buffer buf)
+{
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber low,
+ high;
+ int32 result,
+ cmpval;
+
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /* Requesting nextkey semantics while using scantid seems nonsensical */
+ Assert(!key->nextkey || key->scantid == NULL);
+ /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
+ Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
+
+ low = P_FIRSTDATAKEY(opaque);
+ high = PageGetMaxOffsetNumber(page);
+
+ /*
+ * If there are no keys on the page, return the first available slot. Note
+ * this covers two cases: the page is really empty (no keys), or it
+ * contains only a high key. The latter case is possible after vacuuming.
+ * This can never happen on an internal page, however, since they are
+ * never empty (an internal page must have children).
+ */
+ if (unlikely(high < low))
+ return low;
+
+ /*
+ * Binary search to find the first key on the page >= scan key, or first
+ * key > scankey when nextkey is true.
+ *
+ * For nextkey=false (cmpval=1), the loop invariant is: all slots before
+ * 'low' are < scan key, all slots at or after 'high' are >= scan key.
+ *
+ * For nextkey=true (cmpval=0), the loop invariant is: all slots before
+ * 'low' are <= scan key, all slots at or after 'high' are > scan key.
+ *
+ * We can fall out when high == low.
+ */
+ high++; /* establish the loop invariant for high */
+
+ cmpval = key->nextkey ? 0 : 1; /* select comparison value */
+
+ while (high > low)
+ {
+ OffsetNumber mid = low + ((high - low) / 2);
+
+ /* We have low <= mid < high, so mid points at a real slot */
+
+ result = _bt_compare(rel, key, page, mid);
+
+ if (result >= cmpval)
+ low = mid + 1;
+ else
+ high = mid;
+ }
+
+ /*
+ * At this point we have high == low, but be careful: they could point
+ * past the last slot on the page.
+ *
+ * On a leaf page, we always return the first key >= scan key (resp. >
+ * scan key), which could be the last slot + 1.
+ */
+ if (P_ISLEAF(opaque))
+ return low;
+
+ /*
+ * On a non-leaf page, return the last key < scan key (resp. <= scan key).
+ * There must be one if _bt_compare() is playing by the rules.
+ */
+ Assert(low > P_FIRSTDATAKEY(opaque));
+
+ return OffsetNumberPrev(low);
+}
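+
+/*
+ * Worked example of the result conventions above (made-up keys): on a leaf
+ * page holding keys {10, 20, 30}, a scankey of 20 returns the offset of 20
+ * when nextkey is false and the offset of 30 when nextkey is true, while a
+ * scankey of 40 returns one past the last offset.  On an internal page with
+ * the same keys, a scankey of 25 returns the offset of 20 (the last key
+ * less than the scankey), which is the downlink to follow.
+ */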
+
+/*
+ * _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
+ *
+ * Like _bt_binsrch(), but with support for caching the binary search
+ * bounds. Only used during insertion, and only on the leaf page that it
+ * looks like caller will insert tuple on. Exclusive-locked and pinned
+ * leaf page is contained within insertstate.
+ *
+ * Caches the bounds fields in insertstate so that a subsequent call can
+ * reuse the low and strict high bounds of original binary search. Callers
+ * that use these fields directly must be prepared for the case where low
+ * and/or stricthigh are not on the same page (one or both exceed maxoff
+ * for the page). The case where there are no items on the page (high <
+ * low) makes bounds invalid.
+ *
+ * Caller is responsible for invalidating bounds when it modifies the page
+ * before calling here a second time, and for dealing with posting list
+ * tuple matches (callers can use insertstate's postingoff field to
+ * determine which existing heap TID will need to be replaced by a posting
+ * list split).
+ */
+OffsetNumber
+_bt_binsrch_insert(Relation rel, BTInsertState insertstate)
+{
+ BTScanInsert key = insertstate->itup_key;
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber low,
+ high,
+ stricthigh;
+ int32 result,
+ cmpval;
+
+ page = BufferGetPage(insertstate->buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ Assert(P_ISLEAF(opaque));
+ Assert(!key->nextkey);
+ Assert(insertstate->postingoff == 0);
+
+ if (!insertstate->bounds_valid)
+ {
+ /* Start new binary search */
+ low = P_FIRSTDATAKEY(opaque);
+ high = PageGetMaxOffsetNumber(page);
+ }
+ else
+ {
+ /* Restore result of previous binary search against same page */
+ low = insertstate->low;
+ high = insertstate->stricthigh;
+ }
+
+ /* If there are no keys on the page, return the first available slot */
+ if (unlikely(high < low))
+ {
+ /* Caller can't reuse bounds */
+ insertstate->low = InvalidOffsetNumber;
+ insertstate->stricthigh = InvalidOffsetNumber;
+ insertstate->bounds_valid = false;
+ return low;
+ }
+
+ /*
+ * Binary search to find the first key on the page >= scan key. (nextkey
+ * is always false when inserting).
+ *
+ * The loop invariant is: all slots before 'low' are < scan key, all slots
+ * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
+ * maintained to save additional search effort for caller.
+ *
+ * We can fall out when high == low.
+ */
+ if (!insertstate->bounds_valid)
+ high++; /* establish the loop invariant for high */
+ stricthigh = high; /* high initially strictly higher */
+
+ cmpval = 1; /* !nextkey comparison value */
+
+ while (high > low)
+ {
+ OffsetNumber mid = low + ((high - low) / 2);
+
+ /* We have low <= mid < high, so mid points at a real slot */
+
+ result = _bt_compare(rel, key, page, mid);
+
+ if (result >= cmpval)
+ low = mid + 1;
+ else
+ {
+ high = mid;
+ if (result != 0)
+ stricthigh = high;
+ }
+
+ /*
+ * If tuple at offset located by binary search is a posting list whose
+ * TID range overlaps with caller's scantid, perform posting list
+ * binary search to set postingoff for caller. Caller must split the
+ * posting list when postingoff is set. This should happen
+ * infrequently.
+ */
+ if (unlikely(result == 0 && key->scantid != NULL))
+ {
+ /*
+ * postingoff should never be set more than once per leaf page
+ * binary search. That would mean that there are duplicate table
+ * TIDs in the index, which is never okay. Check for that here.
+ */
+ if (insertstate->postingoff != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
+ ItemPointerGetBlockNumber(key->scantid),
+ ItemPointerGetOffsetNumber(key->scantid),
+ low, stricthigh,
+ BufferGetBlockNumber(insertstate->buf),
+ RelationGetRelationName(rel))));
+
+ insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
+ }
+ }
+
+ /*
+ * On a leaf page, a binary search always returns the first key >= scan
+ * key (at least in !nextkey case), which could be the last slot + 1. This
+ * is also the lower bound of cached search.
+ *
+ * stricthigh may also be the last slot + 1, which prevents caller from
+ * using bounds directly, but is still useful to us if we're called a
+ * second time with cached bounds (cached low will be < stricthigh when
+ * that happens).
+ */
+ insertstate->low = low;
+ insertstate->stricthigh = stricthigh;
+ insertstate->bounds_valid = true;
+
+ return low;
+}
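+
+/*
+ * Illustration of the cached-bounds optimization (hypothetical offsets): if
+ * an initial call on a leaf page computes low = 5 and stricthigh = 8, then a
+ * later call against the same, unmodified page (with bounds_valid still set)
+ * restarts its binary search over the narrowed range [5, 8) rather than over
+ * the whole page.
+ */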
+
+/*----------
+ * _bt_binsrch_posting() -- posting list binary search.
+ *
+ * Helper routine for _bt_binsrch_insert().
+ *
+ * Returns offset into posting list where caller's scantid belongs.
+ *----------
+ */
+static int
+_bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
+{
+ IndexTuple itup;
+ ItemId itemid;
+ int low,
+ high,
+ mid,
+ res;
+
+ /*
+ * If this isn't a posting tuple, then the index must be corrupt (if it is
+ * an ordinary non-pivot tuple then there must be an existing tuple with a
+ * heap TID that equals inserter's new heap TID/scantid). Defensively
+ * check that tuple is a posting list tuple whose posting list range
+ * includes caller's scantid.
+ *
+ * (This is also needed because contrib/amcheck's rootdescend option needs
+ * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
+ */
+ itemid = PageGetItemId(page, offnum);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+ if (!BTreeTupleIsPosting(itup))
+ return 0;
+
+ Assert(key->heapkeyspace && key->allequalimage);
+
+ /*
+ * In the event that posting list tuple has LP_DEAD bit set, indicate this
+ * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
+ * second call to _bt_binsrch_insert() can take place when its caller has
+ * removed the dead item.
+ */
+ if (ItemIdIsDead(itemid))
+ return -1;
+
+ /* "high" is past end of posting list for loop invariant */
+ low = 0;
+ high = BTreeTupleGetNPosting(itup);
+ Assert(high >= 2);
+
+ while (high > low)
+ {
+ mid = low + ((high - low) / 2);
+ res = ItemPointerCompare(key->scantid,
+ BTreeTupleGetPostingN(itup, mid));
+
+ if (res > 0)
+ low = mid + 1;
+ else if (res < 0)
+ high = mid;
+ else
+ return mid;
+ }
+
+ /* Exact match not found */
+ return low;
+}
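+
+/*
+ * For example (made-up TIDs): given a posting list {(3,1), (5,2), (9,7)},
+ * a scantid of (5,2) returns 1 (an exact match at that position), while a
+ * scantid of (6,1) returns 2, the position where it would have to be
+ * inserted.  A return value of -1 is reserved for the LP_DEAD case above.
+ */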
+
+/*----------
+ * _bt_compare() -- Compare insertion-type scankey to tuple on a page.
+ *
+ * page/offnum: location of btree item to be compared to.
+ *
+ * This routine returns:
+ * <0 if scankey < tuple at offnum;
+ * 0 if scankey == tuple at offnum;
+ * >0 if scankey > tuple at offnum.
+ *
+ * NULLs in the keys are treated as sortable values. Therefore
+ * "equality" does not necessarily mean that the item should be returned
+ * to the caller as a matching key. Similarly, an insertion scankey
+ * with its scantid set is treated as equal to a posting tuple whose TID
+ * range overlaps with that scantid. There generally won't be a
+ * matching TID in the posting tuple, which the caller must handle
+ * itself (e.g., by splitting the posting list tuple).
+ *
+ * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
+ * "minus infinity": this routine will always claim it is less than the
+ * scankey. The actual key value stored is explicitly truncated to 0
+ * attributes (explicitly minus infinity) with version 3+ indexes, but
+ * that isn't relied upon. This allows us to implement the Lehman and
+ * Yao convention that the first down-link pointer is before the first
+ * key. See backend/access/nbtree/README for details.
+ *----------
+ */
+int32
+_bt_compare(Relation rel,
+ BTScanInsert key,
+ Page page,
+ OffsetNumber offnum)
+{
+ TupleDesc itupdesc = RelationGetDescr(rel);
+ BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ IndexTuple itup;
+ ItemPointer heapTid;
+ ScanKey scankey;
+ int ncmpkey;
+ int ntupatts;
+ int32 result;
+
+ Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
+ Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
+ Assert(key->heapkeyspace || key->scantid == NULL);
+
+ /*
+ * Force result ">" if target item is first data item on an internal page
+ * --- see NOTE above.
+ */
+ if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
+ return 1;
+
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+ ntupatts = BTreeTupleGetNAtts(itup, rel);
+
+ /*
+ * The scan key is set up with the attribute number associated with each
+ * term in the key. It is important that, if the index is multi-key, the
+ * scan contain the first k key attributes, and that they be in order. If
+ * you think about how multi-key ordering works, you'll understand why
+ * this is.
+ *
+ * We don't test for violation of this condition here, however. The
+ * initial setup for the index scan had better have gotten it right (see
+ * _bt_first).
+ */
+
+ ncmpkey = Min(ntupatts, key->keysz);
+ Assert(key->heapkeyspace || ncmpkey == key->keysz);
+ Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
+ scankey = key->scankeys;
+ for (int i = 1; i <= ncmpkey; i++)
+ {
+ Datum datum;
+ bool isNull;
+
+ datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
+
+ if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
+ {
+ if (isNull)
+ result = 0; /* NULL "=" NULL */
+ else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
+ result = -1; /* NULL "<" NOT_NULL */
+ else
+ result = 1; /* NULL ">" NOT_NULL */
+ }
+ else if (isNull) /* key is NOT_NULL and item is NULL */
+ {
+ if (scankey->sk_flags & SK_BT_NULLS_FIRST)
+ result = 1; /* NOT_NULL ">" NULL */
+ else
+ result = -1; /* NOT_NULL "<" NULL */
+ }
+ else
+ {
+ /*
+ * The sk_func needs to be passed the index value as left arg and
+ * the sk_argument as right arg (they might be of different
+ * types). Since it is convenient for callers to think of
+ * _bt_compare as comparing the scankey to the index item, we have
+ * to flip the sign of the comparison result. (Unless it's a DESC
+ * column, in which case we *don't* flip the sign.)
+ */
+ result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
+ scankey->sk_collation,
+ datum,
+ scankey->sk_argument));
+
+ if (!(scankey->sk_flags & SK_BT_DESC))
+ INVERT_COMPARE_RESULT(result);
+ }
+
+ /* if the keys are unequal, return the difference */
+ if (result != 0)
+ return result;
+
+ scankey++;
+ }
+
+ /*
+ * All non-truncated attributes (other than heap TID) were found to be
+ * equal. Treat truncated attributes as minus infinity when scankey has a
+ * key attribute value that would otherwise be compared directly.
+ *
+ * Note: it doesn't matter if ntupatts includes non-key attributes;
+ * scankey won't, so explicitly excluding non-key attributes isn't
+ * necessary.
+ */
+ if (key->keysz > ntupatts)
+ return 1;
+
+ /*
+ * Use the heap TID attribute and scantid to try to break the tie. The
+ * rules are the same as any other key attribute -- only the
+ * representation differs.
+ */
+ heapTid = BTreeTupleGetHeapTID(itup);
+ if (key->scantid == NULL)
+ {
+ /*
+ * Most searches have a scankey that is considered greater than a
+ * truncated pivot tuple if and when the scankey has equal values for
+ * attributes up to and including the least significant untruncated
+ * attribute in tuple.
+ *
+ * For example, if an index has the minimum two attributes (single
+ * user key attribute, plus heap TID attribute), and a page's high key
+ * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
+ * will not descend to the page to the left. The search will descend
+ * right instead. The truncated attribute in pivot tuple means that
+ * all non-pivot tuples on the page to the left are strictly < 'foo',
+ * so it isn't necessary to descend left. In other words, search
+ * doesn't have to descend left because it isn't interested in a match
+ * that has a heap TID value of -inf.
+ *
+ * However, some searches (pivotsearch searches) actually require that
+ * we descend left when this happens. -inf is treated as a possible
+ * match for omitted scankey attribute(s). This is needed by page
+ * deletion, which must re-find leaf pages that are targets for
+ * deletion using their high keys.
+ *
+ * Note: the heap TID part of the test ensures that scankey is being
+ * compared to a pivot tuple with one or more truncated key
+ * attributes.
+ *
+ * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
+ * left here, since they have no heap TID attribute (and cannot have
+ * any -inf key values in any case, since truncation can only remove
+ * non-key attributes). !heapkeyspace searches must always be
+ * prepared to deal with matches on both sides of the pivot once the
+ * leaf level is reached.
+ */
+ if (key->heapkeyspace && !key->pivotsearch &&
+ key->keysz == ntupatts && heapTid == NULL)
+ return 1;
+
+ /* All provided scankey arguments found to be equal */
+ return 0;
+ }
+
+ /*
+ * Treat truncated heap TID as minus infinity, since scankey has a key
+ * attribute value (scantid) that would otherwise be compared directly
+ */
+ Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
+ if (heapTid == NULL)
+ return 1;
+
+ /*
+ * Scankey must be treated as equal to a posting list tuple if its scantid
+ * value falls within the range of the posting list. In all other cases
+ * there can only be a single heap TID value, which is compared directly
+ * with scantid.
+ */
+ Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
+ result = ItemPointerCompare(key->scantid, heapTid);
+ if (result <= 0 || !BTreeTupleIsPosting(itup))
+ return result;
+ else
+ {
+ result = ItemPointerCompare(key->scantid,
+ BTreeTupleGetMaxHeapTID(itup));
+ if (result > 0)
+ return 1;
+ }
+
+ return 0;
+}
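+
+/*
+ * Sign convention example (hypothetical values): comparing a scankey value
+ * of 5 against an index attribute value of 7, the support function is
+ * called as cmp(7, 5) and returns a positive number; because this is an
+ * ASC column the result is inverted, so _bt_compare() reports a negative
+ * value, meaning "scankey < tuple".  For a DESC column the raw result is
+ * kept as is.
+ */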
+
+/*
+ * _bt_first() -- Find the first item in a scan.
+ *
+ * We need to be clever about the direction of scan, the search
+ * conditions, and the tree ordering. We find the first item (or,
+ * if backwards scan, the last item) in the tree that satisfies the
+ * qualifications in the scan key. On success exit, the page containing
+ * the current index tuple is pinned but not locked, and data about
+ * the matching tuple(s) on the page has been loaded into so->currPos.
+ * scan->xs_ctup.t_self is set to the heap TID of the current tuple,
+ * and if requested, scan->xs_itup points to a copy of the index tuple.
+ *
+ * If there are no matching items in the index, we return false, with no
+ * pins or locks held.
+ *
+ * Note that scan->keyData[], and the so->keyData[] scankey built from it,
+ * are both search-type scankeys (see nbtree/README for more about this).
+ * Within this routine, we build a temporary insertion-type scankey to use
+ * in locating the scan start position.
+ */
+bool
+_bt_first(IndexScanDesc scan, ScanDirection dir)
+{
+ Relation rel = scan->indexRelation;
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Buffer buf;
+ BTStack stack;
+ OffsetNumber offnum;
+ StrategyNumber strat;
+ bool nextkey;
+ bool goback;
+ BTScanInsertData inskey;
+ ScanKey startKeys[INDEX_MAX_KEYS];
+ ScanKeyData notnullkeys[INDEX_MAX_KEYS];
+ int keysCount = 0;
+ int i;
+ bool status;
+ StrategyNumber strat_total;
+ BTScanPosItem *currItem;
+ BlockNumber blkno;
+
+ Assert(!BTScanPosIsValid(so->currPos));
+
+ pgstat_count_index_scan(rel);
+
+ /*
+ * Examine the scan keys and eliminate any redundant keys; also mark the
+ * keys that must be matched to continue the scan.
+ */
+ _bt_preprocess_keys(scan);
+
+ /*
+ * Quit now if _bt_preprocess_keys() discovered that the scan keys can
+ * never be satisfied (eg, x == 1 AND x > 2).
+ */
+ if (!so->qual_ok)
+ {
+ /* Notify any other workers that we're done with this scan key. */
+ _bt_parallel_done(scan);
+ return false;
+ }
+
+ /*
+ * For parallel scans, get the starting page from shared state. If the
+ * scan has not started, proceed to find out first leaf page in the usual
+ * way while keeping other participating processes waiting. If the scan
+ * has already begun, use the page number from the shared structure.
+ */
+ if (scan->parallel_scan != NULL)
+ {
+ status = _bt_parallel_seize(scan, &blkno);
+ if (!status)
+ return false;
+ else if (blkno == P_NONE)
+ {
+ _bt_parallel_done(scan);
+ return false;
+ }
+ else if (blkno != InvalidBlockNumber)
+ {
+ if (!_bt_parallel_readpage(scan, blkno, dir))
+ return false;
+ goto readcomplete;
+ }
+ }
+
+ /*----------
+ * Examine the scan keys to discover where we need to start the scan.
+ *
+ * We want to identify the keys that can be used as starting boundaries;
+ * these are =, >, or >= keys for a forward scan or =, <, <= keys for
+ * a backwards scan. We can use keys for multiple attributes so long as
+ * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
+ * a > or < boundary or find an attribute with no boundary (which can be
+ * thought of as the same as "> -infinity"), we can't use keys for any
+ * attributes to its right, because it would break our simplistic notion
+ * of what initial positioning strategy to use.
+ *
+ * When the scan keys include cross-type operators, _bt_preprocess_keys
+ * may not be able to eliminate redundant keys; in such cases we will
+ * arbitrarily pick a usable one for each attribute. This is correct
+ * but possibly not optimal behavior. (For example, with keys like
+ * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
+ * x=5 would be more efficient.) Since the situation only arises given
+ * a poorly-worded query plus an incomplete opfamily, live with it.
+ *
+ * When both equality and inequality keys appear for a single attribute
+ * (again, only possible when cross-type operators appear), we *must*
+ * select one of the equality keys for the starting point, because
+ * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
+ * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
+ * start at x=4, we will fail and stop before reaching x=10. If multiple
+ * equality quals survive preprocessing, however, it doesn't matter which
+ * one we use --- by definition, they are either redundant or
+ * contradictory.
+ *
+ * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
+ * If the index stores nulls at the end of the index we'll be starting
+ * from, and we have no boundary key for the column (which means the key
+ * we deduced NOT NULL from is an inequality key that constrains the other
+ * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
+ * use as a boundary key. If we didn't do this, we might find ourselves
+ * traversing a lot of null entries at the start of the scan.
+ *
+ * In this loop, row-comparison keys are treated the same as keys on their
+ * first (leftmost) columns. We'll add on lower-order columns of the row
+ * comparison below, if possible.
+ *
+ * The selected scan keys (at most one per index column) are remembered by
+ * storing their addresses into the local startKeys[] array.
+ *----------
+ */
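+ /*
+ * As an illustration of the rules above (hypothetical quals): in a
+ * forward scan with "a = 5 AND b > 10 AND c = 20", startKeys[] ends up
+ * holding the keys on a and b and strat_total becomes the > strategy;
+ * the key on c cannot help with initial positioning because b's
+ * boundary is a strict inequality.  With "a >= 5 AND b = 10" both keys
+ * are used and strat_total is the >= strategy chosen for a.
+ */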
+ strat_total = BTEqualStrategyNumber;
+ if (so->numberOfKeys > 0)
+ {
+ AttrNumber curattr;
+ ScanKey chosen;
+ ScanKey impliesNN;
+ ScanKey cur;
+
+ /*
+ * chosen is the so-far-chosen key for the current attribute, if any.
+ * We don't cast the decision in stone until we reach keys for the
+ * next attribute.
+ */
+ curattr = 1;
+ chosen = NULL;
+ /* Also remember any scankey that implies a NOT NULL constraint */
+ impliesNN = NULL;
+
+ /*
+ * Loop iterates from 0 to numberOfKeys inclusive; we use the last
+ * pass to handle after-last-key processing. Actual exit from the
+ * loop is at one of the "break" statements below.
+ */
+ for (cur = so->keyData, i = 0;; cur++, i++)
+ {
+ if (i >= so->numberOfKeys || cur->sk_attno != curattr)
+ {
+ /*
+ * Done looking at keys for curattr. If we didn't find a
+ * usable boundary key, see if we can deduce a NOT NULL key.
+ */
+ if (chosen == NULL && impliesNN != NULL &&
+ ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
+ ScanDirectionIsForward(dir) :
+ ScanDirectionIsBackward(dir)))
+ {
+ /* Yes, so build the key in notnullkeys[keysCount] */
+ chosen = &notnullkeys[keysCount];
+ ScanKeyEntryInitialize(chosen,
+ (SK_SEARCHNOTNULL | SK_ISNULL |
+ (impliesNN->sk_flags &
+ (SK_BT_DESC | SK_BT_NULLS_FIRST))),
+ curattr,
+ ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
+ BTGreaterStrategyNumber :
+ BTLessStrategyNumber),
+ InvalidOid,
+ InvalidOid,
+ InvalidOid,
+ (Datum) 0);
+ }
+
+ /*
+ * If we still didn't find a usable boundary key, quit; else
+ * save the boundary key pointer in startKeys.
+ */
+ if (chosen == NULL)
+ break;
+ startKeys[keysCount++] = chosen;
+
+ /*
+ * Adjust strat_total, and quit if we have stored a > or <
+ * key.
+ */
+ strat = chosen->sk_strategy;
+ if (strat != BTEqualStrategyNumber)
+ {
+ strat_total = strat;
+ if (strat == BTGreaterStrategyNumber ||
+ strat == BTLessStrategyNumber)
+ break;
+ }
+
+ /*
+ * Done if that was the last attribute, or if next key is not
+ * in sequence (implying no boundary key is available for the
+ * next attribute).
+ */
+ if (i >= so->numberOfKeys ||
+ cur->sk_attno != curattr + 1)
+ break;
+
+ /*
+ * Reset for next attr.
+ */
+ curattr = cur->sk_attno;
+ chosen = NULL;
+ impliesNN = NULL;
+ }
+
+ /*
+ * Can we use this key as a starting boundary for this attr?
+ *
+ * If not, does it imply a NOT NULL constraint? (Because
+ * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
+ * *any* inequality key works for that; we need not test.)
+ */
+ switch (cur->sk_strategy)
+ {
+ case BTLessStrategyNumber:
+ case BTLessEqualStrategyNumber:
+ if (chosen == NULL)
+ {
+ if (ScanDirectionIsBackward(dir))
+ chosen = cur;
+ else
+ impliesNN = cur;
+ }
+ break;
+ case BTEqualStrategyNumber:
+ /* override any non-equality choice */
+ chosen = cur;
+ break;
+ case BTGreaterEqualStrategyNumber:
+ case BTGreaterStrategyNumber:
+ if (chosen == NULL)
+ {
+ if (ScanDirectionIsForward(dir))
+ chosen = cur;
+ else
+ impliesNN = cur;
+ }
+ break;
+ }
+ }
+ }
+
+ /*
+ * If we found no usable boundary keys, we have to start from one end of
+ * the tree. Walk down that edge to the first or last key, and scan from
+ * there.
+ */
+ if (keysCount == 0)
+ {
+ bool match;
+
+ match = _bt_endpoint(scan, dir);
+
+ if (!match)
+ {
+ /* No match, so mark (parallel) scan finished */
+ _bt_parallel_done(scan);
+ }
+
+ return match;
+ }
+
+ /*
+ * We want to start the scan somewhere within the index. Set up an
+ * insertion scankey we can use to search for the boundary point we
+ * identified above. The insertion scankey is built using the keys
+ * identified by startKeys[]. (Remaining insertion scankey fields are
+ * initialized after initial-positioning strategy is finalized.)
+ */
+ Assert(keysCount <= INDEX_MAX_KEYS);
+ for (i = 0; i < keysCount; i++)
+ {
+ ScanKey cur = startKeys[i];
+
+ Assert(cur->sk_attno == i + 1);
+
+ if (cur->sk_flags & SK_ROW_HEADER)
+ {
+ /*
+ * Row comparison header: look to the first row member instead.
+ *
+ * The member scankeys are already in insertion format (ie, they
+ * have sk_func = 3-way-comparison function), but we have to watch
+ * out for nulls, which _bt_preprocess_keys didn't check. A null
+ * in the first row member makes the condition unmatchable, just
+ * like qual_ok = false.
+ */
+ ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
+
+ Assert(subkey->sk_flags & SK_ROW_MEMBER);
+ if (subkey->sk_flags & SK_ISNULL)
+ {
+ _bt_parallel_done(scan);
+ return false;
+ }
+ memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
+
+ /*
+ * If the row comparison is the last positioning key we accepted,
+ * try to add additional keys from the lower-order row members.
+ * (If we accepted independent conditions on additional index
+ * columns, we use those instead --- doesn't seem worth trying to
+ * determine which is more restrictive.) Note that this is OK
+ * even if the row comparison is of ">" or "<" type, because the
+ * condition applied to all but the last row member is effectively
+ * ">=" or "<=", and so the extra keys don't break the positioning
+ * scheme. But, by the same token, if we aren't able to use all
+ * the row members, then the part of the row comparison that we
+ * did use has to be treated as just a ">=" or "<=" condition, and
+ * so we'd better adjust strat_total accordingly.
+ */
+ if (i == keysCount - 1)
+ {
+ bool used_all_subkeys = false;
+
+ Assert(!(subkey->sk_flags & SK_ROW_END));
+ for (;;)
+ {
+ subkey++;
+ Assert(subkey->sk_flags & SK_ROW_MEMBER);
+ if (subkey->sk_attno != keysCount + 1)
+ break; /* out-of-sequence, can't use it */
+ if (subkey->sk_strategy != cur->sk_strategy)
+ break; /* wrong direction, can't use it */
+ if (subkey->sk_flags & SK_ISNULL)
+ break; /* can't use null keys */
+ Assert(keysCount < INDEX_MAX_KEYS);
+ memcpy(inskey.scankeys + keysCount, subkey,
+ sizeof(ScanKeyData));
+ keysCount++;
+ if (subkey->sk_flags & SK_ROW_END)
+ {
+ used_all_subkeys = true;
+ break;
+ }
+ }
+ if (!used_all_subkeys)
+ {
+ switch (strat_total)
+ {
+ case BTLessStrategyNumber:
+ strat_total = BTLessEqualStrategyNumber;
+ break;
+ case BTGreaterStrategyNumber:
+ strat_total = BTGreaterEqualStrategyNumber;
+ break;
+ }
+ }
+ break; /* done with outer loop */
+ }
+ }
+ else
+ {
+ /*
+ * Ordinary comparison key. Transform the search-style scan key
+ * to an insertion scan key by replacing the sk_func with the
+ * appropriate btree comparison function.
+ *
+ * If scankey operator is not a cross-type comparison, we can use
+ * the cached comparison function; otherwise gotta look it up in
+ * the catalogs. (That can't lead to infinite recursion, since no
+ * indexscan initiated by syscache lookup will use cross-data-type
+ * operators.)
+ *
+ * We support the convention that sk_subtype == InvalidOid means
+ * the opclass input type; this is a hack to simplify life for
+ * ScanKeyInit().
+ */
+ if (cur->sk_subtype == rel->rd_opcintype[i] ||
+ cur->sk_subtype == InvalidOid)
+ {
+ FmgrInfo *procinfo;
+
+ procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
+ ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
+ cur->sk_flags,
+ cur->sk_attno,
+ InvalidStrategy,
+ cur->sk_subtype,
+ cur->sk_collation,
+ procinfo,
+ cur->sk_argument);
+ }
+ else
+ {
+ RegProcedure cmp_proc;
+
+ cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
+ rel->rd_opcintype[i],
+ cur->sk_subtype,
+ BTORDER_PROC);
+ if (!RegProcedureIsValid(cmp_proc))
+ elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
+ BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
+ cur->sk_attno, RelationGetRelationName(rel));
+ ScanKeyEntryInitialize(inskey.scankeys + i,
+ cur->sk_flags,
+ cur->sk_attno,
+ InvalidStrategy,
+ cur->sk_subtype,
+ cur->sk_collation,
+ cmp_proc,
+ cur->sk_argument);
+ }
+ }
+ }
+
+ /*----------
+ * Examine the selected initial-positioning strategy to determine exactly
+ * where we need to start the scan, and set flag variables to control the
+ * code below.
+ *
+ * If nextkey = false, _bt_search and _bt_binsrch will locate the first
+ * item >= scan key. If nextkey = true, they will locate the first
+ * item > scan key.
+ *
+ * If goback = true, we will then step back one item, while if
+ * goback = false, we will start the scan on the located item.
+ *----------
+ */
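+	/*
+	 * For example (hypothetical quals): a forward scan on "x >= 10" takes
+	 * the BTGreaterEqualStrategyNumber case below (nextkey = false,
+	 * goback = false), so we start on the first item >= 10.  A backward
+	 * scan on "x < 10" takes the BTLessStrategyNumber case (nextkey = false,
+	 * goback = true): we locate the first item >= 10 and then step back one
+	 * to land on the last item < 10.
+	 */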
+ switch (strat_total)
+ {
+ case BTLessStrategyNumber:
+
+ /*
+ * Find first item >= scankey, then back up one to arrive at last
+ * item < scankey. (Note: this positioning strategy is only used
+ * for a backward scan, so that is always the correct starting
+ * position.)
+ */
+ nextkey = false;
+ goback = true;
+ break;
+
+ case BTLessEqualStrategyNumber:
+
+ /*
+ * Find first item > scankey, then back up one to arrive at last
+ * item <= scankey. (Note: this positioning strategy is only used
+ * for a backward scan, so that is always the correct starting
+ * position.)
+ */
+ nextkey = true;
+ goback = true;
+ break;
+
+ case BTEqualStrategyNumber:
+
+ /*
+ * If a backward scan was specified, need to start with last equal
+ * item not first one.
+ */
+ if (ScanDirectionIsBackward(dir))
+ {
+ /*
+ * This is the same as the <= strategy. We will check at the
+ * end whether the found item is actually =.
+ */
+ nextkey = true;
+ goback = true;
+ }
+ else
+ {
+ /*
+ * This is the same as the >= strategy. We will check at the
+ * end whether the found item is actually =.
+ */
+ nextkey = false;
+ goback = false;
+ }
+ break;
+
+ case BTGreaterEqualStrategyNumber:
+
+ /*
+ * Find first item >= scankey. (This is only used for forward
+ * scans.)
+ */
+ nextkey = false;
+ goback = false;
+ break;
+
+ case BTGreaterStrategyNumber:
+
+ /*
+ * Find first item > scankey. (This is only used for forward
+ * scans.)
+ */
+ nextkey = true;
+ goback = false;
+ break;
+
+ default:
+ /* can't get here, but keep compiler quiet */
+ elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
+ return false;
+ }
+
+ /* Initialize remaining insertion scan key fields */
+ _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
+ inskey.anynullkeys = false; /* unused */
+ inskey.nextkey = nextkey;
+ inskey.pivotsearch = false;
+ inskey.scantid = NULL;
+ inskey.keysz = keysCount;
+
+ /*
+ * Use the manufactured insertion scan key to descend the tree and
+ * position ourselves on the target leaf page.
+ */
+ stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot);
+
+ /* don't need to keep the stack around... */
+ _bt_freestack(stack);
+
+ if (!BufferIsValid(buf))
+ {
+ /*
+ * We only get here if the index is completely empty. Lock relation
+ * because nothing finer to lock exists.
+ */
+ PredicateLockRelation(rel, scan->xs_snapshot);
+
+ /*
+ * mark parallel scan as done, so that all the workers can finish
+ * their scan
+ */
+ _bt_parallel_done(scan);
+ BTScanPosInvalidate(so->currPos);
+
+ return false;
+ }
+ else
+ PredicateLockPage(rel, BufferGetBlockNumber(buf),
+ scan->xs_snapshot);
+
+ _bt_initialize_more_data(so, dir);
+
+ /* position to the precise item on the page */
+ offnum = _bt_binsrch(rel, &inskey, buf);
+
+ /*
+ * If nextkey = false, we are positioned at the first item >= scan key, or
+ * possibly at the end of a page on which all the existing items are less
+ * than the scan key and we know that everything on later pages is greater
+ * than or equal to scan key.
+ *
+ * If nextkey = true, we are positioned at the first item > scan key, or
+ * possibly at the end of a page on which all the existing items are less
+ * than or equal to the scan key and we know that everything on later
+ * pages is greater than scan key.
+ *
+ * The actually desired starting point is either this item or the prior
+ * one, or in the end-of-page case it's the first item on the next page or
+ * the last item on this page. Adjust the starting offset if needed. (If
+ * this results in an offset before the first item or after the last one,
+ * _bt_readpage will report no items found, and then we'll step to the
+ * next page as needed.)
+ */
+ if (goback)
+ offnum = OffsetNumberPrev(offnum);
+
+ /* remember which buffer we have pinned, if any */
+ Assert(!BTScanPosIsValid(so->currPos));
+ so->currPos.buf = buf;
+
+ /*
+ * Now load data from the first page of the scan.
+ */
+ if (!_bt_readpage(scan, dir, offnum))
+ {
+ /*
+ * There's no actually-matching data on this page. Try to advance to
+ * the next page. Return false if there's no matching data at all.
+ */
+ _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+ if (!_bt_steppage(scan, dir))
+ return false;
+ }
+ else
+ {
+ /* Drop the lock, and maybe the pin, on the current page */
+ _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+ }
+
+readcomplete:
+ /* OK, itemIndex says what to return */
+ currItem = &so->currPos.items[so->currPos.itemIndex];
+ scan->xs_heaptid = currItem->heapTid;
+ if (scan->xs_want_itup)
+ scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+ return true;
+}
+
+/*
+ * _bt_next() -- Get the next item in a scan.
+ *
+ * On entry, so->currPos describes the current page, which may be pinned
+ * but is not locked, and so->currPos.itemIndex identifies which item was
+ * previously returned.
+ *
+ * On successful exit, scan->xs_heaptid is set to the TID of the next
+ * heap tuple, and if requested, scan->xs_itup points to a copy of
+ * the index tuple. so->currPos is updated as needed.
+ *
+ * On failure exit (no more tuples), we release pin and set
+ * so->currPos.buf to InvalidBuffer.
+ */
+bool
+_bt_next(IndexScanDesc scan, ScanDirection dir)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ BTScanPosItem *currItem;
+
+ /*
+ * Advance to next tuple on current page; or if there's no more, try to
+ * step to the next page with data.
+ */
+ if (ScanDirectionIsForward(dir))
+ {
+ if (++so->currPos.itemIndex > so->currPos.lastItem)
+ {
+ if (!_bt_steppage(scan, dir))
+ return false;
+ }
+ }
+ else
+ {
+ if (--so->currPos.itemIndex < so->currPos.firstItem)
+ {
+ if (!_bt_steppage(scan, dir))
+ return false;
+ }
+ }
+
+ /* OK, itemIndex says what to return */
+ currItem = &so->currPos.items[so->currPos.itemIndex];
+ scan->xs_heaptid = currItem->heapTid;
+ if (scan->xs_want_itup)
+ scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+ return true;
+}
+
+/*
+ * _bt_readpage() -- Load data from current index page into so->currPos
+ *
+ * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
+ * is not changed here. Also, currPos.moreLeft and moreRight must be valid;
+ * they are updated as appropriate. All other fields of so->currPos are
+ * initialized from scratch here.
+ *
+ * We scan the current page starting at offnum and moving in the indicated
+ * direction. All items matching the scan keys are loaded into currPos.items.
+ * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
+ * that there can be no more matching tuples in the current scan direction.
+ *
+ * In the case of a parallel scan, caller must have called _bt_parallel_seize
+ * prior to calling this function; this function will invoke
+ * _bt_parallel_release before returning.
+ *
+ * Returns true if any matching items were found on the page, false if none.
+ */
+static bool
+_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber minoff;
+ OffsetNumber maxoff;
+ int itemIndex;
+ bool continuescan;
+ int indnatts;
+
+ /*
+	 * We must have the buffer pinned and locked, but the usual
+	 * BTScanPosIsPinned macro can't be used here, because this function is
+	 * what makes so->currPos consistent in the first place.
+ */
+ Assert(BufferIsValid(so->currPos.buf));
+
+ page = BufferGetPage(so->currPos.buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+	/* allow the next page to be processed by a parallel worker */
+ if (scan->parallel_scan)
+ {
+ if (ScanDirectionIsForward(dir))
+ _bt_parallel_release(scan, opaque->btpo_next);
+ else
+ _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
+ }
+
+ continuescan = true; /* default assumption */
+ indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /*
+ * We note the buffer's block number so that we can release the pin later.
+ * This allows us to re-read the buffer if it is needed again for hinting.
+ */
+ so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
+
+ /*
+	 * We save the LSN of the page as we read it, so that we know whether it
+	 * is safe to apply LP_DEAD hints to the page later.  This allows us to
+	 * drop the pin for MVCC scans, which allows vacuum to avoid blocking.
+ */
+ so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
+
+ /*
+ * we must save the page's right-link while scanning it; this tells us
+ * where to step right to after we're done with these items. There is no
+ * corresponding need for the left-link, since splits always go right.
+ */
+ so->currPos.nextPage = opaque->btpo_next;
+
+ /* initialize tuple workspace to empty */
+ so->currPos.nextTupleOffset = 0;
+
+ /*
+ * Now that the current page has been made consistent, the macro should be
+ * good.
+ */
+ Assert(BTScanPosIsPinned(so->currPos));
+
+ if (ScanDirectionIsForward(dir))
+ {
+ /* load items[] in ascending order */
+ itemIndex = 0;
+
+ offnum = Max(offnum, minoff);
+
+ while (offnum <= maxoff)
+ {
+ ItemId iid = PageGetItemId(page, offnum);
+ IndexTuple itup;
+
+ /*
+ * If the scan specifies not to return killed tuples, then we
+ * treat a killed tuple as not passing the qual
+ */
+ if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
+ {
+ offnum = OffsetNumberNext(offnum);
+ continue;
+ }
+
+ itup = (IndexTuple) PageGetItem(page, iid);
+
+ if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
+ {
+ /* tuple passes all scan key conditions */
+ if (!BTreeTupleIsPosting(itup))
+ {
+ /* Remember it */
+ _bt_saveitem(so, itemIndex, offnum, itup);
+ itemIndex++;
+ }
+ else
+ {
+ int tupleOffset;
+
+ /*
+ * Set up state to return posting list, and remember first
+ * TID
+ */
+ tupleOffset =
+ _bt_setuppostingitems(so, itemIndex, offnum,
+ BTreeTupleGetPostingN(itup, 0),
+ itup);
+ itemIndex++;
+ /* Remember additional TIDs */
+ for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+ {
+ _bt_savepostingitem(so, itemIndex, offnum,
+ BTreeTupleGetPostingN(itup, i),
+ tupleOffset);
+ itemIndex++;
+ }
+ }
+ }
+ /* When !continuescan, there can't be any more matches, so stop */
+ if (!continuescan)
+ break;
+
+ offnum = OffsetNumberNext(offnum);
+ }
+
+ /*
+		 * We don't need to visit the page to the right when the high key
+ * indicates that no more matches will be found there.
+ *
+ * Checking the high key like this works out more often than you might
+ * think. Leaf page splits pick a split point between the two most
+ * dissimilar tuples (this is weighed against the need to evenly share
+ * free space). Leaf pages with high key attribute values that can
+ * only appear on non-pivot tuples on the right sibling page are
+ * common.
+ */
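+		/*
+		 * For example (hypothetical values): with a qual of "x <= 30" and a
+		 * high key of x = 40, _bt_checkkeys clears continuescan and we skip
+		 * the right sibling entirely; with a high key of x = 25 the qual
+		 * still passes and we must continue to the right.
+		 */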
+ if (continuescan && !P_RIGHTMOST(opaque))
+ {
+ ItemId iid = PageGetItemId(page, P_HIKEY);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
+ int truncatt;
+
+ truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
+ _bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
+ }
+
+ if (!continuescan)
+ so->currPos.moreRight = false;
+
+ Assert(itemIndex <= MaxTIDsPerBTreePage);
+ so->currPos.firstItem = 0;
+ so->currPos.lastItem = itemIndex - 1;
+ so->currPos.itemIndex = 0;
+ }
+ else
+ {
+ /* load items[] in descending order */
+ itemIndex = MaxTIDsPerBTreePage;
+
+ offnum = Min(offnum, maxoff);
+
+ while (offnum >= minoff)
+ {
+ ItemId iid = PageGetItemId(page, offnum);
+ IndexTuple itup;
+ bool tuple_alive;
+ bool passes_quals;
+
+ /*
+ * If the scan specifies not to return killed tuples, then we
+ * treat a killed tuple as not passing the qual. Most of the
+ * time, it's a win to not bother examining the tuple's index
+ * keys, but just skip to the next tuple (previous, actually,
+ * since we're scanning backwards). However, if this is the first
+ * tuple on the page, we do check the index keys, to prevent
+ * uselessly advancing to the page to the left. This is similar
+ * to the high key optimization used by forward scans.
+ */
+ if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
+ {
+ Assert(offnum >= P_FIRSTDATAKEY(opaque));
+ if (offnum > P_FIRSTDATAKEY(opaque))
+ {
+ offnum = OffsetNumberPrev(offnum);
+ continue;
+ }
+
+ tuple_alive = false;
+ }
+ else
+ tuple_alive = true;
+
+ itup = (IndexTuple) PageGetItem(page, iid);
+
+ passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
+ &continuescan);
+ if (passes_quals && tuple_alive)
+ {
+ /* tuple passes all scan key conditions */
+ if (!BTreeTupleIsPosting(itup))
+ {
+ /* Remember it */
+ itemIndex--;
+ _bt_saveitem(so, itemIndex, offnum, itup);
+ }
+ else
+ {
+ int tupleOffset;
+
+ /*
+ * Set up state to return posting list, and remember first
+ * TID.
+ *
+ * Note that we deliberately save/return items from
+ * posting lists in ascending heap TID order for backwards
+ * scans. This allows _bt_killitems() to make a
+ * consistent assumption about the order of items
+ * associated with the same posting list tuple.
+ */
+ itemIndex--;
+ tupleOffset =
+ _bt_setuppostingitems(so, itemIndex, offnum,
+ BTreeTupleGetPostingN(itup, 0),
+ itup);
+ /* Remember additional TIDs */
+ for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+ {
+ itemIndex--;
+ _bt_savepostingitem(so, itemIndex, offnum,
+ BTreeTupleGetPostingN(itup, i),
+ tupleOffset);
+ }
+ }
+ }
+ if (!continuescan)
+ {
+ /* there can't be any more matches, so stop */
+ so->currPos.moreLeft = false;
+ break;
+ }
+
+ offnum = OffsetNumberPrev(offnum);
+ }
+
+ Assert(itemIndex >= 0);
+ so->currPos.firstItem = itemIndex;
+ so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
+ so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
+ }
+
+ return (so->currPos.firstItem <= so->currPos.lastItem);
+}
+
+/* Save an index item into so->currPos.items[itemIndex] */
+static void
+_bt_saveitem(BTScanOpaque so, int itemIndex,
+ OffsetNumber offnum, IndexTuple itup)
+{
+ BTScanPosItem *currItem = &so->currPos.items[itemIndex];
+
+ Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
+
+ currItem->heapTid = itup->t_tid;
+ currItem->indexOffset = offnum;
+ if (so->currTuples)
+ {
+ Size itupsz = IndexTupleSize(itup);
+
+ currItem->tupleOffset = so->currPos.nextTupleOffset;
+ memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
+ so->currPos.nextTupleOffset += MAXALIGN(itupsz);
+ }
+}
+
+/*
+ * Set up state to save TIDs/items from a single posting list tuple.
+ *
+ * Saves an index item into so->currPos.items[itemIndex] for the TID that is
+ * returned to the scan first.  Second and subsequent TIDs from the posting
+ * list should be saved by calling _bt_savepostingitem().
+ *
+ * Returns the offset into tuple storage space at which the base tuple is
+ * stored, when tuple storage is in use.
+ */
+static int
+_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
+ ItemPointer heapTid, IndexTuple itup)
+{
+ BTScanPosItem *currItem = &so->currPos.items[itemIndex];
+
+ Assert(BTreeTupleIsPosting(itup));
+
+ currItem->heapTid = *heapTid;
+ currItem->indexOffset = offnum;
+ if (so->currTuples)
+ {
+ /* Save base IndexTuple (truncate posting list) */
+ IndexTuple base;
+ Size itupsz = BTreeTupleGetPostingOffset(itup);
+
+ itupsz = MAXALIGN(itupsz);
+ currItem->tupleOffset = so->currPos.nextTupleOffset;
+ base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
+ memcpy(base, itup, itupsz);
+ /* Defensively reduce work area index tuple header size */
+ base->t_info &= ~INDEX_SIZE_MASK;
+ base->t_info |= itupsz;
+ so->currPos.nextTupleOffset += itupsz;
+
+ return currItem->tupleOffset;
+ }
+
+ return 0;
+}
+
+/*
+ * Save an index item into so->currPos.items[itemIndex] for the current
+ * posting list tuple.
+ *
+ * Assumes that _bt_setuppostingitems() has already been called for the
+ * current posting list tuple.  Caller passes its return value as tupleOffset.
+ */
+static inline void
+_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
+ ItemPointer heapTid, int tupleOffset)
+{
+ BTScanPosItem *currItem = &so->currPos.items[itemIndex];
+
+ currItem->heapTid = *heapTid;
+ currItem->indexOffset = offnum;
+
+ /*
+ * Have index-only scans return the same base IndexTuple for every TID
+ * that originates from the same posting list
+ */
+ if (so->currTuples)
+ currItem->tupleOffset = tupleOffset;
+}
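+
+/*
+ * Sketch of how the two helpers above are used together (this mirrors the
+ * posting list handling in _bt_readpage; offnum/itup stand for whatever
+ * posting list tuple is currently being examined):
+ *
+ *		tupleOffset = _bt_setuppostingitems(so, itemIndex++, offnum,
+ *											BTreeTupleGetPostingN(itup, 0),
+ *											itup);
+ *		for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+ *			_bt_savepostingitem(so, itemIndex++, offnum,
+ *								BTreeTupleGetPostingN(itup, i), tupleOffset);
+ */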
+
+/*
+ * _bt_steppage() -- Step to next page containing valid data for scan
+ *
+ * On entry, if so->currPos.buf is valid the buffer is pinned but not locked.
+ * If it is pinned, we'll drop the pin before moving to the next page.
+ *
+ * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
+ * read lock, on that page. If we do not hold the pin, we set so->currPos.buf
+ * to InvalidBuffer. We return true to indicate success.
+ */
+static bool
+_bt_steppage(IndexScanDesc scan, ScanDirection dir)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ BlockNumber blkno = InvalidBlockNumber;
+ bool status;
+
+ Assert(BTScanPosIsValid(so->currPos));
+
+ /* Before leaving current page, deal with any killed items */
+ if (so->numKilled > 0)
+ _bt_killitems(scan);
+
+ /*
+ * Before we modify currPos, make a copy of the page data if there was a
+ * mark position that needs it.
+ */
+ if (so->markItemIndex >= 0)
+ {
+ /* bump pin on current buffer for assignment to mark buffer */
+ if (BTScanPosIsPinned(so->currPos))
+ IncrBufferRefCount(so->currPos.buf);
+ memcpy(&so->markPos, &so->currPos,
+ offsetof(BTScanPosData, items[1]) +
+ so->currPos.lastItem * sizeof(BTScanPosItem));
+ if (so->markTuples)
+ memcpy(so->markTuples, so->currTuples,
+ so->currPos.nextTupleOffset);
+ so->markPos.itemIndex = so->markItemIndex;
+ so->markItemIndex = -1;
+ }
+
+ if (ScanDirectionIsForward(dir))
+ {
+ /* Walk right to the next page with data */
+ if (scan->parallel_scan != NULL)
+ {
+ /*
+ * Seize the scan to get the next block number; if the scan has
+ * ended already, bail out.
+ */
+ status = _bt_parallel_seize(scan, &blkno);
+ if (!status)
+ {
+ /* release the previous buffer, if pinned */
+ BTScanPosUnpinIfPinned(so->currPos);
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+ }
+ else
+ {
+ /* Not parallel, so use the previously-saved nextPage link. */
+ blkno = so->currPos.nextPage;
+ }
+
+ /* Remember we left a page with data */
+ so->currPos.moreLeft = true;
+
+ /* release the previous buffer, if pinned */
+ BTScanPosUnpinIfPinned(so->currPos);
+ }
+ else
+ {
+ /* Remember we left a page with data */
+ so->currPos.moreRight = true;
+
+ if (scan->parallel_scan != NULL)
+ {
+ /*
+ * Seize the scan to get the current block number; if the scan has
+ * ended already, bail out.
+ */
+ status = _bt_parallel_seize(scan, &blkno);
+ BTScanPosUnpinIfPinned(so->currPos);
+ if (!status)
+ {
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+ }
+ else
+ {
+ /* Not parallel, so just use our own notion of the current page */
+ blkno = so->currPos.currPage;
+ }
+ }
+
+ if (!_bt_readnextpage(scan, blkno, dir))
+ return false;
+
+ /* Drop the lock, and maybe the pin, on the current page */
+ _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+
+ return true;
+}
+
+/*
+ * _bt_readnextpage() -- Read next page containing valid data for scan
+ *
+ * On success exit, so->currPos is updated to contain data from the next
+ * interesting page.  On success, the caller is responsible for releasing the
+ * lock and pin on the buffer.  We return true to indicate success.
+ *
+ * If there are no more matching records in the given direction, we drop all
+ * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
+ */
+static bool
+_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Relation rel;
+ Page page;
+ BTPageOpaque opaque;
+ bool status;
+
+ rel = scan->indexRelation;
+
+ if (ScanDirectionIsForward(dir))
+ {
+ for (;;)
+ {
+ /*
+ * if we're at end of scan, give up and mark parallel scan as
+ * done, so that all the workers can finish their scan
+ */
+ if (blkno == P_NONE || !so->currPos.moreRight)
+ {
+ _bt_parallel_done(scan);
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+ /* check for interrupts while we're not holding any buffer lock */
+ CHECK_FOR_INTERRUPTS();
+ /* step right one page */
+ so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(so->currPos.buf);
+ TestForOldSnapshot(scan->xs_snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ /* check for deleted page */
+ if (!P_IGNORE(opaque))
+ {
+ PredicateLockPage(rel, blkno, scan->xs_snapshot);
+ /* see if there are any matches on this page */
+ /* note that this will clear moreRight if we can stop */
+ if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
+ break;
+ }
+ else if (scan->parallel_scan != NULL)
+ {
+				/* allow the next page to be processed by a parallel worker */
+ _bt_parallel_release(scan, opaque->btpo_next);
+ }
+
+ /* nope, keep going */
+ if (scan->parallel_scan != NULL)
+ {
+ _bt_relbuf(rel, so->currPos.buf);
+ status = _bt_parallel_seize(scan, &blkno);
+ if (!status)
+ {
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+ }
+ else
+ {
+ blkno = opaque->btpo_next;
+ _bt_relbuf(rel, so->currPos.buf);
+ }
+ }
+ }
+ else
+ {
+ /*
+ * Should only happen in parallel cases, when some other backend
+ * advanced the scan.
+ */
+ if (so->currPos.currPage != blkno)
+ {
+ BTScanPosUnpinIfPinned(so->currPos);
+ so->currPos.currPage = blkno;
+ }
+
+ /*
+ * Walk left to the next page with data. This is much more complex
+ * than the walk-right case because of the possibility that the page
+ * to our left splits while we are in flight to it, plus the
+ * possibility that the page we were on gets deleted after we leave
+ * it. See nbtree/README for details.
+ *
+ * It might be possible to rearrange this code to have less overhead
+ * in pinning and locking, but that would require capturing the left
+ * pointer when the page is initially read, and using it here, along
+ * with big changes to _bt_walk_left() and the code below. It is not
+ * clear whether this would be a win, since if the page immediately to
+ * the left splits after we read this page and before we step left, we
+ * would need to visit more pages than with the current code.
+ *
+ * Note that if we change the code so that we drop the pin for a scan
+ * which uses a non-MVCC snapshot, we will need to modify the code for
+ * walking left, to allow for the possibility that a referenced page
+ * has been deleted. As long as the buffer is pinned or the snapshot
+ * is MVCC the page cannot move past the half-dead state to fully
+ * deleted.
+ */
+ if (BTScanPosIsPinned(so->currPos))
+ _bt_lockbuf(rel, so->currPos.buf, BT_READ);
+ else
+ so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
+
+ for (;;)
+ {
+ /* Done if we know there are no matching keys to the left */
+ if (!so->currPos.moreLeft)
+ {
+ _bt_relbuf(rel, so->currPos.buf);
+ _bt_parallel_done(scan);
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+
+ /* Step to next physical page */
+ so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
+ scan->xs_snapshot);
+
+ /* if we're physically at end of index, return failure */
+ if (so->currPos.buf == InvalidBuffer)
+ {
+ _bt_parallel_done(scan);
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+
+ /*
+ * Okay, we managed to move left to a non-deleted page. Done if
+ * it's not half-dead and contains matching tuples. Else loop back
+ * and do it all again.
+ */
+ page = BufferGetPage(so->currPos.buf);
+ TestForOldSnapshot(scan->xs_snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (!P_IGNORE(opaque))
+ {
+ PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
+ /* see if there are any matches on this page */
+ /* note that this will clear moreLeft if we can stop */
+ if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
+ break;
+ }
+ else if (scan->parallel_scan != NULL)
+ {
+				/* allow the next page to be processed by a parallel worker */
+ _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
+ }
+
+ /*
+ * For parallel scans, get the last page scanned as it is quite
+ * possible that by the time we try to seize the scan, some other
+ * worker has already advanced the scan to a different page. We
+ * must continue based on the latest page scanned by any worker.
+ */
+ if (scan->parallel_scan != NULL)
+ {
+ _bt_relbuf(rel, so->currPos.buf);
+ status = _bt_parallel_seize(scan, &blkno);
+ if (!status)
+ {
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+ so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
+ }
+ }
+ }
+
+ return true;
+}
+
+/*
+ * _bt_parallel_readpage() -- Read current page containing valid data for scan
+ *
+ * On success, we release the lock, and possibly the pin, on the buffer, and
+ * return true to indicate success.
+ */
+static bool
+_bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+ _bt_initialize_more_data(so, dir);
+
+ if (!_bt_readnextpage(scan, blkno, dir))
+ return false;
+
+ /* Drop the lock, and maybe the pin, on the current page */
+ _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+
+ return true;
+}
+
+/*
+ * _bt_walk_left() -- step left one page, if possible
+ *
+ * The given buffer must be pinned and read-locked. This will be dropped
+ * before stepping left. On return, we have pin and read lock on the
+ * returned page, instead.
+ *
+ * Returns InvalidBuffer if there is no page to the left (no lock is held
+ * in that case).
+ *
+ * When working on a non-leaf level, it is possible for the returned page
+ * to be half-dead; the caller should check that condition and step left
+ * again if it's important.
+ */
+static Buffer
+_bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
+{
+ Page page;
+ BTPageOpaque opaque;
+
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ for (;;)
+ {
+ BlockNumber obknum;
+ BlockNumber lblkno;
+ BlockNumber blkno;
+ int tries;
+
+ /* if we're at end of tree, release buf and return failure */
+ if (P_LEFTMOST(opaque))
+ {
+ _bt_relbuf(rel, buf);
+ break;
+ }
+ /* remember original page we are stepping left from */
+ obknum = BufferGetBlockNumber(buf);
+ /* step left */
+ blkno = lblkno = opaque->btpo_prev;
+ _bt_relbuf(rel, buf);
+ /* check for interrupts while we're not holding any buffer lock */
+ CHECK_FOR_INTERRUPTS();
+ buf = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * If this isn't the page we want, walk right till we find what we
+ * want --- but go no more than four hops (an arbitrary limit). If we
+ * don't find the correct page by then, the most likely bet is that
+ * the original page got deleted and isn't in the sibling chain at all
+ * anymore, not that its left sibling got split more than four times.
+ *
+ * Note that it is correct to test P_ISDELETED not P_IGNORE here,
+ * because half-dead pages are still in the sibling chain. Caller
+ * must reject half-dead pages if wanted.
+ */
+ tries = 0;
+ for (;;)
+ {
+ if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
+ {
+ /* Found desired page, return it */
+ return buf;
+ }
+ if (P_RIGHTMOST(opaque) || ++tries > 4)
+ break;
+ blkno = opaque->btpo_next;
+ buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ }
+
+ /* Return to the original page to see what's up */
+ buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (P_ISDELETED(opaque))
+ {
+ /*
+ * It was deleted. Move right to first nondeleted page (there
+ * must be one); that is the page that has acquired the deleted
+ * one's keyspace, so stepping left from it will take us where we
+ * want to be.
+ */
+ for (;;)
+ {
+ if (P_RIGHTMOST(opaque))
+ elog(ERROR, "fell off the end of index \"%s\"",
+ RelationGetRelationName(rel));
+ blkno = opaque->btpo_next;
+ buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (!P_ISDELETED(opaque))
+ break;
+ }
+
+ /*
+ * Now return to top of loop, resetting obknum to point to this
+ * nondeleted page, and try again.
+ */
+ }
+ else
+ {
+ /*
+ * It wasn't deleted; the explanation had better be that the page
+ * to the left got split or deleted. Without this check, we'd go
+ * into an infinite loop if there's anything wrong.
+ */
+ if (opaque->btpo_prev == lblkno)
+ elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
+ obknum, RelationGetRelationName(rel));
+ /* Okay to try again with new lblkno value */
+ }
+ }
+
+ return InvalidBuffer;
+}
+
+/*
+ * _bt_get_endpoint() -- Find the first or last page on a given tree level
+ *
+ * If the index is empty, we will return InvalidBuffer; any other failure
+ * condition causes ereport(). We will not return a dead page.
+ *
+ * The returned buffer is pinned and read-locked.
+ */
+Buffer
+_bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
+ Snapshot snapshot)
+{
+ Buffer buf;
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber offnum;
+ BlockNumber blkno;
+ IndexTuple itup;
+
+ /*
+ * If we are looking for a leaf page, okay to descend from fast root;
+ * otherwise better descend from true root. (There is no point in being
+ * smarter about intermediate levels.)
+ */
+ if (level == 0)
+ buf = _bt_getroot(rel, BT_READ);
+ else
+ buf = _bt_gettrueroot(rel);
+
+ if (!BufferIsValid(buf))
+ return InvalidBuffer;
+
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ for (;;)
+ {
+ /*
+ * If we landed on a deleted page, step right to find a live page
+ * (there must be one). Also, if we want the rightmost page, step
+ * right if needed to get to it (this could happen if the page split
+ * since we obtained a pointer to it).
+ */
+ while (P_IGNORE(opaque) ||
+ (rightmost && !P_RIGHTMOST(opaque)))
+ {
+ blkno = opaque->btpo_next;
+ if (blkno == P_NONE)
+ elog(ERROR, "fell off the end of index \"%s\"",
+ RelationGetRelationName(rel));
+ buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ }
+
+ /* Done? */
+ if (opaque->btpo_level == level)
+ break;
+ if (opaque->btpo_level < level)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("btree level %u not found in index \"%s\"",
+ level, RelationGetRelationName(rel))));
+
+ /* Descend to leftmost or rightmost child page */
+ if (rightmost)
+ offnum = PageGetMaxOffsetNumber(page);
+ else
+ offnum = P_FIRSTDATAKEY(opaque);
+
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+ blkno = BTreeTupleGetDownLink(itup);
+
+ buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ }
+
+ return buf;
+}
+
+/*
+ * _bt_endpoint() -- Find the first or last page in the index, and scan
+ * from there to the first key satisfying all the quals.
+ *
+ * This is used by _bt_first() to set up a scan when we've determined
+ * that the scan must start at the beginning or end of the index (for
+ * a forward or backward scan respectively). Exit conditions are the
+ * same as for _bt_first().
+ */
+static bool
+_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
+{
+ Relation rel = scan->indexRelation;
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Buffer buf;
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber start;
+ BTScanPosItem *currItem;
+
+ /*
+ * Scan down to the leftmost or rightmost leaf page. This is a simplified
+ * version of _bt_search(). We don't maintain a stack since we know we
+ * won't need it.
+ */
+ buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot);
+
+ if (!BufferIsValid(buf))
+ {
+ /*
+ * Empty index. Lock the whole relation, as nothing finer to lock
+ * exists.
+ */
+ PredicateLockRelation(rel, scan->xs_snapshot);
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
+
+ PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ Assert(P_ISLEAF(opaque));
+
+ if (ScanDirectionIsForward(dir))
+ {
+ /* There could be dead pages to the left, so not this: */
+ /* Assert(P_LEFTMOST(opaque)); */
+
+ start = P_FIRSTDATAKEY(opaque);
+ }
+ else if (ScanDirectionIsBackward(dir))
+ {
+ Assert(P_RIGHTMOST(opaque));
+
+ start = PageGetMaxOffsetNumber(page);
+ }
+ else
+ {
+ elog(ERROR, "invalid scan direction: %d", (int) dir);
+ start = 0; /* keep compiler quiet */
+ }
+
+ /* remember which buffer we have pinned */
+ so->currPos.buf = buf;
+
+ _bt_initialize_more_data(so, dir);
+
+ /*
+ * Now load data from the first page of the scan.
+ */
+ if (!_bt_readpage(scan, dir, start))
+ {
+ /*
+ * There's no actually-matching data on this page. Try to advance to
+ * the next page. Return false if there's no matching data at all.
+ */
+ _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+ if (!_bt_steppage(scan, dir))
+ return false;
+ }
+ else
+ {
+ /* Drop the lock, and maybe the pin, on the current page */
+ _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+ }
+
+ /* OK, itemIndex says what to return */
+ currItem = &so->currPos.items[so->currPos.itemIndex];
+ scan->xs_heaptid = currItem->heapTid;
+ if (scan->xs_want_itup)
+ scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+ return true;
+}
+
+/*
+ * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
+ * for scan direction
+ */
+static inline void
+_bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
+{
+ /* initialize moreLeft/moreRight appropriately for scan direction */
+ if (ScanDirectionIsForward(dir))
+ {
+ so->currPos.moreLeft = false;
+ so->currPos.moreRight = true;
+ }
+ else
+ {
+ so->currPos.moreLeft = true;
+ so->currPos.moreRight = false;
+ }
+ so->numKilled = 0; /* just paranoia */
+ so->markItemIndex = -1; /* ditto */
+}
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
new file mode 100644
index 0000000..78f78e7
--- /dev/null
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -0,0 +1,2016 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtsort.c
+ * Build a btree from sorted input by loading leaf pages sequentially.
+ *
+ * NOTES
+ *
+ * We use tuplesort.c to sort the given index tuples into order.
+ * Then we scan the index tuples in order and build the btree pages
+ * for each level. We load source tuples into leaf-level pages.
+ * Whenever we fill a page at one level, we add a link to it to its
+ * parent level (starting a new parent level if necessary). When
+ * done, we write out each final page on each level, adding it to
+ * its parent level. When we have only one page on a level, it must be
+ * the root -- it can be attached to the btree metapage and we are done.
+ *
+ * It is not wise to pack the pages entirely full, since then *any*
+ * insertion would cause a split (and not only of the leaf page; the need
+ * for a split would cascade right up the tree). The steady-state load
+ * factor for btrees is usually estimated at 70%. We choose to pack leaf
+ * pages to the user-controllable fill factor (default 90%) while upper pages
+ * are always packed to 70%. This gives us reasonable density (there aren't
+ * many upper pages if the keys are reasonable-size) without risking a lot of
+ * cascading splits during early insertions.
+ *
+ * Formerly the index pages being built were kept in shared buffers, but
+ * that is of no value (since other backends have no interest in them yet)
+ * and it created locking problems for CHECKPOINT, because the upper-level
+ * pages were held exclusive-locked for long periods. Now we just build
+ * the pages in local memory and smgrwrite or smgrextend them as we finish
+ * them. They will need to be re-read into shared buffers on first use after
+ * the build finishes.
+ *
+ * This code isn't concerned about the FSM at all. The caller is responsible
+ * for initializing that.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtsort.c
+ *
+ *-------------------------------------------------------------------------
+ */
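+
+/*
+ * Rough illustration of the level-by-level load (hypothetical round numbers,
+ * not measurements): loading 1,000,000 sorted tuples at roughly 200 tuples
+ * per leaf page yields about 5,000 leaf pages; at roughly 200 downlinks per
+ * internal page those need about 25 level-1 pages, which in turn fit on a
+ * single level-2 page that becomes the root.
+ */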
+
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "access/parallel.h"
+#include "access/relscan.h"
+#include "access/table.h"
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "access/xloginsert.h"
+#include "catalog/index.h"
+#include "commands/progress.h"
+#include "executor/instrument.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/smgr.h"
+#include "tcop/tcopprot.h" /* pgrminclude ignore */
+#include "utils/rel.h"
+#include "utils/sortsupport.h"
+#include "utils/tuplesort.h"
+
+
+/* Magic numbers for parallel state sharing */
+#define PARALLEL_KEY_BTREE_SHARED UINT64CONST(0xA000000000000001)
+#define PARALLEL_KEY_TUPLESORT UINT64CONST(0xA000000000000002)
+#define PARALLEL_KEY_TUPLESORT_SPOOL2 UINT64CONST(0xA000000000000003)
+#define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xA000000000000004)
+#define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xA000000000000005)
+#define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xA000000000000006)
+
+/*
+ * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
+ * parallel index builds. This may be useful as a debugging aid.
+#undef DISABLE_LEADER_PARTICIPATION
+ */
+
+/*
+ * Status record for spooling/sorting phase. (Note we may have two of
+ * these due to the special requirements for uniqueness-checking with
+ * dead tuples.)
+ */
+typedef struct BTSpool
+{
+ Tuplesortstate *sortstate; /* state data for tuplesort.c */
+ Relation heap;
+ Relation index;
+ bool isunique;
+} BTSpool;
+
+/*
+ * Status for index builds performed in parallel. This is allocated in a
+ * dynamic shared memory segment. Note that there is a separate tuplesort TOC
+ * entry, private to tuplesort.c but allocated by this module on its behalf.
+ */
+typedef struct BTShared
+{
+ /*
+ * These fields are not modified during the sort. They primarily exist
+ * for the benefit of worker processes that need to create BTSpool state
+ * corresponding to that used by the leader.
+ */
+ Oid heaprelid;
+ Oid indexrelid;
+ bool isunique;
+ bool isconcurrent;
+ int scantuplesortstates;
+
+ /*
+ * workersdonecv is used to monitor the progress of workers. All parallel
+ * participants must indicate that they are done before leader can use
+ * mutable state that workers maintain during scan (and before leader can
+ * proceed to tuplesort_performsort()).
+ */
+ ConditionVariable workersdonecv;
+
+ /*
+	 * mutex protects the mutable state that follows (the fields that workers
+	 * update and that the leader reads back at the end of the parallel scan).
+ *
+ * These fields contain status information of interest to B-Tree index
+ * builds that must work just the same when an index is built in parallel.
+ */
+ slock_t mutex;
+
+ /*
+ * Mutable state that is maintained by workers, and reported back to
+ * leader at end of parallel scan.
+ *
+ * nparticipantsdone is number of worker processes finished.
+ *
+ * reltuples is the total number of input heap tuples.
+ *
+ * havedead indicates if RECENTLY_DEAD tuples were encountered during
+ * build.
+ *
+ * indtuples is the total number of tuples that made it into the index.
+ *
+ * brokenhotchain indicates if any worker detected a broken HOT chain
+ * during build.
+ */
+ int nparticipantsdone;
+ double reltuples;
+ bool havedead;
+ double indtuples;
+ bool brokenhotchain;
+
+ /*
+ * ParallelTableScanDescData data follows. Can't directly embed here, as
+ * implementations of the parallel table scan desc interface might need
+ * stronger alignment.
+ */
+} BTShared;
+
+/*
+ * Return pointer to a BTShared's parallel table scan.
+ *
+ * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
+ * MAXALIGN.
+ */
+#define ParallelTableScanFromBTShared(shared) \
+ (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
+
+/*
+ * Status for leader in parallel index build.
+ */
+typedef struct BTLeader
+{
+ /* parallel context itself */
+ ParallelContext *pcxt;
+
+ /*
+ * nparticipanttuplesorts is the exact number of worker processes
+ * successfully launched, plus one leader process if it participates as a
+ * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
+ * participating as a worker).
+ */
+ int nparticipanttuplesorts;
+
+ /*
+ * Leader process convenience pointers to shared state (leader avoids TOC
+ * lookups).
+ *
+ * btshared is the shared state for entire build. sharedsort is the
+ * shared, tuplesort-managed state passed to each process tuplesort.
+ * sharedsort2 is the corresponding btspool2 shared state, used only when
+ * building unique indexes. snapshot is the snapshot used by the scan iff
+ * an MVCC snapshot is required.
+ */
+ BTShared *btshared;
+ Sharedsort *sharedsort;
+ Sharedsort *sharedsort2;
+ Snapshot snapshot;
+ WalUsage *walusage;
+ BufferUsage *bufferusage;
+} BTLeader;
+
+/*
+ * Working state for btbuild and its callback.
+ *
+ * When parallel CREATE INDEX is used, there is a BTBuildState for each
+ * participant.
+ */
+typedef struct BTBuildState
+{
+ bool isunique;
+ bool havedead;
+ Relation heap;
+ BTSpool *spool;
+
+ /*
+ * spool2 is needed only when the index is a unique index. Dead tuples are
+	 * put into spool2 instead of spool in order to avoid the uniqueness check.
+ */
+ BTSpool *spool2;
+ double indtuples;
+
+ /*
+ * btleader is only present when a parallel index build is performed, and
+ * only in the leader process. (Actually, only the leader has a
+ * BTBuildState. Workers have their own spool and spool2, though.)
+ */
+ BTLeader *btleader;
+} BTBuildState;
+
+/*
+ * Status record for a btree page being built. We have one of these
+ * for each active tree level.
+ */
+typedef struct BTPageState
+{
+ Page btps_page; /* workspace for page building */
+ BlockNumber btps_blkno; /* block # to write this page at */
+ IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */
+ OffsetNumber btps_lastoff; /* last item offset loaded */
+ Size btps_lastextra; /* last item's extra posting list space */
+ uint32 btps_level; /* tree level (0 = leaf) */
+ Size btps_full; /* "full" if less than this much free space */
+ struct BTPageState *btps_next; /* link to parent level, if any */
+} BTPageState;
+
+/*
+ * Overall status record for index writing phase.
+ */
+typedef struct BTWriteState
+{
+ Relation heap;
+ Relation index;
+ BTScanInsert inskey; /* generic insertion scankey */
+ bool btws_use_wal; /* dump pages to WAL? */
+ BlockNumber btws_pages_alloced; /* # pages allocated */
+ BlockNumber btws_pages_written; /* # pages written out */
+ Page btws_zeropage; /* workspace for filling zeroes */
+} BTWriteState;
+
+
+static double _bt_spools_heapscan(Relation heap, Relation index,
+ BTBuildState *buildstate, IndexInfo *indexInfo);
+static void _bt_spooldestroy(BTSpool *btspool);
+static void _bt_spool(BTSpool *btspool, ItemPointer self,
+ Datum *values, bool *isnull);
+static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
+static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
+ bool *isnull, bool tupleIsAlive, void *state);
+static Page _bt_blnewpage(uint32 level);
+static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
+static void _bt_slideleft(Page rightmostpage);
+static void _bt_sortaddtup(Page page, Size itemsize,
+ IndexTuple itup, OffsetNumber itup_off,
+ bool newfirstdataitem);
+static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
+ IndexTuple itup, Size truncextra);
+static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
+ BTPageState *state,
+ BTDedupState dstate);
+static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
+static void _bt_load(BTWriteState *wstate,
+ BTSpool *btspool, BTSpool *btspool2);
+static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
+ int request);
+static void _bt_end_parallel(BTLeader *btleader);
+static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot);
+static double _bt_parallel_heapscan(BTBuildState *buildstate,
+ bool *brokenhotchain);
+static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
+static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
+ BTShared *btshared, Sharedsort *sharedsort,
+ Sharedsort *sharedsort2, int sortmem,
+ bool progress);
+
+
+/*
+ * btbuild() -- build a new btree index.
+ */
+IndexBuildResult *
+btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
+{
+ IndexBuildResult *result;
+ BTBuildState buildstate;
+ double reltuples;
+
+#ifdef BTREE_BUILD_STATS
+ if (log_btree_build_stats)
+ ResetUsage();
+#endif /* BTREE_BUILD_STATS */
+
+ buildstate.isunique = indexInfo->ii_Unique;
+ buildstate.havedead = false;
+ buildstate.heap = heap;
+ buildstate.spool = NULL;
+ buildstate.spool2 = NULL;
+ buildstate.indtuples = 0;
+ buildstate.btleader = NULL;
+
+ /*
+ * We expect to be called exactly once for any index relation. If that's
+ * not the case, big trouble's what we have.
+ */
+ if (RelationGetNumberOfBlocks(index) != 0)
+ elog(ERROR, "index \"%s\" already contains data",
+ RelationGetRelationName(index));
+
+ reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
+
+ /*
+ * Finish the build by (1) completing the sort of the spool file, (2)
+ * inserting the sorted tuples into btree pages and (3) building the upper
+ * levels. Finally, it may also be necessary to end use of parallelism.
+ */
+ _bt_leafbuild(buildstate.spool, buildstate.spool2);
+ _bt_spooldestroy(buildstate.spool);
+ if (buildstate.spool2)
+ _bt_spooldestroy(buildstate.spool2);
+ if (buildstate.btleader)
+ _bt_end_parallel(buildstate.btleader);
+
+ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
+
+ result->heap_tuples = reltuples;
+ result->index_tuples = buildstate.indtuples;
+
+#ifdef BTREE_BUILD_STATS
+ if (log_btree_build_stats)
+ {
+ ShowUsage("BTREE BUILD STATS");
+ ResetUsage();
+ }
+#endif /* BTREE_BUILD_STATS */
+
+ return result;
+}
+
+/*
+ * Create and initialize one or two spool structures, and save them in caller's
+ * buildstate argument.  May also fill in fields within indexInfo used by index
+ * builds.
+ *
+ * Scans the heap, possibly in parallel, filling spools with IndexTuples. This
+ * routine encapsulates all aspects of managing parallelism. Caller need only
+ * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
+ *
+ * Returns the total number of heap tuples scanned.
+ */
+static double
+_bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
+ IndexInfo *indexInfo)
+{
+ BTSpool *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
+ SortCoordinate coordinate = NULL;
+ double reltuples = 0;
+
+ /*
+ * We size the sort area as maintenance_work_mem rather than work_mem to
+ * speed index creation. This should be OK since a single backend can't
+ * run multiple index creations in parallel (see also: notes on
+ * parallelism and maintenance_work_mem below).
+ */
+ btspool->heap = heap;
+ btspool->index = index;
+ btspool->isunique = indexInfo->ii_Unique;
+
+ /* Save as primary spool */
+ buildstate->spool = btspool;
+
+ /* Report table scan phase started */
+ pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
+ PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN);
+
+ /* Attempt to launch parallel worker scan when required */
+ if (indexInfo->ii_ParallelWorkers > 0)
+ _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
+ indexInfo->ii_ParallelWorkers);
+
+ /*
+ * If parallel build requested and at least one worker process was
+ * successfully launched, set up coordination state
+ */
+ if (buildstate->btleader)
+ {
+ coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
+ coordinate->isWorker = false;
+ coordinate->nParticipants =
+ buildstate->btleader->nparticipanttuplesorts;
+ coordinate->sharedsort = buildstate->btleader->sharedsort;
+ }
+
+ /*
+ * Begin serial/leader tuplesort.
+ *
+ * In cases where parallelism is involved, the leader receives the same
+ * share of maintenance_work_mem as a serial sort (it is generally treated
+ * in the same way as a serial sort once we return). Parallel worker
+ * Tuplesortstates will have received only a fraction of
+ * maintenance_work_mem, though.
+ *
+ * We rely on the lifetime of the Leader Tuplesortstate almost not
+ * overlapping with any worker Tuplesortstate's lifetime. There may be
+ * some small overlap, but that's okay because we rely on leader
+ * Tuplesortstate only allocating a small, fixed amount of memory here.
+ * When its tuplesort_performsort() is called (by our caller), and
+ * significant amounts of memory are likely to be used, all workers must
+ * have already freed almost all memory held by their Tuplesortstates
+ * (they are about to go away completely, too). The overall effect is
+ * that maintenance_work_mem always represents an absolute high watermark
+ * on the amount of memory used by a CREATE INDEX operation, regardless of
+ * the use of parallelism or any other factor.
+ */
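+	/*
+	 * For example (hypothetical settings): with maintenance_work_mem set to
+	 * 256MB and two launched workers plus a participating leader, each of
+	 * the three participant Tuplesortstates sorts with roughly 85MB during
+	 * the scan phase; the leader Tuplesortstate begun here may later use up
+	 * to the full 256MB, but by then the workers have released almost all of
+	 * their memory.
+	 */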
+ buildstate->spool->sortstate =
+ tuplesort_begin_index_btree(heap, index, buildstate->isunique,
+ maintenance_work_mem, coordinate,
+ false);
+
+ /*
+ * If building a unique index, put dead tuples in a second spool to keep
+ * them out of the uniqueness check. We expect that the second spool (for
+ * dead tuples) won't get very full, so we give it only work_mem.
+ */
+ if (indexInfo->ii_Unique)
+ {
+ BTSpool *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
+ SortCoordinate coordinate2 = NULL;
+
+ /* Initialize secondary spool */
+ btspool2->heap = heap;
+ btspool2->index = index;
+ btspool2->isunique = false;
+ /* Save as secondary spool */
+ buildstate->spool2 = btspool2;
+
+ if (buildstate->btleader)
+ {
+ /*
+ * Set up non-private state that is passed to
+ * tuplesort_begin_index_btree() about the basic high level
+ * coordination of a parallel sort.
+ */
+ coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
+ coordinate2->isWorker = false;
+ coordinate2->nParticipants =
+ buildstate->btleader->nparticipanttuplesorts;
+ coordinate2->sharedsort = buildstate->btleader->sharedsort2;
+ }
+
+ /*
+ * We expect that the second one (for dead tuples) won't get very
+ * full, so we give it only work_mem
+ */
+ buildstate->spool2->sortstate =
+ tuplesort_begin_index_btree(heap, index, false, work_mem,
+ coordinate2, false);
+ }
+
+ /* Fill spool using either serial or parallel heap scan */
+ if (!buildstate->btleader)
+ reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
+ _bt_build_callback, (void *) buildstate,
+ NULL);
+ else
+ reltuples = _bt_parallel_heapscan(buildstate,
+ &indexInfo->ii_BrokenHotChain);
+
+ /*
+ * Set the progress target for the next phase. Reset the block number
+ * values set by table_index_build_scan
+ */
+ {
+ const int progress_index[] = {
+ PROGRESS_CREATEIDX_TUPLES_TOTAL,
+ PROGRESS_SCAN_BLOCKS_TOTAL,
+ PROGRESS_SCAN_BLOCKS_DONE
+ };
+ const int64 progress_vals[] = {
+ buildstate->indtuples,
+ 0, 0
+ };
+
+ pgstat_progress_update_multi_param(3, progress_index, progress_vals);
+ }
+
+ /* okay, all heap tuples are spooled */
+ if (buildstate->spool2 && !buildstate->havedead)
+ {
+ /* spool2 turns out to be unnecessary */
+ _bt_spooldestroy(buildstate->spool2);
+ buildstate->spool2 = NULL;
+ }
+
+ return reltuples;
+}
+
+/*
+ * clean up a spool structure and its substructures.
+ */
+static void
+_bt_spooldestroy(BTSpool *btspool)
+{
+ tuplesort_end(btspool->sortstate);
+ pfree(btspool);
+}
+
+/*
+ * spool an index entry into the sort file.
+ */
+static void
+_bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
+{
+ tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
+ self, values, isnull);
+}
+
+/*
+ * given a spool loaded by successive calls to _bt_spool,
+ * create an entire btree.
+ */
+static void
+_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
+{
+ BTWriteState wstate;
+
+#ifdef BTREE_BUILD_STATS
+ if (log_btree_build_stats)
+ {
+ ShowUsage("BTREE BUILD (Spool) STATISTICS");
+ ResetUsage();
+ }
+#endif /* BTREE_BUILD_STATS */
+
+ /* Execute the sort */
+ pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
+ PROGRESS_BTREE_PHASE_PERFORMSORT_1);
+ tuplesort_performsort(btspool->sortstate);
+ if (btspool2)
+ {
+ pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
+ PROGRESS_BTREE_PHASE_PERFORMSORT_2);
+ tuplesort_performsort(btspool2->sortstate);
+ }
+
+ wstate.heap = btspool->heap;
+ wstate.index = btspool->index;
+ wstate.inskey = _bt_mkscankey(wstate.index, NULL);
+ /* _bt_mkscankey() won't set allequalimage without metapage */
+ wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
+ wstate.btws_use_wal = RelationNeedsWAL(wstate.index);
+
+ /* reserve the metapage */
+ wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
+ wstate.btws_pages_written = 0;
+ wstate.btws_zeropage = NULL; /* until needed */
+
+ pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
+ PROGRESS_BTREE_PHASE_LEAF_LOAD);
+ _bt_load(&wstate, btspool, btspool2);
+}
+
+/*
+ * Per-tuple callback for table_index_build_scan
+ */
+static void
+_bt_build_callback(Relation index,
+ ItemPointer tid,
+ Datum *values,
+ bool *isnull,
+ bool tupleIsAlive,
+ void *state)
+{
+ BTBuildState *buildstate = (BTBuildState *) state;
+
+ /*
+ * insert the index tuple into the appropriate spool file for subsequent
+ * processing
+ */
+ if (tupleIsAlive || buildstate->spool2 == NULL)
+ _bt_spool(buildstate->spool, tid, values, isnull);
+ else
+ {
+ /* dead tuples are put into spool2 */
+ buildstate->havedead = true;
+ _bt_spool(buildstate->spool2, tid, values, isnull);
+ }
+
+ buildstate->indtuples += 1;
+}
+
+/*
+ * allocate workspace for a new, clean btree page, not linked to any siblings.
+ */
+static Page
+_bt_blnewpage(uint32 level)
+{
+ Page page;
+ BTPageOpaque opaque;
+
+ page = (Page) palloc(BLCKSZ);
+
+ /* Zero the page and set up standard page header info */
+ _bt_pageinit(page, BLCKSZ);
+
+ /* Initialize BT opaque state */
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ opaque->btpo_prev = opaque->btpo_next = P_NONE;
+ opaque->btpo_level = level;
+ opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
+ opaque->btpo_cycleid = 0;
+
+ /* Make the P_HIKEY line pointer appear allocated */
+ ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
+
+ return page;
+}
+
+/*
+ * emit a completed btree page, and release the working storage.
+ */
+static void
+_bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
+{
+ /* Ensure rd_smgr is open (could have been closed by relcache flush!) */
+ RelationOpenSmgr(wstate->index);
+
+ /* XLOG stuff */
+ if (wstate->btws_use_wal)
+ {
+ /* We use the XLOG_FPI record type for this */
+ log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page, true);
+ }
+
+ /*
+ * If we have to write pages nonsequentially, fill in the space with
+ * zeroes until we come back and overwrite. This is not logically
+ * necessary on standard Unix filesystems (unwritten space will read as
+ * zeroes anyway), but it should help to avoid fragmentation. The dummy
+ * pages aren't WAL-logged though.
+ */
+ while (blkno > wstate->btws_pages_written)
+ {
+ if (!wstate->btws_zeropage)
+ wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
+ /* don't set checksum for all-zero page */
+ smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM,
+ wstate->btws_pages_written++,
+ (char *) wstate->btws_zeropage,
+ true);
+ }
+
+ PageSetChecksumInplace(page, blkno);
+
+ /*
+ * Now write the page. There's no need for smgr to schedule an fsync for
+ * this write; we'll do it ourselves before ending the build.
+ */
+ if (blkno == wstate->btws_pages_written)
+ {
+ /* extending the file... */
+ smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
+ (char *) page, true);
+ wstate->btws_pages_written++;
+ }
+ else
+ {
+ /* overwriting a block we zero-filled before */
+ smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
+ (char *) page, true);
+ }
+
+ pfree(page);
+}
+
+/*
+ * allocate and initialize a new BTPageState. the returned structure
+ * is suitable for immediate use by _bt_buildadd.
+ */
+static BTPageState *
+_bt_pagestate(BTWriteState *wstate, uint32 level)
+{
+ BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
+
+ /* create initial page for level */
+ state->btps_page = _bt_blnewpage(level);
+
+ /* and assign it a page position */
+ state->btps_blkno = wstate->btws_pages_alloced++;
+
+ state->btps_lowkey = NULL;
+ /* initialize lastoff so first item goes into P_FIRSTKEY */
+ state->btps_lastoff = P_HIKEY;
+ state->btps_lastextra = 0;
+ state->btps_level = level;
+ /* set "full" threshold based on level. See notes at head of file. */
+ if (level > 0)
+ state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
+ else
+ state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
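+
+ /*
+ * For example, assuming 8KB pages and the default BTREE_NONLEAF_FILLFACTOR
+ * of 70, internal pages are treated as "full" once less than about 2457
+ * bytes (30% of the page) would remain free.
+ */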
+
+ /* no parent level, yet */
+ state->btps_next = NULL;
+
+ return state;
+}
+
+/*
+ * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to
+ * P_HIKEY, overwriting P_HIKEY).
+ *
+ * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the
+ * rightmost page on its level is not supposed to get a high key. Now that
+ * it's clear that this page is a rightmost page, remove the unneeded empty
+ * P_HIKEY line pointer space.
+ */
+static void
+_bt_slideleft(Page rightmostpage)
+{
+ OffsetNumber off;
+ OffsetNumber maxoff;
+ ItemId previi;
+
+ maxoff = PageGetMaxOffsetNumber(rightmostpage);
+ Assert(maxoff >= P_FIRSTKEY);
+ previi = PageGetItemId(rightmostpage, P_HIKEY);
+ for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
+ {
+ ItemId thisii = PageGetItemId(rightmostpage, off);
+
+ *previi = *thisii;
+ previi = thisii;
+ }
+ ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
+}
+
+/*
+ * Add an item to a page being built.
+ *
+ * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
+ * raises an error directly.
+ *
+ * Note that our nbtsort.c caller does not know yet if the page will be
+ * rightmost. Offset P_FIRSTKEY is always assumed to be the first data key by
+ * caller. A page that turns out to be the rightmost on its level is fixed
+ * up by calling _bt_slideleft().
+ */
+static void
+_bt_sortaddtup(Page page,
+ Size itemsize,
+ IndexTuple itup,
+ OffsetNumber itup_off,
+ bool newfirstdataitem)
+{
+ IndexTupleData trunctuple;
+
+ if (newfirstdataitem)
+ {
+ trunctuple = *itup;
+ trunctuple.t_info = sizeof(IndexTupleData);
+ BTreeTupleSetNAtts(&trunctuple, 0, false);
+ itup = &trunctuple;
+ itemsize = sizeof(IndexTupleData);
+ }
+
+ if (PageAddItem(page, (Item) itup, itemsize, itup_off,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add item to the index page");
+}
+
+/*----------
+ * Add an item to a disk page from the sort output (or add a posting list
+ * item formed from the sort output).
+ *
+ * We must be careful to observe the page layout conventions of nbtsearch.c:
+ * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
+ * - on non-leaf pages, the key portion of the first item need not be
+ * stored, we should store only the link.
+ *
+ * A leaf page being built looks like:
+ *
+ * +----------------+---------------------------------+
+ * | PageHeaderData | linp0 linp1 linp2 ... |
+ * +-----------+----+---------------------------------+
+ * | ... linpN | |
+ * +-----------+--------------------------------------+
+ * | ^ last |
+ * | |
+ * +-------------+------------------------------------+
+ * | | itemN ... |
+ * +-------------+------------------+-----------------+
+ * | ... item3 item2 item1 | "special space" |
+ * +--------------------------------+-----------------+
+ *
+ * Contrast this with the diagram in bufpage.h; note the mismatch
+ * between linps and items. This is because we reserve linp0 as a
+ * placeholder for the pointer to the "high key" item; when we have
+ * filled up the page, we will set linp0 to point to itemN and clear
+ * linpN. On the other hand, if we find this is the last (rightmost)
+ * page, we leave the items alone and slide the linp array over. If
+ * the high key is to be truncated, offset 1 is deleted, and we insert
+ * the truncated high key at offset 1.
+ *
+ * 'last' pointer indicates the last offset added to the page.
+ *
+ * 'truncextra' is the size of the posting list in itup, if any. This
+ * information is stashed for the next call here, when we may benefit
+ * from considering the impact of truncating away the posting list on
+ * the page before deciding to finish the page off. Posting lists are
+ * often relatively large, so it is worth going to the trouble of
+ * accounting for the saving from truncating away the posting list of
+ * the tuple that becomes the high key (that may be the only way to
+ * get close to target free space on the page). Note that this is
+ * only used for the soft fillfactor-wise limit, not the critical hard
+ * limit.
+ *----------
+ */
+static void
+_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
+ Size truncextra)
+{
+ Page npage;
+ BlockNumber nblkno;
+ OffsetNumber last_off;
+ Size last_truncextra;
+ Size pgspc;
+ Size itupsz;
+ bool isleaf;
+
+ /*
+ * This is a handy place to check for cancel interrupts during the btree
+ * load phase of index creation.
+ */
+ CHECK_FOR_INTERRUPTS();
+
+ npage = state->btps_page;
+ nblkno = state->btps_blkno;
+ last_off = state->btps_lastoff;
+ last_truncextra = state->btps_lastextra;
+ state->btps_lastextra = truncextra;
+
+ pgspc = PageGetFreeSpace(npage);
+ itupsz = IndexTupleSize(itup);
+ itupsz = MAXALIGN(itupsz);
+ /* Leaf case has slightly different rules due to suffix truncation */
+ isleaf = (state->btps_level == 0);
+
+ /*
+ * Check whether the new item can fit on a btree page on current level at
+ * all.
+ *
+ * Every newly built index will treat heap TID as part of the keyspace,
+ * which imposes the requirement that new high keys must occasionally have
+ * a heap TID appended within _bt_truncate(). That may leave a new pivot
+ * tuple one or two MAXALIGN() quantums larger than the original
+ * firstright tuple it's derived from. v4 deals with the problem by
+ * decreasing the limit on the size of tuples inserted on the leaf level
+ * by the same small amount. Enforce the new v4+ limit on the leaf level,
+ * and the old limit on internal levels, since pivot tuples may need to
+ * make use of the reserved space. This should never fail on internal
+ * pages.
+ */
+ if (unlikely(itupsz > BTMaxItemSize(npage)))
+ _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
+ itup);
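+
+ /*
+ * (With the default 8KB block size and 8-byte MAXALIGN, these limits work
+ * out to 2704 bytes for leaf tuples versus 2712 bytes on internal levels;
+ * the 8 byte difference is the MAXALIGN() quantum reserved for appending
+ * a heap TID to a new pivot tuple.)
+ */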
+
+ /*
+ * Check to see if current page will fit new item, with space left over to
+ * append a heap TID during suffix truncation when page is a leaf page.
+ *
+ * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
+ * high key with heap TID when finishing off a leaf page, since we rely on
+ * _bt_check_third_page() rejecting oversized non-pivot tuples. On
+ * internal pages we can always fit 3 pivot tuples with larger internal
+ * page tuple limit (includes page high key).
+ *
+ * Most of the time, a page is only "full" in the sense that the soft
+ * fillfactor-wise limit has been exceeded. However, we must always leave
+ * at least two items plus a high key on each page before starting a new
+ * page. Disregard fillfactor and insert on "full" current page if we
+ * don't have the minimum number of items yet. (Note that we deliberately
+ * assume that suffix truncation neither enlarges nor shrinks new high key
+ * when applying soft limit, except when last tuple has a posting list.)
+ */
+ Assert(last_truncextra == 0 || isleaf);
+ if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
+ (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
+ {
+ /*
+ * Finish off the page and write it out.
+ */
+ Page opage = npage;
+ BlockNumber oblkno = nblkno;
+ ItemId ii;
+ ItemId hii;
+ IndexTuple oitup;
+
+ /* Create new page of same level */
+ npage = _bt_blnewpage(state->btps_level);
+
+ /* and assign it a page position */
+ nblkno = wstate->btws_pages_alloced++;
+
+ /*
+ * We copy the last item on the page into the new page, and then
+ * rearrange the old page so that the 'last item' becomes its high key
+ * rather than a true data item. There had better be at least two
+ * items on the page already, else the page would be empty of useful
+ * data.
+ */
+ Assert(last_off > P_FIRSTKEY);
+ ii = PageGetItemId(opage, last_off);
+ oitup = (IndexTuple) PageGetItem(opage, ii);
+ _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY,
+ !isleaf);
+
+ /*
+ * Move 'last' into the high key position on opage. _bt_blnewpage()
+ * allocated empty space for a line pointer when opage was first
+ * created, so this is a matter of rearranging already-allocated space
+ * on page, and initializing high key line pointer. (Actually, leaf
+ * pages must also swap oitup with a truncated version of oitup, which
+ * is sometimes larger than oitup, though never by more than the space
+ * needed to append a heap TID.)
+ */
+ hii = PageGetItemId(opage, P_HIKEY);
+ *hii = *ii;
+ ItemIdSetUnused(ii); /* redundant */
+ ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
+
+ if (isleaf)
+ {
+ IndexTuple lastleft;
+ IndexTuple truncated;
+
+ /*
+ * Truncate away any unneeded attributes from high key on leaf
+ * level. This is only done at the leaf level because downlinks
+ * in internal pages are either negative infinity items, or get
+ * their contents from copying from one level down. See also:
+ * _bt_split().
+ *
+ * We don't try to bias our choice of split point to make it more
+ * likely that _bt_truncate() can truncate away more attributes,
+ * whereas the split point used within _bt_split() is chosen much
+ * more delicately. Even still, the lastleft and firstright
+ * tuples passed to _bt_truncate() here are at least not fully
+ * equal to each other when deduplication is used, unless there is
+ * a large group of duplicates (also, unique index builds usually
+ * have few or no spool2 duplicates). When the split point is
+ * between two unequal tuples, _bt_truncate() will avoid including
+ * a heap TID in the new high key, which is the most important
+ * benefit of suffix truncation.
+ *
+ * Overwrite the old item with new truncated high key directly.
+ * oitup is already located at the physical beginning of tuple
+ * space, so this should directly reuse the existing tuple space.
+ */
+ ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
+ lastleft = (IndexTuple) PageGetItem(opage, ii);
+
+ Assert(IndexTupleSize(oitup) > last_truncextra);
+ truncated = _bt_truncate(wstate->index, lastleft, oitup,
+ wstate->inskey);
+ if (!PageIndexTupleOverwrite(opage, P_HIKEY, (Item) truncated,
+ IndexTupleSize(truncated)))
+ elog(ERROR, "failed to add high key to the index page");
+ pfree(truncated);
+
+ /* oitup should continue to point to the page's high key */
+ hii = PageGetItemId(opage, P_HIKEY);
+ oitup = (IndexTuple) PageGetItem(opage, hii);
+ }
+
+ /*
+ * Link the old page into its parent, using its low key. If we don't
+ * have a parent, we have to create one; this adds a new btree level.
+ */
+ if (state->btps_next == NULL)
+ state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
+
+ Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
+ IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
+ BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
+ P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage)));
+ Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
+ !P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage)));
+ BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
+ _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
+ pfree(state->btps_lowkey);
+
+ /*
+ * Save a copy of the high key from the old page. It is also the low
+ * key for the new page.
+ */
+ state->btps_lowkey = CopyIndexTuple(oitup);
+
+ /*
+ * Set the sibling links for both pages.
+ */
+ {
+ BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
+ BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);
+
+ oopaque->btpo_next = nblkno;
+ nopaque->btpo_prev = oblkno;
+ nopaque->btpo_next = P_NONE; /* redundant */
+ }
+
+ /*
+ * Write out the old page. We never need to touch it again, so we can
+ * free the opage workspace too.
+ */
+ _bt_blwritepage(wstate, opage, oblkno);
+
+ /*
+ * Reset last_off to point to new page
+ */
+ last_off = P_FIRSTKEY;
+ }
+
+ /*
+ * By here, either original page is still the current page, or a new page
+ * was created that became the current page. Either way, the current page
+ * definitely has space for new item.
+ *
+ * If the new item is the first for its page, it must also be the first
+ * item on its entire level. On later same-level pages, a low key for a
+ * page will be copied from the prior page in the code above. Generate a
+ * minus infinity low key here instead.
+ */
+ if (last_off == P_HIKEY)
+ {
+ Assert(state->btps_lowkey == NULL);
+ state->btps_lowkey = palloc0(sizeof(IndexTupleData));
+ state->btps_lowkey->t_info = sizeof(IndexTupleData);
+ BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
+ }
+
+ /*
+ * Add the new item into the current page.
+ */
+ last_off = OffsetNumberNext(last_off);
+ _bt_sortaddtup(npage, itupsz, itup, last_off,
+ !isleaf && last_off == P_FIRSTKEY);
+
+ state->btps_page = npage;
+ state->btps_blkno = nblkno;
+ state->btps_lastoff = last_off;
+}
+
+/*
+ * Finalize pending posting list tuple, and add it to the index. Final tuple
+ * is based on saved base tuple, and saved list of heap TIDs.
+ *
+ * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
+ * using _bt_buildadd().
+ */
+static void
+_bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
+ BTDedupState dstate)
+{
+ Assert(dstate->nitems > 0);
+
+ if (dstate->nitems == 1)
+ _bt_buildadd(wstate, state, dstate->base, 0);
+ else
+ {
+ IndexTuple postingtuple;
+ Size truncextra;
+
+ /* form a tuple with a posting list */
+ postingtuple = _bt_form_posting(dstate->base,
+ dstate->htids,
+ dstate->nhtids);
+ /* Calculate posting list overhead */
+ truncextra = IndexTupleSize(postingtuple) -
+ BTreeTupleGetPostingOffset(postingtuple);
+
+ _bt_buildadd(wstate, state, postingtuple, truncextra);
+ pfree(postingtuple);
+ }
+
+ dstate->nmaxitems = 0;
+ dstate->nhtids = 0;
+ dstate->nitems = 0;
+ dstate->phystupsize = 0;
+}
+
+/*
+ * Finish writing out the completed btree.
+ */
+static void
+_bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
+{
+ BTPageState *s;
+ BlockNumber rootblkno = P_NONE;
+ uint32 rootlevel = 0;
+ Page metapage;
+
+ /*
+ * Each iteration of this loop completes one more level of the tree.
+ */
+ for (s = state; s != NULL; s = s->btps_next)
+ {
+ BlockNumber blkno;
+ BTPageOpaque opaque;
+
+ blkno = s->btps_blkno;
+ opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);
+
+ /*
+ * We have to link the last page on this level to somewhere.
+ *
+ * If we're at the top, it's the root, so attach it to the metapage.
+ * Otherwise, add an entry for it to its parent using its low key.
+ * This may cause the last page of the parent level to split, but
+ * that's not a problem -- we haven't gotten to it yet.
+ */
+ if (s->btps_next == NULL)
+ {
+ opaque->btpo_flags |= BTP_ROOT;
+ rootblkno = blkno;
+ rootlevel = s->btps_level;
+ }
+ else
+ {
+ Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <=
+ IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
+ BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
+ P_LEFTMOST(opaque));
+ Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
+ !P_LEFTMOST(opaque));
+ BTreeTupleSetDownLink(s->btps_lowkey, blkno);
+ _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
+ pfree(s->btps_lowkey);
+ s->btps_lowkey = NULL;
+ }
+
+ /*
+ * This is the rightmost page, so the ItemId array needs to be slid
+ * back one slot. Then we can dump out the page.
+ */
+ _bt_slideleft(s->btps_page);
+ _bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
+ s->btps_page = NULL; /* writepage freed the workspace */
+ }
+
+ /*
+ * As the last step in the process, construct the metapage and make it
+ * point to the new root (unless we had no data at all, in which case it's
+ * set to point to "P_NONE"). This changes the index to the "valid" state
+ * by filling in a valid magic number in the metapage.
+ */
+ metapage = (Page) palloc(BLCKSZ);
+ _bt_initmetapage(metapage, rootblkno, rootlevel,
+ wstate->inskey->allequalimage);
+ _bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
+}
+
+/*
+ * Read tuples in correct sort order from tuplesort, and load them into
+ * btree leaves.
+ */
+static void
+_bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
+{
+ BTPageState *state = NULL;
+ bool merge = (btspool2 != NULL);
+ IndexTuple itup,
+ itup2 = NULL;
+ bool load1;
+ TupleDesc tupdes = RelationGetDescr(wstate->index);
+ int i,
+ keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
+ SortSupport sortKeys;
+ int64 tuples_done = 0;
+ bool deduplicate;
+
+ deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
+ BTGetDeduplicateItems(wstate->index);
+
+ if (merge)
+ {
+ /*
+ * Another BTSpool for dead tuples exists. Now we have to merge
+ * btspool and btspool2.
+ */
+
+ /* the preparation of merge */
+ itup = tuplesort_getindextuple(btspool->sortstate, true);
+ itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
+
+ /* Prepare SortSupport data for each column */
+ sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));
+
+ for (i = 0; i < keysz; i++)
+ {
+ SortSupport sortKey = sortKeys + i;
+ ScanKey scanKey = wstate->inskey->scankeys + i;
+ int16 strategy;
+
+ sortKey->ssup_cxt = CurrentMemoryContext;
+ sortKey->ssup_collation = scanKey->sk_collation;
+ sortKey->ssup_nulls_first =
+ (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
+ sortKey->ssup_attno = scanKey->sk_attno;
+ /* Abbreviation is not supported here */
+ sortKey->abbreviate = false;
+
+ AssertState(sortKey->ssup_attno != 0);
+
+ strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
+ BTGreaterStrategyNumber : BTLessStrategyNumber;
+
+ PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey);
+ }
+
+ for (;;)
+ {
+ load1 = true; /* load BTSpool next ? */
+ if (itup2 == NULL)
+ {
+ if (itup == NULL)
+ break;
+ }
+ else if (itup != NULL)
+ {
+ int32 compare = 0;
+
+ for (i = 1; i <= keysz; i++)
+ {
+ SortSupport entry;
+ Datum attrDatum1,
+ attrDatum2;
+ bool isNull1,
+ isNull2;
+
+ entry = sortKeys + i - 1;
+ attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
+ attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
+
+ compare = ApplySortComparator(attrDatum1, isNull1,
+ attrDatum2, isNull2,
+ entry);
+ if (compare > 0)
+ {
+ load1 = false;
+ break;
+ }
+ else if (compare < 0)
+ break;
+ }
+
+ /*
+ * If key values are equal, we sort on ItemPointer. This is
+ * required for btree indexes, since heap TID is treated as an
+ * implicit last key attribute in order to ensure that all
+ * keys in the index are physically unique.
+ */
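+ /*
+ * For example, two equal-keyed entries pointing at heap TIDs (0,1)
+ * and (3,7) sort with (0,1) first: ItemPointerCompare() orders by
+ * block number and then by offset number.
+ */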
+ if (compare == 0)
+ {
+ compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
+ Assert(compare != 0);
+ if (compare > 0)
+ load1 = false;
+ }
+ }
+ else
+ load1 = false;
+
+ /* When we see first tuple, create first index page */
+ if (state == NULL)
+ state = _bt_pagestate(wstate, 0);
+
+ if (load1)
+ {
+ _bt_buildadd(wstate, state, itup, 0);
+ itup = tuplesort_getindextuple(btspool->sortstate, true);
+ }
+ else
+ {
+ _bt_buildadd(wstate, state, itup2, 0);
+ itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
+ }
+
+ /* Report progress */
+ pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
+ ++tuples_done);
+ }
+ pfree(sortKeys);
+ }
+ else if (deduplicate)
+ {
+ /* merge is unnecessary, deduplicate into posting lists */
+ BTDedupState dstate;
+
+ dstate = (BTDedupState) palloc(sizeof(BTDedupStateData));
+ dstate->deduplicate = true; /* unused */
+ dstate->nmaxitems = 0; /* unused */
+ dstate->maxpostingsize = 0; /* set later */
+ /* Metadata about base tuple of current pending posting list */
+ dstate->base = NULL;
+ dstate->baseoff = InvalidOffsetNumber; /* unused */
+ dstate->basetupsize = 0;
+ /* Metadata about current pending posting list TIDs */
+ dstate->htids = NULL;
+ dstate->nhtids = 0;
+ dstate->nitems = 0;
+ dstate->phystupsize = 0; /* unused */
+ dstate->nintervals = 0; /* unused */
+
+ while ((itup = tuplesort_getindextuple(btspool->sortstate,
+ true)) != NULL)
+ {
+ /* When we see first tuple, create first index page */
+ if (state == NULL)
+ {
+ state = _bt_pagestate(wstate, 0);
+
+ /*
+ * Limit size of posting list tuples to 1/10 space we want to
+ * leave behind on the page, plus space for final item's line
+ * pointer. This is equal to the space that we'd like to
+ * leave behind on each leaf page when fillfactor is 90,
+ * allowing us to get close to fillfactor% space utilization
+ * when there happen to be a great many duplicates. (This
+ * makes higher leaf fillfactor settings ineffective when
+ * building indexes that have many duplicates, but packing
+ * leaf pages full with few very large tuples doesn't seem
+ * like a useful goal.)
+ */
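+ /*
+ * For example, with 8KB pages and 8-byte MAXALIGN this works out
+ * to MAXALIGN_DOWN(819) - 4 = 812 bytes per posting list tuple.
+ */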
+ dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
+ sizeof(ItemIdData);
+ Assert(dstate->maxpostingsize <= BTMaxItemSize(state->btps_page) &&
+ dstate->maxpostingsize <= INDEX_SIZE_MASK);
+ dstate->htids = palloc(dstate->maxpostingsize);
+
+ /* start new pending posting list with itup copy */
+ _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
+ InvalidOffsetNumber);
+ }
+ else if (_bt_keep_natts_fast(wstate->index, dstate->base,
+ itup) > keysz &&
+ _bt_dedup_save_htid(dstate, itup))
+ {
+ /*
+ * Tuple is equal to base tuple of pending posting list. Heap
+ * TID from itup has been saved in state.
+ */
+ }
+ else
+ {
+ /*
+ * Tuple is not equal to pending posting list tuple, or
+ * _bt_dedup_save_htid() opted to not merge current item into
+ * pending posting list.
+ */
+ _bt_sort_dedup_finish_pending(wstate, state, dstate);
+ pfree(dstate->base);
+
+ /* start new pending posting list with itup copy */
+ _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
+ InvalidOffsetNumber);
+ }
+
+ /* Report progress */
+ pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
+ ++tuples_done);
+ }
+
+ if (state)
+ {
+ /*
+ * Handle the last item (there must be a last item when the
+ * tuplesort returned one or more tuples)
+ */
+ _bt_sort_dedup_finish_pending(wstate, state, dstate);
+ pfree(dstate->base);
+ pfree(dstate->htids);
+ }
+
+ pfree(dstate);
+ }
+ else
+ {
+ /* merging and deduplication are both unnecessary */
+ while ((itup = tuplesort_getindextuple(btspool->sortstate,
+ true)) != NULL)
+ {
+ /* When we see first tuple, create first index page */
+ if (state == NULL)
+ state = _bt_pagestate(wstate, 0);
+
+ _bt_buildadd(wstate, state, itup, 0);
+
+ /* Report progress */
+ pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
+ ++tuples_done);
+ }
+ }
+
+ /* Close down final pages and write the metapage */
+ _bt_uppershutdown(wstate, state);
+
+ /*
+ * Although we WAL-logged index pages, we must nonetheless fsync index files.
+ * Since we're building outside shared buffers, a CHECKPOINT occurring
+ * during the build has no way to flush the previously written data to
+ * disk (indeed it won't know the index even exists). A crash later on
+ * would replay WAL from the checkpoint, therefore it wouldn't replay our
+ * earlier WAL entries. If we do not fsync those pages here, they might
+ * still not be on disk when the crash occurs.
+ */
+ if (wstate->btws_use_wal)
+ {
+ RelationOpenSmgr(wstate->index);
+ smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
+ }
+}
+
+/*
+ * Create parallel context, and launch workers for leader.
+ *
+ * buildstate argument should be initialized (with the exception of the
+ * tuplesort state in spools, which may later be created based on shared
+ * state initially set up here).
+ *
+ * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
+ *
+ * request is the target number of parallel worker processes to launch.
+ *
+ * Sets buildstate's BTLeader, which caller must use to shut down parallel
+ * mode by passing it to _bt_end_parallel() at the very end of its index
+ * build. If not even a single worker process can be launched, this is
+ * never set, and caller should proceed with a serial index build.
+ */
+static void
+_bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
+{
+ ParallelContext *pcxt;
+ int scantuplesortstates;
+ Snapshot snapshot;
+ Size estbtshared;
+ Size estsort;
+ BTShared *btshared;
+ Sharedsort *sharedsort;
+ Sharedsort *sharedsort2;
+ BTSpool *btspool = buildstate->spool;
+ BTLeader *btleader = (BTLeader *) palloc0(sizeof(BTLeader));
+ WalUsage *walusage;
+ BufferUsage *bufferusage;
+ bool leaderparticipates = true;
+ int querylen;
+
+#ifdef DISABLE_LEADER_PARTICIPATION
+ leaderparticipates = false;
+#endif
+
+ /*
+ * Enter parallel mode, and create context for parallel build of btree
+ * index
+ */
+ EnterParallelMode();
+ Assert(request > 0);
+ pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
+ request);
+
+ scantuplesortstates = leaderparticipates ? request + 1 : request;
+
+ /*
+ * Prepare for scan of the base relation. In a normal index build, we use
+ * SnapshotAny because we must retrieve all tuples and do our own time
+ * qual checks (because we have to index RECENTLY_DEAD tuples). In a
+ * concurrent build, we take a regular MVCC snapshot and index whatever's
+ * live according to that.
+ */
+ if (!isconcurrent)
+ snapshot = SnapshotAny;
+ else
+ snapshot = RegisterSnapshot(GetTransactionSnapshot());
+
+ /*
+ * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
+ * PARALLEL_KEY_TUPLESORT tuplesort workspace
+ */
+ estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot);
+ shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
+ estsort = tuplesort_estimate_shared(scantuplesortstates);
+ shm_toc_estimate_chunk(&pcxt->estimator, estsort);
+
+ /*
+ * Unique case requires a second spool, and so we may have to account for
+ * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
+ */
+ if (!btspool->isunique)
+ shm_toc_estimate_keys(&pcxt->estimator, 2);
+ else
+ {
+ shm_toc_estimate_chunk(&pcxt->estimator, estsort);
+ shm_toc_estimate_keys(&pcxt->estimator, 3);
+ }
+
+ /*
+ * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
+ * and PARALLEL_KEY_BUFFER_USAGE.
+ *
+ * If there are no extensions loaded that care, we could skip this. We
+ * have no way of knowing whether anyone's looking at pgWalUsage or
+ * pgBufferUsage, so do it unconditionally.
+ */
+ shm_toc_estimate_chunk(&pcxt->estimator,
+ mul_size(sizeof(WalUsage), pcxt->nworkers));
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+ shm_toc_estimate_chunk(&pcxt->estimator,
+ mul_size(sizeof(BufferUsage), pcxt->nworkers));
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+
+ /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
+ if (debug_query_string)
+ {
+ querylen = strlen(debug_query_string);
+ shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+ }
+ else
+ querylen = 0; /* keep compiler quiet */
+
+ /* Everyone's had a chance to ask for space, so now create the DSM */
+ InitializeParallelDSM(pcxt);
+
+ /* If no DSM segment was available, back out (do serial build) */
+ if (pcxt->seg == NULL)
+ {
+ if (IsMVCCSnapshot(snapshot))
+ UnregisterSnapshot(snapshot);
+ DestroyParallelContext(pcxt);
+ ExitParallelMode();
+ return;
+ }
+
+ /* Store shared build state, for which we reserved space */
+ btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
+ /* Initialize immutable state */
+ btshared->heaprelid = RelationGetRelid(btspool->heap);
+ btshared->indexrelid = RelationGetRelid(btspool->index);
+ btshared->isunique = btspool->isunique;
+ btshared->isconcurrent = isconcurrent;
+ btshared->scantuplesortstates = scantuplesortstates;
+ ConditionVariableInit(&btshared->workersdonecv);
+ SpinLockInit(&btshared->mutex);
+ /* Initialize mutable state */
+ btshared->nparticipantsdone = 0;
+ btshared->reltuples = 0.0;
+ btshared->havedead = false;
+ btshared->indtuples = 0.0;
+ btshared->brokenhotchain = false;
+ table_parallelscan_initialize(btspool->heap,
+ ParallelTableScanFromBTShared(btshared),
+ snapshot);
+
+ /*
+ * Store shared tuplesort-private state, for which we reserved space.
+ * Then, initialize opaque state using tuplesort routine.
+ */
+ sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
+ tuplesort_initialize_shared(sharedsort, scantuplesortstates,
+ pcxt->seg);
+
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
+
+ /* Unique case requires a second spool, and associated shared state */
+ if (!btspool->isunique)
+ sharedsort2 = NULL;
+ else
+ {
+ /*
+ * Store additional shared tuplesort-private state, for which we
+ * reserved space. Then, initialize opaque state using tuplesort
+ * routine.
+ */
+ sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
+ tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
+ pcxt->seg);
+
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
+ }
+
+ /* Store query string for workers */
+ if (debug_query_string)
+ {
+ char *sharedquery;
+
+ sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
+ memcpy(sharedquery, debug_query_string, querylen + 1);
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
+ }
+
+ /*
+ * Allocate space for each worker's WalUsage and BufferUsage; no need to
+ * initialize.
+ */
+ walusage = shm_toc_allocate(pcxt->toc,
+ mul_size(sizeof(WalUsage), pcxt->nworkers));
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
+ bufferusage = shm_toc_allocate(pcxt->toc,
+ mul_size(sizeof(BufferUsage), pcxt->nworkers));
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
+
+ /* Launch workers, saving status for leader/caller */
+ LaunchParallelWorkers(pcxt);
+ btleader->pcxt = pcxt;
+ btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
+ if (leaderparticipates)
+ btleader->nparticipanttuplesorts++;
+ btleader->btshared = btshared;
+ btleader->sharedsort = sharedsort;
+ btleader->sharedsort2 = sharedsort2;
+ btleader->snapshot = snapshot;
+ btleader->walusage = walusage;
+ btleader->bufferusage = bufferusage;
+
+ /* If no workers were successfully launched, back out (do serial build) */
+ if (pcxt->nworkers_launched == 0)
+ {
+ _bt_end_parallel(btleader);
+ return;
+ }
+
+ /* Save leader state now that it's clear build will be parallel */
+ buildstate->btleader = btleader;
+
+ /* Join heap scan ourselves */
+ if (leaderparticipates)
+ _bt_leader_participate_as_worker(buildstate);
+
+ /*
+ * Caller needs to wait for all launched workers when we return. Make
+ * sure that the failure-to-start case will not hang forever.
+ */
+ WaitForParallelWorkersToAttach(pcxt);
+}
+
+/*
+ * Shut down workers, destroy parallel context, and end parallel mode.
+ */
+static void
+_bt_end_parallel(BTLeader *btleader)
+{
+ int i;
+
+ /* Shutdown worker processes */
+ WaitForParallelWorkersToFinish(btleader->pcxt);
+
+ /*
+ * Next, accumulate WAL usage. (This must wait for the workers to finish,
+ * or we might get incomplete data.)
+ */
+ for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
+ InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
+
+ /* Free last reference to MVCC snapshot, if one was used */
+ if (IsMVCCSnapshot(btleader->snapshot))
+ UnregisterSnapshot(btleader->snapshot);
+ DestroyParallelContext(btleader->pcxt);
+ ExitParallelMode();
+}
+
+/*
+ * Returns size of shared memory required to store state for a parallel
+ * btree index build based on the snapshot its parallel scan will use.
+ */
+static Size
+_bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
+{
+ /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
+ return add_size(BUFFERALIGN(sizeof(BTShared)),
+ table_parallelscan_estimate(heap, snapshot));
+}
+
+/*
+ * Within leader, wait for end of heap scan.
+ *
+ * When called, parallel heap scan started by _bt_begin_parallel() will
+ * already be underway within worker processes (when leader participates
+ * as a worker, we should end up here just as workers are finishing).
+ *
+ * Fills in fields needed for ambuild statistics, and lets caller set
+ * field indicating that some worker encountered a broken HOT chain.
+ *
+ * Returns the total number of heap tuples scanned.
+ */
+static double
+_bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
+{
+ BTShared *btshared = buildstate->btleader->btshared;
+ int nparticipanttuplesorts;
+ double reltuples;
+
+ nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
+ for (;;)
+ {
+ SpinLockAcquire(&btshared->mutex);
+ if (btshared->nparticipantsdone == nparticipanttuplesorts)
+ {
+ buildstate->havedead = btshared->havedead;
+ buildstate->indtuples = btshared->indtuples;
+ *brokenhotchain = btshared->brokenhotchain;
+ reltuples = btshared->reltuples;
+ SpinLockRelease(&btshared->mutex);
+ break;
+ }
+ SpinLockRelease(&btshared->mutex);
+
+ ConditionVariableSleep(&btshared->workersdonecv,
+ WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
+ }
+
+ ConditionVariableCancelSleep();
+
+ return reltuples;
+}
+
+/*
+ * Within leader, participate as a parallel worker.
+ */
+static void
+_bt_leader_participate_as_worker(BTBuildState *buildstate)
+{
+ BTLeader *btleader = buildstate->btleader;
+ BTSpool *leaderworker;
+ BTSpool *leaderworker2;
+ int sortmem;
+
+ /* Allocate memory and initialize private spool */
+ leaderworker = (BTSpool *) palloc0(sizeof(BTSpool));
+ leaderworker->heap = buildstate->spool->heap;
+ leaderworker->index = buildstate->spool->index;
+ leaderworker->isunique = buildstate->spool->isunique;
+
+ /* Initialize second spool, if required */
+ if (!btleader->btshared->isunique)
+ leaderworker2 = NULL;
+ else
+ {
+ /* Allocate memory for worker's own private secondary spool */
+ leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool));
+
+ /* Initialize worker's own secondary spool */
+ leaderworker2->heap = leaderworker->heap;
+ leaderworker2->index = leaderworker->index;
+ leaderworker2->isunique = false;
+ }
+
+ /*
+ * Might as well use a reliable figure when doling out maintenance_work_mem
+ * (when the requested number of workers was not launched, this will be
+ * somewhat higher than it is for the other workers).
+ */
+ sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
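+
+ /*
+ * For example, if four workers were requested but only two launched, each
+ * launched worker divides maintenance_work_mem by five (the planned number
+ * of participants), while the leader divides by three here, giving it a
+ * somewhat larger share.
+ */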
+
+ /* Perform work common to all participants */
+ _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
+ btleader->sharedsort, btleader->sharedsort2,
+ sortmem, true);
+
+#ifdef BTREE_BUILD_STATS
+ if (log_btree_build_stats)
+ {
+ ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
+ ResetUsage();
+ }
+#endif /* BTREE_BUILD_STATS */
+}
+
+/*
+ * Perform work within a launched parallel process.
+ */
+void
+_bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
+{
+ char *sharedquery;
+ BTSpool *btspool;
+ BTSpool *btspool2;
+ BTShared *btshared;
+ Sharedsort *sharedsort;
+ Sharedsort *sharedsort2;
+ Relation heapRel;
+ Relation indexRel;
+ LOCKMODE heapLockmode;
+ LOCKMODE indexLockmode;
+ WalUsage *walusage;
+ BufferUsage *bufferusage;
+ int sortmem;
+
+#ifdef BTREE_BUILD_STATS
+ if (log_btree_build_stats)
+ ResetUsage();
+#endif /* BTREE_BUILD_STATS */
+
+ /*
+ * The only possible status flag that can be set for the parallel worker is
+ * PROC_IN_SAFE_IC.
+ */
+ Assert((MyProc->statusFlags == 0) ||
+ (MyProc->statusFlags == PROC_IN_SAFE_IC));
+
+ /* Set debug_query_string for individual workers first */
+ sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
+ debug_query_string = sharedquery;
+
+ /* Report the query string from leader */
+ pgstat_report_activity(STATE_RUNNING, debug_query_string);
+
+ /* Look up nbtree shared state */
+ btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
+
+ /* Open relations using lock modes known to be obtained by index.c */
+ if (!btshared->isconcurrent)
+ {
+ heapLockmode = ShareLock;
+ indexLockmode = AccessExclusiveLock;
+ }
+ else
+ {
+ heapLockmode = ShareUpdateExclusiveLock;
+ indexLockmode = RowExclusiveLock;
+ }
+
+ /* Open relations within worker */
+ heapRel = table_open(btshared->heaprelid, heapLockmode);
+ indexRel = index_open(btshared->indexrelid, indexLockmode);
+
+ /* Initialize worker's own spool */
+ btspool = (BTSpool *) palloc0(sizeof(BTSpool));
+ btspool->heap = heapRel;
+ btspool->index = indexRel;
+ btspool->isunique = btshared->isunique;
+
+ /* Look up shared state private to tuplesort.c */
+ sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
+ tuplesort_attach_shared(sharedsort, seg);
+ if (!btshared->isunique)
+ {
+ btspool2 = NULL;
+ sharedsort2 = NULL;
+ }
+ else
+ {
+ /* Allocate memory for worker's own private secondary spool */
+ btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
+
+ /* Initialize worker's own secondary spool */
+ btspool2->heap = btspool->heap;
+ btspool2->index = btspool->index;
+ btspool2->isunique = false;
+ /* Look up shared state private to tuplesort.c */
+ sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
+ tuplesort_attach_shared(sharedsort2, seg);
+ }
+
+ /* Prepare to track buffer usage during parallel execution */
+ InstrStartParallelQuery();
+
+ /* Perform sorting of spool, and possibly a spool2 */
+ sortmem = maintenance_work_mem / btshared->scantuplesortstates;
+ _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
+ sharedsort2, sortmem, false);
+
+ /* Report WAL/buffer usage during parallel execution */
+ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
+ walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
+ InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
+ &walusage[ParallelWorkerNumber]);
+
+#ifdef BTREE_BUILD_STATS
+ if (log_btree_build_stats)
+ {
+ ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
+ ResetUsage();
+ }
+#endif /* BTREE_BUILD_STATS */
+
+ index_close(indexRel, indexLockmode);
+ table_close(heapRel, heapLockmode);
+}
+
+/*
+ * Perform a worker's portion of a parallel sort.
+ *
+ * This generates a tuplesort for passed btspool, and a second tuplesort
+ * state if a second btspool is needed (i.e. for unique index builds). All
+ * other spool fields should already be set when this is called.
+ *
+ * sortmem is the amount of working memory to use within each worker,
+ * expressed in KBs.
+ *
+ * When this returns, workers are done, and need only release resources.
+ */
+static void
+_bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
+ BTShared *btshared, Sharedsort *sharedsort,
+ Sharedsort *sharedsort2, int sortmem, bool progress)
+{
+ SortCoordinate coordinate;
+ BTBuildState buildstate;
+ TableScanDesc scan;
+ double reltuples;
+ IndexInfo *indexInfo;
+
+ /* Initialize local tuplesort coordination state */
+ coordinate = palloc0(sizeof(SortCoordinateData));
+ coordinate->isWorker = true;
+ coordinate->nParticipants = -1;
+ coordinate->sharedsort = sharedsort;
+
+ /* Begin "partial" tuplesort */
+ btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
+ btspool->index,
+ btspool->isunique,
+ sortmem, coordinate,
+ false);
+
+ /*
+ * Just as with serial case, there may be a second spool. If so, a
+ * second, dedicated spool2 partial tuplesort is required.
+ */
+ if (btspool2)
+ {
+ SortCoordinate coordinate2;
+
+ /*
+ * We expect that the second one (for dead tuples) won't get very
+ * full, so we give it only work_mem (unless sortmem is less for
+ * worker). Worker processes are generally permitted to allocate
+ * work_mem independently.
+ */
+ coordinate2 = palloc0(sizeof(SortCoordinateData));
+ coordinate2->isWorker = true;
+ coordinate2->nParticipants = -1;
+ coordinate2->sharedsort = sharedsort2;
+ btspool2->sortstate =
+ tuplesort_begin_index_btree(btspool->heap, btspool->index, false,
+ Min(sortmem, work_mem), coordinate2,
+ false);
+ }
+
+ /* Fill in buildstate for _bt_build_callback() */
+ buildstate.isunique = btshared->isunique;
+ buildstate.havedead = false;
+ buildstate.heap = btspool->heap;
+ buildstate.spool = btspool;
+ buildstate.spool2 = btspool2;
+ buildstate.indtuples = 0;
+ buildstate.btleader = NULL;
+
+ /* Join parallel scan */
+ indexInfo = BuildIndexInfo(btspool->index);
+ indexInfo->ii_Concurrent = btshared->isconcurrent;
+ scan = table_beginscan_parallel(btspool->heap,
+ ParallelTableScanFromBTShared(btshared));
+ reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
+ true, progress, _bt_build_callback,
+ (void *) &buildstate, scan);
+
+ /* Execute this worker's part of the sort */
+ if (progress)
+ pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
+ PROGRESS_BTREE_PHASE_PERFORMSORT_1);
+ tuplesort_performsort(btspool->sortstate);
+ if (btspool2)
+ {
+ if (progress)
+ pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
+ PROGRESS_BTREE_PHASE_PERFORMSORT_2);
+ tuplesort_performsort(btspool2->sortstate);
+ }
+
+ /*
+ * Done. Record ambuild statistics, and whether we encountered a broken
+ * HOT chain.
+ */
+ SpinLockAcquire(&btshared->mutex);
+ btshared->nparticipantsdone++;
+ btshared->reltuples += reltuples;
+ if (buildstate.havedead)
+ btshared->havedead = true;
+ btshared->indtuples += buildstate.indtuples;
+ if (indexInfo->ii_BrokenHotChain)
+ btshared->brokenhotchain = true;
+ SpinLockRelease(&btshared->mutex);
+
+ /* Notify leader */
+ ConditionVariableSignal(&btshared->workersdonecv);
+
+ /* We can end tuplesorts immediately */
+ tuplesort_end(btspool->sortstate);
+ if (btspool2)
+ tuplesort_end(btspool2->sortstate);
+}
diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c
new file mode 100644
index 0000000..3485e93
--- /dev/null
+++ b/src/backend/access/nbtree/nbtsplitloc.c
@@ -0,0 +1,1190 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtsplitloc.c
+ * Choose split point code for Postgres btree implementation.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtsplitloc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "storage/lmgr.h"
+
+typedef enum
+{
+ /* strategy for searching through materialized list of split points */
+ SPLIT_DEFAULT, /* give some weight to truncation */
+ SPLIT_MANY_DUPLICATES, /* find minimally distinguishing point */
+ SPLIT_SINGLE_VALUE /* leave left page almost full */
+} FindSplitStrat;
+
+typedef struct
+{
+ /* details of free space left by split */
+ int16 curdelta; /* current leftfree/rightfree delta */
+ int16 leftfree; /* space left on left page post-split */
+ int16 rightfree; /* space left on right page post-split */
+
+ /* split point identifying fields (returned by _bt_findsplitloc) */
+ OffsetNumber firstrightoff; /* first origpage item on rightpage */
+ bool newitemonleft; /* new item goes on left, or right? */
+
+} SplitPoint;
+
+typedef struct
+{
+ /* context data for _bt_recsplitloc */
+ Relation rel; /* index relation */
+ Page origpage; /* page undergoing split */
+ IndexTuple newitem; /* new item (cause of page split) */
+ Size newitemsz; /* size of newitem (includes line pointer) */
+ bool is_leaf; /* T if splitting a leaf page */
+ bool is_rightmost; /* T if splitting rightmost page on level */
+ OffsetNumber newitemoff; /* where the new item is to be inserted */
+ int leftspace; /* space available for items on left page */
+ int rightspace; /* space available for items on right page */
+ int olddataitemstotal; /* space taken by old items */
+ Size minfirstrightsz; /* smallest firstright size */
+
+ /* candidate split point data */
+ int maxsplits; /* maximum number of splits */
+ int nsplits; /* current number of splits */
+ SplitPoint *splits; /* all candidate split points for page */
+ int interval; /* current range of acceptable split points */
+} FindSplitData;
+
+static void _bt_recsplitloc(FindSplitData *state,
+ OffsetNumber firstrightoff, bool newitemonleft,
+ int olddataitemstoleft,
+ Size firstrightofforigpagetuplesz);
+static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult,
+ bool usemult);
+static int _bt_splitcmp(const void *arg1, const void *arg2);
+static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
+ int leaffillfactor, bool *usemult);
+static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid);
+static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
+ bool *newitemonleft, FindSplitStrat strategy);
+static int _bt_defaultinterval(FindSplitData *state);
+static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage,
+ SplitPoint *rightpage, FindSplitStrat *strategy);
+static void _bt_interval_edges(FindSplitData *state,
+ SplitPoint **leftinterval, SplitPoint **rightinterval);
+static inline int _bt_split_penalty(FindSplitData *state, SplitPoint *split);
+static inline IndexTuple _bt_split_lastleft(FindSplitData *state,
+ SplitPoint *split);
+static inline IndexTuple _bt_split_firstright(FindSplitData *state,
+ SplitPoint *split);
+
+
+/*
+ * _bt_findsplitloc() -- find an appropriate place to split a page.
+ *
+ * The main goal here is to equalize the free space that will be on each
+ * split page, *after accounting for the inserted tuple*. (If we fail to
+ * account for it, we might find ourselves with too little room on the page
+ * that it needs to go into!)
+ *
+ * If the page is the rightmost page on its level, we instead try to arrange
+ * to leave the left split page fillfactor% full. In this way, when we are
+ * inserting successively increasing keys (consider sequences, timestamps,
+ * etc) we will end up with a tree whose pages are about fillfactor% full,
+ * instead of the 50% full result that we'd get without this special case.
+ * This is the same as nbtsort.c produces for a newly-created tree. Note
+ * that leaf and nonleaf pages use different fillfactors. Note also that
+ * there are a number of further special cases where fillfactor is not
+ * applied in the standard way.
+ *
+ * We are passed the intended insert position of the new tuple, expressed as
+ * the offsetnumber of the tuple it must go in front of (this could be
+ * maxoff+1 if the tuple is to go at the end). The new tuple itself is also
+ * passed, since it's needed to give some weight to how effective suffix
+ * truncation will be. The implementation picks the split point that
+ * maximizes the effectiveness of suffix truncation from a small list of
+ * alternative candidate split points that leave each side of the split with
+ * about the same share of free space. Suffix truncation is secondary to
+ * equalizing free space, except in cases with large numbers of duplicates.
+ * Note that it is always assumed that caller goes on to perform truncation,
+ * even with pg_upgrade'd indexes where that isn't actually the case
+ * (!heapkeyspace indexes). See nbtree/README for more information about
+ * suffix truncation.
+ *
+ * We return the index of the first existing tuple that should go on the
+ * righthand page (which is called firstrightoff), plus a boolean
+ * indicating whether the new tuple goes on the left or right page. You
+ * can think of the returned state as a point _between_ two adjacent data
+ * items (lastleft and firstright data items) on an imaginary version of
+ * origpage that already includes newitem. The bool is necessary to
+ * disambiguate the case where firstrightoff == newitemoff (i.e. it is
+ * sometimes needed to determine if the firstright tuple for the split is
+ * newitem rather than the tuple from origpage at offset firstrightoff).
+ */
+OffsetNumber
+_bt_findsplitloc(Relation rel,
+ Page origpage,
+ OffsetNumber newitemoff,
+ Size newitemsz,
+ IndexTuple newitem,
+ bool *newitemonleft)
+{
+ BTPageOpaque opaque;
+ int leftspace,
+ rightspace,
+ olddataitemstotal,
+ olddataitemstoleft,
+ perfectpenalty,
+ leaffillfactor;
+ FindSplitData state;
+ FindSplitStrat strategy;
+ ItemId itemid;
+ OffsetNumber offnum,
+ maxoff,
+ firstrightoff;
+ double fillfactormult;
+ bool usemult;
+ SplitPoint leftpage,
+ rightpage;
+
+ opaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
+ maxoff = PageGetMaxOffsetNumber(origpage);
+
+ /* Total free space available on a btree page, after fixed overhead */
+ leftspace = rightspace =
+ PageGetPageSize(origpage) - SizeOfPageHeaderData -
+ MAXALIGN(sizeof(BTPageOpaqueData));
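+
+ /*
+ * (For 8KB pages this comes to 8192 - 24 - 16 = 8152 usable bytes,
+ * assuming the usual page header and btree special space sizes.)
+ */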
+
+ /* The right page will have the same high key as the old page */
+ if (!P_RIGHTMOST(opaque))
+ {
+ itemid = PageGetItemId(origpage, P_HIKEY);
+ rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
+ sizeof(ItemIdData));
+ }
+
+ /* Count up total space in data items before actually scanning 'em */
+ olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(origpage);
+ leaffillfactor = BTGetFillFactor(rel);
+
+ /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
+ newitemsz += sizeof(ItemIdData);
+ state.rel = rel;
+ state.origpage = origpage;
+ state.newitem = newitem;
+ state.newitemsz = newitemsz;
+ state.is_leaf = P_ISLEAF(opaque);
+ state.is_rightmost = P_RIGHTMOST(opaque);
+ state.leftspace = leftspace;
+ state.rightspace = rightspace;
+ state.olddataitemstotal = olddataitemstotal;
+ state.minfirstrightsz = SIZE_MAX;
+ state.newitemoff = newitemoff;
+
+ /* newitem cannot be a posting list item */
+ Assert(!BTreeTupleIsPosting(newitem));
+
+ /*
+ * nsplits should never exceed maxoff because there will be at most as
+ * many candidate split points as there are points _between_ tuples, once
+ * you imagine that the new item is already on the original page (the
+ * final number of splits may be slightly lower because not all points
+ * between tuples will be legal).
+ */
+ state.maxsplits = maxoff;
+ state.splits = palloc(sizeof(SplitPoint) * state.maxsplits);
+ state.nsplits = 0;
+
+ /*
+ * Scan through the data items and calculate space usage for a split at
+ * each possible position
+ */
+ olddataitemstoleft = 0;
+
+ for (offnum = P_FIRSTDATAKEY(opaque);
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ Size itemsz;
+
+ itemid = PageGetItemId(origpage, offnum);
+ itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
+
+ /*
+ * When item offset number is not newitemoff, neither side of the
+ * split can be newitem. Record a split after the previous data item
+ * from original page, but before the current data item from original
+ * page. (_bt_recsplitloc() will reject the split when there are no
+ * previous items, which we rely on.)
+ */
+ if (offnum < newitemoff)
+ _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
+ else if (offnum > newitemoff)
+ _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
+ else
+ {
+ /*
+ * Record a split after all "offnum < newitemoff" original page
+ * data items, but before newitem
+ */
+ _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
+
+ /*
+ * Record a split after newitem, but before data item from
+ * original page at offset newitemoff/current offset
+ */
+ _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
+ }
+
+ olddataitemstoleft += itemsz;
+ }
+
+ /*
+ * Record a split after all original page data items, but before newitem.
+ * (Though only when it's possible that newitem will end up alone on new
+ * right page.)
+ */
+ Assert(olddataitemstoleft == olddataitemstotal);
+ if (newitemoff > maxoff)
+ _bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0);
+
+ /*
+ * I believe it is not possible to fail to find a feasible split, but just
+ * in case ...
+ */
+ if (state.nsplits == 0)
+ elog(ERROR, "could not find a feasible split point for index \"%s\"",
+ RelationGetRelationName(rel));
+
+ /*
+ * Start search for a split point among list of legal split points. Give
+ * primary consideration to equalizing available free space in each half
+ * of the split initially (start with default strategy), while applying
+ * rightmost and split-after-new-item optimizations where appropriate.
+ * Either of the two other fallback strategies may be required for cases
+ * with a large number of duplicates around the original/space-optimal
+ * split point.
+ *
+ * Default strategy gives some weight to suffix truncation in deciding a
+ * split point on leaf pages. It attempts to select a split point where a
+ * distinguishing attribute appears earlier in the new high key for the
+ * left side of the split, in order to maximize the number of trailing
+ * attributes that can be truncated away. Only candidate split points
+ * that imply an acceptable balance of free space on each side are
+ * considered. See _bt_defaultinterval().
+ */
+ if (!state.is_leaf)
+ {
+ /* fillfactormult only used on rightmost page */
+ usemult = state.is_rightmost;
+ fillfactormult = BTREE_NONLEAF_FILLFACTOR / 100.0;
+ }
+ else if (state.is_rightmost)
+ {
+ /* Rightmost leaf page -- fillfactormult always used */
+ usemult = true;
+ fillfactormult = leaffillfactor / 100.0;
+ }
+ else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult))
+ {
+ /*
+ * New item inserted at rightmost point among a localized grouping on
+ * a leaf page -- apply "split after new item" optimization, either by
+ * applying leaf fillfactor multiplier, or by choosing the exact split
+ * point that leaves newitem as lastleft. (usemult is set for us.)
+ */
+ if (usemult)
+ {
+ /* fillfactormult should be set based on leaf fillfactor */
+ fillfactormult = leaffillfactor / 100.0;
+ }
+ else
+ {
+ /* find precise split point after newitemoff */
+ for (int i = 0; i < state.nsplits; i++)
+ {
+ SplitPoint *split = state.splits + i;
+
+ if (split->newitemonleft &&
+ newitemoff == split->firstrightoff)
+ {
+ pfree(state.splits);
+ *newitemonleft = true;
+ return newitemoff;
+ }
+ }
+
+ /*
+ * Cannot legally split after newitemoff; proceed with split
+ * without using fillfactor multiplier. This is defensive, and
+ * should never be needed in practice.
+ */
+ fillfactormult = 0.50;
+ }
+ }
+ else
+ {
+ /* Other leaf page. 50:50 page split. */
+ usemult = false;
+ /* fillfactormult not used, but be tidy */
+ fillfactormult = 0.50;
+ }
+
+ /*
+ * Save leftmost and rightmost splits for page before original ordinal
+ * sort order is lost by delta/fillfactormult sort
+ */
+ leftpage = state.splits[0];
+ rightpage = state.splits[state.nsplits - 1];
+
+ /* Give split points a fillfactormult-wise delta, and sort on deltas */
+ _bt_deltasortsplits(&state, fillfactormult, usemult);
+
+ /* Determine split interval for default strategy */
+ state.interval = _bt_defaultinterval(&state);
+
+ /*
+ * Determine if default strategy/split interval will produce a
+ * sufficiently distinguishing split, or if we should change strategies.
+ * Alternative strategies change the range of split points that are
+ * considered acceptable (split interval), and possibly change
+ * fillfactormult, in order to deal with pages with a large number of
+ * duplicates gracefully.
+ *
+ * Pass low and high splits for the entire page (actually, they're for an
+ * imaginary version of the page that includes newitem). These are used
+ * when the initial split interval encloses split points that are full of
+ * duplicates, and we need to consider if it's even possible to avoid
+ * appending a heap TID.
+ */
+ perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy);
+
+ if (strategy == SPLIT_DEFAULT)
+ {
+ /*
+ * Default strategy worked out (always works out with internal page).
+ * Original split interval still stands.
+ */
+ }
+
+ /*
+ * Many duplicates strategy is used when a heap TID would otherwise be
+ * appended, but the page isn't completely full of logical duplicates.
+ *
+ * The split interval is widened to include all legal candidate split
+ * points. There might be as few as two distinct values in the whole-page
+ * split interval, though it's also possible that most of the values on
+ * the page are unique. The final split point will either be to the
+ * immediate left or to the immediate right of the group of duplicate
+ * tuples that enclose the first/delta-optimal split point (perfect
+ * penalty was set so that the lowest delta split point that avoids
+ * appending a heap TID will be chosen). Maximizing the number of
+ * attributes that can be truncated away is not a goal of the many
+ * duplicates strategy.
+ *
+ * Single value strategy is used when it is impossible to avoid appending
+ * a heap TID. It arranges to leave the left page very full. This
+ * maximizes space utilization in cases where tuples with the same
+ * attribute values span many pages. Newly inserted duplicates will tend
+ * to have higher heap TID values, so we'll end up splitting to the right
+ * consistently. (Single value strategy is harmless though not
+ * particularly useful with !heapkeyspace indexes.)
+ */
+ else if (strategy == SPLIT_MANY_DUPLICATES)
+ {
+ Assert(state.is_leaf);
+ /* Shouldn't try to truncate away extra user attributes */
+ Assert(perfectpenalty ==
+ IndexRelationGetNumberOfKeyAttributes(state.rel));
+ /* No need to resort splits -- no change in fillfactormult/deltas */
+ state.interval = state.nsplits;
+ }
+ else if (strategy == SPLIT_SINGLE_VALUE)
+ {
+ Assert(state.is_leaf);
+ /* Split near the end of the page */
+ usemult = true;
+ fillfactormult = BTREE_SINGLEVAL_FILLFACTOR / 100.0;
+ /* Resort split points with new delta */
+ _bt_deltasortsplits(&state, fillfactormult, usemult);
+ /* Appending a heap TID is unavoidable, so interval of 1 is fine */
+ state.interval = 1;
+ }
+
+ /*
+ * Search among acceptable split points (using final split interval) for
+ * the entry that has the lowest penalty, and is therefore expected to
+ * maximize fan-out. Sets *newitemonleft for us.
+ */
+ firstrightoff = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft,
+ strategy);
+ pfree(state.splits);
+
+ return firstrightoff;
+}
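+
+/*
+ * Illustrative sketch only -- this hypothetical caller is not part of
+ * PostgreSQL.  It shows how the result of _bt_findsplitloc() is expected to
+ * be consumed: items at offsets >= firstrightoff move to the new right page
+ * (plus newitem, unless newitemonleft), and the firstright tuple is
+ * suffix-truncated to form the left page's new high key.
+ */
+#ifdef NOT_USED
+static void
+example_choose_split(Relation rel, Page origpage, OffsetNumber newitemoff,
+	Size newitemsz, IndexTuple newitem)
+{
+	OffsetNumber firstrightoff;
+	bool newitemonleft;
+
+	/* newitemsz must already be MAXALIGN()'d, as noted above */
+	firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
+		newitem, &newitemonleft);
+
+	/*
+	 * A real caller would now distribute the items across both halves and
+	 * derive the new high key from the firstright tuple (or from newitem,
+	 * when !newitemonleft && firstrightoff == newitemoff).
+	 */
+}
+#endif							/* NOT_USED */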
+
+/*
+ * Subroutine to record a particular point between two tuples (possibly the
+ * new item) on page (ie, combination of firstrightoff and newitemonleft
+ * settings) in *state for later analysis. This is also a convenient point to
+ * check if the split is legal (if it isn't, it won't be recorded).
+ *
+ * firstrightoff is the offset of the first item on the original page that
+ * goes to the right page, and firstrightofforigpagetuplesz is the size of
+ * that tuple. firstrightoff can be > max offset, which means that all the
+ * old items go to the left page and only the new item goes to the right page.
+ * We don't actually use firstrightofforigpagetuplesz in that case (actually,
+ * we don't use it for _any_ split where the firstright tuple happens to be
+ * newitem).
+ *
+ * olddataitemstoleft is the total size of all old items to the left of the
+ * split point that is recorded here when legal. Should not include
+ * newitemsz, since that is handled here.
+ */
+static void
+_bt_recsplitloc(FindSplitData *state,
+ OffsetNumber firstrightoff,
+ bool newitemonleft,
+ int olddataitemstoleft,
+ Size firstrightofforigpagetuplesz)
+{
+ int16 leftfree,
+ rightfree;
+ Size firstrightsz;
+ Size postingsz = 0;
+ bool newitemisfirstright;
+
+ /* Is the new item going to be split point's firstright tuple? */
+ newitemisfirstright = (firstrightoff == state->newitemoff &&
+ !newitemonleft);
+
+ if (newitemisfirstright)
+ firstrightsz = state->newitemsz;
+ else
+ {
+ firstrightsz = firstrightofforigpagetuplesz;
+
+ /*
+ * Calculate suffix truncation space saving when firstright tuple is a
+ * posting list tuple, though only when the tuple is over 64 bytes
+ * including line pointer overhead (arbitrary). This avoids accessing
+ * the tuple in cases where its posting list must be very small (if
+ * tuple has one at all).
+ *
+ * Note: We don't do this in the case where firstright tuple is
+ * newitem, since newitem cannot have a posting list.
+ */
+ if (state->is_leaf && firstrightsz > 64)
+ {
+ ItemId itemid;
+ IndexTuple newhighkey;
+
+ itemid = PageGetItemId(state->origpage, firstrightoff);
+ newhighkey = (IndexTuple) PageGetItem(state->origpage, itemid);
+
+ if (BTreeTupleIsPosting(newhighkey))
+ postingsz = IndexTupleSize(newhighkey) -
+ BTreeTupleGetPostingOffset(newhighkey);
+ }
+ }
+
+ /* Account for all the old tuples */
+ leftfree = state->leftspace - olddataitemstoleft;
+ rightfree = state->rightspace -
+ (state->olddataitemstotal - olddataitemstoleft);
+
+ /*
+ * The first item on the right page becomes the high key of the left page;
+ * therefore it counts against left space as well as right space (we
+ * cannot assume that suffix truncation will make it any smaller). When
+ * index has included attributes, then those attributes of left page high
+ * key will be truncated leaving that page with slightly more free space.
+ * However, that shouldn't affect our ability to find a valid split
+ * location, since we err in the direction of being pessimistic about free
+ * space on the left half. Besides, even when suffix truncation of
+ * non-TID attributes occurs, the new high key often won't even be a
+ * single MAXALIGN() quantum smaller than the firstright tuple it's based
+ * on.
+ *
+ * If we are on the leaf level, assume that suffix truncation cannot avoid
+ * adding a heap TID to the left half's new high key when splitting at the
+ * leaf level. In practice the new high key will often be smaller and
+ * will rarely be larger, but conservatively assume the worst case. We do
+ * go to the trouble of subtracting away posting list overhead, though
+ * only when it looks like it will make an appreciable difference.
+ * (Posting lists are the only case where truncation will typically make
+ * the final high key far smaller than firstright, so being a bit more
+ * precise there noticeably improves the balance of free space.)
+ */
+ if (state->is_leaf)
+ leftfree -= (int16) (firstrightsz +
+ MAXALIGN(sizeof(ItemPointerData)) -
+ postingsz);
+ else
+ leftfree -= (int16) firstrightsz;
+
+ /* account for the new item */
+ if (newitemonleft)
+ leftfree -= (int16) state->newitemsz;
+ else
+ rightfree -= (int16) state->newitemsz;
+
+ /*
+ * If we are not on the leaf level, we will be able to discard the key
+ * data from the first item that winds up on the right page.
+ */
+ if (!state->is_leaf)
+ rightfree += (int16) firstrightsz -
+ (int16) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData));
+
+ /* Record split if legal */
+ if (leftfree >= 0 && rightfree >= 0)
+ {
+ Assert(state->nsplits < state->maxsplits);
+
+ /* Determine smallest firstright tuple size among legal splits */
+ state->minfirstrightsz = Min(state->minfirstrightsz, firstrightsz);
+
+ state->splits[state->nsplits].curdelta = 0;
+ state->splits[state->nsplits].leftfree = leftfree;
+ state->splits[state->nsplits].rightfree = rightfree;
+ state->splits[state->nsplits].firstrightoff = firstrightoff;
+ state->splits[state->nsplits].newitemonleft = newitemonleft;
+ state->nsplits++;
+ }
+}
+
+/*
+ * Subroutine to assign space deltas to materialized array of candidate split
+ * points based on current fillfactor, and to sort array using that fillfactor
+ */
+static void
+_bt_deltasortsplits(FindSplitData *state, double fillfactormult,
+ bool usemult)
+{
+ for (int i = 0; i < state->nsplits; i++)
+ {
+ SplitPoint *split = state->splits + i;
+ int16 delta;
+
+ if (usemult)
+ delta = fillfactormult * split->leftfree -
+ (1.0 - fillfactormult) * split->rightfree;
+ else
+ delta = split->leftfree - split->rightfree;
+
+ if (delta < 0)
+ delta = -delta;
+
+ /* Save delta */
+ split->curdelta = delta;
+ }
+
+ qsort(state->splits, state->nsplits, sizeof(SplitPoint), _bt_splitcmp);
+}
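+
+/*
+ * Worked example (illustrative figures only): with usemult and
+ * fillfactormult = 0.90 (a rightmost leaf split at the default leaf
+ * fillfactor), a candidate leaving leftfree = 1000 and rightfree = 7000
+ * gets delta = |0.90 * 1000 - 0.10 * 7000| = 200, whereas a plain 50:50
+ * split would score the same candidate as |1000 - 7000| = 6000.  Sorting
+ * on these deltas puts the candidates closest to the target free space
+ * division at the start of the array.
+ */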
+
+/*
+ * qsort-style comparator used by _bt_deltasortsplits()
+ */
+static int
+_bt_splitcmp(const void *arg1, const void *arg2)
+{
+ SplitPoint *split1 = (SplitPoint *) arg1;
+ SplitPoint *split2 = (SplitPoint *) arg2;
+
+ if (split1->curdelta > split2->curdelta)
+ return 1;
+ if (split1->curdelta < split2->curdelta)
+ return -1;
+
+ return 0;
+}
+
+/*
+ * Subroutine to determine whether or not a non-rightmost leaf page should be
+ * split immediately after the would-be original page offset for the
+ * new/incoming tuple (or should have leaf fillfactor applied when new item is
+ * to the right on original page). This is appropriate when there is a
+ * pattern of localized monotonically increasing insertions into a composite
+ * index, where leading attribute values form local groupings, and we
+ * anticipate further insertions of the same/current grouping (new item's
+ * grouping) in the near future. This can be thought of as a variation on
+ * applying leaf fillfactor during rightmost leaf page splits, since cases
+ * that benefit will converge on packing leaf pages leaffillfactor% full over
+ * time.
+ *
+ * We may leave extra free space remaining on the rightmost page of a "most
+ * significant column" grouping of tuples if that grouping never ends up
+ * having future insertions that use the free space. That effect is
+ * self-limiting; a future grouping that becomes the "nearest on the right"
+ * grouping of the affected grouping usually puts the extra free space to good
+ * use.
+ *
+ * Caller uses optimization when routine returns true, though the exact action
+ * taken by caller varies. Caller uses original leaf page fillfactor in
+ * standard way rather than using the new item offset directly when *usemult
+ * was also set to true here. Otherwise, caller applies optimization by
+ * locating the legal split point that makes the new tuple the lastleft tuple
+ * for the split.
+ */
+static bool
+_bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
+ int leaffillfactor, bool *usemult)
+{
+ int16 nkeyatts;
+ ItemId itemid;
+ IndexTuple tup;
+ int keepnatts;
+
+ Assert(state->is_leaf && !state->is_rightmost);
+
+ nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);
+
+ /* Single key indexes not considered here */
+ if (nkeyatts == 1)
+ return false;
+
+ /* Ascending insertion pattern never inferred when new item is first */
+ if (state->newitemoff == P_FIRSTKEY)
+ return false;
+
+ /*
+ * Only apply optimization on pages with equisized tuples, since ordinal
+ * keys are likely to be fixed-width. Testing if the new tuple is
+ * variable width directly might also work, but that fails to apply the
+ * optimization to indexes with a numeric_ops attribute.
+ *
+ * Conclude that page has equisized tuples when the new item is the same
+ * width as the smallest item observed during pass over page, and other
+ * non-pivot tuples must be the same width as well. (Note that the
+ * possibly-truncated existing high key isn't counted in
+ * olddataitemstotal, and must be subtracted from maxoff.)
+ */
+ if (state->newitemsz != state->minfirstrightsz)
+ return false;
+ if (state->newitemsz * (maxoff - 1) != state->olddataitemstotal)
+ return false;
+
+ /*
+ * Avoid applying optimization when tuples are wider than a tuple
+ * consisting of two non-NULL int8/int64 attributes (or four non-NULL
+ * int4/int32 attributes)
+ */
+ if (state->newitemsz >
+ MAXALIGN(sizeof(IndexTupleData) + sizeof(int64) * 2) +
+ sizeof(ItemIdData))
+ return false;
+
+ /*
+ * At least the first attribute's value must be equal to the corresponding
+ * value in previous tuple to apply optimization. New item cannot be a
+ * duplicate, either.
+ *
+ * Handle case where new item is to the right of all items on the existing
+ * page. This is suggestive of monotonically increasing insertions in
+ * itself, so the "heap TID adjacency" test is not applied here.
+ */
+ if (state->newitemoff > maxoff)
+ {
+ itemid = PageGetItemId(state->origpage, maxoff);
+ tup = (IndexTuple) PageGetItem(state->origpage, itemid);
+ keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem);
+
+ if (keepnatts > 1 && keepnatts <= nkeyatts)
+ {
+ *usemult = true;
+ return true;
+ }
+
+ return false;
+ }
+
+ /*
+ * "Low cardinality leading column, high cardinality suffix column"
+ * indexes with a random insertion pattern (e.g., an index with a boolean
+ * column, such as an index on '(book_is_in_print, book_isbn)') present us
+ * with a risk of consistently misapplying the optimization. We're
+ * willing to accept very occasional misapplication of the optimization,
+ * provided the cases where we get it wrong are rare and self-limiting.
+ *
+ * Heap TID adjacency strongly suggests that the item just to the left was
+ * inserted very recently, which limits overapplication of the
+ * optimization. Besides, all inappropriate cases triggered here will
+ * still split in the middle of the page on average.
+ */
+ itemid = PageGetItemId(state->origpage, OffsetNumberPrev(state->newitemoff));
+ tup = (IndexTuple) PageGetItem(state->origpage, itemid);
+ /* Do cheaper test first */
+ if (BTreeTupleIsPosting(tup) ||
+ !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid))
+ return false;
+ /* Check same conditions as rightmost item case, too */
+ keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem);
+
+ if (keepnatts > 1 && keepnatts <= nkeyatts)
+ {
+ double interp = (double) state->newitemoff / ((double) maxoff + 1);
+ double leaffillfactormult = (double) leaffillfactor / 100.0;
+
+ /*
+ * Don't allow caller to split after a new item when it will result in
+ * a split point to the right of the point that a leaf fillfactor
+ * split would use -- have caller apply leaf fillfactor instead
+ */
+ *usemult = interp > leaffillfactormult;
+
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Subroutine for determining if two heap TIDs are "adjacent".
+ *
+ * Adjacent means that the high TID is very likely to have been inserted into
+ * heap relation immediately after the low TID, probably during the current
+ * transaction.
+ */
+static bool
+_bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid)
+{
+ BlockNumber lowblk,
+ highblk;
+
+ lowblk = ItemPointerGetBlockNumber(lowhtid);
+ highblk = ItemPointerGetBlockNumber(highhtid);
+
+ /* Make optimistic assumption of adjacency when heap blocks match */
+ if (lowblk == highblk)
+ return true;
+
+ /* When heap block one up, second offset should be FirstOffsetNumber */
+ if (lowblk + 1 == highblk &&
+ ItemPointerGetOffsetNumber(highhtid) == FirstOffsetNumber)
+ return true;
+
+ return false;
+}
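+
+/*
+ * Examples (illustrative): heap TIDs (42,7) and (42,12) count as adjacent
+ * because they share a heap block; (42,200) and (43,1) count as adjacent
+ * because the second TID is the first offset of the very next block;
+ * (42,7) and (43,5), or (42,7) and (44,1), do not.
+ */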
+
+/*
+ * Subroutine to find the "best" split point among candidate split points.
+ * The best split point is the split point with the lowest penalty among split
+ * points that fall within current/final split interval. Penalty is an
+ * abstract score, with a definition that varies depending on whether we're
+ * splitting a leaf page or an internal page. See _bt_split_penalty() for
+ * details.
+ *
+ * "perfectpenalty" is assumed to be the lowest possible penalty among
+ * candidate split points. This allows us to return early without wasting
+ * cycles on calculating the first differing attribute for all candidate
+ * splits when that clearly cannot improve our choice (or when we only want a
+ * minimally distinguishing split point, and don't want to make the split any
+ * more unbalanced than is necessary).
+ *
+ * We return the index of the first existing tuple that should go on the right
+ * page, plus a boolean indicating if new item is on left of split point.
+ */
+static OffsetNumber
+_bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
+ bool *newitemonleft, FindSplitStrat strategy)
+{
+ int bestpenalty,
+ lowsplit;
+ int highsplit = Min(state->interval, state->nsplits);
+ SplitPoint *final;
+
+ bestpenalty = INT_MAX;
+ lowsplit = 0;
+ for (int i = lowsplit; i < highsplit; i++)
+ {
+ int penalty;
+
+ penalty = _bt_split_penalty(state, state->splits + i);
+
+ if (penalty < bestpenalty)
+ {
+ bestpenalty = penalty;
+ lowsplit = i;
+ }
+
+ if (penalty <= perfectpenalty)
+ break;
+ }
+
+ final = &state->splits[lowsplit];
+
+ /*
+ * There is a risk that the "many duplicates" strategy will repeatedly do
+ * the wrong thing when there are monotonically decreasing insertions to
+ * the right of a large group of duplicates. Repeated splits could leave
+ * a succession of right half pages with free space that can never be
+ * used. This must be avoided.
+ *
+ * Consider the example of the leftmost page in a single integer attribute
+ * NULLS FIRST index which is almost filled with NULLs. Monotonically
+ * decreasing integer insertions might cause the same leftmost page to
+ * split repeatedly at the same point. Each split derives its new high
+ * key from the lowest current value to the immediate right of the large
+ * group of NULLs, which will always be higher than all future integer
+ * insertions, directing all future integer insertions to the same
+ * leftmost page.
+ */
+ if (strategy == SPLIT_MANY_DUPLICATES && !state->is_rightmost &&
+ !final->newitemonleft && final->firstrightoff >= state->newitemoff &&
+ final->firstrightoff < state->newitemoff + 9)
+ {
+ /*
+ * Avoid the problem by performing a 50:50 split when the new item is
+ * just to the right of the would-be "many duplicates" split point.
+ * (Note that the test used for an insert that is "just to the right"
+ * of the split point is conservative.)
+ */
+ final = &state->splits[0];
+ }
+
+ *newitemonleft = final->newitemonleft;
+ return final->firstrightoff;
+}
+
+#define LEAF_SPLIT_DISTANCE 0.050
+#define INTERNAL_SPLIT_DISTANCE 0.075
+
+/*
+ * Return a split interval to use for the default strategy. This is a limit
+ * on the number of candidate split points to give further consideration to.
+ * Only a fraction of all candidate splits points (those located at the start
+ * of the now-sorted splits array) fall within the split interval. Split
+ * interval is applied within _bt_bestsplitloc().
+ *
+ * Split interval represents an acceptable range of split points -- those that
+ * have leftfree and rightfree values that are acceptably balanced. The final
+ * split point chosen is the split point with the lowest "penalty" among split
+ * points in this split interval (unless we change our entire strategy, in
+ * which case the interval also changes -- see _bt_strategy()).
+ *
+ * The "Prefix B-Trees" paper calls split interval sigma l for leaf splits,
+ * and sigma b for internal ("branch") splits. It's hard to provide a
+ * theoretical justification for the size of the split interval, though it's
+ * clear that a small split interval can make tuples on level L+1 much smaller
+ * on average, without noticeably affecting space utilization on level L.
+ * (Note that the way that we calculate split interval might need to change if
+ * suffix truncation is taught to truncate tuples "within" the last
+ * attribute/datum for data types like text, which is more or less how it is
+ * assumed to work in the paper.)
+ */
+static int
+_bt_defaultinterval(FindSplitData *state)
+{
+ SplitPoint *spaceoptimal;
+ int16 tolerance,
+ lowleftfree,
+ lowrightfree,
+ highleftfree,
+ highrightfree;
+
+ /*
+ * Determine leftfree and rightfree values that are higher and lower than
+ * we're willing to tolerate. Note that the final split interval will be
+ * about 10% of nsplits in the common case where all non-pivot tuples
+ * (data items) from a leaf page are uniformly sized. We're a bit more
+ * aggressive when splitting internal pages.
+ */
+ if (state->is_leaf)
+ tolerance = state->olddataitemstotal * LEAF_SPLIT_DISTANCE;
+ else
+ tolerance = state->olddataitemstotal * INTERNAL_SPLIT_DISTANCE;
+
+ /* First candidate split point is the most evenly balanced */
+ spaceoptimal = state->splits;
+ lowleftfree = spaceoptimal->leftfree - tolerance;
+ lowrightfree = spaceoptimal->rightfree - tolerance;
+ highleftfree = spaceoptimal->leftfree + tolerance;
+ highrightfree = spaceoptimal->rightfree + tolerance;
+
+ /*
+ * Iterate through split points, starting from the split immediately after
+ * 'spaceoptimal'. Find the first split point that divides free space so
+ * unevenly that including it in the split interval would be unacceptable.
+ */
+ for (int i = 1; i < state->nsplits; i++)
+ {
+ SplitPoint *split = state->splits + i;
+
+ /* Cannot use curdelta here, since its value is often weighted */
+ if (split->leftfree < lowleftfree || split->rightfree < lowrightfree ||
+ split->leftfree > highleftfree || split->rightfree > highrightfree)
+ return i;
+ }
+
+ return state->nsplits;
+}
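+
+/*
+ * Worked example (illustrative figures only): for a leaf page whose data
+ * items total 8000 bytes, tolerance is 8000 * 0.050 = 400 bytes.  Only
+ * candidate splits whose leftfree and rightfree both stay within 400 bytes
+ * of the space-optimal split's values remain inside the default interval.
+ */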
+
+/*
+ * Subroutine to decide whether split should use default strategy/initial
+ * split interval, or whether it should finish splitting the page using
+ * alternative strategies (this is only possible with leaf pages).
+ *
+ * Caller uses alternative strategy (or sticks with default strategy) based
+ * on how *strategy is set here. Return value is "perfect penalty", which is
+ * passed to _bt_bestsplitloc() as a final constraint on how far caller is
+ * willing to go to avoid appending a heap TID when using the many duplicates
+ * strategy (it also saves _bt_bestsplitloc() useless cycles).
+ */
+static int
+_bt_strategy(FindSplitData *state, SplitPoint *leftpage,
+ SplitPoint *rightpage, FindSplitStrat *strategy)
+{
+ IndexTuple leftmost,
+ rightmost;
+ SplitPoint *leftinterval,
+ *rightinterval;
+ int perfectpenalty;
+ int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);
+
+ /* Assume that alternative strategy won't be used for now */
+ *strategy = SPLIT_DEFAULT;
+
+ /*
+ * Use smallest observed firstright item size for entire page (actually,
+ * entire imaginary version of page that includes newitem) as perfect
+ * penalty on internal pages. This can save cycles in the common case
+ * where most or all splits (not just splits within interval) have
+ * firstright tuples that are the same size.
+ */
+ if (!state->is_leaf)
+ return state->minfirstrightsz;
+
+ /*
+ * Use leftmost and rightmost tuples from leftmost and rightmost splits in
+ * current split interval
+ */
+ _bt_interval_edges(state, &leftinterval, &rightinterval);
+ leftmost = _bt_split_lastleft(state, leftinterval);
+ rightmost = _bt_split_firstright(state, rightinterval);
+
+ /*
+ * If initial split interval can produce a split point that will at least
+ * avoid appending a heap TID in new high key, we're done. Finish split
+ * with default strategy and initial split interval.
+ */
+ perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost);
+ if (perfectpenalty <= indnkeyatts)
+ return perfectpenalty;
+
+ /*
+ * Work out how caller should finish split when even their "perfect"
+ * penalty for initial/default split interval indicates that the interval
+ * does not contain even a single split that avoids appending a heap TID.
+ *
+ * Use the leftmost split's lastleft tuple and the rightmost split's
+ * firstright tuple to assess every possible split.
+ */
+ leftmost = _bt_split_lastleft(state, leftpage);
+ rightmost = _bt_split_firstright(state, rightpage);
+
+ /*
+ * If page (including new item) has many duplicates but is not entirely
+ * full of duplicates, a many duplicates strategy split will be performed.
+ * If page is entirely full of duplicates, a single value strategy split
+ * will be performed.
+ */
+ perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost);
+ if (perfectpenalty <= indnkeyatts)
+ {
+ *strategy = SPLIT_MANY_DUPLICATES;
+
+ /*
+ * Many duplicates strategy should split at either side of the group of
+ * duplicates that enclose the delta-optimal split point. Return
+ * indnkeyatts rather than the true perfect penalty to make that
+ * happen. (If perfectpenalty was returned here then low cardinality
+ * composite indexes could have continual unbalanced splits.)
+ *
+ * Note that caller won't go through with a many duplicates split in
+ * rare cases where it looks like there are ever-decreasing insertions
+ * to the immediate right of the split point. This must happen just
+ * before a final decision is made, within _bt_bestsplitloc().
+ */
+ return indnkeyatts;
+ }
+
+ /*
+ * Single value strategy is only appropriate with ever-increasing heap
+ * TIDs; otherwise, original default strategy split should proceed to
+ * avoid pathological performance. Use page high key to infer if this is
+ * the rightmost page among pages that store the same duplicate value.
+ * This should not prevent insertions of heap TIDs that are slightly out
+ * of order from using single value strategy, since that's expected with
+ * concurrent inserters of the same duplicate value.
+ */
+ else if (state->is_rightmost)
+ *strategy = SPLIT_SINGLE_VALUE;
+ else
+ {
+ ItemId itemid;
+ IndexTuple hikey;
+
+ itemid = PageGetItemId(state->origpage, P_HIKEY);
+ hikey = (IndexTuple) PageGetItem(state->origpage, itemid);
+ perfectpenalty = _bt_keep_natts_fast(state->rel, hikey,
+ state->newitem);
+ if (perfectpenalty <= indnkeyatts)
+ *strategy = SPLIT_SINGLE_VALUE;
+ else
+ {
+ /*
+ * Have caller finish split using default strategy, since page
+ * does not appear to be the rightmost page for duplicates of the
+ * value the page is filled with
+ */
+ }
+ }
+
+ return perfectpenalty;
+}
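+
+/*
+ * Example (illustrative): on a leaf page filled entirely with one value,
+ * lastleft and firstright compare as equal on every key attribute for all
+ * candidate splits, so appending a heap TID is unavoidable; if the page is
+ * also the rightmost page for that value, single value strategy is chosen.
+ * When the page merely has a large group of duplicates around the
+ * space-optimal split point, many duplicates strategy widens the split
+ * interval instead.
+ */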
+
+/*
+ * Subroutine to locate leftmost and rightmost splits for current/default
+ * split interval. Note that it will be the same split iff there is only one
+ * split in interval.
+ */
+static void
+_bt_interval_edges(FindSplitData *state, SplitPoint **leftinterval,
+ SplitPoint **rightinterval)
+{
+ int highsplit = Min(state->interval, state->nsplits);
+ SplitPoint *deltaoptimal;
+
+ deltaoptimal = state->splits;
+ *leftinterval = NULL;
+ *rightinterval = NULL;
+
+ /*
+ * Delta is an absolute distance to optimal split point, so both the
+ * leftmost and rightmost split point will usually be at the end of the
+ * array
+ */
+ for (int i = highsplit - 1; i >= 0; i--)
+ {
+ SplitPoint *distant = state->splits + i;
+
+ if (distant->firstrightoff < deltaoptimal->firstrightoff)
+ {
+ if (*leftinterval == NULL)
+ *leftinterval = distant;
+ }
+ else if (distant->firstrightoff > deltaoptimal->firstrightoff)
+ {
+ if (*rightinterval == NULL)
+ *rightinterval = distant;
+ }
+ else if (!distant->newitemonleft && deltaoptimal->newitemonleft)
+ {
+ /*
+ * "incoming tuple will become firstright" (distant) is to the
+ * left of "incoming tuple will become lastleft" (delta-optimal)
+ */
+ Assert(distant->firstrightoff == state->newitemoff);
+ if (*leftinterval == NULL)
+ *leftinterval = distant;
+ }
+ else if (distant->newitemonleft && !deltaoptimal->newitemonleft)
+ {
+ /*
+ * "incoming tuple will become lastleft" (distant) is to the right
+ * of "incoming tuple will become firstright" (delta-optimal)
+ */
+ Assert(distant->firstrightoff == state->newitemoff);
+ if (*rightinterval == NULL)
+ *rightinterval = distant;
+ }
+ else
+ {
+ /* There were only one or two splits in the initial split interval */
+ Assert(distant == deltaoptimal);
+ if (*leftinterval == NULL)
+ *leftinterval = distant;
+ if (*rightinterval == NULL)
+ *rightinterval = distant;
+ }
+
+ if (*leftinterval && *rightinterval)
+ return;
+ }
+
+ Assert(false);
+}
+
+/*
+ * Subroutine to find penalty for caller's candidate split point.
+ *
+ * On leaf pages, penalty is the attribute number that distinguishes each side
+ * of a split. It's the last attribute that needs to be included in new high
+ * key for left page. It can be greater than the number of key attributes in
+ * cases where a heap TID will need to be appended during truncation.
+ *
+ * On internal pages, penalty is simply the size of the firstright tuple for
+ * the split (including line pointer overhead). This tuple will become the
+ * new high key for the left page.
+ */
+static inline int
+_bt_split_penalty(FindSplitData *state, SplitPoint *split)
+{
+ IndexTuple lastleft;
+ IndexTuple firstright;
+
+ if (!state->is_leaf)
+ {
+ ItemId itemid;
+
+ if (!split->newitemonleft &&
+ split->firstrightoff == state->newitemoff)
+ return state->newitemsz;
+
+ itemid = PageGetItemId(state->origpage, split->firstrightoff);
+
+ return MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
+ }
+
+ lastleft = _bt_split_lastleft(state, split);
+ firstright = _bt_split_firstright(state, split);
+
+ return _bt_keep_natts_fast(state->rel, lastleft, firstright);
+}
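+
+/*
+ * Example (illustrative): for a leaf split in a two key attribute index
+ * where lastleft is ('usa', 'new york') and firstright is ('usa',
+ * 'philadelphia'), the penalty is 2, since the second attribute is the
+ * first one that distinguishes the halves.  If lastleft and firstright
+ * were equal on both attributes, the penalty would be 3 (nkeyatts + 1),
+ * meaning truncation must append a heap TID.
+ */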
+
+/*
+ * Subroutine to get a lastleft IndexTuple for a split point
+ */
+static inline IndexTuple
+_bt_split_lastleft(FindSplitData *state, SplitPoint *split)
+{
+ ItemId itemid;
+
+ if (split->newitemonleft && split->firstrightoff == state->newitemoff)
+ return state->newitem;
+
+ itemid = PageGetItemId(state->origpage,
+ OffsetNumberPrev(split->firstrightoff));
+ return (IndexTuple) PageGetItem(state->origpage, itemid);
+}
+
+/*
+ * Subroutine to get a firstright IndexTuple for a split point
+ */
+static inline IndexTuple
+_bt_split_firstright(FindSplitData *state, SplitPoint *split)
+{
+ ItemId itemid;
+
+ if (!split->newitemonleft && split->firstrightoff == state->newitemoff)
+ return state->newitem;
+
+ itemid = PageGetItemId(state->origpage, split->firstrightoff);
+ return (IndexTuple) PageGetItem(state->origpage, itemid);
+}
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
new file mode 100644
index 0000000..d524310
--- /dev/null
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -0,0 +1,2751 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtutils.c
+ * Utility code for Postgres btree implementation.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtutils.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <time.h>
+
+#include "access/nbtree.h"
+#include "access/reloptions.h"
+#include "access/relscan.h"
+#include "catalog/catalog.h"
+#include "commands/progress.h"
+#include "lib/qunique.h"
+#include "miscadmin.h"
+#include "utils/array.h"
+#include "utils/datum.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+
+typedef struct BTSortArrayContext
+{
+ FmgrInfo flinfo;
+ Oid collation;
+ bool reverse;
+} BTSortArrayContext;
+
+static Datum _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey,
+ StrategyNumber strat,
+ Datum *elems, int nelems);
+static int _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
+ bool reverse,
+ Datum *elems, int nelems);
+static int _bt_compare_array_elements(const void *a, const void *b, void *arg);
+static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
+ ScanKey leftarg, ScanKey rightarg,
+ bool *result);
+static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption);
+static void _bt_mark_scankey_required(ScanKey skey);
+static bool _bt_check_rowcompare(ScanKey skey,
+ IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
+ ScanDirection dir, bool *continuescan);
+static int _bt_keep_natts(Relation rel, IndexTuple lastleft,
+ IndexTuple firstright, BTScanInsert itup_key);
+
+
+/*
+ * _bt_mkscankey
+ * Build an insertion scan key that contains comparison data from itup
+ * as well as comparator routines appropriate to the key datatypes.
+ *
+ * When itup is a non-pivot tuple, the returned insertion scan key is
+ * suitable for finding a place for it to go on the leaf level. Pivot
+ * tuples can be used to re-find leaf page with matching high key, but
+ * then caller needs to set scan key's pivotsearch field to true. This
+ * allows caller to search for a leaf page with a matching high key,
+ * which is usually to the left of the first leaf page a non-pivot match
+ * might appear on.
+ *
+ * The result is intended for use with _bt_compare() and _bt_truncate().
+ * Callers that don't need to fill out the insertion scankey arguments
+ * (e.g. they use an ad-hoc comparison routine, or only need a scankey
+ * for _bt_truncate()) can pass a NULL index tuple. The scankey will
+ * be initialized as if an "all truncated" pivot tuple was passed
+ * instead.
+ *
+ * Note that we may occasionally have to share lock the metapage to
+ * determine whether or not the keys in the index are expected to be
+ * unique (i.e. if this is a "heapkeyspace" index). We assume a
+ * heapkeyspace index when caller passes a NULL tuple, allowing index
+ * build callers to avoid accessing the non-existent metapage. We
+ * also assume that the index is _not_ allequalimage when a NULL tuple
+ * is passed; CREATE INDEX callers call _bt_allequalimage() to set the
+ * field themselves.
+ */
+BTScanInsert
+_bt_mkscankey(Relation rel, IndexTuple itup)
+{
+ BTScanInsert key;
+ ScanKey skey;
+ TupleDesc itupdesc;
+ int indnkeyatts;
+ int16 *indoption;
+ int tupnatts;
+ int i;
+
+ itupdesc = RelationGetDescr(rel);
+ indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+ indoption = rel->rd_indoption;
+ tupnatts = itup ? BTreeTupleGetNAtts(itup, rel) : 0;
+
+ Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel));
+
+ /*
+ * We'll execute search using scan key constructed on key columns.
+ * Truncated attributes and non-key attributes are omitted from the final
+ * scan key.
+ */
+ key = palloc(offsetof(BTScanInsertData, scankeys) +
+ sizeof(ScanKeyData) * indnkeyatts);
+ if (itup)
+ _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage);
+ else
+ {
+ /* Utility statement callers can set these fields themselves */
+ key->heapkeyspace = true;
+ key->allequalimage = false;
+ }
+ key->anynullkeys = false; /* initial assumption */
+ key->nextkey = false;
+ key->pivotsearch = false;
+ key->keysz = Min(indnkeyatts, tupnatts);
+ key->scantid = key->heapkeyspace && itup ?
+ BTreeTupleGetHeapTID(itup) : NULL;
+ skey = key->scankeys;
+ for (i = 0; i < indnkeyatts; i++)
+ {
+ FmgrInfo *procinfo;
+ Datum arg;
+ bool null;
+ int flags;
+
+ /*
+ * We can use the cached (default) support procs since no cross-type
+ * comparison can be needed.
+ */
+ procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
+
+ /*
+ * Key arguments built from truncated attributes (or when caller
+ * provides no tuple) are defensively represented as NULL values. They
+ * should never be used.
+ */
+ if (i < tupnatts)
+ arg = index_getattr(itup, i + 1, itupdesc, &null);
+ else
+ {
+ arg = (Datum) 0;
+ null = true;
+ }
+ flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT);
+ ScanKeyEntryInitializeWithInfo(&skey[i],
+ flags,
+ (AttrNumber) (i + 1),
+ InvalidStrategy,
+ InvalidOid,
+ rel->rd_indcollation[i],
+ procinfo,
+ arg);
+ /* Record if any key attribute is NULL (or truncated) */
+ if (null)
+ key->anynullkeys = true;
+ }
+
+ return key;
+}
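+
+/*
+ * Illustrative sketch only -- this hypothetical caller is not part of
+ * PostgreSQL.  It shows the expected lifecycle of an insertion scan key:
+ * build it from the incoming non-pivot tuple, use it with _bt_compare()
+ * and _bt_truncate(), then free it.
+ */
+#ifdef NOT_USED
+static void
+example_use_insertion_key(Relation rel, IndexTuple itup)
+{
+	BTScanInsert itup_key;
+
+	itup_key = _bt_mkscankey(rel, itup);
+
+	/* ... descend the tree and compare against itup_key->scankeys ... */
+
+	pfree(itup_key);
+}
+#endif							/* NOT_USED */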
+
+/*
+ * free a retracement stack made by _bt_search.
+ */
+void
+_bt_freestack(BTStack stack)
+{
+ BTStack ostack;
+
+ while (stack != NULL)
+ {
+ ostack = stack;
+ stack = stack->bts_parent;
+ pfree(ostack);
+ }
+}
+
+
+/*
+ * _bt_preprocess_array_keys() -- Preprocess SK_SEARCHARRAY scan keys
+ *
+ * If there are any SK_SEARCHARRAY scan keys, deconstruct the array(s) and
+ * set up BTArrayKeyInfo info for each one that is an equality-type key.
+ * Prepare modified scan keys in so->arrayKeyData, which will hold the current
+ * array elements during each primitive indexscan operation. For inequality
+ * array keys, it's sufficient to find the extreme element value and replace
+ * the whole array with that scalar value.
+ *
+ * Note: the reason we need so->arrayKeyData, rather than just scribbling
+ * on scan->keyData, is that callers are permitted to call btrescan without
+ * supplying a new set of scankey data.
+ */
+void
+_bt_preprocess_array_keys(IndexScanDesc scan)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ int numberOfKeys = scan->numberOfKeys;
+ int16 *indoption = scan->indexRelation->rd_indoption;
+ int numArrayKeys;
+ ScanKey cur;
+ int i;
+ MemoryContext oldContext;
+
+ /* Quick check to see if there are any array keys */
+ numArrayKeys = 0;
+ for (i = 0; i < numberOfKeys; i++)
+ {
+ cur = &scan->keyData[i];
+ if (cur->sk_flags & SK_SEARCHARRAY)
+ {
+ numArrayKeys++;
+ Assert(!(cur->sk_flags & (SK_ROW_HEADER | SK_SEARCHNULL | SK_SEARCHNOTNULL)));
+ /* If any arrays are null as a whole, we can quit right now. */
+ if (cur->sk_flags & SK_ISNULL)
+ {
+ so->numArrayKeys = -1;
+ so->arrayKeyData = NULL;
+ return;
+ }
+ }
+ }
+
+ /* Quit if nothing to do. */
+ if (numArrayKeys == 0)
+ {
+ so->numArrayKeys = 0;
+ so->arrayKeyData = NULL;
+ return;
+ }
+
+ /*
+ * Make a scan-lifespan context to hold array-associated data, or reset it
+ * if we already have one from a previous rescan cycle.
+ */
+ if (so->arrayContext == NULL)
+ so->arrayContext = AllocSetContextCreate(CurrentMemoryContext,
+ "BTree array context",
+ ALLOCSET_SMALL_SIZES);
+ else
+ MemoryContextReset(so->arrayContext);
+
+ oldContext = MemoryContextSwitchTo(so->arrayContext);
+
+ /* Create modifiable copy of scan->keyData in the workspace context */
+ so->arrayKeyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
+ memcpy(so->arrayKeyData,
+ scan->keyData,
+ scan->numberOfKeys * sizeof(ScanKeyData));
+
+ /* Allocate space for per-array data in the workspace context */
+ so->arrayKeys = (BTArrayKeyInfo *) palloc0(numArrayKeys * sizeof(BTArrayKeyInfo));
+
+ /* Now process each array key */
+ numArrayKeys = 0;
+ for (i = 0; i < numberOfKeys; i++)
+ {
+ ArrayType *arrayval;
+ int16 elmlen;
+ bool elmbyval;
+ char elmalign;
+ int num_elems;
+ Datum *elem_values;
+ bool *elem_nulls;
+ int num_nonnulls;
+ int j;
+
+ cur = &so->arrayKeyData[i];
+ if (!(cur->sk_flags & SK_SEARCHARRAY))
+ continue;
+
+ /*
+ * First, deconstruct the array into elements. Anything allocated
+ * here (including a possibly detoasted array value) is in the
+ * workspace context.
+ */
+ arrayval = DatumGetArrayTypeP(cur->sk_argument);
+ /* We could cache this data, but not clear it's worth it */
+ get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
+ &elmlen, &elmbyval, &elmalign);
+ deconstruct_array(arrayval,
+ ARR_ELEMTYPE(arrayval),
+ elmlen, elmbyval, elmalign,
+ &elem_values, &elem_nulls, &num_elems);
+
+ /*
+ * Compress out any null elements. We can ignore them since we assume
+ * all btree operators are strict.
+ */
+ num_nonnulls = 0;
+ for (j = 0; j < num_elems; j++)
+ {
+ if (!elem_nulls[j])
+ elem_values[num_nonnulls++] = elem_values[j];
+ }
+
+ /* We could pfree(elem_nulls) now, but not worth the cycles */
+
+ /* If there are no non-nulls, the scan qual is unsatisfiable */
+ if (num_nonnulls == 0)
+ {
+ numArrayKeys = -1;
+ break;
+ }
+
+ /*
+ * If the comparison operator is not equality, then the array qual
+ * degenerates to a simple comparison against the smallest or largest
+ * non-null array element, as appropriate.
+ */
+ switch (cur->sk_strategy)
+ {
+ case BTLessStrategyNumber:
+ case BTLessEqualStrategyNumber:
+ cur->sk_argument =
+ _bt_find_extreme_element(scan, cur,
+ BTGreaterStrategyNumber,
+ elem_values, num_nonnulls);
+ continue;
+ case BTEqualStrategyNumber:
+ /* proceed with rest of loop */
+ break;
+ case BTGreaterEqualStrategyNumber:
+ case BTGreaterStrategyNumber:
+ cur->sk_argument =
+ _bt_find_extreme_element(scan, cur,
+ BTLessStrategyNumber,
+ elem_values, num_nonnulls);
+ continue;
+ default:
+ elog(ERROR, "unrecognized StrategyNumber: %d",
+ (int) cur->sk_strategy);
+ break;
+ }
+
+ /*
+ * Sort the non-null elements and eliminate any duplicates. We must
+ * sort in the same ordering used by the index column, so that the
+ * successive primitive indexscans produce data in index order.
+ */
+ num_elems = _bt_sort_array_elements(scan, cur,
+ (indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0,
+ elem_values, num_nonnulls);
+
+ /*
+ * And set up the BTArrayKeyInfo data.
+ */
+ so->arrayKeys[numArrayKeys].scan_key = i;
+ so->arrayKeys[numArrayKeys].num_elems = num_elems;
+ so->arrayKeys[numArrayKeys].elem_values = elem_values;
+ numArrayKeys++;
+ }
+
+ so->numArrayKeys = numArrayKeys;
+
+ MemoryContextSwitchTo(oldContext);
+}
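+
+/*
+ * Example (illustrative): a qual like "x < ANY ('{5,9,2}')" is satisfied
+ * whenever x is below the greatest element, so the whole array is replaced
+ * by the scalar comparison "x < 9".  An equality array such as
+ * "x = ANY ('{5,9,2}')" is instead sorted and de-duplicated, and each
+ * element drives one primitive index scan.
+ */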
+
+/*
+ * _bt_find_extreme_element() -- get least or greatest array element
+ *
+ * scan and skey identify the index column, whose opfamily determines the
+ * comparison semantics. strat should be BTLessStrategyNumber to get the
+ * least element, or BTGreaterStrategyNumber to get the greatest.
+ */
+static Datum
+_bt_find_extreme_element(IndexScanDesc scan, ScanKey skey,
+ StrategyNumber strat,
+ Datum *elems, int nelems)
+{
+ Relation rel = scan->indexRelation;
+ Oid elemtype,
+ cmp_op;
+ RegProcedure cmp_proc;
+ FmgrInfo flinfo;
+ Datum result;
+ int i;
+
+ /*
+ * Determine the nominal datatype of the array elements. We have to
+ * support the convention that sk_subtype == InvalidOid means the opclass
+ * input type; this is a hack to simplify life for ScanKeyInit().
+ */
+ elemtype = skey->sk_subtype;
+ if (elemtype == InvalidOid)
+ elemtype = rel->rd_opcintype[skey->sk_attno - 1];
+
+ /*
+ * Look up the appropriate comparison operator in the opfamily.
+ *
+ * Note: it's possible that this would fail, if the opfamily is
+ * incomplete, but it seems quite unlikely that an opfamily would omit
+ * non-cross-type comparison operators for any datatype that it supports
+ * at all.
+ */
+ cmp_op = get_opfamily_member(rel->rd_opfamily[skey->sk_attno - 1],
+ elemtype,
+ elemtype,
+ strat);
+ if (!OidIsValid(cmp_op))
+ elog(ERROR, "missing operator %d(%u,%u) in opfamily %u",
+ strat, elemtype, elemtype,
+ rel->rd_opfamily[skey->sk_attno - 1]);
+ cmp_proc = get_opcode(cmp_op);
+ if (!RegProcedureIsValid(cmp_proc))
+ elog(ERROR, "missing oprcode for operator %u", cmp_op);
+
+ fmgr_info(cmp_proc, &flinfo);
+
+ Assert(nelems > 0);
+ result = elems[0];
+ for (i = 1; i < nelems; i++)
+ {
+ if (DatumGetBool(FunctionCall2Coll(&flinfo,
+ skey->sk_collation,
+ elems[i],
+ result)))
+ result = elems[i];
+ }
+
+ return result;
+}
+
+/*
+ * _bt_sort_array_elements() -- sort and de-dup array elements
+ *
+ * The array elements are sorted in-place, and the new number of elements
+ * after duplicate removal is returned.
+ *
+ * scan and skey identify the index column, whose opfamily determines the
+ * comparison semantics. If reverse is true, we sort in descending order.
+ */
+static int
+_bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
+ bool reverse,
+ Datum *elems, int nelems)
+{
+ Relation rel = scan->indexRelation;
+ Oid elemtype;
+ RegProcedure cmp_proc;
+ BTSortArrayContext cxt;
+
+ if (nelems <= 1)
+ return nelems; /* no work to do */
+
+ /*
+ * Determine the nominal datatype of the array elements. We have to
+ * support the convention that sk_subtype == InvalidOid means the opclass
+ * input type; this is a hack to simplify life for ScanKeyInit().
+ */
+ elemtype = skey->sk_subtype;
+ if (elemtype == InvalidOid)
+ elemtype = rel->rd_opcintype[skey->sk_attno - 1];
+
+ /*
+ * Look up the appropriate comparison function in the opfamily.
+ *
+ * Note: it's possible that this would fail, if the opfamily is
+ * incomplete, but it seems quite unlikely that an opfamily would omit
+ * non-cross-type support functions for any datatype that it supports at
+ * all.
+ */
+ cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1],
+ elemtype,
+ elemtype,
+ BTORDER_PROC);
+ if (!RegProcedureIsValid(cmp_proc))
+ elog(ERROR, "missing support function %d(%u,%u) in opfamily %u",
+ BTORDER_PROC, elemtype, elemtype,
+ rel->rd_opfamily[skey->sk_attno - 1]);
+
+ /* Sort the array elements */
+ fmgr_info(cmp_proc, &cxt.flinfo);
+ cxt.collation = skey->sk_collation;
+ cxt.reverse = reverse;
+ qsort_arg((void *) elems, nelems, sizeof(Datum),
+ _bt_compare_array_elements, (void *) &cxt);
+
+ /* Now scan the sorted elements and remove duplicates */
+ return qunique_arg(elems, nelems, sizeof(Datum),
+ _bt_compare_array_elements, &cxt);
+}
+
+/*
+ * qsort_arg comparator for sorting array elements
+ */
+static int
+_bt_compare_array_elements(const void *a, const void *b, void *arg)
+{
+ Datum da = *((const Datum *) a);
+ Datum db = *((const Datum *) b);
+ BTSortArrayContext *cxt = (BTSortArrayContext *) arg;
+ int32 compare;
+
+ compare = DatumGetInt32(FunctionCall2Coll(&cxt->flinfo,
+ cxt->collation,
+ da, db));
+ if (cxt->reverse)
+ INVERT_COMPARE_RESULT(compare);
+ return compare;
+}
+
+/*
+ * _bt_start_array_keys() -- Initialize array keys at start of a scan
+ *
+ * Set up the cur_elem counters and fill in the first sk_argument value for
+ * each array scankey. We can't do this until we know the scan direction.
+ */
+void
+_bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ int i;
+
+ for (i = 0; i < so->numArrayKeys; i++)
+ {
+ BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
+ ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key];
+
+ Assert(curArrayKey->num_elems > 0);
+ if (ScanDirectionIsBackward(dir))
+ curArrayKey->cur_elem = curArrayKey->num_elems - 1;
+ else
+ curArrayKey->cur_elem = 0;
+ skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem];
+ }
+}
+
+/*
+ * _bt_advance_array_keys() -- Advance to next set of array elements
+ *
+ * Returns true if there is another set of values to consider, false if not.
+ * On true result, the scankeys are initialized with the next set of values.
+ */
+bool
+_bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ bool found = false;
+ int i;
+
+ /*
+ * We must advance the last array key most quickly, since it will
+ * correspond to the lowest-order index column among the available
+ * qualifications. This is necessary to ensure correct ordering of output
+ * when there are multiple array keys.
+ */
+ for (i = so->numArrayKeys - 1; i >= 0; i--)
+ {
+ BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
+ ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key];
+ int cur_elem = curArrayKey->cur_elem;
+ int num_elems = curArrayKey->num_elems;
+
+ if (ScanDirectionIsBackward(dir))
+ {
+ if (--cur_elem < 0)
+ {
+ cur_elem = num_elems - 1;
+ found = false; /* need to advance next array key */
+ }
+ else
+ found = true;
+ }
+ else
+ {
+ if (++cur_elem >= num_elems)
+ {
+ cur_elem = 0;
+ found = false; /* need to advance next array key */
+ }
+ else
+ found = true;
+ }
+
+ curArrayKey->cur_elem = cur_elem;
+ skey->sk_argument = curArrayKey->elem_values[cur_elem];
+ if (found)
+ break;
+ }
+
+ /* advance parallel scan */
+ if (scan->parallel_scan != NULL)
+ _bt_parallel_advance_array_keys(scan);
+
+ return found;
+}
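+
+/*
+ * Example (illustrative): with "x = ANY ('{1,2}') AND y = ANY ('{10,20}')",
+ * a forward scan visits the array key combinations in odometer order:
+ * (1,10), (1,20), (2,10), (2,20).  The last (lowest-order) array key
+ * advances first; once it wraps around, the next higher-order key advances.
+ */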
+
+/*
+ * _bt_mark_array_keys() -- Handle array keys during btmarkpos
+ *
+ * Save the current state of the array keys as the "mark" position.
+ */
+void
+_bt_mark_array_keys(IndexScanDesc scan)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ int i;
+
+ for (i = 0; i < so->numArrayKeys; i++)
+ {
+ BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
+
+ curArrayKey->mark_elem = curArrayKey->cur_elem;
+ }
+}
+
+/*
+ * _bt_restore_array_keys() -- Handle array keys during btrestrpos
+ *
+ * Restore the array keys to where they were when the mark was set.
+ */
+void
+_bt_restore_array_keys(IndexScanDesc scan)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ bool changed = false;
+ int i;
+
+ /* Restore each array key to its position when the mark was set */
+ for (i = 0; i < so->numArrayKeys; i++)
+ {
+ BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
+ ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key];
+ int mark_elem = curArrayKey->mark_elem;
+
+ if (curArrayKey->cur_elem != mark_elem)
+ {
+ curArrayKey->cur_elem = mark_elem;
+ skey->sk_argument = curArrayKey->elem_values[mark_elem];
+ changed = true;
+ }
+ }
+
+ /*
+ * If we changed any keys, we must redo _bt_preprocess_keys. That might
+ * sound like overkill, but in cases with multiple keys per index column
+ * it seems necessary to do the full set of pushups.
+ */
+ if (changed)
+ {
+ _bt_preprocess_keys(scan);
+ /* The mark should have been set on a consistent set of keys... */
+ Assert(so->qual_ok);
+ }
+}
+
+
+/*
+ * _bt_preprocess_keys() -- Preprocess scan keys
+ *
+ * The given search-type keys (in scan->keyData[] or so->arrayKeyData[])
+ * are copied to so->keyData[] with possible transformation.
+ * scan->numberOfKeys is the number of input keys, so->numberOfKeys gets
+ * the number of output keys (possibly less, never greater).
+ *
+ * The output keys are marked with additional sk_flags bits beyond the
+ * system-standard bits supplied by the caller. The DESC and NULLS_FIRST
+ * indoption bits for the relevant index attribute are copied into the flags.
+ * Also, for a DESC column, we commute (flip) all the sk_strategy numbers
+ * so that the index sorts in the desired direction.
+ *
+ * One key purpose of this routine is to discover which scan keys must be
+ * satisfied to continue the scan. It also attempts to eliminate redundant
+ * keys and detect contradictory keys. (If the index opfamily provides
+ * incomplete sets of cross-type operators, we may fail to detect redundant
+ * or contradictory keys, but we can survive that.)
+ *
+ * The output keys must be sorted by index attribute. Presently we expect
+ * (but verify) that the input keys are already so sorted --- this is done
+ * by match_clauses_to_index() in indxpath.c. Some reordering of the keys
+ * within each attribute may be done as a byproduct of the processing here,
+ * but no other code depends on that.
+ *
+ * The output keys are marked with flags SK_BT_REQFWD and/or SK_BT_REQBKWD
+ * if they must be satisfied in order to continue the scan forward or backward
+ * respectively. _bt_checkkeys uses these flags. For example, if the quals
+ * are "x = 1 AND y < 4 AND z < 5", then _bt_checkkeys will reject a tuple
+ * (1,2,7), but we must continue the scan in case there are tuples (1,3,z).
+ * But once we reach tuples like (1,4,z) we can stop scanning because no
+ * later tuples could match. This is reflected by marking the x and y keys,
+ * but not the z key, with SK_BT_REQFWD. In general, the keys for leading
+ * attributes with "=" keys are marked both SK_BT_REQFWD and SK_BT_REQBKWD.
+ * For the first attribute without an "=" key, any "<" and "<=" keys are
+ * marked SK_BT_REQFWD while any ">" and ">=" keys are marked SK_BT_REQBKWD.
+ * This can be seen to be correct by considering the above example. Note
+ * in particular that if there are no keys for a given attribute, the keys for
+ * subsequent attributes can never be required; for instance "WHERE y = 4"
+ * requires a full-index scan.
+ *
+ * If possible, redundant keys are eliminated: we keep only the tightest
+ * >/>= bound and the tightest </<= bound, and if there's an = key then
+ * that's the only one returned. (So, we return either a single = key,
+ * or one or two boundary-condition keys for each attr.) However, if we
+ * cannot compare two keys for lack of a suitable cross-type operator,
+ * we cannot eliminate either. If there are two such keys of the same
+ * operator strategy, the second one is just pushed into the output array
+ * without further processing here. We may also emit both >/>= or both
+ * </<= keys if we can't compare them. The logic about required keys still
+ * works if we don't eliminate redundant keys.
+ *
+ * Note that one reason we need direction-sensitive required-key flags is
+ * precisely that we may not be able to eliminate redundant keys. Suppose
+ * we have "x > 4::int AND x > 10::bigint", and we are unable to determine
+ * which key is more restrictive for lack of a suitable cross-type operator.
+ * _bt_first will arbitrarily pick one of the keys to do the initial
+ * positioning with. If it picks x > 4, then the x > 10 condition will fail
+ * until we reach index entries > 10; but we can't stop the scan just because
+ * x > 10 is failing. On the other hand, if we are scanning backwards, then
+ * failure of either key is indeed enough to stop the scan. (In general, when
+ * inequality keys are present, the initial-positioning code only promises to
+ * position before the first possible match, not exactly at the first match,
+ * for a forward scan; or after the last match for a backward scan.)
+ *
+ * As a byproduct of this work, we can detect contradictory quals such
+ * as "x = 1 AND x > 2". If we see that, we return so->qual_ok = false,
+ * indicating the scan need not be run at all since no tuples can match.
+ * (In this case we do not bother completing the output key array!)
+ * Again, missing cross-type operators might cause us to fail to prove the
+ * quals contradictory when they really are, but the scan will work correctly.
+ *
+ * Row comparison keys are currently also treated without any smarts:
+ * we just transfer them into the preprocessed array without any
+ * editorialization. We can treat them the same as an ordinary inequality
+ * comparison on the row's first index column, for the purposes of the logic
+ * about required keys.
+ *
+ * Note: the reason we have to copy the preprocessed scan keys into private
+ * storage is that we are modifying the array based on comparisons of the
+ * key argument values, which could change on a rescan or after moving to
+ * new elements of array keys. Therefore we can't overwrite the source data.
+ */
+void
+_bt_preprocess_keys(IndexScanDesc scan)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ int numberOfKeys = scan->numberOfKeys;
+ int16 *indoption = scan->indexRelation->rd_indoption;
+ int new_numberOfKeys;
+ int numberOfEqualCols;
+ ScanKey inkeys;
+ ScanKey outkeys;
+ ScanKey cur;
+ ScanKey xform[BTMaxStrategyNumber];
+ bool test_result;
+ int i,
+ j;
+ AttrNumber attno;
+
+ /* initialize result variables */
+ so->qual_ok = true;
+ so->numberOfKeys = 0;
+
+ if (numberOfKeys < 1)
+ return; /* done if qual-less scan */
+
+ /*
+ * Read so->arrayKeyData if array keys are present, else scan->keyData
+ */
+ if (so->arrayKeyData != NULL)
+ inkeys = so->arrayKeyData;
+ else
+ inkeys = scan->keyData;
+
+ outkeys = so->keyData;
+ cur = &inkeys[0];
+ /* we check that input keys are correctly ordered */
+ if (cur->sk_attno < 1)
+ elog(ERROR, "btree index keys must be ordered by attribute");
+
+ /* We can short-circuit most of the work if there's just one key */
+ if (numberOfKeys == 1)
+ {
+ /* Apply indoption to scankey (might change sk_strategy!) */
+ if (!_bt_fix_scankey_strategy(cur, indoption))
+ so->qual_ok = false;
+ memcpy(outkeys, cur, sizeof(ScanKeyData));
+ so->numberOfKeys = 1;
+ /* We can mark the qual as required if it's for first index col */
+ if (cur->sk_attno == 1)
+ _bt_mark_scankey_required(outkeys);
+ return;
+ }
+
+ /*
+ * Otherwise, do the full set of pushups.
+ */
+ new_numberOfKeys = 0;
+ numberOfEqualCols = 0;
+
+ /*
+ * Initialize for processing of keys for attr 1.
+ *
+ * xform[i] points to the currently best scan key of strategy type i+1; it
+ * is NULL if we haven't yet found such a key for this attr.
+ */
+ attno = 1;
+ memset(xform, 0, sizeof(xform));
+
+ /*
+ * Loop iterates from 0 to numberOfKeys inclusive; we use the last pass to
+ * handle after-last-key processing. Actual exit from the loop is at the
+ * "break" statement below.
+ */
+ for (i = 0;; cur++, i++)
+ {
+ if (i < numberOfKeys)
+ {
+ /* Apply indoption to scankey (might change sk_strategy!) */
+ if (!_bt_fix_scankey_strategy(cur, indoption))
+ {
+ /* NULL can't be matched, so give up */
+ so->qual_ok = false;
+ return;
+ }
+ }
+
+ /*
+ * If we are at the end of the keys for a particular attr, finish up
+ * processing and emit the cleaned-up keys.
+ */
+ if (i == numberOfKeys || cur->sk_attno != attno)
+ {
+ int priorNumberOfEqualCols = numberOfEqualCols;
+
+ /* check input keys are correctly ordered */
+ if (i < numberOfKeys && cur->sk_attno < attno)
+ elog(ERROR, "btree index keys must be ordered by attribute");
+
+ /*
+ * If = has been specified, all other keys can be eliminated as
+ * redundant. If we have a case like key = 1 AND key > 2, we can
+ * set qual_ok to false and abandon further processing.
+ *
+ * We also have to deal with the case of "key IS NULL", which is
+ * unsatisfiable in combination with any other index condition. By
+ * the time we get here, that's been classified as an equality
+ * check, and we've rejected any combination of it with a regular
+ * equality condition; but not with other types of conditions.
+ */
+ if (xform[BTEqualStrategyNumber - 1])
+ {
+ ScanKey eq = xform[BTEqualStrategyNumber - 1];
+
+ for (j = BTMaxStrategyNumber; --j >= 0;)
+ {
+ ScanKey chk = xform[j];
+
+ if (!chk || j == (BTEqualStrategyNumber - 1))
+ continue;
+
+ if (eq->sk_flags & SK_SEARCHNULL)
+ {
+ /* IS NULL is contradictory to anything else */
+ so->qual_ok = false;
+ return;
+ }
+
+ if (_bt_compare_scankey_args(scan, chk, eq, chk,
+ &test_result))
+ {
+ if (!test_result)
+ {
+ /* keys proven mutually contradictory */
+ so->qual_ok = false;
+ return;
+ }
+ /* else discard the redundant non-equality key */
+ xform[j] = NULL;
+ }
+ /* else, cannot determine redundancy, keep both keys */
+ }
+ /* track number of attrs for which we have "=" keys */
+ numberOfEqualCols++;
+ }
+
+ /* try to keep only one of <, <= */
+ if (xform[BTLessStrategyNumber - 1]
+ && xform[BTLessEqualStrategyNumber - 1])
+ {
+ ScanKey lt = xform[BTLessStrategyNumber - 1];
+ ScanKey le = xform[BTLessEqualStrategyNumber - 1];
+
+ if (_bt_compare_scankey_args(scan, le, lt, le,
+ &test_result))
+ {
+ if (test_result)
+ xform[BTLessEqualStrategyNumber - 1] = NULL;
+ else
+ xform[BTLessStrategyNumber - 1] = NULL;
+ }
+ }
+
+ /* try to keep only one of >, >= */
+ if (xform[BTGreaterStrategyNumber - 1]
+ && xform[BTGreaterEqualStrategyNumber - 1])
+ {
+ ScanKey gt = xform[BTGreaterStrategyNumber - 1];
+ ScanKey ge = xform[BTGreaterEqualStrategyNumber - 1];
+
+ if (_bt_compare_scankey_args(scan, ge, gt, ge,
+ &test_result))
+ {
+ if (test_result)
+ xform[BTGreaterEqualStrategyNumber - 1] = NULL;
+ else
+ xform[BTGreaterStrategyNumber - 1] = NULL;
+ }
+ }
+
+ /*
+ * Emit the cleaned-up keys into the outkeys[] array, and then
+ * mark them if they are required. They are required (possibly
+ * only in one direction) if all attrs before this one had "=".
+ */
+ for (j = BTMaxStrategyNumber; --j >= 0;)
+ {
+ if (xform[j])
+ {
+ ScanKey outkey = &outkeys[new_numberOfKeys++];
+
+ memcpy(outkey, xform[j], sizeof(ScanKeyData));
+ if (priorNumberOfEqualCols == attno - 1)
+ _bt_mark_scankey_required(outkey);
+ }
+ }
+
+ /*
+ * Exit loop here if done.
+ */
+ if (i == numberOfKeys)
+ break;
+
+ /* Re-initialize for new attno */
+ attno = cur->sk_attno;
+ memset(xform, 0, sizeof(xform));
+ }
+
+ /* check strategy this key's operator corresponds to */
+ j = cur->sk_strategy - 1;
+
+ /* if row comparison, push it directly to the output array */
+ if (cur->sk_flags & SK_ROW_HEADER)
+ {
+ ScanKey outkey = &outkeys[new_numberOfKeys++];
+
+ memcpy(outkey, cur, sizeof(ScanKeyData));
+ if (numberOfEqualCols == attno - 1)
+ _bt_mark_scankey_required(outkey);
+
+ /*
+ * We don't support RowCompare using equality; such a qual would
+ * mess up the numberOfEqualCols tracking.
+ */
+ Assert(j != (BTEqualStrategyNumber - 1));
+ continue;
+ }
+
+ /* have we seen one of these before? */
+ if (xform[j] == NULL)
+ {
+ /* nope, so remember this scankey */
+ xform[j] = cur;
+ }
+ else
+ {
+ /* yup, keep only the more restrictive key */
+ if (_bt_compare_scankey_args(scan, cur, cur, xform[j],
+ &test_result))
+ {
+ if (test_result)
+ xform[j] = cur;
+ else if (j == (BTEqualStrategyNumber - 1))
+ {
+ /* key == a && key == b, but a != b */
+ so->qual_ok = false;
+ return;
+ }
+ /* else old key is more restrictive, keep it */
+ }
+ else
+ {
+ /*
+ * We can't determine which key is more restrictive. Keep the
+ * previous one in xform[j] and push this one directly to the
+ * output array.
+ */
+ ScanKey outkey = &outkeys[new_numberOfKeys++];
+
+ memcpy(outkey, cur, sizeof(ScanKeyData));
+ if (numberOfEqualCols == attno - 1)
+ _bt_mark_scankey_required(outkey);
+ }
+ }
+ }
+
+ so->numberOfKeys = new_numberOfKeys;
+}
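+
+/*
+ * For example (hypothetical quals): given "x > 4 AND x > 10 AND x < 20 AND
+ * x = 15" on a single int4 index column, the per-key loop first reduces the
+ * two ">" keys to the tighter "x > 10", and the end-of-attribute pass then
+ * discards both remaining inequalities as redundant against "x = 15" (since
+ * 15 > 10 and 15 < 20 hold), leaving a single output key marked
+ * SK_BT_REQFWD | SK_BT_REQBKWD. Given "x = 15 AND x > 20" instead, the same
+ * comparison proves the keys contradictory and so->qual_ok is set to false.
+ */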
+
+/*
+ * Compare two scankey values using a specified operator.
+ *
+ * The test we want to perform is logically "leftarg op rightarg", where
+ * leftarg and rightarg are the sk_argument values in those ScanKeys, and
+ * the comparison operator is the one in the op ScanKey. However, in
+ * cross-data-type situations we may need to look up the correct operator in
+ * the index's opfamily: it is the one having amopstrategy = op->sk_strategy
+ * and amoplefttype/amoprighttype equal to the two argument datatypes.
+ *
+ * If the opfamily doesn't supply a complete set of cross-type operators we
+ * may not be able to make the comparison. If we can make the comparison
+ * we store the operator result in *result and return true. We return false
+ * if the comparison could not be made.
+ *
+ * Note: op always points at the same ScanKey as either leftarg or rightarg.
+ * Since we don't scribble on the scankeys, this aliasing should cause no
+ * trouble.
+ *
+ * Note: this routine needs to be insensitive to any DESC option applied
+ * to the index column. For example, "x < 4" is a tighter constraint than
+ * "x < 5" regardless of which way the index is sorted.
+ */
+static bool
+_bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
+ ScanKey leftarg, ScanKey rightarg,
+ bool *result)
+{
+ Relation rel = scan->indexRelation;
+ Oid lefttype,
+ righttype,
+ optype,
+ opcintype,
+ cmp_op;
+ StrategyNumber strat;
+
+ /*
+ * First, deal with cases where one or both args are NULL. This should
+ * only happen when the scankeys represent IS NULL/NOT NULL conditions.
+ */
+ if ((leftarg->sk_flags | rightarg->sk_flags) & SK_ISNULL)
+ {
+ bool leftnull,
+ rightnull;
+
+ if (leftarg->sk_flags & SK_ISNULL)
+ {
+ Assert(leftarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL));
+ leftnull = true;
+ }
+ else
+ leftnull = false;
+ if (rightarg->sk_flags & SK_ISNULL)
+ {
+ Assert(rightarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL));
+ rightnull = true;
+ }
+ else
+ rightnull = false;
+
+ /*
+ * We treat NULL as either greater than or less than all other values.
+ * Since true > false, the tests below work correctly for NULLS LAST
+ * logic. If the index is NULLS FIRST, we need to flip the strategy.
+ */
+ strat = op->sk_strategy;
+ if (op->sk_flags & SK_BT_NULLS_FIRST)
+ strat = BTCommuteStrategyNumber(strat);
+
+ switch (strat)
+ {
+ case BTLessStrategyNumber:
+ *result = (leftnull < rightnull);
+ break;
+ case BTLessEqualStrategyNumber:
+ *result = (leftnull <= rightnull);
+ break;
+ case BTEqualStrategyNumber:
+ *result = (leftnull == rightnull);
+ break;
+ case BTGreaterEqualStrategyNumber:
+ *result = (leftnull >= rightnull);
+ break;
+ case BTGreaterStrategyNumber:
+ *result = (leftnull > rightnull);
+ break;
+ default:
+ elog(ERROR, "unrecognized StrategyNumber: %d", (int) strat);
+ *result = false; /* keep compiler quiet */
+ break;
+ }
+ return true;
+ }
+
+ /*
+ * The opfamily we need to worry about is identified by the index column.
+ */
+ Assert(leftarg->sk_attno == rightarg->sk_attno);
+
+ opcintype = rel->rd_opcintype[leftarg->sk_attno - 1];
+
+ /*
+ * Determine the actual datatypes of the ScanKey arguments. We have to
+ * support the convention that sk_subtype == InvalidOid means the opclass
+ * input type; this is a hack to simplify life for ScanKeyInit().
+ */
+ lefttype = leftarg->sk_subtype;
+ if (lefttype == InvalidOid)
+ lefttype = opcintype;
+ righttype = rightarg->sk_subtype;
+ if (righttype == InvalidOid)
+ righttype = opcintype;
+ optype = op->sk_subtype;
+ if (optype == InvalidOid)
+ optype = opcintype;
+
+ /*
+ * If leftarg and rightarg match the types expected for the "op" scankey,
+ * we can use its already-looked-up comparison function.
+ */
+ if (lefttype == opcintype && righttype == optype)
+ {
+ *result = DatumGetBool(FunctionCall2Coll(&op->sk_func,
+ op->sk_collation,
+ leftarg->sk_argument,
+ rightarg->sk_argument));
+ return true;
+ }
+
+ /*
+ * Otherwise, we need to go to the syscache to find the appropriate
+ * operator. (This cannot result in infinite recursion, since no
+ * indexscan initiated by syscache lookup will use cross-data-type
+ * operators.)
+ *
+ * If the sk_strategy was flipped by _bt_fix_scankey_strategy, we have to
+ * un-flip it to get the correct opfamily member.
+ */
+ strat = op->sk_strategy;
+ if (op->sk_flags & SK_BT_DESC)
+ strat = BTCommuteStrategyNumber(strat);
+
+ cmp_op = get_opfamily_member(rel->rd_opfamily[leftarg->sk_attno - 1],
+ lefttype,
+ righttype,
+ strat);
+ if (OidIsValid(cmp_op))
+ {
+ RegProcedure cmp_proc = get_opcode(cmp_op);
+
+ if (RegProcedureIsValid(cmp_proc))
+ {
+ *result = DatumGetBool(OidFunctionCall2Coll(cmp_proc,
+ op->sk_collation,
+ leftarg->sk_argument,
+ rightarg->sk_argument));
+ return true;
+ }
+ }
+
+ /* Can't make the comparison */
+ *result = false; /* suppress compiler warnings */
+ return false;
+}
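+
+/*
+ * Cross-type illustration (hypothetical quals): on a bigint column,
+ * deciding whether "x <= 10::int8" is redundant against "x < 4::int4"
+ * requires evaluating "4::int4 <= 10::int8". The int4 argument doesn't
+ * match the opclass input type, so we fall through to the
+ * get_opfamily_member() lookup for the cross-type "<=" operator (which
+ * integer_ops provides). In an opfamily lacking that member we would
+ * return false here, and the caller would simply keep both keys.
+ */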
+
+/*
+ * Adjust a scankey's strategy and flags setting as needed for indoptions.
+ *
+ * We copy the appropriate indoption value into the scankey sk_flags
+ * (shifting to avoid clobbering system-defined flag bits). Also, if
+ * the DESC option is set, commute (flip) the operator strategy number.
+ *
+ * A secondary purpose is to check for IS NULL/NOT NULL scankeys and set up
+ * the strategy field correctly for them.
+ *
+ * Lastly, for ordinary scankeys (not IS NULL/NOT NULL), we check for a
+ * NULL comparison value. Since all btree operators are assumed strict,
+ * a NULL means that the qual cannot be satisfied. We return true if the
+ * comparison value isn't NULL, or false if the scan should be abandoned.
+ *
+ * This function is applied to the *input* scankey structure; therefore
+ * on a rescan we will be looking at already-processed scankeys. Hence
+ * we have to be careful not to re-commute the strategy if we already did it.
+ * It's a bit ugly to modify the caller's copy of the scankey but in practice
+ * there shouldn't be any problem, since the index's indoptions are certainly
+ * not going to change while the scankey survives.
+ */
+static bool
+_bt_fix_scankey_strategy(ScanKey skey, int16 *indoption)
+{
+ int addflags;
+
+ addflags = indoption[skey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT;
+
+ /*
+ * We treat all btree operators as strict (even if they're not so marked
+ * in pg_proc). This means that it is impossible for an operator condition
+ * with a NULL comparison constant to succeed, and we can reject it right
+ * away.
+ *
+ * However, we now also support "x IS NULL" clauses as search conditions,
+ * so in that case keep going. The planner has not filled in any
+ * particular strategy in this case, so set it to BTEqualStrategyNumber
+ * --- we can treat IS NULL as an equality operator for purposes of search
+ * strategy.
+ *
+ * Likewise, "x IS NOT NULL" is supported. We treat that as either "less
+ * than NULL" in a NULLS LAST index, or "greater than NULL" in a NULLS
+ * FIRST index.
+ *
+ * Note: someday we might have to fill in sk_collation from the index
+ * column's collation. At the moment this is a non-issue because we'll
+ * never actually call the comparison operator on a NULL.
+ */
+ if (skey->sk_flags & SK_ISNULL)
+ {
+ /* SK_ISNULL shouldn't be set in a row header scankey */
+ Assert(!(skey->sk_flags & SK_ROW_HEADER));
+
+ /* Set indoption flags in scankey (might be done already) */
+ skey->sk_flags |= addflags;
+
+ /* Set correct strategy for IS NULL or NOT NULL search */
+ if (skey->sk_flags & SK_SEARCHNULL)
+ {
+ skey->sk_strategy = BTEqualStrategyNumber;
+ skey->sk_subtype = InvalidOid;
+ skey->sk_collation = InvalidOid;
+ }
+ else if (skey->sk_flags & SK_SEARCHNOTNULL)
+ {
+ if (skey->sk_flags & SK_BT_NULLS_FIRST)
+ skey->sk_strategy = BTGreaterStrategyNumber;
+ else
+ skey->sk_strategy = BTLessStrategyNumber;
+ skey->sk_subtype = InvalidOid;
+ skey->sk_collation = InvalidOid;
+ }
+ else
+ {
+ /* regular qual, so it cannot be satisfied */
+ return false;
+ }
+
+ /* Needn't do the rest */
+ return true;
+ }
+
+ /* Adjust strategy for DESC, if we didn't already */
+ if ((addflags & SK_BT_DESC) && !(skey->sk_flags & SK_BT_DESC))
+ skey->sk_strategy = BTCommuteStrategyNumber(skey->sk_strategy);
+ skey->sk_flags |= addflags;
+
+ /* If it's a row header, fix row member flags and strategies similarly */
+ if (skey->sk_flags & SK_ROW_HEADER)
+ {
+ ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
+
+ for (;;)
+ {
+ Assert(subkey->sk_flags & SK_ROW_MEMBER);
+ addflags = indoption[subkey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT;
+ if ((addflags & SK_BT_DESC) && !(subkey->sk_flags & SK_BT_DESC))
+ subkey->sk_strategy = BTCommuteStrategyNumber(subkey->sk_strategy);
+ subkey->sk_flags |= addflags;
+ if (subkey->sk_flags & SK_ROW_END)
+ break;
+ subkey++;
+ }
+ }
+
+ return true;
+}
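+
+/*
+ * For example (illustration): with an index column declared "x DESC NULLS
+ * LAST", a qual "x < 10" arrives with BTLessStrategyNumber; we add
+ * SK_BT_DESC to sk_flags and commute the strategy to
+ * BTGreaterStrategyNumber, so later code can reason purely in terms of the
+ * index's physical sort order. An "x IS NOT NULL" qual on the same column
+ * becomes a BTLessStrategyNumber key ("less than NULL"), since NULLs sort
+ * last there.
+ */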
+
+/*
+ * Mark a scankey as "required to continue the scan".
+ *
+ * Depending on the operator type, the key may be required for both scan
+ * directions or just one. Also, if the key is a row comparison header,
+ * we have to mark its first subsidiary ScanKey as required. (Subsequent
+ * subsidiary ScanKeys are normally for lower-order columns, and thus
+ * cannot be required, since they're after the first non-equality scankey.)
+ *
+ * Note: when we set required-key flag bits in a subsidiary scankey, we are
+ * scribbling on a data structure belonging to the index AM's caller, not on
+ * our private copy. This should be OK because the marking will not change
+ * from scan to scan within a query, and so we'd just re-mark the same way
+ * anyway on a rescan. Something to keep an eye on though.
+ */
+static void
+_bt_mark_scankey_required(ScanKey skey)
+{
+ int addflags;
+
+ switch (skey->sk_strategy)
+ {
+ case BTLessStrategyNumber:
+ case BTLessEqualStrategyNumber:
+ addflags = SK_BT_REQFWD;
+ break;
+ case BTEqualStrategyNumber:
+ addflags = SK_BT_REQFWD | SK_BT_REQBKWD;
+ break;
+ case BTGreaterEqualStrategyNumber:
+ case BTGreaterStrategyNumber:
+ addflags = SK_BT_REQBKWD;
+ break;
+ default:
+ elog(ERROR, "unrecognized StrategyNumber: %d",
+ (int) skey->sk_strategy);
+ addflags = 0; /* keep compiler quiet */
+ break;
+ }
+
+ skey->sk_flags |= addflags;
+
+ if (skey->sk_flags & SK_ROW_HEADER)
+ {
+ ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
+
+ /* First subkey should be same column/operator as the header */
+ Assert(subkey->sk_flags & SK_ROW_MEMBER);
+ Assert(subkey->sk_attno == skey->sk_attno);
+ Assert(subkey->sk_strategy == skey->sk_strategy);
+ subkey->sk_flags |= addflags;
+ }
+}
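+
+/*
+ * For example (hypothetical qual): marking "(a, b) >= (5, 10)" as required
+ * sets SK_BT_REQBKWD on the row header and on its first member (the one for
+ * column a) only; the member for column b is left unmarked, since b lies
+ * beyond the first non-equality column and so can never be required.
+ */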
+
+/*
+ * Test whether an indextuple satisfies all the scankey conditions.
+ *
+ * Return true if so, false if not. If the tuple fails to pass the qual,
+ * we also determine whether there's any need to continue the scan beyond
+ * this tuple, and set *continuescan accordingly. See comments for
+ * _bt_preprocess_keys(), above, about how this is done.
+ *
+ * Forward scan callers can pass a high key tuple in the hopes of having
+ * us set *continuescan to false, and avoiding an unnecessary visit to
+ * the page to the right.
+ *
+ * scan: index scan descriptor (containing a search-type scankey)
+ * tuple: index tuple to test
+ * tupnatts: number of attributes in tuple (high key may be truncated)
+ * dir: direction we are scanning in
+ * continuescan: output parameter (will be set correctly in all cases)
+ */
+bool
+_bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
+ ScanDirection dir, bool *continuescan)
+{
+ TupleDesc tupdesc;
+ BTScanOpaque so;
+ int keysz;
+ int ikey;
+ ScanKey key;
+
+ Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts);
+
+ *continuescan = true; /* default assumption */
+
+ tupdesc = RelationGetDescr(scan->indexRelation);
+ so = (BTScanOpaque) scan->opaque;
+ keysz = so->numberOfKeys;
+
+ for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++)
+ {
+ Datum datum;
+ bool isNull;
+ Datum test;
+
+ if (key->sk_attno > tupnatts)
+ {
+ /*
+ * This attribute is truncated (must be high key). The value for
+ * this attribute in the first non-pivot tuple on the page to the
+ * right could be any possible value. Assume that truncated
+ * attribute passes the qual.
+ */
+ Assert(ScanDirectionIsForward(dir));
+ Assert(BTreeTupleIsPivot(tuple));
+ continue;
+ }
+
+ /* row-comparison keys need special processing */
+ if (key->sk_flags & SK_ROW_HEADER)
+ {
+ if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir,
+ continuescan))
+ continue;
+ return false;
+ }
+
+ datum = index_getattr(tuple,
+ key->sk_attno,
+ tupdesc,
+ &isNull);
+
+ if (key->sk_flags & SK_ISNULL)
+ {
+ /* Handle IS NULL/NOT NULL tests */
+ if (key->sk_flags & SK_SEARCHNULL)
+ {
+ if (isNull)
+ continue; /* tuple satisfies this qual */
+ }
+ else
+ {
+ Assert(key->sk_flags & SK_SEARCHNOTNULL);
+ if (!isNull)
+ continue; /* tuple satisfies this qual */
+ }
+
+ /*
+ * Tuple fails this qual. If it's a required qual for the current
+ * scan direction, then we can conclude no further tuples will
+ * pass, either.
+ */
+ if ((key->sk_flags & SK_BT_REQFWD) &&
+ ScanDirectionIsForward(dir))
+ *continuescan = false;
+ else if ((key->sk_flags & SK_BT_REQBKWD) &&
+ ScanDirectionIsBackward(dir))
+ *continuescan = false;
+
+ /*
+ * In any case, this indextuple doesn't match the qual.
+ */
+ return false;
+ }
+
+ if (isNull)
+ {
+ if (key->sk_flags & SK_BT_NULLS_FIRST)
+ {
+ /*
+ * Since NULLs are sorted before non-NULLs, we know we have
+ * reached the lower limit of the range of values for this
+ * index attr. On a backward scan, we can stop if this qual
+ * is one of the "must match" subset. We can stop regardless
+ * of whether the qual is > or <, so long as it's required,
+ * because it's not possible for any future tuples to pass. On
+ * a forward scan, however, we must keep going, because we may
+ * have initially positioned to the start of the index.
+ */
+ if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
+ ScanDirectionIsBackward(dir))
+ *continuescan = false;
+ }
+ else
+ {
+ /*
+ * Since NULLs are sorted after non-NULLs, we know we have
+ * reached the upper limit of the range of values for this
+ * index attr. On a forward scan, we can stop if this qual is
+ * one of the "must match" subset. We can stop regardless of
+ * whether the qual is > or <, so long as it's required,
+ * because it's not possible for any future tuples to pass. On
+ * a backward scan, however, we must keep going, because we
+ * may have initially positioned to the end of the index.
+ */
+ if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
+ ScanDirectionIsForward(dir))
+ *continuescan = false;
+ }
+
+ /*
+ * In any case, this indextuple doesn't match the qual.
+ */
+ return false;
+ }
+
+ test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
+ datum, key->sk_argument);
+
+ if (!DatumGetBool(test))
+ {
+ /*
+ * Tuple fails this qual. If it's a required qual for the current
+ * scan direction, then we can conclude no further tuples will
+ * pass, either.
+ *
+ * Note: because we stop the scan as soon as any required equality
+ * qual fails, it is critical that equality quals be used for the
+ * initial positioning in _bt_first() when they are available. See
+ * comments in _bt_first().
+ */
+ if ((key->sk_flags & SK_BT_REQFWD) &&
+ ScanDirectionIsForward(dir))
+ *continuescan = false;
+ else if ((key->sk_flags & SK_BT_REQBKWD) &&
+ ScanDirectionIsBackward(dir))
+ *continuescan = false;
+
+ /*
+ * In any case, this indextuple doesn't match the qual.
+ */
+ return false;
+ }
+ }
+
+ /* If we get here, the tuple passes all index quals. */
+ return true;
+}
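+
+/*
+ * Example of the high key usage mentioned above (hypothetical values): with
+ * the required forward-scan qual "x < 100", a forward scan that has
+ * exhausted the current leaf page can test the page's high key, say
+ * x = 250, before stepping right. The high key fails the required qual, so
+ * *continuescan is set to false and the scan ends without visiting the
+ * right sibling. Truncated high key attributes are simply assumed to pass,
+ * so suffix truncation never ends a scan too early.
+ */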
+
+/*
+ * Test whether an indextuple satisfies a row-comparison scan condition.
+ *
+ * Return true if so, false if not. If not, also clear *continuescan if
+ * it's not possible for any future tuples in the current scan direction
+ * to pass the qual.
+ *
+ * This is a subroutine for _bt_checkkeys, which see for more info.
+ */
+static bool
+_bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
+ TupleDesc tupdesc, ScanDirection dir, bool *continuescan)
+{
+ ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
+ int32 cmpresult = 0;
+ bool result;
+
+ /* First subkey should be same as the header says */
+ Assert(subkey->sk_attno == skey->sk_attno);
+
+ /* Loop over columns of the row condition */
+ for (;;)
+ {
+ Datum datum;
+ bool isNull;
+
+ Assert(subkey->sk_flags & SK_ROW_MEMBER);
+
+ if (subkey->sk_attno > tupnatts)
+ {
+ /*
+ * This attribute is truncated (must be high key). The value for
+ * this attribute in the first non-pivot tuple on the page to the
+ * right could be any possible value. Assume that truncated
+ * attribute passes the qual.
+ */
+ Assert(ScanDirectionIsForward(dir));
+ Assert(BTreeTupleIsPivot(tuple));
+ cmpresult = 0;
+ if (subkey->sk_flags & SK_ROW_END)
+ break;
+ subkey++;
+ continue;
+ }
+
+ datum = index_getattr(tuple,
+ subkey->sk_attno,
+ tupdesc,
+ &isNull);
+
+ if (isNull)
+ {
+ if (subkey->sk_flags & SK_BT_NULLS_FIRST)
+ {
+ /*
+ * Since NULLs are sorted before non-NULLs, we know we have
+ * reached the lower limit of the range of values for this
+ * index attr. On a backward scan, we can stop if this qual
+ * is one of the "must match" subset. We can stop regardless
+ * of whether the qual is > or <, so long as it's required,
+ * because it's not possible for any future tuples to pass. On
+ * a forward scan, however, we must keep going, because we may
+ * have initially positioned to the start of the index.
+ */
+ if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
+ ScanDirectionIsBackward(dir))
+ *continuescan = false;
+ }
+ else
+ {
+ /*
+ * Since NULLs are sorted after non-NULLs, we know we have
+ * reached the upper limit of the range of values for this
+ * index attr. On a forward scan, we can stop if this qual is
+ * one of the "must match" subset. We can stop regardless of
+ * whether the qual is > or <, so long as it's required,
+ * because it's not possible for any future tuples to pass. On
+ * a backward scan, however, we must keep going, because we
+ * may have initially positioned to the end of the index.
+ */
+ if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
+ ScanDirectionIsForward(dir))
+ *continuescan = false;
+ }
+
+ /*
+ * In any case, this indextuple doesn't match the qual.
+ */
+ return false;
+ }
+
+ if (subkey->sk_flags & SK_ISNULL)
+ {
+ /*
+ * Unlike the simple-scankey case, this isn't a disallowed case.
+ * But it can never match. If all the earlier row comparison
+ * columns are required for the scan direction, we can stop the
+ * scan, because there can't be another tuple that will succeed.
+ */
+ if (subkey != (ScanKey) DatumGetPointer(skey->sk_argument))
+ subkey--;
+ if ((subkey->sk_flags & SK_BT_REQFWD) &&
+ ScanDirectionIsForward(dir))
+ *continuescan = false;
+ else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
+ ScanDirectionIsBackward(dir))
+ *continuescan = false;
+ return false;
+ }
+
+ /* Perform the test --- three-way comparison not bool operator */
+ cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func,
+ subkey->sk_collation,
+ datum,
+ subkey->sk_argument));
+
+ if (subkey->sk_flags & SK_BT_DESC)
+ INVERT_COMPARE_RESULT(cmpresult);
+
+ /* Done comparing if unequal, else advance to next column */
+ if (cmpresult != 0)
+ break;
+
+ if (subkey->sk_flags & SK_ROW_END)
+ break;
+ subkey++;
+ }
+
+ /*
+ * At this point cmpresult indicates the overall result of the row
+ * comparison, and subkey points to the deciding column (or the last
+ * column if the result is "=").
+ */
+ switch (subkey->sk_strategy)
+ {
+ /* EQ and NE cases aren't allowed here */
+ case BTLessStrategyNumber:
+ result = (cmpresult < 0);
+ break;
+ case BTLessEqualStrategyNumber:
+ result = (cmpresult <= 0);
+ break;
+ case BTGreaterEqualStrategyNumber:
+ result = (cmpresult >= 0);
+ break;
+ case BTGreaterStrategyNumber:
+ result = (cmpresult > 0);
+ break;
+ default:
+ elog(ERROR, "unrecognized RowCompareType: %d",
+ (int) subkey->sk_strategy);
+ result = false; /* keep compiler quiet */
+ break;
+ }
+
+ if (!result)
+ {
+ /*
+ * Tuple fails this qual. If it's a required qual for the current
+ * scan direction, then we can conclude no further tuples will pass,
+ * either. Note we have to look at the deciding column, not
+ * necessarily the first or last column of the row condition.
+ */
+ if ((subkey->sk_flags & SK_BT_REQFWD) &&
+ ScanDirectionIsForward(dir))
+ *continuescan = false;
+ else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
+ ScanDirectionIsBackward(dir))
+ *continuescan = false;
+ }
+
+ return result;
+}
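+
+/*
+ * Worked example (hypothetical qual): testing "(a, b) > (1, 10)" against an
+ * index tuple with a = 1 and b = 5, column a compares equal so we advance;
+ * column b yields cmpresult < 0 and the ">" check fails, so the tuple is
+ * rejected. Because only the first row member (for column a) can carry
+ * required-key flags, *continuescan stays true here; a later tuple such as
+ * (1, 11) could still pass.
+ */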
+
+/*
+ * _bt_killitems - set LP_DEAD state for items an indexscan caller has
+ * told us were killed
+ *
+ * scan->opaque, referenced locally through so, contains information about the
+ * current page and killed tuples thereon (generally, this should only be
+ * called if so->numKilled > 0).
+ *
+ * The caller does not have a lock on the page and may or may not have the
+ * page pinned in a buffer. Note that read-lock is sufficient for setting
+ * LP_DEAD status (which is only a hint).
+ *
+ * We match items by heap TID before assuming they are the right ones to
+ * delete. We cope with cases where items have moved right due to insertions.
+ * If an item has moved off the current page due to a split, we'll fail to
+ * find it and do nothing (this is not an error case --- we assume the item
+ * will eventually get marked in a future indexscan).
+ *
+ * Note that if we hold a pin on the target page continuously from initially
+ * reading the items until applying this function, VACUUM cannot have deleted
+ * any items from the page, and so there is no need to search left from the
+ * recorded offset. (This observation also guarantees that the item is still
+ * the right one to delete, which might otherwise be questionable since heap
+ * TIDs can get recycled.) This holds true even if the page has been modified
+ * by inserts and page splits, so there is no need to consult the LSN.
+ *
+ * If the pin was released after reading the page, then we re-read it. If it
+ * has been modified since we read it (as determined by the LSN), we dare not
+ * flag any entries because it is possible that the old entry was vacuumed
+ * away and the TID was re-used by a completely different heap tuple.
+ */
+void
+_bt_killitems(IndexScanDesc scan)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber minoff;
+ OffsetNumber maxoff;
+ int i;
+ int numKilled = so->numKilled;
+ bool killedsomething = false;
+ bool droppedpin PG_USED_FOR_ASSERTS_ONLY;
+
+ Assert(BTScanPosIsValid(so->currPos));
+
+ /*
+ * Always reset the scan state, so we don't look for same items on other
+ * pages.
+ */
+ so->numKilled = 0;
+
+ if (BTScanPosIsPinned(so->currPos))
+ {
+ /*
+ * We have held the pin on this page since we read the index tuples,
+ * so all we need to do is lock it. The pin will have prevented
+ * re-use of any TID on the page, so there is no need to check the
+ * LSN.
+ */
+ droppedpin = false;
+ _bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
+
+ page = BufferGetPage(so->currPos.buf);
+ }
+ else
+ {
+ Buffer buf;
+
+ droppedpin = true;
+ /* Attempt to re-read the buffer, getting pin and lock. */
+ buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ);
+
+ page = BufferGetPage(buf);
+ if (BufferGetLSNAtomic(buf) == so->currPos.lsn)
+ so->currPos.buf = buf;
+ else
+ {
+ /* Modified while not pinned means hinting is not safe. */
+ _bt_relbuf(scan->indexRelation, buf);
+ return;
+ }
+ }
+
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ for (i = 0; i < numKilled; i++)
+ {
+ int itemIndex = so->killedItems[i];
+ BTScanPosItem *kitem = &so->currPos.items[itemIndex];
+ OffsetNumber offnum = kitem->indexOffset;
+
+ Assert(itemIndex >= so->currPos.firstItem &&
+ itemIndex <= so->currPos.lastItem);
+ if (offnum < minoff)
+ continue; /* pure paranoia */
+ while (offnum <= maxoff)
+ {
+ ItemId iid = PageGetItemId(page, offnum);
+ IndexTuple ituple = (IndexTuple) PageGetItem(page, iid);
+ bool killtuple = false;
+
+ if (BTreeTupleIsPosting(ituple))
+ {
+ int pi = i + 1;
+ int nposting = BTreeTupleGetNPosting(ituple);
+ int j;
+
+ /*
+ * We rely on the convention that heap TIDs in the scanpos
+ * items array are stored in ascending heap TID order for a
+ * group of TIDs that originally came from a posting list
+ * tuple. This convention even applies during backwards
+ * scans, where returning the TIDs in descending order might
+ * seem more natural. This is about effectiveness, not
+ * correctness.
+ *
+ * Note that the page may have been modified in almost any way
+ * since we first read it (in the !droppedpin case), so it's
+ * possible that this posting list tuple wasn't a posting list
+ * tuple when we first encountered its heap TIDs.
+ */
+ for (j = 0; j < nposting; j++)
+ {
+ ItemPointer item = BTreeTupleGetPostingN(ituple, j);
+
+ if (!ItemPointerEquals(item, &kitem->heapTid))
+ break; /* out of posting list loop */
+
+ /*
+ * kitem must have matching offnum when heap TIDs match,
+ * though only in the common case where the page can't
+ * have been concurrently modified
+ */
+ Assert(kitem->indexOffset == offnum || !droppedpin);
+
+ /*
+ * Read-ahead to later kitems here.
+ *
+ * We rely on the assumption that not advancing kitem here
+ * will prevent us from considering the posting list tuple
+ * fully dead by not matching its next heap TID in the next
+ * loop iteration.
+ *
+ * If, on the other hand, this is the final heap TID in
+ * the posting list tuple, then tuple gets killed
+ * regardless (i.e. we handle the case where the last
+ * kitem is also the last heap TID in the last index tuple
+ * correctly -- posting tuple still gets killed).
+ */
+ if (pi < numKilled)
+ kitem = &so->currPos.items[so->killedItems[pi++]];
+ }
+
+ /*
+ * Don't bother advancing the outermost loop's int iterator to
+ * avoid processing killed items that relate to the same
+ * offnum/posting list tuple. This micro-optimization hardly
+ * seems worth it. (Further iterations of the outermost loop
+ * will fail to match on this same posting list's first heap
+ * TID instead, so we'll advance to the next offnum/index
+ * tuple pretty quickly.)
+ */
+ if (j == nposting)
+ killtuple = true;
+ }
+ else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
+ killtuple = true;
+
+ /*
+ * Mark index item as dead, if it isn't already. Since this
+ * happens while holding a buffer lock possibly in shared mode,
+ * it's possible that multiple processes attempt to do this
+ * simultaneously, leading to multiple full-page images being sent
+ * to WAL (if wal_log_hints or data checksums are enabled), which
+ * is undesirable.
+ */
+ if (killtuple && !ItemIdIsDead(iid))
+ {
+ /* found the item/all posting list items */
+ ItemIdMarkDead(iid);
+ killedsomething = true;
+ break; /* out of inner search loop */
+ }
+ offnum = OffsetNumberNext(offnum);
+ }
+ }
+
+ /*
+ * Since this can be redone later if needed, mark as dirty hint.
+ *
+ * Whenever we mark anything LP_DEAD, we also set the page's
+ * BTP_HAS_GARBAGE flag, which is likewise just a hint. (Note that we
+ * only rely on the page-level flag in !heapkeyspace indexes.)
+ */
+ if (killedsomething)
+ {
+ opaque->btpo_flags |= BTP_HAS_GARBAGE;
+ MarkBufferDirtyHint(so->currPos.buf, true);
+ }
+
+ _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+}
+
+
+/*
+ * The following routines manage a shared-memory area in which we track
+ * assignment of "vacuum cycle IDs" to currently-active btree vacuuming
+ * operations. There is a single counter which increments each time we
+ * start a vacuum to assign it a cycle ID. Since multiple vacuums could
+ * be active concurrently, we have to track the cycle ID for each active
+ * vacuum; this requires at most MaxBackends entries (usually far fewer).
+ * We assume at most one vacuum can be active for a given index.
+ *
+ * Access to the shared memory area is controlled by BtreeVacuumLock.
+ * In principle we could use a separate lmgr locktag for each index,
+ * but a single LWLock is much cheaper, and given the short time that
+ * the lock is ever held, the concurrency hit should be minimal.
+ */
+
+typedef struct BTOneVacInfo
+{
+ LockRelId relid; /* global identifier of an index */
+ BTCycleId cycleid; /* cycle ID for its active VACUUM */
+} BTOneVacInfo;
+
+typedef struct BTVacInfo
+{
+ BTCycleId cycle_ctr; /* cycle ID most recently assigned */
+ int num_vacuums; /* number of currently active VACUUMs */
+ int max_vacuums; /* allocated length of vacuums[] array */
+ BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER];
+} BTVacInfo;
+
+static BTVacInfo *btvacinfo;
+
+
+/*
+ * _bt_vacuum_cycleid --- get the active vacuum cycle ID for an index,
+ * or zero if there is no active VACUUM
+ *
+ * Note: for correct interlocking, the caller must already hold pin and
+ * exclusive lock on each buffer it will store the cycle ID into. This
+ * ensures that even if a VACUUM starts immediately afterwards, it cannot
+ * process those pages until the page split is complete.
+ */
+BTCycleId
+_bt_vacuum_cycleid(Relation rel)
+{
+ BTCycleId result = 0;
+ int i;
+
+ /* Share lock is enough since this is a read-only operation */
+ LWLockAcquire(BtreeVacuumLock, LW_SHARED);
+
+ for (i = 0; i < btvacinfo->num_vacuums; i++)
+ {
+ BTOneVacInfo *vac = &btvacinfo->vacuums[i];
+
+ if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
+ vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
+ {
+ result = vac->cycleid;
+ break;
+ }
+ }
+
+ LWLockRelease(BtreeVacuumLock);
+ return result;
+}
+
+/*
+ * _bt_start_vacuum --- assign a cycle ID to a just-starting VACUUM operation
+ *
+ * Note: the caller must guarantee that it will eventually call
+ * _bt_end_vacuum, else we'll permanently leak an array slot. To ensure
+ * that this happens even in elog(FATAL) scenarios, the appropriate coding
+ * is not just a PG_TRY, but
+ * PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel))
+ */
+BTCycleId
+_bt_start_vacuum(Relation rel)
+{
+ BTCycleId result;
+ int i;
+ BTOneVacInfo *vac;
+
+ LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
+
+ /*
+ * Assign the next cycle ID, being careful to avoid zero as well as the
+ * reserved high values.
+ */
+ result = ++(btvacinfo->cycle_ctr);
+ if (result == 0 || result > MAX_BT_CYCLE_ID)
+ result = btvacinfo->cycle_ctr = 1;
+
+ /* Let's just make sure there's no entry already for this index */
+ for (i = 0; i < btvacinfo->num_vacuums; i++)
+ {
+ vac = &btvacinfo->vacuums[i];
+ if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
+ vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
+ {
+ /*
+ * Unlike most places in the backend, we have to explicitly
+ * release our LWLock before throwing an error. This is because
+ * we expect _bt_end_vacuum() to be called before transaction
+ * abort cleanup can run to release LWLocks.
+ */
+ LWLockRelease(BtreeVacuumLock);
+ elog(ERROR, "multiple active vacuums for index \"%s\"",
+ RelationGetRelationName(rel));
+ }
+ }
+
+ /* OK, add an entry */
+ if (btvacinfo->num_vacuums >= btvacinfo->max_vacuums)
+ {
+ LWLockRelease(BtreeVacuumLock);
+ elog(ERROR, "out of btvacinfo slots");
+ }
+ vac = &btvacinfo->vacuums[btvacinfo->num_vacuums];
+ vac->relid = rel->rd_lockInfo.lockRelId;
+ vac->cycleid = result;
+ btvacinfo->num_vacuums++;
+
+ LWLockRelease(BtreeVacuumLock);
+ return result;
+}
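+
+/*
+ * Usage sketch of the cleanup pattern described above (illustration only,
+ * guarded out of the build; example_vacuum_with_cycleid is a hypothetical
+ * caller and assumes storage/ipc.h is included for the ENSURE macros):
+ */
+#ifdef NOT_USED
+static void
+example_vacuum_with_cycleid(Relation rel)
+{
+ BTCycleId cycleid;
+
+ /* Make sure the shared-memory slot is released even on elog(FATAL) */
+ PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
+ {
+ cycleid = _bt_start_vacuum(rel);
+
+ /* ... scan the index here, stamping split pages with cycleid ... */
+ }
+ PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
+ _bt_end_vacuum(rel);
+}
+#endif /* NOT_USED */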
+
+/*
+ * _bt_end_vacuum --- mark a btree VACUUM operation as done
+ *
+ * Note: this is deliberately coded not to complain if no entry is found;
+ * this allows the caller to put PG_TRY around the start_vacuum operation.
+ */
+void
+_bt_end_vacuum(Relation rel)
+{
+ int i;
+
+ LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
+
+ /* Find the array entry */
+ for (i = 0; i < btvacinfo->num_vacuums; i++)
+ {
+ BTOneVacInfo *vac = &btvacinfo->vacuums[i];
+
+ if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
+ vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
+ {
+ /* Remove it by shifting down the last entry */
+ *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
+ btvacinfo->num_vacuums--;
+ break;
+ }
+ }
+
+ LWLockRelease(BtreeVacuumLock);
+}
+
+/*
+ * _bt_end_vacuum wrapped as an on_shmem_exit callback function
+ */
+void
+_bt_end_vacuum_callback(int code, Datum arg)
+{
+ _bt_end_vacuum((Relation) DatumGetPointer(arg));
+}
+
+/*
+ * BTreeShmemSize --- report amount of shared memory space needed
+ */
+Size
+BTreeShmemSize(void)
+{
+ Size size;
+
+ size = offsetof(BTVacInfo, vacuums);
+ size = add_size(size, mul_size(MaxBackends, sizeof(BTOneVacInfo)));
+ return size;
+}
+
+/*
+ * BTreeShmemInit --- initialize this module's shared memory
+ */
+void
+BTreeShmemInit(void)
+{
+ bool found;
+
+ btvacinfo = (BTVacInfo *) ShmemInitStruct("BTree Vacuum State",
+ BTreeShmemSize(),
+ &found);
+
+ if (!IsUnderPostmaster)
+ {
+ /* Initialize shared memory area */
+ Assert(!found);
+
+ /*
+ * It doesn't really matter what the cycle counter starts at, but
+ * having it always start the same doesn't seem good. Seed with
+ * low-order bits of time() instead.
+ */
+ btvacinfo->cycle_ctr = (BTCycleId) time(NULL);
+
+ btvacinfo->num_vacuums = 0;
+ btvacinfo->max_vacuums = MaxBackends;
+ }
+ else
+ Assert(found);
+}
+
+bytea *
+btoptions(Datum reloptions, bool validate)
+{
+ static const relopt_parse_elt tab[] = {
+ {"fillfactor", RELOPT_TYPE_INT, offsetof(BTOptions, fillfactor)},
+ {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL,
+ offsetof(BTOptions, vacuum_cleanup_index_scale_factor)},
+ {"deduplicate_items", RELOPT_TYPE_BOOL,
+ offsetof(BTOptions, deduplicate_items)}
+ };
+
+ return (bytea *) build_reloptions(reloptions, validate,
+ RELOPT_KIND_BTREE,
+ sizeof(BTOptions),
+ tab, lengthof(tab));
+}
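+
+/*
+ * For example, "CREATE INDEX ... WITH (fillfactor = 90, deduplicate_items =
+ * off)" reaches this routine as a reloptions Datum; build_reloptions()
+ * validates the option names against tab[] and fills the corresponding
+ * BTOptions fields, leaving any unmentioned options (here,
+ * vacuum_cleanup_index_scale_factor) at their defaults.
+ */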
+
+/*
+ * btproperty() -- Check boolean properties of indexes.
+ *
+ * This is optional, but handling AMPROP_RETURNABLE here saves opening the rel
+ * to call btcanreturn.
+ */
+bool
+btproperty(Oid index_oid, int attno,
+ IndexAMProperty prop, const char *propname,
+ bool *res, bool *isnull)
+{
+ switch (prop)
+ {
+ case AMPROP_RETURNABLE:
+ /* answer only for columns, not AM or whole index */
+ if (attno == 0)
+ return false;
+ /* otherwise, btree can always return data */
+ *res = true;
+ return true;
+
+ default:
+ return false; /* punt to generic code */
+ }
+}
+
+/*
+ * btbuildphasename() -- Return name of index build phase.
+ */
+char *
+btbuildphasename(int64 phasenum)
+{
+ switch (phasenum)
+ {
+ case PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE:
+ return "initializing";
+ case PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN:
+ return "scanning table";
+ case PROGRESS_BTREE_PHASE_PERFORMSORT_1:
+ return "sorting live tuples";
+ case PROGRESS_BTREE_PHASE_PERFORMSORT_2:
+ return "sorting dead tuples";
+ case PROGRESS_BTREE_PHASE_LEAF_LOAD:
+ return "loading tuples in tree";
+ default:
+ return NULL;
+ }
+}
+
+/*
+ * _bt_truncate() -- create tuple without unneeded suffix attributes.
+ *
+ * Returns truncated pivot index tuple allocated in caller's memory context,
+ * with key attributes copied from caller's firstright argument. If rel is
+ * an INCLUDE index, non-key attributes will definitely be truncated away,
+ * since they're not part of the key space. More aggressive suffix
+ * truncation can take place when it's clear that the returned tuple does not
+ * need one or more suffix key attributes. We only need to keep firstright
+ * attributes up to and including the first non-lastleft-equal attribute.
+ * Caller's insertion scankey is used to compare the tuples; the scankey's
+ * argument values are not considered here.
+ *
+ * Note that returned tuple's t_tid offset will hold the number of attributes
+ * present, so the original item pointer offset is not represented. Caller
+ * should only change truncated tuple's downlink. Note also that truncated
+ * key attributes are treated as containing "minus infinity" values by
+ * _bt_compare().
+ *
+ * In the worst case (when a heap TID must be appended to distinguish lastleft
+ * from firstright), the size of the returned tuple is the size of firstright
+ * plus the size of an additional MAXALIGN()'d item pointer. This guarantee
+ * is important, since callers need to stay under the 1/3 of a page
+ * restriction on tuple size. If this routine is ever taught to truncate
+ * within an attribute/datum, it will need to avoid returning an enlarged
+ * tuple to caller when truncation + TOAST compression ends up enlarging the
+ * final datum.
+ */
+IndexTuple
+_bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
+ BTScanInsert itup_key)
+{
+ TupleDesc itupdesc = RelationGetDescr(rel);
+ int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+ int keepnatts;
+ IndexTuple pivot;
+ IndexTuple tidpivot;
+ ItemPointer pivotheaptid;
+ Size newsize;
+
+ /*
+ * We should only ever truncate non-pivot tuples from leaf pages. It's
+ * never okay to truncate when splitting an internal page.
+ */
+ Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright));
+
+ /* Determine how many attributes must be kept in truncated tuple */
+ keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key);
+
+#ifdef DEBUG_NO_TRUNCATE
+ /* Force truncation to be ineffective for testing purposes */
+ keepnatts = nkeyatts + 1;
+#endif
+
+ pivot = index_truncate_tuple(itupdesc, firstright,
+ Min(keepnatts, nkeyatts));
+
+ if (BTreeTupleIsPosting(pivot))
+ {
+ /*
+ * index_truncate_tuple() just returns a straight copy of firstright
+ * when it has no attributes to truncate. When that happens, we may
+ * need to truncate away a posting list here instead.
+ */
+ Assert(keepnatts == nkeyatts || keepnatts == nkeyatts + 1);
+ Assert(IndexRelationGetNumberOfAttributes(rel) == nkeyatts);
+ pivot->t_info &= ~INDEX_SIZE_MASK;
+ pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright));
+ }
+
+ /*
+ * If there is a distinguishing key attribute within pivot tuple, we're
+ * done
+ */
+ if (keepnatts <= nkeyatts)
+ {
+ BTreeTupleSetNAtts(pivot, keepnatts, false);
+ return pivot;
+ }
+
+ /*
+ * We have to store a heap TID in the new pivot tuple, since no non-TID
+ * key attribute value in firstright distinguishes the right side of the
+ * split from the left side. nbtree conceptualizes this case as an
+ * inability to truncate away any key attributes, since heap TID is
+ * treated as just another key attribute (despite lacking a pg_attribute
+ * entry).
+ *
+ * Use enlarged space that holds a copy of pivot. We need the extra space
+ * to store a heap TID at the end (using the special pivot tuple
+ * representation). Note that the original pivot already has firstright's
+ * possible posting list/non-key attribute values removed at this point.
+ */
+ newsize = MAXALIGN(IndexTupleSize(pivot)) + MAXALIGN(sizeof(ItemPointerData));
+ tidpivot = palloc0(newsize);
+ memcpy(tidpivot, pivot, MAXALIGN(IndexTupleSize(pivot)));
+ /* Cannot leak memory here */
+ pfree(pivot);
+
+ /*
+ * Store all of firstright's key attribute values plus a tiebreaker heap
+ * TID value in enlarged pivot tuple
+ */
+ tidpivot->t_info &= ~INDEX_SIZE_MASK;
+ tidpivot->t_info |= newsize;
+ BTreeTupleSetNAtts(tidpivot, nkeyatts, true);
+ pivotheaptid = BTreeTupleGetHeapTID(tidpivot);
+
+ /*
+ * Lehman & Yao use lastleft as the leaf high key in all cases, but don't
+ * consider suffix truncation. It seems like a good idea to follow that
+ * example in cases where no truncation takes place -- use lastleft's heap
+ * TID. (This is also the closest value to negative infinity that's
+ * legally usable.)
+ */
+ ItemPointerCopy(BTreeTupleGetMaxHeapTID(lastleft), pivotheaptid);
+
+ /*
+ * We're done. Assert() that heap TID invariants hold before returning.
+ *
+ * Lehman and Yao require that the downlink to the right page, which is to
+ * be inserted into the parent page in the second phase of a page split, be
+ * a strict lower bound on items on the right page, and a non-strict upper
+ * bound for items on the left page. Assert that heap TIDs follow these
+ * invariants, since a heap TID value is apparently needed as a
+ * tiebreaker.
+ */
+#ifndef DEBUG_NO_TRUNCATE
+ Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(lastleft),
+ BTreeTupleGetHeapTID(firstright)) < 0);
+ Assert(ItemPointerCompare(pivotheaptid,
+ BTreeTupleGetHeapTID(lastleft)) >= 0);
+ Assert(ItemPointerCompare(pivotheaptid,
+ BTreeTupleGetHeapTID(firstright)) < 0);
+#else
+
+ /*
+ * Those invariants aren't guaranteed to hold for lastleft + firstright
+ * heap TID attribute values when they're considered here only because
+ * DEBUG_NO_TRUNCATE is defined (a heap TID is probably not actually
+ * needed as a tiebreaker). DEBUG_NO_TRUNCATE must therefore use a heap
+ * TID value that always works as a strict lower bound for items to the
+ * right. In particular, it must avoid using firstright's leading key
+ * attribute values along with lastleft's heap TID value when lastleft's
+ * TID happens to be greater than firstright's TID.
+ */
+ ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid);
+
+ /*
+ * Pivot heap TID should never be fully equal to firstright. Note that
+ * the pivot heap TID will still end up equal to lastleft's heap TID when
+ * that's the only usable value.
+ */
+ ItemPointerSetOffsetNumber(pivotheaptid,
+ OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid)));
+ Assert(ItemPointerCompare(pivotheaptid,
+ BTreeTupleGetHeapTID(firstright)) < 0);
+#endif
+
+ return tidpivot;
+}
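+
+/*
+ * Worked example (hypothetical tuples): on a three-key-column index with
+ * lastleft = (1, 5, 'foo') and firstright = (1, 7, 'bar'), the second
+ * attribute already distinguishes the tuples, so _bt_keep_natts() returns 2
+ * and the new pivot is firstright truncated to (1, 7); the truncated third
+ * attribute behaves as "minus infinity" in later _bt_compare() calls. If
+ * every key attribute were equal, keepnatts would come back as nkeyatts + 1
+ * and lastleft's (maximum) heap TID would be appended as the tiebreaker, as
+ * done above.
+ */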
+
+/*
+ * _bt_keep_natts - how many key attributes to keep when truncating.
+ *
+ * Caller provides two tuples that enclose a split point. Caller's insertion
+ * scankey is used to compare the tuples; the scankey's argument values are
+ * not considered here.
+ *
+ * This can return a number of attributes that is one greater than the
+ * number of key attributes for the index relation. This indicates that the
+ * caller must use a heap TID as a unique-ifier in new pivot tuple.
+ */
+static int
+_bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
+ BTScanInsert itup_key)
+{
+ int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+ TupleDesc itupdesc = RelationGetDescr(rel);
+ int keepnatts;
+ ScanKey scankey;
+
+ /*
+ * _bt_compare() treats truncated key attributes as having the value minus
+ * infinity, which would break searches within !heapkeyspace indexes. We
+ * must still truncate away non-key attribute values, though.
+ */
+ if (!itup_key->heapkeyspace)
+ return nkeyatts;
+
+ scankey = itup_key->scankeys;
+ keepnatts = 1;
+ for (int attnum = 1; attnum <= nkeyatts; attnum++, scankey++)
+ {
+ Datum datum1,
+ datum2;
+ bool isNull1,
+ isNull2;
+
+ datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
+ datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
+
+ if (isNull1 != isNull2)
+ break;
+
+ if (!isNull1 &&
+ DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
+ scankey->sk_collation,
+ datum1,
+ datum2)) != 0)
+ break;
+
+ keepnatts++;
+ }
+
+ /*
+ * Assert that _bt_keep_natts_fast() agrees with us in passing. This is
+ * expected in an allequalimage index.
+ */
+ Assert(!itup_key->allequalimage ||
+ keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright));
+
+ return keepnatts;
+}
+
+/*
+ * _bt_keep_natts_fast - fast bitwise variant of _bt_keep_natts.
+ *
+ * This is exported so that a candidate split point can have its effect on
+ * suffix truncation inexpensively evaluated ahead of time when finding a
+ * split location. A naive bitwise approach to datum comparisons is used to
+ * save cycles.
+ *
+ * The approach taken here usually provides the same answer as _bt_keep_natts
+ * will (for the same pair of tuples from a heapkeyspace index), since the
+ * majority of btree opclasses can never indicate that two datums are equal
+ * unless they're bitwise equal after detoasting. When an index only has
+ * "equal image" columns, routine is guaranteed to give the same result as
+ * _bt_keep_natts would.
+ *
+ * Callers can rely on the fact that attributes considered equal here are
+ * definitely also equal according to _bt_keep_natts, even when the index uses
+ * an opclass or collation that is not "allequalimage"/deduplication-safe.
+ * This weaker guarantee is good enough for the nbtsplitloc.c caller, since false
+ * negatives generally only have the effect of making leaf page splits use a
+ * more balanced split point.
+ */
+int
+_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
+{
+ TupleDesc itupdesc = RelationGetDescr(rel);
+ int keysz = IndexRelationGetNumberOfKeyAttributes(rel);
+ int keepnatts;
+
+ keepnatts = 1;
+ for (int attnum = 1; attnum <= keysz; attnum++)
+ {
+ Datum datum1,
+ datum2;
+ bool isNull1,
+ isNull2;
+ Form_pg_attribute att;
+
+ datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
+ datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
+ att = TupleDescAttr(itupdesc, attnum - 1);
+
+ if (isNull1 != isNull2)
+ break;
+
+ if (!isNull1 &&
+ !datum_image_eq(datum1, datum2, att->attbyval, att->attlen))
+ break;
+
+ keepnatts++;
+ }
+
+ return keepnatts;
+}
+
+/*
+ * _bt_check_natts() -- Verify tuple has expected number of attributes.
+ *
+ * Returns value indicating if the expected number of attributes were found
+ * for a particular offset on page. This can be used as a general purpose
+ * sanity check.
+ *
+ * Testing a tuple directly with BTreeTupleGetNAtts() should generally be
+ * preferred to calling here. That's usually more convenient, and is always
+ * more explicit. Call here instead when offnum's tuple may be a negative
+ * infinity tuple that uses the pre-v11 on-disk representation, or when a
+ * low-context check is appropriate. This routine is as strict as possible about
+ * what is expected on each version of btree.
+ */
+bool
+_bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
+{
+ int16 natts = IndexRelationGetNumberOfAttributes(rel);
+ int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+ BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ IndexTuple itup;
+ int tupnatts;
+
+ /*
+ * We cannot reliably test a deleted or half-dead page, since they have
+ * dummy high keys
+ */
+ if (P_IGNORE(opaque))
+ return true;
+
+ Assert(offnum >= FirstOffsetNumber &&
+ offnum <= PageGetMaxOffsetNumber(page));
+
+ /*
+ * Mask allocated for number of keys in index tuple must be able to fit
+ * maximum possible number of index attributes
+ */
+ StaticAssertStmt(BT_OFFSET_MASK >= INDEX_MAX_KEYS,
+ "BT_OFFSET_MASK can't fit INDEX_MAX_KEYS");
+
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+ tupnatts = BTreeTupleGetNAtts(itup, rel);
+
+ /* !heapkeyspace indexes do not support deduplication */
+ if (!heapkeyspace && BTreeTupleIsPosting(itup))
+ return false;
+
+ /* Posting list tuples should never have "pivot heap TID" bit set */
+ if (BTreeTupleIsPosting(itup) &&
+ (ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
+ BT_PIVOT_HEAP_TID_ATTR) != 0)
+ return false;
+
+ /* INCLUDE indexes do not support deduplication */
+ if (natts != nkeyatts && BTreeTupleIsPosting(itup))
+ return false;
+
+ if (P_ISLEAF(opaque))
+ {
+ if (offnum >= P_FIRSTDATAKEY(opaque))
+ {
+ /*
+ * Non-pivot tuple should never be explicitly marked as a pivot
+ * tuple
+ */
+ if (BTreeTupleIsPivot(itup))
+ return false;
+
+ /*
+ * Leaf tuples that are not the page high key (non-pivot tuples)
+ * should never be truncated. (Note that tupnatts must have been
+ * inferred, even with a posting list tuple, because only pivot
+ * tuples store tupnatts directly.)
+ */
+ return tupnatts == natts;
+ }
+ else
+ {
+ /*
+ * Rightmost page doesn't contain a page high key, so tuple was
+ * checked above as ordinary leaf tuple
+ */
+ Assert(!P_RIGHTMOST(opaque));
+
+ /*
+ * !heapkeyspace high key tuple contains only key attributes. Note
+ * that tupnatts will only have been explicitly represented in
+ * !heapkeyspace indexes that happen to have non-key attributes.
+ */
+ if (!heapkeyspace)
+ return tupnatts == nkeyatts;
+
+ /* Use generic heapkeyspace pivot tuple handling */
+ }
+ }
+ else /* !P_ISLEAF(opaque) */
+ {
+ if (offnum == P_FIRSTDATAKEY(opaque))
+ {
+ /*
+ * The first tuple on any internal page (possibly the first after
+ * its high key) is its negative infinity tuple. Negative
+ * infinity tuples are always truncated to zero attributes. They
+ * are a particular kind of pivot tuple.
+ */
+ if (heapkeyspace)
+ return tupnatts == 0;
+
+ /*
+ * The number of attributes won't be explicitly represented if the
+ * negative infinity tuple was generated during a page split that
+ * occurred with a version of Postgres before v11. There must be
+ * a problem when there is an explicit representation that is
+ * non-zero, or when there is no explicit representation and the
+ * tuple is evidently not a pre-pg_upgrade tuple.
+ *
+ * Prior to v11, downlinks always had P_HIKEY as their offset.
+ * Accept that as an alternative indication of a valid
+ * !heapkeyspace negative infinity tuple.
+ */
+ return tupnatts == 0 ||
+ ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY;
+ }
+ else
+ {
+ /*
+ * !heapkeyspace downlink tuple with separator key contains only
+ * key attributes. Note that tupnatts will only have been
+ * explicitly represented in !heapkeyspace indexes that happen to
+ * have non-key attributes.
+ */
+ if (!heapkeyspace)
+ return tupnatts == nkeyatts;
+
+ /* Use generic heapkeyspace pivot tuple handling */
+ }
+
+ }
+
+ /* Handle heapkeyspace pivot tuples (excluding minus infinity items) */
+ Assert(heapkeyspace);
+
+ /*
+ * Explicit representation of the number of attributes is mandatory with
+ * heapkeyspace index pivot tuples, regardless of whether or not there are
+ * non-key attributes.
+ */
+ if (!BTreeTupleIsPivot(itup))
+ return false;
+
+ /* Pivot tuple should not use posting list representation (redundant) */
+ if (BTreeTupleIsPosting(itup))
+ return false;
+
+ /*
+ * Heap TID is a tiebreaker key attribute, so it cannot be untruncated
+ * when any other key attribute is truncated
+ */
+ if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts)
+ return false;
+
+ /*
+ * Pivot tuple must have at least one untruncated key attribute (minus
+ * infinity pivot tuples are the only exception). Pivot tuples can never
+ * represent that there is a value present for a key attribute that
+ * exceeds pg_index.indnkeyatts for the index.
+ */
+ return tupnatts > 0 && tupnatts <= nkeyatts;
+}
+
+/*
+ * _bt_check_third_page() -- check whether tuple fits on a btree page at all.
+ *
+ * We actually need to be able to fit three items on every page, so restrict
+ * any one item to 1/3 the per-page available space. Note that itemsz should
+ * not include the ItemId overhead.
+ *
+ * It might be useful to apply TOAST methods rather than throw an error here.
+ * Using out of line storage would break assumptions made by suffix truncation
+ * and by contrib/amcheck, though.
+ */
+void
+_bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace,
+ Page page, IndexTuple newtup)
+{
+ Size itemsz;
+ BTPageOpaque opaque;
+
+ itemsz = MAXALIGN(IndexTupleSize(newtup));
+
+ /* Double check item size against limit */
+ if (itemsz <= BTMaxItemSize(page))
+ return;
+
+ /*
+ * Tuple is probably too large to fit on page, but it's possible that the
+ * index uses version 2 or version 3, or that page is an internal page, in
+ * which case a slightly higher limit applies.
+ */
+ if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid(page))
+ return;
+
+ /*
+ * Internal page insertions cannot fail here, because that would mean that
+ * an earlier leaf level insertion that should have failed didn't
+ */
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (!P_ISLEAF(opaque))
+ elog(ERROR, "cannot insert oversized tuple of size %zu on internal page of index \"%s\"",
+ itemsz, RelationGetRelationName(rel));
+
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"",
+ itemsz,
+ needheaptidspace ? BTREE_VERSION : BTREE_NOVAC_VERSION,
+ needheaptidspace ? BTMaxItemSize(page) :
+ BTMaxItemSizeNoHeapTid(page),
+ RelationGetRelationName(rel)),
+ errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
+ ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)),
+ ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)),
+ RelationGetRelationName(heap)),
+ errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
+ "Consider a function index of an MD5 hash of the value, "
+ "or use full text indexing."),
+ errtableconstraint(heap, RelationGetRelationName(rel))));
+}
+
+/*
+ * Are all attributes in rel "equality is image equality" attributes?
+ *
+ * We use each attribute's BTEQUALIMAGE_PROC opclass procedure. If any
+ * opclass either lacks a BTEQUALIMAGE_PROC procedure or returns false, we
+ * return false; otherwise we return true.
+ *
+ * Returned boolean value is stored in index metapage during index builds.
+ * Deduplication can only be used when we return true.
+ */
+bool
+_bt_allequalimage(Relation rel, bool debugmessage)
+{
+ bool allequalimage = true;
+
+ /* INCLUDE indexes don't support deduplication */
+ if (IndexRelationGetNumberOfAttributes(rel) !=
+ IndexRelationGetNumberOfKeyAttributes(rel))
+ return false;
+
+ /*
+ * There is no special reason why deduplication cannot work with system
+ * relations (i.e. with system catalog indexes and TOAST indexes). We
+ * deem deduplication unsafe for these indexes all the same, since the
+ * alternative is to force users to always use deduplication, without
+ * being able to opt out. (ALTER INDEX is not supported with system
+ * indexes, so users would have no way to set the deduplicate_items
+ * storage parameter to 'off'.)
+ */
+ if (IsSystemRelation(rel))
+ return false;
+
+ for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(rel); i++)
+ {
+ Oid opfamily = rel->rd_opfamily[i];
+ Oid opcintype = rel->rd_opcintype[i];
+ Oid collation = rel->rd_indcollation[i];
+ Oid equalimageproc;
+
+ equalimageproc = get_opfamily_proc(opfamily, opcintype, opcintype,
+ BTEQUALIMAGE_PROC);
+
+ /*
+ * If there is no BTEQUALIMAGE_PROC then deduplication is assumed to
+ * be unsafe. Otherwise, actually call proc and see what it says.
+ */
+ if (!OidIsValid(equalimageproc) ||
+ !DatumGetBool(OidFunctionCall1Coll(equalimageproc, collation,
+ ObjectIdGetDatum(opcintype))))
+ {
+ allequalimage = false;
+ break;
+ }
+ }
+
+ /*
+ * Don't elog() until here to avoid reporting on a system relation index
+ * or an INCLUDE index
+ */
+ if (debugmessage)
+ {
+ if (allequalimage)
+ elog(DEBUG1, "index \"%s\" can safely use deduplication",
+ RelationGetRelationName(rel));
+ else
+ elog(DEBUG1, "index \"%s\" cannot use deduplication",
+ RelationGetRelationName(rel));
+ }
+
+ return allequalimage;
+}
diff --git a/src/backend/access/nbtree/nbtvalidate.c b/src/backend/access/nbtree/nbtvalidate.c
new file mode 100644
index 0000000..7acb64e
--- /dev/null
+++ b/src/backend/access/nbtree/nbtvalidate.c
@@ -0,0 +1,380 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtvalidate.c
+ * Opclass validator for btree.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtvalidate.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/amvalidate.h"
+#include "access/htup_details.h"
+#include "access/nbtree.h"
+#include "access/xact.h"
+#include "catalog/pg_am.h"
+#include "catalog/pg_amop.h"
+#include "catalog/pg_amproc.h"
+#include "catalog/pg_opclass.h"
+#include "catalog/pg_opfamily.h"
+#include "catalog/pg_type.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+#include "utils/regproc.h"
+#include "utils/syscache.h"
+
+
+/*
+ * Validator for a btree opclass.
+ *
+ * Some of the checks done here cover the whole opfamily, and therefore are
+ * redundant when checking each opclass in a family. But they don't run long
+ * enough to be much of a problem, so we accept the duplication rather than
+ * complicate the amvalidate API.
+ */
+bool
+btvalidate(Oid opclassoid)
+{
+ bool result = true;
+ HeapTuple classtup;
+ Form_pg_opclass classform;
+ Oid opfamilyoid;
+ Oid opcintype;
+ char *opclassname;
+ HeapTuple familytup;
+ Form_pg_opfamily familyform;
+ char *opfamilyname;
+ CatCList *proclist,
+ *oprlist;
+ List *grouplist;
+ OpFamilyOpFuncGroup *opclassgroup;
+ List *familytypes;
+ int usefulgroups;
+ int i;
+ ListCell *lc;
+
+ /* Fetch opclass information */
+ classtup = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclassoid));
+ if (!HeapTupleIsValid(classtup))
+ elog(ERROR, "cache lookup failed for operator class %u", opclassoid);
+ classform = (Form_pg_opclass) GETSTRUCT(classtup);
+
+ opfamilyoid = classform->opcfamily;
+ opcintype = classform->opcintype;
+ opclassname = NameStr(classform->opcname);
+
+ /* Fetch opfamily information */
+ familytup = SearchSysCache1(OPFAMILYOID, ObjectIdGetDatum(opfamilyoid));
+ if (!HeapTupleIsValid(familytup))
+ elog(ERROR, "cache lookup failed for operator family %u", opfamilyoid);
+ familyform = (Form_pg_opfamily) GETSTRUCT(familytup);
+
+ opfamilyname = NameStr(familyform->opfname);
+
+ /* Fetch all operators and support functions of the opfamily */
+ oprlist = SearchSysCacheList1(AMOPSTRATEGY, ObjectIdGetDatum(opfamilyoid));
+ proclist = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(opfamilyoid));
+
+ /* Check individual support functions */
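+ /*
+ * Only the three-way comparator (BTORDER_PROC) is strictly required for
+ * a btree opfamily; the other support functions whose signatures are
+ * validated here are optional, as the completeness checks further down
+ * reflect.
+ */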
+ for (i = 0; i < proclist->n_members; i++)
+ {
+ HeapTuple proctup = &proclist->members[i]->tuple;
+ Form_pg_amproc procform = (Form_pg_amproc) GETSTRUCT(proctup);
+ bool ok;
+
+ /* Check procedure numbers and function signatures */
+ switch (procform->amprocnum)
+ {
+ case BTORDER_PROC:
+ ok = check_amproc_signature(procform->amproc, INT4OID, true,
+ 2, 2, procform->amproclefttype,
+ procform->amprocrighttype);
+ break;
+ case BTSORTSUPPORT_PROC:
+ ok = check_amproc_signature(procform->amproc, VOIDOID, true,
+ 1, 1, INTERNALOID);
+ break;
+ case BTINRANGE_PROC:
+ ok = check_amproc_signature(procform->amproc, BOOLOID, true,
+ 5, 5,
+ procform->amproclefttype,
+ procform->amproclefttype,
+ procform->amprocrighttype,
+ BOOLOID, BOOLOID);
+ break;
+ case BTEQUALIMAGE_PROC:
+ ok = check_amproc_signature(procform->amproc, BOOLOID, true,
+ 1, 1, OIDOID);
+ break;
+ case BTOPTIONS_PROC:
+ ok = check_amoptsproc_signature(procform->amproc);
+ break;
+ default:
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains function %s with invalid support number %d",
+ opfamilyname, "btree",
+ format_procedure(procform->amproc),
+ procform->amprocnum)));
+ result = false;
+ continue; /* don't want additional message */
+ }
+
+ if (!ok)
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains function %s with wrong signature for support number %d",
+ opfamilyname, "btree",
+ format_procedure(procform->amproc),
+ procform->amprocnum)));
+ result = false;
+ }
+ }
+
+ /* Check individual operators */
+ for (i = 0; i < oprlist->n_members; i++)
+ {
+ HeapTuple oprtup = &oprlist->members[i]->tuple;
+ Form_pg_amop oprform = (Form_pg_amop) GETSTRUCT(oprtup);
+
+ /* Check that only allowed strategy numbers exist */
+ if (oprform->amopstrategy < 1 ||
+ oprform->amopstrategy > BTMaxStrategyNumber)
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains operator %s with invalid strategy number %d",
+ opfamilyname, "btree",
+ format_operator(oprform->amopopr),
+ oprform->amopstrategy)));
+ result = false;
+ }
+
+ /* btree doesn't support ORDER BY operators */
+ if (oprform->amoppurpose != AMOP_SEARCH ||
+ OidIsValid(oprform->amopsortfamily))
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains invalid ORDER BY specification for operator %s",
+ opfamilyname, "btree",
+ format_operator(oprform->amopopr))));
+ result = false;
+ }
+
+ /* Check operator signature --- same for all btree strategies */
+ if (!check_amop_signature(oprform->amopopr, BOOLOID,
+ oprform->amoplefttype,
+ oprform->amoprighttype))
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s contains operator %s with wrong signature",
+ opfamilyname, "btree",
+ format_operator(oprform->amopopr))));
+ result = false;
+ }
+ }
+
+ /* Now check for inconsistent groups of operators/functions */
+ grouplist = identify_opfamily_groups(oprlist, proclist);
+ usefulgroups = 0;
+ opclassgroup = NULL;
+ familytypes = NIL;
+ foreach(lc, grouplist)
+ {
+ OpFamilyOpFuncGroup *thisgroup = (OpFamilyOpFuncGroup *) lfirst(lc);
+
+ /*
+ * It is possible for an in_range support function to have a RHS type
+ * that is otherwise irrelevant to the opfamily --- for instance, SQL
+ * requires the datetime_ops opclass to have range support with an
+ * interval offset. So, if this group appears to contain only an
+ * in_range function, ignore it: it doesn't represent a pair of
+ * supported types.
+ */
+ if (thisgroup->operatorset == 0 &&
+ thisgroup->functionset == (1 << BTINRANGE_PROC))
+ continue;
+
+ /* Else count it as a relevant group */
+ usefulgroups++;
+
+ /* Remember the group exactly matching the test opclass */
+ if (thisgroup->lefttype == opcintype &&
+ thisgroup->righttype == opcintype)
+ opclassgroup = thisgroup;
+
+ /*
+ * Identify all distinct data types handled in this opfamily. This
+ * implementation is O(N^2), but there aren't likely to be enough
+ * types in the family for it to matter.
+ */
+ familytypes = list_append_unique_oid(familytypes, thisgroup->lefttype);
+ familytypes = list_append_unique_oid(familytypes, thisgroup->righttype);
+
+ /*
+ * Complain if there seems to be an incomplete set of either operators
+ * or support functions for this datatype pair. The sortsupport,
+ * in_range, and equalimage functions are considered optional.
+ */
+ if (thisgroup->operatorset !=
+ ((1 << BTLessStrategyNumber) |
+ (1 << BTLessEqualStrategyNumber) |
+ (1 << BTEqualStrategyNumber) |
+ (1 << BTGreaterEqualStrategyNumber) |
+ (1 << BTGreaterStrategyNumber)))
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s is missing operator(s) for types %s and %s",
+ opfamilyname, "btree",
+ format_type_be(thisgroup->lefttype),
+ format_type_be(thisgroup->righttype))));
+ result = false;
+ }
+ if ((thisgroup->functionset & (1 << BTORDER_PROC)) == 0)
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s is missing support function for types %s and %s",
+ opfamilyname, "btree",
+ format_type_be(thisgroup->lefttype),
+ format_type_be(thisgroup->righttype))));
+ result = false;
+ }
+ }
+
+ /* Check that the originally-named opclass is supported */
+ /* (if group is there, we already checked it adequately above) */
+ if (!opclassgroup)
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator class \"%s\" of access method %s is missing operator(s)",
+ opclassname, "btree")));
+ result = false;
+ }
+
+ /*
+ * Complain if the opfamily doesn't have entries for all possible
+ * combinations of its supported datatypes. While missing cross-type
+ * operators are not fatal, they do limit the planner's ability to derive
+ * additional qual clauses from equivalence classes, so it seems
+ * reasonable to insist that all built-in btree opfamilies be complete.
+ */
+ if (usefulgroups != (list_length(familytypes) * list_length(familytypes)))
+ {
+ ereport(INFO,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("operator family \"%s\" of access method %s is missing cross-type operator(s)",
+ opfamilyname, "btree")));
+ result = false;
+ }
+
+ ReleaseCatCacheList(proclist);
+ ReleaseCatCacheList(oprlist);
+ ReleaseSysCache(familytup);
+ ReleaseSysCache(classtup);
+
+ return result;
+}
+
+/*
+ * Prechecking function for adding operators/functions to a btree opfamily.
+ */
+void
+btadjustmembers(Oid opfamilyoid,
+ Oid opclassoid,
+ List *operators,
+ List *functions)
+{
+ Oid opcintype;
+ ListCell *lc;
+
+ /*
+ * Btree operators and comparison support functions are always "loose"
+ * members of the opfamily if they are cross-type. If they are not
+ * cross-type, we prefer to tie them to the appropriate opclass ... but if
+ * the user hasn't created one, we can't do that, and must fall back to
+ * using the opfamily dependency. (We mustn't force creation of an
+ * opclass in such a case, as leaving an incomplete opclass lying around
+ * would be bad. Throwing an error is another undesirable alternative.)
+ *
+ * This behavior results in a bit of a dump/reload hazard, in that the
+ * order of restoring objects could affect what dependencies we end up
+ * with. pg_dump's existing behavior will preserve the dependency choices
+ * in most cases, but not if a cross-type operator has been bound tightly
+ * into an opclass. That's a mistake anyway, so silently "fixing" it
+ * isn't awful.
+ *
+ * Optional support functions are always "loose" family members.
+ *
+ * To avoid repeated lookups, we remember the most recently used opclass's
+ * input type.
+ */
+ if (OidIsValid(opclassoid))
+ {
+ /* During CREATE OPERATOR CLASS, need CCI to see the pg_opclass row */
+ CommandCounterIncrement();
+ opcintype = get_opclass_input_type(opclassoid);
+ }
+ else
+ opcintype = InvalidOid;
+
+ /*
+ * We handle operators and support functions almost identically, so rather
+ * than duplicate this code block, just join the lists.
+ */
+ foreach(lc, list_concat_copy(operators, functions))
+ {
+ OpFamilyMember *op = (OpFamilyMember *) lfirst(lc);
+
+ if (op->is_func && op->number != BTORDER_PROC)
+ {
+ /* Optional support proc, so always a soft family dependency */
+ op->ref_is_hard = false;
+ op->ref_is_family = true;
+ op->refobjid = opfamilyoid;
+ }
+ else if (op->lefttype != op->righttype)
+ {
+ /* Cross-type, so always a soft family dependency */
+ op->ref_is_hard = false;
+ op->ref_is_family = true;
+ op->refobjid = opfamilyoid;
+ }
+ else
+ {
+ /* Not cross-type; is there a suitable opclass? */
+ if (op->lefttype != opcintype)
+ {
+ /* Avoid repeating this expensive lookup, even if it fails */
+ opcintype = op->lefttype;
+ opclassoid = opclass_for_family_datatype(BTREE_AM_OID,
+ opfamilyoid,
+ opcintype);
+ }
+ if (OidIsValid(opclassoid))
+ {
+ /* Hard dependency on opclass */
+ op->ref_is_hard = true;
+ op->ref_is_family = false;
+ op->refobjid = opclassoid;
+ }
+ else
+ {
+ /* We're stuck, so make a soft dependency on the opfamily */
+ op->ref_is_hard = false;
+ op->ref_is_family = true;
+ op->refobjid = opfamilyoid;
+ }
+ }
+ }
+}
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
new file mode 100644
index 0000000..786c08c
--- /dev/null
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -0,0 +1,1126 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtxlog.c
+ * WAL replay logic for btrees.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtxlog.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/bufmask.h"
+#include "access/nbtree.h"
+#include "access/nbtxlog.h"
+#include "access/transam.h"
+#include "access/xlog.h"
+#include "access/xlogutils.h"
+#include "miscadmin.h"
+#include "storage/procarray.h"
+#include "utils/memutils.h"
+
+static MemoryContext opCtx; /* working memory for operations */
+
+/*
+ * _bt_restore_page -- re-enter all the index tuples on a page
+ *
+ * The page is freshly init'd, and *from (length len) is a copy of what
+ * had been its upper part (pd_upper to pd_special). We assume that the
+ * tuples had been added to the page in item-number order, and therefore
+ * the one with highest item number appears first (lowest on the page).
+ */
+static void
+_bt_restore_page(Page page, char *from, int len)
+{
+ IndexTupleData itupdata;
+ Size itemsz;
+ char *end = from + len;
+ Item items[MaxIndexTuplesPerPage];
+ uint16 itemsizes[MaxIndexTuplesPerPage];
+ int i;
+ int nitems;
+
+ /*
+ * To get the items back in the original order, we add them to the page in
+ * reverse. To figure out where one tuple ends and another begins, we
+ * have to scan them in forward order first.
+ */
+ i = 0;
+ while (from < end)
+ {
+ /*
+ * As we step through the items, 'from' won't always be properly
+ * aligned, so we need to use memcpy(). Further, we use Item (which
+ * is just a char*) here for our items array for the same reason;
+ * wouldn't want the compiler or anyone thinking that an item is
+ * aligned when it isn't.
+ */
+ memcpy(&itupdata, from, sizeof(IndexTupleData));
+ itemsz = IndexTupleSize(&itupdata);
+ itemsz = MAXALIGN(itemsz);
+
+ items[i] = (Item) from;
+ itemsizes[i] = itemsz;
+ i++;
+
+ from += itemsz;
+ }
+ nitems = i;
+
+ for (i = nitems - 1; i >= 0; i--)
+ {
+ if (PageAddItem(page, items[i], itemsizes[i], nitems - i,
+ false, false) == InvalidOffsetNumber)
+ elog(PANIC, "_bt_restore_page: cannot add item to page");
+ }
+}
+
+static void
+_bt_restore_meta(XLogReaderState *record, uint8 block_id)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ Buffer metabuf;
+ Page metapg;
+ BTMetaPageData *md;
+ BTPageOpaque pageop;
+ xl_btree_metadata *xlrec;
+ char *ptr;
+ Size len;
+
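+ /*
+ * The metapage is rebuilt from scratch: reinitialize the buffer for
+ * block_id and fill in the fields from the xl_btree_metadata struct
+ * carried as that block's data payload.
+ */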
+ metabuf = XLogInitBufferForRedo(record, block_id);
+ ptr = XLogRecGetBlockData(record, block_id, &len);
+
+ Assert(len == sizeof(xl_btree_metadata));
+ Assert(BufferGetBlockNumber(metabuf) == BTREE_METAPAGE);
+ xlrec = (xl_btree_metadata *) ptr;
+ metapg = BufferGetPage(metabuf);
+
+ _bt_pageinit(metapg, BufferGetPageSize(metabuf));
+
+ md = BTPageGetMeta(metapg);
+ md->btm_magic = BTREE_MAGIC;
+ md->btm_version = xlrec->version;
+ md->btm_root = xlrec->root;
+ md->btm_level = xlrec->level;
+ md->btm_fastroot = xlrec->fastroot;
+ md->btm_fastlevel = xlrec->fastlevel;
+ /* Cannot log BTREE_MIN_VERSION index metapage without upgrade */
+ Assert(md->btm_version >= BTREE_NOVAC_VERSION);
+ md->btm_last_cleanup_num_delpages = xlrec->last_cleanup_num_delpages;
+ md->btm_last_cleanup_num_heap_tuples = -1.0;
+ md->btm_allequalimage = xlrec->allequalimage;
+
+ pageop = (BTPageOpaque) PageGetSpecialPointer(metapg);
+ pageop->btpo_flags = BTP_META;
+
+ /*
+ * Set pd_lower just past the end of the metadata. This is essential,
+ * because without doing so, metadata will be lost if xlog.c compresses
+ * the page.
+ */
+ ((PageHeader) metapg)->pd_lower =
+ ((char *) md + sizeof(BTMetaPageData)) - (char *) metapg;
+
+ PageSetLSN(metapg, lsn);
+ MarkBufferDirty(metabuf);
+ UnlockReleaseBuffer(metabuf);
+}
+
+/*
+ * _bt_clear_incomplete_split -- clear INCOMPLETE_SPLIT flag on a page
+ *
+ * This is a common subroutine of the redo functions of all the WAL record
+ * types that can insert a downlink: insert, split, and newroot.
+ */
+static void
+_bt_clear_incomplete_split(XLogReaderState *record, uint8 block_id)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ Buffer buf;
+
+ if (XLogReadBufferForRedo(record, block_id, &buf) == BLK_NEEDS_REDO)
+ {
+ Page page = (Page) BufferGetPage(buf);
+ BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ Assert(P_INCOMPLETE_SPLIT(pageop));
+ pageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buf);
+ }
+ if (BufferIsValid(buf))
+ UnlockReleaseBuffer(buf);
+}
+
+static void
+btree_xlog_insert(bool isleaf, bool ismeta, bool posting,
+ XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record);
+ Buffer buffer;
+ Page page;
+
+ /*
+ * Insertion to an internal page finishes an incomplete split at the child
+ * level. Clear the incomplete-split flag in the child. Note: during
+ * normal operation, the child and parent pages are locked at the same
+ * time (the locks are coupled), so that clearing the flag and inserting
+ * the downlink appear atomic to other backends. We don't bother with
+ * that during replay, because readers don't care about the
+ * incomplete-split flag and there cannot be updates happening.
+ */
+ if (!isleaf)
+ _bt_clear_incomplete_split(record, 1);
+ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+ {
+ Size datalen;
+ char *datapos = XLogRecGetBlockData(record, 0, &datalen);
+
+ page = BufferGetPage(buffer);
+
+ if (!posting)
+ {
+ /* Simple retail insertion */
+ if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum,
+ false, false) == InvalidOffsetNumber)
+ elog(PANIC, "failed to add new item");
+ }
+ else
+ {
+ ItemId itemid;
+ IndexTuple oposting,
+ newitem,
+ nposting;
+ uint16 postingoff;
+
+ /*
+ * A posting list split occurred during leaf page insertion. WAL
+ * record data will start with an offset number representing the
+ * point in an existing posting list that a split occurs at.
+ *
+ * Use _bt_swap_posting() to repeat posting list split steps from
+ * primary. Note that newitem from WAL record is 'orignewitem',
+ * not the final version of newitem that is actually inserted on
+ * page.
+ */
+ postingoff = *((uint16 *) datapos);
+ datapos += sizeof(uint16);
+ datalen -= sizeof(uint16);
+
+ itemid = PageGetItemId(page, OffsetNumberPrev(xlrec->offnum));
+ oposting = (IndexTuple) PageGetItem(page, itemid);
+
+ /* Use mutable, aligned newitem copy in _bt_swap_posting() */
+ Assert(isleaf && postingoff > 0);
+ newitem = CopyIndexTuple((IndexTuple) datapos);
+ nposting = _bt_swap_posting(newitem, oposting, postingoff);
+
+ /* Replace existing posting list with post-split version */
+ memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
+
+ /* Insert "final" new item (not orignewitem from WAL stream) */
+ Assert(IndexTupleSize(newitem) == datalen);
+ if (PageAddItem(page, (Item) newitem, datalen, xlrec->offnum,
+ false, false) == InvalidOffsetNumber)
+ elog(PANIC, "failed to add posting split new item");
+ }
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+
+ /*
+ * Note: in normal operation, we'd update the metapage while still holding
+ * lock on the page we inserted into. But during replay it's not
+ * necessary to hold that lock, since no other index updates can be
+ * happening concurrently, and readers will cope fine with following an
+ * obsolete link from the metapage.
+ */
+ if (ismeta)
+ _bt_restore_meta(record, 2);
+}
+
+static void
+btree_xlog_split(bool newitemonleft, XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
+ bool isleaf = (xlrec->level == 0);
+ Buffer buf;
+ Buffer rbuf;
+ Page rpage;
+ BTPageOpaque ropaque;
+ char *datapos;
+ Size datalen;
+ BlockNumber origpagenumber;
+ BlockNumber rightpagenumber;
+ BlockNumber spagenumber;
+
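+ /*
+ * Block 0 is the original page (the left half after the split), block 1
+ * is the new right sibling, and block 2, when present, is the page that
+ * was to the right of the original page before the split. Block 2 is
+ * absent when the original page was the rightmost page on its level.
+ */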
+ XLogRecGetBlockTag(record, 0, NULL, NULL, &origpagenumber);
+ XLogRecGetBlockTag(record, 1, NULL, NULL, &rightpagenumber);
+ if (!XLogRecGetBlockTag(record, 2, NULL, NULL, &spagenumber))
+ spagenumber = P_NONE;
+
+ /*
+ * Clear the incomplete split flag on the appropriate child page one level
+ * down when origpage/buf is an internal page (there must have been
+ * cascading page splits during original execution in the event of an
+ * internal page split). This is like the corresponding btree_xlog_insert
+ * call for internal pages. We're not clearing the incomplete split flag
+ * for the current page split here (you can think of this as part of the
+ * insert of newitem that the page split action needs to perform in
+ * passing).
+ *
+ * Like in btree_xlog_insert, this can be done before locking other pages.
+ * We never need to couple cross-level locks in REDO routines.
+ */
+ if (!isleaf)
+ _bt_clear_incomplete_split(record, 3);
+
+ /* Reconstruct right (new) sibling page from scratch */
+ rbuf = XLogInitBufferForRedo(record, 1);
+ datapos = XLogRecGetBlockData(record, 1, &datalen);
+ rpage = (Page) BufferGetPage(rbuf);
+
+ _bt_pageinit(rpage, BufferGetPageSize(rbuf));
+ ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);
+
+ ropaque->btpo_prev = origpagenumber;
+ ropaque->btpo_next = spagenumber;
+ ropaque->btpo_level = xlrec->level;
+ ropaque->btpo_flags = isleaf ? BTP_LEAF : 0;
+ ropaque->btpo_cycleid = 0;
+
+ _bt_restore_page(rpage, datapos, datalen);
+
+ PageSetLSN(rpage, lsn);
+ MarkBufferDirty(rbuf);
+
+ /* Now reconstruct original page (left half of split) */
+ if (XLogReadBufferForRedo(record, 0, &buf) == BLK_NEEDS_REDO)
+ {
+ /*
+ * To keep the tuples in the same physical order as during original execution, we
+ * initialize a temporary empty page for the left page and add all the
+ * items to that in item number order. This mirrors how _bt_split()
+ * works. Retaining the same physical order makes WAL consistency
+ * checking possible. See also _bt_restore_page(), which does the
+ * same for the right page.
+ */
+ Page origpage = (Page) BufferGetPage(buf);
+ BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
+ OffsetNumber off;
+ IndexTuple newitem = NULL,
+ left_hikey = NULL,
+ nposting = NULL;
+ Size newitemsz = 0,
+ left_hikeysz = 0;
+ Page leftpage;
+ OffsetNumber leftoff,
+ replacepostingoff = InvalidOffsetNumber;
+
+ datapos = XLogRecGetBlockData(record, 0, &datalen);
+
+ if (newitemonleft || xlrec->postingoff != 0)
+ {
+ newitem = (IndexTuple) datapos;
+ newitemsz = MAXALIGN(IndexTupleSize(newitem));
+ datapos += newitemsz;
+ datalen -= newitemsz;
+
+ if (xlrec->postingoff != 0)
+ {
+ ItemId itemid;
+ IndexTuple oposting;
+
+ /* Posting list must be at offset number before new item's */
+ replacepostingoff = OffsetNumberPrev(xlrec->newitemoff);
+
+ /* Use mutable, aligned newitem copy in _bt_swap_posting() */
+ newitem = CopyIndexTuple(newitem);
+ itemid = PageGetItemId(origpage, replacepostingoff);
+ oposting = (IndexTuple) PageGetItem(origpage, itemid);
+ nposting = _bt_swap_posting(newitem, oposting,
+ xlrec->postingoff);
+ }
+ }
+
+ /*
+ * Extract left hikey and its size. We assume that 16-bit alignment
+ * is enough to apply IndexTupleSize (since it's fetching from a
+ * uint16 field).
+ */
+ left_hikey = (IndexTuple) datapos;
+ left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey));
+ datapos += left_hikeysz;
+ datalen -= left_hikeysz;
+
+ Assert(datalen == 0);
+
+ leftpage = PageGetTempPageCopySpecial(origpage);
+
+ /* Add high key tuple from WAL record to temp page */
+ leftoff = P_HIKEY;
+ if (PageAddItem(leftpage, (Item) left_hikey, left_hikeysz, P_HIKEY,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add high key to left page after split");
+ leftoff = OffsetNumberNext(leftoff);
+
+ for (off = P_FIRSTDATAKEY(oopaque); off < xlrec->firstrightoff; off++)
+ {
+ ItemId itemid;
+ Size itemsz;
+ IndexTuple item;
+
+ /* Add replacement posting list when required */
+ if (off == replacepostingoff)
+ {
+ Assert(newitemonleft ||
+ xlrec->firstrightoff == xlrec->newitemoff);
+ if (PageAddItem(leftpage, (Item) nposting,
+ MAXALIGN(IndexTupleSize(nposting)), leftoff,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add new posting list item to left page after split");
+ leftoff = OffsetNumberNext(leftoff);
+ continue; /* don't insert oposting */
+ }
+
+ /* add the new item if it was inserted on left page */
+ else if (newitemonleft && off == xlrec->newitemoff)
+ {
+ if (PageAddItem(leftpage, (Item) newitem, newitemsz, leftoff,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add new item to left page after split");
+ leftoff = OffsetNumberNext(leftoff);
+ }
+
+ itemid = PageGetItemId(origpage, off);
+ itemsz = ItemIdGetLength(itemid);
+ item = (IndexTuple) PageGetItem(origpage, itemid);
+ if (PageAddItem(leftpage, (Item) item, itemsz, leftoff,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add old item to left page after split");
+ leftoff = OffsetNumberNext(leftoff);
+ }
+
+ /* cope with possibility that newitem goes at the end */
+ if (newitemonleft && off == xlrec->newitemoff)
+ {
+ if (PageAddItem(leftpage, (Item) newitem, newitemsz, leftoff,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add new item to left page after split");
+ leftoff = OffsetNumberNext(leftoff);
+ }
+
+ PageRestoreTempPage(leftpage, origpage);
+
+ /* Fix opaque fields */
+ oopaque->btpo_flags = BTP_INCOMPLETE_SPLIT;
+ if (isleaf)
+ oopaque->btpo_flags |= BTP_LEAF;
+ oopaque->btpo_next = rightpagenumber;
+ oopaque->btpo_cycleid = 0;
+
+ PageSetLSN(origpage, lsn);
+ MarkBufferDirty(buf);
+ }
+
+ /* Fix left-link of the page to the right of the new right sibling */
+ if (spagenumber != P_NONE)
+ {
+ Buffer sbuf;
+
+ if (XLogReadBufferForRedo(record, 2, &sbuf) == BLK_NEEDS_REDO)
+ {
+ Page spage = (Page) BufferGetPage(sbuf);
+ BTPageOpaque spageop = (BTPageOpaque) PageGetSpecialPointer(spage);
+
+ spageop->btpo_prev = rightpagenumber;
+
+ PageSetLSN(spage, lsn);
+ MarkBufferDirty(sbuf);
+ }
+ if (BufferIsValid(sbuf))
+ UnlockReleaseBuffer(sbuf);
+ }
+
+ /*
+ * Finally, release the remaining buffers. sbuf, rbuf, and buf must be
+ * released together, so that readers cannot observe inconsistencies.
+ */
+ UnlockReleaseBuffer(rbuf);
+ if (BufferIsValid(buf))
+ UnlockReleaseBuffer(buf);
+}
+
+static void
+btree_xlog_dedup(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ xl_btree_dedup *xlrec = (xl_btree_dedup *) XLogRecGetData(record);
+ Buffer buf;
+
+ if (XLogReadBufferForRedo(record, 0, &buf) == BLK_NEEDS_REDO)
+ {
+ char *ptr = XLogRecGetBlockData(record, 0, NULL);
+ Page page = (Page) BufferGetPage(buf);
+ BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ OffsetNumber offnum,
+ minoff,
+ maxoff;
+ BTDedupState state;
+ BTDedupInterval *intervals;
+ Page newpage;
+
+ state = (BTDedupState) palloc(sizeof(BTDedupStateData));
+ state->deduplicate = true; /* unused */
+ state->nmaxitems = 0; /* unused */
+ /* Conservatively use larger maxpostingsize than primary */
+ state->maxpostingsize = BTMaxItemSize(page);
+ state->base = NULL;
+ state->baseoff = InvalidOffsetNumber;
+ state->basetupsize = 0;
+ state->htids = palloc(state->maxpostingsize);
+ state->nhtids = 0;
+ state->nitems = 0;
+ state->phystupsize = 0;
+ state->nintervals = 0;
+
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+ newpage = PageGetTempPageCopySpecial(page);
+
+ if (!P_RIGHTMOST(opaque))
+ {
+ ItemId itemid = PageGetItemId(page, P_HIKEY);
+ Size itemsz = ItemIdGetLength(itemid);
+ IndexTuple item = (IndexTuple) PageGetItem(page, itemid);
+
+ if (PageAddItem(newpage, (Item) item, itemsz, P_HIKEY,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "deduplication failed to add highkey");
+ }
+
+ intervals = (BTDedupInterval *) ptr;
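+
+ /*
+ * Visit the page's tuples in offset order. The intervals logged by the
+ * primary dictate which tuples are merged into the current pending
+ * posting list and which ones begin a new pending tuple.
+ */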
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemid = PageGetItemId(page, offnum);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
+
+ if (offnum == minoff)
+ _bt_dedup_start_pending(state, itup, offnum);
+ else if (state->nintervals < xlrec->nintervals &&
+ state->baseoff == intervals[state->nintervals].baseoff &&
+ state->nitems < intervals[state->nintervals].nitems)
+ {
+ if (!_bt_dedup_save_htid(state, itup))
+ elog(ERROR, "deduplication failed to add heap tid to pending posting list");
+ }
+ else
+ {
+ _bt_dedup_finish_pending(newpage, state);
+ _bt_dedup_start_pending(state, itup, offnum);
+ }
+ }
+
+ _bt_dedup_finish_pending(newpage, state);
+ Assert(state->nintervals == xlrec->nintervals);
+ Assert(memcmp(state->intervals, intervals,
+ state->nintervals * sizeof(BTDedupInterval)) == 0);
+
+ if (P_HAS_GARBAGE(opaque))
+ {
+ BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(newpage);
+
+ nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+ }
+
+ PageRestoreTempPage(newpage, page);
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buf);
+ }
+
+ if (BufferIsValid(buf))
+ UnlockReleaseBuffer(buf);
+}
+
+static void
+btree_xlog_updates(Page page, OffsetNumber *updatedoffsets,
+ xl_btree_update *updates, int nupdated)
+{
+ BTVacuumPosting vacposting;
+ IndexTuple origtuple;
+ ItemId itemid;
+ Size itemsz;
+
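+ /*
+ * For each updated offset, rebuild a BTVacuumPosting work item from its
+ * variable-length xl_btree_update entry, compute the shortened posting
+ * list with _bt_update_posting(), and overwrite the original tuple in
+ * place.
+ */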
+ for (int i = 0; i < nupdated; i++)
+ {
+ itemid = PageGetItemId(page, updatedoffsets[i]);
+ origtuple = (IndexTuple) PageGetItem(page, itemid);
+
+ vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
+ updates->ndeletedtids * sizeof(uint16));
+ vacposting->updatedoffset = updatedoffsets[i];
+ vacposting->itup = origtuple;
+ vacposting->ndeletedtids = updates->ndeletedtids;
+ memcpy(vacposting->deletetids,
+ (char *) updates + SizeOfBtreeUpdate,
+ updates->ndeletedtids * sizeof(uint16));
+
+ _bt_update_posting(vacposting);
+
+ /* Overwrite updated version of tuple */
+ itemsz = MAXALIGN(IndexTupleSize(vacposting->itup));
+ if (!PageIndexTupleOverwrite(page, updatedoffsets[i],
+ (Item) vacposting->itup, itemsz))
+ elog(PANIC, "failed to update partially dead item");
+
+ pfree(vacposting->itup);
+ pfree(vacposting);
+
+ /* advance to next xl_btree_update from array */
+ updates = (xl_btree_update *)
+ ((char *) updates + SizeOfBtreeUpdate +
+ updates->ndeletedtids * sizeof(uint16));
+ }
+}
+
+static void
+btree_xlog_vacuum(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record);
+ Buffer buffer;
+ Page page;
+ BTPageOpaque opaque;
+
+ /*
+ * We need to take a cleanup lock here, just like btvacuumpage(). However,
+ * it isn't necessary to exhaustively get a cleanup lock on every block in
+ * the index during recovery (just getting a cleanup lock on pages with
+ * items to kill suffices). See nbtree/README for details.
+ */
+ if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer)
+ == BLK_NEEDS_REDO)
+ {
+ char *ptr = XLogRecGetBlockData(record, 0, NULL);
+
+ page = (Page) BufferGetPage(buffer);
+
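+ /*
+ * The block data is laid out as the array of deleted offsets, followed
+ * by the array of updated offsets, followed by the variable-length
+ * xl_btree_update entries describing each update.
+ */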
+ if (xlrec->nupdated > 0)
+ {
+ OffsetNumber *updatedoffsets;
+ xl_btree_update *updates;
+
+ updatedoffsets = (OffsetNumber *)
+ (ptr + xlrec->ndeleted * sizeof(OffsetNumber));
+ updates = (xl_btree_update *) ((char *) updatedoffsets +
+ xlrec->nupdated *
+ sizeof(OffsetNumber));
+
+ btree_xlog_updates(page, updatedoffsets, updates, xlrec->nupdated);
+ }
+
+ if (xlrec->ndeleted > 0)
+ PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted);
+
+ /*
+ * Mark the page as not containing any LP_DEAD items --- see comments
+ * in _bt_delitems_vacuum().
+ */
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+}
+
+static void
+btree_xlog_delete(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
+ Buffer buffer;
+ Page page;
+ BTPageOpaque opaque;
+
+ /*
+ * If we have any conflict processing to do, it must happen before we
+ * update the page
+ */
+ if (InHotStandby)
+ {
+ RelFileNode rnode;
+
+ XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
+
+ ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, rnode);
+ }
+
+ /*
+ * We don't need to take a cleanup lock to apply these changes. See
+ * nbtree/README for details.
+ */
+ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+ {
+ char *ptr = XLogRecGetBlockData(record, 0, NULL);
+
+ page = (Page) BufferGetPage(buffer);
+
+ if (xlrec->nupdated > 0)
+ {
+ OffsetNumber *updatedoffsets;
+ xl_btree_update *updates;
+
+ updatedoffsets = (OffsetNumber *)
+ (ptr + xlrec->ndeleted * sizeof(OffsetNumber));
+ updates = (xl_btree_update *) ((char *) updatedoffsets +
+ xlrec->nupdated *
+ sizeof(OffsetNumber));
+
+ btree_xlog_updates(page, updatedoffsets, updates, xlrec->nupdated);
+ }
+
+ if (xlrec->ndeleted > 0)
+ PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted);
+
+ /* Mark the page as not containing any LP_DEAD items */
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+}
+
+static void
+btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ xl_btree_mark_page_halfdead *xlrec = (xl_btree_mark_page_halfdead *) XLogRecGetData(record);
+ Buffer buffer;
+ Page page;
+ BTPageOpaque pageop;
+ IndexTupleData trunctuple;
+
+ /*
+ * In normal operation, we would lock all the pages this WAL record
+ * touches before changing any of them. In WAL replay, it should be okay
+ * to lock just one page at a time, since no concurrent index updates can
+ * be happening, and readers should not care whether they arrive at the
+ * target page or not (since it's surely empty).
+ */
+
+ /* to-be-deleted subtree's parent page */
+ if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
+ {
+ OffsetNumber poffset;
+ ItemId itemid;
+ IndexTuple itup;
+ OffsetNumber nextoffset;
+ BlockNumber rightsib;
+
+ page = (Page) BufferGetPage(buffer);
+ pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
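+ /*
+ * Make the pivot tuple at poffset (the downlink to the branch being
+ * deleted) point to the branch's right sibling, then delete the right
+ * sibling's now-redundant parent entry. This mirrors what
+ * _bt_mark_page_halfdead() did during original execution.
+ */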
+ poffset = xlrec->poffset;
+
+ nextoffset = OffsetNumberNext(poffset);
+ itemid = PageGetItemId(page, nextoffset);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+ rightsib = BTreeTupleGetDownLink(itup);
+
+ itemid = PageGetItemId(page, poffset);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+ BTreeTupleSetDownLink(itup, rightsib);
+ nextoffset = OffsetNumberNext(poffset);
+ PageIndexTupleDelete(page, nextoffset);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+
+ /*
+ * Don't need to couple cross-level locks in REDO routines, so release
+ * lock on internal page immediately
+ */
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+
+ /* Rewrite the leaf page as a halfdead page */
+ buffer = XLogInitBufferForRedo(record, 0);
+ page = (Page) BufferGetPage(buffer);
+
+ _bt_pageinit(page, BufferGetPageSize(buffer));
+ pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ pageop->btpo_prev = xlrec->leftblk;
+ pageop->btpo_next = xlrec->rightblk;
+ pageop->btpo_level = 0;
+ pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
+ pageop->btpo_cycleid = 0;
+
+ /*
+ * Construct a dummy high key item that points to top parent page (value
+ * is InvalidBlockNumber when the top parent page is the leaf page itself)
+ */
+ MemSet(&trunctuple, 0, sizeof(IndexTupleData));
+ trunctuple.t_info = sizeof(IndexTupleData);
+ BTreeTupleSetTopParent(&trunctuple, xlrec->topparent);
+
+ if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "could not add dummy high key to half-dead page");
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ UnlockReleaseBuffer(buffer);
+}
+
+
+static void
+btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) XLogRecGetData(record);
+ BlockNumber leftsib;
+ BlockNumber rightsib;
+ uint32 level;
+ bool isleaf;
+ FullTransactionId safexid;
+ Buffer leftbuf;
+ Buffer target;
+ Buffer rightbuf;
+ Page page;
+ BTPageOpaque pageop;
+
+ leftsib = xlrec->leftsib;
+ rightsib = xlrec->rightsib;
+ level = xlrec->level;
+ isleaf = (level == 0);
+ safexid = xlrec->safexid;
+
+ /* No leaftopparent for level 0 (leaf page) or level 1 target */
+ Assert(!BlockNumberIsValid(xlrec->leaftopparent) || level > 1);
+
+ /*
+ * In normal operation, we would lock all the pages this WAL record
+ * touches before changing any of them. In WAL replay, we at least lock
+ * the pages in the same standard left-to-right order (leftsib, target,
+ * rightsib), and don't release the sibling locks until the target is
+ * marked deleted.
+ */
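+
+ /*
+ * Block 0 is the target page itself, block 1 its left sibling (present
+ * only when there is one), block 2 its right sibling, block 3 the leaf
+ * page to rewrite as half-dead (present only when the target is a
+ * parent page in the deleted subtree rather than the leaf itself), and
+ * block 4 the metapage (only for XLOG_BTREE_UNLINK_PAGE_META records).
+ */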
+
+ /* Fix right-link of left sibling, if any */
+ if (leftsib != P_NONE)
+ {
+ if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO)
+ {
+ page = (Page) BufferGetPage(leftbuf);
+ pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+ pageop->btpo_next = rightsib;
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(leftbuf);
+ }
+ }
+ else
+ leftbuf = InvalidBuffer;
+
+ /* Rewrite target page as empty deleted page */
+ target = XLogInitBufferForRedo(record, 0);
+ page = (Page) BufferGetPage(target);
+
+ _bt_pageinit(page, BufferGetPageSize(target));
+ pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ pageop->btpo_prev = leftsib;
+ pageop->btpo_next = rightsib;
+ pageop->btpo_level = level;
+ BTPageSetDeleted(page, safexid);
+ if (isleaf)
+ pageop->btpo_flags |= BTP_LEAF;
+ pageop->btpo_cycleid = 0;
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(target);
+
+ /* Fix left-link of right sibling */
+ if (XLogReadBufferForRedo(record, 2, &rightbuf) == BLK_NEEDS_REDO)
+ {
+ page = (Page) BufferGetPage(rightbuf);
+ pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+ pageop->btpo_prev = leftsib;
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(rightbuf);
+ }
+
+ /* Release siblings */
+ if (BufferIsValid(leftbuf))
+ UnlockReleaseBuffer(leftbuf);
+ if (BufferIsValid(rightbuf))
+ UnlockReleaseBuffer(rightbuf);
+
+ /* Release target */
+ UnlockReleaseBuffer(target);
+
+ /*
+ * If we deleted a parent of the targeted leaf page, instead of the leaf
+ * itself, update the leaf to point to the next remaining child in the
+ * to-be-deleted subtree
+ */
+ if (XLogRecHasBlockRef(record, 3))
+ {
+ /*
+ * There is no real data on the page, so we just re-create it from
+ * scratch using the information from the WAL record.
+ *
+ * Note that we don't end up here when the target page is also the
+ * leafbuf page. There is no need to add a dummy hikey item with a
+ * top parent link when deleting leafbuf because it's the last page
+ * we'll delete in the subtree undergoing deletion.
+ */
+ Buffer leafbuf;
+ IndexTupleData trunctuple;
+
+ Assert(!isleaf);
+
+ leafbuf = XLogInitBufferForRedo(record, 3);
+ page = (Page) BufferGetPage(leafbuf);
+
+ _bt_pageinit(page, BufferGetPageSize(leafbuf));
+ pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
+ pageop->btpo_prev = xlrec->leafleftsib;
+ pageop->btpo_next = xlrec->leafrightsib;
+ pageop->btpo_level = 0;
+ pageop->btpo_cycleid = 0;
+
+ /* Add a dummy hikey item */
+ MemSet(&trunctuple, 0, sizeof(IndexTupleData));
+ trunctuple.t_info = sizeof(IndexTupleData);
+ BTreeTupleSetTopParent(&trunctuple, xlrec->leaftopparent);
+
+ if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "could not add dummy high key to half-dead page");
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(leafbuf);
+ UnlockReleaseBuffer(leafbuf);
+ }
+
+ /* Update metapage if needed */
+ if (info == XLOG_BTREE_UNLINK_PAGE_META)
+ _bt_restore_meta(record, 4);
+}
+
+static void
+btree_xlog_newroot(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ xl_btree_newroot *xlrec = (xl_btree_newroot *) XLogRecGetData(record);
+ Buffer buffer;
+ Page page;
+ BTPageOpaque pageop;
+ char *ptr;
+ Size len;
+
+ buffer = XLogInitBufferForRedo(record, 0);
+ page = (Page) BufferGetPage(buffer);
+
+ _bt_pageinit(page, BufferGetPageSize(buffer));
+ pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ pageop->btpo_flags = BTP_ROOT;
+ pageop->btpo_prev = pageop->btpo_next = P_NONE;
+ pageop->btpo_level = xlrec->level;
+ if (xlrec->level == 0)
+ pageop->btpo_flags |= BTP_LEAF;
+ pageop->btpo_cycleid = 0;
+
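+ /*
+ * A new root above the leaf level comes from a root split: restore its
+ * items, which are carried as block 0's data, and clear the
+ * incomplete-split flag on the left child (block 1).
+ */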
+ if (xlrec->level > 0)
+ {
+ ptr = XLogRecGetBlockData(record, 0, &len);
+ _bt_restore_page(page, ptr, len);
+
+ /* Clear the incomplete-split flag in left child */
+ _bt_clear_incomplete_split(record, 1);
+ }
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ UnlockReleaseBuffer(buffer);
+
+ _bt_restore_meta(record, 2);
+}
+
+/*
+ * In general VACUUM must defer recycling as a way of avoiding certain race
+ * conditions. Deleted pages contain a safexid value that is used by VACUUM
+ * to determine whether or not it's safe to place a page that was deleted by
+ * VACUUM earlier into the FSM now. See nbtree/README.
+ *
+ * As far as any backend operating during original execution is concerned, the
+ * FSM is a cache of recycle-safe pages; the mere presence of the page in the
+ * FSM indicates that the page must already be safe to recycle (actually,
+ * _bt_getbuf() verifies it's safe using BTPageIsRecyclable(), but that's just
+ * because it would be unwise to completely trust the FSM, given its current
+ * limitations).
+ *
+ * This isn't sufficient to prevent similar concurrent recycling race
+ * conditions during Hot Standby, though. For that we need to log a
+ * xl_btree_reuse_page record at the point that a page is actually recycled
+ * and reused for an entirely unrelated page inside _bt_split(). These
+ * records include the same safexid value from the original deleted page,
+ * stored in the record's latestRemovedFullXid field.
+ *
+ * The GlobalVisCheckRemovableFullXid() test in BTPageIsRecyclable() is used
+ * to determine if it's safe to recycle a page. This mirrors our own test:
+ * the PGPROC->xmin > limitXmin test inside GetConflictingVirtualXIDs().
+ * Consequently, one XID value achieves the same exclusion effect on primary
+ * and standby.
+ */
+static void
+btree_xlog_reuse_page(XLogReaderState *record)
+{
+ xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) XLogRecGetData(record);
+
+ if (InHotStandby)
+ ResolveRecoveryConflictWithSnapshotFullXid(xlrec->latestRemovedFullXid,
+ xlrec->node);
+}
+
+void
+btree_redo(XLogReaderState *record)
+{
+ uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+ MemoryContext oldCtx;
+
+ oldCtx = MemoryContextSwitchTo(opCtx);
+ switch (info)
+ {
+ case XLOG_BTREE_INSERT_LEAF:
+ btree_xlog_insert(true, false, false, record);
+ break;
+ case XLOG_BTREE_INSERT_UPPER:
+ btree_xlog_insert(false, false, false, record);
+ break;
+ case XLOG_BTREE_INSERT_META:
+ btree_xlog_insert(false, true, false, record);
+ break;
+ case XLOG_BTREE_SPLIT_L:
+ btree_xlog_split(true, record);
+ break;
+ case XLOG_BTREE_SPLIT_R:
+ btree_xlog_split(false, record);
+ break;
+ case XLOG_BTREE_INSERT_POST:
+ btree_xlog_insert(true, false, true, record);
+ break;
+ case XLOG_BTREE_DEDUP:
+ btree_xlog_dedup(record);
+ break;
+ case XLOG_BTREE_VACUUM:
+ btree_xlog_vacuum(record);
+ break;
+ case XLOG_BTREE_DELETE:
+ btree_xlog_delete(record);
+ break;
+ case XLOG_BTREE_MARK_PAGE_HALFDEAD:
+ btree_xlog_mark_page_halfdead(info, record);
+ break;
+ case XLOG_BTREE_UNLINK_PAGE:
+ case XLOG_BTREE_UNLINK_PAGE_META:
+ btree_xlog_unlink_page(info, record);
+ break;
+ case XLOG_BTREE_NEWROOT:
+ btree_xlog_newroot(record);
+ break;
+ case XLOG_BTREE_REUSE_PAGE:
+ btree_xlog_reuse_page(record);
+ break;
+ case XLOG_BTREE_META_CLEANUP:
+ _bt_restore_meta(record, 0);
+ break;
+ default:
+ elog(PANIC, "btree_redo: unknown op code %u", info);
+ }
+ MemoryContextSwitchTo(oldCtx);
+ MemoryContextReset(opCtx);
+}
+
+void
+btree_xlog_startup(void)
+{
+ opCtx = AllocSetContextCreate(CurrentMemoryContext,
+ "Btree recovery temporary context",
+ ALLOCSET_DEFAULT_SIZES);
+}
+
+void
+btree_xlog_cleanup(void)
+{
+ MemoryContextDelete(opCtx);
+ opCtx = NULL;
+}
+
+/*
+ * Mask a btree page before performing consistency checks on it.
+ */
+void
+btree_mask(char *pagedata, BlockNumber blkno)
+{
+ Page page = (Page) pagedata;
+ BTPageOpaque maskopaq;
+
+ mask_page_lsn_and_checksum(page);
+
+ mask_page_hint_bits(page);
+ mask_unused_space(page);
+
+ maskopaq = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ if (P_ISLEAF(maskopaq))
+ {
+ /*
+ * In btree leaf pages, it is possible to modify the LP_FLAGS without
+ * emitting any WAL record. Hence, mask the line pointer flags. See
+ * _bt_killitems(), _bt_check_unique() for details.
+ */
+ mask_lp_flags(page);
+ }
+
+ /*
+ * BTP_HAS_GARBAGE is just an un-logged hint bit. So, mask it. See
+ * _bt_delete_or_dedup_one_page(), _bt_killitems(), and _bt_check_unique()
+ * for details.
+ */
+ maskopaq->btpo_flags &= ~BTP_HAS_GARBAGE;
+
+ /*
+ * During replay of a btree page split, we don't set the BTP_SPLIT_END
+ * flag of the right sibling and initialize the cycle_id to 0 for the same
+ * page. See btree_xlog_split() for details.
+ */
+ maskopaq->btpo_flags &= ~BTP_SPLIT_END;
+ maskopaq->btpo_cycleid = 0;
+}