Diffstat (limited to 'src/backend/access/nbtree/nbtsearch.c')
-rw-r--r-- | src/backend/access/nbtree/nbtsearch.c | 2501
1 file changed, 2501 insertions, 0 deletions
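Before the patch body, one illustrative aside: the leaf-page binary searches below (_bt_binsrch and _bt_binsrch_insert) both rely on the lower-bound invariant described in their comments -- every slot before 'low' compares below the scan key, every slot at or after 'high' compares at or above it, and the loop stops when the two meet. A minimal standalone sketch of that pattern on a plain sorted array follows; the function name and the int comparison are illustrative only and are not part of nbtsearch.c.

#include <stddef.h>

/*
 * Lower-bound binary search, mirroring the invariant used by _bt_binsrch():
 * all slots before 'low' compare < key, all slots at or after 'high'
 * compare >= key.  The loop ends when high == low, which indexes the first
 * slot >= key (possibly one past the end, as on a leaf page).
 */
static size_t
lower_bound(const int *arr, size_t nitems, int key)
{
	size_t		low = 0;
	size_t		high = nitems;	/* establish the invariant for high */

	while (high > low)
	{
		size_t		mid = low + (high - low) / 2;	/* low <= mid < high */

		if (arr[mid] < key)		/* plays the role of _bt_compare() >= cmpval */
			low = mid + 1;
		else
			high = mid;
	}

	return low;
}

With nextkey = true the tree code lowers the comparison cutoff (cmpval = 0 instead of 1), which turns the same loop into an upper-bound search, i.e. it finds the first slot strictly greater than the scan key.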
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c new file mode 100644 index 0000000..fdf0e56 --- /dev/null +++ b/src/backend/access/nbtree/nbtsearch.c @@ -0,0 +1,2501 @@ +/*------------------------------------------------------------------------- + * + * nbtsearch.c + * Search code for postgres btrees. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtsearch.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/relscan.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/predicate.h" +#include "utils/lsyscache.h" +#include "utils/rel.h" + + +static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); +static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf); +static int _bt_binsrch_posting(BTScanInsert key, Page page, + OffsetNumber offnum); +static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, + OffsetNumber offnum); +static void _bt_saveitem(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, IndexTuple itup); +static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, ItemPointer heapTid, + IndexTuple itup); +static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, + ItemPointer heapTid, int tupleOffset); +static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); +static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir); +static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, + ScanDirection dir); +static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot); +static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); +static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir); + + +/* + * _bt_drop_lock_and_maybe_pin() + * + * Unlock the buffer; and if it is safe to release the pin, do that, too. It + * is safe if the scan is using an MVCC snapshot and the index is WAL-logged. + * This will prevent vacuum from stalling in a blocked state trying to read a + * page when a cursor is sitting on it -- at least in many important cases. + * + * Set the buffer to invalid if the pin is released, since the buffer may be + * re-used. If we need to go back to this block (for example, to apply + * LP_DEAD hints) we must get a fresh reference to the buffer. Hopefully it + * will remain in shared memory for as long as it takes to scan the index + * buffer page. + */ +static void +_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp) +{ + _bt_unlockbuf(scan->indexRelation, sp->buf); + + if (IsMVCCSnapshot(scan->xs_snapshot) && + RelationNeedsWAL(scan->indexRelation) && + !scan->xs_want_itup) + { + ReleaseBuffer(sp->buf); + sp->buf = InvalidBuffer; + } +} + +/* + * _bt_search() -- Search the tree for a particular scankey, + * or more precisely for the first leaf page it could be on. + * + * The passed scankey is an insertion-type scankey (see nbtree/README), + * but it can omit the rightmost column(s) of the index. + * + * Return value is a stack of parent-page pointers (i.e. there is no entry for + * the leaf level/page). *bufP is set to the address of the leaf-page buffer, + * which is locked and pinned. No locks are held on the parent pages, + * however! 
+ * + * If the snapshot parameter is not NULL, "old snapshot" checking will take + * place during the descent through the tree. This is not needed when + * positioning for an insert or delete, so NULL is used for those cases. + * + * The returned buffer is locked according to access parameter. Additionally, + * access = BT_WRITE will allow an empty root page to be created and returned. + * When access = BT_READ, an empty index will result in *bufP being set to + * InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered + * during the search will be finished. + */ +BTStack +_bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, + Snapshot snapshot) +{ + BTStack stack_in = NULL; + int page_access = BT_READ; + + /* Get the root page to start with */ + *bufP = _bt_getroot(rel, access); + + /* If index is empty and access = BT_READ, no root page is created. */ + if (!BufferIsValid(*bufP)) + return (BTStack) NULL; + + /* Loop iterates once per level descended in the tree */ + for (;;) + { + Page page; + BTPageOpaque opaque; + OffsetNumber offnum; + ItemId itemid; + IndexTuple itup; + BlockNumber child; + BTStack new_stack; + + /* + * Race -- the page we just grabbed may have split since we read its + * downlink in its parent page (or the metapage). If it has, we may + * need to move right to its new sibling. Do that. + * + * In write-mode, allow _bt_moveright to finish any incomplete splits + * along the way. Strictly speaking, we'd only need to finish an + * incomplete split on the leaf page we're about to insert to, not on + * any of the upper levels (internal pages with incomplete splits are + * also taken care of in _bt_getstackbuf). But this is a good + * opportunity to finish splits of internal pages too. + */ + *bufP = _bt_moveright(rel, key, *bufP, (access == BT_WRITE), stack_in, + page_access, snapshot); + + /* if this is a leaf page, we're done */ + page = BufferGetPage(*bufP); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (P_ISLEAF(opaque)) + break; + + /* + * Find the appropriate pivot tuple on this page. Its downlink points + * to the child page that we're about to descend to. + */ + offnum = _bt_binsrch(rel, key, *bufP); + itemid = PageGetItemId(page, offnum); + itup = (IndexTuple) PageGetItem(page, itemid); + Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace); + child = BTreeTupleGetDownLink(itup); + + /* + * We need to save the location of the pivot tuple we chose in a new + * stack entry for this page/level. If caller ends up splitting a + * page one level down, it usually ends up inserting a new pivot + * tuple/downlink immediately after the location recorded here. + */ + new_stack = (BTStack) palloc(sizeof(BTStackData)); + new_stack->bts_blkno = BufferGetBlockNumber(*bufP); + new_stack->bts_offset = offnum; + new_stack->bts_parent = stack_in; + + /* + * Page level 1 is lowest non-leaf page level prior to leaves. So, if + * we're on the level 1 and asked to lock leaf page in write mode, + * then lock next page in write mode, because it must be a leaf. + */ + if (opaque->btpo_level == 1 && access == BT_WRITE) + page_access = BT_WRITE; + + /* drop the read lock on the page, then acquire one on its child */ + *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access); + + /* okay, all set to move down a level */ + stack_in = new_stack; + } + + /* + * If we're asked to lock leaf in write mode, but didn't manage to, then + * relock. 
This should only happen when the root page is a leaf page (and + * the only page in the index other than the metapage). + */ + if (access == BT_WRITE && page_access == BT_READ) + { + /* trade in our read lock for a write lock */ + _bt_unlockbuf(rel, *bufP); + _bt_lockbuf(rel, *bufP, BT_WRITE); + + /* + * Race -- the leaf page may have split after we dropped the read lock + * but before we acquired a write lock. If it has, we may need to + * move right to its new sibling. Do that. + */ + *bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE, + snapshot); + } + + return stack_in; +} + +/* + * _bt_moveright() -- move right in the btree if necessary. + * + * When we follow a pointer to reach a page, it is possible that + * the page has changed in the meanwhile. If this happens, we're + * guaranteed that the page has "split right" -- that is, that any + * data that appeared on the page originally is either on the page + * or strictly to the right of it. + * + * This routine decides whether or not we need to move right in the + * tree by examining the high key entry on the page. If that entry is + * strictly less than the scankey, or <= the scankey in the + * key.nextkey=true case, then we followed the wrong link and we need + * to move right. + * + * The passed insertion-type scankey can omit the rightmost column(s) of the + * index. (see nbtree/README) + * + * When key.nextkey is false (the usual case), we are looking for the first + * item >= key. When key.nextkey is true, we are looking for the first item + * strictly greater than key. + * + * If forupdate is true, we will attempt to finish any incomplete splits + * that we encounter. This is required when locking a target page for an + * insertion, because we don't allow inserting on a page before the split + * is completed. 'stack' is only used if forupdate is true. + * + * On entry, we have the buffer pinned and a lock of the type specified by + * 'access'. If we move right, we release the buffer and lock and acquire + * the same on the right sibling. Return value is the buffer we stop at. + * + * If the snapshot parameter is not NULL, "old snapshot" checking will take + * place during the descent through the tree. This is not needed when + * positioning for an insert or delete, so NULL is used for those cases. + */ +Buffer +_bt_moveright(Relation rel, + BTScanInsert key, + Buffer buf, + bool forupdate, + BTStack stack, + int access, + Snapshot snapshot) +{ + Page page; + BTPageOpaque opaque; + int32 cmpval; + + /* + * When nextkey = false (normal case): if the scan key that brought us to + * this page is > the high key stored on the page, then the page has split + * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could + * have some duplicates to the right as well as the left, but that's + * something that's only ever dealt with on the leaf level, after + * _bt_search has found an initial leaf page.) + * + * When nextkey = true: move right if the scan key is >= page's high key. + * (Note that key.scantid cannot be set in this case.) + * + * The page could even have split more than once, so scan as far as + * needed. + * + * We also have to move right if we followed a link that brought us to a + * dead page. + */ + cmpval = key->nextkey ? 0 : 1; + + for (;;) + { + page = BufferGetPage(buf); + TestForOldSnapshot(snapshot, rel, page); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + if (P_RIGHTMOST(opaque)) + break; + + /* + * Finish any incomplete splits we encounter along the way. 
+ */ + if (forupdate && P_INCOMPLETE_SPLIT(opaque)) + { + BlockNumber blkno = BufferGetBlockNumber(buf); + + /* upgrade our lock if necessary */ + if (access == BT_READ) + { + _bt_unlockbuf(rel, buf); + _bt_lockbuf(rel, buf, BT_WRITE); + } + + if (P_INCOMPLETE_SPLIT(opaque)) + _bt_finish_split(rel, buf, stack); + else + _bt_relbuf(rel, buf); + + /* re-acquire the lock in the right mode, and re-check */ + buf = _bt_getbuf(rel, blkno, access); + continue; + } + + if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval) + { + /* step right one page */ + buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access); + continue; + } + else + break; + } + + if (P_IGNORE(opaque)) + elog(ERROR, "fell off the end of index \"%s\"", + RelationGetRelationName(rel)); + + return buf; +} + +/* + * _bt_binsrch() -- Do a binary search for a key on a particular page. + * + * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first + * key >= given scankey, or > scankey if nextkey is true. (NOTE: in + * particular, this means it is possible to return a value 1 greater than the + * number of keys on the page, if the scankey is > all keys on the page.) + * + * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber + * of the last key < given scankey, or last key <= given scankey if nextkey + * is true. (Since _bt_compare treats the first data key of such a page as + * minus infinity, there will be at least one key < scankey, so the result + * always points at one of the keys on the page.) This key indicates the + * right place to descend to be sure we find all leaf keys >= given scankey + * (or leaf keys > given scankey when nextkey is true). + * + * This procedure is not responsible for walking right, it just examines + * the given page. _bt_binsrch() has no lock or refcount side effects + * on the buffer. + */ +static OffsetNumber +_bt_binsrch(Relation rel, + BTScanInsert key, + Buffer buf) +{ + Page page; + BTPageOpaque opaque; + OffsetNumber low, + high; + int32 result, + cmpval; + + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* Requesting nextkey semantics while using scantid seems nonsensical */ + Assert(!key->nextkey || key->scantid == NULL); + /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */ + Assert(!P_ISLEAF(opaque) || key->scantid == NULL); + + low = P_FIRSTDATAKEY(opaque); + high = PageGetMaxOffsetNumber(page); + + /* + * If there are no keys on the page, return the first available slot. Note + * this covers two cases: the page is really empty (no keys), or it + * contains only a high key. The latter case is possible after vacuuming. + * This can never happen on an internal page, however, since they are + * never empty (an internal page must have children). + */ + if (unlikely(high < low)) + return low; + + /* + * Binary search to find the first key on the page >= scan key, or first + * key > scankey when nextkey is true. + * + * For nextkey=false (cmpval=1), the loop invariant is: all slots before + * 'low' are < scan key, all slots at or after 'high' are >= scan key. + * + * For nextkey=true (cmpval=0), the loop invariant is: all slots before + * 'low' are <= scan key, all slots at or after 'high' are > scan key. + * + * We can fall out when high == low. + */ + high++; /* establish the loop invariant for high */ + + cmpval = key->nextkey ? 
0 : 1; /* select comparison value */ + + while (high > low) + { + OffsetNumber mid = low + ((high - low) / 2); + + /* We have low <= mid < high, so mid points at a real slot */ + + result = _bt_compare(rel, key, page, mid); + + if (result >= cmpval) + low = mid + 1; + else + high = mid; + } + + /* + * At this point we have high == low, but be careful: they could point + * past the last slot on the page. + * + * On a leaf page, we always return the first key >= scan key (resp. > + * scan key), which could be the last slot + 1. + */ + if (P_ISLEAF(opaque)) + return low; + + /* + * On a non-leaf page, return the last key < scan key (resp. <= scan key). + * There must be one if _bt_compare() is playing by the rules. + */ + Assert(low > P_FIRSTDATAKEY(opaque)); + + return OffsetNumberPrev(low); +} + +/* + * + * _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search. + * + * Like _bt_binsrch(), but with support for caching the binary search + * bounds. Only used during insertion, and only on the leaf page that it + * looks like caller will insert tuple on. Exclusive-locked and pinned + * leaf page is contained within insertstate. + * + * Caches the bounds fields in insertstate so that a subsequent call can + * reuse the low and strict high bounds of original binary search. Callers + * that use these fields directly must be prepared for the case where low + * and/or stricthigh are not on the same page (one or both exceed maxoff + * for the page). The case where there are no items on the page (high < + * low) makes bounds invalid. + * + * Caller is responsible for invalidating bounds when it modifies the page + * before calling here a second time, and for dealing with posting list + * tuple matches (callers can use insertstate's postingoff field to + * determine which existing heap TID will need to be replaced by a posting + * list split). + */ +OffsetNumber +_bt_binsrch_insert(Relation rel, BTInsertState insertstate) +{ + BTScanInsert key = insertstate->itup_key; + Page page; + BTPageOpaque opaque; + OffsetNumber low, + high, + stricthigh; + int32 result, + cmpval; + + page = BufferGetPage(insertstate->buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + Assert(P_ISLEAF(opaque)); + Assert(!key->nextkey); + Assert(insertstate->postingoff == 0); + + if (!insertstate->bounds_valid) + { + /* Start new binary search */ + low = P_FIRSTDATAKEY(opaque); + high = PageGetMaxOffsetNumber(page); + } + else + { + /* Restore result of previous binary search against same page */ + low = insertstate->low; + high = insertstate->stricthigh; + } + + /* If there are no keys on the page, return the first available slot */ + if (unlikely(high < low)) + { + /* Caller can't reuse bounds */ + insertstate->low = InvalidOffsetNumber; + insertstate->stricthigh = InvalidOffsetNumber; + insertstate->bounds_valid = false; + return low; + } + + /* + * Binary search to find the first key on the page >= scan key. (nextkey + * is always false when inserting). + * + * The loop invariant is: all slots before 'low' are < scan key, all slots + * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is + * maintained to save additional search effort for caller. + * + * We can fall out when high == low. 
+ */ + if (!insertstate->bounds_valid) + high++; /* establish the loop invariant for high */ + stricthigh = high; /* high initially strictly higher */ + + cmpval = 1; /* !nextkey comparison value */ + + while (high > low) + { + OffsetNumber mid = low + ((high - low) / 2); + + /* We have low <= mid < high, so mid points at a real slot */ + + result = _bt_compare(rel, key, page, mid); + + if (result >= cmpval) + low = mid + 1; + else + { + high = mid; + if (result != 0) + stricthigh = high; + } + + /* + * If tuple at offset located by binary search is a posting list whose + * TID range overlaps with caller's scantid, perform posting list + * binary search to set postingoff for caller. Caller must split the + * posting list when postingoff is set. This should happen + * infrequently. + */ + if (unlikely(result == 0 && key->scantid != NULL)) + { + /* + * postingoff should never be set more than once per leaf page + * binary search. That would mean that there are duplicate table + * TIDs in the index, which is never okay. Check for that here. + */ + if (insertstate->postingoff != 0) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"", + ItemPointerGetBlockNumber(key->scantid), + ItemPointerGetOffsetNumber(key->scantid), + low, stricthigh, + BufferGetBlockNumber(insertstate->buf), + RelationGetRelationName(rel)))); + + insertstate->postingoff = _bt_binsrch_posting(key, page, mid); + } + } + + /* + * On a leaf page, a binary search always returns the first key >= scan + * key (at least in !nextkey case), which could be the last slot + 1. This + * is also the lower bound of cached search. + * + * stricthigh may also be the last slot + 1, which prevents caller from + * using bounds directly, but is still useful to us if we're called a + * second time with cached bounds (cached low will be < stricthigh when + * that happens). + */ + insertstate->low = low; + insertstate->stricthigh = stricthigh; + insertstate->bounds_valid = true; + + return low; +} + +/*---------- + * _bt_binsrch_posting() -- posting list binary search. + * + * Helper routine for _bt_binsrch_insert(). + * + * Returns offset into posting list where caller's scantid belongs. + *---------- + */ +static int +_bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum) +{ + IndexTuple itup; + ItemId itemid; + int low, + high, + mid, + res; + + /* + * If this isn't a posting tuple, then the index must be corrupt (if it is + * an ordinary non-pivot tuple then there must be an existing tuple with a + * heap TID that equals inserter's new heap TID/scantid). Defensively + * check that tuple is a posting list tuple whose posting list range + * includes caller's scantid. + * + * (This is also needed because contrib/amcheck's rootdescend option needs + * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().) + */ + itemid = PageGetItemId(page, offnum); + itup = (IndexTuple) PageGetItem(page, itemid); + if (!BTreeTupleIsPosting(itup)) + return 0; + + Assert(key->heapkeyspace && key->allequalimage); + + /* + * In the event that posting list tuple has LP_DEAD bit set, indicate this + * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A + * second call to _bt_binsrch_insert() can take place when its caller has + * removed the dead item. 
+ */ + if (ItemIdIsDead(itemid)) + return -1; + + /* "high" is past end of posting list for loop invariant */ + low = 0; + high = BTreeTupleGetNPosting(itup); + Assert(high >= 2); + + while (high > low) + { + mid = low + ((high - low) / 2); + res = ItemPointerCompare(key->scantid, + BTreeTupleGetPostingN(itup, mid)); + + if (res > 0) + low = mid + 1; + else if (res < 0) + high = mid; + else + return mid; + } + + /* Exact match not found */ + return low; +} + +/*---------- + * _bt_compare() -- Compare insertion-type scankey to tuple on a page. + * + * page/offnum: location of btree item to be compared to. + * + * This routine returns: + * <0 if scankey < tuple at offnum; + * 0 if scankey == tuple at offnum; + * >0 if scankey > tuple at offnum. + * + * NULLs in the keys are treated as sortable values. Therefore + * "equality" does not necessarily mean that the item should be returned + * to the caller as a matching key. Similarly, an insertion scankey + * with its scantid set is treated as equal to a posting tuple whose TID + * range overlaps with their scantid. There generally won't be a + * matching TID in the posting tuple, which caller must handle + * themselves (e.g., by splitting the posting list tuple). + * + * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be + * "minus infinity": this routine will always claim it is less than the + * scankey. The actual key value stored is explicitly truncated to 0 + * attributes (explicitly minus infinity) with version 3+ indexes, but + * that isn't relied upon. This allows us to implement the Lehman and + * Yao convention that the first down-link pointer is before the first + * key. See backend/access/nbtree/README for details. + *---------- + */ +int32 +_bt_compare(Relation rel, + BTScanInsert key, + Page page, + OffsetNumber offnum) +{ + TupleDesc itupdesc = RelationGetDescr(rel); + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + IndexTuple itup; + ItemPointer heapTid; + ScanKey scankey; + int ncmpkey; + int ntupatts; + int32 result; + + Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum)); + Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel)); + Assert(key->heapkeyspace || key->scantid == NULL); + + /* + * Force result ">" if target item is first data item on an internal page + * --- see NOTE above. + */ + if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque)) + return 1; + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + ntupatts = BTreeTupleGetNAtts(itup, rel); + + /* + * The scan key is set up with the attribute number associated with each + * term in the key. It is important that, if the index is multi-key, the + * scan contain the first k key attributes, and that they be in order. If + * you think about how multi-key ordering works, you'll understand why + * this is. + * + * We don't test for violation of this condition here, however. The + * initial setup for the index scan had better have gotten it right (see + * _bt_first). 
+ */ + + ncmpkey = Min(ntupatts, key->keysz); + Assert(key->heapkeyspace || ncmpkey == key->keysz); + Assert(!BTreeTupleIsPosting(itup) || key->allequalimage); + scankey = key->scankeys; + for (int i = 1; i <= ncmpkey; i++) + { + Datum datum; + bool isNull; + + datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull); + + if (scankey->sk_flags & SK_ISNULL) /* key is NULL */ + { + if (isNull) + result = 0; /* NULL "=" NULL */ + else if (scankey->sk_flags & SK_BT_NULLS_FIRST) + result = -1; /* NULL "<" NOT_NULL */ + else + result = 1; /* NULL ">" NOT_NULL */ + } + else if (isNull) /* key is NOT_NULL and item is NULL */ + { + if (scankey->sk_flags & SK_BT_NULLS_FIRST) + result = 1; /* NOT_NULL ">" NULL */ + else + result = -1; /* NOT_NULL "<" NULL */ + } + else + { + /* + * The sk_func needs to be passed the index value as left arg and + * the sk_argument as right arg (they might be of different + * types). Since it is convenient for callers to think of + * _bt_compare as comparing the scankey to the index item, we have + * to flip the sign of the comparison result. (Unless it's a DESC + * column, in which case we *don't* flip the sign.) + */ + result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func, + scankey->sk_collation, + datum, + scankey->sk_argument)); + + if (!(scankey->sk_flags & SK_BT_DESC)) + INVERT_COMPARE_RESULT(result); + } + + /* if the keys are unequal, return the difference */ + if (result != 0) + return result; + + scankey++; + } + + /* + * All non-truncated attributes (other than heap TID) were found to be + * equal. Treat truncated attributes as minus infinity when scankey has a + * key attribute value that would otherwise be compared directly. + * + * Note: it doesn't matter if ntupatts includes non-key attributes; + * scankey won't, so explicitly excluding non-key attributes isn't + * necessary. + */ + if (key->keysz > ntupatts) + return 1; + + /* + * Use the heap TID attribute and scantid to try to break the tie. The + * rules are the same as any other key attribute -- only the + * representation differs. + */ + heapTid = BTreeTupleGetHeapTID(itup); + if (key->scantid == NULL) + { + /* + * Most searches have a scankey that is considered greater than a + * truncated pivot tuple if and when the scankey has equal values for + * attributes up to and including the least significant untruncated + * attribute in tuple. + * + * For example, if an index has the minimum two attributes (single + * user key attribute, plus heap TID attribute), and a page's high key + * is ('foo', -inf), and scankey is ('foo', <omitted>), the search + * will not descend to the page to the left. The search will descend + * right instead. The truncated attribute in pivot tuple means that + * all non-pivot tuples on the page to the left are strictly < 'foo', + * so it isn't necessary to descend left. In other words, search + * doesn't have to descend left because it isn't interested in a match + * that has a heap TID value of -inf. + * + * However, some searches (pivotsearch searches) actually require that + * we descend left when this happens. -inf is treated as a possible + * match for omitted scankey attribute(s). This is needed by page + * deletion, which must re-find leaf pages that are targets for + * deletion using their high keys. + * + * Note: the heap TID part of the test ensures that scankey is being + * compared to a pivot tuple with one or more truncated key + * attributes. 
+ * + * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the + * left here, since they have no heap TID attribute (and cannot have + * any -inf key values in any case, since truncation can only remove + * non-key attributes). !heapkeyspace searches must always be + * prepared to deal with matches on both sides of the pivot once the + * leaf level is reached. + */ + if (key->heapkeyspace && !key->pivotsearch && + key->keysz == ntupatts && heapTid == NULL) + return 1; + + /* All provided scankey arguments found to be equal */ + return 0; + } + + /* + * Treat truncated heap TID as minus infinity, since scankey has a key + * attribute value (scantid) that would otherwise be compared directly + */ + Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel)); + if (heapTid == NULL) + return 1; + + /* + * Scankey must be treated as equal to a posting list tuple if its scantid + * value falls within the range of the posting list. In all other cases + * there can only be a single heap TID value, which is compared directly + * with scantid. + */ + Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel)); + result = ItemPointerCompare(key->scantid, heapTid); + if (result <= 0 || !BTreeTupleIsPosting(itup)) + return result; + else + { + result = ItemPointerCompare(key->scantid, + BTreeTupleGetMaxHeapTID(itup)); + if (result > 0) + return 1; + } + + return 0; +} + +/* + * _bt_first() -- Find the first item in a scan. + * + * We need to be clever about the direction of scan, the search + * conditions, and the tree ordering. We find the first item (or, + * if backwards scan, the last item) in the tree that satisfies the + * qualifications in the scan key. On success exit, the page containing + * the current index tuple is pinned but not locked, and data about + * the matching tuple(s) on the page has been loaded into so->currPos. + * scan->xs_ctup.t_self is set to the heap TID of the current tuple, + * and if requested, scan->xs_itup points to a copy of the index tuple. + * + * If there are no matching items in the index, we return false, with no + * pins or locks held. + * + * Note that scan->keyData[], and the so->keyData[] scankey built from it, + * are both search-type scankeys (see nbtree/README for more about this). + * Within this routine, we build a temporary insertion-type scankey to use + * in locating the scan start position. + */ +bool +_bt_first(IndexScanDesc scan, ScanDirection dir) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Buffer buf; + BTStack stack; + OffsetNumber offnum; + StrategyNumber strat; + bool nextkey; + bool goback; + BTScanInsertData inskey; + ScanKey startKeys[INDEX_MAX_KEYS]; + ScanKeyData notnullkeys[INDEX_MAX_KEYS]; + int keysCount = 0; + int i; + bool status; + StrategyNumber strat_total; + BTScanPosItem *currItem; + BlockNumber blkno; + + Assert(!BTScanPosIsValid(so->currPos)); + + pgstat_count_index_scan(rel); + + /* + * Examine the scan keys and eliminate any redundant keys; also mark the + * keys that must be matched to continue the scan. + */ + _bt_preprocess_keys(scan); + + /* + * Quit now if _bt_preprocess_keys() discovered that the scan keys can + * never be satisfied (eg, x == 1 AND x > 2). + */ + if (!so->qual_ok) + { + /* Notify any other workers that we're done with this scan key. */ + _bt_parallel_done(scan); + return false; + } + + /* + * For parallel scans, get the starting page from shared state. 
If the + * scan has not started, proceed to find out first leaf page in the usual + * way while keeping other participating processes waiting. If the scan + * has already begun, use the page number from the shared structure. + */ + if (scan->parallel_scan != NULL) + { + status = _bt_parallel_seize(scan, &blkno); + if (!status) + return false; + else if (blkno == P_NONE) + { + _bt_parallel_done(scan); + return false; + } + else if (blkno != InvalidBlockNumber) + { + if (!_bt_parallel_readpage(scan, blkno, dir)) + return false; + goto readcomplete; + } + } + + /*---------- + * Examine the scan keys to discover where we need to start the scan. + * + * We want to identify the keys that can be used as starting boundaries; + * these are =, >, or >= keys for a forward scan or =, <, <= keys for + * a backwards scan. We can use keys for multiple attributes so long as + * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept + * a > or < boundary or find an attribute with no boundary (which can be + * thought of as the same as "> -infinity"), we can't use keys for any + * attributes to its right, because it would break our simplistic notion + * of what initial positioning strategy to use. + * + * When the scan keys include cross-type operators, _bt_preprocess_keys + * may not be able to eliminate redundant keys; in such cases we will + * arbitrarily pick a usable one for each attribute. This is correct + * but possibly not optimal behavior. (For example, with keys like + * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when + * x=5 would be more efficient.) Since the situation only arises given + * a poorly-worded query plus an incomplete opfamily, live with it. + * + * When both equality and inequality keys appear for a single attribute + * (again, only possible when cross-type operators appear), we *must* + * select one of the equality keys for the starting point, because + * _bt_checkkeys() will stop the scan as soon as an equality qual fails. + * For example, if we have keys like "x >= 4 AND x = 10" and we elect to + * start at x=4, we will fail and stop before reaching x=10. If multiple + * equality quals survive preprocessing, however, it doesn't matter which + * one we use --- by definition, they are either redundant or + * contradictory. + * + * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier. + * If the index stores nulls at the end of the index we'll be starting + * from, and we have no boundary key for the column (which means the key + * we deduced NOT NULL from is an inequality key that constrains the other + * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to + * use as a boundary key. If we didn't do this, we might find ourselves + * traversing a lot of null entries at the start of the scan. + * + * In this loop, row-comparison keys are treated the same as keys on their + * first (leftmost) columns. We'll add on lower-order columns of the row + * comparison below, if possible. + * + * The selected scan keys (at most one per index column) are remembered by + * storing their addresses into the local startKeys[] array. + *---------- + */ + strat_total = BTEqualStrategyNumber; + if (so->numberOfKeys > 0) + { + AttrNumber curattr; + ScanKey chosen; + ScanKey impliesNN; + ScanKey cur; + + /* + * chosen is the so-far-chosen key for the current attribute, if any. + * We don't cast the decision in stone until we reach keys for the + * next attribute. 
+ */ + curattr = 1; + chosen = NULL; + /* Also remember any scankey that implies a NOT NULL constraint */ + impliesNN = NULL; + + /* + * Loop iterates from 0 to numberOfKeys inclusive; we use the last + * pass to handle after-last-key processing. Actual exit from the + * loop is at one of the "break" statements below. + */ + for (cur = so->keyData, i = 0;; cur++, i++) + { + if (i >= so->numberOfKeys || cur->sk_attno != curattr) + { + /* + * Done looking at keys for curattr. If we didn't find a + * usable boundary key, see if we can deduce a NOT NULL key. + */ + if (chosen == NULL && impliesNN != NULL && + ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ? + ScanDirectionIsForward(dir) : + ScanDirectionIsBackward(dir))) + { + /* Yes, so build the key in notnullkeys[keysCount] */ + chosen = ¬nullkeys[keysCount]; + ScanKeyEntryInitialize(chosen, + (SK_SEARCHNOTNULL | SK_ISNULL | + (impliesNN->sk_flags & + (SK_BT_DESC | SK_BT_NULLS_FIRST))), + curattr, + ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ? + BTGreaterStrategyNumber : + BTLessStrategyNumber), + InvalidOid, + InvalidOid, + InvalidOid, + (Datum) 0); + } + + /* + * If we still didn't find a usable boundary key, quit; else + * save the boundary key pointer in startKeys. + */ + if (chosen == NULL) + break; + startKeys[keysCount++] = chosen; + + /* + * Adjust strat_total, and quit if we have stored a > or < + * key. + */ + strat = chosen->sk_strategy; + if (strat != BTEqualStrategyNumber) + { + strat_total = strat; + if (strat == BTGreaterStrategyNumber || + strat == BTLessStrategyNumber) + break; + } + + /* + * Done if that was the last attribute, or if next key is not + * in sequence (implying no boundary key is available for the + * next attribute). + */ + if (i >= so->numberOfKeys || + cur->sk_attno != curattr + 1) + break; + + /* + * Reset for next attr. + */ + curattr = cur->sk_attno; + chosen = NULL; + impliesNN = NULL; + } + + /* + * Can we use this key as a starting boundary for this attr? + * + * If not, does it imply a NOT NULL constraint? (Because + * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber, + * *any* inequality key works for that; we need not test.) + */ + switch (cur->sk_strategy) + { + case BTLessStrategyNumber: + case BTLessEqualStrategyNumber: + if (chosen == NULL) + { + if (ScanDirectionIsBackward(dir)) + chosen = cur; + else + impliesNN = cur; + } + break; + case BTEqualStrategyNumber: + /* override any non-equality choice */ + chosen = cur; + break; + case BTGreaterEqualStrategyNumber: + case BTGreaterStrategyNumber: + if (chosen == NULL) + { + if (ScanDirectionIsForward(dir)) + chosen = cur; + else + impliesNN = cur; + } + break; + } + } + } + + /* + * If we found no usable boundary keys, we have to start from one end of + * the tree. Walk down that edge to the first or last key, and scan from + * there. + */ + if (keysCount == 0) + { + bool match; + + match = _bt_endpoint(scan, dir); + + if (!match) + { + /* No match, so mark (parallel) scan finished */ + _bt_parallel_done(scan); + } + + return match; + } + + /* + * We want to start the scan somewhere within the index. Set up an + * insertion scankey we can use to search for the boundary point we + * identified above. The insertion scankey is built using the keys + * identified by startKeys[]. (Remaining insertion scankey fields are + * initialized after initial-positioning strategy is finalized.) 
+ */ + Assert(keysCount <= INDEX_MAX_KEYS); + for (i = 0; i < keysCount; i++) + { + ScanKey cur = startKeys[i]; + + Assert(cur->sk_attno == i + 1); + + if (cur->sk_flags & SK_ROW_HEADER) + { + /* + * Row comparison header: look to the first row member instead. + * + * The member scankeys are already in insertion format (ie, they + * have sk_func = 3-way-comparison function), but we have to watch + * out for nulls, which _bt_preprocess_keys didn't check. A null + * in the first row member makes the condition unmatchable, just + * like qual_ok = false. + */ + ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument); + + Assert(subkey->sk_flags & SK_ROW_MEMBER); + if (subkey->sk_flags & SK_ISNULL) + { + _bt_parallel_done(scan); + return false; + } + memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData)); + + /* + * If the row comparison is the last positioning key we accepted, + * try to add additional keys from the lower-order row members. + * (If we accepted independent conditions on additional index + * columns, we use those instead --- doesn't seem worth trying to + * determine which is more restrictive.) Note that this is OK + * even if the row comparison is of ">" or "<" type, because the + * condition applied to all but the last row member is effectively + * ">=" or "<=", and so the extra keys don't break the positioning + * scheme. But, by the same token, if we aren't able to use all + * the row members, then the part of the row comparison that we + * did use has to be treated as just a ">=" or "<=" condition, and + * so we'd better adjust strat_total accordingly. + */ + if (i == keysCount - 1) + { + bool used_all_subkeys = false; + + Assert(!(subkey->sk_flags & SK_ROW_END)); + for (;;) + { + subkey++; + Assert(subkey->sk_flags & SK_ROW_MEMBER); + if (subkey->sk_attno != keysCount + 1) + break; /* out-of-sequence, can't use it */ + if (subkey->sk_strategy != cur->sk_strategy) + break; /* wrong direction, can't use it */ + if (subkey->sk_flags & SK_ISNULL) + break; /* can't use null keys */ + Assert(keysCount < INDEX_MAX_KEYS); + memcpy(inskey.scankeys + keysCount, subkey, + sizeof(ScanKeyData)); + keysCount++; + if (subkey->sk_flags & SK_ROW_END) + { + used_all_subkeys = true; + break; + } + } + if (!used_all_subkeys) + { + switch (strat_total) + { + case BTLessStrategyNumber: + strat_total = BTLessEqualStrategyNumber; + break; + case BTGreaterStrategyNumber: + strat_total = BTGreaterEqualStrategyNumber; + break; + } + } + break; /* done with outer loop */ + } + } + else + { + /* + * Ordinary comparison key. Transform the search-style scan key + * to an insertion scan key by replacing the sk_func with the + * appropriate btree comparison function. + * + * If scankey operator is not a cross-type comparison, we can use + * the cached comparison function; otherwise gotta look it up in + * the catalogs. (That can't lead to infinite recursion, since no + * indexscan initiated by syscache lookup will use cross-data-type + * operators.) + * + * We support the convention that sk_subtype == InvalidOid means + * the opclass input type; this is a hack to simplify life for + * ScanKeyInit(). 
+ */ + if (cur->sk_subtype == rel->rd_opcintype[i] || + cur->sk_subtype == InvalidOid) + { + FmgrInfo *procinfo; + + procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC); + ScanKeyEntryInitializeWithInfo(inskey.scankeys + i, + cur->sk_flags, + cur->sk_attno, + InvalidStrategy, + cur->sk_subtype, + cur->sk_collation, + procinfo, + cur->sk_argument); + } + else + { + RegProcedure cmp_proc; + + cmp_proc = get_opfamily_proc(rel->rd_opfamily[i], + rel->rd_opcintype[i], + cur->sk_subtype, + BTORDER_PROC); + if (!RegProcedureIsValid(cmp_proc)) + elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"", + BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype, + cur->sk_attno, RelationGetRelationName(rel)); + ScanKeyEntryInitialize(inskey.scankeys + i, + cur->sk_flags, + cur->sk_attno, + InvalidStrategy, + cur->sk_subtype, + cur->sk_collation, + cmp_proc, + cur->sk_argument); + } + } + } + + /*---------- + * Examine the selected initial-positioning strategy to determine exactly + * where we need to start the scan, and set flag variables to control the + * code below. + * + * If nextkey = false, _bt_search and _bt_binsrch will locate the first + * item >= scan key. If nextkey = true, they will locate the first + * item > scan key. + * + * If goback = true, we will then step back one item, while if + * goback = false, we will start the scan on the located item. + *---------- + */ + switch (strat_total) + { + case BTLessStrategyNumber: + + /* + * Find first item >= scankey, then back up one to arrive at last + * item < scankey. (Note: this positioning strategy is only used + * for a backward scan, so that is always the correct starting + * position.) + */ + nextkey = false; + goback = true; + break; + + case BTLessEqualStrategyNumber: + + /* + * Find first item > scankey, then back up one to arrive at last + * item <= scankey. (Note: this positioning strategy is only used + * for a backward scan, so that is always the correct starting + * position.) + */ + nextkey = true; + goback = true; + break; + + case BTEqualStrategyNumber: + + /* + * If a backward scan was specified, need to start with last equal + * item not first one. + */ + if (ScanDirectionIsBackward(dir)) + { + /* + * This is the same as the <= strategy. We will check at the + * end whether the found item is actually =. + */ + nextkey = true; + goback = true; + } + else + { + /* + * This is the same as the >= strategy. We will check at the + * end whether the found item is actually =. + */ + nextkey = false; + goback = false; + } + break; + + case BTGreaterEqualStrategyNumber: + + /* + * Find first item >= scankey. (This is only used for forward + * scans.) + */ + nextkey = false; + goback = false; + break; + + case BTGreaterStrategyNumber: + + /* + * Find first item > scankey. (This is only used for forward + * scans.) + */ + nextkey = true; + goback = false; + break; + + default: + /* can't get here, but keep compiler quiet */ + elog(ERROR, "unrecognized strat_total: %d", (int) strat_total); + return false; + } + + /* Initialize remaining insertion scan key fields */ + _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage); + inskey.anynullkeys = false; /* unused */ + inskey.nextkey = nextkey; + inskey.pivotsearch = false; + inskey.scantid = NULL; + inskey.keysz = keysCount; + + /* + * Use the manufactured insertion scan key to descend the tree and + * position ourselves on the target leaf page. 
+ */ + stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot); + + /* don't need to keep the stack around... */ + _bt_freestack(stack); + + if (!BufferIsValid(buf)) + { + /* + * We only get here if the index is completely empty. Lock relation + * because nothing finer to lock exists. + */ + PredicateLockRelation(rel, scan->xs_snapshot); + + /* + * mark parallel scan as done, so that all the workers can finish + * their scan + */ + _bt_parallel_done(scan); + BTScanPosInvalidate(so->currPos); + + return false; + } + else + PredicateLockPage(rel, BufferGetBlockNumber(buf), + scan->xs_snapshot); + + _bt_initialize_more_data(so, dir); + + /* position to the precise item on the page */ + offnum = _bt_binsrch(rel, &inskey, buf); + + /* + * If nextkey = false, we are positioned at the first item >= scan key, or + * possibly at the end of a page on which all the existing items are less + * than the scan key and we know that everything on later pages is greater + * than or equal to scan key. + * + * If nextkey = true, we are positioned at the first item > scan key, or + * possibly at the end of a page on which all the existing items are less + * than or equal to the scan key and we know that everything on later + * pages is greater than scan key. + * + * The actually desired starting point is either this item or the prior + * one, or in the end-of-page case it's the first item on the next page or + * the last item on this page. Adjust the starting offset if needed. (If + * this results in an offset before the first item or after the last one, + * _bt_readpage will report no items found, and then we'll step to the + * next page as needed.) + */ + if (goback) + offnum = OffsetNumberPrev(offnum); + + /* remember which buffer we have pinned, if any */ + Assert(!BTScanPosIsValid(so->currPos)); + so->currPos.buf = buf; + + /* + * Now load data from the first page of the scan. + */ + if (!_bt_readpage(scan, dir, offnum)) + { + /* + * There's no actually-matching data on this page. Try to advance to + * the next page. Return false if there's no matching data at all. + */ + _bt_unlockbuf(scan->indexRelation, so->currPos.buf); + if (!_bt_steppage(scan, dir)) + return false; + } + else + { + /* Drop the lock, and maybe the pin, on the current page */ + _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + } + +readcomplete: + /* OK, itemIndex says what to return */ + currItem = &so->currPos.items[so->currPos.itemIndex]; + scan->xs_heaptid = currItem->heapTid; + if (scan->xs_want_itup) + scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + + return true; +} + +/* + * _bt_next() -- Get the next item in a scan. + * + * On entry, so->currPos describes the current page, which may be pinned + * but is not locked, and so->currPos.itemIndex identifies which item was + * previously returned. + * + * On successful exit, scan->xs_ctup.t_self is set to the TID of the + * next heap tuple, and if requested, scan->xs_itup points to a copy of + * the index tuple. so->currPos is updated as needed. + * + * On failure exit (no more tuples), we release pin and set + * so->currPos.buf to InvalidBuffer. + */ +bool +_bt_next(IndexScanDesc scan, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTScanPosItem *currItem; + + /* + * Advance to next tuple on current page; or if there's no more, try to + * step to the next page with data. 
+ */ + if (ScanDirectionIsForward(dir)) + { + if (++so->currPos.itemIndex > so->currPos.lastItem) + { + if (!_bt_steppage(scan, dir)) + return false; + } + } + else + { + if (--so->currPos.itemIndex < so->currPos.firstItem) + { + if (!_bt_steppage(scan, dir)) + return false; + } + } + + /* OK, itemIndex says what to return */ + currItem = &so->currPos.items[so->currPos.itemIndex]; + scan->xs_heaptid = currItem->heapTid; + if (scan->xs_want_itup) + scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + + return true; +} + +/* + * _bt_readpage() -- Load data from current index page into so->currPos + * + * Caller must have pinned and read-locked so->currPos.buf; the buffer's state + * is not changed here. Also, currPos.moreLeft and moreRight must be valid; + * they are updated as appropriate. All other fields of so->currPos are + * initialized from scratch here. + * + * We scan the current page starting at offnum and moving in the indicated + * direction. All items matching the scan keys are loaded into currPos.items. + * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports + * that there can be no more matching tuples in the current scan direction. + * + * In the case of a parallel scan, caller must have called _bt_parallel_seize + * prior to calling this function; this function will invoke + * _bt_parallel_release before returning. + * + * Returns true if any matching items found on the page, false if none. + */ +static bool +_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Page page; + BTPageOpaque opaque; + OffsetNumber minoff; + OffsetNumber maxoff; + int itemIndex; + bool continuescan; + int indnatts; + + /* + * We must have the buffer pinned and locked, but the usual macro can't be + * used here; this function is what makes it good for currPos. + */ + Assert(BufferIsValid(so->currPos.buf)); + + page = BufferGetPage(so->currPos.buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* allow next page be processed by parallel worker */ + if (scan->parallel_scan) + { + if (ScanDirectionIsForward(dir)) + _bt_parallel_release(scan, opaque->btpo_next); + else + _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf)); + } + + continuescan = true; /* default assumption */ + indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation); + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + + /* + * We note the buffer's block number so that we can release the pin later. + * This allows us to re-read the buffer if it is needed again for hinting. + */ + so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf); + + /* + * We save the LSN of the page as we read it, so that we know whether it + * safe to apply LP_DEAD hints to the page later. This allows us to drop + * the pin for MVCC scans, which allows vacuum to avoid blocking. + */ + so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf); + + /* + * we must save the page's right-link while scanning it; this tells us + * where to step right to after we're done with these items. There is no + * corresponding need for the left-link, since splits always go right. + */ + so->currPos.nextPage = opaque->btpo_next; + + /* initialize tuple workspace to empty */ + so->currPos.nextTupleOffset = 0; + + /* + * Now that the current page has been made consistent, the macro should be + * good. 
+ */ + Assert(BTScanPosIsPinned(so->currPos)); + + if (ScanDirectionIsForward(dir)) + { + /* load items[] in ascending order */ + itemIndex = 0; + + offnum = Max(offnum, minoff); + + while (offnum <= maxoff) + { + ItemId iid = PageGetItemId(page, offnum); + IndexTuple itup; + + /* + * If the scan specifies not to return killed tuples, then we + * treat a killed tuple as not passing the qual + */ + if (scan->ignore_killed_tuples && ItemIdIsDead(iid)) + { + offnum = OffsetNumberNext(offnum); + continue; + } + + itup = (IndexTuple) PageGetItem(page, iid); + + if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan)) + { + /* tuple passes all scan key conditions */ + if (!BTreeTupleIsPosting(itup)) + { + /* Remember it */ + _bt_saveitem(so, itemIndex, offnum, itup); + itemIndex++; + } + else + { + int tupleOffset; + + /* + * Set up state to return posting list, and remember first + * TID + */ + tupleOffset = + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, 0), + itup); + itemIndex++; + /* Remember additional TIDs */ + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i), + tupleOffset); + itemIndex++; + } + } + } + /* When !continuescan, there can't be any more matches, so stop */ + if (!continuescan) + break; + + offnum = OffsetNumberNext(offnum); + } + + /* + * We don't need to visit page to the right when the high key + * indicates that no more matches will be found there. + * + * Checking the high key like this works out more often than you might + * think. Leaf page splits pick a split point between the two most + * dissimilar tuples (this is weighed against the need to evenly share + * free space). Leaf pages with high key attribute values that can + * only appear on non-pivot tuples on the right sibling page are + * common. + */ + if (continuescan && !P_RIGHTMOST(opaque)) + { + ItemId iid = PageGetItemId(page, P_HIKEY); + IndexTuple itup = (IndexTuple) PageGetItem(page, iid); + int truncatt; + + truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation); + _bt_checkkeys(scan, itup, truncatt, dir, &continuescan); + } + + if (!continuescan) + so->currPos.moreRight = false; + + Assert(itemIndex <= MaxTIDsPerBTreePage); + so->currPos.firstItem = 0; + so->currPos.lastItem = itemIndex - 1; + so->currPos.itemIndex = 0; + } + else + { + /* load items[] in descending order */ + itemIndex = MaxTIDsPerBTreePage; + + offnum = Min(offnum, maxoff); + + while (offnum >= minoff) + { + ItemId iid = PageGetItemId(page, offnum); + IndexTuple itup; + bool tuple_alive; + bool passes_quals; + + /* + * If the scan specifies not to return killed tuples, then we + * treat a killed tuple as not passing the qual. Most of the + * time, it's a win to not bother examining the tuple's index + * keys, but just skip to the next tuple (previous, actually, + * since we're scanning backwards). However, if this is the first + * tuple on the page, we do check the index keys, to prevent + * uselessly advancing to the page to the left. This is similar + * to the high key optimization used by forward scans. 
+ */ + if (scan->ignore_killed_tuples && ItemIdIsDead(iid)) + { + Assert(offnum >= P_FIRSTDATAKEY(opaque)); + if (offnum > P_FIRSTDATAKEY(opaque)) + { + offnum = OffsetNumberPrev(offnum); + continue; + } + + tuple_alive = false; + } + else + tuple_alive = true; + + itup = (IndexTuple) PageGetItem(page, iid); + + passes_quals = _bt_checkkeys(scan, itup, indnatts, dir, + &continuescan); + if (passes_quals && tuple_alive) + { + /* tuple passes all scan key conditions */ + if (!BTreeTupleIsPosting(itup)) + { + /* Remember it */ + itemIndex--; + _bt_saveitem(so, itemIndex, offnum, itup); + } + else + { + int tupleOffset; + + /* + * Set up state to return posting list, and remember first + * TID. + * + * Note that we deliberately save/return items from + * posting lists in ascending heap TID order for backwards + * scans. This allows _bt_killitems() to make a + * consistent assumption about the order of items + * associated with the same posting list tuple. + */ + itemIndex--; + tupleOffset = + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, 0), + itup); + /* Remember additional TIDs */ + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + itemIndex--; + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i), + tupleOffset); + } + } + } + if (!continuescan) + { + /* there can't be any more matches, so stop */ + so->currPos.moreLeft = false; + break; + } + + offnum = OffsetNumberPrev(offnum); + } + + Assert(itemIndex >= 0); + so->currPos.firstItem = itemIndex; + so->currPos.lastItem = MaxTIDsPerBTreePage - 1; + so->currPos.itemIndex = MaxTIDsPerBTreePage - 1; + } + + return (so->currPos.firstItem <= so->currPos.lastItem); +} + +/* Save an index item into so->currPos.items[itemIndex] */ +static void +_bt_saveitem(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, IndexTuple itup) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup)); + + currItem->heapTid = itup->t_tid; + currItem->indexOffset = offnum; + if (so->currTuples) + { + Size itupsz = IndexTupleSize(itup); + + currItem->tupleOffset = so->currPos.nextTupleOffset; + memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz); + so->currPos.nextTupleOffset += MAXALIGN(itupsz); + } +} + +/* + * Setup state to save TIDs/items from a single posting list tuple. + * + * Saves an index item into so->currPos.items[itemIndex] for TID that is + * returned to scan first. Second or subsequent TIDs for posting list should + * be saved by calling _bt_savepostingitem(). + * + * Returns an offset into tuple storage space that main tuple is stored at if + * needed. 
+ */ +static int +_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid, IndexTuple itup) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + Assert(BTreeTupleIsPosting(itup)); + + currItem->heapTid = *heapTid; + currItem->indexOffset = offnum; + if (so->currTuples) + { + /* Save base IndexTuple (truncate posting list) */ + IndexTuple base; + Size itupsz = BTreeTupleGetPostingOffset(itup); + + itupsz = MAXALIGN(itupsz); + currItem->tupleOffset = so->currPos.nextTupleOffset; + base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset); + memcpy(base, itup, itupsz); + /* Defensively reduce work area index tuple header size */ + base->t_info &= ~INDEX_SIZE_MASK; + base->t_info |= itupsz; + so->currPos.nextTupleOffset += itupsz; + + return currItem->tupleOffset; + } + + return 0; +} + +/* + * Save an index item into so->currPos.items[itemIndex] for current posting + * tuple. + * + * Assumes that _bt_setuppostingitems() has already been called for current + * posting list tuple. Caller passes its return value as tupleOffset. + */ +static inline void +_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid, int tupleOffset) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + currItem->heapTid = *heapTid; + currItem->indexOffset = offnum; + + /* + * Have index-only scans return the same base IndexTuple for every TID + * that originates from the same posting list + */ + if (so->currTuples) + currItem->tupleOffset = tupleOffset; +} + +/* + * _bt_steppage() -- Step to next page containing valid data for scan + * + * On entry, if so->currPos.buf is valid the buffer is pinned but not locked; + * if pinned, we'll drop the pin before moving to next page. The buffer is + * not locked on entry. + * + * For success on a scan using a non-MVCC snapshot we hold a pin, but not a + * read lock, on that page. If we do not hold the pin, we set so->currPos.buf + * to InvalidBuffer. We return true to indicate success. + */ +static bool +_bt_steppage(IndexScanDesc scan, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BlockNumber blkno = InvalidBlockNumber; + bool status; + + Assert(BTScanPosIsValid(so->currPos)); + + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0) + _bt_killitems(scan); + + /* + * Before we modify currPos, make a copy of the page data if there was a + * mark position that needs it. + */ + if (so->markItemIndex >= 0) + { + /* bump pin on current buffer for assignment to mark buffer */ + if (BTScanPosIsPinned(so->currPos)) + IncrBufferRefCount(so->currPos.buf); + memcpy(&so->markPos, &so->currPos, + offsetof(BTScanPosData, items[1]) + + so->currPos.lastItem * sizeof(BTScanPosItem)); + if (so->markTuples) + memcpy(so->markTuples, so->currTuples, + so->currPos.nextTupleOffset); + so->markPos.itemIndex = so->markItemIndex; + so->markItemIndex = -1; + } + + if (ScanDirectionIsForward(dir)) + { + /* Walk right to the next page with data */ + if (scan->parallel_scan != NULL) + { + /* + * Seize the scan to get the next block number; if the scan has + * ended already, bail out. + */ + status = _bt_parallel_seize(scan, &blkno); + if (!status) + { + /* release the previous buffer, if pinned */ + BTScanPosUnpinIfPinned(so->currPos); + BTScanPosInvalidate(so->currPos); + return false; + } + } + else + { + /* Not parallel, so use the previously-saved nextPage link. 
/*
 *  _bt_steppage() -- Step to next page containing valid data for scan
 *
 * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
 * if pinned, we'll drop the pin before moving to the next page.  The buffer
 * is not locked on exit.
 *
 * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
 * read lock, on that page.  If we do not hold the pin, we set so->currPos.buf
 * to InvalidBuffer.  We return true to indicate success.
 */
static bool
_bt_steppage(IndexScanDesc scan, ScanDirection dir)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    BlockNumber blkno = InvalidBlockNumber;
    bool        status;

    Assert(BTScanPosIsValid(so->currPos));

    /* Before leaving current page, deal with any killed items */
    if (so->numKilled > 0)
        _bt_killitems(scan);

    /*
     * Before we modify currPos, make a copy of the page data if there was a
     * mark position that needs it.
     */
    if (so->markItemIndex >= 0)
    {
        /* bump pin on current buffer for assignment to mark buffer */
        if (BTScanPosIsPinned(so->currPos))
            IncrBufferRefCount(so->currPos.buf);
        memcpy(&so->markPos, &so->currPos,
               offsetof(BTScanPosData, items[1]) +
               so->currPos.lastItem * sizeof(BTScanPosItem));
        if (so->markTuples)
            memcpy(so->markTuples, so->currTuples,
                   so->currPos.nextTupleOffset);
        so->markPos.itemIndex = so->markItemIndex;
        so->markItemIndex = -1;
    }

    if (ScanDirectionIsForward(dir))
    {
        /* Walk right to the next page with data */
        if (scan->parallel_scan != NULL)
        {
            /*
             * Seize the scan to get the next block number; if the scan has
             * ended already, bail out.
             */
            status = _bt_parallel_seize(scan, &blkno);
            if (!status)
            {
                /* release the previous buffer, if pinned */
                BTScanPosUnpinIfPinned(so->currPos);
                BTScanPosInvalidate(so->currPos);
                return false;
            }
        }
        else
        {
            /* Not parallel, so use the previously-saved nextPage link. */
            blkno = so->currPos.nextPage;
        }

        /* Remember we left a page with data */
        so->currPos.moreLeft = true;

        /* release the previous buffer, if pinned */
        BTScanPosUnpinIfPinned(so->currPos);
    }
    else
    {
        /* Remember we left a page with data */
        so->currPos.moreRight = true;

        if (scan->parallel_scan != NULL)
        {
            /*
             * Seize the scan to get the current block number; if the scan
             * has ended already, bail out.
             */
            status = _bt_parallel_seize(scan, &blkno);
            BTScanPosUnpinIfPinned(so->currPos);
            if (!status)
            {
                BTScanPosInvalidate(so->currPos);
                return false;
            }
        }
        else
        {
            /* Not parallel, so just use our own notion of the current page */
            blkno = so->currPos.currPage;
        }
    }

    if (!_bt_readnextpage(scan, blkno, dir))
        return false;

    /* Drop the lock, and maybe the pin, on the current page */
    _bt_drop_lock_and_maybe_pin(scan, &so->currPos);

    return true;
}
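The mark-position memcpy() above copies only the fixed part of BTScanPosData plus the items[] entries that are actually in use, sized as offsetof(..., items[1]) plus lastItem further elements. A standalone sketch of that sizing arithmetic with a simplified stand-in struct (names and sizes are illustrative, not the real nbtree types):

#include <stddef.h>
#include <stdio.h>
#include <string.h>

typedef struct DemoItem
{
    unsigned    offnum;
} DemoItem;

typedef struct DemoPos
{
    int         firstItem;
    int         lastItem;       /* last valid index into items[] */
    int         itemIndex;
    DemoItem    items[16];      /* MaxTIDsPerBTreePage stand-in */
} DemoPos;

int
main(void)
{
    DemoPos     cur = {0, 3, 3, {{10}, {11}, {12}, {13}}};
    DemoPos     mark;
    size_t      used = offsetof(DemoPos, items[1]) +
        cur.lastItem * sizeof(DemoItem);

    /* copy the header plus items[0..lastItem], and nothing beyond that */
    memcpy(&mark, &cur, used);

    printf("copied %zu of %zu bytes; last saved offnum %u\n",
           used, sizeof(DemoPos), mark.items[mark.lastItem].offnum);
    return 0;
}

Taking items[1]'s offset plus lastItem further entries covers exactly items[0] through items[lastItem], which is why the copy never needs to touch the rest of the page-sized array.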
/*
 *  _bt_readnextpage() -- Read next page containing valid data for scan
 *
 * On success exit, so->currPos is updated to contain data from the next
 * interesting page.  Caller is responsible for releasing the lock and pin
 * on the buffer on success.  We return true to indicate success.
 *
 * If there are no more matching records in the given direction, we drop all
 * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
 */
static bool
_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Relation    rel;
    Page        page;
    BTPageOpaque opaque;
    bool        status;

    rel = scan->indexRelation;

    if (ScanDirectionIsForward(dir))
    {
        for (;;)
        {
            /*
             * if we're at end of scan, give up and mark parallel scan as
             * done, so that all the workers can finish their scan
             */
            if (blkno == P_NONE || !so->currPos.moreRight)
            {
                _bt_parallel_done(scan);
                BTScanPosInvalidate(so->currPos);
                return false;
            }
            /* check for interrupts while we're not holding any buffer lock */
            CHECK_FOR_INTERRUPTS();
            /* step right one page */
            so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
            page = BufferGetPage(so->currPos.buf);
            TestForOldSnapshot(scan->xs_snapshot, rel, page);
            opaque = (BTPageOpaque) PageGetSpecialPointer(page);
            /* check for deleted page */
            if (!P_IGNORE(opaque))
            {
                PredicateLockPage(rel, blkno, scan->xs_snapshot);
                /* see if there are any matches on this page */
                /* note that this will clear moreRight if we can stop */
                if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
                    break;
            }
            else if (scan->parallel_scan != NULL)
            {
                /* allow next page to be processed by parallel worker */
                _bt_parallel_release(scan, opaque->btpo_next);
            }

            /* nope, keep going */
            if (scan->parallel_scan != NULL)
            {
                _bt_relbuf(rel, so->currPos.buf);
                status = _bt_parallel_seize(scan, &blkno);
                if (!status)
                {
                    BTScanPosInvalidate(so->currPos);
                    return false;
                }
            }
            else
            {
                blkno = opaque->btpo_next;
                _bt_relbuf(rel, so->currPos.buf);
            }
        }
    }
    else
    {
        /*
         * Should only happen in parallel cases, when some other backend
         * advanced the scan.
         */
        if (so->currPos.currPage != blkno)
        {
            BTScanPosUnpinIfPinned(so->currPos);
            so->currPos.currPage = blkno;
        }

        /*
         * Walk left to the next page with data.  This is much more complex
         * than the walk-right case because of the possibility that the page
         * to our left splits while we are in flight to it, plus the
         * possibility that the page we were on gets deleted after we leave
         * it.  See nbtree/README for details.
         *
         * It might be possible to rearrange this code to have less overhead
         * in pinning and locking, but that would require capturing the left
         * pointer when the page is initially read, and using it here, along
         * with big changes to _bt_walk_left() and the code below.  It is not
         * clear whether this would be a win, since if the page immediately
         * to the left splits after we read this page and before we step
         * left, we would need to visit more pages than with the current
         * code.
         *
         * Note that if we change the code so that we drop the pin for a scan
         * which uses a non-MVCC snapshot, we will need to modify the code
         * for walking left, to allow for the possibility that a referenced
         * page has been deleted.  As long as the buffer is pinned or the
         * snapshot is MVCC the page cannot move past the half-dead state to
         * fully deleted.
         */
        if (BTScanPosIsPinned(so->currPos))
            _bt_lockbuf(rel, so->currPos.buf, BT_READ);
        else
            so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);

        for (;;)
        {
            /* Done if we know there are no matching keys to the left */
            if (!so->currPos.moreLeft)
            {
                _bt_relbuf(rel, so->currPos.buf);
                _bt_parallel_done(scan);
                BTScanPosInvalidate(so->currPos);
                return false;
            }

            /* Step to next physical page */
            so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
                                            scan->xs_snapshot);

            /* if we're physically at end of index, return failure */
            if (so->currPos.buf == InvalidBuffer)
            {
                _bt_parallel_done(scan);
                BTScanPosInvalidate(so->currPos);
                return false;
            }

            /*
             * Okay, we managed to move left to a non-deleted page.  Done if
             * it's not half-dead and contains matching tuples.  Else loop
             * back and do it all again.
             */
            page = BufferGetPage(so->currPos.buf);
            TestForOldSnapshot(scan->xs_snapshot, rel, page);
            opaque = (BTPageOpaque) PageGetSpecialPointer(page);
            if (!P_IGNORE(opaque))
            {
                PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
                /* see if there are any matches on this page */
                /* note that this will clear moreLeft if we can stop */
                if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
                    break;
            }
            else if (scan->parallel_scan != NULL)
            {
                /* allow next page to be processed by parallel worker */
                _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
            }

            /*
             * For parallel scans, get the last page scanned as it is quite
             * possible that by the time we try to seize the scan, some other
             * worker has already advanced the scan to a different page.  We
             * must continue based on the latest page scanned by any worker.
             */
            if (scan->parallel_scan != NULL)
            {
                _bt_relbuf(rel, so->currPos.buf);
                status = _bt_parallel_seize(scan, &blkno);
                if (!status)
                {
                    BTScanPosInvalidate(so->currPos);
                    return false;
                }
                so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
            }
        }
    }

    return true;
}
/*
 *  _bt_parallel_readpage() -- Read current page containing valid data for scan
 *
 * On success, release lock and maybe pin on buffer.  We return true to
 * indicate success.
 */
static bool
_bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    _bt_initialize_more_data(so, dir);

    if (!_bt_readnextpage(scan, blkno, dir))
        return false;

    /* Drop the lock, and maybe the pin, on the current page */
    _bt_drop_lock_and_maybe_pin(scan, &so->currPos);

    return true;
}
/*
 * _bt_walk_left() -- step left one page, if possible
 *
 * The given buffer must be pinned and read-locked.  This will be dropped
 * before stepping left.  On return, we have pin and read lock on the
 * returned page, instead.
 *
 * Returns InvalidBuffer if there is no page to the left (no lock is held in
 * that case).
 *
 * When working on a non-leaf level, it is possible for the returned page to
 * be half-dead; the caller should check that condition and step left again
 * if it's important.
 */
static Buffer
_bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
{
    Page        page;
    BTPageOpaque opaque;

    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    for (;;)
    {
        BlockNumber obknum;
        BlockNumber lblkno;
        BlockNumber blkno;
        int         tries;

        /* if we're at end of tree, release buf and return failure */
        if (P_LEFTMOST(opaque))
        {
            _bt_relbuf(rel, buf);
            break;
        }
        /* remember original page we are stepping left from */
        obknum = BufferGetBlockNumber(buf);
        /* step left */
        blkno = lblkno = opaque->btpo_prev;
        _bt_relbuf(rel, buf);
        /* check for interrupts while we're not holding any buffer lock */
        CHECK_FOR_INTERRUPTS();
        buf = _bt_getbuf(rel, blkno, BT_READ);
        page = BufferGetPage(buf);
        TestForOldSnapshot(snapshot, rel, page);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);

        /*
         * If this isn't the page we want, walk right till we find what we
         * want --- but go no more than four hops (an arbitrary limit).  If
         * we don't find the correct page by then, the most likely bet is
         * that the original page got deleted and isn't in the sibling chain
         * at all anymore, not that its left sibling got split more than four
         * times.
         *
         * Note that it is correct to test P_ISDELETED not P_IGNORE here,
         * because half-dead pages are still in the sibling chain.  Caller
         * must reject half-dead pages if wanted.
         */
        tries = 0;
        for (;;)
        {
            if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
            {
                /* Found desired page, return it */
                return buf;
            }
            if (P_RIGHTMOST(opaque) || ++tries > 4)
                break;
            blkno = opaque->btpo_next;
            buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
            page = BufferGetPage(buf);
            TestForOldSnapshot(snapshot, rel, page);
            opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        }

        /* Return to the original page to see what's up */
        buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
        page = BufferGetPage(buf);
        TestForOldSnapshot(snapshot, rel, page);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        if (P_ISDELETED(opaque))
        {
            /*
             * It was deleted.  Move right to first nondeleted page (there
             * must be one); that is the page that has acquired the deleted
             * one's keyspace, so stepping left from it will take us where we
             * want to be.
             */
            for (;;)
            {
                if (P_RIGHTMOST(opaque))
                    elog(ERROR, "fell off the end of index \"%s\"",
                         RelationGetRelationName(rel));
                blkno = opaque->btpo_next;
                buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
                page = BufferGetPage(buf);
                TestForOldSnapshot(snapshot, rel, page);
                opaque = (BTPageOpaque) PageGetSpecialPointer(page);
                if (!P_ISDELETED(opaque))
                    break;
            }

            /*
             * Now return to top of loop, resetting obknum to point to this
             * nondeleted page, and try again.
             */
        }
        else
        {
            /*
             * It wasn't deleted; the explanation had better be that the page
             * to the left got split or deleted.  Without this check, we'd go
             * into an infinite loop if there's anything wrong.
             */
            if (opaque->btpo_prev == lblkno)
                elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
                     obknum, RelationGetRelationName(rel));
            /* Okay to try again with new lblkno value */
        }
    }

    return InvalidBuffer;
}
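_bt_walk_left() recovers from a concurrent split by walking right from the stale left-sibling pointer, but gives up after four hops and goes back to re-examine the original page instead. A toy, self-contained illustration of that bounded retry over a right-linked chain; the Node type and find_left_sibling() are invented for this sketch and are not nbtree code:

#include <stdio.h>

typedef struct Node
{
    unsigned        blkno;
    struct Node    *right;      /* plays the role of btpo_next */
} Node;

/*
 * Starting from the remembered (possibly stale) left-sibling pointer, walk
 * right looking for the node whose right-link points at "target".  Give up
 * after four hops, on the theory that the target was more likely deleted
 * than its left sibling split four times.
 */
static Node *
find_left_sibling(Node *start, unsigned target)
{
    Node   *cur = start;
    int     tries = 0;

    while (cur != NULL)
    {
        if (cur->right != NULL && cur->right->blkno == target)
            return cur;         /* found the true left sibling */
        if (++tries > 4)
            break;              /* fall back: caller re-checks the target */
        cur = cur->right;
    }
    return NULL;
}

int
main(void)
{
    Node    d = {40, NULL};
    Node    c = {30, &d};
    Node    b = {20, &c};
    Node    a = {10, &b};
    Node   *left = find_left_sibling(&a, 30);   /* who points at block 30? */

    printf("left sibling of 30 is block %u\n", left ? left->blkno : 0);
    return 0;
}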
/*
 * _bt_get_endpoint() -- Find the first or last page on a given tree level
 *
 * If the index is empty, we will return InvalidBuffer; any other failure
 * condition causes ereport().  We will not return a dead page.
 *
 * The returned buffer is pinned and read-locked.
 */
Buffer
_bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
                 Snapshot snapshot)
{
    Buffer      buf;
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber offnum;
    BlockNumber blkno;
    IndexTuple  itup;

    /*
     * If we are looking for a leaf page, okay to descend from fast root;
     * otherwise better descend from true root.  (There is no point in being
     * smarter about intermediate levels.)
     */
    if (level == 0)
        buf = _bt_getroot(rel, BT_READ);
    else
        buf = _bt_gettrueroot(rel);

    if (!BufferIsValid(buf))
        return InvalidBuffer;

    page = BufferGetPage(buf);
    TestForOldSnapshot(snapshot, rel, page);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    for (;;)
    {
        /*
         * If we landed on a deleted page, step right to find a live page
         * (there must be one).  Also, if we want the rightmost page, step
         * right if needed to get to it (this could happen if the page split
         * since we obtained a pointer to it).
         */
        while (P_IGNORE(opaque) ||
               (rightmost && !P_RIGHTMOST(opaque)))
        {
            blkno = opaque->btpo_next;
            if (blkno == P_NONE)
                elog(ERROR, "fell off the end of index \"%s\"",
                     RelationGetRelationName(rel));
            buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
            page = BufferGetPage(buf);
            TestForOldSnapshot(snapshot, rel, page);
            opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        }

        /* Done? */
        if (opaque->btpo_level == level)
            break;
        if (opaque->btpo_level < level)
            ereport(ERROR,
                    (errcode(ERRCODE_INDEX_CORRUPTED),
                     errmsg_internal("btree level %u not found in index \"%s\"",
                                     level, RelationGetRelationName(rel))));

        /* Descend to leftmost or rightmost child page */
        if (rightmost)
            offnum = PageGetMaxOffsetNumber(page);
        else
            offnum = P_FIRSTDATAKEY(opaque);

        itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
        blkno = BTreeTupleGetDownLink(itup);

        buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
        page = BufferGetPage(buf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    }

    return buf;
}

/*
 *  _bt_endpoint() -- Find the first or last page in the index, and scan
 * from there to the first key satisfying all the quals.
 *
 * This is used by _bt_first() to set up a scan when we've determined
 * that the scan must start at the beginning or end of the index (for
 * a forward or backward scan respectively).  Exit conditions are the
 * same as for _bt_first().
 */
static bool
_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
{
    Relation    rel = scan->indexRelation;
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Buffer      buf;
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber start;
    BTScanPosItem *currItem;

    /*
     * Scan down to the leftmost or rightmost leaf page.  This is a
     * simplified version of _bt_search().  We don't maintain a stack since
     * we know we won't need it.
     */
    buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot);

    if (!BufferIsValid(buf))
    {
        /*
         * Empty index.  Lock the whole relation, as nothing finer to lock
         * exists.
         */
        PredicateLockRelation(rel, scan->xs_snapshot);
        BTScanPosInvalidate(so->currPos);
        return false;
    }

    PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    Assert(P_ISLEAF(opaque));

    if (ScanDirectionIsForward(dir))
    {
        /* There could be dead pages to the left, so not this: */
        /* Assert(P_LEFTMOST(opaque)); */

        start = P_FIRSTDATAKEY(opaque);
    }
    else if (ScanDirectionIsBackward(dir))
    {
        Assert(P_RIGHTMOST(opaque));

        start = PageGetMaxOffsetNumber(page);
    }
    else
    {
        elog(ERROR, "invalid scan direction: %d", (int) dir);
        start = 0;              /* keep compiler quiet */
    }

    /* remember which buffer we have pinned */
    so->currPos.buf = buf;

    _bt_initialize_more_data(so, dir);

    /*
     * Now load data from the first page of the scan.
     */
    if (!_bt_readpage(scan, dir, start))
    {
        /*
         * There's no actually-matching data on this page.  Try to advance to
         * the next page.  Return false if there's no matching data at all.
         */
        _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
        if (!_bt_steppage(scan, dir))
            return false;
    }
    else
    {
        /* Drop the lock, and maybe the pin, on the current page */
        _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
    }

    /* OK, itemIndex says what to return */
    currItem = &so->currPos.items[so->currPos.itemIndex];
    scan->xs_heaptid = currItem->heapTid;
    if (scan->xs_want_itup)
        scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

    return true;
}

/*
 * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
 * for scan direction
 */
static inline void
_bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
{
    /* initialize moreLeft/moreRight appropriately for scan direction */
    if (ScanDirectionIsForward(dir))
    {
        so->currPos.moreLeft = false;
        so->currPos.moreRight = true;
    }
    else
    {
        so->currPos.moreLeft = true;
        so->currPos.moreRight = false;
    }
    so->numKilled = 0;          /* just paranoia */
    so->markItemIndex = -1;     /* ditto */
}