diff options
Diffstat (limited to 'contrib/amcheck/verify_nbtree.c')
-rw-r--r-- | contrib/amcheck/verify_nbtree.c | 3347 |
1 files changed, 3347 insertions, 0 deletions
diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c new file mode 100644 index 0000000..dc7d4a5 --- /dev/null +++ b/contrib/amcheck/verify_nbtree.c @@ -0,0 +1,3347 @@ +/*------------------------------------------------------------------------- + * + * verify_nbtree.c + * Verifies the integrity of nbtree indexes based on invariants. + * + * For B-Tree indexes, verification includes checking that each page in the + * target index has items in logical order as reported by an insertion scankey + * (the insertion scankey sort-wise NULL semantics are needed for + * verification). + * + * When index-to-heap verification is requested, a Bloom filter is used to + * fingerprint all tuples in the target index, as the index is traversed to + * verify its structure. A heap scan later uses Bloom filter probes to verify + * that every visible heap tuple has a matching index tuple. + * + * + * Copyright (c) 2017-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/amcheck/verify_nbtree.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "access/nbtree.h" +#include "access/table.h" +#include "access/tableam.h" +#include "access/transam.h" +#include "access/xact.h" +#include "catalog/index.h" +#include "catalog/pg_am.h" +#include "catalog/pg_opfamily_d.h" +#include "commands/tablecmds.h" +#include "common/pg_prng.h" +#include "lib/bloomfilter.h" +#include "miscadmin.h" +#include "storage/lmgr.h" +#include "storage/smgr.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/snapmgr.h" + + +PG_MODULE_MAGIC; + +/* + * A B-Tree cannot possibly have this many levels, since there must be one + * block per level, which is bound by the range of BlockNumber: + */ +#define InvalidBtreeLevel ((uint32) InvalidBlockNumber) +#define BTreeTupleGetNKeyAtts(itup, rel) \ + Min(IndexRelationGetNumberOfKeyAttributes(rel), BTreeTupleGetNAtts(itup, rel)) + +/* + * State associated with verifying a B-Tree index + * + * target is the point of reference for a verification operation. + * + * Other B-Tree pages may be allocated, but those are always auxiliary (e.g., + * they are current target's child pages). Conceptually, problems are only + * ever found in the current target page (or for a particular heap tuple during + * heapallindexed verification). Each page found by verification's left/right, + * top/bottom scan becomes the target exactly once. + */ +typedef struct BtreeCheckState +{ + /* + * Unchanging state, established at start of verification: + */ + + /* B-Tree Index Relation and associated heap relation */ + Relation rel; + Relation heaprel; + /* rel is heapkeyspace index? */ + bool heapkeyspace; + /* ShareLock held on heap/index, rather than AccessShareLock? */ + bool readonly; + /* Also verifying heap has no unindexed tuples? */ + bool heapallindexed; + /* Also making sure non-pivot tuples can be found by new search? */ + bool rootdescend; + /* Per-page context */ + MemoryContext targetcontext; + /* Buffer access strategy */ + BufferAccessStrategy checkstrategy; + + /* + * Mutable state, for verification of particular page: + */ + + /* Current target page */ + Page target; + /* Target block number */ + BlockNumber targetblock; + /* Target page's LSN */ + XLogRecPtr targetlsn; + + /* + * Low key: high key of left sibling of target page. Used only for child + * verification. So, 'lowkey' is kept only when 'readonly' is set. + */ + IndexTuple lowkey; + + /* + * The rightlink and incomplete split flag of block one level down to the + * target page, which was visited last time via downlink from target page. + * We use it to check for missing downlinks. + */ + BlockNumber prevrightlink; + bool previncompletesplit; + + /* + * Mutable state, for optional heapallindexed verification: + */ + + /* Bloom filter fingerprints B-Tree index */ + bloom_filter *filter; + /* Debug counter */ + int64 heaptuplespresent; +} BtreeCheckState; + +/* + * Starting point for verifying an entire B-Tree index level + */ +typedef struct BtreeLevel +{ + /* Level number (0 is leaf page level). */ + uint32 level; + + /* Left most block on level. Scan of level begins here. */ + BlockNumber leftmost; + + /* Is this level reported as "true" root level by meta page? */ + bool istruerootlevel; +} BtreeLevel; + +PG_FUNCTION_INFO_V1(bt_index_check); +PG_FUNCTION_INFO_V1(bt_index_parent_check); + +static void bt_index_check_internal(Oid indrelid, bool parentcheck, + bool heapallindexed, bool rootdescend); +static inline void btree_index_checkable(Relation rel); +static inline bool btree_index_mainfork_expected(Relation rel); +static void bt_check_every_level(Relation rel, Relation heaprel, + bool heapkeyspace, bool readonly, bool heapallindexed, + bool rootdescend); +static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state, + BtreeLevel level); +static bool bt_leftmost_ignoring_half_dead(BtreeCheckState *state, + BlockNumber start, + BTPageOpaque start_opaque); +static void bt_recheck_sibling_links(BtreeCheckState *state, + BlockNumber btpo_prev_from_target, + BlockNumber leftcurrent); +static void bt_target_page_check(BtreeCheckState *state); +static BTScanInsert bt_right_page_check_scankey(BtreeCheckState *state); +static void bt_child_check(BtreeCheckState *state, BTScanInsert targetkey, + OffsetNumber downlinkoffnum); +static void bt_child_highkey_check(BtreeCheckState *state, + OffsetNumber target_downlinkoffnum, + Page loaded_child, + uint32 target_level); +static void bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, + BlockNumber blkno, Page page); +static void bt_tuple_present_callback(Relation index, ItemPointer tid, + Datum *values, bool *isnull, + bool tupleIsAlive, void *checkstate); +static IndexTuple bt_normalize_tuple(BtreeCheckState *state, + IndexTuple itup); +static inline IndexTuple bt_posting_plain_tuple(IndexTuple itup, int n); +static bool bt_rootdescend(BtreeCheckState *state, IndexTuple itup); +static inline bool offset_is_negative_infinity(BTPageOpaque opaque, + OffsetNumber offset); +static inline bool invariant_l_offset(BtreeCheckState *state, BTScanInsert key, + OffsetNumber upperbound); +static inline bool invariant_leq_offset(BtreeCheckState *state, + BTScanInsert key, + OffsetNumber upperbound); +static inline bool invariant_g_offset(BtreeCheckState *state, BTScanInsert key, + OffsetNumber lowerbound); +static inline bool invariant_l_nontarget_offset(BtreeCheckState *state, + BTScanInsert key, + BlockNumber nontargetblock, + Page nontarget, + OffsetNumber upperbound); +static Page palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum); +static inline BTScanInsert bt_mkscankey_pivotsearch(Relation rel, + IndexTuple itup); +static ItemId PageGetItemIdCareful(BtreeCheckState *state, BlockNumber block, + Page page, OffsetNumber offset); +static inline ItemPointer BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, + IndexTuple itup, bool nonpivot); +static inline ItemPointer BTreeTupleGetPointsToTID(IndexTuple itup); + +/* + * bt_index_check(index regclass, heapallindexed boolean) + * + * Verify integrity of B-Tree index. + * + * Acquires AccessShareLock on heap & index relations. Does not consider + * invariants that exist between parent/child pages. Optionally verifies + * that heap does not contain any unindexed or incorrectly indexed tuples. + */ +Datum +bt_index_check(PG_FUNCTION_ARGS) +{ + Oid indrelid = PG_GETARG_OID(0); + bool heapallindexed = false; + + if (PG_NARGS() == 2) + heapallindexed = PG_GETARG_BOOL(1); + + bt_index_check_internal(indrelid, false, heapallindexed, false); + + PG_RETURN_VOID(); +} + +/* + * bt_index_parent_check(index regclass, heapallindexed boolean) + * + * Verify integrity of B-Tree index. + * + * Acquires ShareLock on heap & index relations. Verifies that downlinks in + * parent pages are valid lower bounds on child pages. Optionally verifies + * that heap does not contain any unindexed or incorrectly indexed tuples. + */ +Datum +bt_index_parent_check(PG_FUNCTION_ARGS) +{ + Oid indrelid = PG_GETARG_OID(0); + bool heapallindexed = false; + bool rootdescend = false; + + if (PG_NARGS() >= 2) + heapallindexed = PG_GETARG_BOOL(1); + if (PG_NARGS() == 3) + rootdescend = PG_GETARG_BOOL(2); + + bt_index_check_internal(indrelid, true, heapallindexed, rootdescend); + + PG_RETURN_VOID(); +} + +/* + * Helper for bt_index_[parent_]check, coordinating the bulk of the work. + */ +static void +bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed, + bool rootdescend) +{ + Oid heapid; + Relation indrel; + Relation heaprel; + LOCKMODE lockmode; + Oid save_userid; + int save_sec_context; + int save_nestlevel; + + if (parentcheck) + lockmode = ShareLock; + else + lockmode = AccessShareLock; + + /* + * We must lock table before index to avoid deadlocks. However, if the + * passed indrelid isn't an index then IndexGetRelation() will fail. + * Rather than emitting a not-very-helpful error message, postpone + * complaining, expecting that the is-it-an-index test below will fail. + * + * In hot standby mode this will raise an error when parentcheck is true. + */ + heapid = IndexGetRelation(indrelid, true); + if (OidIsValid(heapid)) + { + heaprel = table_open(heapid, lockmode); + + /* + * Switch to the table owner's userid, so that any index functions are + * run as that user. Also lock down security-restricted operations + * and arrange to make GUC variable changes local to this command. + */ + GetUserIdAndSecContext(&save_userid, &save_sec_context); + SetUserIdAndSecContext(heaprel->rd_rel->relowner, + save_sec_context | SECURITY_RESTRICTED_OPERATION); + save_nestlevel = NewGUCNestLevel(); + } + else + { + heaprel = NULL; + /* Set these just to suppress "uninitialized variable" warnings */ + save_userid = InvalidOid; + save_sec_context = -1; + save_nestlevel = -1; + } + + /* + * Open the target index relations separately (like relation_openrv(), but + * with heap relation locked first to prevent deadlocking). In hot + * standby mode this will raise an error when parentcheck is true. + * + * There is no need for the usual indcheckxmin usability horizon test + * here, even in the heapallindexed case, because index undergoing + * verification only needs to have entries for a new transaction snapshot. + * (If this is a parentcheck verification, there is no question about + * committed or recently dead heap tuples lacking index entries due to + * concurrent activity.) + */ + indrel = index_open(indrelid, lockmode); + + /* + * Since we did the IndexGetRelation call above without any lock, it's + * barely possible that a race against an index drop/recreation could have + * netted us the wrong table. + */ + if (heaprel == NULL || heapid != IndexGetRelation(indrelid, false)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_TABLE), + errmsg("could not open parent table of index \"%s\"", + RelationGetRelationName(indrel)))); + + /* Relation suitable for checking as B-Tree? */ + btree_index_checkable(indrel); + + if (btree_index_mainfork_expected(indrel)) + { + bool heapkeyspace, + allequalimage; + + if (!smgrexists(RelationGetSmgr(indrel), MAIN_FORKNUM)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" lacks a main relation fork", + RelationGetRelationName(indrel)))); + + /* Extract metadata from metapage, and sanitize it in passing */ + _bt_metaversion(indrel, &heapkeyspace, &allequalimage); + if (allequalimage && !heapkeyspace) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" metapage has equalimage field set on unsupported nbtree version", + RelationGetRelationName(indrel)))); + if (allequalimage && !_bt_allequalimage(indrel, false)) + { + bool has_interval_ops = false; + + for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(indrel); i++) + if (indrel->rd_opfamily[i] == INTERVAL_BTREE_FAM_OID) + has_interval_ops = true; + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" metapage incorrectly indicates that deduplication is safe", + RelationGetRelationName(indrel)), + has_interval_ops + ? errhint("This is known of \"interval\" indexes last built on a version predating 2023-11.") + : 0)); + } + + /* Check index, possibly against table it is an index on */ + bt_check_every_level(indrel, heaprel, heapkeyspace, parentcheck, + heapallindexed, rootdescend); + } + + /* Roll back any GUC changes executed by index functions */ + AtEOXact_GUC(false, save_nestlevel); + + /* Restore userid and security context */ + SetUserIdAndSecContext(save_userid, save_sec_context); + + /* + * Release locks early. That's ok here because nothing in the called + * routines will trigger shared cache invalidations to be sent, so we can + * relax the usual pattern of only releasing locks after commit. + */ + index_close(indrel, lockmode); + if (heaprel) + table_close(heaprel, lockmode); +} + +/* + * Basic checks about the suitability of a relation for checking as a B-Tree + * index. + * + * NB: Intentionally not checking permissions, the function is normally not + * callable by non-superusers. If granted, it's useful to be able to check a + * whole cluster. + */ +static inline void +btree_index_checkable(Relation rel) +{ + if (rel->rd_rel->relkind != RELKIND_INDEX || + rel->rd_rel->relam != BTREE_AM_OID) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("only B-Tree indexes are supported as targets for verification"), + errdetail("Relation \"%s\" is not a B-Tree index.", + RelationGetRelationName(rel)))); + + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary tables of other sessions"), + errdetail("Index \"%s\" is associated with temporary relation.", + RelationGetRelationName(rel)))); + + if (!rel->rd_index->indisvalid) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot check index \"%s\"", + RelationGetRelationName(rel)), + errdetail("Index is not valid."))); +} + +/* + * Check if B-Tree index relation should have a file for its main relation + * fork. Verification uses this to skip unlogged indexes when in hot standby + * mode, where there is simply nothing to verify. We behave as if the + * relation is empty. + * + * NB: Caller should call btree_index_checkable() before calling here. + */ +static inline bool +btree_index_mainfork_expected(Relation rel) +{ + if (rel->rd_rel->relpersistence != RELPERSISTENCE_UNLOGGED || + !RecoveryInProgress()) + return true; + + ereport(DEBUG1, + (errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION), + errmsg("cannot verify unlogged index \"%s\" during recovery, skipping", + RelationGetRelationName(rel)))); + + return false; +} + +/* + * Main entry point for B-Tree SQL-callable functions. Walks the B-Tree in + * logical order, verifying invariants as it goes. Optionally, verification + * checks if the heap relation contains any tuples that are not represented in + * the index but should be. + * + * It is the caller's responsibility to acquire appropriate heavyweight lock on + * the index relation, and advise us if extra checks are safe when a ShareLock + * is held. (A lock of the same type must also have been acquired on the heap + * relation.) + * + * A ShareLock is generally assumed to prevent any kind of physical + * modification to the index structure, including modifications that VACUUM may + * make. This does not include setting of the LP_DEAD bit by concurrent index + * scans, although that is just metadata that is not able to directly affect + * any check performed here. Any concurrent process that might act on the + * LP_DEAD bit being set (recycle space) requires a heavyweight lock that + * cannot be held while we hold a ShareLock. (Besides, even if that could + * happen, the ad-hoc recycling when a page might otherwise split is performed + * per-page, and requires an exclusive buffer lock, which wouldn't cause us + * trouble. _bt_delitems_vacuum() may only delete leaf items, and so the extra + * parent/child check cannot be affected.) + */ +static void +bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, + bool readonly, bool heapallindexed, bool rootdescend) +{ + BtreeCheckState *state; + Page metapage; + BTMetaPageData *metad; + uint32 previouslevel; + BtreeLevel current; + Snapshot snapshot = SnapshotAny; + + if (!readonly) + elog(DEBUG1, "verifying consistency of tree structure for index \"%s\"", + RelationGetRelationName(rel)); + else + elog(DEBUG1, "verifying consistency of tree structure for index \"%s\" with cross-level checks", + RelationGetRelationName(rel)); + + /* + * This assertion matches the one in index_getnext_tid(). See page + * recycling/"visible to everyone" notes in nbtree README. + */ + Assert(TransactionIdIsValid(RecentXmin)); + + /* + * Initialize state for entire verification operation + */ + state = palloc0(sizeof(BtreeCheckState)); + state->rel = rel; + state->heaprel = heaprel; + state->heapkeyspace = heapkeyspace; + state->readonly = readonly; + state->heapallindexed = heapallindexed; + state->rootdescend = rootdescend; + + if (state->heapallindexed) + { + int64 total_pages; + int64 total_elems; + uint64 seed; + + /* + * Size Bloom filter based on estimated number of tuples in index, + * while conservatively assuming that each block must contain at least + * MaxTIDsPerBTreePage / 3 "plain" tuples -- see + * bt_posting_plain_tuple() for definition, and details of how posting + * list tuples are handled. + */ + total_pages = RelationGetNumberOfBlocks(rel); + total_elems = Max(total_pages * (MaxTIDsPerBTreePage / 3), + (int64) state->rel->rd_rel->reltuples); + /* Generate a random seed to avoid repetition */ + seed = pg_prng_uint64(&pg_global_prng_state); + /* Create Bloom filter to fingerprint index */ + state->filter = bloom_create(total_elems, maintenance_work_mem, seed); + state->heaptuplespresent = 0; + + /* + * Register our own snapshot in !readonly case, rather than asking + * table_index_build_scan() to do this for us later. This needs to + * happen before index fingerprinting begins, so we can later be + * certain that index fingerprinting should have reached all tuples + * returned by table_index_build_scan(). + */ + if (!state->readonly) + { + snapshot = RegisterSnapshot(GetTransactionSnapshot()); + + /* + * GetTransactionSnapshot() always acquires a new MVCC snapshot in + * READ COMMITTED mode. A new snapshot is guaranteed to have all + * the entries it requires in the index. + * + * We must defend against the possibility that an old xact + * snapshot was returned at higher isolation levels when that + * snapshot is not safe for index scans of the target index. This + * is possible when the snapshot sees tuples that are before the + * index's indcheckxmin horizon. Throwing an error here should be + * very rare. It doesn't seem worth using a secondary snapshot to + * avoid this. + */ + if (IsolationUsesXactSnapshot() && rel->rd_index->indcheckxmin && + !TransactionIdPrecedes(HeapTupleHeaderGetXmin(rel->rd_indextuple->t_data), + snapshot->xmin)) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("index \"%s\" cannot be verified using transaction snapshot", + RelationGetRelationName(rel)))); + } + } + + Assert(!state->rootdescend || state->readonly); + if (state->rootdescend && !state->heapkeyspace) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot verify that tuples from index \"%s\" can each be found by an independent index search", + RelationGetRelationName(rel)), + errhint("Only B-Tree version 4 indexes support rootdescend verification."))); + + /* Create context for page */ + state->targetcontext = AllocSetContextCreate(CurrentMemoryContext, + "amcheck context", + ALLOCSET_DEFAULT_SIZES); + state->checkstrategy = GetAccessStrategy(BAS_BULKREAD); + + /* Get true root block from meta-page */ + metapage = palloc_btree_page(state, BTREE_METAPAGE); + metad = BTPageGetMeta(metapage); + + /* + * Certain deletion patterns can result in "skinny" B-Tree indexes, where + * the fast root and true root differ. + * + * Start from the true root, not the fast root, unlike conventional index + * scans. This approach is more thorough, and removes the risk of + * following a stale fast root from the meta page. + */ + if (metad->btm_fastroot != metad->btm_root) + ereport(DEBUG1, + (errcode(ERRCODE_NO_DATA), + errmsg_internal("harmless fast root mismatch in index \"%s\"", + RelationGetRelationName(rel)), + errdetail_internal("Fast root block %u (level %u) differs from true root block %u (level %u).", + metad->btm_fastroot, metad->btm_fastlevel, + metad->btm_root, metad->btm_level))); + + /* + * Starting at the root, verify every level. Move left to right, top to + * bottom. Note that there may be no pages other than the meta page (meta + * page can indicate that root is P_NONE when the index is totally empty). + */ + previouslevel = InvalidBtreeLevel; + current.level = metad->btm_level; + current.leftmost = metad->btm_root; + current.istruerootlevel = true; + while (current.leftmost != P_NONE) + { + /* + * Verify this level, and get left most page for next level down, if + * not at leaf level + */ + current = bt_check_level_from_leftmost(state, current); + + if (current.leftmost == InvalidBlockNumber) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" has no valid pages on level below %u or first level", + RelationGetRelationName(rel), previouslevel))); + + previouslevel = current.level; + } + + /* + * * Check whether heap contains unindexed/malformed tuples * + */ + if (state->heapallindexed) + { + IndexInfo *indexinfo = BuildIndexInfo(state->rel); + TableScanDesc scan; + + /* + * Create our own scan for table_index_build_scan(), rather than + * getting it to do so for us. This is required so that we can + * actually use the MVCC snapshot registered earlier in !readonly + * case. + * + * Note that table_index_build_scan() calls heap_endscan() for us. + */ + scan = table_beginscan_strat(state->heaprel, /* relation */ + snapshot, /* snapshot */ + 0, /* number of keys */ + NULL, /* scan key */ + true, /* buffer access strategy OK */ + true); /* syncscan OK? */ + + /* + * Scan will behave as the first scan of a CREATE INDEX CONCURRENTLY + * behaves in !readonly case. + * + * It's okay that we don't actually use the same lock strength for the + * heap relation as any other ii_Concurrent caller would in !readonly + * case. We have no reason to care about a concurrent VACUUM + * operation, since there isn't going to be a second scan of the heap + * that needs to be sure that there was no concurrent recycling of + * TIDs. + */ + indexinfo->ii_Concurrent = !state->readonly; + + /* + * Don't wait for uncommitted tuple xact commit/abort when index is a + * unique index on a catalog (or an index used by an exclusion + * constraint). This could otherwise happen in the readonly case. + */ + indexinfo->ii_Unique = false; + indexinfo->ii_ExclusionOps = NULL; + indexinfo->ii_ExclusionProcs = NULL; + indexinfo->ii_ExclusionStrats = NULL; + + elog(DEBUG1, "verifying that tuples from index \"%s\" are present in \"%s\"", + RelationGetRelationName(state->rel), + RelationGetRelationName(state->heaprel)); + + table_index_build_scan(state->heaprel, state->rel, indexinfo, true, false, + bt_tuple_present_callback, (void *) state, scan); + + ereport(DEBUG1, + (errmsg_internal("finished verifying presence of " INT64_FORMAT " tuples from table \"%s\" with bitset %.2f%% set", + state->heaptuplespresent, RelationGetRelationName(heaprel), + 100.0 * bloom_prop_bits_set(state->filter)))); + + if (snapshot != SnapshotAny) + UnregisterSnapshot(snapshot); + + bloom_free(state->filter); + } + + /* Be tidy: */ + MemoryContextDelete(state->targetcontext); +} + +/* + * Given a left-most block at some level, move right, verifying each page + * individually (with more verification across pages for "readonly" + * callers). Caller should pass the true root page as the leftmost initially, + * working their way down by passing what is returned for the last call here + * until level 0 (leaf page level) was reached. + * + * Returns state for next call, if any. This includes left-most block number + * one level lower that should be passed on next level/call, which is set to + * P_NONE on last call here (when leaf level is verified). Level numbers + * follow the nbtree convention: higher levels have higher numbers, because new + * levels are added only due to a root page split. Note that prior to the + * first root page split, the root is also a leaf page, so there is always a + * level 0 (leaf level), and it's always the last level processed. + * + * Note on memory management: State's per-page context is reset here, between + * each call to bt_target_page_check(). + */ +static BtreeLevel +bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) +{ + /* State to establish early, concerning entire level */ + BTPageOpaque opaque; + MemoryContext oldcontext; + BtreeLevel nextleveldown; + + /* Variables for iterating across level using right links */ + BlockNumber leftcurrent = P_NONE; + BlockNumber current = level.leftmost; + + /* Initialize return state */ + nextleveldown.leftmost = InvalidBlockNumber; + nextleveldown.level = InvalidBtreeLevel; + nextleveldown.istruerootlevel = false; + + /* Use page-level context for duration of this call */ + oldcontext = MemoryContextSwitchTo(state->targetcontext); + + elog(DEBUG1, "verifying level %u%s", level.level, + level.istruerootlevel ? + " (true root level)" : level.level == 0 ? " (leaf level)" : ""); + + state->prevrightlink = InvalidBlockNumber; + state->previncompletesplit = false; + + do + { + /* Don't rely on CHECK_FOR_INTERRUPTS() calls at lower level */ + CHECK_FOR_INTERRUPTS(); + + /* Initialize state for this iteration */ + state->targetblock = current; + state->target = palloc_btree_page(state, state->targetblock); + state->targetlsn = PageGetLSN(state->target); + + opaque = BTPageGetOpaque(state->target); + + if (P_IGNORE(opaque)) + { + /* + * Since there cannot be a concurrent VACUUM operation in readonly + * mode, and since a page has no links within other pages + * (siblings and parent) once it is marked fully deleted, it + * should be impossible to land on a fully deleted page in + * readonly mode. See bt_child_check() for further details. + * + * The bt_child_check() P_ISDELETED() check is repeated here so + * that pages that are only reachable through sibling links get + * checked. + */ + if (state->readonly && P_ISDELETED(opaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("downlink or sibling link points to deleted block in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Block=%u left block=%u left link from block=%u.", + current, leftcurrent, opaque->btpo_prev))); + + if (P_RIGHTMOST(opaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("block %u fell off the end of index \"%s\"", + current, RelationGetRelationName(state->rel)))); + else + ereport(DEBUG1, + (errcode(ERRCODE_NO_DATA), + errmsg_internal("block %u of index \"%s\" concurrently deleted", + current, RelationGetRelationName(state->rel)))); + goto nextpage; + } + else if (nextleveldown.leftmost == InvalidBlockNumber) + { + /* + * A concurrent page split could make the caller supplied leftmost + * block no longer contain the leftmost page, or no longer be the + * true root, but where that isn't possible due to heavyweight + * locking, check that the first valid page meets caller's + * expectations. + */ + if (state->readonly) + { + if (!bt_leftmost_ignoring_half_dead(state, current, opaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("block %u is not leftmost in index \"%s\"", + current, RelationGetRelationName(state->rel)))); + + if (level.istruerootlevel && !P_ISROOT(opaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("block %u is not true root in index \"%s\"", + current, RelationGetRelationName(state->rel)))); + } + + /* + * Before beginning any non-trivial examination of level, prepare + * state for next bt_check_level_from_leftmost() invocation for + * the next level for the next level down (if any). + * + * There should be at least one non-ignorable page per level, + * unless this is the leaf level, which is assumed by caller to be + * final level. + */ + if (!P_ISLEAF(opaque)) + { + IndexTuple itup; + ItemId itemid; + + /* Internal page -- downlink gets leftmost on next level */ + itemid = PageGetItemIdCareful(state, state->targetblock, + state->target, + P_FIRSTDATAKEY(opaque)); + itup = (IndexTuple) PageGetItem(state->target, itemid); + nextleveldown.leftmost = BTreeTupleGetDownLink(itup); + nextleveldown.level = opaque->btpo_level - 1; + } + else + { + /* + * Leaf page -- final level caller must process. + * + * Note that this could also be the root page, if there has + * been no root page split yet. + */ + nextleveldown.leftmost = P_NONE; + nextleveldown.level = InvalidBtreeLevel; + } + + /* + * Finished setting up state for this call/level. Control will + * never end up back here in any future loop iteration for this + * level. + */ + } + + /* + * Sibling links should be in mutual agreement. There arises + * leftcurrent == P_NONE && btpo_prev != P_NONE when the left sibling + * of the parent's low-key downlink is half-dead. (A half-dead page + * has no downlink from its parent.) Under heavyweight locking, the + * last bt_leftmost_ignoring_half_dead() validated this btpo_prev. + * Without heavyweight locking, validation of the P_NONE case remains + * unimplemented. + */ + if (opaque->btpo_prev != leftcurrent && leftcurrent != P_NONE) + bt_recheck_sibling_links(state, opaque->btpo_prev, leftcurrent); + + /* Check level */ + if (level.level != opaque->btpo_level) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("leftmost down link for level points to block in index \"%s\" whose level is not one level down", + RelationGetRelationName(state->rel)), + errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.", + current, level.level, opaque->btpo_level))); + + /* Verify invariants for page */ + bt_target_page_check(state); + +nextpage: + + /* Try to detect circular links */ + if (current == leftcurrent || current == opaque->btpo_prev) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("circular link chain found in block %u of index \"%s\"", + current, RelationGetRelationName(state->rel)))); + + leftcurrent = current; + current = opaque->btpo_next; + + if (state->lowkey) + { + Assert(state->readonly); + pfree(state->lowkey); + state->lowkey = NULL; + } + + /* + * Copy current target high key as the low key of right sibling. + * Allocate memory in upper level context, so it would be cleared + * after reset of target context. + * + * We only need the low key in corner cases of checking child high + * keys. We use high key only when incomplete split on the child level + * falls to the boundary of pages on the target level. See + * bt_child_highkey_check() for details. So, typically we won't end + * up doing anything with low key, but it's simpler for general case + * high key verification to always have it available. + * + * The correctness of managing low key in the case of concurrent + * splits wasn't investigated yet. Thankfully we only need low key + * for readonly verification and concurrent splits won't happen. + */ + if (state->readonly && !P_RIGHTMOST(opaque)) + { + IndexTuple itup; + ItemId itemid; + + itemid = PageGetItemIdCareful(state, state->targetblock, + state->target, P_HIKEY); + itup = (IndexTuple) PageGetItem(state->target, itemid); + + state->lowkey = MemoryContextAlloc(oldcontext, IndexTupleSize(itup)); + memcpy(state->lowkey, itup, IndexTupleSize(itup)); + } + + /* Free page and associated memory for this iteration */ + MemoryContextReset(state->targetcontext); + } + while (current != P_NONE); + + if (state->lowkey) + { + Assert(state->readonly); + pfree(state->lowkey); + state->lowkey = NULL; + } + + /* Don't change context for caller */ + MemoryContextSwitchTo(oldcontext); + + return nextleveldown; +} + +/* + * Like P_LEFTMOST(start_opaque), but accept an arbitrarily-long chain of + * half-dead, sibling-linked pages to the left. If a half-dead page appears + * under state->readonly, the database exited recovery between the first-stage + * and second-stage WAL records of a deletion. + */ +static bool +bt_leftmost_ignoring_half_dead(BtreeCheckState *state, + BlockNumber start, + BTPageOpaque start_opaque) +{ + BlockNumber reached = start_opaque->btpo_prev, + reached_from = start; + bool all_half_dead = true; + + /* + * To handle the !readonly case, we'd need to accept BTP_DELETED pages and + * potentially observe nbtree/README "Page deletion and backwards scans". + */ + Assert(state->readonly); + + while (reached != P_NONE && all_half_dead) + { + Page page = palloc_btree_page(state, reached); + BTPageOpaque reached_opaque = BTPageGetOpaque(page); + + CHECK_FOR_INTERRUPTS(); + + /* + * Try to detect btpo_prev circular links. _bt_unlink_halfdead_page() + * writes that side-links will continue to point to the siblings. + * Check btpo_next for that property. + */ + all_half_dead = P_ISHALFDEAD(reached_opaque) && + reached != start && + reached != reached_from && + reached_opaque->btpo_next == reached_from; + if (all_half_dead) + { + XLogRecPtr pagelsn = PageGetLSN(page); + + /* pagelsn should point to an XLOG_BTREE_MARK_PAGE_HALFDEAD */ + ereport(DEBUG1, + (errcode(ERRCODE_NO_DATA), + errmsg_internal("harmless interrupted page deletion detected in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Block=%u right block=%u page lsn=%X/%X.", + reached, reached_from, + LSN_FORMAT_ARGS(pagelsn)))); + + reached_from = reached; + reached = reached_opaque->btpo_prev; + } + + pfree(page); + } + + return all_half_dead; +} + +/* + * Raise an error when target page's left link does not point back to the + * previous target page, called leftcurrent here. The leftcurrent page's + * right link was followed to get to the current target page, and we expect + * mutual agreement among leftcurrent and the current target page. Make sure + * that this condition has definitely been violated in the !readonly case, + * where concurrent page splits are something that we need to deal with. + * + * Cross-page inconsistencies involving pages that don't agree about being + * siblings are known to be a particularly good indicator of corruption + * involving partial writes/lost updates. The bt_right_page_check_scankey + * check also provides a way of detecting cross-page inconsistencies for + * !readonly callers, but it can only detect sibling pages that have an + * out-of-order keyspace, which can't catch many of the problems that we + * expect to catch here. + * + * The classic example of the kind of inconsistency that we can only catch + * with this check (when in !readonly mode) involves three sibling pages that + * were affected by a faulty page split at some point in the past. The + * effects of the split are reflected in the original page and its new right + * sibling page, with a lack of any accompanying changes for the _original_ + * right sibling page. The original right sibling page's left link fails to + * point to the new right sibling page (its left link still points to the + * original page), even though the first phase of a page split is supposed to + * work as a single atomic action. This subtle inconsistency will probably + * only break backwards scans in practice. + * + * Note that this is the only place where amcheck will "couple" buffer locks + * (and only for !readonly callers). In general we prefer to avoid more + * thorough cross-page checks in !readonly mode, but it seems worth the + * complexity here. Also, the performance overhead of performing lock + * coupling here is negligible in practice. Control only reaches here with a + * non-corrupt index when there is a concurrent page split at the instant + * caller crossed over to target page from leftcurrent page. + */ +static void +bt_recheck_sibling_links(BtreeCheckState *state, + BlockNumber btpo_prev_from_target, + BlockNumber leftcurrent) +{ + /* passing metapage to BTPageGetOpaque() would give irrelevant findings */ + Assert(leftcurrent != P_NONE); + + if (!state->readonly) + { + Buffer lbuf; + Buffer newtargetbuf; + Page page; + BTPageOpaque opaque; + BlockNumber newtargetblock; + + /* Couple locks in the usual order for nbtree: Left to right */ + lbuf = ReadBufferExtended(state->rel, MAIN_FORKNUM, leftcurrent, + RBM_NORMAL, state->checkstrategy); + LockBuffer(lbuf, BT_READ); + _bt_checkpage(state->rel, lbuf); + page = BufferGetPage(lbuf); + opaque = BTPageGetOpaque(page); + if (P_ISDELETED(opaque)) + { + /* + * Cannot reason about concurrently deleted page -- the left link + * in the page to the right is expected to point to some other + * page to the left (not leftcurrent page). + * + * Note that we deliberately don't give up with a half-dead page. + */ + UnlockReleaseBuffer(lbuf); + return; + } + + newtargetblock = opaque->btpo_next; + /* Avoid self-deadlock when newtargetblock == leftcurrent */ + if (newtargetblock != leftcurrent) + { + newtargetbuf = ReadBufferExtended(state->rel, MAIN_FORKNUM, + newtargetblock, RBM_NORMAL, + state->checkstrategy); + LockBuffer(newtargetbuf, BT_READ); + _bt_checkpage(state->rel, newtargetbuf); + page = BufferGetPage(newtargetbuf); + opaque = BTPageGetOpaque(page); + /* btpo_prev_from_target may have changed; update it */ + btpo_prev_from_target = opaque->btpo_prev; + } + else + { + /* + * leftcurrent right sibling points back to leftcurrent block. + * Index is corrupt. Easiest way to handle this is to pretend + * that we actually read from a distinct page that has an invalid + * block number in its btpo_prev. + */ + newtargetbuf = InvalidBuffer; + btpo_prev_from_target = InvalidBlockNumber; + } + + /* + * No need to check P_ISDELETED here, since new target block cannot be + * marked deleted as long as we hold a lock on lbuf + */ + if (BufferIsValid(newtargetbuf)) + UnlockReleaseBuffer(newtargetbuf); + UnlockReleaseBuffer(lbuf); + + if (btpo_prev_from_target == leftcurrent) + { + /* Report split in left sibling, not target (or new target) */ + ereport(DEBUG1, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg_internal("harmless concurrent page split detected in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Block=%u new right sibling=%u original right sibling=%u.", + leftcurrent, newtargetblock, + state->targetblock))); + return; + } + + /* + * Index is corrupt. Make sure that we report correct target page. + * + * This could have changed in cases where there was a concurrent page + * split, as well as index corruption (at least in theory). Note that + * btpo_prev_from_target was already updated above. + */ + state->targetblock = newtargetblock; + } + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("left link/right link pair in index \"%s\" not in agreement", + RelationGetRelationName(state->rel)), + errdetail_internal("Block=%u left block=%u left link from block=%u.", + state->targetblock, leftcurrent, + btpo_prev_from_target))); +} + +/* + * Function performs the following checks on target page, or pages ancillary to + * target page: + * + * - That every "real" data item is less than or equal to the high key, which + * is an upper bound on the items on the page. Data items should be + * strictly less than the high key when the page is an internal page. + * + * - That within the page, every data item is strictly less than the item + * immediately to its right, if any (i.e., that the items are in order + * within the page, so that the binary searches performed by index scans are + * sane). + * + * - That the last data item stored on the page is strictly less than the + * first data item on the page to the right (when such a first item is + * available). + * + * - Various checks on the structure of tuples themselves. For example, check + * that non-pivot tuples have no truncated attributes. + * + * Furthermore, when state passed shows ShareLock held, function also checks: + * + * - That all child pages respect strict lower bound from parent's pivot + * tuple. + * + * - That downlink to block was encountered in parent where that's expected. + * + * - That high keys of child pages matches corresponding pivot keys in parent. + * + * This is also where heapallindexed callers use their Bloom filter to + * fingerprint IndexTuples for later table_index_build_scan() verification. + * + * Note: Memory allocated in this routine is expected to be released by caller + * resetting state->targetcontext. + */ +static void +bt_target_page_check(BtreeCheckState *state) +{ + OffsetNumber offset; + OffsetNumber max; + BTPageOpaque topaque; + + topaque = BTPageGetOpaque(state->target); + max = PageGetMaxOffsetNumber(state->target); + + elog(DEBUG2, "verifying %u items on %s block %u", max, + P_ISLEAF(topaque) ? "leaf" : "internal", state->targetblock); + + /* + * Check the number of attributes in high key. Note, rightmost page + * doesn't contain a high key, so nothing to check + */ + if (!P_RIGHTMOST(topaque)) + { + ItemId itemid; + IndexTuple itup; + + /* Verify line pointer before checking tuple */ + itemid = PageGetItemIdCareful(state, state->targetblock, + state->target, P_HIKEY); + if (!_bt_check_natts(state->rel, state->heapkeyspace, state->target, + P_HIKEY)) + { + itup = (IndexTuple) PageGetItem(state->target, itemid); + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("wrong number of high key index tuple attributes in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Index block=%u natts=%u block type=%s page lsn=%X/%X.", + state->targetblock, + BTreeTupleGetNAtts(itup, state->rel), + P_ISLEAF(topaque) ? "heap" : "index", + LSN_FORMAT_ARGS(state->targetlsn)))); + } + } + + /* + * Loop over page items, starting from first non-highkey item, not high + * key (if any). Most tests are not performed for the "negative infinity" + * real item (if any). + */ + for (offset = P_FIRSTDATAKEY(topaque); + offset <= max; + offset = OffsetNumberNext(offset)) + { + ItemId itemid; + IndexTuple itup; + size_t tupsize; + BTScanInsert skey; + bool lowersizelimit; + ItemPointer scantid; + + CHECK_FOR_INTERRUPTS(); + + itemid = PageGetItemIdCareful(state, state->targetblock, + state->target, offset); + itup = (IndexTuple) PageGetItem(state->target, itemid); + tupsize = IndexTupleSize(itup); + + /* + * lp_len should match the IndexTuple reported length exactly, since + * lp_len is completely redundant in indexes, and both sources of + * tuple length are MAXALIGN()'d. nbtree does not use lp_len all that + * frequently, and is surprisingly tolerant of corrupt lp_len fields. + */ + if (tupsize != ItemIdGetLength(itemid)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index tuple size does not equal lp_len in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Index tid=(%u,%u) tuple size=%zu lp_len=%u page lsn=%X/%X.", + state->targetblock, offset, + tupsize, ItemIdGetLength(itemid), + LSN_FORMAT_ARGS(state->targetlsn)), + errhint("This could be a torn page problem."))); + + /* Check the number of index tuple attributes */ + if (!_bt_check_natts(state->rel, state->heapkeyspace, state->target, + offset)) + { + ItemPointer tid; + char *itid, + *htid; + + itid = psprintf("(%u,%u)", state->targetblock, offset); + tid = BTreeTupleGetPointsToTID(itup); + htid = psprintf("(%u,%u)", + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("wrong number of index tuple attributes in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Index tid=%s natts=%u points to %s tid=%s page lsn=%X/%X.", + itid, + BTreeTupleGetNAtts(itup, state->rel), + P_ISLEAF(topaque) ? "heap" : "index", + htid, + LSN_FORMAT_ARGS(state->targetlsn)))); + } + + /* + * Don't try to generate scankey using "negative infinity" item on + * internal pages. They are always truncated to zero attributes. + */ + if (offset_is_negative_infinity(topaque, offset)) + { + /* + * We don't call bt_child_check() for "negative infinity" items. + * But if we're performing downlink connectivity check, we do it + * for every item including "negative infinity" one. + */ + if (!P_ISLEAF(topaque) && state->readonly) + { + bt_child_highkey_check(state, + offset, + NULL, + topaque->btpo_level); + } + continue; + } + + /* + * Readonly callers may optionally verify that non-pivot tuples can + * each be found by an independent search that starts from the root. + * Note that we deliberately don't do individual searches for each + * TID, since the posting list itself is validated by other checks. + */ + if (state->rootdescend && P_ISLEAF(topaque) && + !bt_rootdescend(state, itup)) + { + ItemPointer tid = BTreeTupleGetPointsToTID(itup); + char *itid, + *htid; + + itid = psprintf("(%u,%u)", state->targetblock, offset); + htid = psprintf("(%u,%u)", ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("could not find tuple using search from root page in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Index tid=%s points to heap tid=%s page lsn=%X/%X.", + itid, htid, + LSN_FORMAT_ARGS(state->targetlsn)))); + } + + /* + * If tuple is a posting list tuple, make sure posting list TIDs are + * in order + */ + if (BTreeTupleIsPosting(itup)) + { + ItemPointerData last; + ItemPointer current; + + ItemPointerCopy(BTreeTupleGetHeapTID(itup), &last); + + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + + current = BTreeTupleGetPostingN(itup, i); + + if (ItemPointerCompare(current, &last) <= 0) + { + char *itid = psprintf("(%u,%u)", state->targetblock, offset); + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("posting list contains misplaced TID in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Index tid=%s posting list offset=%d page lsn=%X/%X.", + itid, i, + LSN_FORMAT_ARGS(state->targetlsn)))); + } + + ItemPointerCopy(current, &last); + } + } + + /* Build insertion scankey for current page offset */ + skey = bt_mkscankey_pivotsearch(state->rel, itup); + + /* + * Make sure tuple size does not exceed the relevant BTREE_VERSION + * specific limit. + * + * BTREE_VERSION 4 (which introduced heapkeyspace rules) requisitioned + * a small amount of space from BTMaxItemSize() in order to ensure + * that suffix truncation always has enough space to add an explicit + * heap TID back to a tuple -- we pessimistically assume that every + * newly inserted tuple will eventually need to have a heap TID + * appended during a future leaf page split, when the tuple becomes + * the basis of the new high key (pivot tuple) for the leaf page. + * + * Since the reclaimed space is reserved for that purpose, we must not + * enforce the slightly lower limit when the extra space has been used + * as intended. In other words, there is only a cross-version + * difference in the limit on tuple size within leaf pages. + * + * Still, we're particular about the details within BTREE_VERSION 4 + * internal pages. Pivot tuples may only use the extra space for its + * designated purpose. Enforce the lower limit for pivot tuples when + * an explicit heap TID isn't actually present. (In all other cases + * suffix truncation is guaranteed to generate a pivot tuple that's no + * larger than the firstright tuple provided to it by its caller.) + */ + lowersizelimit = skey->heapkeyspace && + (P_ISLEAF(topaque) || BTreeTupleGetHeapTID(itup) == NULL); + if (tupsize > (lowersizelimit ? BTMaxItemSize(state->target) : + BTMaxItemSizeNoHeapTid(state->target))) + { + ItemPointer tid = BTreeTupleGetPointsToTID(itup); + char *itid, + *htid; + + itid = psprintf("(%u,%u)", state->targetblock, offset); + htid = psprintf("(%u,%u)", + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index row size %zu exceeds maximum for index \"%s\"", + tupsize, RelationGetRelationName(state->rel)), + errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%X.", + itid, + P_ISLEAF(topaque) ? "heap" : "index", + htid, + LSN_FORMAT_ARGS(state->targetlsn)))); + } + + /* Fingerprint leaf page tuples (those that point to the heap) */ + if (state->heapallindexed && P_ISLEAF(topaque) && !ItemIdIsDead(itemid)) + { + IndexTuple norm; + + if (BTreeTupleIsPosting(itup)) + { + /* Fingerprint all elements as distinct "plain" tuples */ + for (int i = 0; i < BTreeTupleGetNPosting(itup); i++) + { + IndexTuple logtuple; + + logtuple = bt_posting_plain_tuple(itup, i); + norm = bt_normalize_tuple(state, logtuple); + bloom_add_element(state->filter, (unsigned char *) norm, + IndexTupleSize(norm)); + /* Be tidy */ + if (norm != logtuple) + pfree(norm); + pfree(logtuple); + } + } + else + { + norm = bt_normalize_tuple(state, itup); + bloom_add_element(state->filter, (unsigned char *) norm, + IndexTupleSize(norm)); + /* Be tidy */ + if (norm != itup) + pfree(norm); + } + } + + /* + * * High key check * + * + * If there is a high key (if this is not the rightmost page on its + * entire level), check that high key actually is upper bound on all + * page items. If this is a posting list tuple, we'll need to set + * scantid to be highest TID in posting list. + * + * We prefer to check all items against high key rather than checking + * just the last and trusting that the operator class obeys the + * transitive law (which implies that all previous items also + * respected the high key invariant if they pass the item order + * check). + * + * Ideally, we'd compare every item in the index against every other + * item in the index, and not trust opclass obedience of the + * transitive law to bridge the gap between children and their + * grandparents (as well as great-grandparents, and so on). We don't + * go to those lengths because that would be prohibitively expensive, + * and probably not markedly more effective in practice. + * + * On the leaf level, we check that the key is <= the highkey. + * However, on non-leaf levels we check that the key is < the highkey, + * because the high key is "just another separator" rather than a copy + * of some existing key item; we expect it to be unique among all keys + * on the same level. (Suffix truncation will sometimes produce a + * leaf highkey that is an untruncated copy of the lastleft item, but + * never any other item, which necessitates weakening the leaf level + * check to <=.) + * + * Full explanation for why a highkey is never truly a copy of another + * item from the same level on internal levels: + * + * While the new left page's high key is copied from the first offset + * on the right page during an internal page split, that's not the + * full story. In effect, internal pages are split in the middle of + * the firstright tuple, not between the would-be lastleft and + * firstright tuples: the firstright key ends up on the left side as + * left's new highkey, and the firstright downlink ends up on the + * right side as right's new "negative infinity" item. The negative + * infinity tuple is truncated to zero attributes, so we're only left + * with the downlink. In other words, the copying is just an + * implementation detail of splitting in the middle of a (pivot) + * tuple. (See also: "Notes About Data Representation" in the nbtree + * README.) + */ + scantid = skey->scantid; + if (state->heapkeyspace && BTreeTupleIsPosting(itup)) + skey->scantid = BTreeTupleGetMaxHeapTID(itup); + + if (!P_RIGHTMOST(topaque) && + !(P_ISLEAF(topaque) ? invariant_leq_offset(state, skey, P_HIKEY) : + invariant_l_offset(state, skey, P_HIKEY))) + { + ItemPointer tid = BTreeTupleGetPointsToTID(itup); + char *itid, + *htid; + + itid = psprintf("(%u,%u)", state->targetblock, offset); + htid = psprintf("(%u,%u)", + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("high key invariant violated for index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%X.", + itid, + P_ISLEAF(topaque) ? "heap" : "index", + htid, + LSN_FORMAT_ARGS(state->targetlsn)))); + } + /* Reset, in case scantid was set to (itup) posting tuple's max TID */ + skey->scantid = scantid; + + /* + * * Item order check * + * + * Check that items are stored on page in logical order, by checking + * current item is strictly less than next item (if any). + */ + if (OffsetNumberNext(offset) <= max && + !invariant_l_offset(state, skey, OffsetNumberNext(offset))) + { + ItemPointer tid; + char *itid, + *htid, + *nitid, + *nhtid; + + itid = psprintf("(%u,%u)", state->targetblock, offset); + tid = BTreeTupleGetPointsToTID(itup); + htid = psprintf("(%u,%u)", + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); + nitid = psprintf("(%u,%u)", state->targetblock, + OffsetNumberNext(offset)); + + /* Reuse itup to get pointed-to heap location of second item */ + itemid = PageGetItemIdCareful(state, state->targetblock, + state->target, + OffsetNumberNext(offset)); + itup = (IndexTuple) PageGetItem(state->target, itemid); + tid = BTreeTupleGetPointsToTID(itup); + nhtid = psprintf("(%u,%u)", + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("item order invariant violated for index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Lower index tid=%s (points to %s tid=%s) " + "higher index tid=%s (points to %s tid=%s) " + "page lsn=%X/%X.", + itid, + P_ISLEAF(topaque) ? "heap" : "index", + htid, + nitid, + P_ISLEAF(topaque) ? "heap" : "index", + nhtid, + LSN_FORMAT_ARGS(state->targetlsn)))); + } + + /* + * * Last item check * + * + * Check last item against next/right page's first data item's when + * last item on page is reached. This additional check will detect + * transposed pages iff the supposed right sibling page happens to + * belong before target in the key space. (Otherwise, a subsequent + * heap verification will probably detect the problem.) + * + * This check is similar to the item order check that will have + * already been performed for every other "real" item on target page + * when last item is checked. The difference is that the next item + * (the item that is compared to target's last item) needs to come + * from the next/sibling page. There may not be such an item + * available from sibling for various reasons, though (e.g., target is + * the rightmost page on level). + */ + else if (offset == max) + { + BTScanInsert rightkey; + + /* Get item in next/right page */ + rightkey = bt_right_page_check_scankey(state); + + if (rightkey && + !invariant_g_offset(state, rightkey, max)) + { + /* + * As explained at length in bt_right_page_check_scankey(), + * there is a known !readonly race that could account for + * apparent violation of invariant, which we must check for + * before actually proceeding with raising error. Our canary + * condition is that target page was deleted. + */ + if (!state->readonly) + { + /* Get fresh copy of target page */ + state->target = palloc_btree_page(state, state->targetblock); + /* Note that we deliberately do not update target LSN */ + topaque = BTPageGetOpaque(state->target); + + /* + * All !readonly checks now performed; just return + */ + if (P_IGNORE(topaque)) + return; + } + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("cross page item order invariant violated for index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Last item on page tid=(%u,%u) page lsn=%X/%X.", + state->targetblock, offset, + LSN_FORMAT_ARGS(state->targetlsn)))); + } + } + + /* + * * Downlink check * + * + * Additional check of child items iff this is an internal page and + * caller holds a ShareLock. This happens for every downlink (item) + * in target excluding the negative-infinity downlink (again, this is + * because it has no useful value to compare). + */ + if (!P_ISLEAF(topaque) && state->readonly) + bt_child_check(state, skey, offset); + } + + /* + * Special case bt_child_highkey_check() call + * + * We don't pass a real downlink, but we've to finish the level + * processing. If condition is satisfied, we've already processed all the + * downlinks from the target level. But there still might be pages to the + * right of the child page pointer to by our rightmost downlink. And they + * might have missing downlinks. This final call checks for them. + */ + if (!P_ISLEAF(topaque) && P_RIGHTMOST(topaque) && state->readonly) + { + bt_child_highkey_check(state, InvalidOffsetNumber, + NULL, topaque->btpo_level); + } +} + +/* + * Return a scankey for an item on page to right of current target (or the + * first non-ignorable page), sufficient to check ordering invariant on last + * item in current target page. Returned scankey relies on local memory + * allocated for the child page, which caller cannot pfree(). Caller's memory + * context should be reset between calls here. + * + * This is the first data item, and so all adjacent items are checked against + * their immediate sibling item (which may be on a sibling page, or even a + * "cousin" page at parent boundaries where target's rightlink points to page + * with different parent page). If no such valid item is available, return + * NULL instead. + * + * Note that !readonly callers must reverify that target page has not + * been concurrently deleted. + */ +static BTScanInsert +bt_right_page_check_scankey(BtreeCheckState *state) +{ + BTPageOpaque opaque; + ItemId rightitem; + IndexTuple firstitup; + BlockNumber targetnext; + Page rightpage; + OffsetNumber nline; + + /* Determine target's next block number */ + opaque = BTPageGetOpaque(state->target); + + /* If target is already rightmost, no right sibling; nothing to do here */ + if (P_RIGHTMOST(opaque)) + return NULL; + + /* + * General notes on concurrent page splits and page deletion: + * + * Routines like _bt_search() don't require *any* page split interlock + * when descending the tree, including something very light like a buffer + * pin. That's why it's okay that we don't either. This avoidance of any + * need to "couple" buffer locks is the raison d' etre of the Lehman & Yao + * algorithm, in fact. + * + * That leaves deletion. A deleted page won't actually be recycled by + * VACUUM early enough for us to fail to at least follow its right link + * (or left link, or downlink) and find its sibling, because recycling + * does not occur until no possible index scan could land on the page. + * Index scans can follow links with nothing more than their snapshot as + * an interlock and be sure of at least that much. (See page + * recycling/"visible to everyone" notes in nbtree README.) + * + * Furthermore, it's okay if we follow a rightlink and find a half-dead or + * dead (ignorable) page one or more times. There will either be a + * further right link to follow that leads to a live page before too long + * (before passing by parent's rightmost child), or we will find the end + * of the entire level instead (possible when parent page is itself the + * rightmost on its level). + */ + targetnext = opaque->btpo_next; + for (;;) + { + CHECK_FOR_INTERRUPTS(); + + rightpage = palloc_btree_page(state, targetnext); + opaque = BTPageGetOpaque(rightpage); + + if (!P_IGNORE(opaque) || P_RIGHTMOST(opaque)) + break; + + /* + * We landed on a deleted or half-dead sibling page. Step right until + * we locate a live sibling page. + */ + ereport(DEBUG2, + (errcode(ERRCODE_NO_DATA), + errmsg_internal("level %u sibling page in block %u of index \"%s\" was found deleted or half dead", + opaque->btpo_level, targetnext, RelationGetRelationName(state->rel)), + errdetail_internal("Deleted page found when building scankey from right sibling."))); + + targetnext = opaque->btpo_next; + + /* Be slightly more pro-active in freeing this memory, just in case */ + pfree(rightpage); + } + + /* + * No ShareLock held case -- why it's safe to proceed. + * + * Problem: + * + * We must avoid false positive reports of corruption when caller treats + * item returned here as an upper bound on target's last item. In + * general, false positives are disallowed. Avoiding them here when + * caller is !readonly is subtle. + * + * A concurrent page deletion by VACUUM of the target page can result in + * the insertion of items on to this right sibling page that would + * previously have been inserted on our target page. There might have + * been insertions that followed the target's downlink after it was made + * to point to right sibling instead of target by page deletion's first + * phase. The inserters insert items that would belong on target page. + * This race is very tight, but it's possible. This is our only problem. + * + * Non-problems: + * + * We are not hindered by a concurrent page split of the target; we'll + * never land on the second half of the page anyway. A concurrent split + * of the right page will also not matter, because the first data item + * remains the same within the left half, which we'll reliably land on. If + * we had to skip over ignorable/deleted pages, it cannot matter because + * their key space has already been atomically merged with the first + * non-ignorable page we eventually find (doesn't matter whether the page + * we eventually find is a true sibling or a cousin of target, which we go + * into below). + * + * Solution: + * + * Caller knows that it should reverify that target is not ignorable + * (half-dead or deleted) when cross-page sibling item comparison appears + * to indicate corruption (invariant fails). This detects the single race + * condition that exists for caller. This is correct because the + * continued existence of target block as non-ignorable (not half-dead or + * deleted) implies that target page was not merged into from the right by + * deletion; the key space at or after target never moved left. Target's + * parent either has the same downlink to target as before, or a < + * downlink due to deletion at the left of target. Target either has the + * same highkey as before, or a highkey < before when there is a page + * split. (The rightmost concurrently-split-from-target-page page will + * still have the same highkey as target was originally found to have, + * which for our purposes is equivalent to target's highkey itself never + * changing, since we reliably skip over + * concurrently-split-from-target-page pages.) + * + * In simpler terms, we allow that the key space of the target may expand + * left (the key space can move left on the left side of target only), but + * the target key space cannot expand right and get ahead of us without + * our detecting it. The key space of the target cannot shrink, unless it + * shrinks to zero due to the deletion of the original page, our canary + * condition. (To be very precise, we're a bit stricter than that because + * it might just have been that the target page split and only the + * original target page was deleted. We can be more strict, just not more + * lax.) + * + * Top level tree walk caller moves on to next page (makes it the new + * target) following recovery from this race. (cf. The rationale for + * child/downlink verification needing a ShareLock within + * bt_child_check(), where page deletion is also the main source of + * trouble.) + * + * Note that it doesn't matter if right sibling page here is actually a + * cousin page, because in order for the key space to be readjusted in a + * way that causes us issues in next level up (guiding problematic + * concurrent insertions to the cousin from the grandparent rather than to + * the sibling from the parent), there'd have to be page deletion of + * target's parent page (affecting target's parent's downlink in target's + * grandparent page). Internal page deletion only occurs when there are + * no child pages (they were all fully deleted), and caller is checking + * that the target's parent has at least one non-deleted (so + * non-ignorable) child: the target page. (Note that the first phase of + * deletion atomically marks the page to be deleted half-dead/ignorable at + * the same time downlink in its parent is removed, so caller will + * definitely not fail to detect that this happened.) + * + * This trick is inspired by the method backward scans use for dealing + * with concurrent page splits; concurrent page deletion is a problem that + * similarly receives special consideration sometimes (it's possible that + * the backwards scan will re-read its "original" block after failing to + * find a right-link to it, having already moved in the opposite direction + * (right/"forwards") a few times to try to locate one). Just like us, + * that happens only to determine if there was a concurrent page deletion + * of a reference page, and just like us if there was a page deletion of + * that reference page it means we can move on from caring about the + * reference page. See the nbtree README for a full description of how + * that works. + */ + nline = PageGetMaxOffsetNumber(rightpage); + + /* + * Get first data item, if any + */ + if (P_ISLEAF(opaque) && nline >= P_FIRSTDATAKEY(opaque)) + { + /* Return first data item (if any) */ + rightitem = PageGetItemIdCareful(state, targetnext, rightpage, + P_FIRSTDATAKEY(opaque)); + } + else if (!P_ISLEAF(opaque) && + nline >= OffsetNumberNext(P_FIRSTDATAKEY(opaque))) + { + /* + * Return first item after the internal page's "negative infinity" + * item + */ + rightitem = PageGetItemIdCareful(state, targetnext, rightpage, + OffsetNumberNext(P_FIRSTDATAKEY(opaque))); + } + else + { + /* + * No first item. Page is probably empty leaf page, but it's also + * possible that it's an internal page with only a negative infinity + * item. + */ + ereport(DEBUG2, + (errcode(ERRCODE_NO_DATA), + errmsg_internal("%s block %u of index \"%s\" has no first data item", + P_ISLEAF(opaque) ? "leaf" : "internal", targetnext, + RelationGetRelationName(state->rel)))); + return NULL; + } + + /* + * Return first real item scankey. Note that this relies on right page + * memory remaining allocated. + */ + firstitup = (IndexTuple) PageGetItem(rightpage, rightitem); + return bt_mkscankey_pivotsearch(state->rel, firstitup); +} + +/* + * Check if two tuples are binary identical except the block number. So, + * this function is capable to compare pivot keys on different levels. + */ +static bool +bt_pivot_tuple_identical(bool heapkeyspace, IndexTuple itup1, IndexTuple itup2) +{ + if (IndexTupleSize(itup1) != IndexTupleSize(itup2)) + return false; + + if (heapkeyspace) + { + /* + * Offset number will contain important information in heapkeyspace + * indexes: the number of attributes left in the pivot tuple following + * suffix truncation. Don't skip over it (compare it too). + */ + if (memcmp(&itup1->t_tid.ip_posid, &itup2->t_tid.ip_posid, + IndexTupleSize(itup1) - + offsetof(ItemPointerData, ip_posid)) != 0) + return false; + } + else + { + /* + * Cannot rely on offset number field having consistent value across + * levels on pg_upgrade'd !heapkeyspace indexes. Compare contents of + * tuple starting from just after item pointer (i.e. after block + * number and offset number). + */ + if (memcmp(&itup1->t_info, &itup2->t_info, + IndexTupleSize(itup1) - + offsetof(IndexTupleData, t_info)) != 0) + return false; + } + + return true; +} + +/*--- + * Check high keys on the child level. Traverse rightlinks from previous + * downlink to the current one. Check that there are no intermediate pages + * with missing downlinks. + * + * If 'loaded_child' is given, it's assumed to be the page pointed to by the + * downlink referenced by 'downlinkoffnum' of the target page. + * + * Basically this function is called for each target downlink and checks two + * invariants: + * + * 1) You can reach the next child from previous one via rightlinks; + * 2) Each child high key have matching pivot key on target level. + * + * Consider the sample tree picture. + * + * 1 + * / \ + * 2 <-> 3 + * / \ / \ + * 4 <> 5 <> 6 <> 7 <> 8 + * + * This function will be called for blocks 4, 5, 6 and 8. Consider what is + * happening for each function call. + * + * - The function call for block 4 initializes data structure and matches high + * key of block 4 to downlink's pivot key of block 2. + * - The high key of block 5 is matched to the high key of block 2. + * - The block 6 has an incomplete split flag set, so its high key isn't + * matched to anything. + * - The function call for block 8 checks that block 8 can be found while + * following rightlinks from block 6. The high key of block 7 will be + * matched to downlink's pivot key in block 3. + * + * There is also final call of this function, which checks that there is no + * missing downlinks for children to the right of the child referenced by + * rightmost downlink in target level. + */ +static void +bt_child_highkey_check(BtreeCheckState *state, + OffsetNumber target_downlinkoffnum, + Page loaded_child, + uint32 target_level) +{ + BlockNumber blkno = state->prevrightlink; + Page page; + BTPageOpaque opaque; + bool rightsplit = state->previncompletesplit; + bool first = true; + ItemId itemid; + IndexTuple itup; + BlockNumber downlink; + + if (OffsetNumberIsValid(target_downlinkoffnum)) + { + itemid = PageGetItemIdCareful(state, state->targetblock, + state->target, target_downlinkoffnum); + itup = (IndexTuple) PageGetItem(state->target, itemid); + downlink = BTreeTupleGetDownLink(itup); + } + else + { + downlink = P_NONE; + } + + /* + * If no previous rightlink is memorized for current level just below + * target page's level, we are about to start from the leftmost page. We + * can't follow rightlinks from previous page, because there is no + * previous page. But we still can match high key. + * + * So we initialize variables for the loop above like there is previous + * page referencing current child. Also we imply previous page to not + * have incomplete split flag, that would make us require downlink for + * current child. That's correct, because leftmost page on the level + * should always have parent downlink. + */ + if (!BlockNumberIsValid(blkno)) + { + blkno = downlink; + rightsplit = false; + } + + /* Move to the right on the child level */ + while (true) + { + /* + * Did we traverse the whole tree level and this is check for pages to + * the right of rightmost downlink? + */ + if (blkno == P_NONE && downlink == P_NONE) + { + state->prevrightlink = InvalidBlockNumber; + state->previncompletesplit = false; + return; + } + + /* Did we traverse the whole tree level and don't find next downlink? */ + if (blkno == P_NONE) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("can't traverse from downlink %u to downlink %u of index \"%s\"", + state->prevrightlink, downlink, + RelationGetRelationName(state->rel)))); + + /* Load page contents */ + if (blkno == downlink && loaded_child) + page = loaded_child; + else + page = palloc_btree_page(state, blkno); + + opaque = BTPageGetOpaque(page); + + /* The first page we visit at the level should be leftmost */ + if (first && !BlockNumberIsValid(state->prevrightlink) && + !bt_leftmost_ignoring_half_dead(state, blkno, opaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("the first child of leftmost target page is not leftmost of its level in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.", + state->targetblock, blkno, + LSN_FORMAT_ARGS(state->targetlsn)))); + + /* Do level sanity check */ + if ((!P_ISDELETED(opaque) || P_HAS_FULLXID(opaque)) && + opaque->btpo_level != target_level - 1) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("block found while following rightlinks from child of index \"%s\" has invalid level", + RelationGetRelationName(state->rel)), + errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.", + blkno, target_level - 1, opaque->btpo_level))); + + /* Try to detect circular links */ + if ((!first && blkno == state->prevrightlink) || blkno == opaque->btpo_prev) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("circular link chain found in block %u of index \"%s\"", + blkno, RelationGetRelationName(state->rel)))); + + if (blkno != downlink && !P_IGNORE(opaque)) + { + /* blkno probably has missing parent downlink */ + bt_downlink_missing_check(state, rightsplit, blkno, page); + } + + rightsplit = P_INCOMPLETE_SPLIT(opaque); + + /* + * If we visit page with high key, check that it is equal to the + * target key next to corresponding downlink. + */ + if (!rightsplit && !P_RIGHTMOST(opaque)) + { + BTPageOpaque topaque; + IndexTuple highkey; + OffsetNumber pivotkey_offset; + + /* Get high key */ + itemid = PageGetItemIdCareful(state, blkno, page, P_HIKEY); + highkey = (IndexTuple) PageGetItem(page, itemid); + + /* + * There might be two situations when we examine high key. If + * current child page is referenced by given target downlink, we + * should look to the next offset number for matching key from + * target page. + * + * Alternatively, we're following rightlinks somewhere in the + * middle between page referenced by previous target's downlink + * and the page referenced by current target's downlink. If + * current child page hasn't incomplete split flag set, then its + * high key should match to the target's key of current offset + * number. This happens when a previous call here (to + * bt_child_highkey_check()) found an incomplete split, and we + * reach a right sibling page without a downlink -- the right + * sibling page's high key still needs to be matched to a + * separator key on the parent/target level. + * + * Don't apply OffsetNumberNext() to target_downlinkoffnum when we + * already had to step right on the child level. Our traversal of + * the child level must try to move in perfect lockstep behind (to + * the left of) the target/parent level traversal. + */ + if (blkno == downlink) + pivotkey_offset = OffsetNumberNext(target_downlinkoffnum); + else + pivotkey_offset = target_downlinkoffnum; + + topaque = BTPageGetOpaque(state->target); + + if (!offset_is_negative_infinity(topaque, pivotkey_offset)) + { + /* + * If we're looking for the next pivot tuple in target page, + * but there is no more pivot tuples, then we should match to + * high key instead. + */ + if (pivotkey_offset > PageGetMaxOffsetNumber(state->target)) + { + if (P_RIGHTMOST(topaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("child high key is greater than rightmost pivot key on target level in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.", + state->targetblock, blkno, + LSN_FORMAT_ARGS(state->targetlsn)))); + pivotkey_offset = P_HIKEY; + } + itemid = PageGetItemIdCareful(state, state->targetblock, + state->target, pivotkey_offset); + itup = (IndexTuple) PageGetItem(state->target, itemid); + } + else + { + /* + * We cannot try to match child's high key to a negative + * infinity key in target, since there is nothing to compare. + * However, it's still possible to match child's high key + * outside of target page. The reason why we're are is that + * bt_child_highkey_check() was previously called for the + * cousin page of 'loaded_child', which is incomplete split. + * So, now we traverse to the right of that cousin page and + * current child level page under consideration still belongs + * to the subtree of target's left sibling. Thus, we need to + * match child's high key to it's left uncle page high key. + * Thankfully we saved it, it's called a "low key" of target + * page. + */ + if (!state->lowkey) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("can't find left sibling high key in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.", + state->targetblock, blkno, + LSN_FORMAT_ARGS(state->targetlsn)))); + itup = state->lowkey; + } + + if (!bt_pivot_tuple_identical(state->heapkeyspace, highkey, itup)) + { + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("mismatch between parent key and child high key in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.", + state->targetblock, blkno, + LSN_FORMAT_ARGS(state->targetlsn)))); + } + } + + /* Exit if we already found next downlink */ + if (blkno == downlink) + { + state->prevrightlink = opaque->btpo_next; + state->previncompletesplit = rightsplit; + return; + } + + /* Traverse to the next page using rightlink */ + blkno = opaque->btpo_next; + + /* Free page contents if it's allocated by us */ + if (page != loaded_child) + pfree(page); + first = false; + } +} + +/* + * Checks one of target's downlink against its child page. + * + * Conceptually, the target page continues to be what is checked here. The + * target block is still blamed in the event of finding an invariant violation. + * The downlink insertion into the target is probably where any problem raised + * here arises, and there is no such thing as a parent link, so doing the + * verification this way around is much more practical. + * + * This function visits child page and it's sequentially called for each + * downlink of target page. Assuming this we also check downlink connectivity + * here in order to save child page visits. + */ +static void +bt_child_check(BtreeCheckState *state, BTScanInsert targetkey, + OffsetNumber downlinkoffnum) +{ + ItemId itemid; + IndexTuple itup; + BlockNumber childblock; + OffsetNumber offset; + OffsetNumber maxoffset; + Page child; + BTPageOpaque copaque; + BTPageOpaque topaque; + + itemid = PageGetItemIdCareful(state, state->targetblock, + state->target, downlinkoffnum); + itup = (IndexTuple) PageGetItem(state->target, itemid); + childblock = BTreeTupleGetDownLink(itup); + + /* + * Caller must have ShareLock on target relation, because of + * considerations around page deletion by VACUUM. + * + * NB: In general, page deletion deletes the right sibling's downlink, not + * the downlink of the page being deleted; the deleted page's downlink is + * reused for its sibling. The key space is thereby consolidated between + * the deleted page and its right sibling. (We cannot delete a parent + * page's rightmost child unless it is the last child page, and we intend + * to also delete the parent itself.) + * + * If this verification happened without a ShareLock, the following race + * condition could cause false positives: + * + * In general, concurrent page deletion might occur, including deletion of + * the left sibling of the child page that is examined here. If such a + * page deletion were to occur, closely followed by an insertion into the + * newly expanded key space of the child, a window for the false positive + * opens up: the stale parent/target downlink originally followed to get + * to the child legitimately ceases to be a lower bound on all items in + * the page, since the key space was concurrently expanded "left". + * (Insertion followed the "new" downlink for the child, not our now-stale + * downlink, which was concurrently physically removed in target/parent as + * part of deletion's first phase.) + * + * While we use various techniques elsewhere to perform cross-page + * verification for !readonly callers, a similar trick seems difficult + * here. The tricks used by bt_recheck_sibling_links and by + * bt_right_page_check_scankey both involve verification of a same-level, + * cross-sibling invariant. Cross-level invariants are far more squishy, + * though. The nbtree REDO routines do not actually couple buffer locks + * across levels during page splits, so making any cross-level check work + * reliably in !readonly mode may be impossible. + */ + Assert(state->readonly); + + /* + * Verify child page has the downlink key from target page (its parent) as + * a lower bound; downlink must be strictly less than all keys on the + * page. + * + * Check all items, rather than checking just the first and trusting that + * the operator class obeys the transitive law. + */ + topaque = BTPageGetOpaque(state->target); + child = palloc_btree_page(state, childblock); + copaque = BTPageGetOpaque(child); + maxoffset = PageGetMaxOffsetNumber(child); + + /* + * Since we've already loaded the child block, combine this check with + * check for downlink connectivity. + */ + bt_child_highkey_check(state, downlinkoffnum, + child, topaque->btpo_level); + + /* + * Since there cannot be a concurrent VACUUM operation in readonly mode, + * and since a page has no links within other pages (siblings and parent) + * once it is marked fully deleted, it should be impossible to land on a + * fully deleted page. + * + * It does not quite make sense to enforce that the page cannot even be + * half-dead, despite the fact the downlink is modified at the same stage + * that the child leaf page is marked half-dead. That's incorrect because + * there may occasionally be multiple downlinks from a chain of pages + * undergoing deletion, where multiple successive calls are made to + * _bt_unlink_halfdead_page() by VACUUM before it can finally safely mark + * the leaf page as fully dead. While _bt_mark_page_halfdead() usually + * removes the downlink to the leaf page that is marked half-dead, that's + * not guaranteed, so it's possible we'll land on a half-dead page with a + * downlink due to an interrupted multi-level page deletion. + * + * We go ahead with our checks if the child page is half-dead. It's safe + * to do so because we do not test the child's high key, so it does not + * matter that the original high key will have been replaced by a dummy + * truncated high key within _bt_mark_page_halfdead(). All other page + * items are left intact on a half-dead page, so there is still something + * to test. + */ + if (P_ISDELETED(copaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("downlink to deleted page found in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Parent block=%u child block=%u parent page lsn=%X/%X.", + state->targetblock, childblock, + LSN_FORMAT_ARGS(state->targetlsn)))); + + for (offset = P_FIRSTDATAKEY(copaque); + offset <= maxoffset; + offset = OffsetNumberNext(offset)) + { + /* + * Skip comparison of target page key against "negative infinity" + * item, if any. Checking it would indicate that it's not a strict + * lower bound, but that's only because of the hard-coding for + * negative infinity items within _bt_compare(). + * + * If nbtree didn't truncate negative infinity tuples during internal + * page splits then we'd expect child's negative infinity key to be + * equal to the scankey/downlink from target/parent (it would be a + * "low key" in this hypothetical scenario, and so it would still need + * to be treated as a special case here). + * + * Negative infinity items can be thought of as a strict lower bound + * that works transitively, with the last non-negative-infinity pivot + * followed during a descent from the root as its "true" strict lower + * bound. Only a small number of negative infinity items are truly + * negative infinity; those that are the first items of leftmost + * internal pages. In more general terms, a negative infinity item is + * only negative infinity with respect to the subtree that the page is + * at the root of. + * + * See also: bt_rootdescend(), which can even detect transitive + * inconsistencies on cousin leaf pages. + */ + if (offset_is_negative_infinity(copaque, offset)) + continue; + + if (!invariant_l_nontarget_offset(state, targetkey, childblock, child, + offset)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("down-link lower bound invariant violated for index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Parent block=%u child index tid=(%u,%u) parent page lsn=%X/%X.", + state->targetblock, childblock, offset, + LSN_FORMAT_ARGS(state->targetlsn)))); + } + + pfree(child); +} + +/* + * Checks if page is missing a downlink that it should have. + * + * A page that lacks a downlink/parent may indicate corruption. However, we + * must account for the fact that a missing downlink can occasionally be + * encountered in a non-corrupt index. This can be due to an interrupted page + * split, or an interrupted multi-level page deletion (i.e. there was a hard + * crash or an error during a page split, or while VACUUM was deleting a + * multi-level chain of pages). + * + * Note that this can only be called in readonly mode, so there is no need to + * be concerned about concurrent page splits or page deletions. + */ +static void +bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, + BlockNumber blkno, Page page) +{ + BTPageOpaque opaque = BTPageGetOpaque(page); + ItemId itemid; + IndexTuple itup; + Page child; + BTPageOpaque copaque; + uint32 level; + BlockNumber childblk; + XLogRecPtr pagelsn; + + Assert(state->readonly); + Assert(!P_IGNORE(opaque)); + + /* No next level up with downlinks to fingerprint from the true root */ + if (P_ISROOT(opaque)) + return; + + pagelsn = PageGetLSN(page); + + /* + * Incomplete (interrupted) page splits can account for the lack of a + * downlink. Some inserting transaction should eventually complete the + * page split in passing, when it notices that the left sibling page is + * P_INCOMPLETE_SPLIT(). + * + * In general, VACUUM is not prepared for there to be no downlink to a + * page that it deletes. This is the main reason why the lack of a + * downlink can be reported as corruption here. It's not obvious that an + * invalid missing downlink can result in wrong answers to queries, + * though, since index scans that land on the child may end up + * consistently moving right. The handling of concurrent page splits (and + * page deletions) within _bt_moveright() cannot distinguish + * inconsistencies that last for a moment from inconsistencies that are + * permanent and irrecoverable. + * + * VACUUM isn't even prepared to delete pages that have no downlink due to + * an incomplete page split, but it can detect and reason about that case + * by design, so it shouldn't be taken to indicate corruption. See + * _bt_pagedel() for full details. + */ + if (rightsplit) + { + ereport(DEBUG1, + (errcode(ERRCODE_NO_DATA), + errmsg_internal("harmless interrupted page split detected in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.", + blkno, opaque->btpo_level, + opaque->btpo_prev, + LSN_FORMAT_ARGS(pagelsn)))); + return; + } + + /* + * Page under check is probably the "top parent" of a multi-level page + * deletion. We'll need to descend the subtree to make sure that + * descendant pages are consistent with that, though. + * + * If the page (which must be non-ignorable) is a leaf page, then clearly + * it can't be the top parent. The lack of a downlink is probably a + * symptom of a broad problem that could just as easily cause + * inconsistencies anywhere else. + */ + if (P_ISLEAF(opaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("leaf index block lacks downlink in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Block=%u page lsn=%X/%X.", + blkno, + LSN_FORMAT_ARGS(pagelsn)))); + + /* Descend from the given page, which is an internal page */ + elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"", + RelationGetRelationName(state->rel)); + + level = opaque->btpo_level; + itemid = PageGetItemIdCareful(state, blkno, page, P_FIRSTDATAKEY(opaque)); + itup = (IndexTuple) PageGetItem(page, itemid); + childblk = BTreeTupleGetDownLink(itup); + for (;;) + { + CHECK_FOR_INTERRUPTS(); + + child = palloc_btree_page(state, childblk); + copaque = BTPageGetOpaque(child); + + if (P_ISLEAF(copaque)) + break; + + /* Do an extra sanity check in passing on internal pages */ + if (copaque->btpo_level != level - 1) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down", + RelationGetRelationName(state->rel)), + errdetail_internal("Top parent/under check block=%u block pointed to=%u expected level=%u level in pointed to block=%u.", + blkno, childblk, + level - 1, copaque->btpo_level))); + + level = copaque->btpo_level; + itemid = PageGetItemIdCareful(state, childblk, child, + P_FIRSTDATAKEY(copaque)); + itup = (IndexTuple) PageGetItem(child, itemid); + childblk = BTreeTupleGetDownLink(itup); + /* Be slightly more pro-active in freeing this memory, just in case */ + pfree(child); + } + + /* + * Since there cannot be a concurrent VACUUM operation in readonly mode, + * and since a page has no links within other pages (siblings and parent) + * once it is marked fully deleted, it should be impossible to land on a + * fully deleted page. See bt_child_check() for further details. + * + * The bt_child_check() P_ISDELETED() check is repeated here because + * bt_child_check() does not visit pages reachable through negative + * infinity items. Besides, bt_child_check() is unwilling to descend + * multiple levels. (The similar bt_child_check() P_ISDELETED() check + * within bt_check_level_from_leftmost() won't reach the page either, + * since the leaf's live siblings should have their sibling links updated + * to bypass the deletion target page when it is marked fully dead.) + * + * If this error is raised, it might be due to a previous multi-level page + * deletion that failed to realize that it wasn't yet safe to mark the + * leaf page as fully dead. A "dangling downlink" will still remain when + * this happens. The fact that the dangling downlink's page (the leaf's + * parent/ancestor page) lacked a downlink is incidental. + */ + if (P_ISDELETED(copaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("downlink to deleted leaf page found in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Top parent/target block=%u leaf block=%u top parent/under check lsn=%X/%X.", + blkno, childblk, + LSN_FORMAT_ARGS(pagelsn)))); + + /* + * Iff leaf page is half-dead, its high key top parent link should point + * to what VACUUM considered to be the top parent page at the instant it + * was interrupted. Provided the high key link actually points to the + * page under check, the missing downlink we detected is consistent with + * there having been an interrupted multi-level page deletion. This means + * that the subtree with the page under check at its root (a page deletion + * chain) is in a consistent state, enabling VACUUM to resume deleting the + * entire chain the next time it encounters the half-dead leaf page. + */ + if (P_ISHALFDEAD(copaque) && !P_RIGHTMOST(copaque)) + { + itemid = PageGetItemIdCareful(state, childblk, child, P_HIKEY); + itup = (IndexTuple) PageGetItem(child, itemid); + if (BTreeTupleGetTopParent(itup) == blkno) + return; + } + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("internal index block lacks downlink in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Block=%u level=%u page lsn=%X/%X.", + blkno, opaque->btpo_level, + LSN_FORMAT_ARGS(pagelsn)))); +} + +/* + * Per-tuple callback from table_index_build_scan, used to determine if index has + * all the entries that definitely should have been observed in leaf pages of + * the target index (that is, all IndexTuples that were fingerprinted by our + * Bloom filter). All heapallindexed checks occur here. + * + * The redundancy between an index and the table it indexes provides a good + * opportunity to detect corruption, especially corruption within the table. + * The high level principle behind the verification performed here is that any + * IndexTuple that should be in an index following a fresh CREATE INDEX (based + * on the same index definition) should also have been in the original, + * existing index, which should have used exactly the same representation + * + * Since the overall structure of the index has already been verified, the most + * likely explanation for error here is a corrupt heap page (could be logical + * or physical corruption). Index corruption may still be detected here, + * though. Only readonly callers will have verified that left links and right + * links are in agreement, and so it's possible that a leaf page transposition + * within index is actually the source of corruption detected here (for + * !readonly callers). The checks performed only for readonly callers might + * more accurately frame the problem as a cross-page invariant issue (this + * could even be due to recovery not replaying all WAL records). The !readonly + * ERROR message raised here includes a HINT about retrying with readonly + * verification, just in case it's a cross-page invariant issue, though that + * isn't particularly likely. + * + * table_index_build_scan() expects to be able to find the root tuple when a + * heap-only tuple (the live tuple at the end of some HOT chain) needs to be + * indexed, in order to replace the actual tuple's TID with the root tuple's + * TID (which is what we're actually passed back here). The index build heap + * scan code will raise an error when a tuple that claims to be the root of the + * heap-only tuple's HOT chain cannot be located. This catches cases where the + * original root item offset/root tuple for a HOT chain indicates (for whatever + * reason) that the entire HOT chain is dead, despite the fact that the latest + * heap-only tuple should be indexed. When this happens, sequential scans may + * always give correct answers, and all indexes may be considered structurally + * consistent (i.e. the nbtree structural checks would not detect corruption). + * It may be the case that only index scans give wrong answers, and yet heap or + * SLRU corruption is the real culprit. (While it's true that LP_DEAD bit + * setting will probably also leave the index in a corrupt state before too + * long, the problem is nonetheless that there is heap corruption.) + * + * Heap-only tuple handling within table_index_build_scan() works in a way that + * helps us to detect index tuples that contain the wrong values (values that + * don't match the latest tuple in the HOT chain). This can happen when there + * is no superseding index tuple due to a faulty assessment of HOT safety, + * perhaps during the original CREATE INDEX. Because the latest tuple's + * contents are used with the root TID, an error will be raised when a tuple + * with the same TID but non-matching attribute values is passed back to us. + * Faulty assessment of HOT-safety was behind at least two distinct CREATE + * INDEX CONCURRENTLY bugs that made it into stable releases, one of which was + * undetected for many years. In short, the same principle that allows a + * REINDEX to repair corruption when there was an (undetected) broken HOT chain + * also allows us to detect the corruption in many cases. + */ +static void +bt_tuple_present_callback(Relation index, ItemPointer tid, Datum *values, + bool *isnull, bool tupleIsAlive, void *checkstate) +{ + BtreeCheckState *state = (BtreeCheckState *) checkstate; + IndexTuple itup, + norm; + + Assert(state->heapallindexed); + + /* Generate a normalized index tuple for fingerprinting */ + itup = index_form_tuple(RelationGetDescr(index), values, isnull); + itup->t_tid = *tid; + norm = bt_normalize_tuple(state, itup); + + /* Probe Bloom filter -- tuple should be present */ + if (bloom_lacks_element(state->filter, (unsigned char *) norm, + IndexTupleSize(norm))) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("heap tuple (%u,%u) from table \"%s\" lacks matching index tuple within index \"%s\"", + ItemPointerGetBlockNumber(&(itup->t_tid)), + ItemPointerGetOffsetNumber(&(itup->t_tid)), + RelationGetRelationName(state->heaprel), + RelationGetRelationName(state->rel)), + !state->readonly + ? errhint("Retrying verification using the function bt_index_parent_check() might provide a more specific error.") + : 0)); + + state->heaptuplespresent++; + pfree(itup); + /* Cannot leak memory here */ + if (norm != itup) + pfree(norm); +} + +/* + * Normalize an index tuple for fingerprinting. + * + * In general, index tuple formation is assumed to be deterministic by + * heapallindexed verification, and IndexTuples are assumed immutable. While + * the LP_DEAD bit is mutable in leaf pages, that's ItemId metadata, which is + * not fingerprinted. Normalization is required to compensate for corner + * cases where the determinism assumption doesn't quite work. + * + * There is currently one such case: index_form_tuple() does not try to hide + * the source TOAST state of input datums. The executor applies TOAST + * compression for heap tuples based on different criteria to the compression + * applied within btinsert()'s call to index_form_tuple(): it sometimes + * compresses more aggressively, resulting in compressed heap tuple datums but + * uncompressed corresponding index tuple datums. A subsequent heapallindexed + * verification will get a logically equivalent though bitwise unequal tuple + * from index_form_tuple(). False positive heapallindexed corruption reports + * could occur without normalizing away the inconsistency. + * + * Returned tuple is often caller's own original tuple. Otherwise, it is a + * new representation of caller's original index tuple, palloc()'d in caller's + * memory context. + * + * Note: This routine is not concerned with distinctions about the + * representation of tuples beyond those that might break heapallindexed + * verification. In particular, it won't try to normalize opclass-equal + * datums with potentially distinct representations (e.g., btree/numeric_ops + * index datums will not get their display scale normalized-away here). + * Caller does normalization for non-pivot tuples that have a posting list, + * since dummy CREATE INDEX callback code generates new tuples with the same + * normalized representation. + */ +static IndexTuple +bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup) +{ + TupleDesc tupleDescriptor = RelationGetDescr(state->rel); + Datum normalized[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + bool toast_free[INDEX_MAX_KEYS]; + bool formnewtup = false; + IndexTuple reformed; + int i; + + /* Caller should only pass "logical" non-pivot tuples here */ + Assert(!BTreeTupleIsPosting(itup) && !BTreeTupleIsPivot(itup)); + + /* Easy case: It's immediately clear that tuple has no varlena datums */ + if (!IndexTupleHasVarwidths(itup)) + return itup; + + for (i = 0; i < tupleDescriptor->natts; i++) + { + Form_pg_attribute att; + + att = TupleDescAttr(tupleDescriptor, i); + + /* Assume untoasted/already normalized datum initially */ + toast_free[i] = false; + normalized[i] = index_getattr(itup, att->attnum, + tupleDescriptor, + &isnull[i]); + if (att->attbyval || att->attlen != -1 || isnull[i]) + continue; + + /* + * Callers always pass a tuple that could safely be inserted into the + * index without further processing, so an external varlena header + * should never be encountered here + */ + if (VARATT_IS_EXTERNAL(DatumGetPointer(normalized[i]))) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("external varlena datum in tuple that references heap row (%u,%u) in index \"%s\"", + ItemPointerGetBlockNumber(&(itup->t_tid)), + ItemPointerGetOffsetNumber(&(itup->t_tid)), + RelationGetRelationName(state->rel)))); + else if (VARATT_IS_COMPRESSED(DatumGetPointer(normalized[i]))) + { + formnewtup = true; + normalized[i] = PointerGetDatum(PG_DETOAST_DATUM(normalized[i])); + toast_free[i] = true; + } + } + + /* Easier case: Tuple has varlena datums, none of which are compressed */ + if (!formnewtup) + return itup; + + /* + * Hard case: Tuple had compressed varlena datums that necessitate + * creating normalized version of the tuple from uncompressed input datums + * (normalized input datums). This is rather naive, but shouldn't be + * necessary too often. + * + * Note that we rely on deterministic index_form_tuple() TOAST compression + * of normalized input. + */ + reformed = index_form_tuple(tupleDescriptor, normalized, isnull); + reformed->t_tid = itup->t_tid; + + /* Cannot leak memory here */ + for (i = 0; i < tupleDescriptor->natts; i++) + if (toast_free[i]) + pfree(DatumGetPointer(normalized[i])); + + return reformed; +} + +/* + * Produce palloc()'d "plain" tuple for nth posting list entry/TID. + * + * In general, deduplication is not supposed to change the logical contents of + * an index. Multiple index tuples are merged together into one equivalent + * posting list index tuple when convenient. + * + * heapallindexed verification must normalize-away this variation in + * representation by converting posting list tuples into two or more "plain" + * tuples. Each tuple must be fingerprinted separately -- there must be one + * tuple for each corresponding Bloom filter probe during the heap scan. + * + * Note: Caller still needs to call bt_normalize_tuple() with returned tuple. + */ +static inline IndexTuple +bt_posting_plain_tuple(IndexTuple itup, int n) +{ + Assert(BTreeTupleIsPosting(itup)); + + /* Returns non-posting-list tuple */ + return _bt_form_posting(itup, BTreeTupleGetPostingN(itup, n), 1); +} + +/* + * Search for itup in index, starting from fast root page. itup must be a + * non-pivot tuple. This is only supported with heapkeyspace indexes, since + * we rely on having fully unique keys to find a match with only a single + * visit to a leaf page, barring an interrupted page split, where we may have + * to move right. (A concurrent page split is impossible because caller must + * be readonly caller.) + * + * This routine can detect very subtle transitive consistency issues across + * more than one level of the tree. Leaf pages all have a high key (even the + * rightmost page has a conceptual positive infinity high key), but not a low + * key. Their downlink in parent is a lower bound, which along with the high + * key is almost enough to detect every possible inconsistency. A downlink + * separator key value won't always be available from parent, though, because + * the first items of internal pages are negative infinity items, truncated + * down to zero attributes during internal page splits. While it's true that + * bt_child_check() and the high key check can detect most imaginable key + * space problems, there are remaining problems it won't detect with non-pivot + * tuples in cousin leaf pages. Starting a search from the root for every + * existing leaf tuple detects small inconsistencies in upper levels of the + * tree that cannot be detected any other way. (Besides all this, this is + * probably also useful as a direct test of the code used by index scans + * themselves.) + */ +static bool +bt_rootdescend(BtreeCheckState *state, IndexTuple itup) +{ + BTScanInsert key; + BTStack stack; + Buffer lbuf; + bool exists; + + key = _bt_mkscankey(state->rel, itup); + Assert(key->heapkeyspace && key->scantid != NULL); + + /* + * Search from root. + * + * Ideally, we would arrange to only move right within _bt_search() when + * an interrupted page split is detected (i.e. when the incomplete split + * bit is found to be set), but for now we accept the possibility that + * that could conceal an inconsistency. + */ + Assert(state->readonly && state->rootdescend); + exists = false; + stack = _bt_search(state->rel, NULL, key, &lbuf, BT_READ, NULL); + + if (BufferIsValid(lbuf)) + { + BTInsertStateData insertstate; + OffsetNumber offnum; + Page page; + + insertstate.itup = itup; + insertstate.itemsz = MAXALIGN(IndexTupleSize(itup)); + insertstate.itup_key = key; + insertstate.postingoff = 0; + insertstate.bounds_valid = false; + insertstate.buf = lbuf; + + /* Get matching tuple on leaf page */ + offnum = _bt_binsrch_insert(state->rel, &insertstate); + /* Compare first >= matching item on leaf page, if any */ + page = BufferGetPage(lbuf); + /* Should match on first heap TID when tuple has a posting list */ + if (offnum <= PageGetMaxOffsetNumber(page) && + insertstate.postingoff <= 0 && + _bt_compare(state->rel, key, page, offnum) == 0) + exists = true; + _bt_relbuf(state->rel, lbuf); + } + + _bt_freestack(stack); + pfree(key); + + return exists; +} + +/* + * Is particular offset within page (whose special state is passed by caller) + * the page negative-infinity item? + * + * As noted in comments above _bt_compare(), there is special handling of the + * first data item as a "negative infinity" item. The hard-coding within + * _bt_compare() makes comparing this item for the purposes of verification + * pointless at best, since the IndexTuple only contains a valid TID (a + * reference TID to child page). + */ +static inline bool +offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset) +{ + /* + * For internal pages only, the first item after high key, if any, is + * negative infinity item. Internal pages always have a negative infinity + * item, whereas leaf pages never have one. This implies that negative + * infinity item is either first or second line item, or there is none + * within page. + * + * Negative infinity items are a special case among pivot tuples. They + * always have zero attributes, while all other pivot tuples always have + * nkeyatts attributes. + * + * Right-most pages don't have a high key, but could be said to + * conceptually have a "positive infinity" high key. Thus, there is a + * symmetry between down link items in parent pages, and high keys in + * children. Together, they represent the part of the key space that + * belongs to each page in the index. For example, all children of the + * root page will have negative infinity as a lower bound from root + * negative infinity downlink, and positive infinity as an upper bound + * (implicitly, from "imaginary" positive infinity high key in root). + */ + return !P_ISLEAF(opaque) && offset == P_FIRSTDATAKEY(opaque); +} + +/* + * Does the invariant hold that the key is strictly less than a given upper + * bound offset item? + * + * Verifies line pointer on behalf of caller. + * + * If this function returns false, convention is that caller throws error due + * to corruption. + */ +static inline bool +invariant_l_offset(BtreeCheckState *state, BTScanInsert key, + OffsetNumber upperbound) +{ + ItemId itemid; + int32 cmp; + + Assert(key->pivotsearch); + + /* Verify line pointer before checking tuple */ + itemid = PageGetItemIdCareful(state, state->targetblock, state->target, + upperbound); + /* pg_upgrade'd indexes may legally have equal sibling tuples */ + if (!key->heapkeyspace) + return invariant_leq_offset(state, key, upperbound); + + cmp = _bt_compare(state->rel, key, state->target, upperbound); + + /* + * _bt_compare() is capable of determining that a scankey with a + * filled-out attribute is greater than pivot tuples where the comparison + * is resolved at a truncated attribute (value of attribute in pivot is + * minus infinity). However, it is not capable of determining that a + * scankey is _less than_ a tuple on the basis of a comparison resolved at + * _scankey_ minus infinity attribute. Complete an extra step to simulate + * having minus infinity values for omitted scankey attribute(s). + */ + if (cmp == 0) + { + BTPageOpaque topaque; + IndexTuple ritup; + int uppnkeyatts; + ItemPointer rheaptid; + bool nonpivot; + + ritup = (IndexTuple) PageGetItem(state->target, itemid); + topaque = BTPageGetOpaque(state->target); + nonpivot = P_ISLEAF(topaque) && upperbound >= P_FIRSTDATAKEY(topaque); + + /* Get number of keys + heap TID for item to the right */ + uppnkeyatts = BTreeTupleGetNKeyAtts(ritup, state->rel); + rheaptid = BTreeTupleGetHeapTIDCareful(state, ritup, nonpivot); + + /* Heap TID is tiebreaker key attribute */ + if (key->keysz == uppnkeyatts) + return key->scantid == NULL && rheaptid != NULL; + + return key->keysz < uppnkeyatts; + } + + return cmp < 0; +} + +/* + * Does the invariant hold that the key is less than or equal to a given upper + * bound offset item? + * + * Caller should have verified that upperbound's line pointer is consistent + * using PageGetItemIdCareful() call. + * + * If this function returns false, convention is that caller throws error due + * to corruption. + */ +static inline bool +invariant_leq_offset(BtreeCheckState *state, BTScanInsert key, + OffsetNumber upperbound) +{ + int32 cmp; + + Assert(key->pivotsearch); + + cmp = _bt_compare(state->rel, key, state->target, upperbound); + + return cmp <= 0; +} + +/* + * Does the invariant hold that the key is strictly greater than a given lower + * bound offset item? + * + * Caller should have verified that lowerbound's line pointer is consistent + * using PageGetItemIdCareful() call. + * + * If this function returns false, convention is that caller throws error due + * to corruption. + */ +static inline bool +invariant_g_offset(BtreeCheckState *state, BTScanInsert key, + OffsetNumber lowerbound) +{ + int32 cmp; + + Assert(key->pivotsearch); + + cmp = _bt_compare(state->rel, key, state->target, lowerbound); + + /* pg_upgrade'd indexes may legally have equal sibling tuples */ + if (!key->heapkeyspace) + return cmp >= 0; + + /* + * No need to consider the possibility that scankey has attributes that we + * need to force to be interpreted as negative infinity. _bt_compare() is + * able to determine that scankey is greater than negative infinity. The + * distinction between "==" and "<" isn't interesting here, since + * corruption is indicated either way. + */ + return cmp > 0; +} + +/* + * Does the invariant hold that the key is strictly less than a given upper + * bound offset item, with the offset relating to a caller-supplied page that + * is not the current target page? + * + * Caller's non-target page is a child page of the target, checked as part of + * checking a property of the target page (i.e. the key comes from the + * target). Verifies line pointer on behalf of caller. + * + * If this function returns false, convention is that caller throws error due + * to corruption. + */ +static inline bool +invariant_l_nontarget_offset(BtreeCheckState *state, BTScanInsert key, + BlockNumber nontargetblock, Page nontarget, + OffsetNumber upperbound) +{ + ItemId itemid; + int32 cmp; + + Assert(key->pivotsearch); + + /* Verify line pointer before checking tuple */ + itemid = PageGetItemIdCareful(state, nontargetblock, nontarget, + upperbound); + cmp = _bt_compare(state->rel, key, nontarget, upperbound); + + /* pg_upgrade'd indexes may legally have equal sibling tuples */ + if (!key->heapkeyspace) + return cmp <= 0; + + /* See invariant_l_offset() for an explanation of this extra step */ + if (cmp == 0) + { + IndexTuple child; + int uppnkeyatts; + ItemPointer childheaptid; + BTPageOpaque copaque; + bool nonpivot; + + child = (IndexTuple) PageGetItem(nontarget, itemid); + copaque = BTPageGetOpaque(nontarget); + nonpivot = P_ISLEAF(copaque) && upperbound >= P_FIRSTDATAKEY(copaque); + + /* Get number of keys + heap TID for child/non-target item */ + uppnkeyatts = BTreeTupleGetNKeyAtts(child, state->rel); + childheaptid = BTreeTupleGetHeapTIDCareful(state, child, nonpivot); + + /* Heap TID is tiebreaker key attribute */ + if (key->keysz == uppnkeyatts) + return key->scantid == NULL && childheaptid != NULL; + + return key->keysz < uppnkeyatts; + } + + return cmp < 0; +} + +/* + * Given a block number of a B-Tree page, return page in palloc()'d memory. + * While at it, perform some basic checks of the page. + * + * There is never an attempt to get a consistent view of multiple pages using + * multiple concurrent buffer locks; in general, we only acquire a single pin + * and buffer lock at a time, which is often all that the nbtree code requires. + * (Actually, bt_recheck_sibling_links couples buffer locks, which is the only + * exception to this general rule.) + * + * Operating on a copy of the page is useful because it prevents control + * getting stuck in an uninterruptible state when an underlying operator class + * misbehaves. + */ +static Page +palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum) +{ + Buffer buffer; + Page page; + BTPageOpaque opaque; + OffsetNumber maxoffset; + + page = palloc(BLCKSZ); + + /* + * We copy the page into local storage to avoid holding pin on the buffer + * longer than we must. + */ + buffer = ReadBufferExtended(state->rel, MAIN_FORKNUM, blocknum, RBM_NORMAL, + state->checkstrategy); + LockBuffer(buffer, BT_READ); + + /* + * Perform the same basic sanity checking that nbtree itself performs for + * every page: + */ + _bt_checkpage(state->rel, buffer); + + /* Only use copy of page in palloc()'d memory */ + memcpy(page, BufferGetPage(buffer), BLCKSZ); + UnlockReleaseBuffer(buffer); + + opaque = BTPageGetOpaque(page); + + if (P_ISMETA(opaque) && blocknum != BTREE_METAPAGE) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("invalid meta page found at block %u in index \"%s\"", + blocknum, RelationGetRelationName(state->rel)))); + + /* Check page from block that ought to be meta page */ + if (blocknum == BTREE_METAPAGE) + { + BTMetaPageData *metad = BTPageGetMeta(page); + + if (!P_ISMETA(opaque) || + metad->btm_magic != BTREE_MAGIC) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" meta page is corrupt", + RelationGetRelationName(state->rel)))); + + if (metad->btm_version < BTREE_MIN_VERSION || + metad->btm_version > BTREE_VERSION) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("version mismatch in index \"%s\": file version %d, " + "current version %d, minimum supported version %d", + RelationGetRelationName(state->rel), + metad->btm_version, BTREE_VERSION, + BTREE_MIN_VERSION))); + + /* Finished with metapage checks */ + return page; + } + + /* + * Deleted pages that still use the old 32-bit XID representation have no + * sane "level" field because they type pun the field, but all other pages + * (including pages deleted on Postgres 14+) have a valid value. + */ + if (!P_ISDELETED(opaque) || P_HAS_FULLXID(opaque)) + { + /* Okay, no reason not to trust btpo_level field from page */ + + if (P_ISLEAF(opaque) && opaque->btpo_level != 0) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("invalid leaf page level %u for block %u in index \"%s\"", + opaque->btpo_level, blocknum, + RelationGetRelationName(state->rel)))); + + if (!P_ISLEAF(opaque) && opaque->btpo_level == 0) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("invalid internal page level 0 for block %u in index \"%s\"", + blocknum, + RelationGetRelationName(state->rel)))); + } + + /* + * Sanity checks for number of items on page. + * + * As noted at the beginning of _bt_binsrch(), an internal page must have + * children, since there must always be a negative infinity downlink + * (there may also be a highkey). In the case of non-rightmost leaf + * pages, there must be at least a highkey. The exceptions are deleted + * pages, which contain no items. + * + * This is correct when pages are half-dead, since internal pages are + * never half-dead, and leaf pages must have a high key when half-dead + * (the rightmost page can never be deleted). It's also correct with + * fully deleted pages: _bt_unlink_halfdead_page() doesn't change anything + * about the target page other than setting the page as fully dead, and + * setting its xact field. In particular, it doesn't change the sibling + * links in the deletion target itself, since they're required when index + * scans land on the deletion target, and then need to move right (or need + * to move left, in the case of backward index scans). + */ + maxoffset = PageGetMaxOffsetNumber(page); + if (maxoffset > MaxIndexTuplesPerPage) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("Number of items on block %u of index \"%s\" exceeds MaxIndexTuplesPerPage (%u)", + blocknum, RelationGetRelationName(state->rel), + MaxIndexTuplesPerPage))); + + if (!P_ISLEAF(opaque) && !P_ISDELETED(opaque) && maxoffset < P_FIRSTDATAKEY(opaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("internal block %u in index \"%s\" lacks high key and/or at least one downlink", + blocknum, RelationGetRelationName(state->rel)))); + + if (P_ISLEAF(opaque) && !P_ISDELETED(opaque) && !P_RIGHTMOST(opaque) && maxoffset < P_HIKEY) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("non-rightmost leaf block %u in index \"%s\" lacks high key item", + blocknum, RelationGetRelationName(state->rel)))); + + /* + * In general, internal pages are never marked half-dead, except on + * versions of Postgres prior to 9.4, where it can be valid transient + * state. This state is nonetheless treated as corruption by VACUUM on + * from version 9.4 on, so do the same here. See _bt_pagedel() for full + * details. + */ + if (!P_ISLEAF(opaque) && P_ISHALFDEAD(opaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("internal page block %u in index \"%s\" is half-dead", + blocknum, RelationGetRelationName(state->rel)), + errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it."))); + + /* + * Check that internal pages have no garbage items, and that no page has + * an invalid combination of deletion-related page level flags + */ + if (!P_ISLEAF(opaque) && P_HAS_GARBAGE(opaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("internal page block %u in index \"%s\" has garbage items", + blocknum, RelationGetRelationName(state->rel)))); + + if (P_HAS_FULLXID(opaque) && !P_ISDELETED(opaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("full transaction id page flag appears in non-deleted block %u in index \"%s\"", + blocknum, RelationGetRelationName(state->rel)))); + + if (P_ISDELETED(opaque) && P_ISHALFDEAD(opaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("deleted page block %u in index \"%s\" is half-dead", + blocknum, RelationGetRelationName(state->rel)))); + + return page; +} + +/* + * _bt_mkscankey() wrapper that automatically prevents insertion scankey from + * being considered greater than the pivot tuple that its values originated + * from (or some other identical pivot tuple) in the common case where there + * are truncated/minus infinity attributes. Without this extra step, there + * are forms of corruption that amcheck could theoretically fail to report. + * + * For example, invariant_g_offset() might miss a cross-page invariant failure + * on an internal level if the scankey built from the first item on the + * target's right sibling page happened to be equal to (not greater than) the + * last item on target page. The !pivotsearch tiebreaker in _bt_compare() + * might otherwise cause amcheck to assume (rather than actually verify) that + * the scankey is greater. + */ +static inline BTScanInsert +bt_mkscankey_pivotsearch(Relation rel, IndexTuple itup) +{ + BTScanInsert skey; + + skey = _bt_mkscankey(rel, itup); + skey->pivotsearch = true; + + return skey; +} + +/* + * PageGetItemId() wrapper that validates returned line pointer. + * + * Buffer page/page item access macros generally trust that line pointers are + * not corrupt, which might cause problems for verification itself. For + * example, there is no bounds checking in PageGetItem(). Passing it a + * corrupt line pointer can cause it to return a tuple/pointer that is unsafe + * to dereference. + * + * Validating line pointers before tuples avoids undefined behavior and + * assertion failures with corrupt indexes, making the verification process + * more robust and predictable. + */ +static ItemId +PageGetItemIdCareful(BtreeCheckState *state, BlockNumber block, Page page, + OffsetNumber offset) +{ + ItemId itemid = PageGetItemId(page, offset); + + if (ItemIdGetOffset(itemid) + ItemIdGetLength(itemid) > + BLCKSZ - MAXALIGN(sizeof(BTPageOpaqueData))) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("line pointer points past end of tuple space in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Index tid=(%u,%u) lp_off=%u, lp_len=%u lp_flags=%u.", + block, offset, ItemIdGetOffset(itemid), + ItemIdGetLength(itemid), + ItemIdGetFlags(itemid)))); + + /* + * Verify that line pointer isn't LP_REDIRECT or LP_UNUSED, since nbtree + * never uses either. Verify that line pointer has storage, too, since + * even LP_DEAD items should within nbtree. + */ + if (ItemIdIsRedirected(itemid) || !ItemIdIsUsed(itemid) || + ItemIdGetLength(itemid) == 0) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("invalid line pointer storage in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Index tid=(%u,%u) lp_off=%u, lp_len=%u lp_flags=%u.", + block, offset, ItemIdGetOffset(itemid), + ItemIdGetLength(itemid), + ItemIdGetFlags(itemid)))); + + return itemid; +} + +/* + * BTreeTupleGetHeapTID() wrapper that enforces that a heap TID is present in + * cases where that is mandatory (i.e. for non-pivot tuples) + */ +static inline ItemPointer +BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, IndexTuple itup, + bool nonpivot) +{ + ItemPointer htid; + + /* + * Caller determines whether this is supposed to be a pivot or non-pivot + * tuple using page type and item offset number. Verify that tuple + * metadata agrees with this. + */ + Assert(state->heapkeyspace); + if (BTreeTupleIsPivot(itup) && nonpivot) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("block %u or its right sibling block or child block in index \"%s\" has unexpected pivot tuple", + state->targetblock, + RelationGetRelationName(state->rel)))); + + if (!BTreeTupleIsPivot(itup) && !nonpivot) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("block %u or its right sibling block or child block in index \"%s\" has unexpected non-pivot tuple", + state->targetblock, + RelationGetRelationName(state->rel)))); + + htid = BTreeTupleGetHeapTID(itup); + if (!ItemPointerIsValid(htid) && nonpivot) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("block %u or its right sibling block or child block in index \"%s\" contains non-pivot tuple that lacks a heap TID", + state->targetblock, + RelationGetRelationName(state->rel)))); + + return htid; +} + +/* + * Return the "pointed to" TID for itup, which is used to generate a + * descriptive error message. itup must be a "data item" tuple (it wouldn't + * make much sense to call here with a high key tuple, since there won't be a + * valid downlink/block number to display). + * + * Returns either a heap TID (which will be the first heap TID in posting list + * if itup is posting list tuple), or a TID that contains downlink block + * number, plus some encoded metadata (e.g., the number of attributes present + * in itup). + */ +static inline ItemPointer +BTreeTupleGetPointsToTID(IndexTuple itup) +{ + /* + * Rely on the assumption that !heapkeyspace internal page data items will + * correctly return TID with downlink here -- BTreeTupleGetHeapTID() won't + * recognize it as a pivot tuple, but everything still works out because + * the t_tid field is still returned + */ + if (!BTreeTupleIsPivot(itup)) + return BTreeTupleGetHeapTID(itup); + + /* Pivot tuple returns TID with downlink block (heapkeyspace variant) */ + return &itup->t_tid; +} |