/*-------------------------------------------------------------------------
 *
 * nbtxlog.c
 *	  WAL replay logic for btrees.
 *
 *
 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtxlog.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/bufmask.h"
#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "access/xlogutils.h"
#include "miscadmin.h"
#include "storage/procarray.h"
#include "utils/memutils.h"

static MemoryContext opCtx;		/* working memory for operations */

/*
 * _bt_restore_page -- re-enter all the index tuples on a page
 *
 * The page is freshly init'd, and *from (length len) is a copy of what
 * had been its upper part (pd_upper to pd_special).  We assume that the
 * tuples had been added to the page in item-number order, and therefore
 * the one with highest item number appears first (lowest on the page).
 */
static void
_bt_restore_page(Page page, char *from, int len)
{
	IndexTupleData itupdata;
	Size		itemsz;
	char	   *end = from + len;
	Item		items[MaxIndexTuplesPerPage];
	uint16		itemsizes[MaxIndexTuplesPerPage];
	int			i;
	int			nitems;

	/*
	 * To get the items back in the original order, we add them to the page
	 * in reverse.  To figure out where one tuple ends and another begins, we
	 * have to scan them in forward order first.
	 */
	i = 0;
	while (from < end)
	{
		/*
		 * As we step through the items, 'from' won't always be properly
		 * aligned, so we need to use memcpy().  Further, we use Item (which
		 * is just a char*) here for our items array for the same reason;
		 * wouldn't want the compiler or anyone thinking that an item is
		 * aligned when it isn't.
		 */
		memcpy(&itupdata, from, sizeof(IndexTupleData));
		itemsz = IndexTupleSize(&itupdata);
		itemsz = MAXALIGN(itemsz);

		items[i] = (Item) from;
		itemsizes[i] = itemsz;
		i++;

		from += itemsz;
	}
	nitems = i;

	for (i = nitems - 1; i >= 0; i--)
	{
		if (PageAddItem(page, items[i], itemsizes[i], nitems - i,
						false, false) == InvalidOffsetNumber)
			elog(PANIC, "_bt_restore_page: cannot add item to page");
	}
}
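/*
 * _bt_restore_meta -- rebuild the metapage from an xl_btree_metadata payload
 *
 * Shared by every redo routine in this file that logs a metapage image
 * alongside its main change (insert, unlink, newroot), as well as the
 * standalone META_CLEANUP record.  The block data is expected to be exactly
 * one xl_btree_metadata struct, per the Assert below.
 */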
static void
_bt_restore_meta(XLogReaderState *record, uint8 block_id)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	Buffer		metabuf;
	Page		metapg;
	BTMetaPageData *md;
	BTPageOpaque pageop;
	xl_btree_metadata *xlrec;
	char	   *ptr;
	Size		len;

	metabuf = XLogInitBufferForRedo(record, block_id);
	ptr = XLogRecGetBlockData(record, block_id, &len);

	Assert(len == sizeof(xl_btree_metadata));
	Assert(BufferGetBlockNumber(metabuf) == BTREE_METAPAGE);
	xlrec = (xl_btree_metadata *) ptr;
	metapg = BufferGetPage(metabuf);

	_bt_pageinit(metapg, BufferGetPageSize(metabuf));

	md = BTPageGetMeta(metapg);
	md->btm_magic = BTREE_MAGIC;
	md->btm_version = xlrec->version;
	md->btm_root = xlrec->root;
	md->btm_level = xlrec->level;
	md->btm_fastroot = xlrec->fastroot;
	md->btm_fastlevel = xlrec->fastlevel;
	/* Cannot log BTREE_MIN_VERSION index metapage without upgrade */
	Assert(md->btm_version >= BTREE_NOVAC_VERSION);
	md->btm_last_cleanup_num_delpages = xlrec->last_cleanup_num_delpages;
	md->btm_last_cleanup_num_heap_tuples = -1.0;
	md->btm_allequalimage = xlrec->allequalimage;

	pageop = BTPageGetOpaque(metapg);
	pageop->btpo_flags = BTP_META;

	/*
	 * Set pd_lower just past the end of the metadata.  This is essential,
	 * because without doing so, metadata will be lost if xlog.c compresses
	 * the page.
	 */
	((PageHeader) metapg)->pd_lower =
		((char *) md + sizeof(BTMetaPageData)) - (char *) metapg;

	PageSetLSN(metapg, lsn);
	MarkBufferDirty(metabuf);
	UnlockReleaseBuffer(metabuf);
}

/*
 * _bt_clear_incomplete_split -- clear INCOMPLETE_SPLIT flag on a page
 *
 * This is a common subroutine of the redo functions of all the WAL record
 * types that can insert a downlink: insert, split, and newroot.
 */
static void
_bt_clear_incomplete_split(XLogReaderState *record, uint8 block_id)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	Buffer		buf;

	if (XLogReadBufferForRedo(record, block_id, &buf) == BLK_NEEDS_REDO)
	{
		Page		page = (Page) BufferGetPage(buf);
		BTPageOpaque pageop = BTPageGetOpaque(page);

		Assert(P_INCOMPLETE_SPLIT(pageop));
		pageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;

		PageSetLSN(page, lsn);
		MarkBufferDirty(buf);
	}
	if (BufferIsValid(buf))
		UnlockReleaseBuffer(buf);
}
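/*
 * btree_xlog_insert -- replay a retail index tuple insertion
 *
 * Covers the INSERT_LEAF, INSERT_UPPER, INSERT_META, and INSERT_POST record
 * types; the flag arguments indicate which variant is being replayed (see
 * the dispatch in btree_redo below).
 */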
static void
btree_xlog_insert(bool isleaf, bool ismeta, bool posting,
				  XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;

	/*
	 * Insertion to an internal page finishes an incomplete split at the
	 * child level.  Clear the incomplete-split flag in the child.  Note:
	 * during normal operation, the child and parent pages are locked at the
	 * same time (the locks are coupled), so that clearing the flag and
	 * inserting the downlink appear atomic to other backends.  We don't
	 * bother with that during replay, because readers don't care about the
	 * incomplete-split flag and there cannot be updates happening.
	 */
	if (!isleaf)
		_bt_clear_incomplete_split(record, 1);
	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
	{
		Size		datalen;
		char	   *datapos = XLogRecGetBlockData(record, 0, &datalen);

		page = BufferGetPage(buffer);

		if (!posting)
		{
			/* Simple retail insertion */
			if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum,
							false, false) == InvalidOffsetNumber)
				elog(PANIC, "failed to add new item");
		}
		else
		{
			ItemId		itemid;
			IndexTuple	oposting,
						newitem,
						nposting;
			uint16		postingoff;

			/*
			 * A posting list split occurred during leaf page insertion.  WAL
			 * record data will start with an offset number representing the
			 * point in an existing posting list that a split occurs at.
			 *
			 * Use _bt_swap_posting() to repeat posting list split steps from
			 * primary.  Note that newitem from WAL record is 'orignewitem',
			 * not the final version of newitem that is actually inserted on
			 * page.
			 */
			postingoff = *((uint16 *) datapos);
			datapos += sizeof(uint16);
			datalen -= sizeof(uint16);

			itemid = PageGetItemId(page, OffsetNumberPrev(xlrec->offnum));
			oposting = (IndexTuple) PageGetItem(page, itemid);

			/* Use mutable, aligned newitem copy in _bt_swap_posting() */
			Assert(isleaf && postingoff > 0);
			newitem = CopyIndexTuple((IndexTuple) datapos);
			nposting = _bt_swap_posting(newitem, oposting, postingoff);

			/* Replace existing posting list with post-split version */
			memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));

			/* Insert "final" new item (not orignewitem from WAL stream) */
			Assert(IndexTupleSize(newitem) == datalen);
			if (PageAddItem(page, (Item) newitem, datalen, xlrec->offnum,
							false, false) == InvalidOffsetNumber)
				elog(PANIC, "failed to add posting split new item");
		}

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);

	/*
	 * Note: in normal operation, we'd update the metapage while still
	 * holding lock on the page we inserted into.  But during replay it's not
	 * necessary to hold that lock, since no other index updates can be
	 * happening concurrently, and readers will cope fine with following an
	 * obsolete link from the metapage.
	 */
	if (ismeta)
		_bt_restore_meta(record, 2);
}
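/*
 * btree_xlog_split -- replay a page split
 *
 * Covers both SPLIT_L and SPLIT_R records; newitemonleft tells us whether
 * the new item ended up on the original (left) page or on the new right
 * sibling.
 */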
static void
btree_xlog_split(bool newitemonleft, XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
	bool		isleaf = (xlrec->level == 0);
	Buffer		buf;
	Buffer		rbuf;
	Page		rpage;
	BTPageOpaque ropaque;
	char	   *datapos;
	Size		datalen;
	BlockNumber origpagenumber;
	BlockNumber rightpagenumber;
	BlockNumber spagenumber;

	XLogRecGetBlockTag(record, 0, NULL, NULL, &origpagenumber);
	XLogRecGetBlockTag(record, 1, NULL, NULL, &rightpagenumber);
	if (!XLogRecGetBlockTagExtended(record, 2, NULL, NULL, &spagenumber, NULL))
		spagenumber = P_NONE;

	/*
	 * Clear the incomplete split flag on the appropriate child page one
	 * level down when origpage/buf is an internal page (there must have been
	 * cascading page splits during original execution in the event of an
	 * internal page split).  This is like the corresponding
	 * btree_xlog_insert call for internal pages.  We're not clearing the
	 * incomplete split flag for the current page split here (you can think
	 * of this as part of the insert of newitem that the page split action
	 * needs to perform in passing).
	 *
	 * Like in btree_xlog_insert, this can be done before locking other
	 * pages.  We never need to couple cross-level locks in REDO routines.
	 */
	if (!isleaf)
		_bt_clear_incomplete_split(record, 3);

	/* Reconstruct right (new) sibling page from scratch */
	rbuf = XLogInitBufferForRedo(record, 1);
	datapos = XLogRecGetBlockData(record, 1, &datalen);
	rpage = (Page) BufferGetPage(rbuf);

	_bt_pageinit(rpage, BufferGetPageSize(rbuf));
	ropaque = BTPageGetOpaque(rpage);

	ropaque->btpo_prev = origpagenumber;
	ropaque->btpo_next = spagenumber;
	ropaque->btpo_level = xlrec->level;
	ropaque->btpo_flags = isleaf ? BTP_LEAF : 0;
	ropaque->btpo_cycleid = 0;

	_bt_restore_page(rpage, datapos, datalen);

	PageSetLSN(rpage, lsn);
	MarkBufferDirty(rbuf);

	/* Now reconstruct original page (left half of split) */
	if (XLogReadBufferForRedo(record, 0, &buf) == BLK_NEEDS_REDO)
	{
		/*
		 * To retain the same physical order of the tuples that they had, we
		 * initialize a temporary empty page for the left page and add all
		 * the items to that in item number order.  This mirrors how
		 * _bt_split() works.  Retaining the same physical order makes WAL
		 * consistency checking possible.  See also _bt_restore_page(), which
		 * does the same for the right page.
		 */
		Page		origpage = (Page) BufferGetPage(buf);
		BTPageOpaque oopaque = BTPageGetOpaque(origpage);
		OffsetNumber off;
		IndexTuple	newitem = NULL,
					left_hikey = NULL,
					nposting = NULL;
		Size		newitemsz = 0,
					left_hikeysz = 0;
		Page		leftpage;
		OffsetNumber leftoff,
					replacepostingoff = InvalidOffsetNumber;

		datapos = XLogRecGetBlockData(record, 0, &datalen);

		if (newitemonleft || xlrec->postingoff != 0)
		{
			newitem = (IndexTuple) datapos;
			newitemsz = MAXALIGN(IndexTupleSize(newitem));
			datapos += newitemsz;
			datalen -= newitemsz;

			if (xlrec->postingoff != 0)
			{
				ItemId		itemid;
				IndexTuple	oposting;

				/* Posting list must be at offset number before new item's */
				replacepostingoff = OffsetNumberPrev(xlrec->newitemoff);

				/* Use mutable, aligned newitem copy in _bt_swap_posting() */
				newitem = CopyIndexTuple(newitem);
				itemid = PageGetItemId(origpage, replacepostingoff);
				oposting = (IndexTuple) PageGetItem(origpage, itemid);
				nposting = _bt_swap_posting(newitem, oposting,
											xlrec->postingoff);
			}
		}

		/*
		 * Extract left hikey and its size.  We assume that 16-bit alignment
		 * is enough to apply IndexTupleSize (since it's fetching from a
		 * uint16 field).
		 */
		left_hikey = (IndexTuple) datapos;
		left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey));
		datapos += left_hikeysz;
		datalen -= left_hikeysz;

		Assert(datalen == 0);

		leftpage = PageGetTempPageCopySpecial(origpage);

		/* Add high key tuple from WAL record to temp page */
		leftoff = P_HIKEY;
		if (PageAddItem(leftpage, (Item) left_hikey, left_hikeysz, P_HIKEY,
						false, false) == InvalidOffsetNumber)
			elog(ERROR, "failed to add high key to left page after split");
		leftoff = OffsetNumberNext(leftoff);

		for (off = P_FIRSTDATAKEY(oopaque); off < xlrec->firstrightoff; off++)
		{
			ItemId		itemid;
			Size		itemsz;
			IndexTuple	item;

			/* Add replacement posting list when required */
			if (off == replacepostingoff)
			{
				Assert(newitemonleft ||
					   xlrec->firstrightoff == xlrec->newitemoff);
				if (PageAddItem(leftpage, (Item) nposting,
								MAXALIGN(IndexTupleSize(nposting)), leftoff,
								false, false) == InvalidOffsetNumber)
					elog(ERROR, "failed to add new posting list item to left page after split");
				leftoff = OffsetNumberNext(leftoff);
				continue;		/* don't insert oposting */
			}

			/* add the new item if it was inserted on left page */
			else if (newitemonleft && off == xlrec->newitemoff)
			{
				if (PageAddItem(leftpage, (Item) newitem, newitemsz, leftoff,
								false, false) == InvalidOffsetNumber)
					elog(ERROR, "failed to add new item to left page after split");
				leftoff = OffsetNumberNext(leftoff);
			}

			itemid = PageGetItemId(origpage, off);
			itemsz = ItemIdGetLength(itemid);
			item = (IndexTuple) PageGetItem(origpage, itemid);
			if (PageAddItem(leftpage, (Item) item, itemsz, leftoff,
							false, false) == InvalidOffsetNumber)
				elog(ERROR, "failed to add old item to left page after split");
			leftoff = OffsetNumberNext(leftoff);
		}

		/* cope with possibility that newitem goes at the end */
		if (newitemonleft && off == xlrec->newitemoff)
		{
			if (PageAddItem(leftpage, (Item) newitem, newitemsz, leftoff,
							false, false) == InvalidOffsetNumber)
				elog(ERROR, "failed to add new item to left page after split");
			leftoff = OffsetNumberNext(leftoff);
		}
		PageRestoreTempPage(leftpage, origpage);

		/* Fix opaque fields */
		oopaque->btpo_flags = BTP_INCOMPLETE_SPLIT;
		if (isleaf)
			oopaque->btpo_flags |= BTP_LEAF;
		oopaque->btpo_next = rightpagenumber;
		oopaque->btpo_cycleid = 0;

		PageSetLSN(origpage, lsn);
		MarkBufferDirty(buf);
	}

	/* Fix left-link of the page to the right of the new right sibling */
	if (spagenumber != P_NONE)
	{
		Buffer		sbuf;

		if (XLogReadBufferForRedo(record, 2, &sbuf) == BLK_NEEDS_REDO)
		{
			Page		spage = (Page) BufferGetPage(sbuf);
			BTPageOpaque spageop = BTPageGetOpaque(spage);

			spageop->btpo_prev = rightpagenumber;

			PageSetLSN(spage, lsn);
			MarkBufferDirty(sbuf);
		}
		if (BufferIsValid(sbuf))
			UnlockReleaseBuffer(sbuf);
	}

	/*
	 * Finally, release the remaining buffers.  sbuf, rbuf, and buf must be
	 * released together, so that readers cannot observe inconsistencies.
	 */
	UnlockReleaseBuffer(rbuf);
	if (BufferIsValid(buf))
		UnlockReleaseBuffer(buf);
}
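/*
 * btree_xlog_dedup -- replay a deduplication pass over a leaf page
 *
 * The record's block data is the array of BTDedupInterval structs written by
 * the primary; we rerun the same merging steps against the pre-dedup page
 * image, so the intervals we form (and hence the resulting posting lists)
 * match the primary's, as the Asserts below verify.
 */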
static void
btree_xlog_dedup(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_btree_dedup *xlrec = (xl_btree_dedup *) XLogRecGetData(record);
	Buffer		buf;

	if (XLogReadBufferForRedo(record, 0, &buf) == BLK_NEEDS_REDO)
	{
		char	   *ptr = XLogRecGetBlockData(record, 0, NULL);
		Page		page = (Page) BufferGetPage(buf);
		BTPageOpaque opaque = BTPageGetOpaque(page);
		OffsetNumber offnum,
					minoff,
					maxoff;
		BTDedupState state;
		BTDedupInterval *intervals;
		Page		newpage;

		state = (BTDedupState) palloc(sizeof(BTDedupStateData));
		state->deduplicate = true;	/* unused */
		state->nmaxitems = 0;	/* unused */
		/* Conservatively use larger maxpostingsize than primary */
		state->maxpostingsize = BTMaxItemSize(page);
		state->base = NULL;
		state->baseoff = InvalidOffsetNumber;
		state->basetupsize = 0;
		state->htids = palloc(state->maxpostingsize);
		state->nhtids = 0;
		state->nitems = 0;
		state->phystupsize = 0;
		state->nintervals = 0;

		minoff = P_FIRSTDATAKEY(opaque);
		maxoff = PageGetMaxOffsetNumber(page);
		newpage = PageGetTempPageCopySpecial(page);

		if (!P_RIGHTMOST(opaque))
		{
			ItemId		itemid = PageGetItemId(page, P_HIKEY);
			Size		itemsz = ItemIdGetLength(itemid);
			IndexTuple	item = (IndexTuple) PageGetItem(page, itemid);

			if (PageAddItem(newpage, (Item) item, itemsz, P_HIKEY,
							false, false) == InvalidOffsetNumber)
				elog(ERROR, "deduplication failed to add highkey");
		}

		intervals = (BTDedupInterval *) ptr;
		for (offnum = minoff;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			ItemId		itemid = PageGetItemId(page, offnum);
			IndexTuple	itup = (IndexTuple) PageGetItem(page, itemid);

			if (offnum == minoff)
				_bt_dedup_start_pending(state, itup, offnum);
			else if (state->nintervals < xlrec->nintervals &&
					 state->baseoff == intervals[state->nintervals].baseoff &&
					 state->nitems < intervals[state->nintervals].nitems)
			{
				if (!_bt_dedup_save_htid(state, itup))
					elog(ERROR, "deduplication failed to add heap tid to pending posting list");
			}
			else
			{
				_bt_dedup_finish_pending(newpage, state);
				_bt_dedup_start_pending(state, itup, offnum);
			}
		}

		_bt_dedup_finish_pending(newpage, state);
		Assert(state->nintervals == xlrec->nintervals);
		Assert(memcmp(state->intervals, intervals,
					  state->nintervals * sizeof(BTDedupInterval)) == 0);

		if (P_HAS_GARBAGE(opaque))
		{
			BTPageOpaque nopaque = BTPageGetOpaque(newpage);

			nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
		}

		PageRestoreTempPage(newpage, page);
		PageSetLSN(page, lsn);
		MarkBufferDirty(buf);
	}

	if (BufferIsValid(buf))
		UnlockReleaseBuffer(buf);
}
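/*
 * btree_xlog_updates -- apply posting list updates during VACUUM/DELETE replay
 *
 * Each xl_btree_update entry in the record is followed inline by its array
 * of ndeletedtids uint16 indexes into the affected posting list's TID array;
 * we rebuild the tuple with _bt_update_posting() and overwrite it in place.
 */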
static void
btree_xlog_updates(Page page, OffsetNumber *updatedoffsets,
				   xl_btree_update *updates, int nupdated)
{
	BTVacuumPosting vacposting;
	IndexTuple	origtuple;
	ItemId		itemid;
	Size		itemsz;

	for (int i = 0; i < nupdated; i++)
	{
		itemid = PageGetItemId(page, updatedoffsets[i]);
		origtuple = (IndexTuple) PageGetItem(page, itemid);

		vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
							updates->ndeletedtids * sizeof(uint16));
		vacposting->updatedoffset = updatedoffsets[i];
		vacposting->itup = origtuple;
		vacposting->ndeletedtids = updates->ndeletedtids;
		memcpy(vacposting->deletetids,
			   (char *) updates + SizeOfBtreeUpdate,
			   updates->ndeletedtids * sizeof(uint16));

		_bt_update_posting(vacposting);

		/* Overwrite updated version of tuple */
		itemsz = MAXALIGN(IndexTupleSize(vacposting->itup));
		if (!PageIndexTupleOverwrite(page, updatedoffsets[i],
									 (Item) vacposting->itup, itemsz))
			elog(PANIC, "failed to update partially dead item");

		pfree(vacposting->itup);
		pfree(vacposting);

		/* advance to next xl_btree_update from array */
		updates = (xl_btree_update *)
			((char *) updates + SizeOfBtreeUpdate +
			 updates->ndeletedtids * sizeof(uint16));
	}
}

static void
btree_xlog_vacuum(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;
	BTPageOpaque opaque;

	/*
	 * We need to take a cleanup lock here, just like btvacuumpage().
	 * However, it isn't necessary to exhaustively get a cleanup lock on
	 * every block in the index during recovery (just getting a cleanup lock
	 * on pages with items to kill suffices).  See nbtree/README for details.
	 */
	if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer)
		== BLK_NEEDS_REDO)
	{
		char	   *ptr = XLogRecGetBlockData(record, 0, NULL);

		page = (Page) BufferGetPage(buffer);

		if (xlrec->nupdated > 0)
		{
			OffsetNumber *updatedoffsets;
			xl_btree_update *updates;

			updatedoffsets = (OffsetNumber *)
				(ptr + xlrec->ndeleted * sizeof(OffsetNumber));
			updates = (xl_btree_update *) ((char *) updatedoffsets +
										   xlrec->nupdated *
										   sizeof(OffsetNumber));

			btree_xlog_updates(page, updatedoffsets, updates, xlrec->nupdated);
		}

		if (xlrec->ndeleted > 0)
			PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted);

		/*
		 * Mark the page as not containing any LP_DEAD items --- see comments
		 * in _bt_delitems_vacuum().
		 */
		opaque = BTPageGetOpaque(page);
		opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);
}
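/*
 * btree_xlog_delete -- replay simple (non-VACUUM) deletion of leaf items
 *
 * The block 0 data layout matches VACUUM records: an array of ndeleted
 * offsets to remove, then nupdated offsets, then the xl_btree_update
 * entries.  Unlike VACUUM replay, this may also have to resolve recovery
 * conflicts first, since the deleted items may still be visible to standby
 * snapshots.
 */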
static void
btree_xlog_delete(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;
	BTPageOpaque opaque;

	/*
	 * If we have any conflict processing to do, it must happen before we
	 * update the page
	 */
	if (InHotStandby)
	{
		RelFileNode rnode;

		XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);

		ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, rnode);
	}

	/*
	 * We don't need to take a cleanup lock to apply these changes.  See
	 * nbtree/README for details.
	 */
	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
	{
		char	   *ptr = XLogRecGetBlockData(record, 0, NULL);

		page = (Page) BufferGetPage(buffer);

		if (xlrec->nupdated > 0)
		{
			OffsetNumber *updatedoffsets;
			xl_btree_update *updates;

			updatedoffsets = (OffsetNumber *)
				(ptr + xlrec->ndeleted * sizeof(OffsetNumber));
			updates = (xl_btree_update *) ((char *) updatedoffsets +
										   xlrec->nupdated *
										   sizeof(OffsetNumber));

			btree_xlog_updates(page, updatedoffsets, updates, xlrec->nupdated);
		}

		if (xlrec->ndeleted > 0)
			PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted);

		/* Mark the page as not containing any LP_DEAD items */
		opaque = BTPageGetOpaque(page);
		opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);
}

static void
btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_btree_mark_page_halfdead *xlrec = (xl_btree_mark_page_halfdead *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;
	BTPageOpaque pageop;
	IndexTupleData trunctuple;

	/*
	 * In normal operation, we would lock all the pages this WAL record
	 * touches before changing any of them.  In WAL replay, it should be okay
	 * to lock just one page at a time, since no concurrent index updates can
	 * be happening, and readers should not care whether they arrive at the
	 * target page or not (since it's surely empty).
	 */

	/* to-be-deleted subtree's parent page */
	if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
	{
		OffsetNumber poffset;
		ItemId		itemid;
		IndexTuple	itup;
		OffsetNumber nextoffset;
		BlockNumber rightsib;

		page = (Page) BufferGetPage(buffer);
		pageop = BTPageGetOpaque(page);

		poffset = xlrec->poffset;

		nextoffset = OffsetNumberNext(poffset);
		itemid = PageGetItemId(page, nextoffset);
		itup = (IndexTuple) PageGetItem(page, itemid);
		rightsib = BTreeTupleGetDownLink(itup);

		itemid = PageGetItemId(page, poffset);
		itup = (IndexTuple) PageGetItem(page, itemid);
		BTreeTupleSetDownLink(itup, rightsib);
		nextoffset = OffsetNumberNext(poffset);
		PageIndexTupleDelete(page, nextoffset);

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}

	/*
	 * Don't need to couple cross-level locks in REDO routines, so release
	 * lock on internal page immediately
	 */
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);

	/* Rewrite the leaf page as a halfdead page */
	buffer = XLogInitBufferForRedo(record, 0);
	page = (Page) BufferGetPage(buffer);

	_bt_pageinit(page, BufferGetPageSize(buffer));
	pageop = BTPageGetOpaque(page);

	pageop->btpo_prev = xlrec->leftblk;
	pageop->btpo_next = xlrec->rightblk;
	pageop->btpo_level = 0;
	pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
	pageop->btpo_cycleid = 0;

	/*
	 * Construct a dummy high key item that points to top parent page (value
	 * is InvalidBlockNumber when the top parent page is the leaf page itself)
	 */
	MemSet(&trunctuple, 0, sizeof(IndexTupleData));
	trunctuple.t_info = sizeof(IndexTupleData);
	BTreeTupleSetTopParent(&trunctuple, xlrec->topparent);

	if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
					false, false) == InvalidOffsetNumber)
		elog(ERROR, "could not add dummy high key to half-dead page");

	PageSetLSN(page, lsn);
	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);
}
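/*
 * btree_xlog_unlink_page -- replay removal of a half-dead page from the tree
 *
 * Handles both UNLINK_PAGE and UNLINK_PAGE_META; the latter additionally
 * carries a metapage image to restore (block 4).
 */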
static void
btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) XLogRecGetData(record);
	BlockNumber leftsib;
	BlockNumber rightsib;
	uint32		level;
	bool		isleaf;
	FullTransactionId safexid;
	Buffer		leftbuf;
	Buffer		target;
	Buffer		rightbuf;
	Page		page;
	BTPageOpaque pageop;

	leftsib = xlrec->leftsib;
	rightsib = xlrec->rightsib;
	level = xlrec->level;
	isleaf = (level == 0);
	safexid = xlrec->safexid;

	/* No leaftopparent for level 0 (leaf page) or level 1 target */
	Assert(!BlockNumberIsValid(xlrec->leaftopparent) || level > 1);

	/*
	 * In normal operation, we would lock all the pages this WAL record
	 * touches before changing any of them.  In WAL replay, we at least lock
	 * the pages in the same standard left-to-right order (leftsib, target,
	 * rightsib), and don't release the sibling locks until the target is
	 * marked deleted.
	 */

	/* Fix right-link of left sibling, if any */
	if (leftsib != P_NONE)
	{
		if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO)
		{
			page = (Page) BufferGetPage(leftbuf);
			pageop = BTPageGetOpaque(page);
			pageop->btpo_next = rightsib;

			PageSetLSN(page, lsn);
			MarkBufferDirty(leftbuf);
		}
	}
	else
		leftbuf = InvalidBuffer;

	/* Rewrite target page as empty deleted page */
	target = XLogInitBufferForRedo(record, 0);
	page = (Page) BufferGetPage(target);

	_bt_pageinit(page, BufferGetPageSize(target));
	pageop = BTPageGetOpaque(page);

	pageop->btpo_prev = leftsib;
	pageop->btpo_next = rightsib;
	pageop->btpo_level = level;
	BTPageSetDeleted(page, safexid);
	if (isleaf)
		pageop->btpo_flags |= BTP_LEAF;
	pageop->btpo_cycleid = 0;

	PageSetLSN(page, lsn);
	MarkBufferDirty(target);

	/* Fix left-link of right sibling */
	if (XLogReadBufferForRedo(record, 2, &rightbuf) == BLK_NEEDS_REDO)
	{
		page = (Page) BufferGetPage(rightbuf);
		pageop = BTPageGetOpaque(page);
		pageop->btpo_prev = leftsib;

		PageSetLSN(page, lsn);
		MarkBufferDirty(rightbuf);
	}

	/* Release siblings */
	if (BufferIsValid(leftbuf))
		UnlockReleaseBuffer(leftbuf);
	if (BufferIsValid(rightbuf))
		UnlockReleaseBuffer(rightbuf);

	/* Release target */
	UnlockReleaseBuffer(target);

	/*
	 * If we deleted a parent of the targeted leaf page, instead of the leaf
	 * itself, update the leaf to point to the next remaining child in the
	 * to-be-deleted subtree
	 */
	if (XLogRecHasBlockRef(record, 3))
	{
		/*
		 * There is no real data on the page, so we just re-create it from
		 * scratch using the information from the WAL record.
		 *
		 * Note that we don't end up here when the target page is also the
		 * leafbuf page.  There is no need to add a dummy hikey item with a
		 * top parent link when deleting leafbuf because it's the last page
		 * we'll delete in the subtree undergoing deletion.
		 */
		Buffer		leafbuf;
		IndexTupleData trunctuple;

		Assert(!isleaf);

		leafbuf = XLogInitBufferForRedo(record, 3);
		page = (Page) BufferGetPage(leafbuf);

		_bt_pageinit(page, BufferGetPageSize(leafbuf));
		pageop = BTPageGetOpaque(page);

		pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
		pageop->btpo_prev = xlrec->leafleftsib;
		pageop->btpo_next = xlrec->leafrightsib;
		pageop->btpo_level = 0;
		pageop->btpo_cycleid = 0;

		/* Add a dummy hikey item */
		MemSet(&trunctuple, 0, sizeof(IndexTupleData));
		trunctuple.t_info = sizeof(IndexTupleData);
		BTreeTupleSetTopParent(&trunctuple, xlrec->leaftopparent);

		if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
						false, false) == InvalidOffsetNumber)
			elog(ERROR, "could not add dummy high key to half-dead page");

		PageSetLSN(page, lsn);
		MarkBufferDirty(leafbuf);
		UnlockReleaseBuffer(leafbuf);
	}

	/* Update metapage if needed */
	if (info == XLOG_BTREE_UNLINK_PAGE_META)
		_bt_restore_meta(record, 4);
}
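/*
 * btree_xlog_newroot -- replay creation of a new root page
 *
 * For a root above the leaf level the record carries the tuples to restore
 * on the new page; a level-0 root starts out empty.  Either way the metapage
 * is rewritten to point at the new root.
 */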
static void
btree_xlog_newroot(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_btree_newroot *xlrec = (xl_btree_newroot *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;
	BTPageOpaque pageop;
	char	   *ptr;
	Size		len;

	buffer = XLogInitBufferForRedo(record, 0);
	page = (Page) BufferGetPage(buffer);

	_bt_pageinit(page, BufferGetPageSize(buffer));
	pageop = BTPageGetOpaque(page);

	pageop->btpo_flags = BTP_ROOT;
	pageop->btpo_prev = pageop->btpo_next = P_NONE;
	pageop->btpo_level = xlrec->level;
	if (xlrec->level == 0)
		pageop->btpo_flags |= BTP_LEAF;
	pageop->btpo_cycleid = 0;

	if (xlrec->level > 0)
	{
		ptr = XLogRecGetBlockData(record, 0, &len);
		_bt_restore_page(page, ptr, len);

		/* Clear the incomplete-split flag in left child */
		_bt_clear_incomplete_split(record, 1);
	}

	PageSetLSN(page, lsn);
	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);

	_bt_restore_meta(record, 2);
}

/*
 * In general VACUUM must defer recycling as a way of avoiding certain race
 * conditions.  Deleted pages contain a safexid value that is used by VACUUM
 * to determine whether or not it's safe to place a page that was deleted by
 * VACUUM earlier into the FSM now.  See nbtree/README.
 *
 * As far as any backend operating during original execution is concerned,
 * the FSM is a cache of recycle-safe pages; the mere presence of the page in
 * the FSM indicates that the page must already be safe to recycle (actually,
 * _bt_getbuf() verifies it's safe using BTPageIsRecyclable(), but that's
 * just because it would be unwise to completely trust the FSM, given its
 * current limitations).
 *
 * This isn't sufficient to prevent similar concurrent recycling race
 * conditions during Hot Standby, though.  For that we need to log a
 * xl_btree_reuse_page record at the point that a page is actually recycled
 * and reused for an entirely unrelated page inside _bt_split().  These
 * records include the same safexid value from the original deleted page,
 * stored in the record's latestRemovedFullXid field.
 *
 * The GlobalVisCheckRemovableFullXid() test in BTPageIsRecyclable() is used
 * to determine if it's safe to recycle a page.  This mirrors our own test:
 * the PGPROC->xmin > limitXmin test inside GetConflictingVirtualXIDs().
 * Consequently, one XID value achieves the same exclusion effect on primary
 * and standby.
 */
static void
btree_xlog_reuse_page(XLogReaderState *record)
{
	xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) XLogRecGetData(record);

	if (InHotStandby)
		ResolveRecoveryConflictWithSnapshotFullXid(xlrec->latestRemovedFullXid,
												   xlrec->node);
}

void
btree_redo(XLogReaderState *record)
{
	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
	MemoryContext oldCtx;

	oldCtx = MemoryContextSwitchTo(opCtx);
	switch (info)
	{
		case XLOG_BTREE_INSERT_LEAF:
			btree_xlog_insert(true, false, false, record);
			break;
		case XLOG_BTREE_INSERT_UPPER:
			btree_xlog_insert(false, false, false, record);
			break;
		case XLOG_BTREE_INSERT_META:
			btree_xlog_insert(false, true, false, record);
			break;
		case XLOG_BTREE_SPLIT_L:
			btree_xlog_split(true, record);
			break;
		case XLOG_BTREE_SPLIT_R:
			btree_xlog_split(false, record);
			break;
		case XLOG_BTREE_INSERT_POST:
			btree_xlog_insert(true, false, true, record);
			break;
		case XLOG_BTREE_DEDUP:
			btree_xlog_dedup(record);
			break;
		case XLOG_BTREE_VACUUM:
			btree_xlog_vacuum(record);
			break;
		case XLOG_BTREE_DELETE:
			btree_xlog_delete(record);
			break;
		case XLOG_BTREE_MARK_PAGE_HALFDEAD:
			btree_xlog_mark_page_halfdead(info, record);
			break;
		case XLOG_BTREE_UNLINK_PAGE:
		case XLOG_BTREE_UNLINK_PAGE_META:
			btree_xlog_unlink_page(info, record);
			break;
		case XLOG_BTREE_NEWROOT:
			btree_xlog_newroot(record);
			break;
		case XLOG_BTREE_REUSE_PAGE:
			btree_xlog_reuse_page(record);
			break;
		case XLOG_BTREE_META_CLEANUP:
			_bt_restore_meta(record, 0);
			break;
		default:
			elog(PANIC, "btree_redo: unknown op code %u", info);
	}
	MemoryContextSwitchTo(oldCtx);
	MemoryContextReset(opCtx);
}

void
btree_xlog_startup(void)
{
	opCtx = AllocSetContextCreate(CurrentMemoryContext,
								  "Btree recovery temporary context",
								  ALLOCSET_DEFAULT_SIZES);
}

void
btree_xlog_cleanup(void)
{
	MemoryContextDelete(opCtx);
	opCtx = NULL;
}

/*
 * Mask a btree page before performing consistency checks on it.
 */
void
btree_mask(char *pagedata, BlockNumber blkno)
{
	Page		page = (Page) pagedata;
	BTPageOpaque maskopaq;

	mask_page_lsn_and_checksum(page);

	mask_page_hint_bits(page);
	mask_unused_space(page);

	maskopaq = BTPageGetOpaque(page);

	if (P_ISLEAF(maskopaq))
	{
		/*
		 * In btree leaf pages, it is possible to modify the LP_FLAGS without
		 * emitting any WAL record.  Hence, mask the line pointer flags.  See
		 * _bt_killitems(), _bt_check_unique() for details.
		 */
		mask_lp_flags(page);
	}

	/*
	 * BTP_HAS_GARBAGE is just an un-logged hint bit.  So, mask it.  See
	 * _bt_delete_or_dedup_one_page(), _bt_killitems(), and
	 * _bt_check_unique() for details.
	 */
	maskopaq->btpo_flags &= ~BTP_HAS_GARBAGE;

	/*
	 * During replay of a btree page split, we don't set the BTP_SPLIT_END
	 * flag of the right sibling and initialize the cycle_id to 0 for the
	 * same page.  See btree_xlog_split() for details.
	 */
	maskopaq->btpo_flags &= ~BTP_SPLIT_END;
	maskopaq->btpo_cycleid = 0;
}