1 files changed, 351 insertions, 0 deletions
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h
new file mode 100644
index 0000000..0f77318
--- /dev/null
+++ b/src/include/access/nbtxlog.h
@@ -0,0 +1,351 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtxlog.h
+ *	  header file for postgres btree xlog routines
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/nbtxlog.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef NBTXLOG_H
+#define NBTXLOG_H
+
+#include "access/transam.h"
+#include "access/xlogreader.h"
+#include "lib/stringinfo.h"
+#include "storage/off.h"
+
+/*
+ * XLOG records for btree operations
+ *
+ * XLOG allows to store some information in high 4 bits of log
+ * record xl_info field
+ */
+#define XLOG_BTREE_INSERT_LEAF	0x00	/* add index tuple without split */
+#define XLOG_BTREE_INSERT_UPPER 0x10	/* same, on a non-leaf page */
+#define XLOG_BTREE_INSERT_META	0x20	/* same, plus update metapage */
+#define XLOG_BTREE_SPLIT_L		0x30	/* add index tuple with split */
+#define XLOG_BTREE_SPLIT_R		0x40	/* as above, new item on right */
+#define XLOG_BTREE_INSERT_POST	0x50	/* add index tuple with posting split */
+#define XLOG_BTREE_DEDUP		0x60	/* deduplicate tuples for a page */
+#define XLOG_BTREE_DELETE		0x70	/* delete leaf index tuples for a page */
+#define XLOG_BTREE_UNLINK_PAGE	0x80	/* delete a half-dead page */
+#define XLOG_BTREE_UNLINK_PAGE_META 0x90	/* same, and update metapage */
+#define XLOG_BTREE_NEWROOT		0xA0	/* new root page */
+#define XLOG_BTREE_MARK_PAGE_HALFDEAD 0xB0	/* mark a leaf as half-dead */
+#define XLOG_BTREE_VACUUM		0xC0	/* delete entries on a page during
+										 * vacuum */
+#define XLOG_BTREE_REUSE_PAGE	0xD0	/* old page is about to be reused from
+										 * FSM */
+#define XLOG_BTREE_META_CLEANUP	0xE0	/* update cleanup-related data in the
+										 * metapage */
+
+/*
+ * All that we need to regenerate the meta-data page
+ */
+typedef struct xl_btree_metadata
+{
+	uint32		version;
+	BlockNumber root;
+	uint32		level;
+	BlockNumber fastroot;
+	uint32		fastlevel;
+	uint32		last_cleanup_num_delpages;
+	bool		allequalimage;
+} xl_btree_metadata;
+
+/*
+ * This is what we need to know about simple (without split) insert.
+ *
+ * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META, and
+ * INSERT_POST.  Note that INSERT_META and INSERT_UPPER implies it's not a
+ * leaf page, while INSERT_POST and INSERT_LEAF imply that it must be a leaf
+ * page.
+ *
+ * Backup Blk 0: original page
+ * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META
+ * Backup Blk 2: xl_btree_metadata, if INSERT_META
+ *
+ * Note: The new tuple is actually the "original" new item in the posting
+ * list split insert case (i.e. the INSERT_POST case).  A split offset for
+ * the posting list is logged before the original new item.  Recovery needs
+ * both, since it must do an in-place update of the existing posting list
+ * that was split as an extra step.  Also, recovery generates a "final"
+ * newitem.  See _bt_swap_posting() for details on posting list splits.
+ */
+typedef struct xl_btree_insert
+{
+	OffsetNumber offnum;
+
+	/* POSTING SPLIT OFFSET FOLLOWS (INSERT_POST case) */
+	/* NEW TUPLE ALWAYS FOLLOWS AT THE END */
+} xl_btree_insert;
+
+#define SizeOfBtreeInsert	(offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber))
+
+/*
+ * On insert with split, we save all the items going into the right sibling
+ * so that we can restore it completely from the log record.  This way takes
+ * less xlog space than the normal approach, because if we did it standardly,
+ * XLogInsert would almost always think the right page is new and store its
+ * whole page image.  The left page, however, is handled in the normal
+ * incremental-update fashion.
+ *
+ * Note: XLOG_BTREE_SPLIT_L and XLOG_BTREE_SPLIT_R share this data record.
+ * There are two variants to indicate whether the inserted tuple went into the
+ * left or right split page (and thus, whether the new item is stored or not).
+ * We always log the left page high key because suffix truncation can generate
+ * a new leaf high key using user-defined code.  This is also necessary on
+ * internal pages, since the firstright item that the left page's high key was
+ * based on will have been truncated to zero attributes in the right page (the
+ * separator key is unavailable from the right page).
+ *
+ * Backup Blk 0: original page / new left page
+ *
+ * The left page's data portion contains the new item, if it's the _L variant.
+ * _R variant split records generally do not have a newitem (_R variant leaf
+ * page split records that must deal with a posting list split will include an
+ * explicit newitem, though it is never used on the right page -- it is
+ * actually an orignewitem needed to update existing posting list).  The new
+ * high key of the left/original page appears last of all (and must always be
+ * present).
+ *
+ * Page split records that need the REDO routine to deal with a posting list
+ * split directly will have an explicit newitem, which is actually an
+ * orignewitem (the newitem as it was before the posting list split, not
+ * after).  A posting list split always has a newitem that comes immediately
+ * after the posting list being split (which would have overlapped with
+ * orignewitem prior to split).  Usually REDO must deal with posting list
+ * splits with an _L variant page split record, and usually both the new
+ * posting list and the final newitem go on the left page (the existing
+ * posting list will be inserted instead of the old, and the final newitem
+ * will be inserted next to that).  However, _R variant split records will
+ * include an orignewitem when the split point for the page happens to have a
+ * lastleft tuple that is also the posting list being split (leaving newitem
+ * as the page split's firstright tuple).  The existence of this corner case
+ * does not change the basic fact about newitem/orignewitem for the REDO
+ * routine: it is always state used for the left page alone.  (This is why the
+ * record's postingoff field isn't a reliable indicator of whether or not a
+ * posting list split occurred during the page split; a non-zero value merely
+ * indicates that the REDO routine must reconstruct a new posting list tuple
+ * that is needed for the left page.)
+ *
+ * This posting list split handling is equivalent to the xl_btree_insert REDO
+ * routine's INSERT_POST handling.  While the details are more complicated
+ * here, the concept and goals are exactly the same.  See _bt_swap_posting()
+ * for details on posting list splits.
+ *
+ * Backup Blk 1: new right page
+ *
+ * The right page's data portion contains the right page's tuples in the form
+ * used by _bt_restore_page.  This includes the new item, if it's the _R
+ * variant.  The right page's tuples also include the right page's high key
+ * with either variant (moved from the left/original page during the split),
+ * unless the split happened to be of the rightmost page on its level, where
+ * there is no high key for new right page.
+ *
+ * Backup Blk 2: next block (orig page's rightlink), if any
+ * Backup Blk 3: child's left sibling, if non-leaf split
+ */
+typedef struct xl_btree_split
+{
+	uint32		level;			/* tree level of page being split */
+	OffsetNumber firstrightoff; /* first origpage item on rightpage */
+	OffsetNumber newitemoff;	/* new item's offset */
+	uint16		postingoff;		/* offset inside orig posting tuple */
+} xl_btree_split;
+
+#define SizeOfBtreeSplit	(offsetof(xl_btree_split, postingoff) + sizeof(uint16))
+
+/*
+ * When page is deduplicated, consecutive groups of tuples with equal keys are
+ * merged together into posting list tuples.
+ *
+ * The WAL record represents a deduplication pass for a leaf page.  An array
+ * of BTDedupInterval structs follows.
+ */
+typedef struct xl_btree_dedup
+{
+	uint16		nintervals;
+
+	/* DEDUPLICATION INTERVALS FOLLOW */
+} xl_btree_dedup;
+
+#define SizeOfBtreeDedup 	(offsetof(xl_btree_dedup, nintervals) + sizeof(uint16))
+
+/*
+ * This is what we need to know about page reuse within btree.  This record
+ * only exists to generate a conflict point for Hot Standby.
+ *
+ * Note that we must include a RelFileNode in the record because we don't
+ * actually register the buffer with the record.
+ */
+typedef struct xl_btree_reuse_page
+{
+	RelFileNode node;
+	BlockNumber block;
+	FullTransactionId latestRemovedFullXid;
+} xl_btree_reuse_page;
+
+#define SizeOfBtreeReusePage	(sizeof(xl_btree_reuse_page))
+
+/*
+ * xl_btree_vacuum and xl_btree_delete records describe deletion of index
+ * tuples on a leaf page.  The former variant is used by VACUUM, while the
+ * latter variant is used by the ad-hoc deletions that sometimes take place
+ * when btinsert() is called.
+ *
+ * The records are very similar.  The only difference is that xl_btree_delete
+ * has to include a latestRemovedXid field to generate recovery conflicts.
+ * (VACUUM operations can just rely on earlier conflicts generated during
+ * pruning of the table whose TIDs the to-be-deleted index tuples point to.
+ * There are also small differences between each REDO routine that we don't go
+ * into here.)
+ *
+ * xl_btree_vacuum and xl_btree_delete both represent deletion of any number
+ * of index tuples on a single leaf page using page offset numbers.  Both also
+ * support "updates" of index tuples, which is how deletes of a subset of TIDs
+ * contained in an existing posting list tuple are implemented.
+ *
+ * Updated posting list tuples are represented using xl_btree_update metadata.
+ * The REDO routines each use the xl_btree_update entries (plus each
+ * corresponding original index tuple from the target leaf page) to generate
+ * the final updated tuple.
+ *
+ * Updates are only used when there will be some remaining TIDs left by the
+ * REDO routine.  Otherwise the posting list tuple just gets deleted outright.
+ */
+typedef struct xl_btree_vacuum
+{
+	uint16		ndeleted;
+	uint16		nupdated;
+
+	/* DELETED TARGET OFFSET NUMBERS FOLLOW */
+	/* UPDATED TARGET OFFSET NUMBERS FOLLOW */
+	/* UPDATED TUPLES METADATA (xl_btree_update) ARRAY FOLLOWS */
+} xl_btree_vacuum;
+
+#define SizeOfBtreeVacuum	(offsetof(xl_btree_vacuum, nupdated) + sizeof(uint16))
+
+typedef struct xl_btree_delete
+{
+	TransactionId latestRemovedXid;
+	uint16		ndeleted;
+	uint16		nupdated;
+
+	/* DELETED TARGET OFFSET NUMBERS FOLLOW */
+	/* UPDATED TARGET OFFSET NUMBERS FOLLOW */
+	/* UPDATED TUPLES METADATA (xl_btree_update) ARRAY FOLLOWS */
+} xl_btree_delete;
+
+#define SizeOfBtreeDelete	(offsetof(xl_btree_delete, nupdated) + sizeof(uint16))
+
+/*
+ * The offsets that appear in xl_btree_update metadata are offsets into the
+ * original posting list from tuple, not page offset numbers.  These are
+ * 0-based.  The page offset number for the original posting list tuple comes
+ * from the main xl_btree_vacuum/xl_btree_delete record.
+ */
+typedef struct xl_btree_update
+{
+	uint16		ndeletedtids;
+
+	/* POSTING LIST uint16 OFFSETS TO A DELETED TID FOLLOW */
+} xl_btree_update;
+
+#define SizeOfBtreeUpdate	(offsetof(xl_btree_update, ndeletedtids) + sizeof(uint16))
+
+/*
+ * This is what we need to know about marking an empty subtree for deletion.
+ * The target identifies the tuple removed from the parent page (note that we
+ * remove this tuple's downlink and the *following* tuple's key).  Note that
+ * the leaf page is empty, so we don't need to store its content --- it is
+ * just reinitialized during recovery using the rest of the fields.
+ *
+ * Backup Blk 0: leaf block
+ * Backup Blk 1: top parent
+ */
+typedef struct xl_btree_mark_page_halfdead
+{
+	OffsetNumber poffset;		/* deleted tuple id in parent page */
+
+	/* information needed to recreate the leaf page: */
+	BlockNumber leafblk;		/* leaf block ultimately being deleted */
+	BlockNumber leftblk;		/* leaf block's left sibling, if any */
+	BlockNumber rightblk;		/* leaf block's right sibling */
+	BlockNumber topparent;		/* topmost internal page in the subtree */
+} xl_btree_mark_page_halfdead;
+
+#define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber))
+
+/*
+ * This is what we need to know about deletion of a btree page.  Note that we
+ * only leave behind a small amount of bookkeeping information in deleted
+ * pages (deleted pages must be kept around as tombstones for a while).  It is
+ * convenient for the REDO routine to regenerate its target page from scratch.
+ * This is why WAL record describes certain details that are actually directly
+ * available from the target page.
+ *
+ * Backup Blk 0: target block being deleted
+ * Backup Blk 1: target block's left sibling, if any
+ * Backup Blk 2: target block's right sibling
+ * Backup Blk 3: leaf block (if different from target)
+ * Backup Blk 4: metapage (if rightsib becomes new fast root)
+ */
+typedef struct xl_btree_unlink_page
+{
+	BlockNumber leftsib;		/* target block's left sibling, if any */
+	BlockNumber rightsib;		/* target block's right sibling */
+	uint32		level;			/* target block's level */
+	FullTransactionId safexid;	/* target block's BTPageSetDeleted() XID */
+
+	/*
+	 * Information needed to recreate a half-dead leaf page with correct
+	 * topparent link.  The fields are only used when deletion operation's
+	 * target page is an internal page.  REDO routine creates half-dead page
+	 * from scratch to keep things simple (this is the same convenient
+	 * approach used for the target page itself).
+	 */
+	BlockNumber leafleftsib;
+	BlockNumber leafrightsib;
+	BlockNumber leaftopparent;	/* next child down in the subtree */
+
+	/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */
+} xl_btree_unlink_page;
+
+#define SizeOfBtreeUnlinkPage	(offsetof(xl_btree_unlink_page, leaftopparent) + sizeof(BlockNumber))
+
+/*
+ * New root log record.  There are zero tuples if this is to establish an
+ * empty root, or two if it is the result of splitting an old root.
+ *
+ * Note that although this implies rewriting the metadata page, we don't need
+ * an xl_btree_metadata record --- the rootblk and level are sufficient.
+ *
+ * Backup Blk 0: new root page (2 tuples as payload, if splitting old root)
+ * Backup Blk 1: left child (if splitting an old root)
+ * Backup Blk 2: metapage
+ */
+typedef struct xl_btree_newroot
+{
+	BlockNumber rootblk;		/* location of new root (redundant with blk 0) */
+	uint32		level;			/* its tree level */
+} xl_btree_newroot;
+
+#define SizeOfBtreeNewroot	(offsetof(xl_btree_newroot, level) + sizeof(uint32))
+
+
+/*
+ * prototypes for functions in nbtxlog.c
+ */
+extern void btree_redo(XLogReaderState *record);
+extern void btree_desc(StringInfo buf, XLogReaderState *record);
+extern const char *btree_identify(uint8 info);
+extern void btree_xlog_startup(void);
+extern void btree_xlog_cleanup(void);
+extern void btree_mask(char *pagedata, BlockNumber blkno);
+
+#endif							/* NBTXLOG_H */