Diffstat (limited to 'src/include/access')
-rw-r--r--  src/include/access/amapi.h  290
-rw-r--r--  src/include/access/amvalidate.h  40
-rw-r--r--  src/include/access/attmap.h  52
-rw-r--r--  src/include/access/attnum.h  64
-rw-r--r--  src/include/access/brin.h  55
-rw-r--r--  src/include/access/brin_internal.h  115
-rw-r--r--  src/include/access/brin_page.h  96
-rw-r--r--  src/include/access/brin_pageops.h  38
-rw-r--r--  src/include/access/brin_revmap.h  41
-rw-r--r--  src/include/access/brin_tuple.h  110
-rw-r--r--  src/include/access/brin_xlog.h  151
-rw-r--r--  src/include/access/bufmask.h  32
-rw-r--r--  src/include/access/clog.h  63
-rw-r--r--  src/include/access/commit_ts.h  78
-rw-r--r--  src/include/access/detoast.h  82
-rw-r--r--  src/include/access/genam.h  231
-rw-r--r--  src/include/access/generic_xlog.h  45
-rw-r--r--  src/include/access/gin.h  78
-rw-r--r--  src/include/access/gin_private.h  500
-rw-r--r--  src/include/access/ginblock.h  346
-rw-r--r--  src/include/access/ginxlog.h  216
-rw-r--r--  src/include/access/gist.h  248
-rw-r--r--  src/include/access/gist_private.h  571
-rw-r--r--  src/include/access/gistscan.h  24
-rw-r--r--  src/include/access/gistxlog.h  114
-rw-r--r--  src/include/access/hash.h  483
-rw-r--r--  src/include/access/hash_xlog.h  267
-rw-r--r--  src/include/access/heapam.h  235
-rw-r--r--  src/include/access/heapam_xlog.h  419
-rw-r--r--  src/include/access/heaptoast.h  149
-rw-r--r--  src/include/access/hio.h  43
-rw-r--r--  src/include/access/htup.h  89
-rw-r--r--  src/include/access/htup_details.h  818
-rw-r--r--  src/include/access/itup.h  164
-rw-r--r--  src/include/access/multixact.h  164
-rw-r--r--  src/include/access/nbtree.h  1286
-rw-r--r--  src/include/access/nbtxlog.h  351
-rw-r--r--  src/include/access/parallel.h  82
-rw-r--r--  src/include/access/printsimple.h  23
-rw-r--r--  src/include/access/printtup.h  35
-rw-r--r--  src/include/access/relation.h  28
-rw-r--r--  src/include/access/reloptions.h  247
-rw-r--r--  src/include/access/relscan.h  191
-rw-r--r--  src/include/access/rewriteheap.h  57
-rw-r--r--  src/include/access/rmgr.h  35
-rw-r--r--  src/include/access/rmgrlist.h  49
-rw-r--r--  src/include/access/sdir.h  58
-rw-r--r--  src/include/access/session.h  44
-rw-r--r--  src/include/access/skey.h  151
-rw-r--r--  src/include/access/slru.h  174
-rw-r--r--  src/include/access/spgist.h  229
-rw-r--r--  src/include/access/spgist_private.h  548
-rw-r--r--  src/include/access/spgxlog.h  257
-rw-r--r--  src/include/access/stratnum.h  85
-rw-r--r--  src/include/access/subtrans.h  29
-rw-r--r--  src/include/access/syncscan.h  25
-rw-r--r--  src/include/access/sysattr.h  29
-rw-r--r--  src/include/access/table.h  28
-rw-r--r--  src/include/access/tableam.h  2075
-rw-r--r--  src/include/access/timeline.h  44
-rw-r--r--  src/include/access/toast_compression.h  73
-rw-r--r--  src/include/access/toast_helper.h  116
-rw-r--r--  src/include/access/toast_internals.h  63
-rw-r--r--  src/include/access/transam.h  370
-rw-r--r--  src/include/access/tsmapi.h  82
-rw-r--r--  src/include/access/tupconvert.h  51
-rw-r--r--  src/include/access/tupdesc.h  154
-rw-r--r--  src/include/access/tupdesc_details.h  28
-rw-r--r--  src/include/access/tupmacs.h  247
-rw-r--r--  src/include/access/twophase.h  63
-rw-r--r--  src/include/access/twophase_rmgr.h  40
-rw-r--r--  src/include/access/valid.h  69
-rw-r--r--  src/include/access/visibilitymap.h  42
-rw-r--r--  src/include/access/visibilitymapdefs.h  25
-rw-r--r--  src/include/access/xact.h  476
-rw-r--r--  src/include/access/xlog.h  406
-rw-r--r--  src/include/access/xlog_internal.h  336
-rw-r--r--  src/include/access/xlogarchive.h  35
-rw-r--r--  src/include/access/xlogdefs.h  116
-rw-r--r--  src/include/access/xloginsert.h  66
-rw-r--r--  src/include/access/xlogreader.h  340
-rw-r--r--  src/include/access/xlogrecord.h  229
-rw-r--r--  src/include/access/xlogutils.h  63
83 files changed, 16161 insertions, 0 deletions
diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h
new file mode 100644
index 0000000..d357ebb
--- /dev/null
+++ b/src/include/access/amapi.h
@@ -0,0 +1,290 @@
+/*-------------------------------------------------------------------------
+ *
+ * amapi.h
+ * API for Postgres index access methods.
+ *
+ * Copyright (c) 2015-2021, PostgreSQL Global Development Group
+ *
+ * src/include/access/amapi.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef AMAPI_H
+#define AMAPI_H
+
+#include "access/genam.h"
+
+/*
+ * We don't wish to include planner header files here, since most of an index
+ * AM's implementation isn't concerned with those data structures. To allow
+ * declaring amcostestimate_function here, use forward struct references.
+ */
+struct PlannerInfo;
+struct IndexPath;
+
+/* Likewise, this file shouldn't depend on execnodes.h. */
+struct IndexInfo;
+
+
+/*
+ * Properties for amproperty API. This list covers properties known to the
+ * core code, but an index AM can define its own properties, by matching the
+ * string property name.
+ */
+typedef enum IndexAMProperty
+{
+ AMPROP_UNKNOWN = 0, /* anything not known to core code */
+ AMPROP_ASC, /* column properties */
+ AMPROP_DESC,
+ AMPROP_NULLS_FIRST,
+ AMPROP_NULLS_LAST,
+ AMPROP_ORDERABLE,
+ AMPROP_DISTANCE_ORDERABLE,
+ AMPROP_RETURNABLE,
+ AMPROP_SEARCH_ARRAY,
+ AMPROP_SEARCH_NULLS,
+ AMPROP_CLUSTERABLE, /* index properties */
+ AMPROP_INDEX_SCAN,
+ AMPROP_BITMAP_SCAN,
+ AMPROP_BACKWARD_SCAN,
+ AMPROP_CAN_ORDER, /* AM properties */
+ AMPROP_CAN_UNIQUE,
+ AMPROP_CAN_MULTI_COL,
+ AMPROP_CAN_EXCLUDE,
+ AMPROP_CAN_INCLUDE
+} IndexAMProperty;
+
+/*
+ * We use lists of this struct type to keep track of both operators and
+ * support functions while building or adding to an opclass or opfamily.
+ * amadjustmembers functions receive lists of these structs, and are allowed
+ * to alter their "ref" fields.
+ *
+ * The "ref" fields define how the pg_amop or pg_amproc entry should depend
+ * on the associated objects (that is, which dependency type to use, and
+ * which opclass or opfamily it should depend on).
+ *
+ * If ref_is_hard is true, the entry will have a NORMAL dependency on the
+ * operator or support func, and an INTERNAL dependency on the opclass or
+ * opfamily. This forces the opclass or opfamily to be dropped if the
+ * operator or support func is dropped, and requires the CASCADE option
+ * to do so. Nor will ALTER OPERATOR FAMILY DROP be allowed. This is
+ * the right behavior for objects that are essential to an opclass.
+ *
+ * If ref_is_hard is false, the entry will have an AUTO dependency on the
+ * operator or support func, and also an AUTO dependency on the opclass or
+ * opfamily. This allows ALTER OPERATOR FAMILY DROP, and causes that to
+ * happen automatically if the operator or support func is dropped. This
+ * is the right behavior for inessential ("loose") objects.
+ */
+typedef struct OpFamilyMember
+{
+ bool is_func; /* is this an operator, or support func? */
+ Oid object; /* operator or support func's OID */
+ int number; /* strategy or support func number */
+ Oid lefttype; /* left input data type */
+ Oid righttype; /* right input data type */
+ Oid sortfamily; /* ordering operator's sort opfamily, or 0 */
+ bool ref_is_hard; /* hard or soft dependency? */
+ bool ref_is_family; /* is dependency on opclass or opfamily? */
+ Oid refobjid; /* OID of opclass or opfamily */
+} OpFamilyMember;
+
+
+/*
+ * Callback function signatures --- see indexam.sgml for more info.
+ */
+
+/* build new index */
+typedef IndexBuildResult *(*ambuild_function) (Relation heapRelation,
+ Relation indexRelation,
+ struct IndexInfo *indexInfo);
+
+/* build empty index */
+typedef void (*ambuildempty_function) (Relation indexRelation);
+
+/* insert this tuple */
+typedef bool (*aminsert_function) (Relation indexRelation,
+ Datum *values,
+ bool *isnull,
+ ItemPointer heap_tid,
+ Relation heapRelation,
+ IndexUniqueCheck checkUnique,
+ bool indexUnchanged,
+ struct IndexInfo *indexInfo);
+
+/* bulk delete */
+typedef IndexBulkDeleteResult *(*ambulkdelete_function) (IndexVacuumInfo *info,
+ IndexBulkDeleteResult *stats,
+ IndexBulkDeleteCallback callback,
+ void *callback_state);
+
+/* post-VACUUM cleanup */
+typedef IndexBulkDeleteResult *(*amvacuumcleanup_function) (IndexVacuumInfo *info,
+ IndexBulkDeleteResult *stats);
+
+/* can indexscan return IndexTuples? */
+typedef bool (*amcanreturn_function) (Relation indexRelation, int attno);
+
+/* estimate cost of an indexscan */
+typedef void (*amcostestimate_function) (struct PlannerInfo *root,
+ struct IndexPath *path,
+ double loop_count,
+ Cost *indexStartupCost,
+ Cost *indexTotalCost,
+ Selectivity *indexSelectivity,
+ double *indexCorrelation,
+ double *indexPages);
+
+/* parse index reloptions */
+typedef bytea *(*amoptions_function) (Datum reloptions,
+ bool validate);
+
+/* report AM, index, or index column property */
+typedef bool (*amproperty_function) (Oid index_oid, int attno,
+ IndexAMProperty prop, const char *propname,
+ bool *res, bool *isnull);
+
+/* name of phase as used in progress reporting */
+typedef char *(*ambuildphasename_function) (int64 phasenum);
+
+/* validate definition of an opclass for this AM */
+typedef bool (*amvalidate_function) (Oid opclassoid);
+
+/* validate operators and support functions to be added to an opclass/family */
+typedef void (*amadjustmembers_function) (Oid opfamilyoid,
+ Oid opclassoid,
+ List *operators,
+ List *functions);
+
+/* prepare for index scan */
+typedef IndexScanDesc (*ambeginscan_function) (Relation indexRelation,
+ int nkeys,
+ int norderbys);
+
+/* (re)start index scan */
+typedef void (*amrescan_function) (IndexScanDesc scan,
+ ScanKey keys,
+ int nkeys,
+ ScanKey orderbys,
+ int norderbys);
+
+/* next valid tuple */
+typedef bool (*amgettuple_function) (IndexScanDesc scan,
+ ScanDirection direction);
+
+/* fetch all valid tuples */
+typedef int64 (*amgetbitmap_function) (IndexScanDesc scan,
+ TIDBitmap *tbm);
+
+/* end index scan */
+typedef void (*amendscan_function) (IndexScanDesc scan);
+
+/* mark current scan position */
+typedef void (*ammarkpos_function) (IndexScanDesc scan);
+
+/* restore marked scan position */
+typedef void (*amrestrpos_function) (IndexScanDesc scan);
+
+/*
+ * Callback function signatures - for parallel index scans.
+ */
+
+/* estimate size of parallel scan descriptor */
+typedef Size (*amestimateparallelscan_function) (void);
+
+/* prepare for parallel index scan */
+typedef void (*aminitparallelscan_function) (void *target);
+
+/* (re)start parallel index scan */
+typedef void (*amparallelrescan_function) (IndexScanDesc scan);
+
+/*
+ * API struct for an index AM. Note this must be stored in a single palloc'd
+ * chunk of memory.
+ */
+typedef struct IndexAmRoutine
+{
+ NodeTag type;
+
+ /*
+ * Total number of strategies (operators) by which we can traverse/search
+ * this AM. Zero if AM does not have a fixed set of strategy assignments.
+ */
+ uint16 amstrategies;
+ /* total number of support functions that this AM uses */
+ uint16 amsupport;
+ /* opclass options support function number or 0 */
+ uint16 amoptsprocnum;
+ /* does AM support ORDER BY indexed column's value? */
+ bool amcanorder;
+ /* does AM support ORDER BY result of an operator on indexed column? */
+ bool amcanorderbyop;
+ /* does AM support backward scanning? */
+ bool amcanbackward;
+ /* does AM support UNIQUE indexes? */
+ bool amcanunique;
+ /* does AM support multi-column indexes? */
+ bool amcanmulticol;
+ /* does AM require scans to have a constraint on the first index column? */
+ bool amoptionalkey;
+ /* does AM handle ScalarArrayOpExpr quals? */
+ bool amsearcharray;
+ /* does AM handle IS NULL/IS NOT NULL quals? */
+ bool amsearchnulls;
+ /* can index storage data type differ from column data type? */
+ bool amstorage;
+ /* can an index of this type be clustered on? */
+ bool amclusterable;
+ /* does AM handle predicate locks? */
+ bool ampredlocks;
+ /* does AM support parallel scan? */
+ bool amcanparallel;
+ /* does AM support columns included with clause INCLUDE? */
+ bool amcaninclude;
+ /* does AM use maintenance_work_mem? */
+ bool amusemaintenanceworkmem;
+ /* OR of parallel vacuum flags. See vacuum.h for flags. */
+ uint8 amparallelvacuumoptions;
+ /* type of data stored in index, or InvalidOid if variable */
+ Oid amkeytype;
+
+ /*
+ * If you add new properties to either the above or the below lists, then
+ * they should also (usually) be exposed via the property API (see
+ * IndexAMProperty at the top of the file, and utils/adt/amutils.c).
+ */
+
+ /* interface functions */
+ ambuild_function ambuild;
+ ambuildempty_function ambuildempty;
+ aminsert_function aminsert;
+ ambulkdelete_function ambulkdelete;
+ amvacuumcleanup_function amvacuumcleanup;
+ amcanreturn_function amcanreturn; /* can be NULL */
+ amcostestimate_function amcostestimate;
+ amoptions_function amoptions;
+ amproperty_function amproperty; /* can be NULL */
+ ambuildphasename_function ambuildphasename; /* can be NULL */
+ amvalidate_function amvalidate;
+ amadjustmembers_function amadjustmembers; /* can be NULL */
+ ambeginscan_function ambeginscan;
+ amrescan_function amrescan;
+ amgettuple_function amgettuple; /* can be NULL */
+ amgetbitmap_function amgetbitmap; /* can be NULL */
+ amendscan_function amendscan;
+ ammarkpos_function ammarkpos; /* can be NULL */
+ amrestrpos_function amrestrpos; /* can be NULL */
+
+ /* interface functions to support parallel index scans */
+ amestimateparallelscan_function amestimateparallelscan; /* can be NULL */
+ aminitparallelscan_function aminitparallelscan; /* can be NULL */
+ amparallelrescan_function amparallelrescan; /* can be NULL */
+} IndexAmRoutine;
+
+
+/* Functions in access/index/amapi.c */
+extern IndexAmRoutine *GetIndexAmRoutine(Oid amhandler);
+extern IndexAmRoutine *GetIndexAmRoutineByAmId(Oid amoid, bool noerror);
+
+#endif /* AMAPI_H */
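Editor's note: GetIndexAmRoutine() above expects an AM's SQL-visible handler function to return a palloc'd, filled-in IndexAmRoutine. The following is a minimal, hypothetical sketch of such a handler; the myam_* callback names are placeholders assumed to be declared elsewhere in the extension (none of them exist in core), and only a subset of the capability flags is shown. makeNode() zeroes the struct, so any field not set explicitly is 0/NULL.

    #include "postgres.h"

    #include "access/amapi.h"
    #include "fmgr.h"
    #include "nodes/nodes.h"

    PG_FUNCTION_INFO_V1(myam_handler);

    Datum
    myam_handler(PG_FUNCTION_ARGS)
    {
        IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

        /* capability flags (small subset shown; a real AM sets them all) */
        amroutine->amstrategies = 0;        /* no fixed strategy numbers */
        amroutine->amsupport = 1;
        amroutine->amoptsprocnum = 0;
        amroutine->amcanorder = false;
        amroutine->amcanmulticol = true;
        amroutine->amsearchnulls = false;
        amroutine->amkeytype = InvalidOid;

        /* mandatory callbacks, implemented elsewhere in the extension */
        amroutine->ambuild = myam_build;
        amroutine->ambuildempty = myam_buildempty;
        amroutine->aminsert = myam_insert;
        amroutine->ambulkdelete = myam_bulkdelete;
        amroutine->amvacuumcleanup = myam_vacuumcleanup;
        amroutine->amcostestimate = myam_costestimate;
        amroutine->amoptions = myam_options;
        amroutine->amvalidate = myam_validate;
        amroutine->ambeginscan = myam_beginscan;
        amroutine->amrescan = myam_rescan;
        amroutine->amgetbitmap = myam_getbitmap;    /* and/or amgettuple */
        amroutine->amendscan = myam_endscan;

        /* optional callbacks left NULL are simply never called */
        amroutine->amcanreturn = NULL;
        amroutine->amproperty = NULL;

        PG_RETURN_POINTER(amroutine);
    }

At the SQL level, a handler like this is what a CREATE ACCESS METHOD ... TYPE INDEX HANDLER declaration would point at.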
diff --git a/src/include/access/amvalidate.h b/src/include/access/amvalidate.h
new file mode 100644
index 0000000..df02fba
--- /dev/null
+++ b/src/include/access/amvalidate.h
@@ -0,0 +1,40 @@
+/*-------------------------------------------------------------------------
+ *
+ * amvalidate.h
+ * Support routines for index access methods' amvalidate and
+ * amadjustmembers functions.
+ *
+ * Copyright (c) 2016-2021, PostgreSQL Global Development Group
+ *
+ * src/include/access/amvalidate.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef AMVALIDATE_H
+#define AMVALIDATE_H
+
+#include "utils/catcache.h"
+
+
+/* Struct returned (in a list) by identify_opfamily_groups() */
+typedef struct OpFamilyOpFuncGroup
+{
+ Oid lefttype; /* amoplefttype/amproclefttype */
+ Oid righttype; /* amoprighttype/amprocrighttype */
+ uint64 operatorset; /* bitmask of operators with these types */
+ uint64 functionset; /* bitmask of support funcs with these types */
+} OpFamilyOpFuncGroup;
+
+
+/* Functions in access/index/amvalidate.c */
+extern List *identify_opfamily_groups(CatCList *oprlist, CatCList *proclist);
+extern bool check_amproc_signature(Oid funcid, Oid restype, bool exact,
+ int minargs, int maxargs,...);
+extern bool check_amoptsproc_signature(Oid funcid);
+extern bool check_amop_signature(Oid opno, Oid restype,
+ Oid lefttype, Oid righttype);
+extern Oid opclass_for_family_datatype(Oid amoid, Oid opfamilyoid,
+ Oid datatypeoid);
+extern bool opfamily_can_sort_type(Oid opfamilyoid, Oid datatypeoid);
+
+#endif /* AMVALIDATE_H */
diff --git a/src/include/access/attmap.h b/src/include/access/attmap.h
new file mode 100644
index 0000000..778fa27
--- /dev/null
+++ b/src/include/access/attmap.h
@@ -0,0 +1,52 @@
+/*-------------------------------------------------------------------------
+ *
+ * attmap.h
+ * Definitions for PostgreSQL attribute mappings
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/attmap.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef ATTMAP_H
+#define ATTMAP_H
+
+#include "access/attnum.h"
+#include "access/tupdesc.h"
+
+/*
+ * Attribute mapping structure
+ *
+ * This maps attribute numbers between a pair of relations, designated
+ * 'input' and 'output' (most typically inheritance parent and child
+ * relations), whose common columns may have different attribute numbers.
+ * Such difference may arise due to the columns being ordered differently
+ * in the two relations or the two relations having dropped columns at
+ * different positions.
+ *
+ * 'maplen' is set to the number of attributes of the 'output' relation,
+ * taking into account any of its dropped attributes, with the corresponding
+ * elements of the 'attnums' array set to 0.
+ */
+typedef struct AttrMap
+{
+ AttrNumber *attnums;
+ int maplen;
+} AttrMap;
+
+extern AttrMap *make_attrmap(int maplen);
+extern void free_attrmap(AttrMap *map);
+
+/* Conversion routines to build mappings */
+extern AttrMap *build_attrmap_by_name(TupleDesc indesc,
+ TupleDesc outdesc);
+extern AttrMap *build_attrmap_by_name_if_req(TupleDesc indesc,
+ TupleDesc outdesc);
+extern AttrMap *build_attrmap_by_position(TupleDesc indesc,
+ TupleDesc outdesc,
+ const char *msg);
+
+#endif /* ATTMAP_H */
diff --git a/src/include/access/attnum.h b/src/include/access/attnum.h
new file mode 100644
index 0000000..0c43e26
--- /dev/null
+++ b/src/include/access/attnum.h
@@ -0,0 +1,64 @@
+/*-------------------------------------------------------------------------
+ *
+ * attnum.h
+ * POSTGRES attribute number definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/attnum.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef ATTNUM_H
+#define ATTNUM_H
+
+
+/*
+ * user defined attribute numbers start at 1. -ay 2/95
+ */
+typedef int16 AttrNumber;
+
+#define InvalidAttrNumber 0
+#define MaxAttrNumber 32767
+
+/* ----------------
+ * support macros
+ * ----------------
+ */
+/*
+ * AttributeNumberIsValid
+ * True iff the attribute number is valid.
+ */
+#define AttributeNumberIsValid(attributeNumber) \
+ ((bool) ((attributeNumber) != InvalidAttrNumber))
+
+/*
+ * AttrNumberIsForUserDefinedAttr
+ * True iff the attribute number corresponds to a user-defined attribute.
+ */
+#define AttrNumberIsForUserDefinedAttr(attributeNumber) \
+ ((bool) ((attributeNumber) > 0))
+
+/*
+ * AttrNumberGetAttrOffset
+ * Returns the attribute offset for an attribute number.
+ *
+ * Note:
+ * Assumes the attribute number is for a user defined attribute.
+ */
+#define AttrNumberGetAttrOffset(attNum) \
+( \
+ AssertMacro(AttrNumberIsForUserDefinedAttr(attNum)), \
+ ((attNum) - 1) \
+)
+
+/*
+ * AttrOffsetGetAttrNumber
+ * Returns the attribute number for an attribute offset.
+ */
+#define AttrOffsetGetAttrNumber(attributeOffset) \
+ ((AttrNumber) (1 + (attributeOffset)))
+
+#endif /* ATTNUM_H */
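Editor's note: the offset macros above convert between 1-based attribute numbers and the 0-based indexes used for TupleDesc attribute arrays. A small illustration (not part of attnum.h; TupleDescAttr() and NameStr() come from tupdesc.h and c.h respectively):

    #include "postgres.h"

    #include "access/tupdesc.h"

    static const char *
    attribute_name(TupleDesc tupdesc, AttrNumber attnum)
    {
        /* user-defined attributes only; system columns are negative */
        Assert(AttrNumberIsForUserDefinedAttr(attnum));

        return NameStr(TupleDescAttr(tupdesc,
                                     AttrNumberGetAttrOffset(attnum))->attname);
    }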
diff --git a/src/include/access/brin.h b/src/include/access/brin.h
new file mode 100644
index 0000000..4e2be13
--- /dev/null
+++ b/src/include/access/brin.h
@@ -0,0 +1,55 @@
+/*
+ * AM-callable functions for BRIN indexes
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/include/access/brin.h
+ */
+#ifndef BRIN_H
+#define BRIN_H
+
+#include "nodes/execnodes.h"
+#include "utils/relcache.h"
+
+
+/*
+ * Storage type for BRIN's reloptions
+ */
+typedef struct BrinOptions
+{
+ int32 vl_len_; /* varlena header (do not touch directly!) */
+ BlockNumber pagesPerRange;
+ bool autosummarize;
+} BrinOptions;
+
+
+/*
+ * BrinStatsData represents stats data for planner use
+ */
+typedef struct BrinStatsData
+{
+ BlockNumber pagesPerRange;
+ BlockNumber revmapNumPages;
+} BrinStatsData;
+
+
+#define BRIN_DEFAULT_PAGES_PER_RANGE 128
+#define BrinGetPagesPerRange(relation) \
+ (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
+ relation->rd_rel->relam == BRIN_AM_OID), \
+ (relation)->rd_options ? \
+ ((BrinOptions *) (relation)->rd_options)->pagesPerRange : \
+ BRIN_DEFAULT_PAGES_PER_RANGE)
+#define BrinGetAutoSummarize(relation) \
+ (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
+ relation->rd_rel->relam == BRIN_AM_OID), \
+ (relation)->rd_options ? \
+ ((BrinOptions *) (relation)->rd_options)->autosummarize : \
+ false)
+
+
+extern void brinGetStats(Relation index, BrinStatsData *stats);
+
+#endif /* BRIN_H */
diff --git a/src/include/access/brin_internal.h b/src/include/access/brin_internal.h
new file mode 100644
index 0000000..79440eb
--- /dev/null
+++ b/src/include/access/brin_internal.h
@@ -0,0 +1,115 @@
+/*
+ * brin_internal.h
+ * internal declarations for BRIN indexes
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/include/access/brin_internal.h
+ */
+#ifndef BRIN_INTERNAL_H
+#define BRIN_INTERNAL_H
+
+#include "access/amapi.h"
+#include "storage/bufpage.h"
+#include "utils/typcache.h"
+
+
+/*
+ * A BrinDesc is a struct designed to enable decoding a BRIN tuple from the
+ * on-disk format to an in-memory tuple and vice-versa.
+ */
+
+/* struct returned by "OpcInfo" amproc */
+typedef struct BrinOpcInfo
+{
+ /* Number of columns stored in an index column of this opclass */
+ uint16 oi_nstored;
+
+ /* Regular processing of NULLs in BrinValues? */
+ bool oi_regular_nulls;
+
+ /* Opaque pointer for the opclass' private use */
+ void *oi_opaque;
+
+ /* Type cache entries of the stored columns */
+ TypeCacheEntry *oi_typcache[FLEXIBLE_ARRAY_MEMBER];
+} BrinOpcInfo;
+
+/* the size of a BrinOpcInfo for the given number of columns */
+#define SizeofBrinOpcInfo(ncols) \
+ (offsetof(BrinOpcInfo, oi_typcache) + sizeof(TypeCacheEntry *) * ncols)
+
+typedef struct BrinDesc
+{
+ /* Containing memory context */
+ MemoryContext bd_context;
+
+ /* the index relation itself */
+ Relation bd_index;
+
+ /* tuple descriptor of the index relation */
+ TupleDesc bd_tupdesc;
+
+ /* cached copy for on-disk tuples; generated at first use */
+ TupleDesc bd_disktdesc;
+
+ /* total number of Datum entries that are stored on-disk for all columns */
+ int bd_totalstored;
+
+ /* per-column info; bd_tupdesc->natts entries long */
+ BrinOpcInfo *bd_info[FLEXIBLE_ARRAY_MEMBER];
+} BrinDesc;
+
+/*
+ * Globally-known function support numbers for BRIN indexes. Individual
+ * opclasses can define more function support numbers, which must fall into
+ * BRIN_FIRST_OPTIONAL_PROCNUM .. BRIN_LAST_OPTIONAL_PROCNUM.
+ */
+#define BRIN_PROCNUM_OPCINFO 1
+#define BRIN_PROCNUM_ADDVALUE 2
+#define BRIN_PROCNUM_CONSISTENT 3
+#define BRIN_PROCNUM_UNION 4
+#define BRIN_MANDATORY_NPROCS 4
+#define BRIN_PROCNUM_OPTIONS 5 /* optional */
+/* procedure numbers up to 10 are reserved for BRIN future expansion */
+#define BRIN_FIRST_OPTIONAL_PROCNUM 11
+#define BRIN_LAST_OPTIONAL_PROCNUM 15
+
+#undef BRIN_DEBUG
+
+#ifdef BRIN_DEBUG
+#define BRIN_elog(args) elog args
+#else
+#define BRIN_elog(args) ((void) 0)
+#endif
+
+/* brin.c */
+extern BrinDesc *brin_build_desc(Relation rel);
+extern void brin_free_desc(BrinDesc *bdesc);
+extern IndexBuildResult *brinbuild(Relation heap, Relation index,
+ struct IndexInfo *indexInfo);
+extern void brinbuildempty(Relation index);
+extern bool brininsert(Relation idxRel, Datum *values, bool *nulls,
+ ItemPointer heaptid, Relation heapRel,
+ IndexUniqueCheck checkUnique,
+ bool indexUnchanged,
+ struct IndexInfo *indexInfo);
+extern IndexScanDesc brinbeginscan(Relation r, int nkeys, int norderbys);
+extern int64 bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
+extern void brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
+ ScanKey orderbys, int norderbys);
+extern void brinendscan(IndexScanDesc scan);
+extern IndexBulkDeleteResult *brinbulkdelete(IndexVacuumInfo *info,
+ IndexBulkDeleteResult *stats,
+ IndexBulkDeleteCallback callback,
+ void *callback_state);
+extern IndexBulkDeleteResult *brinvacuumcleanup(IndexVacuumInfo *info,
+ IndexBulkDeleteResult *stats);
+extern bytea *brinoptions(Datum reloptions, bool validate);
+
+/* brin_validate.c */
+extern bool brinvalidate(Oid opclassoid);
+
+#endif /* BRIN_INTERNAL_H */
diff --git a/src/include/access/brin_page.h b/src/include/access/brin_page.h
new file mode 100644
index 0000000..75de538
--- /dev/null
+++ b/src/include/access/brin_page.h
@@ -0,0 +1,96 @@
+/*
+ * brin_page.h
+ * Prototypes and definitions for BRIN page layouts
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/include/access/brin_page.h
+ *
+ * NOTES
+ *
+ * These structs should really be private to specific BRIN files, but it's
+ * useful to have them here so that they can be used by pageinspect and similar
+ * tools.
+ */
+#ifndef BRIN_PAGE_H
+#define BRIN_PAGE_H
+
+#include "storage/block.h"
+#include "storage/itemptr.h"
+
+/*
+ * Special area of BRIN pages.
+ *
+ * We define it in this odd way so that it always occupies the last
+ * MAXALIGN-sized element of each page.
+ */
+typedef struct BrinSpecialSpace
+{
+ uint16 vector[MAXALIGN(1) / sizeof(uint16)];
+} BrinSpecialSpace;
+
+/*
+ * Make the page type be the last half-word in the page, for consumption by
+ * pg_filedump and similar utilities. We don't really care much about the
+ * position of the "flags" half-word, but it's simpler to apply a consistent
+ * rule to both.
+ *
+ * See comments above GinPageOpaqueData.
+ */
+#define BrinPageType(page) \
+ (((BrinSpecialSpace *) \
+ PageGetSpecialPointer(page))->vector[MAXALIGN(1) / sizeof(uint16) - 1])
+
+#define BrinPageFlags(page) \
+ (((BrinSpecialSpace *) \
+ PageGetSpecialPointer(page))->vector[MAXALIGN(1) / sizeof(uint16) - 2])
+
+/* special space on all BRIN pages stores a "type" identifier */
+#define BRIN_PAGETYPE_META 0xF091
+#define BRIN_PAGETYPE_REVMAP 0xF092
+#define BRIN_PAGETYPE_REGULAR 0xF093
+
+#define BRIN_IS_META_PAGE(page) (BrinPageType(page) == BRIN_PAGETYPE_META)
+#define BRIN_IS_REVMAP_PAGE(page) (BrinPageType(page) == BRIN_PAGETYPE_REVMAP)
+#define BRIN_IS_REGULAR_PAGE(page) (BrinPageType(page) == BRIN_PAGETYPE_REGULAR)
+
+/* flags for BrinSpecialSpace */
+#define BRIN_EVACUATE_PAGE (1 << 0)
+
+
+/* Metapage definitions */
+typedef struct BrinMetaPageData
+{
+ uint32 brinMagic;
+ uint32 brinVersion;
+ BlockNumber pagesPerRange;
+ BlockNumber lastRevmapPage;
+} BrinMetaPageData;
+
+#define BRIN_CURRENT_VERSION 1
+#define BRIN_META_MAGIC 0xA8109CFA
+
+#define BRIN_METAPAGE_BLKNO 0
+
+/* Definitions for revmap pages */
+typedef struct RevmapContents
+{
+ /*
+ * This array will fill all available space on the page. It should be
+ * declared [FLEXIBLE_ARRAY_MEMBER], but for some reason you can't do that
+ * in an otherwise-empty struct.
+ */
+ ItemPointerData rm_tids[1];
+} RevmapContents;
+
+#define REVMAP_CONTENT_SIZE \
+ (BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - \
+ offsetof(RevmapContents, rm_tids) - \
+ MAXALIGN(sizeof(BrinSpecialSpace)))
+/* max num of items in the array */
+#define REVMAP_PAGE_MAXITEMS \
+ (REVMAP_CONTENT_SIZE / sizeof(ItemPointerData))
+
+#endif /* BRIN_PAGE_H */
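Editor's note: for orientation, with the default 8 kB BLCKSZ and 8-byte MAXALIGN (other build options change the numbers), the macros above work out to:

    REVMAP_CONTENT_SIZE  = 8192 - MAXALIGN(24) - 0 - MAXALIGN(8) = 8160 bytes
    REVMAP_PAGE_MAXITEMS = 8160 / sizeof(ItemPointerData) = 8160 / 6 = 1360

so with the default pagesPerRange of 128, a single revmap page addresses 1360 * 128 = 174080 heap blocks, roughly 1.4 GB of heap.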
diff --git a/src/include/access/brin_pageops.h b/src/include/access/brin_pageops.h
new file mode 100644
index 0000000..c2e8a2a
--- /dev/null
+++ b/src/include/access/brin_pageops.h
@@ -0,0 +1,38 @@
+/*
+ * brin_pageops.h
+ * Prototypes for operating on BRIN pages.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/include/access/brin_pageops.h
+ */
+#ifndef BRIN_PAGEOPS_H
+#define BRIN_PAGEOPS_H
+
+#include "access/brin_revmap.h"
+
+extern bool brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
+ BrinRevmap *revmap, BlockNumber heapBlk,
+ Buffer oldbuf, OffsetNumber oldoff,
+ const BrinTuple *origtup, Size origsz,
+ const BrinTuple *newtup, Size newsz,
+ bool samepage);
+extern bool brin_can_do_samepage_update(Buffer buffer, Size origsz,
+ Size newsz);
+extern OffsetNumber brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
+ BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk,
+ BrinTuple *tup, Size itemsz);
+
+extern void brin_page_init(Page page, uint16 type);
+extern void brin_metapage_init(Page page, BlockNumber pagesPerRange,
+ uint16 version);
+
+extern bool brin_start_evacuating_page(Relation idxRel, Buffer buf);
+extern void brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
+ BrinRevmap *revmap, Buffer buf);
+
+extern void brin_page_cleanup(Relation idxrel, Buffer buf);
+
+#endif /* BRIN_PAGEOPS_H */
diff --git a/src/include/access/brin_revmap.h b/src/include/access/brin_revmap.h
new file mode 100644
index 0000000..4259fe8
--- /dev/null
+++ b/src/include/access/brin_revmap.h
@@ -0,0 +1,41 @@
+/*
+ * brin_revmap.h
+ * Prototypes for BRIN reverse range maps
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/include/access/brin_revmap.h
+ */
+
+#ifndef BRIN_REVMAP_H
+#define BRIN_REVMAP_H
+
+#include "access/brin_tuple.h"
+#include "storage/block.h"
+#include "storage/buf.h"
+#include "storage/itemptr.h"
+#include "storage/off.h"
+#include "utils/relcache.h"
+#include "utils/snapshot.h"
+
+/* struct definition lives in brin_revmap.c */
+typedef struct BrinRevmap BrinRevmap;
+
+extern BrinRevmap *brinRevmapInitialize(Relation idxrel,
+ BlockNumber *pagesPerRange, Snapshot snapshot);
+extern void brinRevmapTerminate(BrinRevmap *revmap);
+
+extern void brinRevmapExtend(BrinRevmap *revmap,
+ BlockNumber heapBlk);
+extern Buffer brinLockRevmapPageForUpdate(BrinRevmap *revmap,
+ BlockNumber heapBlk);
+extern void brinSetHeapBlockItemptr(Buffer buf, BlockNumber pagesPerRange,
+ BlockNumber heapBlk, ItemPointerData tid);
+extern BrinTuple *brinGetTupleForHeapBlock(BrinRevmap *revmap,
+ BlockNumber heapBlk, Buffer *buf, OffsetNumber *off,
+ Size *size, int mode, Snapshot snapshot);
+extern bool brinRevmapDesummarizeRange(Relation idxrel, BlockNumber heapBlk);
+
+#endif /* BRIN_REVMAP_H */
diff --git a/src/include/access/brin_tuple.h b/src/include/access/brin_tuple.h
new file mode 100644
index 0000000..c80341f
--- /dev/null
+++ b/src/include/access/brin_tuple.h
@@ -0,0 +1,110 @@
+/*
+ * brin_tuple.h
+ * Declarations for dealing with BRIN-specific tuples.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/include/access/brin_tuple.h
+ */
+#ifndef BRIN_TUPLE_H
+#define BRIN_TUPLE_H
+
+#include "access/brin_internal.h"
+#include "access/tupdesc.h"
+
+/*
+ * BRIN opclasses may register a serialization callback, in case the on-disk
+ * and in-memory representations differ (e.g. for performance reasons).
+ */
+typedef void (*brin_serialize_callback_type) (BrinDesc *bdesc, Datum src, Datum *dst);
+
+/*
+ * A BRIN index stores one index tuple per page range. Each index tuple
+ * has one BrinValues struct for each indexed column; in turn, each BrinValues
+ * has (besides the null flags) an array of Datum whose size is determined by
+ * the opclass.
+ */
+typedef struct BrinValues
+{
+ AttrNumber bv_attno; /* index attribute number */
+ bool bv_hasnulls; /* are there any nulls in the page range? */
+ bool bv_allnulls; /* are all values nulls in the page range? */
+ Datum *bv_values; /* current accumulated values */
+ Datum bv_mem_value; /* expanded accumulated values */
+ MemoryContext bv_context;
+ brin_serialize_callback_type bv_serialize;
+} BrinValues;
+
+/*
+ * This struct is used to represent an in-memory index tuple. The values can
+ * only be meaningfully decoded with an appropriate BrinDesc.
+ */
+typedef struct BrinMemTuple
+{
+ bool bt_placeholder; /* this is a placeholder tuple */
+ BlockNumber bt_blkno; /* heap blkno that the tuple is for */
+ MemoryContext bt_context; /* memcxt holding the bt_columns values */
+ /* output arrays for brin_deform_tuple: */
+ Datum *bt_values; /* values array */
+ bool *bt_allnulls; /* allnulls array */
+ bool *bt_hasnulls; /* hasnulls array */
+ /* not an output array, but must be last */
+ BrinValues bt_columns[FLEXIBLE_ARRAY_MEMBER];
+} BrinMemTuple;
+
+/*
+ * An on-disk BRIN tuple. This is possibly followed by a nulls bitmask, with
+ * room for 2 null bits (two bits for each indexed column); an opclass-defined
+ * number of Datum values for each column follow.
+ */
+typedef struct BrinTuple
+{
+ /* heap block number that the tuple is for */
+ BlockNumber bt_blkno;
+
+ /* ---------------
+ * bt_info is laid out in the following fashion:
+ *
+ * 7th (high) bit: has nulls
+ * 6th bit: is placeholder tuple
+ * 5th bit: unused
+ * 4-0 bit: offset of data
+ * ---------------
+ */
+ uint8 bt_info;
+} BrinTuple;
+
+#define SizeOfBrinTuple (offsetof(BrinTuple, bt_info) + sizeof(uint8))
+
+/*
+ * bt_info manipulation macros
+ */
+#define BRIN_OFFSET_MASK 0x1F
+/* bit 0x20 is not used at present */
+#define BRIN_PLACEHOLDER_MASK 0x40
+#define BRIN_NULLS_MASK 0x80
+
+#define BrinTupleDataOffset(tup) ((Size) (((BrinTuple *) (tup))->bt_info & BRIN_OFFSET_MASK))
+#define BrinTupleHasNulls(tup) (((((BrinTuple *) (tup))->bt_info & BRIN_NULLS_MASK)) != 0)
+#define BrinTupleIsPlaceholder(tup) (((((BrinTuple *) (tup))->bt_info & BRIN_PLACEHOLDER_MASK)) != 0)
+
+
+extern BrinTuple *brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno,
+ BrinMemTuple *tuple, Size *size);
+extern BrinTuple *brin_form_placeholder_tuple(BrinDesc *brdesc,
+ BlockNumber blkno, Size *size);
+extern void brin_free_tuple(BrinTuple *tuple);
+extern BrinTuple *brin_copy_tuple(BrinTuple *tuple, Size len,
+ BrinTuple *dest, Size *destsz);
+extern bool brin_tuples_equal(const BrinTuple *a, Size alen,
+ const BrinTuple *b, Size blen);
+
+extern BrinMemTuple *brin_new_memtuple(BrinDesc *brdesc);
+extern BrinMemTuple *brin_memtuple_initialize(BrinMemTuple *dtuple,
+ BrinDesc *brdesc);
+extern BrinMemTuple *brin_deform_tuple(BrinDesc *brdesc,
+ BrinTuple *tuple, BrinMemTuple *dMemtuple);
+
+#endif /* BRIN_TUPLE_H */
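Editor's note: a small illustration (not part of brin_tuple.h) of decoding the bt_info byte of an on-disk BRIN tuple with the macros above:

    #include "postgres.h"

    #include "access/brin_tuple.h"

    static void
    inspect_brin_tuple(BrinTuple *tup)
    {
        Size    data_off = BrinTupleDataOffset(tup);        /* low 5 bits */
        bool    has_nulls = BrinTupleHasNulls(tup);         /* BRIN_NULLS_MASK */
        bool    placeholder = BrinTupleIsPlaceholder(tup);  /* BRIN_PLACEHOLDER_MASK */

        elog(DEBUG1, "BRIN tuple for heap block %u: data at %zu, hasnulls=%d, placeholder=%d",
             tup->bt_blkno, data_off, has_nulls, placeholder);
    }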
diff --git a/src/include/access/brin_xlog.h b/src/include/access/brin_xlog.h
new file mode 100644
index 0000000..ace8aa0
--- /dev/null
+++ b/src/include/access/brin_xlog.h
@@ -0,0 +1,151 @@
+/*-------------------------------------------------------------------------
+ *
+ * brin_xlog.h
+ * POSTGRES BRIN access XLOG definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/brin_xlog.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BRIN_XLOG_H
+#define BRIN_XLOG_H
+
+#include "access/xlogreader.h"
+#include "lib/stringinfo.h"
+#include "storage/bufpage.h"
+#include "storage/itemptr.h"
+#include "storage/relfilenode.h"
+#include "utils/relcache.h"
+
+
+/*
+ * WAL record definitions for BRIN's WAL operations
+ *
+ * XLOG allows some information to be stored in the high 4 bits of the
+ * xl_info field of the log record.
+ */
+#define XLOG_BRIN_CREATE_INDEX 0x00
+#define XLOG_BRIN_INSERT 0x10
+#define XLOG_BRIN_UPDATE 0x20
+#define XLOG_BRIN_SAMEPAGE_UPDATE 0x30
+#define XLOG_BRIN_REVMAP_EXTEND 0x40
+#define XLOG_BRIN_DESUMMARIZE 0x50
+
+#define XLOG_BRIN_OPMASK 0x70
+/*
+ * When we insert the first item on a new page, we restore the entire page in
+ * redo.
+ */
+#define XLOG_BRIN_INIT_PAGE 0x80
+
+/*
+ * This is what we need to know about a BRIN index create.
+ *
+ * Backup block 0: metapage
+ */
+typedef struct xl_brin_createidx
+{
+ BlockNumber pagesPerRange;
+ uint16 version;
+} xl_brin_createidx;
+#define SizeOfBrinCreateIdx (offsetof(xl_brin_createidx, version) + sizeof(uint16))
+
+/*
+ * This is what we need to know about a BRIN tuple insert
+ *
+ * Backup block 0: main page, block data is the new BrinTuple.
+ * Backup block 1: revmap page
+ */
+typedef struct xl_brin_insert
+{
+ BlockNumber heapBlk;
+
+ /* extra information needed to update the revmap */
+ BlockNumber pagesPerRange;
+
+ /* offset number in the main page to insert the tuple to. */
+ OffsetNumber offnum;
+} xl_brin_insert;
+
+#define SizeOfBrinInsert (offsetof(xl_brin_insert, offnum) + sizeof(OffsetNumber))
+
+/*
+ * A cross-page update is the same as an insert, but also stores information
+ * about the old tuple.
+ *
+ * Like in xl_brin_insert:
+ * Backup block 0: new page, block data includes the new BrinTuple.
+ * Backup block 1: revmap page
+ *
+ * And in addition:
+ * Backup block 2: old page
+ */
+typedef struct xl_brin_update
+{
+ /* offset number of old tuple on old page */
+ OffsetNumber oldOffnum;
+
+ xl_brin_insert insert;
+} xl_brin_update;
+
+#define SizeOfBrinUpdate (offsetof(xl_brin_update, insert) + SizeOfBrinInsert)
+
+/*
+ * This is what we need to know about a BRIN tuple samepage update
+ *
+ * Backup block 0: updated page, with new BrinTuple as block data
+ */
+typedef struct xl_brin_samepage_update
+{
+ OffsetNumber offnum;
+} xl_brin_samepage_update;
+
+#define SizeOfBrinSamepageUpdate (sizeof(OffsetNumber))
+
+/*
+ * This is what we need to know about a revmap extension
+ *
+ * Backup block 0: metapage
+ * Backup block 1: new revmap page
+ */
+typedef struct xl_brin_revmap_extend
+{
+ /*
+ * XXX: This is actually redundant - the block number is stored as part of
+ * backup block 1.
+ */
+ BlockNumber targetBlk;
+} xl_brin_revmap_extend;
+
+#define SizeOfBrinRevmapExtend (offsetof(xl_brin_revmap_extend, targetBlk) + \
+ sizeof(BlockNumber))
+
+/*
+ * This is what we need to know about a range de-summarization
+ *
+ * Backup block 0: revmap page
+ * Backup block 1: regular page
+ */
+typedef struct xl_brin_desummarize
+{
+ BlockNumber pagesPerRange;
+ /* page number location to set to invalid */
+ BlockNumber heapBlk;
+ /* offset of item to delete in regular index page */
+ OffsetNumber regOffset;
+} xl_brin_desummarize;
+
+#define SizeOfBrinDesummarize (offsetof(xl_brin_desummarize, regOffset) + \
+ sizeof(OffsetNumber))
+
+
+extern void brin_redo(XLogReaderState *record);
+extern void brin_desc(StringInfo buf, XLogReaderState *record);
+extern const char *brin_identify(uint8 info);
+extern void brin_mask(char *pagedata, BlockNumber blkno);
+
+#endif /* BRIN_XLOG_H */
diff --git a/src/include/access/bufmask.h b/src/include/access/bufmask.h
new file mode 100644
index 0000000..add6c9a
--- /dev/null
+++ b/src/include/access/bufmask.h
@@ -0,0 +1,32 @@
+/*-------------------------------------------------------------------------
+ *
+ * bufmask.h
+ * Definitions for buffer masking routines, used to mask certain bits
+ * in a page which can be different when the WAL is generated
+ * and when the WAL is applied. This is really the job of each
+ * individual rmgr, but we make things easier by providing some
+ * common routines to handle cases which occur in multiple rmgrs.
+ *
+ * Portions Copyright (c) 2016-2021, PostgreSQL Global Development Group
+ *
+ * src/include/access/bufmask.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef BUFMASK_H
+#define BUFMASK_H
+
+#include "storage/block.h"
+#include "storage/bufmgr.h"
+
+/* Marker used to mask pages consistently */
+#define MASK_MARKER 0
+
+extern void mask_page_lsn_and_checksum(Page page);
+extern void mask_page_hint_bits(Page page);
+extern void mask_unused_space(Page page);
+extern void mask_lp_flags(Page page);
+extern void mask_page_content(Page page);
+
+#endif
diff --git a/src/include/access/clog.h b/src/include/access/clog.h
new file mode 100644
index 0000000..39b8e4a
--- /dev/null
+++ b/src/include/access/clog.h
@@ -0,0 +1,63 @@
+/*
+ * clog.h
+ *
+ * PostgreSQL transaction-commit-log manager
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/clog.h
+ */
+#ifndef CLOG_H
+#define CLOG_H
+
+#include "access/xlogreader.h"
+#include "storage/sync.h"
+#include "lib/stringinfo.h"
+
+/*
+ * Possible transaction statuses --- note that all-zeroes is the initial
+ * state.
+ *
+ * A "subcommitted" transaction is a committed subtransaction whose parent
+ * hasn't committed or aborted yet.
+ */
+typedef int XidStatus;
+
+#define TRANSACTION_STATUS_IN_PROGRESS 0x00
+#define TRANSACTION_STATUS_COMMITTED 0x01
+#define TRANSACTION_STATUS_ABORTED 0x02
+#define TRANSACTION_STATUS_SUB_COMMITTED 0x03
+
+typedef struct xl_clog_truncate
+{
+ int pageno;
+ TransactionId oldestXact;
+ Oid oldestXactDb;
+} xl_clog_truncate;
+
+extern void TransactionIdSetTreeStatus(TransactionId xid, int nsubxids,
+ TransactionId *subxids, XidStatus status, XLogRecPtr lsn);
+extern XidStatus TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn);
+
+extern Size CLOGShmemBuffers(void);
+extern Size CLOGShmemSize(void);
+extern void CLOGShmemInit(void);
+extern void BootStrapCLOG(void);
+extern void StartupCLOG(void);
+extern void TrimCLOG(void);
+extern void CheckPointCLOG(void);
+extern void ExtendCLOG(TransactionId newestXact);
+extern void TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid);
+
+extern int clogsyncfiletag(const FileTag *ftag, char *path);
+
+/* XLOG stuff */
+#define CLOG_ZEROPAGE 0x00
+#define CLOG_TRUNCATE 0x10
+
+extern void clog_redo(XLogReaderState *record);
+extern void clog_desc(StringInfo buf, XLogReaderState *record);
+extern const char *clog_identify(uint8 info);
+
+#endif /* CLOG_H */
diff --git a/src/include/access/commit_ts.h b/src/include/access/commit_ts.h
new file mode 100644
index 0000000..e045dd4
--- /dev/null
+++ b/src/include/access/commit_ts.h
@@ -0,0 +1,78 @@
+/*
+ * commit_ts.h
+ *
+ * PostgreSQL commit timestamp manager
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/commit_ts.h
+ */
+#ifndef COMMIT_TS_H
+#define COMMIT_TS_H
+
+#include "access/xlog.h"
+#include "datatype/timestamp.h"
+#include "replication/origin.h"
+#include "storage/sync.h"
+#include "utils/guc.h"
+
+
+extern PGDLLIMPORT bool track_commit_timestamp;
+
+extern bool check_track_commit_timestamp(bool *newval, void **extra,
+ GucSource source);
+
+extern void TransactionTreeSetCommitTsData(TransactionId xid, int nsubxids,
+ TransactionId *subxids, TimestampTz timestamp,
+ RepOriginId nodeid);
+extern bool TransactionIdGetCommitTsData(TransactionId xid,
+ TimestampTz *ts, RepOriginId *nodeid);
+extern TransactionId GetLatestCommitTsData(TimestampTz *ts,
+ RepOriginId *nodeid);
+
+extern Size CommitTsShmemBuffers(void);
+extern Size CommitTsShmemSize(void);
+extern void CommitTsShmemInit(void);
+extern void BootStrapCommitTs(void);
+extern void StartupCommitTs(void);
+extern void CommitTsParameterChange(bool newvalue, bool oldvalue);
+extern void CompleteCommitTsInitialization(void);
+extern void CheckPointCommitTs(void);
+extern void ExtendCommitTs(TransactionId newestXact);
+extern void TruncateCommitTs(TransactionId oldestXact);
+extern void SetCommitTsLimit(TransactionId oldestXact,
+ TransactionId newestXact);
+extern void AdvanceOldestCommitTsXid(TransactionId oldestXact);
+
+extern int committssyncfiletag(const FileTag *ftag, char *path);
+
+/* XLOG stuff */
+#define COMMIT_TS_ZEROPAGE 0x00
+#define COMMIT_TS_TRUNCATE 0x10
+
+typedef struct xl_commit_ts_set
+{
+ TimestampTz timestamp;
+ RepOriginId nodeid;
+ TransactionId mainxid;
+ /* subxact Xids follow */
+} xl_commit_ts_set;
+
+#define SizeOfCommitTsSet (offsetof(xl_commit_ts_set, mainxid) + \
+ sizeof(TransactionId))
+
+typedef struct xl_commit_ts_truncate
+{
+ int pageno;
+ TransactionId oldestXid;
+} xl_commit_ts_truncate;
+
+#define SizeOfCommitTsTruncate (offsetof(xl_commit_ts_truncate, oldestXid) + \
+ sizeof(TransactionId))
+
+extern void commit_ts_redo(XLogReaderState *record);
+extern void commit_ts_desc(StringInfo buf, XLogReaderState *record);
+extern const char *commit_ts_identify(uint8 info);
+
+#endif /* COMMIT_TS_H */
diff --git a/src/include/access/detoast.h b/src/include/access/detoast.h
new file mode 100644
index 0000000..773a02f
--- /dev/null
+++ b/src/include/access/detoast.h
@@ -0,0 +1,82 @@
+/*-------------------------------------------------------------------------
+ *
+ * detoast.h
+ * Access to compressed and external varlena values.
+ *
+ * Copyright (c) 2000-2021, PostgreSQL Global Development Group
+ *
+ * src/include/access/detoast.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef DETOAST_H
+#define DETOAST_H
+
+/*
+ * Macro to fetch the possibly-unaligned contents of an EXTERNAL datum
+ * into a local "struct varatt_external" toast pointer. This should be
+ * just a memcpy, but some versions of gcc seem to produce broken code
+ * that assumes the datum contents are aligned. Introducing an explicit
+ * intermediate "varattrib_1b_e *" variable seems to fix it.
+ */
+#define VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr) \
+do { \
+ varattrib_1b_e *attre = (varattrib_1b_e *) (attr); \
+ Assert(VARATT_IS_EXTERNAL(attre)); \
+ Assert(VARSIZE_EXTERNAL(attre) == sizeof(toast_pointer) + VARHDRSZ_EXTERNAL); \
+ memcpy(&(toast_pointer), VARDATA_EXTERNAL(attre), sizeof(toast_pointer)); \
+} while (0)
+
+/* Size of an EXTERNAL datum that contains a standard TOAST pointer */
+#define TOAST_POINTER_SIZE (VARHDRSZ_EXTERNAL + sizeof(varatt_external))
+
+/* Size of an EXTERNAL datum that contains an indirection pointer */
+#define INDIRECT_POINTER_SIZE (VARHDRSZ_EXTERNAL + sizeof(varatt_indirect))
+
+/* ----------
+ * detoast_external_attr() -
+ *
+ * Fetches an external stored attribute from the toast
+ * relation. Does NOT decompress it, if stored external
+ * in compressed format.
+ * ----------
+ */
+extern struct varlena *detoast_external_attr(struct varlena *attr);
+
+/* ----------
+ * detoast_attr() -
+ *
+ * Fully detoasts one attribute, fetching and/or decompressing
+ * it as needed.
+ * ----------
+ */
+extern struct varlena *detoast_attr(struct varlena *attr);
+
+/* ----------
+ * detoast_attr_slice() -
+ *
+ * Fetches only the specified portion of an attribute.
+ * (Handles all cases for attribute storage)
+ * ----------
+ */
+extern struct varlena *detoast_attr_slice(struct varlena *attr,
+ int32 sliceoffset,
+ int32 slicelength);
+
+/* ----------
+ * toast_raw_datum_size -
+ *
+ * Return the raw (detoasted) size of a varlena datum
+ * ----------
+ */
+extern Size toast_raw_datum_size(Datum value);
+
+/* ----------
+ * toast_datum_size -
+ *
+ * Return the storage size of a varlena datum
+ * ----------
+ */
+extern Size toast_datum_size(Datum value);
+
+#endif /* DETOAST_H */
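Editor's note: a hypothetical example (not part of detoast.h) of a C-language SQL function that reports the raw, detoasted size of its varlena argument without fetching or decompressing the whole value:

    #include "postgres.h"

    #include "access/detoast.h"
    #include "fmgr.h"

    PG_MODULE_MAGIC;

    PG_FUNCTION_INFO_V1(varlena_raw_size);

    Datum
    varlena_raw_size(PG_FUNCTION_ARGS)
    {
        Datum   value = PG_GETARG_DATUM(0);

        /* toast_raw_datum_size() includes the VARHDRSZ header bytes */
        PG_RETURN_INT64((int64) toast_raw_datum_size(value));
    }

A declaration along the lines of CREATE FUNCTION varlena_raw_size(text) RETURNS bigint AS 'MODULE_PATHNAME' LANGUAGE C STRICT would expose it; STRICT matters because the function does not handle NULL input itself.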
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
new file mode 100644
index 0000000..480a476
--- /dev/null
+++ b/src/include/access/genam.h
@@ -0,0 +1,231 @@
+/*-------------------------------------------------------------------------
+ *
+ * genam.h
+ * POSTGRES generalized index access method definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/genam.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GENAM_H
+#define GENAM_H
+
+#include "access/sdir.h"
+#include "access/skey.h"
+#include "nodes/tidbitmap.h"
+#include "storage/lockdefs.h"
+#include "utils/relcache.h"
+#include "utils/snapshot.h"
+
+/* We don't want this file to depend on execnodes.h. */
+struct IndexInfo;
+
+/*
+ * Struct for statistics returned by ambuild
+ */
+typedef struct IndexBuildResult
+{
+ double heap_tuples; /* # of tuples seen in parent table */
+ double index_tuples; /* # of tuples inserted into index */
+} IndexBuildResult;
+
+/*
+ * Struct for input arguments passed to ambulkdelete and amvacuumcleanup
+ *
+ * num_heap_tuples is accurate only when estimated_count is false;
+ * otherwise it's just an estimate (currently, the estimate is the
+ * prior value of the relation's pg_class.reltuples field, so it could
+ * even be -1). It will always just be an estimate during ambulkdelete.
+ */
+typedef struct IndexVacuumInfo
+{
+ Relation index; /* the index being vacuumed */
+ bool analyze_only; /* ANALYZE (without any actual vacuum) */
+ bool report_progress; /* emit progress.h status reports */
+ bool estimated_count; /* num_heap_tuples is an estimate */
+ int message_level; /* ereport level for progress messages */
+ double num_heap_tuples; /* tuples remaining in heap */
+ BufferAccessStrategy strategy; /* access strategy for reads */
+} IndexVacuumInfo;
+
+/*
+ * Struct for statistics returned by ambulkdelete and amvacuumcleanup
+ *
+ * This struct is normally allocated by the first ambulkdelete call and then
+ * passed along through subsequent ones until amvacuumcleanup; however,
+ * amvacuumcleanup must be prepared to allocate it in the case where no
+ * ambulkdelete calls were made (because no tuples needed deletion).
+ * Note that an index AM could choose to return a larger struct
+ * of which this is just the first field; this provides a way for ambulkdelete
+ * to communicate additional private data to amvacuumcleanup.
+ *
+ * Note: pages_newly_deleted is the number of pages in the index that were
+ * deleted by the current vacuum operation. pages_deleted and pages_free
+ * refer to free space within the index file.
+ *
+ * Note: Some index AMs may compute num_index_tuples by reference to
+ * num_heap_tuples, in which case they should copy the estimated_count field
+ * from IndexVacuumInfo.
+ */
+typedef struct IndexBulkDeleteResult
+{
+ BlockNumber num_pages; /* pages remaining in index */
+ bool estimated_count; /* num_index_tuples is an estimate */
+ double num_index_tuples; /* tuples remaining */
+ double tuples_removed; /* # removed during vacuum operation */
+ BlockNumber pages_newly_deleted; /* # pages marked deleted by us */
+ BlockNumber pages_deleted; /* # pages marked deleted (could be by us) */
+ BlockNumber pages_free; /* # pages available for reuse */
+} IndexBulkDeleteResult;
+
+/* Typedef for callback function to determine if a tuple is bulk-deletable */
+typedef bool (*IndexBulkDeleteCallback) (ItemPointer itemptr, void *state);
+
+/* struct definitions appear in relscan.h */
+typedef struct IndexScanDescData *IndexScanDesc;
+typedef struct SysScanDescData *SysScanDesc;
+
+typedef struct ParallelIndexScanDescData *ParallelIndexScanDesc;
+
+/*
+ * Enumeration specifying the type of uniqueness check to perform in
+ * index_insert().
+ *
+ * UNIQUE_CHECK_YES is the traditional Postgres immediate check, possibly
+ * blocking to see if a conflicting transaction commits.
+ *
+ * For deferrable unique constraints, UNIQUE_CHECK_PARTIAL is specified at
+ * insertion time. The index AM should test if the tuple is unique, but
+ * should not throw error, block, or prevent the insertion if the tuple
+ * appears not to be unique. We'll recheck later when it is time for the
+ * constraint to be enforced. The AM must return true if the tuple is
+ * known unique, false if it is possibly non-unique. In the "true" case
+ * it is safe to omit the later recheck.
+ *
+ * When it is time to recheck the deferred constraint, a pseudo-insertion
+ * call is made with UNIQUE_CHECK_EXISTING. The tuple is already in the
+ * index in this case, so it should not be inserted again. Rather, just
+ * check for conflicting live tuples (possibly blocking).
+ */
+typedef enum IndexUniqueCheck
+{
+ UNIQUE_CHECK_NO, /* Don't do any uniqueness checking */
+ UNIQUE_CHECK_YES, /* Enforce uniqueness at insertion time */
+ UNIQUE_CHECK_PARTIAL, /* Test uniqueness, but no error */
+ UNIQUE_CHECK_EXISTING /* Check if existing tuple is unique */
+} IndexUniqueCheck;
+
+
+/* Nullable "ORDER BY col op const" distance */
+typedef struct IndexOrderByDistance
+{
+ double value;
+ bool isnull;
+} IndexOrderByDistance;
+
+/*
+ * generalized index_ interface routines (in indexam.c)
+ */
+
+/*
+ * IndexScanIsValid
+ * True iff the index scan is valid.
+ */
+#define IndexScanIsValid(scan) PointerIsValid(scan)
+
+extern Relation index_open(Oid relationId, LOCKMODE lockmode);
+extern void index_close(Relation relation, LOCKMODE lockmode);
+
+extern bool index_insert(Relation indexRelation,
+ Datum *values, bool *isnull,
+ ItemPointer heap_t_ctid,
+ Relation heapRelation,
+ IndexUniqueCheck checkUnique,
+ bool indexUnchanged,
+ struct IndexInfo *indexInfo);
+
+extern IndexScanDesc index_beginscan(Relation heapRelation,
+ Relation indexRelation,
+ Snapshot snapshot,
+ int nkeys, int norderbys);
+extern IndexScanDesc index_beginscan_bitmap(Relation indexRelation,
+ Snapshot snapshot,
+ int nkeys);
+extern void index_rescan(IndexScanDesc scan,
+ ScanKey keys, int nkeys,
+ ScanKey orderbys, int norderbys);
+extern void index_endscan(IndexScanDesc scan);
+extern void index_markpos(IndexScanDesc scan);
+extern void index_restrpos(IndexScanDesc scan);
+extern Size index_parallelscan_estimate(Relation indexrel, Snapshot snapshot);
+extern void index_parallelscan_initialize(Relation heaprel, Relation indexrel,
+ Snapshot snapshot, ParallelIndexScanDesc target);
+extern void index_parallelrescan(IndexScanDesc scan);
+extern IndexScanDesc index_beginscan_parallel(Relation heaprel,
+ Relation indexrel, int nkeys, int norderbys,
+ ParallelIndexScanDesc pscan);
+extern ItemPointer index_getnext_tid(IndexScanDesc scan,
+ ScanDirection direction);
+struct TupleTableSlot;
+extern bool index_fetch_heap(IndexScanDesc scan, struct TupleTableSlot *slot);
+extern bool index_getnext_slot(IndexScanDesc scan, ScanDirection direction,
+ struct TupleTableSlot *slot);
+extern int64 index_getbitmap(IndexScanDesc scan, TIDBitmap *bitmap);
+
+extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info,
+ IndexBulkDeleteResult *istat,
+ IndexBulkDeleteCallback callback,
+ void *callback_state);
+extern IndexBulkDeleteResult *index_vacuum_cleanup(IndexVacuumInfo *info,
+ IndexBulkDeleteResult *istat);
+extern bool index_can_return(Relation indexRelation, int attno);
+extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum,
+ uint16 procnum);
+extern FmgrInfo *index_getprocinfo(Relation irel, AttrNumber attnum,
+ uint16 procnum);
+extern void index_store_float8_orderby_distances(IndexScanDesc scan,
+ Oid *orderByTypes,
+ IndexOrderByDistance *distances,
+ bool recheckOrderBy);
+extern bytea *index_opclass_options(Relation relation, AttrNumber attnum,
+ Datum attoptions, bool validate);
+
+
+/*
+ * index access method support routines (in genam.c)
+ */
+extern IndexScanDesc RelationGetIndexScan(Relation indexRelation,
+ int nkeys, int norderbys);
+extern void IndexScanEnd(IndexScanDesc scan);
+extern char *BuildIndexValueDescription(Relation indexRelation,
+ Datum *values, bool *isnull);
+extern TransactionId index_compute_xid_horizon_for_tuples(Relation irel,
+ Relation hrel,
+ Buffer ibuf,
+ OffsetNumber *itemnos,
+ int nitems);
+
+/*
+ * heap-or-index access to system catalogs (in genam.c)
+ */
+extern SysScanDesc systable_beginscan(Relation heapRelation,
+ Oid indexId,
+ bool indexOK,
+ Snapshot snapshot,
+ int nkeys, ScanKey key);
+extern HeapTuple systable_getnext(SysScanDesc sysscan);
+extern bool systable_recheck_tuple(SysScanDesc sysscan, HeapTuple tup);
+extern void systable_endscan(SysScanDesc sysscan);
+extern SysScanDesc systable_beginscan_ordered(Relation heapRelation,
+ Relation indexRelation,
+ Snapshot snapshot,
+ int nkeys, ScanKey key);
+extern HeapTuple systable_getnext_ordered(SysScanDesc sysscan,
+ ScanDirection direction);
+extern void systable_endscan_ordered(SysScanDesc sysscan);
+
+#endif /* GENAM_H */
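Editor's note: the systable_* routines above are the standard way backend code scans a system catalog, optionally via an index. A hedged sketch follows; the catalog, index, and column constants (RelationRelationId, ClassOidIndexId, Anum_pg_class_oid) come from the catalog headers rather than genam.h, and their exact header location varies across versions.

    #include "postgres.h"

    #include "access/genam.h"
    #include "access/table.h"
    #include "catalog/pg_class.h"
    #include "utils/fmgroids.h"
    #include "utils/rel.h"

    static bool
    relation_oid_exists(Oid reloid)
    {
        Relation    rel;
        ScanKeyData key;
        SysScanDesc scan;
        bool        found;

        rel = table_open(RelationRelationId, AccessShareLock);

        ScanKeyInit(&key,
                    Anum_pg_class_oid,
                    BTEqualStrategyNumber, F_OIDEQ,
                    ObjectIdGetDatum(reloid));

        /* NULL snapshot means "use the catalog snapshot" */
        scan = systable_beginscan(rel, ClassOidIndexId, true, NULL, 1, &key);
        found = HeapTupleIsValid(systable_getnext(scan));

        systable_endscan(scan);
        table_close(rel, AccessShareLock);

        return found;
    }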
diff --git a/src/include/access/generic_xlog.h b/src/include/access/generic_xlog.h
new file mode 100644
index 0000000..6e0a275
--- /dev/null
+++ b/src/include/access/generic_xlog.h
@@ -0,0 +1,45 @@
+/*-------------------------------------------------------------------------
+ *
+ * generic_xlog.h
+ * Generic xlog API definition.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/generic_xlog.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GENERIC_XLOG_H
+#define GENERIC_XLOG_H
+
+#include "access/xlog.h"
+#include "access/xlog_internal.h"
+#include "access/xloginsert.h"
+#include "storage/bufpage.h"
+#include "utils/rel.h"
+
+#define MAX_GENERIC_XLOG_PAGES XLR_NORMAL_MAX_BLOCK_ID
+
+/* Flag bits for GenericXLogRegisterBuffer */
+#define GENERIC_XLOG_FULL_IMAGE 0x0001 /* write full-page image */
+
+/* state of generic xlog record construction */
+struct GenericXLogState;
+typedef struct GenericXLogState GenericXLogState;
+
+/* API for construction of generic xlog records */
+extern GenericXLogState *GenericXLogStart(Relation relation);
+extern Page GenericXLogRegisterBuffer(GenericXLogState *state, Buffer buffer,
+ int flags);
+extern XLogRecPtr GenericXLogFinish(GenericXLogState *state);
+extern void GenericXLogAbort(GenericXLogState *state);
+
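Editorial annotation (not part of the patch): a rough sketch of the intended calling sequence, assuming the caller already holds a pin and exclusive lock on buffer:

    GenericXLogState *state = GenericXLogStart(rel);
    Page        page = GenericXLogRegisterBuffer(state, buffer, 0);

    /* apply the modifications to the page image returned above */

    GenericXLogFinish(state);       /* or GenericXLogAbort(state) to bail out */

GenericXLogFinish writes the WAL record and copies the modified image back into the shared buffer; passing GENERIC_XLOG_FULL_IMAGE instead of 0 forces a full-page image.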
+/* functions defined for rmgr */
+extern void generic_redo(XLogReaderState *record);
+extern const char *generic_identify(uint8 info);
+extern void generic_desc(StringInfo buf, XLogReaderState *record);
+extern void generic_mask(char *pagedata, BlockNumber blkno);
+
+#endif /* GENERIC_XLOG_H */
diff --git a/src/include/access/gin.h b/src/include/access/gin.h
new file mode 100644
index 0000000..266cb07
--- /dev/null
+++ b/src/include/access/gin.h
@@ -0,0 +1,78 @@
+/*--------------------------------------------------------------------------
+ * gin.h
+ * Public header file for Generalized Inverted Index access method.
+ *
+ * Copyright (c) 2006-2021, PostgreSQL Global Development Group
+ *
+ * src/include/access/gin.h
+ *--------------------------------------------------------------------------
+ */
+#ifndef GIN_H
+#define GIN_H
+
+#include "access/xlogreader.h"
+#include "lib/stringinfo.h"
+#include "storage/block.h"
+#include "utils/relcache.h"
+
+
+/*
+ * amproc indexes for inverted indexes.
+ */
+#define GIN_COMPARE_PROC 1
+#define GIN_EXTRACTVALUE_PROC 2
+#define GIN_EXTRACTQUERY_PROC 3
+#define GIN_CONSISTENT_PROC 4
+#define GIN_COMPARE_PARTIAL_PROC 5
+#define GIN_TRICONSISTENT_PROC 6
+#define GIN_OPTIONS_PROC 7
+#define GINNProcs 7
+
+/*
+ * searchMode settings for extractQueryFn.
+ */
+#define GIN_SEARCH_MODE_DEFAULT 0
+#define GIN_SEARCH_MODE_INCLUDE_EMPTY 1
+#define GIN_SEARCH_MODE_ALL 2
+#define GIN_SEARCH_MODE_EVERYTHING 3 /* for internal use only */
+
+/*
+ * GinStatsData represents stats data for planner use
+ */
+typedef struct GinStatsData
+{
+ BlockNumber nPendingPages;
+ BlockNumber nTotalPages;
+ BlockNumber nEntryPages;
+ BlockNumber nDataPages;
+ int64 nEntries;
+ int32 ginVersion;
+} GinStatsData;
+
+/*
+ * A ternary value used by tri-consistent functions.
+ *
+ * This must be the same size as a bool, because some code casts a pointer
+ * to a bool to a pointer to a GinTernaryValue.
+ */
+typedef char GinTernaryValue;
+
+#define GIN_FALSE 0 /* item is not present / does not match */
+#define GIN_TRUE 1 /* item is present / matches */
+#define GIN_MAYBE 2 /* don't know if item is present / don't know
+ * if matches */
+
+#define DatumGetGinTernaryValue(X) ((GinTernaryValue)(X))
+#define GinTernaryValueGetDatum(X) ((Datum)(X))
+#define PG_RETURN_GIN_TERNARY_VALUE(x) return GinTernaryValueGetDatum(x)
+
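Editorial annotation (not part of this header): a minimal sketch of how an opclass triConsistent function with AND semantics might fold the per-entry check flags; the function name and arguments are illustrative only:

    static GinTernaryValue
    and_tri_consistent(GinTernaryValue *check, int32 nkeys)
    {
        GinTernaryValue res = GIN_TRUE;

        for (int32 i = 0; i < nkeys; i++)
        {
            if (check[i] == GIN_FALSE)
                return GIN_FALSE;       /* one definite miss sinks the key */
            if (check[i] == GIN_MAYBE)
                res = GIN_MAYBE;        /* remember the uncertainty */
        }
        return res;
    }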
+/* GUC parameters */
+extern PGDLLIMPORT int GinFuzzySearchLimit;
+extern int gin_pending_list_limit;
+
+/* ginutil.c */
+extern void ginGetStats(Relation index, GinStatsData *stats);
+extern void ginUpdateStats(Relation index, const GinStatsData *stats,
+ bool is_build);
+
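Editorial annotation (not part of this header): a small sketch of reading the planner statistics, assuming indexRel is an already-opened GIN index:

    GinStatsData stats;

    ginGetStats(indexRel, &stats);
    elog(DEBUG1, "GIN index: %u pending pages, " INT64_FORMAT " entries, version %d",
         stats.nPendingPages, stats.nEntries, stats.ginVersion);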
+#endif /* GIN_H */
diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h
new file mode 100644
index 0000000..670a40b
--- /dev/null
+++ b/src/include/access/gin_private.h
@@ -0,0 +1,500 @@
+/*--------------------------------------------------------------------------
+ * gin_private.h
+ * header file for postgres inverted index access method implementation.
+ *
+ * Copyright (c) 2006-2021, PostgreSQL Global Development Group
+ *
+ * src/include/access/gin_private.h
+ *--------------------------------------------------------------------------
+ */
+#ifndef GIN_PRIVATE_H
+#define GIN_PRIVATE_H
+
+#include "access/amapi.h"
+#include "access/gin.h"
+#include "access/ginblock.h"
+#include "access/itup.h"
+#include "catalog/pg_am_d.h"
+#include "fmgr.h"
+#include "lib/rbtree.h"
+#include "storage/bufmgr.h"
+
+/*
+ * Storage type for GIN's reloptions
+ */
+typedef struct GinOptions
+{
+ int32 vl_len_; /* varlena header (do not touch directly!) */
+ bool useFastUpdate; /* use fast updates? */
+ int pendingListCleanupSize; /* maximum size of pending list */
+} GinOptions;
+
+#define GIN_DEFAULT_USE_FASTUPDATE true
+#define GinGetUseFastUpdate(relation) \
+ (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
+ relation->rd_rel->relam == GIN_AM_OID), \
+ (relation)->rd_options ? \
+ ((GinOptions *) (relation)->rd_options)->useFastUpdate : GIN_DEFAULT_USE_FASTUPDATE)
+#define GinGetPendingListCleanupSize(relation) \
+ (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
+ relation->rd_rel->relam == GIN_AM_OID), \
+ (relation)->rd_options && \
+ ((GinOptions *) (relation)->rd_options)->pendingListCleanupSize != -1 ? \
+ ((GinOptions *) (relation)->rd_options)->pendingListCleanupSize : \
+ gin_pending_list_limit)
+
+
+/* Macros for buffer lock/unlock operations */
+#define GIN_UNLOCK BUFFER_LOCK_UNLOCK
+#define GIN_SHARE BUFFER_LOCK_SHARE
+#define GIN_EXCLUSIVE BUFFER_LOCK_EXCLUSIVE
+
+
+/*
+ * GinState: working data structure describing the index being worked on
+ */
+typedef struct GinState
+{
+ Relation index;
+ bool oneCol; /* true if single-column index */
+
+ /*
+ * origTupdesc is the nominal tuple descriptor of the index, ie, the i'th
+ * attribute shows the key type (not the input data type!) of the i'th
+ * index column. In a single-column index this describes the actual leaf
+ * index tuples. In a multi-column index, the actual leaf tuples contain
+ * a smallint column number followed by a key datum of the appropriate
+ * type for that column. We set up tupdesc[i] to describe the actual
+ * rowtype of the index tuples for the i'th column, ie, (int2, keytype).
+ * Note that in any case, leaf tuples contain more data than is known to
+ * the TupleDesc; see access/gin/README for details.
+ */
+ TupleDesc origTupdesc;
+ TupleDesc tupdesc[INDEX_MAX_KEYS];
+
+ /*
+ * Per-index-column opclass support functions
+ */
+ FmgrInfo compareFn[INDEX_MAX_KEYS];
+ FmgrInfo extractValueFn[INDEX_MAX_KEYS];
+ FmgrInfo extractQueryFn[INDEX_MAX_KEYS];
+ FmgrInfo consistentFn[INDEX_MAX_KEYS];
+ FmgrInfo triConsistentFn[INDEX_MAX_KEYS];
+ FmgrInfo comparePartialFn[INDEX_MAX_KEYS]; /* optional method */
+ /* canPartialMatch[i] is true if comparePartialFn[i] is valid */
+ bool canPartialMatch[INDEX_MAX_KEYS];
+ /* Collations to pass to the support functions */
+ Oid supportCollation[INDEX_MAX_KEYS];
+} GinState;
+
+
+/* ginutil.c */
+extern bytea *ginoptions(Datum reloptions, bool validate);
+extern void initGinState(GinState *state, Relation index);
+extern Buffer GinNewBuffer(Relation index);
+extern void GinInitBuffer(Buffer b, uint32 f);
+extern void GinInitPage(Page page, uint32 f, Size pageSize);
+extern void GinInitMetabuffer(Buffer b);
+extern int ginCompareEntries(GinState *ginstate, OffsetNumber attnum,
+ Datum a, GinNullCategory categorya,
+ Datum b, GinNullCategory categoryb);
+extern int ginCompareAttEntries(GinState *ginstate,
+ OffsetNumber attnuma, Datum a, GinNullCategory categorya,
+ OffsetNumber attnumb, Datum b, GinNullCategory categoryb);
+extern Datum *ginExtractEntries(GinState *ginstate, OffsetNumber attnum,
+ Datum value, bool isNull,
+ int32 *nentries, GinNullCategory **categories);
+
+extern OffsetNumber gintuple_get_attrnum(GinState *ginstate, IndexTuple tuple);
+extern Datum gintuple_get_key(GinState *ginstate, IndexTuple tuple,
+ GinNullCategory *category);
+
+/* gininsert.c */
+extern IndexBuildResult *ginbuild(Relation heap, Relation index,
+ struct IndexInfo *indexInfo);
+extern void ginbuildempty(Relation index);
+extern bool gininsert(Relation index, Datum *values, bool *isnull,
+ ItemPointer ht_ctid, Relation heapRel,
+ IndexUniqueCheck checkUnique,
+ bool indexUnchanged,
+ struct IndexInfo *indexInfo);
+extern void ginEntryInsert(GinState *ginstate,
+ OffsetNumber attnum, Datum key, GinNullCategory category,
+ ItemPointerData *items, uint32 nitem,
+ GinStatsData *buildStats);
+
+/* ginbtree.c */
+
+typedef struct GinBtreeStack
+{
+ BlockNumber blkno;
+ Buffer buffer;
+ OffsetNumber off;
+ ItemPointerData iptr;
+ /* predictNumber contains predicted number of pages on current level */
+ uint32 predictNumber;
+ struct GinBtreeStack *parent;
+} GinBtreeStack;
+
+typedef struct GinBtreeData *GinBtree;
+
+/* Return codes for GinBtreeData.beginPlaceToPage method */
+typedef enum
+{
+ GPTP_NO_WORK,
+ GPTP_INSERT,
+ GPTP_SPLIT
+} GinPlaceToPageRC;
+
+typedef struct GinBtreeData
+{
+ /* search methods */
+ BlockNumber (*findChildPage) (GinBtree, GinBtreeStack *);
+ BlockNumber (*getLeftMostChild) (GinBtree, Page);
+ bool (*isMoveRight) (GinBtree, Page);
+ bool (*findItem) (GinBtree, GinBtreeStack *);
+
+ /* insert methods */
+ OffsetNumber (*findChildPtr) (GinBtree, Page, BlockNumber, OffsetNumber);
+ GinPlaceToPageRC (*beginPlaceToPage) (GinBtree, Buffer, GinBtreeStack *, void *, BlockNumber, void **, Page *, Page *);
+ void (*execPlaceToPage) (GinBtree, Buffer, GinBtreeStack *, void *, BlockNumber, void *);
+ void *(*prepareDownlink) (GinBtree, Buffer);
+ void (*fillRoot) (GinBtree, Page, BlockNumber, Page, BlockNumber, Page);
+
+ bool isData;
+
+ Relation index;
+ BlockNumber rootBlkno;
+ GinState *ginstate; /* not valid in a data scan */
+ bool fullScan;
+ bool isBuild;
+
+ /* Search key for Entry tree */
+ OffsetNumber entryAttnum;
+ Datum entryKey;
+ GinNullCategory entryCategory;
+
+ /* Search key for data tree (posting tree) */
+ ItemPointerData itemptr;
+} GinBtreeData;
+
+/* This represents a tuple to be inserted to entry tree. */
+typedef struct
+{
+ IndexTuple entry; /* tuple to insert */
+ bool isDelete; /* delete old tuple at same offset? */
+} GinBtreeEntryInsertData;
+
+/*
+ * This represents an itempointer, or many itempointers, to be inserted to
+ * a data (posting tree) leaf page
+ */
+typedef struct
+{
+ ItemPointerData *items;
+ uint32 nitem;
+ uint32 curitem;
+} GinBtreeDataLeafInsertData;
+
+/*
+ * For internal data (posting tree) pages, the insertion payload is a
+ * PostingItem
+ */
+
+extern GinBtreeStack *ginFindLeafPage(GinBtree btree, bool searchMode,
+ bool rootConflictCheck, Snapshot snapshot);
+extern Buffer ginStepRight(Buffer buffer, Relation index, int lockmode);
+extern void freeGinBtreeStack(GinBtreeStack *stack);
+extern void ginInsertValue(GinBtree btree, GinBtreeStack *stack,
+ void *insertdata, GinStatsData *buildStats);
+
+/* ginentrypage.c */
+extern IndexTuple GinFormTuple(GinState *ginstate,
+ OffsetNumber attnum, Datum key, GinNullCategory category,
+ Pointer data, Size dataSize, int nipd, bool errorTooBig);
+extern void ginPrepareEntryScan(GinBtree btree, OffsetNumber attnum,
+ Datum key, GinNullCategory category,
+ GinState *ginstate);
+extern void ginEntryFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage);
+extern ItemPointer ginReadTuple(GinState *ginstate, OffsetNumber attnum,
+ IndexTuple itup, int *nitems);
+
+/* gindatapage.c */
+extern ItemPointer GinDataLeafPageGetItems(Page page, int *nitems, ItemPointerData advancePast);
+extern int GinDataLeafPageGetItemsToTbm(Page page, TIDBitmap *tbm);
+extern BlockNumber createPostingTree(Relation index,
+ ItemPointerData *items, uint32 nitems,
+ GinStatsData *buildStats, Buffer entrybuffer);
+extern void GinDataPageAddPostingItem(Page page, PostingItem *data, OffsetNumber offset);
+extern void GinPageDeletePostingItem(Page page, OffsetNumber offset);
+extern void ginInsertItemPointers(Relation index, BlockNumber rootBlkno,
+ ItemPointerData *items, uint32 nitem,
+ GinStatsData *buildStats);
+extern GinBtreeStack *ginScanBeginPostingTree(GinBtree btree, Relation index, BlockNumber rootBlkno, Snapshot snapshot);
+extern void ginDataFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage);
+
+/*
+ * This is declared in ginvacuum.c, but is passed between ginVacuumItemPointers
+ * and ginVacuumPostingTreeLeaf as an opaque struct, so we need a forward
+ * declaration for it.
+ */
+typedef struct GinVacuumState GinVacuumState;
+
+extern void ginVacuumPostingTreeLeaf(Relation rel, Buffer buf, GinVacuumState *gvs);
+
+/* ginscan.c */
+
+/*
+ * GinScanKeyData describes a single GIN index qualifier expression.
+ *
+ * From each qual expression, we extract one or more specific index search
+ * conditions, which are represented by GinScanEntryData. It's quite
+ * possible for identical search conditions to be requested by more than
+ * one qual expression, in which case we merge such conditions to have just
+ * one unique GinScanEntry --- this is particularly important for efficiency
+ * when dealing with full-index-scan entries. So there can be multiple
+ * GinScanKeyData.scanEntry pointers to the same GinScanEntryData.
+ *
+ * In each GinScanKeyData, nentries is the true number of entries, while
+ * nuserentries is the number that extractQueryFn returned (which is what
+ * we report to consistentFn). The "user" entries must come first.
+ */
+typedef struct GinScanKeyData *GinScanKey;
+
+typedef struct GinScanEntryData *GinScanEntry;
+
+typedef struct GinScanKeyData
+{
+ /* Real number of entries in scanEntry[] (always > 0) */
+ uint32 nentries;
+ /* Number of entries that extractQueryFn and consistentFn know about */
+ uint32 nuserentries;
+
+ /* array of GinScanEntry pointers, one per extracted search condition */
+ GinScanEntry *scanEntry;
+
+ /*
+ * At least one of the entries in requiredEntries must be present for a
+ * tuple to match the overall qual.
+ *
+ * additionalEntries contains entries that are needed by the consistent
+ * function to decide if an item matches, but are not sufficient to
+ * satisfy the qual without entries from requiredEntries.
+ */
+ GinScanEntry *requiredEntries;
+ int nrequired;
+ GinScanEntry *additionalEntries;
+ int nadditional;
+
+ /* array of check flags, reported to consistentFn */
+ GinTernaryValue *entryRes;
+ bool (*boolConsistentFn) (GinScanKey key);
+ GinTernaryValue (*triConsistentFn) (GinScanKey key);
+ FmgrInfo *consistentFmgrInfo;
+ FmgrInfo *triConsistentFmgrInfo;
+ Oid collation;
+
+ /* other data needed for calling consistentFn */
+ Datum query;
+ /* NB: these three arrays have only nuserentries elements! */
+ Datum *queryValues;
+ GinNullCategory *queryCategories;
+ Pointer *extra_data;
+ StrategyNumber strategy;
+ int32 searchMode;
+ OffsetNumber attnum;
+
+ /*
+ * An excludeOnly scan key is not able to enumerate all matching tuples.
+ * That is, to be semantically correct on its own, it would need to have a
+ * GIN_CAT_EMPTY_QUERY scanEntry, but it doesn't. Such a key can still be
+ * used to filter tuples returned by other scan keys, so we will get the
+ * right answers as long as there's at least one non-excludeOnly scan key
+ * for each index attribute considered by the search. For efficiency
+ * reasons we don't want to have unnecessary GIN_CAT_EMPTY_QUERY entries,
+ * so we will convert an excludeOnly scan key to non-excludeOnly (by
+ * adding a GIN_CAT_EMPTY_QUERY scanEntry) only if there are no other
+ * non-excludeOnly scan keys.
+ */
+ bool excludeOnly;
+
+ /*
+ * Match status data. curItem is the TID most recently tested (could be a
+ * lossy-page pointer). curItemMatches is true if it passes the
+ * consistentFn test; if so, recheckCurItem is the recheck flag.
+ * isFinished means that all the input entry streams are finished, so this
+ * key cannot succeed for any later TIDs.
+ */
+ ItemPointerData curItem;
+ bool curItemMatches;
+ bool recheckCurItem;
+ bool isFinished;
+} GinScanKeyData;
+
+typedef struct GinScanEntryData
+{
+ /* query key and other information from extractQueryFn */
+ Datum queryKey;
+ GinNullCategory queryCategory;
+ bool isPartialMatch;
+ Pointer extra_data;
+ StrategyNumber strategy;
+ int32 searchMode;
+ OffsetNumber attnum;
+
+ /* Current page in posting tree */
+ Buffer buffer;
+
+ /* current ItemPointer to heap */
+ ItemPointerData curItem;
+
+ /* for a partial-match or full-scan query, we accumulate all TIDs here */
+ TIDBitmap *matchBitmap;
+ TBMIterator *matchIterator;
+ TBMIterateResult *matchResult;
+
+ /* used for Posting list and one page in Posting tree */
+ ItemPointerData *list;
+ int nlist;
+ OffsetNumber offset;
+
+ bool isFinished;
+ bool reduceResult;
+ uint32 predictNumberResult;
+ GinBtreeData btree;
+} GinScanEntryData;
+
+typedef struct GinScanOpaqueData
+{
+ MemoryContext tempCtx;
+ GinState ginstate;
+
+ GinScanKey keys; /* one per scan qualifier expr */
+ uint32 nkeys;
+
+ GinScanEntry *entries; /* one per index search condition */
+ uint32 totalentries;
+ uint32 allocentries; /* allocated length of entries[] */
+
+ MemoryContext keyCtx; /* used to hold key and entry data */
+
+ bool isVoidRes; /* true if query is unsatisfiable */
+} GinScanOpaqueData;
+
+typedef GinScanOpaqueData *GinScanOpaque;
+
+extern IndexScanDesc ginbeginscan(Relation rel, int nkeys, int norderbys);
+extern void ginendscan(IndexScanDesc scan);
+extern void ginrescan(IndexScanDesc scan, ScanKey key, int nscankeys,
+ ScanKey orderbys, int norderbys);
+extern void ginNewScanKey(IndexScanDesc scan);
+extern void ginFreeScanKeys(GinScanOpaque so);
+
+/* ginget.c */
+extern int64 gingetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
+
+/* ginlogic.c */
+extern void ginInitConsistentFunction(GinState *ginstate, GinScanKey key);
+
+/* ginvacuum.c */
+extern IndexBulkDeleteResult *ginbulkdelete(IndexVacuumInfo *info,
+ IndexBulkDeleteResult *stats,
+ IndexBulkDeleteCallback callback,
+ void *callback_state);
+extern IndexBulkDeleteResult *ginvacuumcleanup(IndexVacuumInfo *info,
+ IndexBulkDeleteResult *stats);
+extern ItemPointer ginVacuumItemPointers(GinVacuumState *gvs,
+ ItemPointerData *items, int nitem, int *nremaining);
+
+/* ginvalidate.c */
+extern bool ginvalidate(Oid opclassoid);
+extern void ginadjustmembers(Oid opfamilyoid,
+ Oid opclassoid,
+ List *operators,
+ List *functions);
+
+/* ginbulk.c */
+typedef struct GinEntryAccumulator
+{
+ RBTNode rbtnode;
+ Datum key;
+ GinNullCategory category;
+ OffsetNumber attnum;
+ bool shouldSort;
+ ItemPointerData *list;
+ uint32 maxcount; /* allocated size of list[] */
+ uint32 count; /* current number of list[] entries */
+} GinEntryAccumulator;
+
+typedef struct
+{
+ GinState *ginstate;
+ Size allocatedMemory;
+ GinEntryAccumulator *entryallocator;
+ uint32 eas_used;
+ RBTree *tree;
+ RBTreeIterator tree_walk;
+} BuildAccumulator;
+
+extern void ginInitBA(BuildAccumulator *accum);
+extern void ginInsertBAEntries(BuildAccumulator *accum,
+ ItemPointer heapptr, OffsetNumber attnum,
+ Datum *entries, GinNullCategory *categories,
+ int32 nentries);
+extern void ginBeginBAScan(BuildAccumulator *accum);
+extern ItemPointerData *ginGetBAEntry(BuildAccumulator *accum,
+ OffsetNumber *attnum, Datum *key, GinNullCategory *category,
+ uint32 *n);
+
+/* ginfast.c */
+
+typedef struct GinTupleCollector
+{
+ IndexTuple *tuples;
+ uint32 ntuples;
+ uint32 lentuples;
+ uint32 sumsize;
+} GinTupleCollector;
+
+extern void ginHeapTupleFastInsert(GinState *ginstate,
+ GinTupleCollector *collector);
+extern void ginHeapTupleFastCollect(GinState *ginstate,
+ GinTupleCollector *collector,
+ OffsetNumber attnum, Datum value, bool isNull,
+ ItemPointer ht_ctid);
+extern void ginInsertCleanup(GinState *ginstate, bool full_clean,
+ bool fill_fsm, bool forceCleanup, IndexBulkDeleteResult *stats);
+
+/* ginpostinglist.c */
+
+extern GinPostingList *ginCompressPostingList(const ItemPointer ipd, int nipd,
+ int maxsize, int *nwritten);
+extern int ginPostingListDecodeAllSegmentsToTbm(GinPostingList *ptr, int totalsize, TIDBitmap *tbm);
+
+extern ItemPointer ginPostingListDecodeAllSegments(GinPostingList *ptr, int len, int *ndecoded);
+extern ItemPointer ginPostingListDecode(GinPostingList *ptr, int *ndecoded);
+extern ItemPointer ginMergeItemPointers(ItemPointerData *a, uint32 na,
+ ItemPointerData *b, uint32 nb,
+ int *nmerged);
+
+/*
+ * Merging the results of several gin scans compares item pointers a lot,
+ * so we want this to be inlined.
+ */
+static inline int
+ginCompareItemPointers(ItemPointer a, ItemPointer b)
+{
+ uint64 ia = (uint64) GinItemPointerGetBlockNumber(a) << 32 | GinItemPointerGetOffsetNumber(a);
+ uint64 ib = (uint64) GinItemPointerGetBlockNumber(b) << 32 | GinItemPointerGetOffsetNumber(b);
+
+ if (ia == ib)
+ return 0;
+ else if (ia > ib)
+ return 1;
+ else
+ return -1;
+}
+
+extern int ginTraverseLock(Buffer buffer, bool searchMode);
+
+#endif /* GIN_PRIVATE_H */
diff --git a/src/include/access/ginblock.h b/src/include/access/ginblock.h
new file mode 100644
index 0000000..37d650a
--- /dev/null
+++ b/src/include/access/ginblock.h
@@ -0,0 +1,346 @@
+/*--------------------------------------------------------------------------
+ * ginblock.h
+ * details of structures stored in GIN index blocks
+ *
+ * Copyright (c) 2006-2021, PostgreSQL Global Development Group
+ *
+ * src/include/access/ginblock.h
+ *--------------------------------------------------------------------------
+ */
+#ifndef GINBLOCK_H
+#define GINBLOCK_H
+
+#include "access/transam.h"
+#include "storage/block.h"
+#include "storage/bufpage.h"
+#include "storage/itemptr.h"
+#include "storage/off.h"
+
+/*
+ * Page opaque data in an inverted index page.
+ *
+ * Note: GIN does not include a page ID word as do the other index types.
+ * This is OK because the opaque data is only 8 bytes and so can be reliably
+ * distinguished by size. Revisit this if the size ever increases.
+ * Further note: as of 9.2, SP-GiST also uses 8-byte special space, as does
+ * BRIN as of 9.5. This is still OK, as long as GIN isn't using all of the
+ * high-order bits in its flags word, because that way the flags word cannot
+ * match the page IDs used by SP-GiST and BRIN.
+ */
+typedef struct GinPageOpaqueData
+{
+ BlockNumber rightlink; /* next page if any */
+ OffsetNumber maxoff; /* number of PostingItems on GIN_DATA &
+ * ~GIN_LEAF page. On GIN_LIST page, number of
+ * heap tuples. */
+ uint16 flags; /* see bit definitions below */
+} GinPageOpaqueData;
+
+typedef GinPageOpaqueData *GinPageOpaque;
+
+#define GIN_DATA (1 << 0)
+#define GIN_LEAF (1 << 1)
+#define GIN_DELETED (1 << 2)
+#define GIN_META (1 << 3)
+#define GIN_LIST (1 << 4)
+#define GIN_LIST_FULLROW (1 << 5) /* makes sense only on GIN_LIST page */
+#define GIN_INCOMPLETE_SPLIT (1 << 6) /* page was split, but parent not
+ * updated */
+#define GIN_COMPRESSED (1 << 7)
+
+/* Page numbers of fixed-location pages */
+#define GIN_METAPAGE_BLKNO (0)
+#define GIN_ROOT_BLKNO (1)
+
+typedef struct GinMetaPageData
+{
+ /*
+ * Pointers to head and tail of pending list, which consists of GIN_LIST
+ * pages. These store fast-inserted entries that haven't yet been moved
+ * into the regular GIN structure.
+ */
+ BlockNumber head;
+ BlockNumber tail;
+
+ /*
+ * Free space in bytes in the pending list's tail page.
+ */
+ uint32 tailFreeSize;
+
+ /*
+ * We store both number of pages and number of heap tuples that are in the
+ * pending list.
+ */
+ BlockNumber nPendingPages;
+ int64 nPendingHeapTuples;
+
+ /*
+ * Statistics for planner use (accurate as of last VACUUM)
+ */
+ BlockNumber nTotalPages;
+ BlockNumber nEntryPages;
+ BlockNumber nDataPages;
+ int64 nEntries;
+
+ /*
+ * GIN version number (ideally this should have been at the front, but too
+ * late now. Don't move it!)
+ *
+ * Currently 2 (for indexes initialized in 9.4 or later)
+ *
+ * Version 1 (indexes initialized in version 9.1, 9.2 or 9.3), is
+ * compatible, but may contain uncompressed posting tree (leaf) pages and
+ * posting lists. They will be converted to compressed format when
+ * modified.
+ *
+ * Version 0 (indexes initialized in 9.0 or before) is compatible but may
+ * be missing null entries, including both null keys and placeholders.
+ * Reject full-index-scan attempts on such indexes.
+ */
+ int32 ginVersion;
+} GinMetaPageData;
+
+#define GIN_CURRENT_VERSION 2
+
+#define GinPageGetMeta(p) \
+ ((GinMetaPageData *) PageGetContents(p))
+
+/*
+ * Macros for accessing a GIN index page's opaque data
+ */
+#define GinPageGetOpaque(page) ( (GinPageOpaque) PageGetSpecialPointer(page) )
+
+#define GinPageIsLeaf(page) ( (GinPageGetOpaque(page)->flags & GIN_LEAF) != 0 )
+#define GinPageSetLeaf(page) ( GinPageGetOpaque(page)->flags |= GIN_LEAF )
+#define GinPageSetNonLeaf(page) ( GinPageGetOpaque(page)->flags &= ~GIN_LEAF )
+#define GinPageIsData(page) ( (GinPageGetOpaque(page)->flags & GIN_DATA) != 0 )
+#define GinPageSetData(page) ( GinPageGetOpaque(page)->flags |= GIN_DATA )
+#define GinPageIsList(page) ( (GinPageGetOpaque(page)->flags & GIN_LIST) != 0 )
+#define GinPageSetList(page) ( GinPageGetOpaque(page)->flags |= GIN_LIST )
+#define GinPageHasFullRow(page) ( (GinPageGetOpaque(page)->flags & GIN_LIST_FULLROW) != 0 )
+#define GinPageSetFullRow(page) ( GinPageGetOpaque(page)->flags |= GIN_LIST_FULLROW )
+#define GinPageIsCompressed(page) ( (GinPageGetOpaque(page)->flags & GIN_COMPRESSED) != 0 )
+#define GinPageSetCompressed(page) ( GinPageGetOpaque(page)->flags |= GIN_COMPRESSED )
+
+#define GinPageIsDeleted(page) ( (GinPageGetOpaque(page)->flags & GIN_DELETED) != 0 )
+#define GinPageSetDeleted(page) ( GinPageGetOpaque(page)->flags |= GIN_DELETED)
+#define GinPageSetNonDeleted(page) ( GinPageGetOpaque(page)->flags &= ~GIN_DELETED)
+#define GinPageIsIncompleteSplit(page) ( (GinPageGetOpaque(page)->flags & GIN_INCOMPLETE_SPLIT) != 0 )
+
+#define GinPageRightMost(page) ( GinPageGetOpaque(page)->rightlink == InvalidBlockNumber)
+
+/*
+ * A deleted page may be reclaimed only once every transaction that was
+ * started before its deletion has finished.
+ */
+#define GinPageGetDeleteXid(page) ( ((PageHeader) (page))->pd_prune_xid )
+#define GinPageSetDeleteXid(page, xid) ( ((PageHeader) (page))->pd_prune_xid = xid)
+extern bool GinPageIsRecyclable(Page page);
+
+/*
+ * We use our own ItemPointerGet(BlockNumber|OffsetNumber)
+ * to avoid Asserts, since sometimes the ip_posid isn't "valid"
+ */
+#define GinItemPointerGetBlockNumber(pointer) \
+ (ItemPointerGetBlockNumberNoCheck(pointer))
+
+#define GinItemPointerGetOffsetNumber(pointer) \
+ (ItemPointerGetOffsetNumberNoCheck(pointer))
+
+#define GinItemPointerSetBlockNumber(pointer, blkno) \
+ (ItemPointerSetBlockNumber((pointer), (blkno)))
+
+#define GinItemPointerSetOffsetNumber(pointer, offnum) \
+ (ItemPointerSetOffsetNumber((pointer), (offnum)))
+
+
+/*
+ * Special-case item pointer values needed by the GIN search logic.
+ * MIN: sorts less than any valid item pointer
+ * MAX: sorts greater than any valid item pointer
+ * LOSSY PAGE: indicates a whole heap page, sorts after normal item
+ * pointers for that page
+ * Note that these are all distinguishable from an "invalid" item pointer
+ * (which is InvalidBlockNumber/0) as well as from all normal item
+ * pointers (which have item numbers in the range 1..MaxHeapTuplesPerPage).
+ */
+#define ItemPointerSetMin(p) \
+ ItemPointerSet((p), (BlockNumber)0, (OffsetNumber)0)
+#define ItemPointerIsMin(p) \
+ (GinItemPointerGetOffsetNumber(p) == (OffsetNumber)0 && \
+ GinItemPointerGetBlockNumber(p) == (BlockNumber)0)
+#define ItemPointerSetMax(p) \
+ ItemPointerSet((p), InvalidBlockNumber, (OffsetNumber)0xffff)
+#define ItemPointerSetLossyPage(p, b) \
+ ItemPointerSet((p), (b), (OffsetNumber)0xffff)
+#define ItemPointerIsLossyPage(p) \
+ (GinItemPointerGetOffsetNumber(p) == (OffsetNumber)0xffff && \
+ GinItemPointerGetBlockNumber(p) != InvalidBlockNumber)
+
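Editorial annotation (not part of this header): a sketch of how scan code typically consumes these special values when filling a TID bitmap; item, tbm, and recheck are assumed to come from the surrounding scan state:

    if (ItemPointerIsLossyPage(&item))
        tbm_add_page(tbm, GinItemPointerGetBlockNumber(&item));
    else
        tbm_add_tuples(tbm, &item, 1, recheck);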
+/*
+ * Posting item in a non-leaf posting-tree page
+ */
+typedef struct
+{
+ /* We use BlockIdData not BlockNumber to avoid padding space wastage */
+ BlockIdData child_blkno;
+ ItemPointerData key;
+} PostingItem;
+
+#define PostingItemGetBlockNumber(pointer) \
+ BlockIdGetBlockNumber(&(pointer)->child_blkno)
+
+#define PostingItemSetBlockNumber(pointer, blockNumber) \
+ BlockIdSet(&((pointer)->child_blkno), (blockNumber))
+
+/*
+ * Category codes to distinguish placeholder nulls from ordinary NULL keys.
+ *
+ * The first two code values were chosen to be compatible with the usual usage
+ * of bool isNull flags. However, casting between bool and GinNullCategory is
+ * risky because of the possibility of different bit patterns and type sizes,
+ * so it is no longer done.
+ *
+ * GIN_CAT_EMPTY_QUERY is never stored in the index; note that it is chosen
+ * to sort before, not after, regular key values.
+ */
+typedef signed char GinNullCategory;
+
+#define GIN_CAT_NORM_KEY 0 /* normal, non-null key value */
+#define GIN_CAT_NULL_KEY 1 /* null key value */
+#define GIN_CAT_EMPTY_ITEM 2 /* placeholder for zero-key item */
+#define GIN_CAT_NULL_ITEM 3 /* placeholder for null item */
+#define GIN_CAT_EMPTY_QUERY (-1) /* placeholder for full-scan query */
+
+/*
+ * Access macros for null category byte in entry tuples
+ */
+#define GinCategoryOffset(itup,ginstate) \
+ (IndexInfoFindDataOffset((itup)->t_info) + \
+ ((ginstate)->oneCol ? 0 : sizeof(int16)))
+#define GinGetNullCategory(itup,ginstate) \
+ (*((GinNullCategory *) ((char*)(itup) + GinCategoryOffset(itup,ginstate))))
+#define GinSetNullCategory(itup,ginstate,c) \
+ (*((GinNullCategory *) ((char*)(itup) + GinCategoryOffset(itup,ginstate))) = (c))
+
+/*
+ * Access macros for leaf-page entry tuples (see discussion in README)
+ */
+#define GinGetNPosting(itup) GinItemPointerGetOffsetNumber(&(itup)->t_tid)
+#define GinSetNPosting(itup,n) ItemPointerSetOffsetNumber(&(itup)->t_tid,n)
+#define GIN_TREE_POSTING ((OffsetNumber)0xffff)
+#define GinIsPostingTree(itup) (GinGetNPosting(itup) == GIN_TREE_POSTING)
+#define GinSetPostingTree(itup, blkno) ( GinSetNPosting((itup),GIN_TREE_POSTING), ItemPointerSetBlockNumber(&(itup)->t_tid, blkno) )
+#define GinGetPostingTree(itup) GinItemPointerGetBlockNumber(&(itup)->t_tid)
+
+#define GIN_ITUP_COMPRESSED (1U << 31)
+#define GinGetPostingOffset(itup) (GinItemPointerGetBlockNumber(&(itup)->t_tid) & (~GIN_ITUP_COMPRESSED))
+#define GinSetPostingOffset(itup,n) ItemPointerSetBlockNumber(&(itup)->t_tid,(n)|GIN_ITUP_COMPRESSED)
+#define GinGetPosting(itup) ((Pointer) ((char*)(itup) + GinGetPostingOffset(itup)))
+#define GinItupIsCompressed(itup) ((GinItemPointerGetBlockNumber(&(itup)->t_tid) & GIN_ITUP_COMPRESSED) != 0)
+
+/*
+ * Maximum size of an item on entry tree page. Make sure that we fit at least
+ * three items on each page. (On regular B-tree indexes, we must fit at least
+ * three items: two data items and the "high key". In GIN entry tree, we don't
+ * currently store the high key explicitly; we just use the rightmost item on
+ * the page, so it would actually be enough to fit two items.)
+ */
+#define GinMaxItemSize \
+ Min(INDEX_SIZE_MASK, \
+ MAXALIGN_DOWN(((BLCKSZ - \
+ MAXALIGN(SizeOfPageHeaderData + 3 * sizeof(ItemIdData)) - \
+ MAXALIGN(sizeof(GinPageOpaqueData))) / 3)))
+
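Editorial annotation: for concreteness, with the default BLCKSZ of 8192 and 8-byte MAXALIGN, MAXALIGN(SizeOfPageHeaderData + 3 * sizeof(ItemIdData)) = MAXALIGN(36) = 40 and MAXALIGN(sizeof(GinPageOpaqueData)) = 8, so the formula yields MAXALIGN_DOWN((8192 - 40 - 8) / 3) = MAXALIGN_DOWN(2714) = 2712 bytes, which is well under INDEX_SIZE_MASK and therefore the effective limit.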
+/*
+ * Access macros for non-leaf entry tuples
+ */
+#define GinGetDownlink(itup) GinItemPointerGetBlockNumber(&(itup)->t_tid)
+#define GinSetDownlink(itup,blkno) ItemPointerSet(&(itup)->t_tid, blkno, InvalidOffsetNumber)
+
+
+/*
+ * Data (posting tree) pages
+ *
+ * Posting tree pages don't store regular tuples. Non-leaf pages contain
+ * PostingItems, which are pairs of ItemPointers and child block numbers.
+ * Leaf pages contain GinPostingLists and an uncompressed array of item
+ * pointers.
+ *
+ * In a leaf page, the compressed posting lists are stored after the regular
+ * page header, one after each other. Although we don't store regular tuples,
+ * pd_lower is used to indicate the end of the posting lists. After that, free
+ * space follows. This layout is compatible with the "standard" heap and
+ * index page layout described in bufpage.h, so that we can e.g. set buffer_std
+ * when writing WAL records.
+ *
+ * In the special space is the GinPageOpaque struct.
+ */
+#define GinDataLeafPageGetPostingList(page) \
+ (GinPostingList *) ((PageGetContents(page) + MAXALIGN(sizeof(ItemPointerData))))
+#define GinDataLeafPageGetPostingListSize(page) \
+ (((PageHeader) page)->pd_lower - MAXALIGN(SizeOfPageHeaderData) - MAXALIGN(sizeof(ItemPointerData)))
+
+#define GinDataLeafPageIsEmpty(page) \
+ (GinPageIsCompressed(page) ? (GinDataLeafPageGetPostingListSize(page) == 0) : (GinPageGetOpaque(page)->maxoff < FirstOffsetNumber))
+
+#define GinDataLeafPageGetFreeSpace(page) PageGetExactFreeSpace(page)
+
+#define GinDataPageGetRightBound(page) ((ItemPointer) PageGetContents(page))
+/*
+ * Pointer to the data portion of a posting tree page. For internal pages,
+ * that's the beginning of the array of PostingItems. For compressed leaf
+ * pages, the first compressed posting list. For uncompressed (pre-9.4) leaf
+ * pages, it's the beginning of the ItemPointer array.
+ */
+#define GinDataPageGetData(page) \
+ (PageGetContents(page) + MAXALIGN(sizeof(ItemPointerData)))
+/* non-leaf pages contain PostingItems */
+#define GinDataPageGetPostingItem(page, i) \
+ ((PostingItem *) (GinDataPageGetData(page) + ((i)-1) * sizeof(PostingItem)))
+
+/*
+ * Note: there is no GinDataPageGetDataSize macro, because before version
+ * 9.4, we didn't set pd_lower on data pages. There can be pages in the index
+ * that were binary-upgraded from earlier versions and still have an invalid
+ * pd_lower, so we cannot trust it in general. Compressed posting tree leaf
+ * pages are new in 9.4, however, so we can trust them; see
+ * GinDataLeafPageGetPostingListSize.
+ */
+#define GinDataPageSetDataSize(page, size) \
+ { \
+ Assert(size <= GinDataPageMaxDataSize); \
+ ((PageHeader) page)->pd_lower = (size) + MAXALIGN(SizeOfPageHeaderData) + MAXALIGN(sizeof(ItemPointerData)); \
+ }
+
+#define GinNonLeafDataPageGetFreeSpace(page) \
+ (GinDataPageMaxDataSize - \
+ GinPageGetOpaque(page)->maxoff * sizeof(PostingItem))
+
+#define GinDataPageMaxDataSize \
+ (BLCKSZ - MAXALIGN(SizeOfPageHeaderData) \
+ - MAXALIGN(sizeof(ItemPointerData)) \
+ - MAXALIGN(sizeof(GinPageOpaqueData)))
+
+/*
+ * List pages
+ */
+#define GinListPageSize \
+ ( BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(GinPageOpaqueData)) )
+
+/*
+ * A compressed posting list.
+ *
+ * Note: This requires 2-byte alignment.
+ */
+typedef struct
+{
+ ItemPointerData first; /* first item in this posting list (unpacked) */
+ uint16 nbytes; /* number of bytes that follow */
+ unsigned char bytes[FLEXIBLE_ARRAY_MEMBER]; /* varbyte encoded items */
+} GinPostingList;
+
+#define SizeOfGinPostingList(plist) (offsetof(GinPostingList, bytes) + SHORTALIGN((plist)->nbytes) )
+#define GinNextPostingListSegment(cur) ((GinPostingList *) (((char *) (cur)) + SizeOfGinPostingList((cur))))
+
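Editorial annotation (not part of this header): the segments on a compressed leaf page are packed back to back up to pd_lower, so they are typically walked like this, assuming page is a compressed posting-tree leaf:

    GinPostingList *seg = GinDataLeafPageGetPostingList(page);
    Size        len = GinDataLeafPageGetPostingListSize(page);
    char       *endPtr = ((char *) seg) + len;

    while ((char *) seg < endPtr)
    {
        /* decode or inspect this segment, e.g. with ginPostingListDecode() */
        seg = GinNextPostingListSegment(seg);
    }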
+#endif /* GINBLOCK_H */
diff --git a/src/include/access/ginxlog.h b/src/include/access/ginxlog.h
new file mode 100644
index 0000000..8a2507b
--- /dev/null
+++ b/src/include/access/ginxlog.h
@@ -0,0 +1,216 @@
+/*--------------------------------------------------------------------------
+ * ginxlog.h
+ * header file for postgres inverted index xlog implementation.
+ *
+ * Copyright (c) 2006-2021, PostgreSQL Global Development Group
+ *
+ * src/include/access/ginxlog.h
+ *--------------------------------------------------------------------------
+ */
+#ifndef GINXLOG_H
+#define GINXLOG_H
+
+#include "access/ginblock.h"
+#include "access/itup.h"
+#include "access/xlogreader.h"
+#include "lib/stringinfo.h"
+#include "storage/off.h"
+
+#define XLOG_GIN_CREATE_PTREE 0x10
+
+typedef struct ginxlogCreatePostingTree
+{
+ uint32 size;
+ /* A compressed posting list follows */
+} ginxlogCreatePostingTree;
+
+/*
+ * The format of the insertion record varies depending on the page type.
+ * ginxlogInsert is the common part between all variants.
+ *
+ * Backup Blk 0: target page
+ * Backup Blk 1: left child, if this insertion finishes an incomplete split
+ */
+
+#define XLOG_GIN_INSERT 0x20
+
+typedef struct
+{
+ uint16 flags; /* GIN_INSERT_ISLEAF and/or GIN_INSERT_ISDATA */
+
+ /*
+ * FOLLOWS:
+ *
+ * 1. if not leaf page, block numbers of the left and right child pages
+ * whose split this insertion finishes, as BlockIdData[2] (beware of
+ * adding fields in this struct that would make them not 16-bit aligned)
+ *
+ * 2. a ginxlogInsertEntry or ginxlogRecompressDataLeaf struct, depending
+ * on tree type.
+ *
+ * NB: the below structs are only 16-bit aligned when appended to a
+ * ginxlogInsert struct! Beware of adding fields to them that require
+ * stricter alignment.
+ */
+} ginxlogInsert;
+
+typedef struct
+{
+ OffsetNumber offset;
+ bool isDelete;
+ IndexTupleData tuple; /* variable length */
+} ginxlogInsertEntry;
+
+
+typedef struct
+{
+ uint16 nactions;
+
+ /* Variable number of 'actions' follow */
+} ginxlogRecompressDataLeaf;
+
+/*
+ * Note: this struct is currently not used in code, and only acts as
+ * documentation. The WAL record format is as specified here, but the code
+ * uses straight access through a Pointer and memcpy to read/write these.
+ */
+typedef struct
+{
+ uint8 segno; /* segment this action applies to */
+ char type; /* action type (see below) */
+
+ /*
+ * Action-specific data follows. For INSERT and REPLACE actions that is a
+ * GinPostingList struct. For ADDITEMS, a uint16 for the number of items
+ * added, followed by the items themselves as ItemPointers. DELETE actions
+ * have no further data.
+ */
+} ginxlogSegmentAction;
+
+/* Action types */
+#define GIN_SEGMENT_UNMODIFIED 0 /* no action (not used in WAL records) */
+#define GIN_SEGMENT_DELETE 1 /* a whole segment is removed */
+#define GIN_SEGMENT_INSERT 2 /* a whole segment is added */
+#define GIN_SEGMENT_REPLACE 3 /* a segment is replaced */
+#define GIN_SEGMENT_ADDITEMS 4 /* items are added to existing segment */
+
+typedef struct
+{
+ OffsetNumber offset;
+ PostingItem newitem;
+} ginxlogInsertDataInternal;
+
+/*
+ * Backup Blk 0: new left page (= original page, if not root split)
+ * Backup Blk 1: new right page
+ * Backup Blk 2: original page / new root page, if root split
+ * Backup Blk 3: left child, if this insertion completes an earlier split
+ */
+#define XLOG_GIN_SPLIT 0x30
+
+typedef struct ginxlogSplit
+{
+ RelFileNode node;
+ BlockNumber rrlink; /* right link, or root's block number if root
+ * split */
+ BlockNumber leftChildBlkno; /* valid on a non-leaf split */
+ BlockNumber rightChildBlkno;
+ uint16 flags; /* see below */
+} ginxlogSplit;
+
+/*
+ * Flags used in ginxlogInsert and ginxlogSplit records
+ */
+#define GIN_INSERT_ISDATA 0x01 /* for both insert and split records */
+#define GIN_INSERT_ISLEAF 0x02 /* ditto */
+#define GIN_SPLIT_ROOT 0x04 /* only for split records */
+
+/*
+ * Vacuum simply WAL-logs the whole page, when anything is modified. This
+ * is functionally identical to XLOG_FPI records, but is kept separate for
+ * debugging purposes. (When inspecting the WAL stream, it's easier to see
+ * what's going on when GIN vacuum records are marked as such, not as heap
+ * records.) This is currently only used for entry tree leaf pages.
+ */
+#define XLOG_GIN_VACUUM_PAGE 0x40
+
+/*
+ * Vacuuming posting tree leaf page is WAL-logged like recompression caused
+ * by insertion.
+ */
+#define XLOG_GIN_VACUUM_DATA_LEAF_PAGE 0x90
+
+typedef struct ginxlogVacuumDataLeafPage
+{
+ ginxlogRecompressDataLeaf data;
+} ginxlogVacuumDataLeafPage;
+
+/*
+ * Backup Blk 0: deleted page
+ * Backup Blk 1: parent
+ * Backup Blk 2: left sibling
+ */
+#define XLOG_GIN_DELETE_PAGE 0x50
+
+typedef struct ginxlogDeletePage
+{
+ OffsetNumber parentOffset;
+ BlockNumber rightLink;
+ TransactionId deleteXid; /* last Xid which could see this page in scan */
+} ginxlogDeletePage;
+
+#define XLOG_GIN_UPDATE_META_PAGE 0x60
+
+/*
+ * Backup Blk 0: metapage
+ * Backup Blk 1: tail page
+ */
+typedef struct ginxlogUpdateMeta
+{
+ RelFileNode node;
+ GinMetaPageData metadata;
+ BlockNumber prevTail;
+ BlockNumber newRightlink;
+ int32 ntuples; /* if ntuples > 0 then metadata.tail was
+ * updated with that many tuples; else new sub
+ * list was inserted */
+ /* array of inserted tuples follows */
+} ginxlogUpdateMeta;
+
+#define XLOG_GIN_INSERT_LISTPAGE 0x70
+
+typedef struct ginxlogInsertListPage
+{
+ BlockNumber rightlink;
+ int32 ntuples;
+ /* array of inserted tuples follows */
+} ginxlogInsertListPage;
+
+/*
+ * Backup Blk 0: metapage
+ * Backup Blk 1 to (ndeleted + 1): deleted pages
+ */
+
+#define XLOG_GIN_DELETE_LISTPAGE 0x80
+
+/*
+ * The WAL record for deleting list pages must contain a block reference to
+ * all the deleted pages, so the number of pages that can be deleted in one
+ * record is limited by XLR_MAX_BLOCK_ID. (block_id 0 is used for the
+ * metapage.)
+ */
+#define GIN_NDELETE_AT_ONCE Min(16, XLR_MAX_BLOCK_ID - 1)
+typedef struct ginxlogDeleteListPages
+{
+ GinMetaPageData metadata;
+ int32 ndeleted;
+} ginxlogDeleteListPages;
+
+extern void gin_redo(XLogReaderState *record);
+extern void gin_desc(StringInfo buf, XLogReaderState *record);
+extern const char *gin_identify(uint8 info);
+extern void gin_xlog_startup(void);
+extern void gin_xlog_cleanup(void);
+extern void gin_mask(char *pagedata, BlockNumber blkno);
+
+#endif /* GINXLOG_H */
diff --git a/src/include/access/gist.h b/src/include/access/gist.h
new file mode 100644
index 0000000..4b06575
--- /dev/null
+++ b/src/include/access/gist.h
@@ -0,0 +1,248 @@
+/*-------------------------------------------------------------------------
+ *
+ * gist.h
+ * The public API for GiST indexes. This API is exposed to
+ * individuals implementing GiST indexes, so backward-incompatible
+ * changes should be made with care.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/gist.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GIST_H
+#define GIST_H
+
+#include "access/itup.h"
+#include "access/transam.h"
+#include "access/xlog.h"
+#include "access/xlogdefs.h"
+#include "storage/block.h"
+#include "storage/bufpage.h"
+#include "utils/relcache.h"
+
+/*
+ * amproc indexes for GiST indexes.
+ */
+#define GIST_CONSISTENT_PROC 1
+#define GIST_UNION_PROC 2
+#define GIST_COMPRESS_PROC 3
+#define GIST_DECOMPRESS_PROC 4
+#define GIST_PENALTY_PROC 5
+#define GIST_PICKSPLIT_PROC 6
+#define GIST_EQUAL_PROC 7
+#define GIST_DISTANCE_PROC 8
+#define GIST_FETCH_PROC 9
+#define GIST_OPTIONS_PROC 10
+#define GIST_SORTSUPPORT_PROC 11
+#define GISTNProcs 11
+
+/*
+ * Page opaque data in a GiST index page.
+ */
+#define F_LEAF (1 << 0) /* leaf page */
+#define F_DELETED (1 << 1) /* the page has been deleted */
+#define F_TUPLES_DELETED (1 << 2) /* some tuples on the page were
+ * deleted */
+#define F_FOLLOW_RIGHT (1 << 3) /* page to the right has no downlink */
+#define F_HAS_GARBAGE (1 << 4) /* some tuples on the page are dead,
+ * but not deleted yet */
+
+/*
+ * NSN (node sequence number) is a special-purpose LSN which is stored on each
+ * index page in GISTPageOpaqueData and updated only during page splits. By
+ * recording the parent's LSN in GISTSearchItem.parentlsn, it is possible to
+ * detect concurrent child page splits by checking if parentlsn < child's NSN,
+ * and handle them properly. The child page's LSN is insufficient for this
+ * purpose since it is updated for every page change.
+ */
+typedef XLogRecPtr GistNSN;
+
+/*
+ * A fake LSN / NSN value used during index builds. Must be smaller than any
+ * real or fake (unlogged) LSN generated after the index build completes so
+ * that all splits are considered complete.
+ */
+#define GistBuildLSN ((XLogRecPtr) 1)
+
+/*
+ * For on-disk compatibility with pre-9.3 servers, NSN is stored as two
+ * 32-bit fields on disk, same as LSNs.
+ */
+typedef PageXLogRecPtr PageGistNSN;
+
+typedef struct GISTPageOpaqueData
+{
+ PageGistNSN nsn; /* this value must change on page split */
+ BlockNumber rightlink; /* next page if any */
+ uint16 flags; /* see bit definitions above */
+ uint16 gist_page_id; /* for identification of GiST indexes */
+} GISTPageOpaqueData;
+
+typedef GISTPageOpaqueData *GISTPageOpaque;
+
+/*
+ * Maximum possible sizes for a GiST index tuple and index key.  The
+ * calculation assumes that a GiST page should fit at least 4 tuples.  In
+ * theory a GiST index can still function when a page fits only 3 tuples, but
+ * that seems rather inefficient, so we use the slightly more conservative
+ * estimate.
+ *
+ * The maximum index key size holds for a single-column index, so this
+ * estimate shows the largest GiST index key size that makes sense at all.
+ * For multicolumn indexes, the user might be able to tune the key size using
+ * opclass parameters.
+ */
+#define GISTMaxIndexTupleSize \
+ MAXALIGN_DOWN((BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)) / \
+ 4 - sizeof(ItemIdData))
+
+#define GISTMaxIndexKeySize \
+ (GISTMaxIndexTupleSize - MAXALIGN(sizeof(IndexTupleData)))
+
+/*
+ * The page ID is for the convenience of pg_filedump and similar utilities,
+ * which otherwise would have a hard time telling pages of different index
+ * types apart. It should be the last 2 bytes on the page. This is more or
+ * less "free" due to alignment considerations.
+ */
+#define GIST_PAGE_ID 0xFF81
+
+/*
+ * This is the Split Vector to be returned by the PickSplit method.
+ * PickSplit should fill the indexes of tuples to go to the left side into
+ * spl_left[], and those to go to the right into spl_right[] (note the method
+ * is responsible for palloc'ing both of these arrays!). The tuple counts
+ * go into spl_nleft/spl_nright, and spl_ldatum/spl_rdatum must be set to
+ * the union keys for each side.
+ *
+ * If spl_ldatum_exists and spl_rdatum_exists are true, then we are performing
+ * a "secondary split" using a non-first index column. In this case some
+ * decisions have already been made about a page split, and the set of tuples
+ * being passed to PickSplit is just the tuples about which we are undecided.
+ * spl_ldatum/spl_rdatum then contain the union keys for the tuples already
+ * chosen to go left or right. Ideally the PickSplit method should take those
+ * keys into account while deciding what to do with the remaining tuples, ie
+ * it should try to "build out" from those unions so as to minimally expand
+ * them. If it does so, it should union the given tuples' keys into the
+ * existing spl_ldatum/spl_rdatum values rather than just setting those values
+ * from scratch, and then set spl_ldatum_exists/spl_rdatum_exists to false to
+ * show it has done this.
+ *
+ * If the PickSplit method fails to clear spl_ldatum_exists/spl_rdatum_exists,
+ * the core GiST code will make its own decision about how to merge the
+ * secondary-split results with the previously-chosen tuples, and will then
+ * recompute the union keys from scratch. This is a workable though often not
+ * optimal approach.
+ */
+typedef struct GIST_SPLITVEC
+{
+ OffsetNumber *spl_left; /* array of entries that go left */
+ int spl_nleft; /* size of this array */
+ Datum spl_ldatum; /* Union of keys in spl_left */
+ bool spl_ldatum_exists; /* true, if spl_ldatum already exists. */
+
+ OffsetNumber *spl_right; /* array of entries that go right */
+ int spl_nright; /* size of the array */
+ Datum spl_rdatum; /* Union of keys in spl_right */
+ bool spl_rdatum_exists; /* true, if spl_rdatum already exists. */
+} GIST_SPLITVEC;
+
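Editorial annotation (not part of this header): the mechanical part of the PickSplit contract, sketched with a deliberately dumb alternating assignment; entryvec and v are the GistEntryVector and GIST_SPLITVEC passed to the support function, and a real opclass must also compute spl_ldatum/spl_rdatum as each side's union key:

    OffsetNumber maxoff = entryvec->n - 1;
    OffsetNumber i;

    v->spl_left = (OffsetNumber *) palloc(maxoff * sizeof(OffsetNumber));
    v->spl_right = (OffsetNumber *) palloc(maxoff * sizeof(OffsetNumber));
    v->spl_nleft = v->spl_nright = 0;

    for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
    {
        if (i % 2 == 0)
            v->spl_left[v->spl_nleft++] = i;
        else
            v->spl_right[v->spl_nright++] = i;
    }
    /* spl_ldatum and spl_rdatum must then be set to the per-side union keys */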
+/*
+ * An entry on a GiST node. Contains the key, as well as its own
+ * location (rel,page,offset) which can supply the matching pointer.
+ * leafkey is a flag to tell us if the entry is in a leaf node.
+ */
+typedef struct GISTENTRY
+{
+ Datum key;
+ Relation rel;
+ Page page;
+ OffsetNumber offset;
+ bool leafkey;
+} GISTENTRY;
+
+#define GistPageGetOpaque(page) ( (GISTPageOpaque) PageGetSpecialPointer(page) )
+
+#define GistPageIsLeaf(page) ( GistPageGetOpaque(page)->flags & F_LEAF)
+#define GIST_LEAF(entry) (GistPageIsLeaf((entry)->page))
+
+#define GistPageIsDeleted(page) ( GistPageGetOpaque(page)->flags & F_DELETED)
+
+#define GistTuplesDeleted(page) ( GistPageGetOpaque(page)->flags & F_TUPLES_DELETED)
+#define GistMarkTuplesDeleted(page) ( GistPageGetOpaque(page)->flags |= F_TUPLES_DELETED)
+#define GistClearTuplesDeleted(page) ( GistPageGetOpaque(page)->flags &= ~F_TUPLES_DELETED)
+
+#define GistPageHasGarbage(page) ( GistPageGetOpaque(page)->flags & F_HAS_GARBAGE)
+#define GistMarkPageHasGarbage(page) ( GistPageGetOpaque(page)->flags |= F_HAS_GARBAGE)
+#define GistClearPageHasGarbage(page) ( GistPageGetOpaque(page)->flags &= ~F_HAS_GARBAGE)
+
+#define GistFollowRight(page) ( GistPageGetOpaque(page)->flags & F_FOLLOW_RIGHT)
+#define GistMarkFollowRight(page) ( GistPageGetOpaque(page)->flags |= F_FOLLOW_RIGHT)
+#define GistClearFollowRight(page) ( GistPageGetOpaque(page)->flags &= ~F_FOLLOW_RIGHT)
+
+#define GistPageGetNSN(page) ( PageXLogRecPtrGet(GistPageGetOpaque(page)->nsn))
+#define GistPageSetNSN(page, val) ( PageXLogRecPtrSet(GistPageGetOpaque(page)->nsn, val))
+
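Editorial annotation (not part of this header): the descent code uses the NSN roughly like this to detect a concurrent split of a child page, where parentlsn is the parent's LSN remembered before descending and page is the child just read:

    if (GistFollowRight(page) || parentlsn < GistPageGetNSN(page))
    {
        /*
         * The child was split after we looked at the parent; matching tuples
         * may have moved to the right sibling, so follow the rightlink (and,
         * during insertion, finish the split by fixing the missing downlink).
         */
    }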
+
+/*
+ * On a deleted page, we store this struct. A deleted page doesn't contain any
+ * tuples, so we don't use the normal page layout with line pointers. Instead,
+ * this struct is stored right after the standard page header. pd_lower points
+ * to the end of this struct. If we add fields to this struct in the future, we
+ * can distinguish the old and new formats by pd_lower.
+ */
+typedef struct GISTDeletedPageContents
+{
+ /* last xid which could see the page in a scan */
+ FullTransactionId deleteXid;
+} GISTDeletedPageContents;
+
+static inline void
+GistPageSetDeleted(Page page, FullTransactionId deletexid)
+{
+ Assert(PageIsEmpty(page));
+
+ GistPageGetOpaque(page)->flags |= F_DELETED;
+ ((PageHeader) page)->pd_lower = MAXALIGN(SizeOfPageHeaderData) + sizeof(GISTDeletedPageContents);
+
+ ((GISTDeletedPageContents *) PageGetContents(page))->deleteXid = deletexid;
+}
+
+static inline FullTransactionId
+GistPageGetDeleteXid(Page page)
+{
+ Assert(GistPageIsDeleted(page));
+
+ /* Is the deleteXid field present? */
+ if (((PageHeader) page)->pd_lower >= MAXALIGN(SizeOfPageHeaderData) +
+ offsetof(GISTDeletedPageContents, deleteXid) + sizeof(FullTransactionId))
+ {
+ return ((GISTDeletedPageContents *) PageGetContents(page))->deleteXid;
+ }
+ else
+ return FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId);
+}
+
+/*
+ * Vector of GISTENTRY structs; user-defined methods union and picksplit
+ * take it as one of their arguments
+ */
+typedef struct
+{
+ int32 n; /* number of elements */
+ GISTENTRY vector[FLEXIBLE_ARRAY_MEMBER];
+} GistEntryVector;
+
+#define GEVHDRSZ (offsetof(GistEntryVector, vector))
+
+/*
+ * macro to initialize a GISTENTRY
+ */
+#define gistentryinit(e, k, r, pg, o, l) \
+ do { (e).key = (k); (e).rel = (r); (e).page = (pg); \
+ (e).offset = (o); (e).leafkey = (l); } while (0)
+
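Editorial annotation (not part of this header): the canonical use of gistentryinit is in an opclass compress (or decompress/fetch) support function, returning a freshly palloc'd GISTENTRY; compressedKey stands in for whatever Datum the opclass actually builds:

    GISTENTRY  *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
    GISTENTRY  *retval = (GISTENTRY *) palloc(sizeof(GISTENTRY));

    gistentryinit(*retval, compressedKey,
                  entry->rel, entry->page, entry->offset, false);
    PG_RETURN_POINTER(retval);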
+#endif /* GIST_H */
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
new file mode 100644
index 0000000..553d364
--- /dev/null
+++ b/src/include/access/gist_private.h
@@ -0,0 +1,571 @@
+/*-------------------------------------------------------------------------
+ *
+ * gist_private.h
+ * private declarations for GiST -- declarations related to the
+ * internal implementation of GiST, not the public API
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/gist_private.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GIST_PRIVATE_H
+#define GIST_PRIVATE_H
+
+#include "access/amapi.h"
+#include "access/gist.h"
+#include "access/itup.h"
+#include "lib/pairingheap.h"
+#include "storage/bufmgr.h"
+#include "storage/buffile.h"
+#include "utils/hsearch.h"
+#include "access/genam.h"
+
+/*
+ * Maximum number of "halves" a page can be split into in one operation.
+ * Typically a split produces 2 halves, but can be more if keys have very
+ * different lengths, or when inserting multiple keys in one operation (as
+ * when inserting downlinks to an internal node). There is no theoretical
+ * limit on this, but in practice if you get more than a handful of page halves
+ * in one split, there's something wrong with the opclass implementation.
+ * GIST_MAX_SPLIT_PAGES is an arbitrary limit on that, used to size some
+ * local arrays used during split. Note that there is also a limit on the
+ * number of buffers that can be held locked at a time, MAX_SIMUL_LWLOCKS,
+ * so if you raise this higher than that limit, you'll just get a different
+ * error.
+ */
+#define GIST_MAX_SPLIT_PAGES 75
+
+/* Buffer lock modes */
+#define GIST_SHARE BUFFER_LOCK_SHARE
+#define GIST_EXCLUSIVE BUFFER_LOCK_EXCLUSIVE
+#define GIST_UNLOCK BUFFER_LOCK_UNLOCK
+
+typedef struct
+{
+ BlockNumber prev;
+ uint32 freespace;
+ char tupledata[FLEXIBLE_ARRAY_MEMBER];
+} GISTNodeBufferPage;
+
+#define BUFFER_PAGE_DATA_OFFSET MAXALIGN(offsetof(GISTNodeBufferPage, tupledata))
+/* Returns free space in node buffer page */
+#define PAGE_FREE_SPACE(nbp) (nbp->freespace)
+/* Checks if node buffer page is empty */
+#define PAGE_IS_EMPTY(nbp) (nbp->freespace == BLCKSZ - BUFFER_PAGE_DATA_OFFSET)
+/* Checks if the node buffer page lacks sufficient space for an index tuple */
+#define PAGE_NO_SPACE(nbp, itup) (PAGE_FREE_SPACE(nbp) < \
+ MAXALIGN(IndexTupleSize(itup)))
+
+/*
+ * GISTSTATE: information needed for any GiST index operation
+ *
+ * This struct retains call info for the index's opclass-specific support
+ * functions (per index column), plus the index's tuple descriptor.
+ *
+ * scanCxt holds the GISTSTATE itself as well as any data that lives for the
+ * lifetime of the index operation. We pass this to the support functions
+ * via fn_mcxt, so that they can store scan-lifespan data in it. The
+ * functions are invoked in tempCxt, which is typically short-lifespan
+ * (that is, it's reset after each tuple). However, tempCxt can be the same
+ * as scanCxt if we're not bothering with per-tuple context resets.
+ */
+typedef struct GISTSTATE
+{
+ MemoryContext scanCxt; /* context for scan-lifespan data */
+ MemoryContext tempCxt; /* short-term context for calling functions */
+
+ TupleDesc leafTupdesc; /* index's tuple descriptor */
+ TupleDesc nonLeafTupdesc; /* truncated tuple descriptor for non-leaf
+ * pages */
+ TupleDesc fetchTupdesc; /* tuple descriptor for tuples returned in an
+ * index-only scan */
+
+ FmgrInfo consistentFn[INDEX_MAX_KEYS];
+ FmgrInfo unionFn[INDEX_MAX_KEYS];
+ FmgrInfo compressFn[INDEX_MAX_KEYS];
+ FmgrInfo decompressFn[INDEX_MAX_KEYS];
+ FmgrInfo penaltyFn[INDEX_MAX_KEYS];
+ FmgrInfo picksplitFn[INDEX_MAX_KEYS];
+ FmgrInfo equalFn[INDEX_MAX_KEYS];
+ FmgrInfo distanceFn[INDEX_MAX_KEYS];
+ FmgrInfo fetchFn[INDEX_MAX_KEYS];
+
+ /* Collations to pass to the support functions */
+ Oid supportCollation[INDEX_MAX_KEYS];
+} GISTSTATE;
+
+
+/*
+ * During a GiST index search, we must maintain a queue of unvisited items,
+ * which can be either individual heap tuples or whole index pages. If it
+ * is an ordered search, the unvisited items should be visited in distance
+ * order. Unvisited items at the same distance should be visited in
+ * depth-first order, that is heap items first, then lower index pages, then
+ * upper index pages; this rule avoids doing extra work during a search that
+ * ends early due to LIMIT.
+ *
+ * To perform an ordered search, we use a pairing heap to manage the
+ * distance-order queue. In a non-ordered search (no order-by operators),
+ * we use it to return heap tuples before unvisited index pages, to
+ * ensure depth-first order, but all entries are otherwise considered
+ * equal.
+ */
+
+/* Individual heap tuple to be visited */
+typedef struct GISTSearchHeapItem
+{
+ ItemPointerData heapPtr;
+ bool recheck; /* T if quals must be rechecked */
+ bool recheckDistances; /* T if distances must be rechecked */
+ HeapTuple recontup; /* data reconstructed from the index, used in
+ * index-only scans */
+ OffsetNumber offnum; /* track offset in page to mark tuple as
+ * LP_DEAD */
+} GISTSearchHeapItem;
+
+/* Unvisited item, either index page or heap tuple */
+typedef struct GISTSearchItem
+{
+ pairingheap_node phNode;
+ BlockNumber blkno; /* index page number, or InvalidBlockNumber */
+ union
+ {
+ GistNSN parentlsn; /* parent page's LSN, if index page */
+ /* we must store parentlsn to detect whether a split occurred */
+ GISTSearchHeapItem heap; /* heap info, if heap tuple */
+ } data;
+
+ /* numberOfOrderBys entries */
+ IndexOrderByDistance distances[FLEXIBLE_ARRAY_MEMBER];
+} GISTSearchItem;
+
+#define GISTSearchItemIsHeap(item) ((item).blkno == InvalidBlockNumber)
+
+#define SizeOfGISTSearchItem(n_distances) \
+ (offsetof(GISTSearchItem, distances) + \
+ sizeof(IndexOrderByDistance) * (n_distances))
+
+/*
+ * GISTScanOpaqueData: private state for a scan of a GiST index
+ */
+typedef struct GISTScanOpaqueData
+{
+ GISTSTATE *giststate; /* index information, see above */
+ Oid *orderByTypes; /* datatypes of ORDER BY expressions */
+
+ pairingheap *queue; /* queue of unvisited items */
+ MemoryContext queueCxt; /* context holding the queue */
+ bool qual_ok; /* false if qual can never be satisfied */
+ bool firstCall; /* true until first gistgettuple call */
+
+ /* pre-allocated workspace arrays */
+ IndexOrderByDistance *distances; /* output area for gistindex_keytest */
+
+ /* info about killed items if any (killedItems is NULL if never used) */
+ OffsetNumber *killedItems; /* offset numbers of killed items */
+ int numKilled; /* number of currently stored items */
+ BlockNumber curBlkno; /* current block number */
+ GistNSN curPageLSN; /* pos in the WAL stream when page was read */
+
+ /* In a non-ordered search, returnable heap items are stored here: */
+ GISTSearchHeapItem pageData[BLCKSZ / sizeof(IndexTupleData)];
+ OffsetNumber nPageData; /* number of valid items in array */
+ OffsetNumber curPageData; /* next item to return */
+ MemoryContext pageDataCxt; /* context holding the fetched tuples, for
+ * index-only scans */
+} GISTScanOpaqueData;
+
+typedef GISTScanOpaqueData *GISTScanOpaque;
+
+/* despite the name, gistxlogPage is not part of any xlog record */
+typedef struct gistxlogPage
+{
+ BlockNumber blkno;
+ int num; /* number of index tuples following */
+} gistxlogPage;
+
+/* SplitedPageLayout - gistSplit function result */
+typedef struct SplitedPageLayout
+{
+ gistxlogPage block;
+ IndexTupleData *list;
+ int lenlist;
+ IndexTuple itup; /* union key for page */
+ Page page; /* page to operate on */
+ Buffer buffer; /* buffer to write out once processing is done */
+
+ struct SplitedPageLayout *next;
+} SplitedPageLayout;
+
+/*
+ * GISTInsertStack is used for locking buffers and passing arguments during
+ * insertion
+ */
+typedef struct GISTInsertStack
+{
+ /* current page */
+ BlockNumber blkno;
+ Buffer buffer;
+ Page page;
+
+ /*
+ * log sequence number taken from page->lsn, used to recognize page
+ * updates; compared with the page's NSN to recognize page splits
+ */
+ GistNSN lsn;
+
+ /*
+ * If set, we split the page while descending the tree to find an
+ * insertion target. It means that we need to retry from the parent,
+ * because the downlink of this page might no longer cover the new key.
+ */
+ bool retry_from_parent;
+
+ /* offset of the downlink in the parent page that points to this page */
+ OffsetNumber downlinkoffnum;
+
+ /* pointer to parent */
+ struct GISTInsertStack *parent;
+} GISTInsertStack;
+
+/* Working state and results for multi-column split logic in gistsplit.c */
+typedef struct GistSplitVector
+{
+ GIST_SPLITVEC splitVector; /* passed to/from user PickSplit method */
+
+ Datum spl_lattr[INDEX_MAX_KEYS]; /* Union of subkeys in
+ * splitVector.spl_left */
+ bool spl_lisnull[INDEX_MAX_KEYS];
+
+ Datum spl_rattr[INDEX_MAX_KEYS]; /* Union of subkeys in
+ * splitVector.spl_right */
+ bool spl_risnull[INDEX_MAX_KEYS];
+
+ bool *spl_dontcare; /* flags tuples which could go to either side
+ * of the split for zero penalty */
+} GistSplitVector;
+
+typedef struct
+{
+ Relation r;
+ Relation heapRel;
+ Size freespace; /* free space to be left */
+ bool is_build;
+
+ GISTInsertStack *stack;
+} GISTInsertState;
+
+/* root page of a gist index */
+#define GIST_ROOT_BLKNO 0
+
+/*
+ * Before PostgreSQL 9.1, we used to rely on so-called "invalid tuples" on
+ * inner pages to finish crash recovery of incomplete page splits. If a crash
+ * happened in the middle of a page split, so that the downlink pointers were
+ * not yet inserted, crash recovery inserted a special downlink pointer. The
+ * semantics of an invalid tuple were that if you encounter one in a scan,
+ * it must always be followed, because we don't know if the tuples on the
+ * child page match or not.
+ *
+ * We no longer create such invalid tuples, we now mark the left-half of such
+ * an incomplete split with the F_FOLLOW_RIGHT flag instead, and finish the
+ * split properly the next time we need to insert on that page. To retain
+ * on-disk compatibility for the sake of pg_upgrade, we still store 0xffff as
+ * the offset number of all inner tuples. If we encounter any invalid tuples
+ * with 0xfffe during insertion, we throw an error, though scans still handle
+ * them. You should only encounter invalid tuples if you pg_upgrade a pre-9.1
+ * gist index which already has invalid tuples in it because of a crash. That
+ * should be rare, and you are recommended to REINDEX anyway if you have any
+ * invalid tuples in an index, so throwing an error is as far as we go with
+ * supporting that.
+ */
+#define TUPLE_IS_VALID 0xffff
+#define TUPLE_IS_INVALID 0xfffe
+
+#define GistTupleIsInvalid(itup) ( ItemPointerGetOffsetNumber( &((itup)->t_tid) ) == TUPLE_IS_INVALID )
+#define GistTupleSetValid(itup) ItemPointerSetOffsetNumber( &((itup)->t_tid), TUPLE_IS_VALID )
+
+
+
+
+/*
+ * A buffer attached to an internal node, used when building an index in
+ * buffering mode.
+ */
+typedef struct
+{
+ BlockNumber nodeBlocknum; /* index block # this buffer is for */
+ int32 blocksCount; /* current # of blocks occupied by buffer */
+
+ BlockNumber pageBlocknum; /* temporary file block # */
+ GISTNodeBufferPage *pageBuffer; /* in-memory buffer page */
+
+ /* is this buffer queued for emptying? */
+ bool queuedForEmptying;
+
+ /* is this a temporary copy, not in the hash table? */
+ bool isTemp;
+
+ int level; /* 0 == leaf */
+} GISTNodeBuffer;
+
+/*
+ * Does specified level have buffers? (Beware of multiple evaluation of
+ * arguments.)
+ */
+#define LEVEL_HAS_BUFFERS(nlevel, gfbb) \
+ ((nlevel) != 0 && (nlevel) % (gfbb)->levelStep == 0 && \
+ (nlevel) != (gfbb)->rootlevel)
+
+/* Is specified buffer at least half-filled (should be queued for emptying)? */
+#define BUFFER_HALF_FILLED(nodeBuffer, gfbb) \
+ ((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer / 2)
+
+/*
+ * Is the specified buffer full? Our buffers can actually grow indefinitely,
+ * beyond the "maximum" size, so this just checks whether the buffer has
+ * grown beyond the nominal maximum size.
+ */
+#define BUFFER_OVERFLOWED(nodeBuffer, gfbb) \
+ ((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer)
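+
+/*
+ * Illustrative sketch (editorial addition, not part of the original header):
+ * how these macros are typically combined during a buffering build when an
+ * index tuple is routed to a lower level.  gfbb, giststate, childblkno,
+ * level and itup are assumed to be supplied by the caller.
+ *
+ *     if (LEVEL_HAS_BUFFERS(level, gfbb))
+ *     {
+ *         GISTNodeBuffer *nb = gistGetNodeBuffer(gfbb, giststate,
+ *                                                childblkno, level);
+ *
+ *         gistPushItupToNodeBuffer(gfbb, nb, itup);
+ *         if (BUFFER_HALF_FILLED(nb, gfbb) && !nb->queuedForEmptying)
+ *         {
+ *             nb->queuedForEmptying = true;
+ *             gfbb->bufferEmptyingQueue =
+ *                 lcons(nb, gfbb->bufferEmptyingQueue);
+ *         }
+ *     }
+ */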
+
+/*
+ * Data structure with general information about build buffers.
+ */
+typedef struct GISTBuildBuffers
+{
+ /* Persistent memory context for the buffers and metadata. */
+ MemoryContext context;
+
+ BufFile *pfile; /* Temporary file to store buffers in */
+ long nFileBlocks; /* Current size of the temporary file */
+
+ /*
+ * resizable array of free blocks.
+ */
+ long *freeBlocks;
+ int nFreeBlocks; /* # of currently free blocks in the array */
+ int freeBlocksLen; /* current allocated length of the array */
+
+ /* Hash for buffers by block number */
+ HTAB *nodeBuffersTab;
+
+ /* List of buffers scheduled for emptying */
+ List *bufferEmptyingQueue;
+
+ /*
+ * Parameters to the buffering build algorithm. levelStep determines which
+ * levels in the tree have buffers, and pagesPerBuffer determines how
+ * large each buffer is.
+ */
+ int levelStep;
+ int pagesPerBuffer;
+
+ /* Array of lists of buffers on each level, for final emptying */
+ List **buffersOnLevels;
+ int buffersOnLevelsLen;
+
+ /*
+ * Dynamically-sized array of buffers that currently have their last page
+ * loaded in main memory.
+ */
+ GISTNodeBuffer **loadedBuffers;
+ int loadedBuffersCount; /* # of entries in loadedBuffers */
+ int loadedBuffersLen; /* allocated size of loadedBuffers */
+
+ /* Level of the current root node (= height of the index tree - 1) */
+ int rootlevel;
+} GISTBuildBuffers;
+
+/* GiSTOptions->buffering_mode values */
+typedef enum GistOptBufferingMode
+{
+ GIST_OPTION_BUFFERING_AUTO,
+ GIST_OPTION_BUFFERING_ON,
+ GIST_OPTION_BUFFERING_OFF
+} GistOptBufferingMode;
+
+/*
+ * Storage type for GiST's reloptions
+ */
+typedef struct GiSTOptions
+{
+ int32 vl_len_; /* varlena header (do not touch directly!) */
+ int fillfactor; /* page fill factor in percent (0..100) */
+ GistOptBufferingMode buffering_mode; /* buffering build mode */
+} GiSTOptions;
+
+/* gist.c */
+extern void gistbuildempty(Relation index);
+extern bool gistinsert(Relation r, Datum *values, bool *isnull,
+ ItemPointer ht_ctid, Relation heapRel,
+ IndexUniqueCheck checkUnique,
+ bool indexUnchanged,
+ struct IndexInfo *indexInfo);
+extern MemoryContext createTempGistContext(void);
+extern GISTSTATE *initGISTstate(Relation index);
+extern void freeGISTstate(GISTSTATE *giststate);
+extern void gistdoinsert(Relation r,
+ IndexTuple itup,
+ Size freespace,
+ GISTSTATE *giststate,
+ Relation heapRel,
+ bool is_build);
+
+/* A List of these is returned from gistplacetopage() in *splitinfo */
+typedef struct
+{
+ Buffer buf; /* the split page "half" */
+ IndexTuple downlink; /* downlink for this half. */
+} GISTPageSplitInfo;
+
+extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
+ Buffer buffer,
+ IndexTuple *itup, int ntup,
+ OffsetNumber oldoffnum, BlockNumber *newblkno,
+ Buffer leftchildbuf,
+ List **splitinfo,
+ bool markfollowright,
+ Relation heapRel,
+ bool is_build);
+
+extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
+ int len, GISTSTATE *giststate);
+
+/* gistxlog.c */
+extern XLogRecPtr gistXLogPageDelete(Buffer buffer,
+ FullTransactionId xid, Buffer parentBuffer,
+ OffsetNumber downlinkOffset);
+
+extern void gistXLogPageReuse(Relation rel, BlockNumber blkno,
+ FullTransactionId latestRemovedXid);
+
+extern XLogRecPtr gistXLogUpdate(Buffer buffer,
+ OffsetNumber *todelete, int ntodelete,
+ IndexTuple *itup, int ntup,
+ Buffer leftchild);
+
+extern XLogRecPtr gistXLogDelete(Buffer buffer, OffsetNumber *todelete,
+ int ntodelete, TransactionId latestRemovedXid);
+
+extern XLogRecPtr gistXLogSplit(bool page_is_leaf,
+ SplitedPageLayout *dist,
+ BlockNumber origrlink, GistNSN oldnsn,
+ Buffer leftchild, bool markfollowright);
+
+extern XLogRecPtr gistXLogAssignLSN(void);
+
+/* gistget.c */
+extern bool gistgettuple(IndexScanDesc scan, ScanDirection dir);
+extern int64 gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
+extern bool gistcanreturn(Relation index, int attno);
+
+/* gistvalidate.c */
+extern bool gistvalidate(Oid opclassoid);
+extern void gistadjustmembers(Oid opfamilyoid,
+ Oid opclassoid,
+ List *operators,
+ List *functions);
+
+/* gistutil.c */
+
+#define GiSTPageSize \
+ ( BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(GISTPageOpaqueData)) )
+
+#define GIST_MIN_FILLFACTOR 10
+#define GIST_DEFAULT_FILLFACTOR 90
+
+extern bytea *gistoptions(Datum reloptions, bool validate);
+extern bool gistproperty(Oid index_oid, int attno,
+ IndexAMProperty prop, const char *propname,
+ bool *res, bool *isnull);
+extern bool gistfitpage(IndexTuple *itvec, int len);
+extern bool gistnospace(Page page, IndexTuple *itvec, int len, OffsetNumber todelete, Size freespace);
+extern void gistcheckpage(Relation rel, Buffer buf);
+extern Buffer gistNewBuffer(Relation r);
+extern bool gistPageRecyclable(Page page);
+extern void gistfillbuffer(Page page, IndexTuple *itup, int len,
+ OffsetNumber off);
+extern IndexTuple *gistextractpage(Page page, int *len /* out */ );
+extern IndexTuple *gistjoinvector(IndexTuple *itvec, int *len,
+ IndexTuple *additvec, int addlen);
+extern IndexTupleData *gistfillitupvec(IndexTuple *vec, int veclen, int *memlen);
+
+extern IndexTuple gistunion(Relation r, IndexTuple *itvec,
+ int len, GISTSTATE *giststate);
+extern IndexTuple gistgetadjusted(Relation r,
+ IndexTuple oldtup,
+ IndexTuple addtup,
+ GISTSTATE *giststate);
+extern IndexTuple gistFormTuple(GISTSTATE *giststate,
+ Relation r, Datum *attdata, bool *isnull, bool isleaf);
+extern void gistCompressValues(GISTSTATE *giststate, Relation r,
+ Datum *attdata, bool *isnull, bool isleaf, Datum *compatt);
+
+extern OffsetNumber gistchoose(Relation r, Page p,
+ IndexTuple it,
+ GISTSTATE *giststate);
+
+extern void GISTInitBuffer(Buffer b, uint32 f);
+extern void gistinitpage(Page page, uint32 f);
+extern void gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e,
+ Datum k, Relation r, Page pg, OffsetNumber o,
+ bool l, bool isNull);
+
+extern float gistpenalty(GISTSTATE *giststate, int attno,
+ GISTENTRY *key1, bool isNull1,
+ GISTENTRY *key2, bool isNull2);
+extern void gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len,
+ Datum *attr, bool *isnull);
+extern bool gistKeyIsEQ(GISTSTATE *giststate, int attno, Datum a, Datum b);
+extern void gistDeCompressAtt(GISTSTATE *giststate, Relation r, IndexTuple tuple, Page p,
+ OffsetNumber o, GISTENTRY *attdata, bool *isnull);
+extern HeapTuple gistFetchTuple(GISTSTATE *giststate, Relation r,
+ IndexTuple tuple);
+extern void gistMakeUnionKey(GISTSTATE *giststate, int attno,
+ GISTENTRY *entry1, bool isnull1,
+ GISTENTRY *entry2, bool isnull2,
+ Datum *dst, bool *dstisnull);
+
+extern XLogRecPtr gistGetFakeLSN(Relation rel);
+
+/* gistvacuum.c */
+extern IndexBulkDeleteResult *gistbulkdelete(IndexVacuumInfo *info,
+ IndexBulkDeleteResult *stats,
+ IndexBulkDeleteCallback callback,
+ void *callback_state);
+extern IndexBulkDeleteResult *gistvacuumcleanup(IndexVacuumInfo *info,
+ IndexBulkDeleteResult *stats);
+
+/* gistsplit.c */
+extern void gistSplitByKey(Relation r, Page page, IndexTuple *itup,
+ int len, GISTSTATE *giststate,
+ GistSplitVector *v,
+ int attno);
+
+/* gistbuild.c */
+extern IndexBuildResult *gistbuild(Relation heap, Relation index,
+ struct IndexInfo *indexInfo);
+extern void gistValidateBufferingOption(const char *value);
+
+/* gistbuildbuffers.c */
+extern GISTBuildBuffers *gistInitBuildBuffers(int pagesPerBuffer, int levelStep,
+ int maxLevel);
+extern GISTNodeBuffer *gistGetNodeBuffer(GISTBuildBuffers *gfbb,
+ GISTSTATE *giststate,
+ BlockNumber blkno, int level);
+extern void gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb,
+ GISTNodeBuffer *nodeBuffer, IndexTuple item);
+extern bool gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb,
+ GISTNodeBuffer *nodeBuffer, IndexTuple *item);
+extern void gistFreeBuildBuffers(GISTBuildBuffers *gfbb);
+extern void gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb,
+ GISTSTATE *giststate, Relation r,
+ int level, Buffer buffer,
+ List *splitinfo);
+extern void gistUnloadNodeBuffers(GISTBuildBuffers *gfbb);
+
+#endif /* GIST_PRIVATE_H */
diff --git a/src/include/access/gistscan.h b/src/include/access/gistscan.h
new file mode 100644
index 0000000..54451b5
--- /dev/null
+++ b/src/include/access/gistscan.h
@@ -0,0 +1,24 @@
+/*-------------------------------------------------------------------------
+ *
+ * gistscan.h
+ * routines defined in access/gist/gistscan.c
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/gistscan.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GISTSCAN_H
+#define GISTSCAN_H
+
+#include "access/amapi.h"
+
+extern IndexScanDesc gistbeginscan(Relation r, int nkeys, int norderbys);
+extern void gistrescan(IndexScanDesc scan, ScanKey key, int nkeys,
+ ScanKey orderbys, int norderbys);
+extern void gistendscan(IndexScanDesc scan);
+
+#endif /* GISTSCAN_H */
diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h
new file mode 100644
index 0000000..fd5144f
--- /dev/null
+++ b/src/include/access/gistxlog.h
@@ -0,0 +1,114 @@
+/*-------------------------------------------------------------------------
+ *
+ * gistxlog.h
+ * gist xlog routines
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/gistxlog.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GIST_XLOG_H
+#define GIST_XLOG_H
+
+#include "access/gist.h"
+#include "access/xlogreader.h"
+#include "lib/stringinfo.h"
+
+#define XLOG_GIST_PAGE_UPDATE 0x00
+#define XLOG_GIST_DELETE 0x10 /* delete leaf index tuples for a
+ * page */
+#define XLOG_GIST_PAGE_REUSE 0x20 /* old page is about to be reused
+ * from FSM */
+#define XLOG_GIST_PAGE_SPLIT 0x30
+ /* #define XLOG_GIST_INSERT_COMPLETE 0x40 */ /* not used anymore */
+ /* #define XLOG_GIST_CREATE_INDEX 0x50 */ /* not used anymore */
+#define XLOG_GIST_PAGE_DELETE 0x60
+#define XLOG_GIST_ASSIGN_LSN 0x70 /* nop, assign new LSN */
+
+/*
+ * Backup Blk 0: updated page.
+ * Backup Blk 1: if this operation completes a page split (by inserting the
+ * downlink for the split page), the left half of that split
+ */
+typedef struct gistxlogPageUpdate
+{
+ /* number of deleted offsets */
+ uint16 ntodelete;
+ uint16 ntoinsert;
+
+ /*
+ * In payload of blk 0 : 1. todelete OffsetNumbers 2. tuples to insert
+ */
+} gistxlogPageUpdate;
+
+/*
+ * Backup Blk 0: Leaf page, whose index tuples are deleted.
+ */
+typedef struct gistxlogDelete
+{
+ TransactionId latestRemovedXid;
+ uint16 ntodelete; /* number of deleted offsets */
+
+ /*
+ * In payload of blk 0 : todelete OffsetNumbers
+ */
+} gistxlogDelete;
+
+#define SizeOfGistxlogDelete (offsetof(gistxlogDelete, ntodelete) + sizeof(uint16))
+
+/*
+ * Backup Blk 0: if this operation completes a page split (by inserting the
+ * downlink for the split page), the left half of that split
+ * Backup Blk 1 - npage: split pages (1 is the original page)
+ */
+typedef struct gistxlogPageSplit
+{
+ BlockNumber origrlink; /* rightlink of the page before split */
+ GistNSN orignsn; /* NSN of the page before split */
+ bool origleaf; /* was the split page a leaf page? */
+
+ uint16 npage; /* # of pages in the split */
+ bool markfollowright; /* set F_FOLLOW_RIGHT flags */
+
+ /*
+ * follow: 1. gistxlogPage and array of IndexTupleData per page
+ */
+} gistxlogPageSplit;
+
+/*
+ * Backup Blk 0: page that was deleted.
+ * Backup Blk 1: parent page, containing the downlink to the deleted page.
+ */
+typedef struct gistxlogPageDelete
+{
+ FullTransactionId deleteXid; /* last Xid which could see page in scan */
+ OffsetNumber downlinkOffset; /* Offset of downlink referencing this
+ * page */
+} gistxlogPageDelete;
+
+#define SizeOfGistxlogPageDelete (offsetof(gistxlogPageDelete, downlinkOffset) + sizeof(OffsetNumber))
+
+
+/*
+ * This is what we need to know about page reuse, for hot standby.
+ */
+typedef struct gistxlogPageReuse
+{
+ RelFileNode node;
+ BlockNumber block;
+ FullTransactionId latestRemovedFullXid;
+} gistxlogPageReuse;
+
+#define SizeOfGistxlogPageReuse (offsetof(gistxlogPageReuse, latestRemovedFullXid) + sizeof(FullTransactionId))
+
+extern void gist_redo(XLogReaderState *record);
+extern void gist_desc(StringInfo buf, XLogReaderState *record);
+extern const char *gist_identify(uint8 info);
+extern void gist_xlog_startup(void);
+extern void gist_xlog_cleanup(void);
+extern void gist_mask(char *pagedata, BlockNumber blkno);
+
+#endif /* GIST_XLOG_H */
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
new file mode 100644
index 0000000..1cce865
--- /dev/null
+++ b/src/include/access/hash.h
@@ -0,0 +1,483 @@
+/*-------------------------------------------------------------------------
+ *
+ * hash.h
+ * header file for postgres hash access method implementation
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/hash.h
+ *
+ * NOTES
+ * modeled after Margo Seltzer's hash implementation for unix.
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef HASH_H
+#define HASH_H
+
+#include "access/amapi.h"
+#include "access/itup.h"
+#include "access/sdir.h"
+#include "catalog/pg_am_d.h"
+#include "common/hashfn.h"
+#include "lib/stringinfo.h"
+#include "storage/bufmgr.h"
+#include "storage/lockdefs.h"
+#include "utils/hsearch.h"
+#include "utils/relcache.h"
+
+/*
+ * Mapping from hash bucket number to physical block number of bucket's
+ * starting page. Beware of multiple evaluations of argument!
+ */
+typedef uint32 Bucket;
+
+#define InvalidBucket ((Bucket) 0xFFFFFFFF)
+
+#define BUCKET_TO_BLKNO(metap,B) \
+ ((BlockNumber) ((B) + ((B) ? (metap)->hashm_spares[_hash_spareindex((B)+1)-1] : 0)) + 1)
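+
+/*
+ * Illustrative sketch (editorial addition, not part of the original header):
+ * locking the primary page of a bucket, given a (possibly cached) copy of
+ * the metapage.  The Relation rel and Bucket bucket are assumed to be
+ * supplied by the caller.
+ *
+ *     HashMetaPage metap;
+ *     Buffer      metabuf = InvalidBuffer;
+ *     Buffer      buf;
+ *     BlockNumber blkno;
+ *
+ *     metap = _hash_getcachedmetap(rel, &metabuf, false);
+ *     blkno = BUCKET_TO_BLKNO(metap, bucket);
+ *     buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
+ */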
+
+/*
+ * Special space for hash index pages.
+ *
+ * hasho_flag's LH_PAGE_TYPE bits tell us which type of page we're looking at.
+ * Additional bits in the flag word are used for more transient purposes.
+ *
+ * To test a page's type, do (hasho_flag & LH_PAGE_TYPE) == LH_xxx_PAGE.
+ * However, we ensure that each used page type has a distinct bit so that
+ * we can OR together page types for uses such as the allowable-page-types
+ * argument of _hash_checkpage().
+ */
+#define LH_UNUSED_PAGE (0)
+#define LH_OVERFLOW_PAGE (1 << 0)
+#define LH_BUCKET_PAGE (1 << 1)
+#define LH_BITMAP_PAGE (1 << 2)
+#define LH_META_PAGE (1 << 3)
+#define LH_BUCKET_BEING_POPULATED (1 << 4)
+#define LH_BUCKET_BEING_SPLIT (1 << 5)
+#define LH_BUCKET_NEEDS_SPLIT_CLEANUP (1 << 6)
+#define LH_PAGE_HAS_DEAD_TUPLES (1 << 7)
+
+#define LH_PAGE_TYPE \
+ (LH_OVERFLOW_PAGE | LH_BUCKET_PAGE | LH_BITMAP_PAGE | LH_META_PAGE)
+
+/*
+ * In an overflow page, hasho_prevblkno stores the block number of the previous
+ * page in the bucket chain; in a bucket page, hasho_prevblkno stores the
+ * hashm_maxbucket value as of the last time the bucket was split, or
+ * else as of the time the bucket was created. The latter convention is used
+ * to determine whether a cached copy of the metapage is too stale to be used
+ * without needing to lock or pin the metapage.
+ *
+ * hasho_nextblkno is always the block number of the next page in the
+ * bucket chain, or InvalidBlockNumber if there are no more such pages.
+ */
+typedef struct HashPageOpaqueData
+{
+ BlockNumber hasho_prevblkno; /* see above */
+ BlockNumber hasho_nextblkno; /* see above */
+ Bucket hasho_bucket; /* bucket number this pg belongs to */
+ uint16 hasho_flag; /* page type code + flag bits, see above */
+ uint16 hasho_page_id; /* for identification of hash indexes */
+} HashPageOpaqueData;
+
+typedef HashPageOpaqueData *HashPageOpaque;
+
+#define H_NEEDS_SPLIT_CLEANUP(opaque) (((opaque)->hasho_flag & LH_BUCKET_NEEDS_SPLIT_CLEANUP) != 0)
+#define H_BUCKET_BEING_SPLIT(opaque) (((opaque)->hasho_flag & LH_BUCKET_BEING_SPLIT) != 0)
+#define H_BUCKET_BEING_POPULATED(opaque) (((opaque)->hasho_flag & LH_BUCKET_BEING_POPULATED) != 0)
+#define H_HAS_DEAD_TUPLES(opaque) (((opaque)->hasho_flag & LH_PAGE_HAS_DEAD_TUPLES) != 0)
+
+/*
+ * The page ID is for the convenience of pg_filedump and similar utilities,
+ * which otherwise would have a hard time telling pages of different index
+ * types apart. It should be the last 2 bytes on the page. This is more or
+ * less "free" due to alignment considerations.
+ */
+#define HASHO_PAGE_ID 0xFF80
+
+typedef struct HashScanPosItem /* what we remember about each match */
+{
+ ItemPointerData heapTid; /* TID of referenced heap item */
+ OffsetNumber indexOffset; /* index item's location within page */
+} HashScanPosItem;
+
+typedef struct HashScanPosData
+{
+ Buffer buf; /* if valid, the buffer is pinned */
+ BlockNumber currPage; /* current hash index page */
+ BlockNumber nextPage; /* next overflow page */
+ BlockNumber prevPage; /* prev overflow or bucket page */
+
+ /*
+ * The items array is always ordered in index order (ie, increasing
+ * indexoffset). When scanning backwards it is convenient to fill the
+ * array back-to-front, so we start at the last slot and fill downwards.
+ * Hence we need both a first-valid-entry and a last-valid-entry counter.
+ * itemIndex is a cursor showing which entry was last returned to caller.
+ */
+ int firstItem; /* first valid index in items[] */
+ int lastItem; /* last valid index in items[] */
+ int itemIndex; /* current index in items[] */
+
+ HashScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */
+} HashScanPosData;
+
+#define HashScanPosIsPinned(scanpos) \
+( \
+ AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
+ !BufferIsValid((scanpos).buf)), \
+ BufferIsValid((scanpos).buf) \
+)
+
+#define HashScanPosIsValid(scanpos) \
+( \
+ AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
+ !BufferIsValid((scanpos).buf)), \
+ BlockNumberIsValid((scanpos).currPage) \
+)
+
+#define HashScanPosInvalidate(scanpos) \
+ do { \
+ (scanpos).buf = InvalidBuffer; \
+ (scanpos).currPage = InvalidBlockNumber; \
+ (scanpos).nextPage = InvalidBlockNumber; \
+ (scanpos).prevPage = InvalidBlockNumber; \
+ (scanpos).firstItem = 0; \
+ (scanpos).lastItem = 0; \
+ (scanpos).itemIndex = 0; \
+ } while (0)
+
+/*
+ * HashScanOpaqueData is private state for a hash index scan.
+ */
+typedef struct HashScanOpaqueData
+{
+ /* Hash value of the scan key, ie, the hash key we seek */
+ uint32 hashso_sk_hash;
+
+ /* remember the buffer associated with primary bucket */
+ Buffer hashso_bucket_buf;
+
+ /*
+ * remember the buffer associated with the primary bucket page of the
+ * bucket being split; it is needed while scanning the bucket that is
+ * being populated during the split operation.
+ */
+ Buffer hashso_split_bucket_buf;
+
+ /* Whether scan starts on bucket being populated due to split */
+ bool hashso_buc_populated;
+
+ /*
+ * Whether we are scanning the bucket being split. This field is consulted
+ * only when hashso_buc_populated is true.
+ */
+ bool hashso_buc_split;
+ /* info about killed items if any (killedItems is NULL if never used) */
+ int *killedItems; /* currPos.items indexes of killed items */
+ int numKilled; /* number of currently stored items */
+
+ /*
+ * Identify all the matching items on a page and save them in
+ * HashScanPosData
+ */
+ HashScanPosData currPos; /* current position data */
+} HashScanOpaqueData;
+
+typedef HashScanOpaqueData *HashScanOpaque;
+
+/*
+ * Definitions for metapage.
+ */
+
+#define HASH_METAPAGE 0 /* metapage is always block 0 */
+
+#define HASH_MAGIC 0x6440640
+#define HASH_VERSION 4
+
+/*
+ * spares[] holds the number of overflow pages currently allocated at or
+ * before a certain splitpoint. For example, if spares[3] = 7 then there are
+ * 7 ovflpages before splitpoint 3 (compare BUCKET_TO_BLKNO macro). The
+ * value in spares[ovflpoint] increases as overflow pages are added at the
+ * end of the index. Once ovflpoint increases (ie, we have actually allocated
+ * the bucket pages belonging to that splitpoint) the number of spares at the
+ * prior splitpoint cannot change anymore.
+ *
+ * ovflpages that have been recycled for reuse can be found by looking at
+ * bitmaps that are stored within ovflpages dedicated for the purpose.
+ * The blknos of these bitmap pages are kept in mapp[]; nmaps is the
+ * number of currently existing bitmaps.
+ *
+ * The limitation on the size of spares[] comes from the fact that there's
+ * no point in having more than 2^32 buckets with only uint32 hashcodes.
+ * (Note: HASH_MAX_SPLITPOINTS, the size of spares[], is sized to accommodate
+ * the multi-phase allocation of buckets that begins after
+ * HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE.)
+ *
+ * There is no particular upper limit on the size of mapp[], other than
+ * needing to fit into the metapage. (With 8K block size, 1024 bitmaps
+ * limit us to 256 GB of overflow space...). For smaller block sizes we
+ * cannot use 1024 bitmaps, since the metapage data would then cross the
+ * block size boundary; so we use BLCKSZ to determine the maximum number
+ * of bitmaps.
+ */
+#define HASH_MAX_BITMAPS Min(BLCKSZ / 8, 1024)
+
+#define HASH_SPLITPOINT_PHASE_BITS 2
+#define HASH_SPLITPOINT_PHASES_PER_GRP (1 << HASH_SPLITPOINT_PHASE_BITS)
+#define HASH_SPLITPOINT_PHASE_MASK (HASH_SPLITPOINT_PHASES_PER_GRP - 1)
+#define HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE 10
+
+/* defines max number of splitpoint phases a hash index can have */
+#define HASH_MAX_SPLITPOINT_GROUP 32
+#define HASH_MAX_SPLITPOINTS \
+ (((HASH_MAX_SPLITPOINT_GROUP - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) * \
+ HASH_SPLITPOINT_PHASES_PER_GRP) + \
+ HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
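+
+/*
+ * Worked example (editorial note, not part of the original header): with the
+ * constants above, the first 10 splitpoint groups have one phase each and
+ * the remaining 32 - 10 = 22 groups have 4 phases each, so
+ * HASH_MAX_SPLITPOINTS = 22 * 4 + 10 = 98 entries in hashm_spares[].
+ */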
+
+typedef struct HashMetaPageData
+{
+ uint32 hashm_magic; /* magic no. for hash tables */
+ uint32 hashm_version; /* version ID */
+ double hashm_ntuples; /* number of tuples stored in the table */
+ uint16 hashm_ffactor; /* target fill factor (tuples/bucket) */
+ uint16 hashm_bsize; /* index page size (bytes) */
+ uint16 hashm_bmsize; /* bitmap array size (bytes) - must be a power
+ * of 2 */
+ uint16 hashm_bmshift; /* log2(bitmap array size in BITS) */
+ uint32 hashm_maxbucket; /* ID of maximum bucket in use */
+ uint32 hashm_highmask; /* mask to modulo into entire table */
+ uint32 hashm_lowmask; /* mask to modulo into lower half of table */
+ uint32 hashm_ovflpoint; /* splitpoint from which ovflpage being
+ * allocated */
+ uint32 hashm_firstfree; /* lowest-number free ovflpage (bit#) */
+ uint32 hashm_nmaps; /* number of bitmap pages */
+ RegProcedure hashm_procid; /* hash function id from pg_proc */
+ uint32 hashm_spares[HASH_MAX_SPLITPOINTS]; /* spare pages before each
+ * splitpoint */
+ BlockNumber hashm_mapp[HASH_MAX_BITMAPS]; /* blknos of ovfl bitmaps */
+} HashMetaPageData;
+
+typedef HashMetaPageData *HashMetaPage;
+
+typedef struct HashOptions
+{
+ int32 varlena_header_; /* varlena header (do not touch directly!) */
+ int fillfactor; /* page fill factor in percent (0..100) */
+} HashOptions;
+
+#define HashGetFillFactor(relation) \
+ (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
+ relation->rd_rel->relam == HASH_AM_OID), \
+ (relation)->rd_options ? \
+ ((HashOptions *) (relation)->rd_options)->fillfactor : \
+ HASH_DEFAULT_FILLFACTOR)
+#define HashGetTargetPageUsage(relation) \
+ (BLCKSZ * HashGetFillFactor(relation) / 100)
+
+/*
+ * Maximum size of a hash index item (it's okay to have only one per page)
+ */
+#define HashMaxItemSize(page) \
+ MAXALIGN_DOWN(PageGetPageSize(page) - \
+ SizeOfPageHeaderData - \
+ sizeof(ItemIdData) - \
+ MAXALIGN(sizeof(HashPageOpaqueData)))
+
+#define INDEX_MOVED_BY_SPLIT_MASK INDEX_AM_RESERVED_BIT
+
+#define HASH_MIN_FILLFACTOR 10
+#define HASH_DEFAULT_FILLFACTOR 75
+
+/*
+ * Constants
+ */
+#define BYTE_TO_BIT 3 /* 2^3 bits/byte */
+#define ALL_SET ((uint32) ~0)
+
+/*
+ * Bitmap pages do not contain tuples. They do contain the standard
+ * page headers and trailers; however, everything in between is a
+ * giant bit array. The number of bits that fit on a page obviously
+ * depends on the page size and the header/trailer overhead. We require
+ * the number of bits per page to be a power of 2.
+ */
+#define BMPGSZ_BYTE(metap) ((metap)->hashm_bmsize)
+#define BMPGSZ_BIT(metap) ((metap)->hashm_bmsize << BYTE_TO_BIT)
+#define BMPG_SHIFT(metap) ((metap)->hashm_bmshift)
+#define BMPG_MASK(metap) (BMPGSZ_BIT(metap) - 1)
+
+#define HashPageGetBitmap(page) \
+ ((uint32 *) PageGetContents(page))
+
+#define HashGetMaxBitmapSize(page) \
+ (PageGetPageSize((Page) page) - \
+ (MAXALIGN(SizeOfPageHeaderData) + MAXALIGN(sizeof(HashPageOpaqueData))))
+
+#define HashPageGetMeta(page) \
+ ((HashMetaPage) PageGetContents(page))
+
+/*
+ * The number of bits in an ovflpage bitmap word.
+ */
+#define BITS_PER_MAP 32 /* Number of bits in uint32 */
+
+/* Given the address of the beginning of a bit map, clear/set the nth bit */
+#define CLRBIT(A, N) ((A)[(N)/BITS_PER_MAP] &= ~(1<<((N)%BITS_PER_MAP)))
+#define SETBIT(A, N) ((A)[(N)/BITS_PER_MAP] |= (1<<((N)%BITS_PER_MAP)))
+#define ISSET(A, N) ((A)[(N)/BITS_PER_MAP] & (1<<((N)%BITS_PER_MAP)))
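+
+/*
+ * Illustrative sketch (editorial addition, not part of the original header):
+ * clearing the bitmap bit of an overflow page that is being freed, roughly
+ * as the overflow-page management code does.  metap, mappage and ovflblkno
+ * are assumed to be set up by the caller; locking and WAL logging are
+ * omitted.
+ *
+ *     uint32  bitno = _hash_ovflblkno_to_bitno(metap, ovflblkno);
+ *     uint32 *freep = HashPageGetBitmap(mappage);
+ *
+ *     Assert(ISSET(freep, bitno & BMPG_MASK(metap)));
+ *     CLRBIT(freep, bitno & BMPG_MASK(metap));
+ */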
+
+/*
+ * page-level and high-level locking modes (see README)
+ */
+#define HASH_READ BUFFER_LOCK_SHARE
+#define HASH_WRITE BUFFER_LOCK_EXCLUSIVE
+#define HASH_NOLOCK (-1)
+
+/*
+ * When a new operator class is declared, we require that the user supply
+ * us with an amproc function for hashing a key of the new type, returning
+ * a 32-bit hash value. We call this the "standard" hash function. We
+ * also allow an optional "extended" hash function which accepts a salt and
+ * returns a 64-bit hash value. This is highly recommended but, for reasons
+ * of backward compatibility, optional.
+ *
+ * When the salt is 0, the low 32 bits of the value returned by the extended
+ * hash function should match the value that would have been returned by the
+ * standard hash function.
+ */
+#define HASHSTANDARD_PROC 1
+#define HASHEXTENDED_PROC 2
+#define HASHOPTIONS_PROC 3
+#define HASHNProcs 3
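+
+/*
+ * Illustrative sketch (editorial addition, not part of the original header):
+ * checking the documented relationship between the standard and extended
+ * hash support functions for the first index column.  index, collation and
+ * key are assumed to be supplied by the caller.
+ *
+ *     uint32  h32;
+ *     uint64  h64;
+ *
+ *     h32 = DatumGetUInt32(FunctionCall1Coll(
+ *             index_getprocinfo(index, 1, HASHSTANDARD_PROC),
+ *             collation, key));
+ *     h64 = DatumGetUInt64(FunctionCall2Coll(
+ *             index_getprocinfo(index, 1, HASHEXTENDED_PROC),
+ *             collation, key, UInt64GetDatum(0)));
+ *     Assert(h32 == (uint32) h64);
+ */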
+
+
+/* public routines */
+
+extern IndexBuildResult *hashbuild(Relation heap, Relation index,
+ struct IndexInfo *indexInfo);
+extern void hashbuildempty(Relation index);
+extern bool hashinsert(Relation rel, Datum *values, bool *isnull,
+ ItemPointer ht_ctid, Relation heapRel,
+ IndexUniqueCheck checkUnique,
+ bool indexUnchanged,
+ struct IndexInfo *indexInfo);
+extern bool hashgettuple(IndexScanDesc scan, ScanDirection dir);
+extern int64 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
+extern IndexScanDesc hashbeginscan(Relation rel, int nkeys, int norderbys);
+extern void hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
+ ScanKey orderbys, int norderbys);
+extern void hashendscan(IndexScanDesc scan);
+extern IndexBulkDeleteResult *hashbulkdelete(IndexVacuumInfo *info,
+ IndexBulkDeleteResult *stats,
+ IndexBulkDeleteCallback callback,
+ void *callback_state);
+extern IndexBulkDeleteResult *hashvacuumcleanup(IndexVacuumInfo *info,
+ IndexBulkDeleteResult *stats);
+extern bytea *hashoptions(Datum reloptions, bool validate);
+extern bool hashvalidate(Oid opclassoid);
+extern void hashadjustmembers(Oid opfamilyoid,
+ Oid opclassoid,
+ List *operators,
+ List *functions);
+
+/* private routines */
+
+/* hashinsert.c */
+extern void _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel);
+extern OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf,
+ Size itemsize, IndexTuple itup);
+extern void _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups,
+ OffsetNumber *itup_offsets, uint16 nitups);
+
+/* hashovfl.c */
+extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin);
+extern BlockNumber _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
+ Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets,
+ Size *tups_size, uint16 nitups, BufferAccessStrategy bstrategy);
+extern void _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage);
+extern void _hash_squeezebucket(Relation rel,
+ Bucket bucket, BlockNumber bucket_blkno,
+ Buffer bucket_buf,
+ BufferAccessStrategy bstrategy);
+extern uint32 _hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno);
+
+/* hashpage.c */
+extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno,
+ int access, int flags);
+extern Buffer _hash_getbuf_with_condlock_cleanup(Relation rel,
+ BlockNumber blkno, int flags);
+extern HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf,
+ bool force_refresh);
+extern Buffer _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey,
+ int access,
+ HashMetaPage *cachedmetap);
+extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno);
+extern void _hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket,
+ uint32 flag, bool initpage);
+extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno,
+ ForkNumber forkNum);
+extern Buffer _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
+ int access, int flags,
+ BufferAccessStrategy bstrategy);
+extern void _hash_relbuf(Relation rel, Buffer buf);
+extern void _hash_dropbuf(Relation rel, Buffer buf);
+extern void _hash_dropscanbuf(Relation rel, HashScanOpaque so);
+extern uint32 _hash_init(Relation rel, double num_tuples,
+ ForkNumber forkNum);
+extern void _hash_init_metabuffer(Buffer buf, double num_tuples,
+ RegProcedure procid, uint16 ffactor, bool initpage);
+extern void _hash_pageinit(Page page, Size size);
+extern void _hash_expandtable(Relation rel, Buffer metabuf);
+extern void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf,
+ Bucket obucket, uint32 maxbucket, uint32 highmask,
+ uint32 lowmask);
+
+/* hashsearch.c */
+extern bool _hash_next(IndexScanDesc scan, ScanDirection dir);
+extern bool _hash_first(IndexScanDesc scan, ScanDirection dir);
+
+/* hashsort.c */
+typedef struct HSpool HSpool; /* opaque struct in hashsort.c */
+
+extern HSpool *_h_spoolinit(Relation heap, Relation index, uint32 num_buckets);
+extern void _h_spooldestroy(HSpool *hspool);
+extern void _h_spool(HSpool *hspool, ItemPointer self,
+ Datum *values, bool *isnull);
+extern void _h_indexbuild(HSpool *hspool, Relation heapRel);
+
+/* hashutil.c */
+extern bool _hash_checkqual(IndexScanDesc scan, IndexTuple itup);
+extern uint32 _hash_datum2hashkey(Relation rel, Datum key);
+extern uint32 _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype);
+extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
+ uint32 highmask, uint32 lowmask);
+extern uint32 _hash_spareindex(uint32 num_bucket);
+extern uint32 _hash_get_totalbuckets(uint32 splitpoint_phase);
+extern void _hash_checkpage(Relation rel, Buffer buf, int flags);
+extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup);
+extern bool _hash_convert_tuple(Relation index,
+ Datum *user_values, bool *user_isnull,
+ Datum *index_values, bool *index_isnull);
+extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value);
+extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value);
+extern BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket);
+extern BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket);
+extern Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
+ uint32 lowmask, uint32 maxbucket);
+extern void _hash_kill_items(IndexScanDesc scan);
+
+/* hash.c */
+extern void hashbucketcleanup(Relation rel, Bucket cur_bucket,
+ Buffer bucket_buf, BlockNumber bucket_blkno,
+ BufferAccessStrategy bstrategy,
+ uint32 maxbucket, uint32 highmask, uint32 lowmask,
+ double *tuples_removed, double *num_index_tuples,
+ bool split_cleanup,
+ IndexBulkDeleteCallback callback, void *callback_state);
+
+#endif /* HASH_H */
diff --git a/src/include/access/hash_xlog.h b/src/include/access/hash_xlog.h
new file mode 100644
index 0000000..4353a32
--- /dev/null
+++ b/src/include/access/hash_xlog.h
@@ -0,0 +1,267 @@
+/*-------------------------------------------------------------------------
+ *
+ * hash_xlog.h
+ * header file for Postgres hash AM implementation
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/hash_xlog.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef HASH_XLOG_H
+#define HASH_XLOG_H
+
+#include "access/xlogreader.h"
+#include "lib/stringinfo.h"
+#include "storage/off.h"
+
+/* Number of buffers required for XLOG_HASH_SQUEEZE_PAGE operation */
+#define HASH_XLOG_FREE_OVFL_BUFS 6
+
+/*
+ * XLOG records for hash operations
+ */
+#define XLOG_HASH_INIT_META_PAGE 0x00 /* initialize the meta page */
+#define XLOG_HASH_INIT_BITMAP_PAGE 0x10 /* initialize the bitmap page */
+#define XLOG_HASH_INSERT 0x20 /* add index tuple without split */
+#define XLOG_HASH_ADD_OVFL_PAGE 0x30 /* add overflow page */
+#define XLOG_HASH_SPLIT_ALLOCATE_PAGE 0x40 /* allocate new page for split */
+#define XLOG_HASH_SPLIT_PAGE 0x50 /* split page */
+#define XLOG_HASH_SPLIT_COMPLETE 0x60 /* completion of split operation */
+#define XLOG_HASH_MOVE_PAGE_CONTENTS 0x70 /* remove tuples from one page
+ * and add to another page */
+#define XLOG_HASH_SQUEEZE_PAGE 0x80 /* add tuples to one of the previous
+ * pages in chain and free the ovfl
+ * page */
+#define XLOG_HASH_DELETE 0x90 /* delete index tuples from a page */
+#define XLOG_HASH_SPLIT_CLEANUP 0xA0 /* clear split-cleanup flag in primary
+ * bucket page after deleting tuples
+ * that are moved due to split */
+#define XLOG_HASH_UPDATE_META_PAGE 0xB0 /* update meta page after vacuum */
+
+#define XLOG_HASH_VACUUM_ONE_PAGE 0xC0 /* remove dead tuples from index
+ * page */
+
+/*
+ * xl_hash_split_allocate_page flag values, 8 bits are available.
+ */
+#define XLH_SPLIT_META_UPDATE_MASKS (1<<0)
+#define XLH_SPLIT_META_UPDATE_SPLITPOINT (1<<1)
+
+/*
+ * This is what we need to know about a simple (non-split) insert.
+ *
+ * This data record is used for XLOG_HASH_INSERT
+ *
+ * Backup Blk 0: original page (data contains the inserted tuple)
+ * Backup Blk 1: metapage (HashMetaPageData)
+ */
+typedef struct xl_hash_insert
+{
+ OffsetNumber offnum;
+} xl_hash_insert;
+
+#define SizeOfHashInsert (offsetof(xl_hash_insert, offnum) + sizeof(OffsetNumber))
+
+/*
+ * This is what we need to know about addition of overflow page.
+ *
+ * This data record is used for XLOG_HASH_ADD_OVFL_PAGE
+ *
+ * Backup Blk 0: newly allocated overflow page
+ * Backup Blk 1: page before new overflow page in the bucket chain
+ * Backup Blk 2: bitmap page
+ * Backup Blk 3: new bitmap page
+ * Backup Blk 4: metapage
+ */
+typedef struct xl_hash_add_ovfl_page
+{
+ uint16 bmsize;
+ bool bmpage_found;
+} xl_hash_add_ovfl_page;
+
+#define SizeOfHashAddOvflPage \
+ (offsetof(xl_hash_add_ovfl_page, bmpage_found) + sizeof(bool))
+
+/*
+ * This is what we need to know about allocating a page for split.
+ *
+ * This data record is used for XLOG_HASH_SPLIT_ALLOCATE_PAGE
+ *
+ * Backup Blk 0: page for old bucket
+ * Backup Blk 1: page for new bucket
+ * Backup Blk 2: metapage
+ */
+typedef struct xl_hash_split_allocate_page
+{
+ uint32 new_bucket;
+ uint16 old_bucket_flag;
+ uint16 new_bucket_flag;
+ uint8 flags;
+} xl_hash_split_allocate_page;
+
+#define SizeOfHashSplitAllocPage \
+ (offsetof(xl_hash_split_allocate_page, flags) + sizeof(uint8))
+
+/*
+ * This is what we need to know about completing the split operation.
+ *
+ * This data record is used for XLOG_HASH_SPLIT_COMPLETE
+ *
+ * Backup Blk 0: page for old bucket
+ * Backup Blk 1: page for new bucket
+ */
+typedef struct xl_hash_split_complete
+{
+ uint16 old_bucket_flag;
+ uint16 new_bucket_flag;
+} xl_hash_split_complete;
+
+#define SizeOfHashSplitComplete \
+ (offsetof(xl_hash_split_complete, new_bucket_flag) + sizeof(uint16))
+
+/*
+ * This is what we need to know about moving page contents during a
+ * squeeze operation.
+ *
+ * This data record is used for XLOG_HASH_MOVE_PAGE_CONTENTS
+ *
+ * Backup Blk 0: bucket page
+ * Backup Blk 1: page containing moved tuples
+ * Backup Blk 2: page from which tuples will be removed
+ */
+typedef struct xl_hash_move_page_contents
+{
+ uint16 ntups;
+ bool is_prim_bucket_same_wrt; /* true if the page to which
+ * tuples are moved is same as
+ * primary bucket page */
+} xl_hash_move_page_contents;
+
+#define SizeOfHashMovePageContents \
+ (offsetof(xl_hash_move_page_contents, is_prim_bucket_same_wrt) + sizeof(bool))
+
+/*
+ * This is what we need to know about the squeeze page operation.
+ *
+ * This data record is used for XLOG_HASH_SQUEEZE_PAGE
+ *
+ * Backup Blk 0: page containing tuples moved from freed overflow page
+ * Backup Blk 1: freed overflow page
+ * Backup Blk 2: page previous to the freed overflow page
+ * Backup Blk 3: page next to the freed overflow page
+ * Backup Blk 4: bitmap page containing info of freed overflow page
+ * Backup Blk 5: meta page
+ */
+typedef struct xl_hash_squeeze_page
+{
+ BlockNumber prevblkno;
+ BlockNumber nextblkno;
+ uint16 ntups;
+ bool is_prim_bucket_same_wrt; /* true if the page to which
+ * tuples are moved is same as
+ * primary bucket page */
+ bool is_prev_bucket_same_wrt; /* true if the page to which
+ * tuples are moved is the page
+ * previous to the freed overflow
+ * page */
+} xl_hash_squeeze_page;
+
+#define SizeOfHashSqueezePage \
+ (offsetof(xl_hash_squeeze_page, is_prev_bucket_same_wrt) + sizeof(bool))
+
+/*
+ * This is what we need to know about the deletion of index tuples from a page.
+ *
+ * This data record is used for XLOG_HASH_DELETE
+ *
+ * Backup Blk 0: primary bucket page
+ * Backup Blk 1: page from which tuples are deleted
+ */
+typedef struct xl_hash_delete
+{
+ bool clear_dead_marking; /* true if this operation clears
+ * LH_PAGE_HAS_DEAD_TUPLES flag */
+ bool is_primary_bucket_page; /* true if the operation is for
+ * primary bucket page */
+} xl_hash_delete;
+
+#define SizeOfHashDelete (offsetof(xl_hash_delete, is_primary_bucket_page) + sizeof(bool))
+
+/*
+ * This is what we need for metapage update operation.
+ *
+ * This data record is used for XLOG_HASH_UPDATE_META_PAGE
+ *
+ * Backup Blk 0: meta page
+ */
+typedef struct xl_hash_update_meta_page
+{
+ double ntuples;
+} xl_hash_update_meta_page;
+
+#define SizeOfHashUpdateMetaPage \
+ (offsetof(xl_hash_update_meta_page, ntuples) + sizeof(double))
+
+/*
+ * This is what we need to initialize metapage.
+ *
+ * This data record is used for XLOG_HASH_INIT_META_PAGE
+ *
+ * Backup Blk 0: meta page
+ */
+typedef struct xl_hash_init_meta_page
+{
+ double num_tuples;
+ RegProcedure procid;
+ uint16 ffactor;
+} xl_hash_init_meta_page;
+
+#define SizeOfHashInitMetaPage \
+ (offsetof(xl_hash_init_meta_page, ffactor) + sizeof(uint16))
+
+/*
+ * This is what we need to initialize bitmap page.
+ *
+ * This data record is used for XLOG_HASH_INIT_BITMAP_PAGE
+ *
+ * Backup Blk 0: bitmap page
+ * Backup Blk 1: meta page
+ */
+typedef struct xl_hash_init_bitmap_page
+{
+ uint16 bmsize;
+} xl_hash_init_bitmap_page;
+
+#define SizeOfHashInitBitmapPage \
+ (offsetof(xl_hash_init_bitmap_page, bmsize) + sizeof(uint16))
+
+/*
+ * This is what we need for index tuple deletion and to
+ * update the meta page.
+ *
+ * This data record is used for XLOG_HASH_VACUUM_ONE_PAGE
+ *
+ * Backup Blk 0: bucket page
+ * Backup Blk 1: meta page
+ */
+typedef struct xl_hash_vacuum_one_page
+{
+ TransactionId latestRemovedXid;
+ int ntuples;
+
+ /* TARGET OFFSET NUMBERS FOLLOW AT THE END */
+} xl_hash_vacuum_one_page;
+
+#define SizeOfHashVacuumOnePage \
+ (offsetof(xl_hash_vacuum_one_page, ntuples) + sizeof(int))
+
+extern void hash_redo(XLogReaderState *record);
+extern void hash_desc(StringInfo buf, XLogReaderState *record);
+extern const char *hash_identify(uint8 info);
+extern void hash_mask(char *pagedata, BlockNumber blkno);
+
+#endif /* HASH_XLOG_H */
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
new file mode 100644
index 0000000..4f1dff9
--- /dev/null
+++ b/src/include/access/heapam.h
@@ -0,0 +1,235 @@
+/*-------------------------------------------------------------------------
+ *
+ * heapam.h
+ * POSTGRES heap access method definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/heapam.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef HEAPAM_H
+#define HEAPAM_H
+
+#include "access/relation.h" /* for backward compatibility */
+#include "access/relscan.h"
+#include "access/sdir.h"
+#include "access/skey.h"
+#include "access/table.h" /* for backward compatibility */
+#include "access/tableam.h"
+#include "nodes/lockoptions.h"
+#include "nodes/primnodes.h"
+#include "storage/bufpage.h"
+#include "storage/dsm.h"
+#include "storage/lockdefs.h"
+#include "storage/shm_toc.h"
+#include "utils/relcache.h"
+#include "utils/snapshot.h"
+
+
+/* "options" flag bits for heap_insert */
+#define HEAP_INSERT_SKIP_FSM TABLE_INSERT_SKIP_FSM
+#define HEAP_INSERT_FROZEN TABLE_INSERT_FROZEN
+#define HEAP_INSERT_NO_LOGICAL TABLE_INSERT_NO_LOGICAL
+#define HEAP_INSERT_SPECULATIVE 0x0010
+
+typedef struct BulkInsertStateData *BulkInsertState;
+struct TupleTableSlot;
+
+#define MaxLockTupleMode LockTupleExclusive
+
+/*
+ * Descriptor for heap table scans.
+ */
+typedef struct HeapScanDescData
+{
+ TableScanDescData rs_base; /* AM independent part of the descriptor */
+
+ /* state set up at initscan time */
+ BlockNumber rs_nblocks; /* total number of blocks in rel */
+ BlockNumber rs_startblock; /* block # to start at */
+ BlockNumber rs_numblocks; /* max number of blocks to scan */
+ /* rs_numblocks is usually InvalidBlockNumber, meaning "scan whole rel" */
+
+ /* scan current state */
+ bool rs_inited; /* false = scan not init'd yet */
+ BlockNumber rs_cblock; /* current block # in scan, if any */
+ Buffer rs_cbuf; /* current buffer in scan, if any */
+ /* NB: if rs_cbuf is not InvalidBuffer, we hold a pin on that buffer */
+
+ BufferAccessStrategy rs_strategy; /* access strategy for reads */
+
+ HeapTupleData rs_ctup; /* current tuple in scan, if any */
+
+ /*
+ * For parallel scans to store page allocation data. NULL when not
+ * performing a parallel scan.
+ */
+ ParallelBlockTableScanWorkerData *rs_parallelworkerdata;
+
+ /* these fields only used in page-at-a-time mode and for bitmap scans */
+ int rs_cindex; /* current tuple's index in vistuples */
+ int rs_ntuples; /* number of visible tuples on page */
+ OffsetNumber rs_vistuples[MaxHeapTuplesPerPage]; /* their offsets */
+} HeapScanDescData;
+typedef struct HeapScanDescData *HeapScanDesc;
+
+/*
+ * Descriptor for fetches from heap via an index.
+ */
+typedef struct IndexFetchHeapData
+{
+ IndexFetchTableData xs_base; /* AM independent part of the descriptor */
+
+ Buffer xs_cbuf; /* current heap buffer in scan, if any */
+ /* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */
+} IndexFetchHeapData;
+
+/* Result codes for HeapTupleSatisfiesVacuum */
+typedef enum
+{
+ HEAPTUPLE_DEAD, /* tuple is dead and deletable */
+ HEAPTUPLE_LIVE, /* tuple is live (committed, no deleter) */
+ HEAPTUPLE_RECENTLY_DEAD, /* tuple is dead, but not deletable yet */
+ HEAPTUPLE_INSERT_IN_PROGRESS, /* inserting xact is still in progress */
+ HEAPTUPLE_DELETE_IN_PROGRESS /* deleting xact is still in progress */
+} HTSV_Result;
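+
+/*
+ * Illustrative sketch (editorial addition, not part of the original header):
+ * how a vacuum-like caller might act on HeapTupleSatisfiesVacuum()'s result
+ * (declared further below).  tuple is a HeapTuple; OldestXmin and buffer are
+ * assumed to be set up by the caller.
+ *
+ *     switch (HeapTupleSatisfiesVacuum(tuple, OldestXmin, buffer))
+ *     {
+ *         case HEAPTUPLE_DEAD:
+ *             ... safe to remove the tuple ...
+ *             break;
+ *         case HEAPTUPLE_RECENTLY_DEAD:
+ *             ... dead, but possibly still visible to old snapshots ...
+ *             break;
+ *         default:
+ *             ... live or in-progress: must be kept ...
+ *             break;
+ *     }
+ */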
+
+/* ----------------
+ * function prototypes for heap access method
+ *
+ * heap_create, heap_create_with_catalog, and heap_drop_with_catalog
+ * are declared in catalog/heap.h
+ * ----------------
+ */
+
+
+/*
+ * HeapScanIsValid
+ * True iff the heap scan is valid.
+ */
+#define HeapScanIsValid(scan) PointerIsValid(scan)
+
+extern TableScanDesc heap_beginscan(Relation relation, Snapshot snapshot,
+ int nkeys, ScanKey key,
+ ParallelTableScanDesc parallel_scan,
+ uint32 flags);
+extern void heap_setscanlimits(TableScanDesc scan, BlockNumber startBlk,
+ BlockNumber numBlks);
+extern void heapgetpage(TableScanDesc scan, BlockNumber page);
+extern void heap_rescan(TableScanDesc scan, ScanKey key, bool set_params,
+ bool allow_strat, bool allow_sync, bool allow_pagemode);
+extern void heap_endscan(TableScanDesc scan);
+extern HeapTuple heap_getnext(TableScanDesc scan, ScanDirection direction);
+extern bool heap_getnextslot(TableScanDesc sscan,
+ ScanDirection direction, struct TupleTableSlot *slot);
+extern void heap_set_tidrange(TableScanDesc sscan, ItemPointer mintid,
+ ItemPointer maxtid);
+extern bool heap_getnextslot_tidrange(TableScanDesc sscan,
+ ScanDirection direction,
+ TupleTableSlot *slot);
+extern bool heap_fetch(Relation relation, Snapshot snapshot,
+ HeapTuple tuple, Buffer *userbuf);
+extern bool heap_fetch_extended(Relation relation, Snapshot snapshot,
+ HeapTuple tuple, Buffer *userbuf,
+ bool keep_buf);
+extern bool heap_hot_search_buffer(ItemPointer tid, Relation relation,
+ Buffer buffer, Snapshot snapshot, HeapTuple heapTuple,
+ bool *all_dead, bool first_call);
+
+extern void heap_get_latest_tid(TableScanDesc scan, ItemPointer tid);
+
+extern BulkInsertState GetBulkInsertState(void);
+extern void FreeBulkInsertState(BulkInsertState);
+extern void ReleaseBulkInsertStatePin(BulkInsertState bistate);
+
+extern void heap_insert(Relation relation, HeapTuple tup, CommandId cid,
+ int options, BulkInsertState bistate);
+extern void heap_multi_insert(Relation relation, struct TupleTableSlot **slots,
+ int ntuples, CommandId cid, int options,
+ BulkInsertState bistate);
+extern TM_Result heap_delete(Relation relation, ItemPointer tid,
+ CommandId cid, Snapshot crosscheck, bool wait,
+ struct TM_FailureData *tmfd, bool changingPart);
+extern void heap_finish_speculative(Relation relation, ItemPointer tid);
+extern void heap_abort_speculative(Relation relation, ItemPointer tid);
+extern TM_Result heap_update(Relation relation, ItemPointer otid,
+ HeapTuple newtup,
+ CommandId cid, Snapshot crosscheck, bool wait,
+ struct TM_FailureData *tmfd, LockTupleMode *lockmode);
+extern TM_Result heap_lock_tuple(Relation relation, HeapTuple tuple,
+ CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy,
+ bool follow_update,
+ Buffer *buffer, struct TM_FailureData *tmfd);
+
+extern void heap_inplace_update(Relation relation, HeapTuple tuple);
+extern bool heap_freeze_tuple(HeapTupleHeader tuple,
+ TransactionId relfrozenxid, TransactionId relminmxid,
+ TransactionId cutoff_xid, TransactionId cutoff_multi);
+extern bool heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid,
+ MultiXactId cutoff_multi, Buffer buf);
+extern bool heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple);
+
+extern void simple_heap_insert(Relation relation, HeapTuple tup);
+extern void simple_heap_delete(Relation relation, ItemPointer tid);
+extern void simple_heap_update(Relation relation, ItemPointer otid,
+ HeapTuple tup);
+
+extern TransactionId heap_index_delete_tuples(Relation rel,
+ TM_IndexDeleteOp *delstate);
+
+/* in heap/pruneheap.c */
+struct GlobalVisState;
+extern void heap_page_prune_opt(Relation relation, Buffer buffer);
+extern int heap_page_prune(Relation relation, Buffer buffer,
+ struct GlobalVisState *vistest,
+ TransactionId old_snap_xmin,
+ TimestampTz old_snap_ts_ts,
+ bool report_stats,
+ OffsetNumber *off_loc);
+extern void heap_page_prune_execute(Buffer buffer,
+ OffsetNumber *redirected, int nredirected,
+ OffsetNumber *nowdead, int ndead,
+ OffsetNumber *nowunused, int nunused);
+extern void heap_get_root_tuples(Page page, OffsetNumber *root_offsets);
+
+/* in heap/vacuumlazy.c */
+struct VacuumParams;
+extern void heap_vacuum_rel(Relation rel,
+ struct VacuumParams *params, BufferAccessStrategy bstrategy);
+extern void parallel_vacuum_main(dsm_segment *seg, shm_toc *toc);
+
+/* in heap/heapam_visibility.c */
+extern bool HeapTupleSatisfiesVisibility(HeapTuple stup, Snapshot snapshot,
+ Buffer buffer);
+extern TM_Result HeapTupleSatisfiesUpdate(HeapTuple stup, CommandId curcid,
+ Buffer buffer);
+extern HTSV_Result HeapTupleSatisfiesVacuum(HeapTuple stup, TransactionId OldestXmin,
+ Buffer buffer);
+extern HTSV_Result HeapTupleSatisfiesVacuumHorizon(HeapTuple stup, Buffer buffer,
+ TransactionId *dead_after);
+extern void HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer,
+ uint16 infomask, TransactionId xid);
+extern bool HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple);
+extern bool XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot);
+extern bool HeapTupleIsSurelyDead(HeapTuple htup,
+ struct GlobalVisState *vistest);
+
+/*
+ * To avoid leaking too much knowledge about reorderbuffer implementation
+ * details this is implemented in reorderbuffer.c not heapam_visibility.c
+ */
+struct HTAB;
+extern bool ResolveCminCmaxDuringDecoding(struct HTAB *tuplecid_data,
+ Snapshot snapshot,
+ HeapTuple htup,
+ Buffer buffer,
+ CommandId *cmin, CommandId *cmax);
+extern void HeapCheckForSerializableConflictOut(bool valid, Relation relation, HeapTuple tuple,
+ Buffer buffer, Snapshot snapshot);
+
+#endif /* HEAPAM_H */
diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h
new file mode 100644
index 0000000..27db481
--- /dev/null
+++ b/src/include/access/heapam_xlog.h
@@ -0,0 +1,419 @@
+/*-------------------------------------------------------------------------
+ *
+ * heapam_xlog.h
+ * POSTGRES heap access XLOG definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/heapam_xlog.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef HEAPAM_XLOG_H
+#define HEAPAM_XLOG_H
+
+#include "access/htup.h"
+#include "access/xlogreader.h"
+#include "lib/stringinfo.h"
+#include "storage/buf.h"
+#include "storage/bufpage.h"
+#include "storage/relfilenode.h"
+#include "utils/relcache.h"
+
+
+/*
+ * WAL record definitions for heapam.c's WAL operations
+ *
+ * XLOG allows us to store some information in the high 4 bits of the log
+ * record's xl_info field. We use 3 of them for the opcode and one for the
+ * init bit.
+ */
+#define XLOG_HEAP_INSERT 0x00
+#define XLOG_HEAP_DELETE 0x10
+#define XLOG_HEAP_UPDATE 0x20
+#define XLOG_HEAP_TRUNCATE 0x30
+#define XLOG_HEAP_HOT_UPDATE 0x40
+#define XLOG_HEAP_CONFIRM 0x50
+#define XLOG_HEAP_LOCK 0x60
+#define XLOG_HEAP_INPLACE 0x70
+
+#define XLOG_HEAP_OPMASK 0x70
+/*
+ * When we insert the 1st item on a new page in INSERT, UPDATE, HOT_UPDATE,
+ * or MULTI_INSERT, we can (and do) restore the entire page in redo.
+ */
+#define XLOG_HEAP_INIT_PAGE 0x80
+/*
+ * We ran out of opcodes, so heapam.c now has a second RmgrId. These opcodes
+ * are associated with RM_HEAP2_ID, but are not logically different from
+ * the ones above associated with RM_HEAP_ID. XLOG_HEAP_OPMASK applies to
+ * these, too.
+ */
+#define XLOG_HEAP2_REWRITE 0x00
+#define XLOG_HEAP2_PRUNE 0x10
+#define XLOG_HEAP2_VACUUM 0x20
+#define XLOG_HEAP2_FREEZE_PAGE 0x30
+#define XLOG_HEAP2_VISIBLE 0x40
+#define XLOG_HEAP2_MULTI_INSERT 0x50
+#define XLOG_HEAP2_LOCK_UPDATED 0x60
+#define XLOG_HEAP2_NEW_CID 0x70
+
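+/*
+ * Illustrative sketch only (the real dispatch lives in heap_redo and
+ * heap2_redo, declared near the end of this header): a redo routine
+ * separates the opcode from the init bit roughly like this, where "record"
+ * is the XLogReaderState being replayed.
+ *
+ *     uint8  info = XLogRecGetInfo(record);
+ *     bool   init_page = (info & XLOG_HEAP_INIT_PAGE) != 0;
+ *
+ *     switch (info & XLOG_HEAP_OPMASK)
+ *     {
+ *         case XLOG_HEAP_INSERT:
+ *             // replay the insert, reinitializing the page if init_page
+ *             break;
+ *         case XLOG_HEAP_DELETE:
+ *             // replay the delete
+ *             break;
+ *         // ... and so on for the remaining opcodes ...
+ *     }
+ */
+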
+/*
+ * xl_heap_insert/xl_heap_multi_insert flag values, 8 bits are available.
+ */
+/* PD_ALL_VISIBLE was cleared */
+#define XLH_INSERT_ALL_VISIBLE_CLEARED (1<<0)
+#define XLH_INSERT_LAST_IN_MULTI (1<<1)
+#define XLH_INSERT_IS_SPECULATIVE (1<<2)
+#define XLH_INSERT_CONTAINS_NEW_TUPLE (1<<3)
+#define XLH_INSERT_ON_TOAST_RELATION (1<<4)
+
+/* all_frozen_set always implies all_visible_set */
+#define XLH_INSERT_ALL_FROZEN_SET (1<<5)
+
+/*
+ * xl_heap_update flag values, 8 bits are available.
+ */
+/* PD_ALL_VISIBLE was cleared */
+#define XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED (1<<0)
+/* PD_ALL_VISIBLE was cleared in the 2nd page */
+#define XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED (1<<1)
+#define XLH_UPDATE_CONTAINS_OLD_TUPLE (1<<2)
+#define XLH_UPDATE_CONTAINS_OLD_KEY (1<<3)
+#define XLH_UPDATE_CONTAINS_NEW_TUPLE (1<<4)
+#define XLH_UPDATE_PREFIX_FROM_OLD (1<<5)
+#define XLH_UPDATE_SUFFIX_FROM_OLD (1<<6)
+
+/* convenience macro for checking whether any form of old tuple was logged */
+#define XLH_UPDATE_CONTAINS_OLD \
+ (XLH_UPDATE_CONTAINS_OLD_TUPLE | XLH_UPDATE_CONTAINS_OLD_KEY)
+
+/*
+ * xl_heap_delete flag values, 8 bits are available.
+ */
+/* PD_ALL_VISIBLE was cleared */
+#define XLH_DELETE_ALL_VISIBLE_CLEARED (1<<0)
+#define XLH_DELETE_CONTAINS_OLD_TUPLE (1<<1)
+#define XLH_DELETE_CONTAINS_OLD_KEY (1<<2)
+#define XLH_DELETE_IS_SUPER (1<<3)
+#define XLH_DELETE_IS_PARTITION_MOVE (1<<4)
+
+/* convenience macro for checking whether any form of old tuple was logged */
+#define XLH_DELETE_CONTAINS_OLD \
+ (XLH_DELETE_CONTAINS_OLD_TUPLE | XLH_DELETE_CONTAINS_OLD_KEY)
+
+/* This is what we need to know about delete */
+typedef struct xl_heap_delete
+{
+ TransactionId xmax; /* xmax of the deleted tuple */
+ OffsetNumber offnum; /* deleted tuple's offset */
+ uint8 infobits_set; /* infomask bits */
+ uint8 flags;
+} xl_heap_delete;
+
+#define SizeOfHeapDelete (offsetof(xl_heap_delete, flags) + sizeof(uint8))
+
+/*
+ * xl_heap_truncate flag values, 8 bits are available.
+ */
+#define XLH_TRUNCATE_CASCADE (1<<0)
+#define XLH_TRUNCATE_RESTART_SEQS (1<<1)
+
+/*
+ * For truncate we list all truncated relids in an array, followed by all
+ * sequence relids that need to be restarted, if any.
+ * All rels are always within the same database, so we just list dbid once.
+ */
+typedef struct xl_heap_truncate
+{
+ Oid dbId;
+ uint32 nrelids;
+ uint8 flags;
+ Oid relids[FLEXIBLE_ARRAY_MEMBER];
+} xl_heap_truncate;
+
+#define SizeOfHeapTruncate (offsetof(xl_heap_truncate, relids))
+
+/*
+ * We don't store the whole fixed part (HeapTupleHeaderData) of an inserted
+ * or updated tuple in WAL; we can save a few bytes by reconstructing the
+ * fields that are available elsewhere in the WAL record, or perhaps just
+ * plain needn't be reconstructed. These are the fields we must store.
+ */
+typedef struct xl_heap_header
+{
+ uint16 t_infomask2;
+ uint16 t_infomask;
+ uint8 t_hoff;
+} xl_heap_header;
+
+#define SizeOfHeapHeader (offsetof(xl_heap_header, t_hoff) + sizeof(uint8))
+
+/* This is what we need to know about insert */
+typedef struct xl_heap_insert
+{
+ OffsetNumber offnum; /* inserted tuple's offset */
+ uint8 flags;
+
+ /* xl_heap_header & TUPLE DATA in backup block 0 */
+} xl_heap_insert;
+
+#define SizeOfHeapInsert (offsetof(xl_heap_insert, flags) + sizeof(uint8))
+
+/*
+ * This is what we need to know about a multi-insert.
+ *
+ * The main data of the record consists of this xl_heap_multi_insert header.
+ * 'offsets' array is omitted if the whole page is reinitialized
+ * (XLOG_HEAP_INIT_PAGE).
+ *
+ * In block 0's data portion, there is an xl_multi_insert_tuple struct,
+ * followed by the tuple data for each tuple. There is padding to align
+ * each xl_multi_insert_tuple struct.
+ */
+typedef struct xl_heap_multi_insert
+{
+ uint8 flags;
+ uint16 ntuples;
+ OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER];
+} xl_heap_multi_insert;
+
+#define SizeOfHeapMultiInsert offsetof(xl_heap_multi_insert, offsets)
+
+typedef struct xl_multi_insert_tuple
+{
+ uint16 datalen; /* size of tuple data that follows */
+ uint16 t_infomask2;
+ uint16 t_infomask;
+ uint8 t_hoff;
+ /* TUPLE DATA FOLLOWS AT END OF STRUCT */
+} xl_multi_insert_tuple;
+
+#define SizeOfMultiInsertTuple (offsetof(xl_multi_insert_tuple, t_hoff) + sizeof(uint8))
+
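+/*
+ * For illustration only (a sketch, not the actual replay code): walking the
+ * per-tuple entries in block 0's data looks roughly like this, assuming the
+ * padding mentioned above is the usual SHORTALIGN alignment and "data"
+ * points at the start of the block data.
+ *
+ *     for (i = 0; i < xlrec->ntuples; i++)
+ *     {
+ *         xl_multi_insert_tuple *tuphdr;
+ *
+ *         tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(data);
+ *         data = (char *) tuphdr + SizeOfMultiInsertTuple;
+ *         // tuphdr->datalen bytes of tuple data follow at "data"
+ *         data += tuphdr->datalen;
+ *     }
+ */
+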
+/*
+ * This is what we need to know about update|hot_update
+ *
+ * Backup blk 0: new page
+ *
+ * If XLH_UPDATE_PREFIX_FROM_OLD or XLH_UPDATE_SUFFIX_FROM_OLD flags are set,
+ * the prefix and/or suffix come first, as one or two uint16s.
+ *
+ * After that, xl_heap_header and new tuple data follow. The new tuple
+ * data doesn't include the prefix and suffix, which are copied from the
+ * old tuple on replay.
+ *
+ * If XLH_UPDATE_CONTAINS_NEW_TUPLE flag is given, the tuple data is
+ * included even if a full-page image was taken.
+ *
+ * Backup blk 1: old page, if different. (no data, just a reference to the blk)
+ */
+typedef struct xl_heap_update
+{
+ TransactionId old_xmax; /* xmax of the old tuple */
+ OffsetNumber old_offnum; /* old tuple's offset */
+ uint8 old_infobits_set; /* infomask bits to set on old tuple */
+ uint8 flags;
+ TransactionId new_xmax; /* xmax of the new tuple */
+ OffsetNumber new_offnum; /* new tuple's offset */
+
+ /*
+ * If XLH_UPDATE_CONTAINS_OLD_TUPLE or XLH_UPDATE_CONTAINS_OLD_KEY flags
+ * are set, xl_heap_header and tuple data for the old tuple follow.
+ */
+} xl_heap_update;
+
+#define SizeOfHeapUpdate (offsetof(xl_heap_update, new_offnum) + sizeof(OffsetNumber))
+
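+/*
+ * Sketch (not the actual replay code) of how redo consumes the optional
+ * prefix/suffix lengths described above, with "recdata" pointing at the
+ * start of block 0's data:
+ *
+ *     uint16  prefixlen = 0,
+ *             suffixlen = 0;
+ *
+ *     if (xlrec->flags & XLH_UPDATE_PREFIX_FROM_OLD)
+ *     {
+ *         memcpy(&prefixlen, recdata, sizeof(uint16));
+ *         recdata += sizeof(uint16);
+ *     }
+ *     if (xlrec->flags & XLH_UPDATE_SUFFIX_FROM_OLD)
+ *     {
+ *         memcpy(&suffixlen, recdata, sizeof(uint16));
+ *         recdata += sizeof(uint16);
+ *     }
+ *     // xl_heap_header and the new tuple data (minus prefix/suffix) follow
+ */
+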
+/*
+ * This is what we need to know about page pruning (both during VACUUM and
+ * during opportunistic pruning)
+ *
+ * The array of OffsetNumbers following the fixed part of the record contains:
+ * * for each redirected item: the item offset, then the offset redirected to
+ * * for each now-dead item: the item offset
+ * * for each now-unused item: the item offset
+ * The total number of OffsetNumbers is therefore 2*nredirected+ndead+nunused.
+ * Note that nunused is not explicitly stored, but may be found by reference
+ * to the total record length.
+ *
+ * Requires a super-exclusive lock.
+ */
+typedef struct xl_heap_prune
+{
+ TransactionId latestRemovedXid;
+ uint16 nredirected;
+ uint16 ndead;
+ /* OFFSET NUMBERS are in the block reference 0 */
+} xl_heap_prune;
+
+#define SizeOfHeapPrune (offsetof(xl_heap_prune, ndead) + sizeof(uint16))
+
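+/*
+ * A sketch (not the actual code) of how the redo side splits up block 0's
+ * offset array, with "record" being the XLogReaderState:
+ *
+ *     Size          datalen;
+ *     OffsetNumber *redirected = (OffsetNumber *)
+ *         XLogRecGetBlockData(record, 0, &datalen);
+ *     OffsetNumber *nowdead = redirected + 2 * xlrec->nredirected;
+ *     OffsetNumber *nowunused = nowdead + xlrec->ndead;
+ *     int           nunused = (datalen / sizeof(OffsetNumber)) -
+ *         (2 * xlrec->nredirected + xlrec->ndead);
+ */
+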
+/*
+ * The vacuum page record is similar to the prune record, but can only mark
+ * already dead items as unused
+ *
+ * Used by heap vacuuming only. Does not require a super-exclusive lock.
+ */
+typedef struct xl_heap_vacuum
+{
+ uint16 nunused;
+ /* OFFSET NUMBERS are in the block reference 0 */
+} xl_heap_vacuum;
+
+#define SizeOfHeapVacuum (offsetof(xl_heap_vacuum, nunused) + sizeof(uint16))
+
+/* flags for infobits_set */
+#define XLHL_XMAX_IS_MULTI 0x01
+#define XLHL_XMAX_LOCK_ONLY 0x02
+#define XLHL_XMAX_EXCL_LOCK 0x04
+#define XLHL_XMAX_KEYSHR_LOCK 0x08
+#define XLHL_KEYS_UPDATED 0x10
+
+/* flag bits for xl_heap_lock / xl_heap_lock_updated's flag field */
+#define XLH_LOCK_ALL_FROZEN_CLEARED 0x01
+
+/* This is what we need to know about lock */
+typedef struct xl_heap_lock
+{
+ TransactionId locking_xid; /* might be a MultiXactId not xid */
+ OffsetNumber offnum; /* locked tuple's offset on page */
+ int8 infobits_set; /* infomask and infomask2 bits to set */
+ uint8 flags; /* XLH_LOCK_* flag bits */
+} xl_heap_lock;
+
+#define SizeOfHeapLock (offsetof(xl_heap_lock, flags) + sizeof(int8))
+
+/* This is what we need to know about locking an updated version of a row */
+typedef struct xl_heap_lock_updated
+{
+ TransactionId xmax;
+ OffsetNumber offnum;
+ uint8 infobits_set;
+ uint8 flags;
+} xl_heap_lock_updated;
+
+#define SizeOfHeapLockUpdated (offsetof(xl_heap_lock_updated, flags) + sizeof(uint8))
+
+/* This is what we need to know about confirmation of speculative insertion */
+typedef struct xl_heap_confirm
+{
+ OffsetNumber offnum; /* confirmed tuple's offset on page */
+} xl_heap_confirm;
+
+#define SizeOfHeapConfirm (offsetof(xl_heap_confirm, offnum) + sizeof(OffsetNumber))
+
+/* This is what we need to know about in-place update */
+typedef struct xl_heap_inplace
+{
+ OffsetNumber offnum; /* updated tuple's offset on page */
+ /* TUPLE DATA FOLLOWS AT END OF STRUCT */
+} xl_heap_inplace;
+
+#define SizeOfHeapInplace (offsetof(xl_heap_inplace, offnum) + sizeof(OffsetNumber))
+
+/*
+ * This struct represents a 'freeze plan', which is what we need to know about
+ * a single tuple being frozen during vacuum.
+ */
+/* 0x01 was XLH_FREEZE_XMIN */
+#define XLH_FREEZE_XVAC 0x02
+#define XLH_INVALID_XVAC 0x04
+
+typedef struct xl_heap_freeze_tuple
+{
+ TransactionId xmax;
+ OffsetNumber offset;
+ uint16 t_infomask2;
+ uint16 t_infomask;
+ uint8 frzflags;
+} xl_heap_freeze_tuple;
+
+/*
+ * This is what we need to know about a block being frozen during vacuum
+ *
+ * Backup block 0's data contains an array of xl_heap_freeze_tuple structs,
+ * one for each tuple.
+ */
+typedef struct xl_heap_freeze_page
+{
+ TransactionId cutoff_xid;
+ uint16 ntuples;
+} xl_heap_freeze_page;
+
+#define SizeOfHeapFreezePage (offsetof(xl_heap_freeze_page, ntuples) + sizeof(uint16))
+
+/*
+ * This is what we need to know about setting a visibility map bit
+ *
+ * Backup blk 0: visibility map buffer
+ * Backup blk 1: heap buffer
+ */
+typedef struct xl_heap_visible
+{
+ TransactionId cutoff_xid;
+ uint8 flags;
+} xl_heap_visible;
+
+#define SizeOfHeapVisible (offsetof(xl_heap_visible, flags) + sizeof(uint8))
+
+typedef struct xl_heap_new_cid
+{
+ /*
+ * store toplevel xid so we don't have to merge cids from different
+ * transactions
+ */
+ TransactionId top_xid;
+ CommandId cmin;
+ CommandId cmax;
+ CommandId combocid; /* just for debugging */
+
+ /*
+ * Store the relfilenode/ctid pair to facilitate lookups.
+ */
+ RelFileNode target_node;
+ ItemPointerData target_tid;
+} xl_heap_new_cid;
+
+#define SizeOfHeapNewCid (offsetof(xl_heap_new_cid, target_tid) + sizeof(ItemPointerData))
+
+/* logical rewrite xlog record header */
+typedef struct xl_heap_rewrite_mapping
+{
+ TransactionId mapped_xid; /* xid that might need to see the row */
+ Oid mapped_db; /* DbOid or InvalidOid for shared rels */
+ Oid mapped_rel; /* Oid of the mapped relation */
+ off_t offset; /* How far have we written so far */
+ uint32 num_mappings; /* Number of in-memory mappings */
+ XLogRecPtr start_lsn; /* Insert LSN at begin of rewrite */
+} xl_heap_rewrite_mapping;
+
+extern void HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
+ TransactionId *latestRemovedXid);
+
+extern void heap_redo(XLogReaderState *record);
+extern void heap_desc(StringInfo buf, XLogReaderState *record);
+extern const char *heap_identify(uint8 info);
+extern void heap_mask(char *pagedata, BlockNumber blkno);
+extern void heap2_redo(XLogReaderState *record);
+extern void heap2_desc(StringInfo buf, XLogReaderState *record);
+extern const char *heap2_identify(uint8 info);
+extern void heap_xlog_logical_rewrite(XLogReaderState *r);
+
+extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer,
+ TransactionId cutoff_xid, xl_heap_freeze_tuple *tuples,
+ int ntuples);
+extern bool heap_prepare_freeze_tuple(HeapTupleHeader tuple,
+ TransactionId relfrozenxid,
+ TransactionId relminmxid,
+ TransactionId cutoff_xid,
+ TransactionId cutoff_multi,
+ xl_heap_freeze_tuple *frz,
+ bool *totally_frozen);
+extern void heap_execute_freeze_tuple(HeapTupleHeader tuple,
+ xl_heap_freeze_tuple *xlrec_tp);
+extern XLogRecPtr log_heap_visible(RelFileNode rnode, Buffer heap_buffer,
+ Buffer vm_buffer, TransactionId cutoff_xid, uint8 flags);
+
+#endif /* HEAPAM_XLOG_H */
diff --git a/src/include/access/heaptoast.h b/src/include/access/heaptoast.h
new file mode 100644
index 0000000..8b29f1a
--- /dev/null
+++ b/src/include/access/heaptoast.h
@@ -0,0 +1,149 @@
+/*-------------------------------------------------------------------------
+ *
+ * heaptoast.h
+ * Heap-specific definitions for external and compressed storage
+ * of variable size attributes.
+ *
+ * Copyright (c) 2000-2021, PostgreSQL Global Development Group
+ *
+ * src/include/access/heaptoast.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef HEAPTOAST_H
+#define HEAPTOAST_H
+
+#include "access/htup_details.h"
+#include "storage/lockdefs.h"
+#include "utils/relcache.h"
+
+/*
+ * Find the maximum size of a tuple if there are to be N tuples per page.
+ */
+#define MaximumBytesPerTuple(tuplesPerPage) \
+ MAXALIGN_DOWN((BLCKSZ - \
+ MAXALIGN(SizeOfPageHeaderData + (tuplesPerPage) * sizeof(ItemIdData))) \
+ / (tuplesPerPage))
+
+/*
+ * These symbols control toaster activation. If a tuple is larger than
+ * TOAST_TUPLE_THRESHOLD, we will try to toast it down to no more than
+ * TOAST_TUPLE_TARGET bytes through compressing compressible fields and
+ * moving EXTENDED and EXTERNAL data out-of-line.
+ *
+ * The numbers need not be the same, though they currently are. It doesn't
+ * make sense for TARGET to exceed THRESHOLD, but it could be useful to make
+ * it smaller.
+ *
+ * Currently we choose both values to match the largest tuple size for which
+ * TOAST_TUPLES_PER_PAGE tuples can fit on a heap page.
+ *
+ * XXX while these can be modified without initdb, some thought needs to be
+ * given to needs_toast_table() in toasting.c before unleashing random
+ * changes. Also see LOBLKSIZE in large_object.h, which can *not* be
+ * changed without initdb.
+ */
+#define TOAST_TUPLES_PER_PAGE 4
+
+#define TOAST_TUPLE_THRESHOLD MaximumBytesPerTuple(TOAST_TUPLES_PER_PAGE)
+
+#define TOAST_TUPLE_TARGET TOAST_TUPLE_THRESHOLD
+
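+/*
+ * For illustration, the check that heap_insert()/heap_update() make before
+ * invoking the toaster has roughly this shape (a sketch, not the exact
+ * code; oldtup is NULL for inserts):
+ *
+ *     if (HeapTupleHasExternal(newtup) ||
+ *         newtup->t_len > TOAST_TUPLE_THRESHOLD)
+ *         newtup = heap_toast_insert_or_update(rel, newtup, oldtup, options);
+ */
+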
+/*
+ * The code will also consider moving MAIN data out-of-line, but only as a
+ * last resort if the previous steps haven't reached the target tuple size.
+ * In this phase we use a different target size, currently equal to the
+ * largest tuple that will fit on a heap page. This is reasonable since
+ * the user has told us to keep the data in-line if at all possible.
+ */
+#define TOAST_TUPLES_PER_PAGE_MAIN 1
+
+#define TOAST_TUPLE_TARGET_MAIN MaximumBytesPerTuple(TOAST_TUPLES_PER_PAGE_MAIN)
+
+/*
+ * If an index value is larger than TOAST_INDEX_TARGET, we will try to
+ * compress it (we can't move it out-of-line, however). Note that this
+ * number is per-datum, not per-tuple, for simplicity in index_form_tuple().
+ */
+#define TOAST_INDEX_TARGET (MaxHeapTupleSize / 16)
+
+/*
+ * When we store an oversize datum externally, we divide it into chunks
+ * containing at most TOAST_MAX_CHUNK_SIZE data bytes. This number *must*
+ * be small enough that the completed toast-table tuple (including the
+ * ID and sequence fields and all overhead) will fit on a page.
+ * The coding here sets the size on the theory that we want to fit
+ * EXTERN_TUPLES_PER_PAGE tuples of maximum size onto a page.
+ *
+ * NB: Changing TOAST_MAX_CHUNK_SIZE requires an initdb.
+ */
+#define EXTERN_TUPLES_PER_PAGE 4 /* tweak only this */
+
+#define EXTERN_TUPLE_MAX_SIZE MaximumBytesPerTuple(EXTERN_TUPLES_PER_PAGE)
+
+#define TOAST_MAX_CHUNK_SIZE \
+ (EXTERN_TUPLE_MAX_SIZE - \
+ MAXALIGN(SizeofHeapTupleHeader) - \
+ sizeof(Oid) - \
+ sizeof(int32) - \
+ VARHDRSZ)
+
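+/*
+ * As a small example of the resulting arithmetic (illustrative only): an
+ * out-of-line datum carrying data_size bytes of payload is split into
+ *
+ *     int32  nchunks = (data_size + TOAST_MAX_CHUNK_SIZE - 1) / TOAST_MAX_CHUNK_SIZE;
+ *
+ * chunks, all of size TOAST_MAX_CHUNK_SIZE except possibly the last.
+ */
+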
+/* ----------
+ * heap_toast_insert_or_update -
+ *
+ * Called by heap_insert() and heap_update().
+ * ----------
+ */
+extern HeapTuple heap_toast_insert_or_update(Relation rel, HeapTuple newtup,
+ HeapTuple oldtup, int options);
+
+/* ----------
+ * heap_toast_delete -
+ *
+ * Called by heap_delete().
+ * ----------
+ */
+extern void heap_toast_delete(Relation rel, HeapTuple oldtup,
+ bool is_speculative);
+
+/* ----------
+ * toast_flatten_tuple -
+ *
+ * "Flatten" a tuple to contain no out-of-line toasted fields.
+ * (This does not eliminate compressed or short-header datums.)
+ * ----------
+ */
+extern HeapTuple toast_flatten_tuple(HeapTuple tup, TupleDesc tupleDesc);
+
+/* ----------
+ * toast_flatten_tuple_to_datum -
+ *
+ * "Flatten" a tuple containing out-of-line toasted fields into a Datum.
+ * ----------
+ */
+extern Datum toast_flatten_tuple_to_datum(HeapTupleHeader tup,
+ uint32 tup_len,
+ TupleDesc tupleDesc);
+
+/* ----------
+ * toast_build_flattened_tuple -
+ *
+ * Build a tuple containing no out-of-line toasted fields.
+ * (This does not eliminate compressed or short-header datums.)
+ * ----------
+ */
+extern HeapTuple toast_build_flattened_tuple(TupleDesc tupleDesc,
+ Datum *values,
+ bool *isnull);
+
+/* ----------
+ * heap_fetch_toast_slice
+ *
+ * Fetch a slice from a toast value stored in a heap table.
+ * ----------
+ */
+extern void heap_fetch_toast_slice(Relation toastrel, Oid valueid,
+ int32 attrsize, int32 sliceoffset,
+ int32 slicelength, struct varlena *result);
+
+#endif /* HEAPTOAST_H */
diff --git a/src/include/access/hio.h b/src/include/access/hio.h
new file mode 100644
index 0000000..1d61128
--- /dev/null
+++ b/src/include/access/hio.h
@@ -0,0 +1,43 @@
+/*-------------------------------------------------------------------------
+ *
+ * hio.h
+ * POSTGRES heap access method input/output definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/hio.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef HIO_H
+#define HIO_H
+
+#include "access/htup.h"
+#include "storage/buf.h"
+#include "utils/relcache.h"
+
+/*
+ * state for bulk inserts --- private to heapam.c and hio.c
+ *
+ * If current_buf isn't InvalidBuffer, then we are holding an extra pin
+ * on that buffer.
+ *
+ * "typedef struct BulkInsertStateData *BulkInsertState" is in heapam.h
+ */
+typedef struct BulkInsertStateData
+{
+ BufferAccessStrategy strategy; /* our BULKWRITE strategy object */
+ Buffer current_buf; /* current insertion target page */
+} BulkInsertStateData;
+
+
+extern void RelationPutHeapTuple(Relation relation, Buffer buffer,
+ HeapTuple tuple, bool token);
+extern Buffer RelationGetBufferForTuple(Relation relation, Size len,
+ Buffer otherBuffer, int options,
+ BulkInsertStateData *bistate,
+ Buffer *vmbuffer, Buffer *vmbuffer_other);
+
+#endif /* HIO_H */
diff --git a/src/include/access/htup.h b/src/include/access/htup.h
new file mode 100644
index 0000000..cf0bbd7
--- /dev/null
+++ b/src/include/access/htup.h
@@ -0,0 +1,89 @@
+/*-------------------------------------------------------------------------
+ *
+ * htup.h
+ * POSTGRES heap tuple definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/htup.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef HTUP_H
+#define HTUP_H
+
+#include "storage/itemptr.h"
+
+/* typedefs and forward declarations for structs defined in htup_details.h */
+
+typedef struct HeapTupleHeaderData HeapTupleHeaderData;
+
+typedef HeapTupleHeaderData *HeapTupleHeader;
+
+typedef struct MinimalTupleData MinimalTupleData;
+
+typedef MinimalTupleData *MinimalTuple;
+
+
+/*
+ * HeapTupleData is an in-memory data structure that points to a tuple.
+ *
+ * There are several ways in which this data structure is used:
+ *
+ * * Pointer to a tuple in a disk buffer: t_data points directly into the
+ * buffer (which the code had better be holding a pin on, but this is not
+ * reflected in HeapTupleData itself).
+ *
+ * * Pointer to nothing: t_data is NULL. This is used as a failure indication
+ * in some functions.
+ *
+ * * Part of a palloc'd tuple: the HeapTupleData itself and the tuple
+ * form a single palloc'd chunk. t_data points to the memory location
+ * immediately following the HeapTupleData struct (at offset HEAPTUPLESIZE).
+ * This is the output format of heap_form_tuple and related routines.
+ *
+ * * Separately allocated tuple: t_data points to a palloc'd chunk that
+ * is not adjacent to the HeapTupleData. (This case is deprecated since
+ * it's difficult to tell apart from case #1. It should be used only in
+ * limited contexts where the code knows that case #1 will never apply.)
+ *
+ * * Separately allocated minimal tuple: t_data points MINIMAL_TUPLE_OFFSET
+ * bytes before the start of a MinimalTuple. As with the previous case,
+ * this can't be told apart from case #1 by inspection; code setting up
+ * or destroying this representation has to know what it's doing.
+ *
+ * t_len should always be valid, except in the pointer-to-nothing case.
+ * t_self and t_tableOid should be valid if the HeapTupleData points to
+ * a disk buffer, or if it represents a copy of a tuple on disk. They
+ * should be explicitly set invalid in manufactured tuples.
+ */
+typedef struct HeapTupleData
+{
+ uint32 t_len; /* length of *t_data */
+ ItemPointerData t_self; /* SelfItemPointer */
+ Oid t_tableOid; /* table the tuple came from */
+#define FIELDNO_HEAPTUPLEDATA_DATA 3
+ HeapTupleHeader t_data; /* -> tuple header and data */
+} HeapTupleData;
+
+typedef HeapTupleData *HeapTuple;
+
+#define HEAPTUPLESIZE MAXALIGN(sizeof(HeapTupleData))
+
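+/*
+ * Example of the "part of a palloc'd tuple" case described above (a sketch;
+ * heap_form_tuple and heap_copytuple, declared in htup_details.h, produce
+ * this layout for you):
+ *
+ *     HeapTuple  tuple = (HeapTuple) palloc(HEAPTUPLESIZE + data_len);
+ *
+ *     tuple->t_len = data_len;
+ *     tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
+ *     // caller then fills in t_self, t_tableOid and the tuple body
+ */
+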
+/*
+ * Accessor macros to be used with HeapTuple pointers.
+ */
+#define HeapTupleIsValid(tuple) PointerIsValid(tuple)
+
+/* HeapTupleHeader functions implemented in utils/time/combocid.c */
+extern CommandId HeapTupleHeaderGetCmin(HeapTupleHeader tup);
+extern CommandId HeapTupleHeaderGetCmax(HeapTupleHeader tup);
+extern void HeapTupleHeaderAdjustCmax(HeapTupleHeader tup,
+ CommandId *cmax, bool *iscombo);
+
+/* Prototype for HeapTupleHeader accessors in heapam.c */
+extern TransactionId HeapTupleGetUpdateXid(HeapTupleHeader tuple);
+
+#endif /* HTUP_H */
diff --git a/src/include/access/htup_details.h b/src/include/access/htup_details.h
new file mode 100644
index 0000000..960772f
--- /dev/null
+++ b/src/include/access/htup_details.h
@@ -0,0 +1,818 @@
+/*-------------------------------------------------------------------------
+ *
+ * htup_details.h
+ * POSTGRES heap tuple header definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/htup_details.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef HTUP_DETAILS_H
+#define HTUP_DETAILS_H
+
+#include "access/htup.h"
+#include "access/transam.h"
+#include "access/tupdesc.h"
+#include "access/tupmacs.h"
+#include "storage/bufpage.h"
+
+/*
+ * MaxTupleAttributeNumber limits the number of (user) columns in a tuple.
+ * The key limit on this value is that the size of the fixed overhead for
+ * a tuple, plus the size of the null-values bitmap (at 1 bit per column),
+ * plus MAXALIGN alignment, must fit into t_hoff which is uint8. On most
+ * machines the upper limit without making t_hoff wider would be a little
+ * over 1700. We use round numbers here and for MaxHeapAttributeNumber
+ * so that alterations in HeapTupleHeaderData layout won't change the
+ * supported max number of columns.
+ */
+#define MaxTupleAttributeNumber 1664 /* 8 * 208 */
+
+/*
+ * MaxHeapAttributeNumber limits the number of (user) columns in a table.
+ * This should be somewhat less than MaxTupleAttributeNumber. It must be
+ * at least one less, else we will fail to do UPDATEs on a maximal-width
+ * table (because UPDATE has to form working tuples that include CTID).
+ * In practice we want some additional daylight so that we can gracefully
+ * support operations that add hidden "resjunk" columns, for example
+ * SELECT * FROM wide_table ORDER BY foo, bar, baz.
+ * In any case, depending on column data types you will likely be running
+ * into the disk-block-based limit on overall tuple size if you have more
+ * than a thousand or so columns. TOAST won't help.
+ */
+#define MaxHeapAttributeNumber 1600 /* 8 * 200 */
+
+/*
+ * Heap tuple header. To avoid wasting space, the fields should be
+ * laid out in such a way as to avoid structure padding.
+ *
+ * Datums of composite types (row types) share the same general structure
+ * as on-disk tuples, so that the same routines can be used to build and
+ * examine them. However the requirements are slightly different: a Datum
+ * does not need any transaction visibility information, and it does need
+ * a length word and some embedded type information. We can achieve this
+ * by overlaying the xmin/cmin/xmax/cmax/xvac fields of a heap tuple
+ * with the fields needed in the Datum case. Typically, all tuples built
+ * in-memory will be initialized with the Datum fields; but when a tuple is
+ * about to be inserted in a table, the transaction fields will be filled,
+ * overwriting the datum fields.
+ *
+ * The overall structure of a heap tuple looks like:
+ * fixed fields (HeapTupleHeaderData struct)
+ * nulls bitmap (if HEAP_HASNULL is set in t_infomask)
+ * alignment padding (as needed to make user data MAXALIGN'd)
+ * object ID (if HEAP_HASOID_OLD is set in t_infomask, not created
+ * anymore)
+ * user data fields
+ *
+ * We store five "virtual" fields Xmin, Cmin, Xmax, Cmax, and Xvac in three
+ * physical fields. Xmin and Xmax are always really stored, but Cmin, Cmax
+ * and Xvac share a field. This works because we know that Cmin and Cmax
+ * are only interesting for the lifetime of the inserting and deleting
+ * transaction respectively. If a tuple is inserted and deleted in the same
+ * transaction, we store a "combo" command id that can be mapped to the real
+ * cmin and cmax, but only by use of local state within the originating
+ * backend. See combocid.c for more details. Meanwhile, Xvac is only set by
+ * old-style VACUUM FULL, which does not have any command sub-structure and so
+ * does not need either Cmin or Cmax. (This requires that old-style VACUUM
+ * FULL never try to move a tuple whose Cmin or Cmax is still interesting,
+ * ie, an insert-in-progress or delete-in-progress tuple.)
+ *
+ * A word about t_ctid: whenever a new tuple is stored on disk, its t_ctid
+ * is initialized with its own TID (location). If the tuple is ever updated,
+ * its t_ctid is changed to point to the replacement version of the tuple. Or
+ * if the tuple is moved from one partition to another, due to an update of
+ * the partition key, t_ctid is set to a special value to indicate that
+ * (see ItemPointerSetMovedPartitions). Thus, a tuple is the latest version
+ * of its row iff XMAX is invalid or
+ * t_ctid points to itself (in which case, if XMAX is valid, the tuple is
+ * either locked or deleted). One can follow the chain of t_ctid links
+ * to find the newest version of the row, unless it was moved to a different
+ * partition. Beware however that VACUUM might
+ * erase the pointed-to (newer) tuple before erasing the pointing (older)
+ * tuple. Hence, when following a t_ctid link, it is necessary to check
+ * to see if the referenced slot is empty or contains an unrelated tuple.
+ * Check that the referenced tuple has XMIN equal to the referencing tuple's
+ * XMAX to verify that it is actually the descendant version and not an
+ * unrelated tuple stored into a slot recently freed by VACUUM. If either
+ * check fails, one may assume that there is no live descendant version.
+ *
+ * t_ctid is sometimes used to store a speculative insertion token, instead
+ * of a real TID. A speculative token is set on a tuple that's being
+ * inserted, until the inserter is sure that it wants to go ahead with the
+ * insertion. Hence a token should only be seen on a tuple with an XMAX
+ * that's still in-progress, or invalid/aborted. The token is replaced with
+ * the tuple's real TID when the insertion is confirmed. One should never
+ * see a speculative insertion token while following a chain of t_ctid links,
+ * because they are not used on updates, only insertions.
+ *
+ * Following the fixed header fields, the nulls bitmap is stored (beginning
+ * at t_bits). The bitmap is *not* stored if t_infomask shows that there
+ * are no nulls in the tuple. If an OID field is present (as indicated by
+ * t_infomask), then it is stored just before the user data, which begins at
+ * the offset shown by t_hoff. Note that t_hoff must be a multiple of
+ * MAXALIGN.
+ */
+
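+/*
+ * Illustrative sketch of the t_ctid chain check described above (not a
+ * complete routine; "prior" and "next" are HeapTuples the caller has
+ * already fetched and pinned):
+ *
+ *     if (TransactionIdEquals(HeapTupleHeaderGetXmin(next->t_data),
+ *                             HeapTupleHeaderGetUpdateXid(prior->t_data)))
+ *     {
+ *         // next really is the descendant version of prior
+ *     }
+ *     else
+ *     {
+ *         // the slot was recycled; assume prior has no live descendant
+ *     }
+ */
+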
+typedef struct HeapTupleFields
+{
+ TransactionId t_xmin; /* inserting xact ID */
+ TransactionId t_xmax; /* deleting or locking xact ID */
+
+ union
+ {
+ CommandId t_cid; /* inserting or deleting command ID, or both */
+ TransactionId t_xvac; /* old-style VACUUM FULL xact ID */
+ } t_field3;
+} HeapTupleFields;
+
+typedef struct DatumTupleFields
+{
+ int32 datum_len_; /* varlena header (do not touch directly!) */
+
+ int32 datum_typmod; /* -1, or identifier of a record type */
+
+ Oid datum_typeid; /* composite type OID, or RECORDOID */
+
+ /*
+ * datum_typeid cannot be a domain over composite, only plain composite,
+ * even if the datum is meant as a value of a domain-over-composite type.
+ * This is in line with the general principle that CoerceToDomain does not
+ * change the physical representation of the base type value.
+ *
+ * Note: field ordering is chosen with thought that Oid might someday
+ * widen to 64 bits.
+ */
+} DatumTupleFields;
+
+struct HeapTupleHeaderData
+{
+ union
+ {
+ HeapTupleFields t_heap;
+ DatumTupleFields t_datum;
+ } t_choice;
+
+ ItemPointerData t_ctid; /* current TID of this or newer tuple (or a
+ * speculative insertion token) */
+
+ /* Fields below here must match MinimalTupleData! */
+
+#define FIELDNO_HEAPTUPLEHEADERDATA_INFOMASK2 2
+ uint16 t_infomask2; /* number of attributes + various flags */
+
+#define FIELDNO_HEAPTUPLEHEADERDATA_INFOMASK 3
+ uint16 t_infomask; /* various flag bits, see below */
+
+#define FIELDNO_HEAPTUPLEHEADERDATA_HOFF 4
+ uint8 t_hoff; /* sizeof header incl. bitmap, padding */
+
+ /* ^ - 23 bytes - ^ */
+
+#define FIELDNO_HEAPTUPLEHEADERDATA_BITS 5
+ bits8 t_bits[FLEXIBLE_ARRAY_MEMBER]; /* bitmap of NULLs */
+
+ /* MORE DATA FOLLOWS AT END OF STRUCT */
+};
+
+/* typedef appears in htup.h */
+
+#define SizeofHeapTupleHeader offsetof(HeapTupleHeaderData, t_bits)
+
+/*
+ * information stored in t_infomask:
+ */
+#define HEAP_HASNULL 0x0001 /* has null attribute(s) */
+#define HEAP_HASVARWIDTH 0x0002 /* has variable-width attribute(s) */
+#define HEAP_HASEXTERNAL 0x0004 /* has external stored attribute(s) */
+#define HEAP_HASOID_OLD 0x0008 /* has an object-id field */
+#define HEAP_XMAX_KEYSHR_LOCK 0x0010 /* xmax is a key-shared locker */
+#define HEAP_COMBOCID 0x0020 /* t_cid is a combo CID */
+#define HEAP_XMAX_EXCL_LOCK 0x0040 /* xmax is exclusive locker */
+#define HEAP_XMAX_LOCK_ONLY 0x0080 /* xmax, if valid, is only a locker */
+
+ /* xmax is a shared locker */
+#define HEAP_XMAX_SHR_LOCK (HEAP_XMAX_EXCL_LOCK | HEAP_XMAX_KEYSHR_LOCK)
+
+#define HEAP_LOCK_MASK (HEAP_XMAX_SHR_LOCK | HEAP_XMAX_EXCL_LOCK | \
+ HEAP_XMAX_KEYSHR_LOCK)
+#define HEAP_XMIN_COMMITTED 0x0100 /* t_xmin committed */
+#define HEAP_XMIN_INVALID 0x0200 /* t_xmin invalid/aborted */
+#define HEAP_XMIN_FROZEN (HEAP_XMIN_COMMITTED|HEAP_XMIN_INVALID)
+#define HEAP_XMAX_COMMITTED 0x0400 /* t_xmax committed */
+#define HEAP_XMAX_INVALID 0x0800 /* t_xmax invalid/aborted */
+#define HEAP_XMAX_IS_MULTI 0x1000 /* t_xmax is a MultiXactId */
+#define HEAP_UPDATED 0x2000 /* this is UPDATEd version of row */
+#define HEAP_MOVED_OFF 0x4000 /* moved to another place by pre-9.0
+ * VACUUM FULL; kept for binary
+ * upgrade support */
+#define HEAP_MOVED_IN 0x8000 /* moved from another place by pre-9.0
+ * VACUUM FULL; kept for binary
+ * upgrade support */
+#define HEAP_MOVED (HEAP_MOVED_OFF | HEAP_MOVED_IN)
+
+#define HEAP_XACT_MASK 0xFFF0 /* visibility-related bits */
+
+/*
+ * A tuple is only locked (i.e. not updated by its Xmax) if the
+ * HEAP_XMAX_LOCK_ONLY bit is set; or, for pg_upgrade's sake, if the Xmax is
+ * not a multi and the EXCL_LOCK bit is set.
+ *
+ * See also HeapTupleHeaderIsOnlyLocked, which also checks for a possible
+ * aborted updater transaction.
+ *
+ * Beware of multiple evaluations of the argument.
+ */
+#define HEAP_XMAX_IS_LOCKED_ONLY(infomask) \
+ (((infomask) & HEAP_XMAX_LOCK_ONLY) || \
+ (((infomask) & (HEAP_XMAX_IS_MULTI | HEAP_LOCK_MASK)) == HEAP_XMAX_EXCL_LOCK))
+
+/*
+ * A tuple that has HEAP_XMAX_IS_MULTI and HEAP_XMAX_LOCK_ONLY but neither of
+ * HEAP_XMAX_EXCL_LOCK and HEAP_XMAX_KEYSHR_LOCK must come from a tuple that was
+ * share-locked in 9.2 or earlier and then pg_upgrade'd.
+ *
+ * In 9.2 and prior, HEAP_XMAX_IS_MULTI was only set when there were multiple
+ * FOR SHARE lockers of that tuple. That set HEAP_XMAX_LOCK_ONLY (with a
+ * different name back then) but neither of HEAP_XMAX_EXCL_LOCK and
+ * HEAP_XMAX_KEYSHR_LOCK. That combination is no longer possible in 9.3 and
+ * up, so if we see that combination we know for certain that the tuple was
+ * locked in an earlier release; since all such lockers are gone (they cannot
+ * survive through pg_upgrade), such tuples can safely be considered not
+ * locked.
+ *
+ * We must not resolve such multixacts locally, because the result would be
+ * bogus, regardless of where they stand with respect to the current valid
+ * multixact range.
+ */
+#define HEAP_LOCKED_UPGRADED(infomask) \
+( \
+ ((infomask) & HEAP_XMAX_IS_MULTI) != 0 && \
+ ((infomask) & HEAP_XMAX_LOCK_ONLY) != 0 && \
+ (((infomask) & (HEAP_XMAX_EXCL_LOCK | HEAP_XMAX_KEYSHR_LOCK)) == 0) \
+)
+
+/*
+ * Use these to test whether a particular lock is applied to a tuple
+ */
+#define HEAP_XMAX_IS_SHR_LOCKED(infomask) \
+ (((infomask) & HEAP_LOCK_MASK) == HEAP_XMAX_SHR_LOCK)
+#define HEAP_XMAX_IS_EXCL_LOCKED(infomask) \
+ (((infomask) & HEAP_LOCK_MASK) == HEAP_XMAX_EXCL_LOCK)
+#define HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) \
+ (((infomask) & HEAP_LOCK_MASK) == HEAP_XMAX_KEYSHR_LOCK)
+
+/* turn these all off when Xmax is to change */
+#define HEAP_XMAX_BITS (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | \
+ HEAP_XMAX_IS_MULTI | HEAP_LOCK_MASK | HEAP_XMAX_LOCK_ONLY)
+
+/*
+ * information stored in t_infomask2:
+ */
+#define HEAP_NATTS_MASK 0x07FF /* 11 bits for number of attributes */
+/* bits 0x1800 are available */
+#define HEAP_KEYS_UPDATED 0x2000 /* tuple was updated and key cols
+ * modified, or tuple deleted */
+#define HEAP_HOT_UPDATED 0x4000 /* tuple was HOT-updated */
+#define HEAP_ONLY_TUPLE 0x8000 /* this is heap-only tuple */
+
+#define HEAP2_XACT_MASK 0xE000 /* visibility-related bits */
+
+/*
+ * HEAP_TUPLE_HAS_MATCH is a temporary flag used during hash joins. It is
+ * only used in tuples that are in the hash table, and those don't need
+ * any visibility information, so we can overlay it on a visibility flag
+ * instead of using up a dedicated bit.
+ */
+#define HEAP_TUPLE_HAS_MATCH HEAP_ONLY_TUPLE /* tuple has a join match */
+
+/*
+ * HeapTupleHeader accessor macros
+ *
+ * Note: beware of multiple evaluations of "tup" argument. But the Set
+ * macros evaluate their other argument only once.
+ */
+
+/*
+ * HeapTupleHeaderGetRawXmin returns the "raw" xmin field, which is the xid
+ * originally used to insert the tuple. However, the tuple might actually
+ * be frozen (via HeapTupleHeaderSetXminFrozen) in which case the tuple's xmin
+ * is visible to every snapshot. Prior to PostgreSQL 9.4, we actually changed
+ * the xmin to FrozenTransactionId, and that value may still be encountered
+ * on disk.
+ */
+#define HeapTupleHeaderGetRawXmin(tup) \
+( \
+ (tup)->t_choice.t_heap.t_xmin \
+)
+
+#define HeapTupleHeaderGetXmin(tup) \
+( \
+ HeapTupleHeaderXminFrozen(tup) ? \
+ FrozenTransactionId : HeapTupleHeaderGetRawXmin(tup) \
+)
+
+#define HeapTupleHeaderSetXmin(tup, xid) \
+( \
+ (tup)->t_choice.t_heap.t_xmin = (xid) \
+)
+
+#define HeapTupleHeaderXminCommitted(tup) \
+( \
+ ((tup)->t_infomask & HEAP_XMIN_COMMITTED) != 0 \
+)
+
+#define HeapTupleHeaderXminInvalid(tup) \
+( \
+ ((tup)->t_infomask & (HEAP_XMIN_COMMITTED|HEAP_XMIN_INVALID)) == \
+ HEAP_XMIN_INVALID \
+)
+
+#define HeapTupleHeaderXminFrozen(tup) \
+( \
+ ((tup)->t_infomask & (HEAP_XMIN_FROZEN)) == HEAP_XMIN_FROZEN \
+)
+
+#define HeapTupleHeaderSetXminCommitted(tup) \
+( \
+ AssertMacro(!HeapTupleHeaderXminInvalid(tup)), \
+ ((tup)->t_infomask |= HEAP_XMIN_COMMITTED) \
+)
+
+#define HeapTupleHeaderSetXminInvalid(tup) \
+( \
+ AssertMacro(!HeapTupleHeaderXminCommitted(tup)), \
+ ((tup)->t_infomask |= HEAP_XMIN_INVALID) \
+)
+
+#define HeapTupleHeaderSetXminFrozen(tup) \
+( \
+ AssertMacro(!HeapTupleHeaderXminInvalid(tup)), \
+ ((tup)->t_infomask |= HEAP_XMIN_FROZEN) \
+)
+
+/*
+ * HeapTupleHeaderGetRawXmax gets you the raw Xmax field. To find out the Xid
+ * that updated a tuple, you might need to resolve the MultiXactId if certain
+ * bits are set. HeapTupleHeaderGetUpdateXid checks those bits and takes care
+ * to resolve the MultiXactId if necessary. This might involve multixact I/O,
+ * so it should only be used if absolutely necessary.
+ */
+#define HeapTupleHeaderGetUpdateXid(tup) \
+( \
+ (!((tup)->t_infomask & HEAP_XMAX_INVALID) && \
+ ((tup)->t_infomask & HEAP_XMAX_IS_MULTI) && \
+ !((tup)->t_infomask & HEAP_XMAX_LOCK_ONLY)) ? \
+ HeapTupleGetUpdateXid(tup) \
+ : \
+ HeapTupleHeaderGetRawXmax(tup) \
+)
+
+#define HeapTupleHeaderGetRawXmax(tup) \
+( \
+ (tup)->t_choice.t_heap.t_xmax \
+)
+
+#define HeapTupleHeaderSetXmax(tup, xid) \
+( \
+ (tup)->t_choice.t_heap.t_xmax = (xid) \
+)
+
+/*
+ * HeapTupleHeaderGetRawCommandId will give you what's in the header whether
+ * it is useful or not. Most code should use HeapTupleHeaderGetCmin or
+ * HeapTupleHeaderGetCmax instead, but note that those Assert that you can
+ * get a legitimate result, ie you are in the originating transaction!
+ */
+#define HeapTupleHeaderGetRawCommandId(tup) \
+( \
+ (tup)->t_choice.t_heap.t_field3.t_cid \
+)
+
+/* SetCmin is reasonably simple since we never need a combo CID */
+#define HeapTupleHeaderSetCmin(tup, cid) \
+do { \
+ Assert(!((tup)->t_infomask & HEAP_MOVED)); \
+ (tup)->t_choice.t_heap.t_field3.t_cid = (cid); \
+ (tup)->t_infomask &= ~HEAP_COMBOCID; \
+} while (0)
+
+/* SetCmax must be used after HeapTupleHeaderAdjustCmax; see combocid.c */
+#define HeapTupleHeaderSetCmax(tup, cid, iscombo) \
+do { \
+ Assert(!((tup)->t_infomask & HEAP_MOVED)); \
+ (tup)->t_choice.t_heap.t_field3.t_cid = (cid); \
+ if (iscombo) \
+ (tup)->t_infomask |= HEAP_COMBOCID; \
+ else \
+ (tup)->t_infomask &= ~HEAP_COMBOCID; \
+} while (0)
+
+#define HeapTupleHeaderGetXvac(tup) \
+( \
+ ((tup)->t_infomask & HEAP_MOVED) ? \
+ (tup)->t_choice.t_heap.t_field3.t_xvac \
+ : \
+ InvalidTransactionId \
+)
+
+#define HeapTupleHeaderSetXvac(tup, xid) \
+do { \
+ Assert((tup)->t_infomask & HEAP_MOVED); \
+ (tup)->t_choice.t_heap.t_field3.t_xvac = (xid); \
+} while (0)
+
+#define HeapTupleHeaderIsSpeculative(tup) \
+( \
+ (ItemPointerGetOffsetNumberNoCheck(&(tup)->t_ctid) == SpecTokenOffsetNumber) \
+)
+
+#define HeapTupleHeaderGetSpeculativeToken(tup) \
+( \
+ AssertMacro(HeapTupleHeaderIsSpeculative(tup)), \
+ ItemPointerGetBlockNumber(&(tup)->t_ctid) \
+)
+
+#define HeapTupleHeaderSetSpeculativeToken(tup, token) \
+( \
+ ItemPointerSet(&(tup)->t_ctid, token, SpecTokenOffsetNumber) \
+)
+
+#define HeapTupleHeaderIndicatesMovedPartitions(tup) \
+ ItemPointerIndicatesMovedPartitions(&(tup)->t_ctid)
+
+#define HeapTupleHeaderSetMovedPartitions(tup) \
+ ItemPointerSetMovedPartitions(&(tup)->t_ctid)
+
+#define HeapTupleHeaderGetDatumLength(tup) \
+ VARSIZE(tup)
+
+#define HeapTupleHeaderSetDatumLength(tup, len) \
+ SET_VARSIZE(tup, len)
+
+#define HeapTupleHeaderGetTypeId(tup) \
+( \
+ (tup)->t_choice.t_datum.datum_typeid \
+)
+
+#define HeapTupleHeaderSetTypeId(tup, typeid) \
+( \
+ (tup)->t_choice.t_datum.datum_typeid = (typeid) \
+)
+
+#define HeapTupleHeaderGetTypMod(tup) \
+( \
+ (tup)->t_choice.t_datum.datum_typmod \
+)
+
+#define HeapTupleHeaderSetTypMod(tup, typmod) \
+( \
+ (tup)->t_choice.t_datum.datum_typmod = (typmod) \
+)
+
+/*
+ * Note that we stop considering a tuple HOT-updated as soon as it is known
+ * aborted or the would-be updating transaction is known aborted. For best
+ * efficiency, check tuple visibility before using this macro, so that the
+ * INVALID bits will be as up to date as possible.
+ */
+#define HeapTupleHeaderIsHotUpdated(tup) \
+( \
+ ((tup)->t_infomask2 & HEAP_HOT_UPDATED) != 0 && \
+ ((tup)->t_infomask & HEAP_XMAX_INVALID) == 0 && \
+ !HeapTupleHeaderXminInvalid(tup) \
+)
+
+#define HeapTupleHeaderSetHotUpdated(tup) \
+( \
+ (tup)->t_infomask2 |= HEAP_HOT_UPDATED \
+)
+
+#define HeapTupleHeaderClearHotUpdated(tup) \
+( \
+ (tup)->t_infomask2 &= ~HEAP_HOT_UPDATED \
+)
+
+#define HeapTupleHeaderIsHeapOnly(tup) \
+( \
+ ((tup)->t_infomask2 & HEAP_ONLY_TUPLE) != 0 \
+)
+
+#define HeapTupleHeaderSetHeapOnly(tup) \
+( \
+ (tup)->t_infomask2 |= HEAP_ONLY_TUPLE \
+)
+
+#define HeapTupleHeaderClearHeapOnly(tup) \
+( \
+ (tup)->t_infomask2 &= ~HEAP_ONLY_TUPLE \
+)
+
+#define HeapTupleHeaderHasMatch(tup) \
+( \
+ ((tup)->t_infomask2 & HEAP_TUPLE_HAS_MATCH) != 0 \
+)
+
+#define HeapTupleHeaderSetMatch(tup) \
+( \
+ (tup)->t_infomask2 |= HEAP_TUPLE_HAS_MATCH \
+)
+
+#define HeapTupleHeaderClearMatch(tup) \
+( \
+ (tup)->t_infomask2 &= ~HEAP_TUPLE_HAS_MATCH \
+)
+
+#define HeapTupleHeaderGetNatts(tup) \
+ ((tup)->t_infomask2 & HEAP_NATTS_MASK)
+
+#define HeapTupleHeaderSetNatts(tup, natts) \
+( \
+ (tup)->t_infomask2 = ((tup)->t_infomask2 & ~HEAP_NATTS_MASK) | (natts) \
+)
+
+#define HeapTupleHeaderHasExternal(tup) \
+ (((tup)->t_infomask & HEAP_HASEXTERNAL) != 0)
+
+
+/*
+ * BITMAPLEN(NATTS) -
+ * Computes size of null bitmap given number of data columns.
+ */
+#define BITMAPLEN(NATTS) (((int)(NATTS) + 7) / 8)
+
+/*
+ * MaxHeapTupleSize is the maximum allowed size of a heap tuple, including
+ * header and MAXALIGN alignment padding. Basically it's BLCKSZ minus the
+ * other stuff that has to be on a disk page. Since heap pages use no
+ * "special space", there's no deduction for that.
+ *
+ * NOTE: we allow for the ItemId that must point to the tuple, ensuring that
+ * an otherwise-empty page can indeed hold a tuple of this size. Because
+ * ItemIds and tuples have different alignment requirements, don't assume that
+ * you can, say, fit 2 tuples of size MaxHeapTupleSize/2 on the same page.
+ */
+#define MaxHeapTupleSize (BLCKSZ - MAXALIGN(SizeOfPageHeaderData + sizeof(ItemIdData)))
+#define MinHeapTupleSize MAXALIGN(SizeofHeapTupleHeader)
+
+/*
+ * MaxHeapTuplesPerPage is an upper bound on the number of tuples that can
+ * fit on one heap page. (Note that indexes could have more, because they
+ * use a smaller tuple header.) We arrive at the divisor because each tuple
+ * must be maxaligned, and it must have an associated line pointer.
+ *
+ * Note: with HOT, there could theoretically be more line pointers (not actual
+ * tuples) than this on a heap page. However we constrain the number of line
+ * pointers to this anyway, to avoid excessive line-pointer bloat and not
+ * require increases in the size of work arrays.
+ */
+#define MaxHeapTuplesPerPage \
+ ((int) ((BLCKSZ - SizeOfPageHeaderData) / \
+ (MAXALIGN(SizeofHeapTupleHeader) + sizeof(ItemIdData))))
+
+/*
+ * MaxAttrSize is a somewhat arbitrary upper limit on the declared size of
+ * data fields of char(n) and similar types. It need not have anything
+ * directly to do with the *actual* upper limit of varlena values, which
+ * is currently 1Gb (see TOAST structures in postgres.h). I've set it
+ * at 10Mb which seems like a reasonable number --- tgl 8/6/00.
+ */
+#define MaxAttrSize (10 * 1024 * 1024)
+
+
+/*
+ * MinimalTuple is an alternative representation that is used for transient
+ * tuples inside the executor, in places where transaction status information
+ * is not required, the tuple rowtype is known, and shaving off a few bytes
+ * is worthwhile because we need to store many tuples. The representation
+ * is chosen so that tuple access routines can work with either full or
+ * minimal tuples via a HeapTupleData pointer structure. The access routines
+ * see no difference, except that they must not access the transaction status
+ * or t_ctid fields because those aren't there.
+ *
+ * For the most part, MinimalTuples should be accessed via TupleTableSlot
+ * routines. These routines will prevent access to the "system columns"
+ * and thereby prevent accidental use of the nonexistent fields.
+ *
+ * MinimalTupleData contains a length word, some padding, and fields matching
+ * HeapTupleHeaderData beginning with t_infomask2. The padding is chosen so
+ * that offsetof(t_infomask2) is the same modulo MAXIMUM_ALIGNOF in both
+ * structs. This makes data alignment rules equivalent in both cases.
+ *
+ * When a minimal tuple is accessed via a HeapTupleData pointer, t_data is
+ * set to point MINIMAL_TUPLE_OFFSET bytes before the actual start of the
+ * minimal tuple --- that is, where a full tuple matching the minimal tuple's
+ * data would start. This trick is what makes the structs seem equivalent.
+ *
+ * Note that t_hoff is computed the same as in a full tuple, hence it includes
+ * the MINIMAL_TUPLE_OFFSET distance. t_len does not include that, however.
+ *
+ * MINIMAL_TUPLE_DATA_OFFSET is the offset to the first useful (non-pad) data
+ * other than the length word. tuplesort.c and tuplestore.c use this to avoid
+ * writing the padding to disk.
+ */
+#define MINIMAL_TUPLE_OFFSET \
+ ((offsetof(HeapTupleHeaderData, t_infomask2) - sizeof(uint32)) / MAXIMUM_ALIGNOF * MAXIMUM_ALIGNOF)
+#define MINIMAL_TUPLE_PADDING \
+ ((offsetof(HeapTupleHeaderData, t_infomask2) - sizeof(uint32)) % MAXIMUM_ALIGNOF)
+#define MINIMAL_TUPLE_DATA_OFFSET \
+ offsetof(MinimalTupleData, t_infomask2)
+
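+/*
+ * Sketch of the overlay trick described above: code that wants to read a
+ * MinimalTuple through the regular HeapTuple accessors sets things up
+ * roughly like this (illustration only).
+ *
+ *     MinimalTuple  mtup = ...;
+ *     HeapTupleData htup;
+ *
+ *     htup.t_len = mtup->t_len + MINIMAL_TUPLE_OFFSET;
+ *     htup.t_data = (HeapTupleHeader) ((char *) mtup - MINIMAL_TUPLE_OFFSET);
+ *     // attribute accessors may now be applied to &htup, as long as they
+ *     // don't touch the transaction status fields or t_ctid
+ */
+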
+struct MinimalTupleData
+{
+ uint32 t_len; /* actual length of minimal tuple */
+
+ char mt_padding[MINIMAL_TUPLE_PADDING];
+
+ /* Fields below here must match HeapTupleHeaderData! */
+
+ uint16 t_infomask2; /* number of attributes + various flags */
+
+ uint16 t_infomask; /* various flag bits, see below */
+
+ uint8 t_hoff; /* sizeof header incl. bitmap, padding */
+
+ /* ^ - 23 bytes - ^ */
+
+ bits8 t_bits[FLEXIBLE_ARRAY_MEMBER]; /* bitmap of NULLs */
+
+ /* MORE DATA FOLLOWS AT END OF STRUCT */
+};
+
+/* typedef appears in htup.h */
+
+#define SizeofMinimalTupleHeader offsetof(MinimalTupleData, t_bits)
+
+
+/*
+ * GETSTRUCT - given a HeapTuple pointer, return address of the user data
+ */
+#define GETSTRUCT(TUP) ((char *) ((TUP)->t_data) + (TUP)->t_data->t_hoff)
+
+/*
+ * Accessor macros to be used with HeapTuple pointers.
+ */
+
+#define HeapTupleHasNulls(tuple) \
+ (((tuple)->t_data->t_infomask & HEAP_HASNULL) != 0)
+
+#define HeapTupleNoNulls(tuple) \
+ (!((tuple)->t_data->t_infomask & HEAP_HASNULL))
+
+#define HeapTupleHasVarWidth(tuple) \
+ (((tuple)->t_data->t_infomask & HEAP_HASVARWIDTH) != 0)
+
+#define HeapTupleAllFixed(tuple) \
+ (!((tuple)->t_data->t_infomask & HEAP_HASVARWIDTH))
+
+#define HeapTupleHasExternal(tuple) \
+ (((tuple)->t_data->t_infomask & HEAP_HASEXTERNAL) != 0)
+
+#define HeapTupleIsHotUpdated(tuple) \
+ HeapTupleHeaderIsHotUpdated((tuple)->t_data)
+
+#define HeapTupleSetHotUpdated(tuple) \
+ HeapTupleHeaderSetHotUpdated((tuple)->t_data)
+
+#define HeapTupleClearHotUpdated(tuple) \
+ HeapTupleHeaderClearHotUpdated((tuple)->t_data)
+
+#define HeapTupleIsHeapOnly(tuple) \
+ HeapTupleHeaderIsHeapOnly((tuple)->t_data)
+
+#define HeapTupleSetHeapOnly(tuple) \
+ HeapTupleHeaderSetHeapOnly((tuple)->t_data)
+
+#define HeapTupleClearHeapOnly(tuple) \
+ HeapTupleHeaderClearHeapOnly((tuple)->t_data)
+
+
+/* ----------------
+ * fastgetattr
+ *
+ * Fetch a user attribute's value as a Datum (might be either a
+ * value, or a pointer into the data area of the tuple).
+ *
+ * This must not be used when a system attribute might be requested.
+ * Furthermore, the passed attnum MUST be valid. Use heap_getattr()
+ * instead, if in doubt.
+ *
+ * This gets called many times, so we macro the cacheable and NULL
+ * lookups, and call nocachegetattr() for the rest.
+ * ----------------
+ */
+
+#if !defined(DISABLE_COMPLEX_MACRO)
+
+#define fastgetattr(tup, attnum, tupleDesc, isnull) \
+( \
+ AssertMacro((attnum) > 0), \
+ (*(isnull) = false), \
+ HeapTupleNoNulls(tup) ? \
+ ( \
+ TupleDescAttr((tupleDesc), (attnum)-1)->attcacheoff >= 0 ? \
+ ( \
+ fetchatt(TupleDescAttr((tupleDesc), (attnum)-1), \
+ (char *) (tup)->t_data + (tup)->t_data->t_hoff + \
+ TupleDescAttr((tupleDesc), (attnum)-1)->attcacheoff)\
+ ) \
+ : \
+ nocachegetattr((tup), (attnum), (tupleDesc)) \
+ ) \
+ : \
+ ( \
+ att_isnull((attnum)-1, (tup)->t_data->t_bits) ? \
+ ( \
+ (*(isnull) = true), \
+ (Datum)NULL \
+ ) \
+ : \
+ ( \
+ nocachegetattr((tup), (attnum), (tupleDesc)) \
+ ) \
+ ) \
+)
+#else /* defined(DISABLE_COMPLEX_MACRO) */
+
+extern Datum fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
+ bool *isnull);
+#endif /* defined(DISABLE_COMPLEX_MACRO) */
+
+
+/* ----------------
+ * heap_getattr
+ *
+ * Extract an attribute of a heap tuple and return it as a Datum.
+ * This works for either system or user attributes. The given attnum
+ * is properly range-checked.
+ *
+ * If the field in question has a NULL value, we return a zero Datum
+ * and set *isnull == true. Otherwise, we set *isnull == false.
+ *
+ * <tup> is the pointer to the heap tuple. <attnum> is the attribute
+ * number of the column (field) caller wants. <tupleDesc> is a
+ * pointer to the structure describing the row and all its fields.
+ * ----------------
+ */
+#define heap_getattr(tup, attnum, tupleDesc, isnull) \
+ ( \
+ ((attnum) > 0) ? \
+ ( \
+ ((attnum) > (int) HeapTupleHeaderGetNatts((tup)->t_data)) ? \
+ getmissingattr((tupleDesc), (attnum), (isnull)) \
+ : \
+ fastgetattr((tup), (attnum), (tupleDesc), (isnull)) \
+ ) \
+ : \
+ heap_getsysattr((tup), (attnum), (tupleDesc), (isnull)) \
+ )
+
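+/*
+ * Typical usage, for illustration (attno is a 1-based user column number
+ * and tupdesc the relation's TupleDesc):
+ *
+ *     bool   isnull;
+ *     Datum  value = heap_getattr(tuple, attno, tupdesc, &isnull);
+ *
+ *     if (!isnull)
+ *         // interpret "value" according to the column's type
+ *
+ * To fetch every column at once, heap_deform_tuple() below is usually
+ * cheaper than repeated heap_getattr() calls.
+ */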
+
+/* prototypes for functions in common/heaptuple.c */
+extern Size heap_compute_data_size(TupleDesc tupleDesc,
+ Datum *values, bool *isnull);
+extern void heap_fill_tuple(TupleDesc tupleDesc,
+ Datum *values, bool *isnull,
+ char *data, Size data_size,
+ uint16 *infomask, bits8 *bit);
+extern bool heap_attisnull(HeapTuple tup, int attnum, TupleDesc tupleDesc);
+extern Datum nocachegetattr(HeapTuple tup, int attnum,
+ TupleDesc att);
+extern Datum heap_getsysattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
+ bool *isnull);
+extern Datum getmissingattr(TupleDesc tupleDesc,
+ int attnum, bool *isnull);
+extern HeapTuple heap_copytuple(HeapTuple tuple);
+extern void heap_copytuple_with_tuple(HeapTuple src, HeapTuple dest);
+extern Datum heap_copy_tuple_as_datum(HeapTuple tuple, TupleDesc tupleDesc);
+extern HeapTuple heap_form_tuple(TupleDesc tupleDescriptor,
+ Datum *values, bool *isnull);
+extern HeapTuple heap_modify_tuple(HeapTuple tuple,
+ TupleDesc tupleDesc,
+ Datum *replValues,
+ bool *replIsnull,
+ bool *doReplace);
+extern HeapTuple heap_modify_tuple_by_cols(HeapTuple tuple,
+ TupleDesc tupleDesc,
+ int nCols,
+ int *replCols,
+ Datum *replValues,
+ bool *replIsnull);
+extern void heap_deform_tuple(HeapTuple tuple, TupleDesc tupleDesc,
+ Datum *values, bool *isnull);
+extern void heap_freetuple(HeapTuple htup);
+extern MinimalTuple heap_form_minimal_tuple(TupleDesc tupleDescriptor,
+ Datum *values, bool *isnull);
+extern void heap_free_minimal_tuple(MinimalTuple mtup);
+extern MinimalTuple heap_copy_minimal_tuple(MinimalTuple mtup);
+extern HeapTuple heap_tuple_from_minimal_tuple(MinimalTuple mtup);
+extern MinimalTuple minimal_tuple_from_heap_tuple(HeapTuple htup);
+extern size_t varsize_any(void *p);
+extern HeapTuple heap_expand_tuple(HeapTuple sourceTuple, TupleDesc tupleDesc);
+extern MinimalTuple minimal_expand_tuple(HeapTuple sourceTuple, TupleDesc tupleDesc);
+
+#endif /* HTUP_DETAILS_H */
diff --git a/src/include/access/itup.h b/src/include/access/itup.h
new file mode 100644
index 0000000..1917375
--- /dev/null
+++ b/src/include/access/itup.h
@@ -0,0 +1,164 @@
+/*-------------------------------------------------------------------------
+ *
+ * itup.h
+ * POSTGRES index tuple definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/itup.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef ITUP_H
+#define ITUP_H
+
+#include "access/tupdesc.h"
+#include "access/tupmacs.h"
+#include "storage/bufpage.h"
+#include "storage/itemptr.h"
+
+/*
+ * Index tuple header structure
+ *
+ * All index tuples start with IndexTupleData. If the HasNulls bit is set,
+ * this is followed by an IndexAttributeBitMapData. The index attribute
+ * values follow, beginning at a MAXALIGN boundary.
+ *
+ * Note that the space allocated for the bitmap does not vary with the number
+ * of attributes; that is because we don't have room to store the number of
+ * attributes in the header. Given the MAXALIGN constraint there's no space
+ * savings to be had anyway, for usual values of INDEX_MAX_KEYS.
+ */
+
+typedef struct IndexTupleData
+{
+ ItemPointerData t_tid; /* reference TID to heap tuple */
+
+ /* ---------------
+ * t_info is laid out in the following fashion:
+ *
+ * 15th (high) bit: has nulls
+ * 14th bit: has var-width attributes
+ * 13th bit: AM-defined meaning
+ * 12-0 bit: size of tuple
+ * ---------------
+ */
+
+ unsigned short t_info; /* various info about tuple */
+
+} IndexTupleData; /* MORE DATA FOLLOWS AT END OF STRUCT */
+
+typedef IndexTupleData *IndexTuple;
+
+typedef struct IndexAttributeBitMapData
+{
+ bits8 bits[(INDEX_MAX_KEYS + 8 - 1) / 8];
+} IndexAttributeBitMapData;
+
+typedef IndexAttributeBitMapData * IndexAttributeBitMap;
+
+/*
+ * t_info manipulation macros
+ */
+#define INDEX_SIZE_MASK 0x1FFF
+#define INDEX_AM_RESERVED_BIT 0x2000 /* reserved for index-AM specific
+ * usage */
+#define INDEX_VAR_MASK 0x4000
+#define INDEX_NULL_MASK 0x8000
+
+#define IndexTupleSize(itup) ((Size) ((itup)->t_info & INDEX_SIZE_MASK))
+#define IndexTupleHasNulls(itup) ((((IndexTuple) (itup))->t_info & INDEX_NULL_MASK))
+#define IndexTupleHasVarwidths(itup) ((((IndexTuple) (itup))->t_info & INDEX_VAR_MASK))
+
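+/*
+ * For example (a sketch of the convention, not code from indextuple.c), an
+ * index AM filling in its own tuple header combines the size and flag bits
+ * like this, after verifying that "size" fits in INDEX_SIZE_MASK:
+ *
+ *     itup->t_info = (unsigned short) size;
+ *     if (hasnulls)
+ *         itup->t_info |= INDEX_NULL_MASK;
+ *     if (hasvarwidths)
+ *         itup->t_info |= INDEX_VAR_MASK;
+ */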
+
+/*
+ * Takes an infomask as argument (primarily because this needs to be usable
+ * at index_form_tuple time so enough space is allocated).
+ */
+#define IndexInfoFindDataOffset(t_info) \
+( \
+ (!((t_info) & INDEX_NULL_MASK)) ? \
+ ( \
+ (Size)MAXALIGN(sizeof(IndexTupleData)) \
+ ) \
+ : \
+ ( \
+ (Size)MAXALIGN(sizeof(IndexTupleData) + sizeof(IndexAttributeBitMapData)) \
+ ) \
+)
+
+/* ----------------
+ * index_getattr
+ *
+ * This gets called many times, so we macro the cacheable and NULL
+ * lookups, and call nocache_index_getattr() for the rest.
+ *
+ * ----------------
+ */
+#define index_getattr(tup, attnum, tupleDesc, isnull) \
+( \
+ AssertMacro(PointerIsValid(isnull) && (attnum) > 0), \
+ *(isnull) = false, \
+ !IndexTupleHasNulls(tup) ? \
+ ( \
+ TupleDescAttr((tupleDesc), (attnum)-1)->attcacheoff >= 0 ? \
+ ( \
+ fetchatt(TupleDescAttr((tupleDesc), (attnum)-1), \
+ (char *) (tup) + IndexInfoFindDataOffset((tup)->t_info) \
+ + TupleDescAttr((tupleDesc), (attnum)-1)->attcacheoff) \
+ ) \
+ : \
+ nocache_index_getattr((tup), (attnum), (tupleDesc)) \
+ ) \
+ : \
+ ( \
+ (att_isnull((attnum)-1, (char *)(tup) + sizeof(IndexTupleData))) ? \
+ ( \
+ *(isnull) = true, \
+ (Datum)NULL \
+ ) \
+ : \
+ ( \
+ nocache_index_getattr((tup), (attnum), (tupleDesc)) \
+ ) \
+ ) \
+)
+
+/*
+ * MaxIndexTuplesPerPage is an upper bound on the number of tuples that can
+ * fit on one index page. An index tuple must have either data or a null
+ * bitmap, so we can safely assume it's at least 1 byte bigger than a bare
+ * IndexTupleData struct. We arrive at the divisor because each tuple
+ * must be maxaligned, and it must have an associated line pointer.
+ *
+ * To be index-type-independent, this does not account for any special space
+ * on the page, and is thus conservative.
+ *
+ * Note: in btree non-leaf pages, the first tuple has no key (it's implicitly
+ * minus infinity), thus breaking the "at least 1 byte bigger" assumption.
+ * On such a page, N tuples could take one MAXALIGN quantum less space than
+ * estimated here, seemingly allowing one more tuple than estimated here.
+ * But such a page always has at least MAXALIGN special space, so we're safe.
+ */
+#define MaxIndexTuplesPerPage \
+ ((int) ((BLCKSZ - SizeOfPageHeaderData) / \
+ (MAXALIGN(sizeof(IndexTupleData) + 1) + sizeof(ItemIdData))))
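+
+/*
+ * Worked example, assuming the default 8192-byte BLCKSZ, a 24-byte page
+ * header, 8-byte MAXALIGN and 4-byte line pointers:
+ * (8192 - 24) / (MAXALIGN(8 + 1) + 4) = 8168 / 20 = 408 tuples per page.
+ */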
+
+
+/* routines in indextuple.c */
+extern IndexTuple index_form_tuple(TupleDesc tupleDescriptor,
+ Datum *values, bool *isnull);
+extern Datum nocache_index_getattr(IndexTuple tup, int attnum,
+ TupleDesc tupleDesc);
+extern void index_deform_tuple(IndexTuple tup, TupleDesc tupleDescriptor,
+ Datum *values, bool *isnull);
+extern void index_deform_tuple_internal(TupleDesc tupleDescriptor,
+ Datum *values, bool *isnull,
+ char *tp, bits8 *bp, int hasnulls);
+extern IndexTuple CopyIndexTuple(IndexTuple source);
+extern IndexTuple index_truncate_tuple(TupleDesc sourceDescriptor,
+ IndexTuple source, int leavenatts);
+
+#endif /* ITUP_H */
diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h
new file mode 100644
index 0000000..4bbb035
--- /dev/null
+++ b/src/include/access/multixact.h
@@ -0,0 +1,164 @@
+/*
+ * multixact.h
+ *
+ * PostgreSQL multi-transaction-log manager
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/multixact.h
+ */
+#ifndef MULTIXACT_H
+#define MULTIXACT_H
+
+#include "access/xlogreader.h"
+#include "lib/stringinfo.h"
+#include "storage/sync.h"
+
+
+/*
+ * The first two MultiXactId values are reserved to store the truncation Xid
+ * and epoch of the first segment, so we start assigning multixact values from
+ * 2.
+ */
+#define InvalidMultiXactId ((MultiXactId) 0)
+#define FirstMultiXactId ((MultiXactId) 1)
+#define MaxMultiXactId ((MultiXactId) 0xFFFFFFFF)
+
+#define MultiXactIdIsValid(multi) ((multi) != InvalidMultiXactId)
+
+#define MaxMultiXactOffset ((MultiXactOffset) 0xFFFFFFFF)
+
+/* Number of SLRU buffers to use for multixact */
+#define NUM_MULTIXACTOFFSET_BUFFERS 8
+#define NUM_MULTIXACTMEMBER_BUFFERS 16
+
+/*
+ * Possible multixact lock modes ("status"). The first four modes are for
+ * tuple locks (FOR KEY SHARE, FOR SHARE, FOR NO KEY UPDATE, FOR UPDATE); the
+ * next two are used for update and delete modes.
+ */
+typedef enum
+{
+ MultiXactStatusForKeyShare = 0x00,
+ MultiXactStatusForShare = 0x01,
+ MultiXactStatusForNoKeyUpdate = 0x02,
+ MultiXactStatusForUpdate = 0x03,
+ /* an update that doesn't touch "key" columns */
+ MultiXactStatusNoKeyUpdate = 0x04,
+ /* other updates, and delete */
+ MultiXactStatusUpdate = 0x05
+} MultiXactStatus;
+
+#define MaxMultiXactStatus MultiXactStatusUpdate
+
+/* does a status value correspond to a tuple update? */
+#define ISUPDATE_from_mxstatus(status) \
+ ((status) > MultiXactStatusForUpdate)
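+
+/*
+ * For example, MultiXactStatusForUpdate (0x03) is still only a lock, while
+ * MultiXactStatusNoKeyUpdate (0x04) and MultiXactStatusUpdate (0x05) count
+ * as updates under ISUPDATE_from_mxstatus().
+ */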
+
+
+typedef struct MultiXactMember
+{
+ TransactionId xid;
+ MultiXactStatus status;
+} MultiXactMember;
+
+
+/* ----------------
+ * multixact-related XLOG entries
+ * ----------------
+ */
+
+#define XLOG_MULTIXACT_ZERO_OFF_PAGE 0x00
+#define XLOG_MULTIXACT_ZERO_MEM_PAGE 0x10
+#define XLOG_MULTIXACT_CREATE_ID 0x20
+#define XLOG_MULTIXACT_TRUNCATE_ID 0x30
+
+typedef struct xl_multixact_create
+{
+ MultiXactId mid; /* new MultiXact's ID */
+ MultiXactOffset moff; /* its starting offset in members file */
+ int32 nmembers; /* number of member XIDs */
+ MultiXactMember members[FLEXIBLE_ARRAY_MEMBER];
+} xl_multixact_create;
+
+#define SizeOfMultiXactCreate (offsetof(xl_multixact_create, members))
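+
+/*
+ * The record's main data is therefore variable-sized:
+ * SizeOfMultiXactCreate + nmembers * sizeof(MultiXactMember) bytes.
+ */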
+
+typedef struct xl_multixact_truncate
+{
+ Oid oldestMultiDB;
+
+ /* to-be-truncated range of multixact offsets */
+ MultiXactId startTruncOff; /* just for completeness' sake */
+ MultiXactId endTruncOff;
+
+ /* to-be-truncated range of multixact members */
+ MultiXactOffset startTruncMemb;
+ MultiXactOffset endTruncMemb;
+} xl_multixact_truncate;
+
+#define SizeOfMultiXactTruncate (sizeof(xl_multixact_truncate))
+
+
+extern MultiXactId MultiXactIdCreate(TransactionId xid1,
+ MultiXactStatus status1, TransactionId xid2,
+ MultiXactStatus status2);
+extern MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid,
+ MultiXactStatus status);
+extern MultiXactId MultiXactIdCreateFromMembers(int nmembers,
+ MultiXactMember *members);
+
+extern MultiXactId ReadNextMultiXactId(void);
+extern void ReadMultiXactIdRange(MultiXactId *oldest, MultiXactId *next);
+extern bool MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly);
+extern void MultiXactIdSetOldestMember(void);
+extern int GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **xids,
+ bool allow_old, bool isLockOnly);
+extern bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2);
+extern bool MultiXactIdPrecedesOrEquals(MultiXactId multi1,
+ MultiXactId multi2);
+
+extern int multixactoffsetssyncfiletag(const FileTag *ftag, char *path);
+extern int multixactmemberssyncfiletag(const FileTag *ftag, char *path);
+
+extern void AtEOXact_MultiXact(void);
+extern void AtPrepare_MultiXact(void);
+extern void PostPrepare_MultiXact(TransactionId xid);
+
+extern Size MultiXactShmemSize(void);
+extern void MultiXactShmemInit(void);
+extern void BootStrapMultiXact(void);
+extern void StartupMultiXact(void);
+extern void TrimMultiXact(void);
+extern void SetMultiXactIdLimit(MultiXactId oldest_datminmxid,
+ Oid oldest_datoid,
+ bool is_startup);
+extern void MultiXactGetCheckptMulti(bool is_shutdown,
+ MultiXactId *nextMulti,
+ MultiXactOffset *nextMultiOffset,
+ MultiXactId *oldestMulti,
+ Oid *oldestMultiDB);
+extern void CheckPointMultiXact(void);
+extern MultiXactId GetOldestMultiXactId(void);
+extern void TruncateMultiXact(MultiXactId oldestMulti, Oid oldestMultiDB);
+extern void MultiXactSetNextMXact(MultiXactId nextMulti,
+ MultiXactOffset nextMultiOffset);
+extern void MultiXactAdvanceNextMXact(MultiXactId minMulti,
+ MultiXactOffset minMultiOffset);
+extern void MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB);
+extern int MultiXactMemberFreezeThreshold(void);
+
+extern void multixact_twophase_recover(TransactionId xid, uint16 info,
+ void *recdata, uint32 len);
+extern void multixact_twophase_postcommit(TransactionId xid, uint16 info,
+ void *recdata, uint32 len);
+extern void multixact_twophase_postabort(TransactionId xid, uint16 info,
+ void *recdata, uint32 len);
+
+extern void multixact_redo(XLogReaderState *record);
+extern void multixact_desc(StringInfo buf, XLogReaderState *record);
+extern const char *multixact_identify(uint8 info);
+extern char *mxid_to_string(MultiXactId multi, int nmembers,
+ MultiXactMember *members);
+
+#endif /* MULTIXACT_H */
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
new file mode 100644
index 0000000..30a216e
--- /dev/null
+++ b/src/include/access/nbtree.h
@@ -0,0 +1,1286 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtree.h
+ * header file for postgres btree access method implementation.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/nbtree.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef NBTREE_H
+#define NBTREE_H
+
+#include "access/amapi.h"
+#include "access/itup.h"
+#include "access/sdir.h"
+#include "access/tableam.h"
+#include "access/xlogreader.h"
+#include "catalog/pg_am_d.h"
+#include "catalog/pg_index.h"
+#include "lib/stringinfo.h"
+#include "storage/bufmgr.h"
+#include "storage/shm_toc.h"
+
+/* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */
+typedef uint16 BTCycleId;
+
+/*
+ * BTPageOpaqueData -- At the end of every page, we store a pointer
+ * to both siblings in the tree. This is used to do forward/backward
+ * index scans. The next-page link is also critical for recovery when
+ * a search has navigated to the wrong page due to concurrent page splits
+ * or deletions; see src/backend/access/nbtree/README for more info.
+ *
+ * In addition, we store the page's btree level (counting upwards from
+ * zero at a leaf page) as well as some flag bits indicating the page type
+ * and status. If the page is deleted, a BTDeletedPageData struct is stored
+ * in the page's tuple area, while a standard BTPageOpaqueData struct is
+ * stored in the page special area.
+ *
+ * We also store a "vacuum cycle ID". When a page is split while VACUUM is
+ * processing the index, a nonzero value associated with the VACUUM run is
+ * stored into both halves of the split page. (If VACUUM is not running,
+ * both pages receive zero cycleids.) This allows VACUUM to detect whether
+ * a page was split since it started, with a small probability of false match
+ * if the page was last split some exact multiple of MAX_BT_CYCLE_ID VACUUMs
+ * ago. Also, during a split, the BTP_SPLIT_END flag is cleared in the left
+ * (original) page, and set in the right page, but only if the next page
+ * to its right has a different cycleid.
+ *
+ * NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested
+ * instead.
+ *
+ * NOTE: the btpo_level field used to be a union type in order to allow
+ * deleted pages to store a 32-bit safexid in the same field. We now store
+ * 64-bit/full safexid values using BTDeletedPageData instead.
+ */
+
+typedef struct BTPageOpaqueData
+{
+ BlockNumber btpo_prev; /* left sibling, or P_NONE if leftmost */
+ BlockNumber btpo_next; /* right sibling, or P_NONE if rightmost */
+ uint32 btpo_level; /* tree level --- zero for leaf pages */
+ uint16 btpo_flags; /* flag bits, see below */
+ BTCycleId btpo_cycleid; /* vacuum cycle ID of latest split */
+} BTPageOpaqueData;
+
+typedef BTPageOpaqueData *BTPageOpaque;
+
+/* Bits defined in btpo_flags */
+#define BTP_LEAF (1 << 0) /* leaf page, i.e. not internal page */
+#define BTP_ROOT (1 << 1) /* root page (has no parent) */
+#define BTP_DELETED (1 << 2) /* page has been deleted from tree */
+#define BTP_META (1 << 3) /* meta-page */
+#define BTP_HALF_DEAD (1 << 4) /* empty, but still in tree */
+#define BTP_SPLIT_END (1 << 5) /* rightmost page of split group */
+#define BTP_HAS_GARBAGE (1 << 6) /* page has LP_DEAD tuples (deprecated) */
+#define BTP_INCOMPLETE_SPLIT (1 << 7) /* right sibling's downlink is missing */
+#define BTP_HAS_FULLXID (1 << 8) /* contains BTDeletedPageData */
+
+/*
+ * The max allowed value of a cycle ID is a bit less than 64K. This is
+ * for convenience of pg_filedump and similar utilities: we want to use
+ * the last 2 bytes of special space as an index type indicator, and
+ * restricting cycle ID lets btree use that space for vacuum cycle IDs
+ * while still allowing index type to be identified.
+ */
+#define MAX_BT_CYCLE_ID 0xFF7F
+
+
+/*
+ * The Meta page is always the first page in the btree index.
+ * Its primary purpose is to point to the location of the btree root page.
+ * We also point to the "fast" root, which is the current effective root;
+ * see README for discussion.
+ */
+
+typedef struct BTMetaPageData
+{
+ uint32 btm_magic; /* should contain BTREE_MAGIC */
+ uint32 btm_version; /* nbtree version (always <= BTREE_VERSION) */
+ BlockNumber btm_root; /* current root location */
+ uint32 btm_level; /* tree level of the root page */
+ BlockNumber btm_fastroot; /* current "fast" root location */
+ uint32 btm_fastlevel; /* tree level of the "fast" root page */
+ /* remaining fields only valid when btm_version >= BTREE_NOVAC_VERSION */
+
+ /* number of deleted, non-recyclable pages during last cleanup */
+ uint32 btm_last_cleanup_num_delpages;
+ /* number of heap tuples during last cleanup (deprecated) */
+ float8 btm_last_cleanup_num_heap_tuples;
+
+ bool btm_allequalimage; /* are all columns "equalimage"? */
+} BTMetaPageData;
+
+#define BTPageGetMeta(p) \
+ ((BTMetaPageData *) PageGetContents(p))
+
+/*
+ * The current Btree version is 4. That's what you'll get when you create
+ * a new index.
+ *
+ * Btree version 3 was used in PostgreSQL v11. It is mostly the same as
+ * version 4, but heap TIDs were not part of the keyspace. Index tuples
+ * with duplicate keys could be stored in any order. We continue to
+ * support reading and writing Btree versions 2 and 3, so that they don't
+ * need to be immediately re-indexed at pg_upgrade. In order to get the
+ * new heapkeyspace semantics, however, a REINDEX is needed.
+ *
+ * Deduplication is safe to use when the btm_allequalimage field is set to
+ * true. It's safe to read the btm_allequalimage field on version 3, but
+ * only version 4 indexes make use of deduplication. Even version 4
+ * indexes created on PostgreSQL v12 will need a REINDEX to make use of
+ * deduplication, though, since there is no other way to set
+ * btm_allequalimage to true (pg_upgrade hasn't been taught to set the
+ * metapage field).
+ *
+ * Btree version 2 is mostly the same as version 3. There are two new
+ * fields in the metapage that were introduced in version 3. A version 2
+ * metapage will be automatically upgraded to version 3 on the first
+ * insert to it. INCLUDE indexes cannot use version 2.
+ */
+#define BTREE_METAPAGE 0 /* first page is meta */
+#define BTREE_MAGIC 0x053162 /* magic number in metapage */
+#define BTREE_VERSION 4 /* current version number */
+#define BTREE_MIN_VERSION 2 /* minimum supported version */
+#define BTREE_NOVAC_VERSION 3 /* version with all meta fields set */
+
+/*
+ * Maximum size of a btree index entry, including its tuple header.
+ *
+ * We actually need to be able to fit three items on every page,
+ * so restrict any one item to 1/3 the per-page available space.
+ *
+ * There are rare cases where _bt_truncate() will need to enlarge
+ * a heap index tuple to make space for a tiebreaker heap TID
+ * attribute, which we account for here.
+ */
+#define BTMaxItemSize(page) \
+ MAXALIGN_DOWN((PageGetPageSize(page) - \
+ MAXALIGN(SizeOfPageHeaderData + \
+ 3*sizeof(ItemIdData) + \
+ 3*sizeof(ItemPointerData)) - \
+ MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
+#define BTMaxItemSizeNoHeapTid(page) \
+ MAXALIGN_DOWN((PageGetPageSize(page) - \
+ MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
+ MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
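+
+/*
+ * Worked example, assuming the default 8192-byte BLCKSZ and 8-byte MAXALIGN:
+ * BTMaxItemSize works out to 2704 bytes and BTMaxItemSizeNoHeapTid to 2712
+ * bytes -- the limits quoted by "index row size exceeds ..." errors.
+ */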
+
+/*
+ * MaxTIDsPerBTreePage is an upper bound on the number of heap TIDs
+ * that may be stored on a btree leaf page. It is used to size the
+ * per-page temporary buffers.
+ *
+ * Note: we don't bother considering per-tuple overheads here to keep
+ * things simple (value is based on how many elements a single array of
+ * heap TIDs must have to fill the space between the page header and
+ * special area). The value is slightly higher (i.e. more conservative)
+ * than necessary as a result, which is considered acceptable.
+ */
+#define MaxTIDsPerBTreePage \
+ (int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \
+ sizeof(ItemPointerData))
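+
+/*
+ * Worked example, assuming the default 8192-byte BLCKSZ (16-byte
+ * BTPageOpaqueData, 6-byte ItemPointerData):
+ * (8192 - 24 - 16) / 6 = 1358 TIDs.
+ */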
+
+/*
+ * The leaf-page fillfactor defaults to 90% but is user-adjustable.
+ * For pages above the leaf level, we use a fixed 70% fillfactor.
+ * The fillfactor is applied during index build and when splitting
+ * a rightmost page; when splitting non-rightmost pages we try to
+ * divide the data equally. When splitting a page that's entirely
+ * filled with a single value (duplicates), the effective leaf-page
+ * fillfactor is 96%, regardless of whether the page is a rightmost
+ * page.
+ */
+#define BTREE_MIN_FILLFACTOR 10
+#define BTREE_DEFAULT_FILLFACTOR 90
+#define BTREE_NONLEAF_FILLFACTOR 70
+#define BTREE_SINGLEVAL_FILLFACTOR 96
+
+/*
+ * In general, the btree code tries to localize its knowledge about
+ * page layout to a couple of routines. However, we need a special
+ * value to indicate "no page number" in those places where we expect
+ * page numbers. We can use zero for this because we never need to
+ * make a pointer to the metadata page.
+ */
+
+#define P_NONE 0
+
+/*
+ * Macros to test whether a page is leftmost or rightmost on its tree level,
+ * as well as other state info kept in the opaque data.
+ */
+#define P_LEFTMOST(opaque) ((opaque)->btpo_prev == P_NONE)
+#define P_RIGHTMOST(opaque) ((opaque)->btpo_next == P_NONE)
+#define P_ISLEAF(opaque) (((opaque)->btpo_flags & BTP_LEAF) != 0)
+#define P_ISROOT(opaque) (((opaque)->btpo_flags & BTP_ROOT) != 0)
+#define P_ISDELETED(opaque) (((opaque)->btpo_flags & BTP_DELETED) != 0)
+#define P_ISMETA(opaque) (((opaque)->btpo_flags & BTP_META) != 0)
+#define P_ISHALFDEAD(opaque) (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)
+#define P_IGNORE(opaque) (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)
+#define P_HAS_GARBAGE(opaque) (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
+#define P_INCOMPLETE_SPLIT(opaque) (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
+#define P_HAS_FULLXID(opaque) (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)
+
+/*
+ * BTDeletedPageData is the page contents of a deleted page
+ */
+typedef struct BTDeletedPageData
+{
+ FullTransactionId safexid; /* See BTPageIsRecyclable() */
+} BTDeletedPageData;
+
+static inline void
+BTPageSetDeleted(Page page, FullTransactionId safexid)
+{
+ BTPageOpaque opaque;
+ PageHeader header;
+ BTDeletedPageData *contents;
+
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ header = ((PageHeader) page);
+
+ opaque->btpo_flags &= ~BTP_HALF_DEAD;
+ opaque->btpo_flags |= BTP_DELETED | BTP_HAS_FULLXID;
+ header->pd_lower = MAXALIGN(SizeOfPageHeaderData) +
+ sizeof(BTDeletedPageData);
+ header->pd_upper = header->pd_special;
+
+ /* Set safexid in deleted page */
+ contents = ((BTDeletedPageData *) PageGetContents(page));
+ contents->safexid = safexid;
+}
+
+static inline FullTransactionId
+BTPageGetDeleteXid(Page page)
+{
+ BTPageOpaque opaque;
+ BTDeletedPageData *contents;
+
+ /* We only expect to be called with a deleted page */
+ Assert(!PageIsNew(page));
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ Assert(P_ISDELETED(opaque));
+
+ /* pg_upgrade'd deleted page -- must be safe to delete now */
+ if (!P_HAS_FULLXID(opaque))
+ return FirstNormalFullTransactionId;
+
+ /* Get safexid from deleted page */
+ contents = ((BTDeletedPageData *) PageGetContents(page));
+ return contents->safexid;
+}
+
+/*
+ * Is an existing page recyclable?
+ *
+ * This exists to centralize the policy on which deleted pages are now safe to
+ * re-use. However, _bt_pendingfsm_finalize() duplicates some of the same
+ * logic because it doesn't work directly with pages -- keep the two in sync.
+ *
+ * Note: PageIsNew() pages are always safe to recycle, but we can't deal with
+ * them here (the caller is responsible for that case). Caller might
+ * well need special handling for new pages anyway.
+ */
+static inline bool
+BTPageIsRecyclable(Page page)
+{
+ BTPageOpaque opaque;
+
+ Assert(!PageIsNew(page));
+
+ /* Recycling okay iff page is deleted and safexid is old enough */
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (P_ISDELETED(opaque))
+ {
+ /*
+ * The page was deleted, but when? If it was just deleted, a scan
+ * might have seen the downlink to it, and will read the page later.
+ * As long as that can happen, we must keep the deleted page around as
+ * a tombstone.
+ *
+ * For that, check whether the deletion XID could still be visible to
+ * anyone. If not, then no scan that's still in progress could have
+ * seen its downlink, and we can recycle it.
+ *
+ * XXX: If we had the heap relation we could be more aggressive about
+ * recycling deleted pages in non-catalog relations. For now we just
+ * pass NULL. That is at least simple and consistent.
+ */
+ return GlobalVisCheckRemovableFullXid(NULL, BTPageGetDeleteXid(page));
+ }
+
+ return false;
+}
+
+/*
+ * BTVacState and BTPendingFSM are private nbtree.c state used during VACUUM.
+ * They are exported for use by page deletion related code in nbtpage.c.
+ */
+typedef struct BTPendingFSM
+{
+ BlockNumber target; /* Page deleted by current VACUUM */
+ FullTransactionId safexid; /* Page's BTDeletedPageData.safexid */
+} BTPendingFSM;
+
+typedef struct BTVacState
+{
+ IndexVacuumInfo *info;
+ IndexBulkDeleteResult *stats;
+ IndexBulkDeleteCallback callback;
+ void *callback_state;
+ BTCycleId cycleid;
+ MemoryContext pagedelcontext;
+
+ /*
+ * _bt_pendingfsm_finalize() state
+ */
+ int bufsize; /* pendingpages space (in # elements) */
+ int maxbufsize; /* max bufsize that respects work_mem */
+ BTPendingFSM *pendingpages; /* One entry per newly deleted page */
+ int npendingpages; /* current # valid pendingpages */
+} BTVacState;
+
+/*
+ * Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
+ * page. The high key is not a tuple that is used to visit the heap. It is
+ * a pivot tuple (see "Notes on B-Tree tuple format" below for definition).
+ * The high key on a page is required to be greater than or equal to any
+ * other key that appears on the page. If we find ourselves trying to
+ * insert a key that is strictly > high key, we know we need to move right
+ * (this should only happen if the page was split since we examined the
+ * parent page).
+ *
+ * Our insertion algorithm guarantees that we can use the initial least key
+ * on our right sibling as the high key. Once a page is created, its high
+ * key changes only if the page is split.
+ *
+ * On a non-rightmost page, the high key lives in item 1 and data items
+ * start in item 2. Rightmost pages have no high key, so we store data
+ * items beginning in item 1.
+ */
+
+#define P_HIKEY ((OffsetNumber) 1)
+#define P_FIRSTKEY ((OffsetNumber) 2)
+#define P_FIRSTDATAKEY(opaque) (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
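+
+/*
+ * Minimal sketch of the resulting page-scan idiom, assuming the caller has
+ * the page pinned and at least read-locked. The helper name is hypothetical;
+ * real scan code lives in nbtsearch.c.
+ */
+static inline int
+BTPageCountDataItems(Page page)
+{
+ BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
+ OffsetNumber offnum;
+ int nitems = 0;
+
+ /* item 1 is the high key on non-rightmost pages, so start past it there */
+ for (offnum = P_FIRSTDATAKEY(opaque); offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ nitems++;
+
+ return nitems;
+}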
+
+/*
+ * Notes on B-Tree tuple format, and key and non-key attributes:
+ *
+ * INCLUDE B-Tree indexes have non-key attributes. These are extra
+ * attributes that may be returned by index-only scans, but do not influence
+ * the order of items in the index (formally, non-key attributes are not
+ * considered to be part of the key space). Non-key attributes are only
+ * present in leaf index tuples whose item pointers actually point to heap
+ * tuples (non-pivot tuples). _bt_check_natts() enforces the rules
+ * described here.
+ *
+ * Non-pivot tuple format (plain/non-posting variant):
+ *
+ * t_tid | t_info | key values | INCLUDE columns, if any
+ *
+ * t_tid points to the heap TID, which is a tiebreaker key column as of
+ * BTREE_VERSION 4.
+ *
+ * Non-pivot tuples complement pivot tuples, which only have key columns.
+ * The sole purpose of pivot tuples is to represent how the key space is
+ * separated. In general, any B-Tree index that has more than one level
+ * (i.e. any index that does not just consist of a metapage and a single
+ * leaf root page) must have some number of pivot tuples, since pivot
+ * tuples are used for traversing the tree. Suffix truncation can omit
+ * trailing key columns when a new pivot is formed, which makes minus
+ * infinity their logical value. Since BTREE_VERSION 4 indexes treat heap
+ * TID as a trailing key column that ensures that all index tuples are
+ * physically unique, it is necessary to represent heap TID as a trailing
+ * key column in pivot tuples, though very often this can be truncated
+ * away, just like any other key column. (Actually, the heap TID is
+ * omitted rather than truncated, since its representation is different to
+ * the non-pivot representation.)
+ *
+ * Pivot tuple format:
+ *
+ * t_tid | t_info | key values | [heap TID]
+ *
+ * We store the number of columns present inside pivot tuples by abusing
+ * their t_tid offset field, since pivot tuples never need to store a real
+ * offset (pivot tuples generally store a downlink in t_tid, though). The
+ * offset field only stores the number of columns/attributes when the
+ * INDEX_ALT_TID_MASK bit is set, which doesn't count the trailing heap
+ * TID column sometimes stored in pivot tuples -- that's represented by
+ * the presence of BT_PIVOT_HEAP_TID_ATTR. The INDEX_ALT_TID_MASK bit in
+ * t_info is always set on BTREE_VERSION 4 pivot tuples, since
+ * BTreeTupleIsPivot() must work reliably on heapkeyspace versions.
+ *
+ * In version 2 or version 3 (!heapkeyspace) indexes, INDEX_ALT_TID_MASK
+ * might not be set in pivot tuples. BTreeTupleIsPivot() won't work
+ * reliably as a result. The number of columns stored is implicitly the
+ * same as the number of columns in the index, just like any non-pivot
+ * tuple. (The number of columns stored should not vary, since suffix
+ * truncation of key columns is unsafe within any !heapkeyspace index.)
+ *
+ * The 12 least significant bits from t_tid's offset number are used to
+ * represent the number of key columns within a pivot tuple. This leaves 4
+ * status bits (BT_STATUS_OFFSET_MASK bits), which are shared by all tuples
+ * that have the INDEX_ALT_TID_MASK bit set (set in t_info) to store basic
+ * tuple metadata. BTreeTupleIsPivot() and BTreeTupleIsPosting() use the
+ * BT_STATUS_OFFSET_MASK bits.
+ *
+ * Sometimes non-pivot tuples also use a representation that repurposes
+ * t_tid to store metadata rather than a TID. PostgreSQL v13 introduced a
+ * new non-pivot tuple format to support deduplication: posting list
+ * tuples. Deduplication merges together multiple equal non-pivot tuples
+ * into a logically equivalent, space efficient representation. A posting
+ * list is an array of ItemPointerData elements. Non-pivot tuples are
+ * merged together to form posting list tuples lazily, at the point where
+ * we'd otherwise have to split a leaf page.
+ *
+ * Posting tuple format (alternative non-pivot tuple representation):
+ *
+ * t_tid | t_info | key values | posting list (TID array)
+ *
+ * Posting list tuples are recognized as such by having the
+ * INDEX_ALT_TID_MASK status bit set in t_info and the BT_IS_POSTING status
+ * bit set in t_tid's offset number. These flags redefine the content of
+ * the posting tuple's t_tid to store the location of the posting list
+ * (instead of a block number), as well as the total number of heap TIDs
+ * present in the tuple (instead of a real offset number).
+ *
+ * The 12 least significant bits from t_tid's offset number are used to
+ * represent the number of heap TIDs present in the tuple, leaving 4 status
+ * bits (the BT_STATUS_OFFSET_MASK bits). Like any non-pivot tuple, the
+ * number of columns stored is always implicitly the total number in the
+ * index (in practice there can never be non-key columns stored, since
+ * deduplication is not supported with INCLUDE indexes).
+ */
+#define INDEX_ALT_TID_MASK INDEX_AM_RESERVED_BIT
+
+/* Item pointer offset bit masks */
+#define BT_OFFSET_MASK 0x0FFF
+#define BT_STATUS_OFFSET_MASK 0xF000
+/* BT_STATUS_OFFSET_MASK status bits */
+#define BT_PIVOT_HEAP_TID_ATTR 0x1000
+#define BT_IS_POSTING 0x2000
+
+/*
+ * Note: BTreeTupleIsPivot() can have false negatives (but not false
+ * positives) when used with !heapkeyspace indexes
+ */
+static inline bool
+BTreeTupleIsPivot(IndexTuple itup)
+{
+ if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
+ return false;
+ /* absence of BT_IS_POSTING in offset number indicates pivot tuple */
+ if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) != 0)
+ return false;
+
+ return true;
+}
+
+static inline bool
+BTreeTupleIsPosting(IndexTuple itup)
+{
+ if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
+ return false;
+ /* presence of BT_IS_POSTING in offset number indicates posting tuple */
+ if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) == 0)
+ return false;
+
+ return true;
+}
+
+static inline void
+BTreeTupleSetPosting(IndexTuple itup, uint16 nhtids, int postingoffset)
+{
+ Assert(nhtids > 1);
+ Assert((nhtids & BT_STATUS_OFFSET_MASK) == 0);
+ Assert((size_t) postingoffset == MAXALIGN(postingoffset));
+ Assert(postingoffset < INDEX_SIZE_MASK);
+ Assert(!BTreeTupleIsPivot(itup));
+
+ itup->t_info |= INDEX_ALT_TID_MASK;
+ ItemPointerSetOffsetNumber(&itup->t_tid, (nhtids | BT_IS_POSTING));
+ ItemPointerSetBlockNumber(&itup->t_tid, postingoffset);
+}
+
+static inline uint16
+BTreeTupleGetNPosting(IndexTuple posting)
+{
+ OffsetNumber existing;
+
+ Assert(BTreeTupleIsPosting(posting));
+
+ existing = ItemPointerGetOffsetNumberNoCheck(&posting->t_tid);
+ return (existing & BT_OFFSET_MASK);
+}
+
+static inline uint32
+BTreeTupleGetPostingOffset(IndexTuple posting)
+{
+ Assert(BTreeTupleIsPosting(posting));
+
+ return ItemPointerGetBlockNumberNoCheck(&posting->t_tid);
+}
+
+static inline ItemPointer
+BTreeTupleGetPosting(IndexTuple posting)
+{
+ return (ItemPointer) ((char *) posting +
+ BTreeTupleGetPostingOffset(posting));
+}
+
+static inline ItemPointer
+BTreeTupleGetPostingN(IndexTuple posting, int n)
+{
+ return BTreeTupleGetPosting(posting) + n;
+}
+
+/*
+ * Get/set downlink block number in pivot tuple.
+ *
+ * Note: Cannot assert that tuple is a pivot tuple. If we did so then
+ * !heapkeyspace indexes would exhibit false positive assertion failures.
+ */
+static inline BlockNumber
+BTreeTupleGetDownLink(IndexTuple pivot)
+{
+ return ItemPointerGetBlockNumberNoCheck(&pivot->t_tid);
+}
+
+static inline void
+BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)
+{
+ ItemPointerSetBlockNumber(&pivot->t_tid, blkno);
+}
+
+/*
+ * Get number of attributes within tuple.
+ *
+ * Note that this does not include an implicit tiebreaker heap TID
+ * attribute, if any. Note also that the number of key attributes must be
+ * explicitly represented in all heapkeyspace pivot tuples.
+ *
+ * Note: This is defined as a macro rather than an inline function to
+ * avoid including rel.h.
+ */
+#define BTreeTupleGetNAtts(itup, rel) \
+ ( \
+ (BTreeTupleIsPivot(itup)) ? \
+ ( \
+ ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_OFFSET_MASK \
+ ) \
+ : \
+ IndexRelationGetNumberOfAttributes(rel) \
+ )
+
+/*
+ * Set number of key attributes in tuple.
+ *
+ * The heap TID tiebreaker attribute bit may also be set here, indicating that
+ * a heap TID value will be stored at the end of the tuple (i.e. using the
+ * special pivot tuple representation).
+ */
+static inline void
+BTreeTupleSetNAtts(IndexTuple itup, uint16 nkeyatts, bool heaptid)
+{
+ Assert(nkeyatts <= INDEX_MAX_KEYS);
+ Assert((nkeyatts & BT_STATUS_OFFSET_MASK) == 0);
+ Assert(!heaptid || nkeyatts > 0);
+ Assert(!BTreeTupleIsPivot(itup) || nkeyatts == 0);
+
+ itup->t_info |= INDEX_ALT_TID_MASK;
+
+ if (heaptid)
+ nkeyatts |= BT_PIVOT_HEAP_TID_ATTR;
+
+ /* BT_IS_POSTING bit is deliberately unset here */
+ ItemPointerSetOffsetNumber(&itup->t_tid, nkeyatts);
+ Assert(BTreeTupleIsPivot(itup));
+}
+
+/*
+ * Get/set leaf page's "top parent" link from its high key. Used during page
+ * deletion.
+ *
+ * Note: Cannot assert that tuple is a pivot tuple. If we did so then
+ * !heapkeyspace indexes would exhibit false positive assertion failures.
+ */
+static inline BlockNumber
+BTreeTupleGetTopParent(IndexTuple leafhikey)
+{
+ return ItemPointerGetBlockNumberNoCheck(&leafhikey->t_tid);
+}
+
+static inline void
+BTreeTupleSetTopParent(IndexTuple leafhikey, BlockNumber blkno)
+{
+ ItemPointerSetBlockNumber(&leafhikey->t_tid, blkno);
+ BTreeTupleSetNAtts(leafhikey, 0, false);
+}
+
+/*
+ * Get tiebreaker heap TID attribute, if any.
+ *
+ * This returns the first/lowest heap TID in the case of a posting list tuple.
+ */
+static inline ItemPointer
+BTreeTupleGetHeapTID(IndexTuple itup)
+{
+ if (BTreeTupleIsPivot(itup))
+ {
+ /* Pivot tuple heap TID representation? */
+ if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
+ BT_PIVOT_HEAP_TID_ATTR) != 0)
+ return (ItemPointer) ((char *) itup + IndexTupleSize(itup) -
+ sizeof(ItemPointerData));
+
+ /* Heap TID attribute was truncated */
+ return NULL;
+ }
+ else if (BTreeTupleIsPosting(itup))
+ return BTreeTupleGetPosting(itup);
+
+ return &itup->t_tid;
+}
+
+/*
+ * Get maximum heap TID attribute, which could be the only TID in the case of
+ * a non-pivot tuple that does not have a posting list.
+ *
+ * Works with non-pivot tuples only.
+ */
+static inline ItemPointer
+BTreeTupleGetMaxHeapTID(IndexTuple itup)
+{
+ Assert(!BTreeTupleIsPivot(itup));
+
+ if (BTreeTupleIsPosting(itup))
+ {
+ uint16 nposting = BTreeTupleGetNPosting(itup);
+
+ return BTreeTupleGetPostingN(itup, nposting - 1);
+ }
+
+ return &itup->t_tid;
+}
+
+/*
+ * Operator strategy numbers for B-tree have been moved to access/stratnum.h,
+ * because many places need to use them in ScanKeyInit() calls.
+ *
+ * The strategy numbers are chosen so that we can commute them by
+ * subtraction, thus:
+ */
+#define BTCommuteStrategyNumber(strat) (BTMaxStrategyNumber + 1 - (strat))
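+
+/*
+ * For example, BTCommuteStrategyNumber(BTLessStrategyNumber) is
+ * 5 + 1 - 1 = BTGreaterStrategyNumber, and BTEqualStrategyNumber commutes
+ * to itself.
+ */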
+
+/*
+ * When a new operator class is declared, we require that the user
+ * supply us with an amproc procedure (BTORDER_PROC) for determining
+ * whether, for two keys a and b, a < b, a = b, or a > b. This routine
+ * must return < 0, 0, > 0, respectively, in these three cases.
+ *
+ * To facilitate accelerated sorting, an operator class may choose to
+ * offer a second procedure (BTSORTSUPPORT_PROC). For full details, see
+ * src/include/utils/sortsupport.h.
+ *
+ * To support window frames defined by "RANGE offset PRECEDING/FOLLOWING",
+ * an operator class may choose to offer a third amproc procedure
+ * (BTINRANGE_PROC), independently of whether it offers sortsupport.
+ * For full details, see doc/src/sgml/btree.sgml.
+ *
+ * To facilitate B-Tree deduplication, an operator class may choose to
+ * offer a fourth amproc procedure (BTEQUALIMAGE_PROC). For full details,
+ * see doc/src/sgml/btree.sgml.
+ */
+
+#define BTORDER_PROC 1
+#define BTSORTSUPPORT_PROC 2
+#define BTINRANGE_PROC 3
+#define BTEQUALIMAGE_PROC 4
+#define BTOPTIONS_PROC 5
+#define BTNProcs 5
+
+/*
+ * We need to be able to tell the difference between read and write
+ * requests for pages, in order to do locking correctly.
+ */
+
+#define BT_READ BUFFER_LOCK_SHARE
+#define BT_WRITE BUFFER_LOCK_EXCLUSIVE
+
+/*
+ * BTStackData -- As we descend a tree, we push the location of pivot
+ * tuples whose downlink we are about to follow onto a private stack. If
+ * we split a leaf, we use this stack to walk back up the tree and insert
+ * data into its parent page at the correct location. We also have to
+ * recursively insert into the grandparent page if and when the parent page
+ * splits. Our private stack can become stale due to concurrent page
+ * splits and page deletions, but it should never give us an irredeemably
+ * bad picture.
+ */
+typedef struct BTStackData
+{
+ BlockNumber bts_blkno;
+ OffsetNumber bts_offset;
+ struct BTStackData *bts_parent;
+} BTStackData;
+
+typedef BTStackData *BTStack;
+
+/*
+ * BTScanInsertData is the btree-private state needed to find an initial
+ * position for an indexscan, or to insert new tuples -- an "insertion
+ * scankey" (not to be confused with a search scankey). It's used to descend
+ * a B-Tree using _bt_search.
+ *
+ * heapkeyspace indicates if we expect all keys in the index to be physically
+ * unique because heap TID is used as a tiebreaker attribute, and whether
+ * the index may have truncated key attributes in pivot tuples. This is
+ * actually a property
+ * of the index relation itself (not an indexscan). heapkeyspace indexes are
+ * indexes whose version is >= version 4. It's convenient to keep this close
+ * by, rather than accessing the metapage repeatedly.
+ *
+ * allequalimage is set to indicate that deduplication is safe for the index.
+ * This is also a property of the index relation rather than an indexscan.
+ *
+ * anynullkeys indicates if any of the keys had NULL value when scankey was
+ * built from index tuple (note that already-truncated tuple key attributes
+ * set NULL as a placeholder key value, which also affects value of
+ * anynullkeys). This is a convenience for unique index non-pivot tuple
+ * insertion, which usually unsets scantid temporarily; scantid should stay
+ * set precisely when anynullkeys is true. The value generally matches the
+ * non-pivot tuple's HasNulls bit, but may not when inserting into an INCLUDE
+ * index (the tuple header value is affected by the NULL-ness of both key and
+ * non-key attributes).
+ *
+ * When nextkey is false (the usual case), _bt_search and _bt_binsrch will
+ * locate the first item >= scankey. When nextkey is true, they will locate
+ * the first item > scan key.
+ *
+ * pivotsearch is set to true by callers that want to re-find a leaf page
+ * using a scankey built from a leaf page's high key. Most callers set this
+ * to false.
+ *
+ * scantid is the heap TID that is used as a final tiebreaker attribute. It
+ * is set to NULL when index scan doesn't need to find a position for a
+ * specific physical tuple. Must be set when inserting new tuples into
+ * heapkeyspace indexes, since every tuple in the tree unambiguously belongs
+ * in one exact position (it's never set with !heapkeyspace indexes, though).
+ * Despite the representational difference, nbtree search code considers
+ * scantid to be just another insertion scankey attribute.
+ *
+ * scankeys is an array of scan key entries for attributes that are compared
+ * before scantid (user-visible attributes). keysz is the size of the array.
+ * During insertion, there must be a scan key for every attribute, but when
+ * starting a regular index scan some can be omitted. The array is used as a
+ * flexible array member, though it's sized in a way that makes it possible to
+ * use stack allocations. See nbtree/README for full details.
+ */
+typedef struct BTScanInsertData
+{
+ bool heapkeyspace;
+ bool allequalimage;
+ bool anynullkeys;
+ bool nextkey;
+ bool pivotsearch;
+ ItemPointer scantid; /* tiebreaker for scankeys */
+ int keysz; /* Size of scankeys array */
+ ScanKeyData scankeys[INDEX_MAX_KEYS]; /* Must appear last */
+} BTScanInsertData;
+
+typedef BTScanInsertData *BTScanInsert;
+
+/*
+ * BTInsertStateData is a working area used during insertion.
+ *
+ * This is filled in after descending the tree to the first leaf page the new
+ * tuple might belong on. Tracks the current position while performing
+ * uniqueness check, before we have determined which exact page to insert
+ * to.
+ *
+ * (This should be private to nbtinsert.c, but it's also used by
+ * _bt_binsrch_insert)
+ */
+typedef struct BTInsertStateData
+{
+ IndexTuple itup; /* Item we're inserting */
+ Size itemsz; /* Size of itup -- should be MAXALIGN()'d */
+ BTScanInsert itup_key; /* Insertion scankey */
+
+ /* Buffer containing leaf page we're likely to insert itup on */
+ Buffer buf;
+
+ /*
+ * Cache of bounds within the current buffer. Only used for insertions
+ * where _bt_check_unique is called. See _bt_binsrch_insert and
+ * _bt_findinsertloc for details.
+ */
+ bool bounds_valid;
+ OffsetNumber low;
+ OffsetNumber stricthigh;
+
+ /*
+ * if _bt_binsrch_insert found the location inside existing posting list,
+ * save the position inside the list. -1 sentinel value indicates overlap
+ * with an existing posting list tuple that has its LP_DEAD bit set.
+ */
+ int postingoff;
+} BTInsertStateData;
+
+typedef BTInsertStateData *BTInsertState;
+
+/*
+ * State used to represent an individual pending tuple during
+ * deduplication.
+ */
+typedef struct BTDedupInterval
+{
+ OffsetNumber baseoff;
+ uint16 nitems;
+} BTDedupInterval;
+
+/*
+ * BTDedupStateData is a working area used during deduplication.
+ *
+ * The status info fields track the state of a whole-page deduplication pass.
+ * State about the current pending posting list is also tracked.
+ *
+ * A pending posting list comprises a contiguous group of equal items from
+ * the page, starting at page offset number 'baseoff'. This is the offset
+ * number of the "base" tuple for the new posting list. 'nitems' is the
+ * current total number of existing items from the page that will be merged to
+ * make a new posting list tuple, including the base tuple item. (Existing
+ * items may themselves be posting list tuples, or regular non-pivot tuples.)
+ *
+ * 'phystupsize' tracks the total size of the existing tuples that will be
+ * freed when the pending posting list is processed. This information allows
+ * deduplication to calculate the space saving for each new posting list
+ * tuple, and for the entire pass over the page as a whole.
+ */
+typedef struct BTDedupStateData
+{
+ /* Deduplication status info for entire pass over page */
+ bool deduplicate; /* Still deduplicating page? */
+ int nmaxitems; /* Number of max-sized tuples so far */
+ Size maxpostingsize; /* Limit on size of final tuple */
+
+ /* Metadata about base tuple of current pending posting list */
+ IndexTuple base; /* Use to form new posting list */
+ OffsetNumber baseoff; /* page offset of base */
+ Size basetupsize; /* base size without original posting list */
+
+ /* Other metadata about pending posting list */
+ ItemPointer htids; /* Heap TIDs in pending posting list */
+ int nhtids; /* Number of heap TIDs in htids array */
+ int nitems; /* Number of existing tuples/line pointers */
+ Size phystupsize; /* Includes line pointer overhead */
+
+ /*
+ * Array of tuples to go on new version of the page. Contains one entry
+ * for each group of consecutive items. Note that existing tuples that
+ * will not become posting list tuples do not appear in the array (they
+ * are implicitly unchanged by deduplication pass).
+ */
+ int nintervals; /* current number of intervals in array */
+ BTDedupInterval intervals[MaxIndexTuplesPerPage];
+} BTDedupStateData;
+
+typedef BTDedupStateData *BTDedupState;
+
+/*
+ * BTVacuumPostingData is state that represents how to VACUUM (or delete) a
+ * posting list tuple when some (though not all) of its TIDs are to be
+ * deleted.
+ *
+ * The convention is that the itup field is the original posting list tuple
+ * on input, and the palloc()'d final tuple used to overwrite the existing
+ * tuple on output.
+ */
+typedef struct BTVacuumPostingData
+{
+ /* Tuple that will be/was updated */
+ IndexTuple itup;
+ OffsetNumber updatedoffset;
+
+ /* State needed to describe final itup in WAL */
+ uint16 ndeletedtids;
+ uint16 deletetids[FLEXIBLE_ARRAY_MEMBER];
+} BTVacuumPostingData;
+
+typedef BTVacuumPostingData *BTVacuumPosting;
+
+/*
+ * BTScanOpaqueData is the btree-private state needed for an indexscan.
+ * This consists of preprocessed scan keys (see _bt_preprocess_keys() for
+ * details of the preprocessing), information about the current location
+ * of the scan, and information about the marked location, if any. (We use
+ * BTScanPosData to represent the data needed for each of current and marked
+ * locations.) In addition we can remember some known-killed index entries
+ * that must be marked before we can move off the current page.
+ *
+ * Index scans work a page at a time: we pin and read-lock the page, identify
+ * all the matching items on the page and save them in BTScanPosData, then
+ * release the read-lock while returning the items to the caller for
+ * processing. This approach minimizes lock/unlock traffic. Note that we
+ * keep the pin on the index page until the caller is done with all the items
+ * (this is needed for VACUUM synchronization, see nbtree/README). When we
+ * are ready to step to the next page, if the caller has told us any of the
+ * items were killed, we re-lock the page to mark them killed, then unlock.
+ * Finally we drop the pin and step to the next page in the appropriate
+ * direction.
+ *
+ * If we are doing an index-only scan, we save the entire IndexTuple for each
+ * matched item, otherwise only its heap TID and offset. The IndexTuples go
+ * into a separate workspace array; each BTScanPosItem stores its tuple's
+ * offset within that array. Posting list tuples store a "base" tuple once,
+ * allowing the same key to be returned for each TID in the posting list
+ * tuple.
+ */
+
+typedef struct BTScanPosItem /* what we remember about each match */
+{
+ ItemPointerData heapTid; /* TID of referenced heap item */
+ OffsetNumber indexOffset; /* index item's location within page */
+ LocationIndex tupleOffset; /* IndexTuple's offset in workspace, if any */
+} BTScanPosItem;
+
+typedef struct BTScanPosData
+{
+ Buffer buf; /* if valid, the buffer is pinned */
+
+ XLogRecPtr lsn; /* pos in the WAL stream when page was read */
+ BlockNumber currPage; /* page referenced by items array */
+ BlockNumber nextPage; /* page's right link when we scanned it */
+
+ /*
+ * moreLeft and moreRight track whether we think there may be matching
+ * index entries to the left and right of the current page, respectively.
+ * We can clear the appropriate one of these flags when _bt_checkkeys()
+ * returns continuescan = false.
+ */
+ bool moreLeft;
+ bool moreRight;
+
+ /*
+ * If we are doing an index-only scan, nextTupleOffset is the first free
+ * location in the associated tuple storage workspace.
+ */
+ int nextTupleOffset;
+
+ /*
+ * The items array is always ordered in index order (ie, increasing
+ * indexoffset). When scanning backwards it is convenient to fill the
+ * array back-to-front, so we start at the last slot and fill downwards.
+ * Hence we need both a first-valid-entry and a last-valid-entry counter.
+ * itemIndex is a cursor showing which entry was last returned to caller.
+ */
+ int firstItem; /* first valid index in items[] */
+ int lastItem; /* last valid index in items[] */
+ int itemIndex; /* current index in items[] */
+
+ BTScanPosItem items[MaxTIDsPerBTreePage]; /* MUST BE LAST */
+} BTScanPosData;
+
+typedef BTScanPosData *BTScanPos;
+
+#define BTScanPosIsPinned(scanpos) \
+( \
+ AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
+ !BufferIsValid((scanpos).buf)), \
+ BufferIsValid((scanpos).buf) \
+)
+#define BTScanPosUnpin(scanpos) \
+ do { \
+ ReleaseBuffer((scanpos).buf); \
+ (scanpos).buf = InvalidBuffer; \
+ } while (0)
+#define BTScanPosUnpinIfPinned(scanpos) \
+ do { \
+ if (BTScanPosIsPinned(scanpos)) \
+ BTScanPosUnpin(scanpos); \
+ } while (0)
+
+#define BTScanPosIsValid(scanpos) \
+( \
+ AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
+ !BufferIsValid((scanpos).buf)), \
+ BlockNumberIsValid((scanpos).currPage) \
+)
+#define BTScanPosInvalidate(scanpos) \
+ do { \
+ (scanpos).currPage = InvalidBlockNumber; \
+ (scanpos).nextPage = InvalidBlockNumber; \
+ (scanpos).buf = InvalidBuffer; \
+ (scanpos).lsn = InvalidXLogRecPtr; \
+ (scanpos).nextTupleOffset = 0; \
+ } while (0)
+
+/* We need one of these for each equality-type SK_SEARCHARRAY scan key */
+typedef struct BTArrayKeyInfo
+{
+ int scan_key; /* index of associated key in arrayKeyData */
+ int cur_elem; /* index of current element in elem_values */
+ int mark_elem; /* index of marked element in elem_values */
+ int num_elems; /* number of elems in current array value */
+ Datum *elem_values; /* array of num_elems Datums */
+} BTArrayKeyInfo;
+
+typedef struct BTScanOpaqueData
+{
+ /* these fields are set by _bt_preprocess_keys(): */
+ bool qual_ok; /* false if qual can never be satisfied */
+ int numberOfKeys; /* number of preprocessed scan keys */
+ ScanKey keyData; /* array of preprocessed scan keys */
+
+ /* workspace for SK_SEARCHARRAY support */
+ ScanKey arrayKeyData; /* modified copy of scan->keyData */
+ int numArrayKeys; /* number of equality-type array keys (-1 if
+ * there are any unsatisfiable array keys) */
+ int arrayKeyCount; /* count indicating number of array scan keys
+ * processed */
+ BTArrayKeyInfo *arrayKeys; /* info about each equality-type array key */
+ MemoryContext arrayContext; /* scan-lifespan context for array data */
+
+ /* info about killed items if any (killedItems is NULL if never used) */
+ int *killedItems; /* currPos.items indexes of killed items */
+ int numKilled; /* number of currently stored items */
+
+ /*
+ * If we are doing an index-only scan, these are the tuple storage
+ * workspaces for the currPos and markPos respectively. Each is of size
+ * BLCKSZ, so it can hold as much as a full page's worth of tuples.
+ */
+ char *currTuples; /* tuple storage for currPos */
+ char *markTuples; /* tuple storage for markPos */
+
+ /*
+ * If the marked position is on the same page as current position, we
+ * don't use markPos, but just keep the marked itemIndex in markItemIndex
+ * (all the rest of currPos is valid for the mark position). Hence, to
+ * determine if there is a mark, first look at markItemIndex, then at
+ * markPos.
+ */
+ int markItemIndex; /* itemIndex, or -1 if not valid */
+
+ /* keep these last in struct for efficiency */
+ BTScanPosData currPos; /* current position data */
+ BTScanPosData markPos; /* marked position, if any */
+} BTScanOpaqueData;
+
+typedef BTScanOpaqueData *BTScanOpaque;
+
+/*
+ * We use some private sk_flags bits in preprocessed scan keys. We're allowed
+ * to use bits 16-31 (see skey.h). The uppermost bits are copied from the
+ * index's indoption[] array entry for the index attribute.
+ */
+#define SK_BT_REQFWD 0x00010000 /* required to continue forward scan */
+#define SK_BT_REQBKWD 0x00020000 /* required to continue backward scan */
+#define SK_BT_INDOPTION_SHIFT 24 /* must clear the above bits */
+#define SK_BT_DESC (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
+#define SK_BT_NULLS_FIRST (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
+
+typedef struct BTOptions
+{
+ int32 varlena_header_; /* varlena header (do not touch directly!) */
+ int fillfactor; /* page fill factor in percent (0..100) */
+ float8 vacuum_cleanup_index_scale_factor; /* deprecated */
+ bool deduplicate_items; /* Try to deduplicate items? */
+} BTOptions;
+
+#define BTGetFillFactor(relation) \
+ (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
+ relation->rd_rel->relam == BTREE_AM_OID), \
+ (relation)->rd_options ? \
+ ((BTOptions *) (relation)->rd_options)->fillfactor : \
+ BTREE_DEFAULT_FILLFACTOR)
+#define BTGetTargetPageFreeSpace(relation) \
+ (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)
+#define BTGetDeduplicateItems(relation) \
+ (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
+ relation->rd_rel->relam == BTREE_AM_OID), \
+ ((relation)->rd_options ? \
+ ((BTOptions *) (relation)->rd_options)->deduplicate_items : true))
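+
+/*
+ * For example, with the default fillfactor of 90 and an 8192-byte BLCKSZ,
+ * BTGetTargetPageFreeSpace() requests that 8192 * 10 / 100 = 819 bytes be
+ * left free on each leaf page.
+ */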
+
+/*
+ * Constant definition for progress reporting. Phase numbers must match
+ * btbuildphasename.
+ */
+/* PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE is 1 (see progress.h) */
+#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN 2
+#define PROGRESS_BTREE_PHASE_PERFORMSORT_1 3
+#define PROGRESS_BTREE_PHASE_PERFORMSORT_2 4
+#define PROGRESS_BTREE_PHASE_LEAF_LOAD 5
+
+/*
+ * external entry points for btree, in nbtree.c
+ */
+extern void btbuildempty(Relation index);
+extern bool btinsert(Relation rel, Datum *values, bool *isnull,
+ ItemPointer ht_ctid, Relation heapRel,
+ IndexUniqueCheck checkUnique,
+ bool indexUnchanged,
+ struct IndexInfo *indexInfo);
+extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys);
+extern Size btestimateparallelscan(void);
+extern void btinitparallelscan(void *target);
+extern bool btgettuple(IndexScanDesc scan, ScanDirection dir);
+extern int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
+extern void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
+ ScanKey orderbys, int norderbys);
+extern void btparallelrescan(IndexScanDesc scan);
+extern void btendscan(IndexScanDesc scan);
+extern void btmarkpos(IndexScanDesc scan);
+extern void btrestrpos(IndexScanDesc scan);
+extern IndexBulkDeleteResult *btbulkdelete(IndexVacuumInfo *info,
+ IndexBulkDeleteResult *stats,
+ IndexBulkDeleteCallback callback,
+ void *callback_state);
+extern IndexBulkDeleteResult *btvacuumcleanup(IndexVacuumInfo *info,
+ IndexBulkDeleteResult *stats);
+extern bool btcanreturn(Relation index, int attno);
+
+/*
+ * prototypes for internal functions in nbtree.c
+ */
+extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno);
+extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page);
+extern void _bt_parallel_done(IndexScanDesc scan);
+extern void _bt_parallel_advance_array_keys(IndexScanDesc scan);
+
+/*
+ * prototypes for functions in nbtdedup.c
+ */
+extern void _bt_dedup_pass(Relation rel, Buffer buf, Relation heapRel,
+ IndexTuple newitem, Size newitemsz,
+ bool bottomupdedup);
+extern bool _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel,
+ Size newitemsz);
+extern void _bt_dedup_start_pending(BTDedupState state, IndexTuple base,
+ OffsetNumber baseoff);
+extern bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup);
+extern Size _bt_dedup_finish_pending(Page newpage, BTDedupState state);
+extern IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids,
+ int nhtids);
+extern void _bt_update_posting(BTVacuumPosting vacposting);
+extern IndexTuple _bt_swap_posting(IndexTuple newitem, IndexTuple oposting,
+ int postingoff);
+
+/*
+ * prototypes for functions in nbtinsert.c
+ */
+extern bool _bt_doinsert(Relation rel, IndexTuple itup,
+ IndexUniqueCheck checkUnique, bool indexUnchanged,
+ Relation heapRel);
+extern void _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack);
+extern Buffer _bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child);
+
+/*
+ * prototypes for functions in nbtsplitloc.c
+ */
+extern OffsetNumber _bt_findsplitloc(Relation rel, Page origpage,
+ OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
+ bool *newitemonleft);
+
+/*
+ * prototypes for functions in nbtpage.c
+ */
+extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
+ bool allequalimage);
+extern bool _bt_vacuum_needs_cleanup(Relation rel);
+extern void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages);
+extern void _bt_upgrademetapage(Page page);
+extern Buffer _bt_getroot(Relation rel, int access);
+extern Buffer _bt_gettrueroot(Relation rel);
+extern int _bt_getrootheight(Relation rel);
+extern void _bt_metaversion(Relation rel, bool *heapkeyspace,
+ bool *allequalimage);
+extern void _bt_checkpage(Relation rel, Buffer buf);
+extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
+extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
+ BlockNumber blkno, int access);
+extern void _bt_relbuf(Relation rel, Buffer buf);
+extern void _bt_lockbuf(Relation rel, Buffer buf, int access);
+extern void _bt_unlockbuf(Relation rel, Buffer buf);
+extern bool _bt_conditionallockbuf(Relation rel, Buffer buf);
+extern void _bt_upgradelockbufcleanup(Relation rel, Buffer buf);
+extern void _bt_pageinit(Page page, Size size);
+extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
+ OffsetNumber *deletable, int ndeletable,
+ BTVacuumPosting *updatable, int nupdatable);
+extern void _bt_delitems_delete_check(Relation rel, Buffer buf,
+ Relation heapRel,
+ TM_IndexDeleteOp *delstate);
+extern void _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate);
+extern void _bt_pendingfsm_init(Relation rel, BTVacState *vstate,
+ bool cleanuponly);
+extern void _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate);
+
+/*
+ * prototypes for functions in nbtsearch.c
+ */
+extern BTStack _bt_search(Relation rel, BTScanInsert key, Buffer *bufP,
+ int access, Snapshot snapshot);
+extern Buffer _bt_moveright(Relation rel, BTScanInsert key, Buffer buf,
+ bool forupdate, BTStack stack, int access, Snapshot snapshot);
+extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate);
+extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum);
+extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
+extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
+extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
+ Snapshot snapshot);
+
+/*
+ * prototypes for functions in nbtutils.c
+ */
+extern BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup);
+extern void _bt_freestack(BTStack stack);
+extern void _bt_preprocess_array_keys(IndexScanDesc scan);
+extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir);
+extern bool _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir);
+extern void _bt_mark_array_keys(IndexScanDesc scan);
+extern void _bt_restore_array_keys(IndexScanDesc scan);
+extern void _bt_preprocess_keys(IndexScanDesc scan);
+extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple,
+ int tupnatts, ScanDirection dir, bool *continuescan);
+extern void _bt_killitems(IndexScanDesc scan);
+extern BTCycleId _bt_vacuum_cycleid(Relation rel);
+extern BTCycleId _bt_start_vacuum(Relation rel);
+extern void _bt_end_vacuum(Relation rel);
+extern void _bt_end_vacuum_callback(int code, Datum arg);
+extern Size BTreeShmemSize(void);
+extern void BTreeShmemInit(void);
+extern bytea *btoptions(Datum reloptions, bool validate);
+extern bool btproperty(Oid index_oid, int attno,
+ IndexAMProperty prop, const char *propname,
+ bool *res, bool *isnull);
+extern char *btbuildphasename(int64 phasenum);
+extern IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft,
+ IndexTuple firstright, BTScanInsert itup_key);
+extern int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft,
+ IndexTuple firstright);
+extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page,
+ OffsetNumber offnum);
+extern void _bt_check_third_page(Relation rel, Relation heap,
+ bool needheaptidspace, Page page, IndexTuple newtup);
+extern bool _bt_allequalimage(Relation rel, bool debugmessage);
+
+/*
+ * prototypes for functions in nbtvalidate.c
+ */
+extern bool btvalidate(Oid opclassoid);
+extern void btadjustmembers(Oid opfamilyoid,
+ Oid opclassoid,
+ List *operators,
+ List *functions);
+
+/*
+ * prototypes for functions in nbtsort.c
+ */
+extern IndexBuildResult *btbuild(Relation heap, Relation index,
+ struct IndexInfo *indexInfo);
+extern void _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc);
+
+#endif /* NBTREE_H */
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h
new file mode 100644
index 0000000..0f77318
--- /dev/null
+++ b/src/include/access/nbtxlog.h
@@ -0,0 +1,351 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtxlog.h
+ * header file for postgres btree xlog routines
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/nbtxlog.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef NBTXLOG_H
+#define NBTXLOG_H
+
+#include "access/transam.h"
+#include "access/xlogreader.h"
+#include "lib/stringinfo.h"
+#include "storage/off.h"
+
+/*
+ * XLOG records for btree operations
+ *
+ * XLOG allows some information to be stored in the high 4 bits of the
+ * log record's xl_info field.
+ */
+#define XLOG_BTREE_INSERT_LEAF 0x00 /* add index tuple without split */
+#define XLOG_BTREE_INSERT_UPPER 0x10 /* same, on a non-leaf page */
+#define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */
+#define XLOG_BTREE_SPLIT_L 0x30 /* add index tuple with split */
+#define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */
+#define XLOG_BTREE_INSERT_POST 0x50 /* add index tuple with posting split */
+#define XLOG_BTREE_DEDUP 0x60 /* deduplicate tuples for a page */
+#define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuples for a page */
+#define XLOG_BTREE_UNLINK_PAGE 0x80 /* delete a half-dead page */
+#define XLOG_BTREE_UNLINK_PAGE_META 0x90 /* same, and update metapage */
+#define XLOG_BTREE_NEWROOT 0xA0 /* new root page */
+#define XLOG_BTREE_MARK_PAGE_HALFDEAD 0xB0 /* mark a leaf as half-dead */
+#define XLOG_BTREE_VACUUM 0xC0 /* delete entries on a page during
+ * vacuum */
+#define XLOG_BTREE_REUSE_PAGE 0xD0 /* old page is about to be reused from
+ * FSM */
+#define XLOG_BTREE_META_CLEANUP 0xE0 /* update cleanup-related data in the
+ * metapage */
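
As an orientation aid, a redo or identify routine typically dispatches on these high info bits after masking off the low flag bits. The following is a simplified, hypothetical sketch (not the actual nbtxlog.c code), assuming XLogRecGetInfo() from xlogreader.h and XLR_INFO_MASK from xlogrecord.h:

    static void
    toy_btree_dispatch(XLogReaderState *record)
    {
        uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;

        switch (info)
        {
            case XLOG_BTREE_INSERT_LEAF:
                /* replay a simple insertion onto a leaf page */
                break;
            case XLOG_BTREE_SPLIT_L:
            case XLOG_BTREE_SPLIT_R:
                /* replay a page split; the variant says where the new item went */
                break;
            default:
                elog(PANIC, "unrecognized btree record type: %u", info);
        }
    }
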
+
+/*
+ * All that we need to regenerate the meta-data page
+ */
+typedef struct xl_btree_metadata
+{
+ uint32 version;
+ BlockNumber root;
+ uint32 level;
+ BlockNumber fastroot;
+ uint32 fastlevel;
+ uint32 last_cleanup_num_delpages;
+ bool allequalimage;
+} xl_btree_metadata;
+
+/*
+ * This is what we need to know about simple (without split) insert.
+ *
+ * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META, and
+ * INSERT_POST. Note that INSERT_META and INSERT_UPPER imply it's not a
+ * leaf page, while INSERT_POST and INSERT_LEAF imply that it must be a leaf
+ * page.
+ *
+ * Backup Blk 0: original page
+ * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META
+ * Backup Blk 2: xl_btree_metadata, if INSERT_META
+ *
+ * Note: The new tuple is actually the "original" new item in the posting
+ * list split insert case (i.e. the INSERT_POST case). A split offset for
+ * the posting list is logged before the original new item. Recovery needs
+ * both, since it must do an in-place update of the existing posting list
+ * that was split as an extra step. Also, recovery generates a "final"
+ * newitem. See _bt_swap_posting() for details on posting list splits.
+ */
+typedef struct xl_btree_insert
+{
+ OffsetNumber offnum;
+
+ /* POSTING SPLIT OFFSET FOLLOWS (INSERT_POST case) */
+ /* NEW TUPLE ALWAYS FOLLOWS AT THE END */
+} xl_btree_insert;
+
+#define SizeOfBtreeInsert (offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber))
+
+/*
+ * On insert with split, we save all the items going into the right sibling
+ * so that we can restore it completely from the log record. This takes less
+ * xlog space than logging the right page in the usual way, because XLogInsert
+ * would almost always think the right page is new and store its whole page
+ * image. The left page, however, is handled in the normal
+ * incremental-update fashion.
+ *
+ * Note: XLOG_BTREE_SPLIT_L and XLOG_BTREE_SPLIT_R share this data record.
+ * There are two variants to indicate whether the inserted tuple went into the
+ * left or right split page (and thus, whether the new item is stored or not).
+ * We always log the left page high key because suffix truncation can generate
+ * a new leaf high key using user-defined code. This is also necessary on
+ * internal pages, since the firstright item that the left page's high key was
+ * based on will have been truncated to zero attributes in the right page (the
+ * separator key is unavailable from the right page).
+ *
+ * Backup Blk 0: original page / new left page
+ *
+ * The left page's data portion contains the new item, if it's the _L variant.
+ * _R variant split records generally do not have a newitem (_R variant leaf
+ * page split records that must deal with a posting list split will include an
+ * explicit newitem, though it is never used on the right page -- it is
+ * actually an orignewitem needed to update the existing posting list). The new
+ * high key of the left/original page appears last of all (and must always be
+ * present).
+ *
+ * Page split records that need the REDO routine to deal with a posting list
+ * split directly will have an explicit newitem, which is actually an
+ * orignewitem (the newitem as it was before the posting list split, not
+ * after). A posting list split always has a newitem that comes immediately
+ * after the posting list being split (which would have overlapped with
+ * orignewitem prior to split). Usually REDO must deal with posting list
+ * splits with an _L variant page split record, and usually both the new
+ * posting list and the final newitem go on the left page (the new posting
+ * list tuple replaces the existing one in place, and the final newitem is
+ * inserted next to it). However, _R variant split records will
+ * include an orignewitem when the split point for the page happens to have a
+ * lastleft tuple that is also the posting list being split (leaving newitem
+ * as the page split's firstright tuple). The existence of this corner case
+ * does not change the basic fact about newitem/orignewitem for the REDO
+ * routine: it is always state used for the left page alone. (This is why the
+ * record's postingoff field isn't a reliable indicator of whether or not a
+ * posting list split occurred during the page split; a non-zero value merely
+ * indicates that the REDO routine must reconstruct a new posting list tuple
+ * that is needed for the left page.)
+ *
+ * This posting list split handling is equivalent to the xl_btree_insert REDO
+ * routine's INSERT_POST handling. While the details are more complicated
+ * here, the concept and goals are exactly the same. See _bt_swap_posting()
+ * for details on posting list splits.
+ *
+ * Backup Blk 1: new right page
+ *
+ * The right page's data portion contains the right page's tuples in the form
+ * used by _bt_restore_page. This includes the new item, if it's the _R
+ * variant. The right page's tuples also include the right page's high key
+ * with either variant (moved from the left/original page during the split),
+ * unless the split happened to be of the rightmost page on its level, in
+ * which case there is no high key for the new right page.
+ *
+ * Backup Blk 2: next block (orig page's rightlink), if any
+ * Backup Blk 3: child's left sibling, if non-leaf split
+ */
+typedef struct xl_btree_split
+{
+ uint32 level; /* tree level of page being split */
+ OffsetNumber firstrightoff; /* first origpage item on rightpage */
+ OffsetNumber newitemoff; /* new item's offset */
+ uint16 postingoff; /* offset inside orig posting tuple */
+} xl_btree_split;
+
+#define SizeOfBtreeSplit (offsetof(xl_btree_split, postingoff) + sizeof(uint16))
+
+/*
+ * When page is deduplicated, consecutive groups of tuples with equal keys are
+ * merged together into posting list tuples.
+ *
+ * The WAL record represents a deduplication pass for a leaf page. An array
+ * of BTDedupInterval structs follows.
+ */
+typedef struct xl_btree_dedup
+{
+ uint16 nintervals;
+
+ /* DEDUPLICATION INTERVALS FOLLOW */
+} xl_btree_dedup;
+
+#define SizeOfBtreeDedup (offsetof(xl_btree_dedup, nintervals) + sizeof(uint16))
+
+/*
+ * This is what we need to know about page reuse within btree. This record
+ * only exists to generate a conflict point for Hot Standby.
+ *
+ * Note that we must include a RelFileNode in the record because we don't
+ * actually register the buffer with the record.
+ */
+typedef struct xl_btree_reuse_page
+{
+ RelFileNode node;
+ BlockNumber block;
+ FullTransactionId latestRemovedFullXid;
+} xl_btree_reuse_page;
+
+#define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page))
+
+/*
+ * xl_btree_vacuum and xl_btree_delete records describe deletion of index
+ * tuples on a leaf page. The former variant is used by VACUUM, while the
+ * latter variant is used by the ad-hoc deletions that sometimes take place
+ * when btinsert() is called.
+ *
+ * The records are very similar. The only difference is that xl_btree_delete
+ * has to include a latestRemovedXid field to generate recovery conflicts.
+ * (VACUUM operations can just rely on earlier conflicts generated during
+ * pruning of the table whose TIDs the to-be-deleted index tuples point to.
+ * There are also small differences between each REDO routine that we don't go
+ * into here.)
+ *
+ * xl_btree_vacuum and xl_btree_delete both represent deletion of any number
+ * of index tuples on a single leaf page using page offset numbers. Both also
+ * support "updates" of index tuples, which is how deletes of a subset of TIDs
+ * contained in an existing posting list tuple are implemented.
+ *
+ * Updated posting list tuples are represented using xl_btree_update metadata.
+ * The REDO routines each use the xl_btree_update entries (plus each
+ * corresponding original index tuple from the target leaf page) to generate
+ * the final updated tuple.
+ *
+ * Updates are only used when there will be some remaining TIDs left by the
+ * REDO routine. Otherwise the posting list tuple just gets deleted outright.
+ */
+typedef struct xl_btree_vacuum
+{
+ uint16 ndeleted;
+ uint16 nupdated;
+
+ /* DELETED TARGET OFFSET NUMBERS FOLLOW */
+ /* UPDATED TARGET OFFSET NUMBERS FOLLOW */
+ /* UPDATED TUPLES METADATA (xl_btree_update) ARRAY FOLLOWS */
+} xl_btree_vacuum;
+
+#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, nupdated) + sizeof(uint16))
+
+typedef struct xl_btree_delete
+{
+ TransactionId latestRemovedXid;
+ uint16 ndeleted;
+ uint16 nupdated;
+
+ /* DELETED TARGET OFFSET NUMBERS FOLLOW */
+ /* UPDATED TARGET OFFSET NUMBERS FOLLOW */
+ /* UPDATED TUPLES METADATA (xl_btree_update) ARRAY FOLLOWS */
+} xl_btree_delete;
+
+#define SizeOfBtreeDelete (offsetof(xl_btree_delete, nupdated) + sizeof(uint16))
+
+/*
+ * The offsets that appear in xl_btree_update metadata are offsets into the
+ * posting list of the original tuple being updated, not page offset numbers. These are
+ * 0-based. The page offset number for the original posting list tuple comes
+ * from the main xl_btree_vacuum/xl_btree_delete record.
+ */
+typedef struct xl_btree_update
+{
+ uint16 ndeletedtids;
+
+ /* POSTING LIST uint16 OFFSETS TO A DELETED TID FOLLOW */
+} xl_btree_update;
+
+#define SizeOfBtreeUpdate (offsetof(xl_btree_update, ndeletedtids) + sizeof(uint16))
+
+/*
+ * This is what we need to know about marking an empty subtree for deletion.
+ * The target identifies the tuple removed from the parent page (note that we
+ * remove this tuple's downlink and the *following* tuple's key). Note that
+ * the leaf page is empty, so we don't need to store its content --- it is
+ * just reinitialized during recovery using the rest of the fields.
+ *
+ * Backup Blk 0: leaf block
+ * Backup Blk 1: top parent
+ */
+typedef struct xl_btree_mark_page_halfdead
+{
+ OffsetNumber poffset; /* deleted tuple id in parent page */
+
+ /* information needed to recreate the leaf page: */
+ BlockNumber leafblk; /* leaf block ultimately being deleted */
+ BlockNumber leftblk; /* leaf block's left sibling, if any */
+ BlockNumber rightblk; /* leaf block's right sibling */
+ BlockNumber topparent; /* topmost internal page in the subtree */
+} xl_btree_mark_page_halfdead;
+
+#define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber))
+
+/*
+ * This is what we need to know about deletion of a btree page. Note that we
+ * only leave behind a small amount of bookkeeping information in deleted
+ * pages (deleted pages must be kept around as tombstones for a while). It is
+ * convenient for the REDO routine to regenerate its target page from scratch.
+ * This is why the WAL record describes certain details that are actually directly
+ * available from the target page.
+ *
+ * Backup Blk 0: target block being deleted
+ * Backup Blk 1: target block's left sibling, if any
+ * Backup Blk 2: target block's right sibling
+ * Backup Blk 3: leaf block (if different from target)
+ * Backup Blk 4: metapage (if rightsib becomes new fast root)
+ */
+typedef struct xl_btree_unlink_page
+{
+ BlockNumber leftsib; /* target block's left sibling, if any */
+ BlockNumber rightsib; /* target block's right sibling */
+ uint32 level; /* target block's level */
+ FullTransactionId safexid; /* target block's BTPageSetDeleted() XID */
+
+ /*
+ * Information needed to recreate a half-dead leaf page with correct
+ * topparent link. The fields are only used when the deletion operation's
+ * target page is an internal page. The REDO routine creates the half-dead
+ * page from scratch to keep things simple (this is the same convenient
+ * approach used for the target page itself).
+ */
+ BlockNumber leafleftsib;
+ BlockNumber leafrightsib;
+ BlockNumber leaftopparent; /* next child down in the subtree */
+
+ /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */
+} xl_btree_unlink_page;
+
+#define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, leaftopparent) + sizeof(BlockNumber))
+
+/*
+ * New root log record. There are zero tuples if this is to establish an
+ * empty root, or two if it is the result of splitting an old root.
+ *
+ * Note that although this implies rewriting the metadata page, we don't need
+ * an xl_btree_metadata record --- the rootblk and level are sufficient.
+ *
+ * Backup Blk 0: new root page (2 tuples as payload, if splitting old root)
+ * Backup Blk 1: left child (if splitting an old root)
+ * Backup Blk 2: metapage
+ */
+typedef struct xl_btree_newroot
+{
+ BlockNumber rootblk; /* location of new root (redundant with blk 0) */
+ uint32 level; /* its tree level */
+} xl_btree_newroot;
+
+#define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, level) + sizeof(uint32))
+
+
+/*
+ * prototypes for functions in nbtxlog.c
+ */
+extern void btree_redo(XLogReaderState *record);
+extern void btree_desc(StringInfo buf, XLogReaderState *record);
+extern const char *btree_identify(uint8 info);
+extern void btree_xlog_startup(void);
+extern void btree_xlog_cleanup(void);
+extern void btree_mask(char *pagedata, BlockNumber blkno);
+
+#endif /* NBTXLOG_H */
diff --git a/src/include/access/parallel.h b/src/include/access/parallel.h
new file mode 100644
index 0000000..93d88ac
--- /dev/null
+++ b/src/include/access/parallel.h
@@ -0,0 +1,82 @@
+/*-------------------------------------------------------------------------
+ *
+ * parallel.h
+ * Infrastructure for launching parallel workers
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/parallel.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef PARALLEL_H
+#define PARALLEL_H
+
+#include "access/xlogdefs.h"
+#include "lib/ilist.h"
+#include "postmaster/bgworker.h"
+#include "storage/shm_mq.h"
+#include "storage/shm_toc.h"
+
+typedef void (*parallel_worker_main_type) (dsm_segment *seg, shm_toc *toc);
+
+typedef struct ParallelWorkerInfo
+{
+ BackgroundWorkerHandle *bgwhandle;
+ shm_mq_handle *error_mqh;
+ int32 pid;
+} ParallelWorkerInfo;
+
+typedef struct ParallelContext
+{
+ dlist_node node;
+ SubTransactionId subid;
+ int nworkers; /* Maximum number of workers to launch */
+ int nworkers_to_launch; /* Actual number of workers to launch */
+ int nworkers_launched;
+ char *library_name;
+ char *function_name;
+ ErrorContextCallback *error_context_stack;
+ shm_toc_estimator estimator;
+ dsm_segment *seg;
+ void *private_memory;
+ shm_toc *toc;
+ ParallelWorkerInfo *worker;
+ int nknown_attached_workers;
+ bool *known_attached_workers;
+} ParallelContext;
+
+typedef struct ParallelWorkerContext
+{
+ dsm_segment *seg;
+ shm_toc *toc;
+} ParallelWorkerContext;
+
+extern volatile bool ParallelMessagePending;
+extern PGDLLIMPORT int ParallelWorkerNumber;
+extern PGDLLIMPORT bool InitializingParallelWorker;
+
+#define IsParallelWorker() (ParallelWorkerNumber >= 0)
+
+extern ParallelContext *CreateParallelContext(const char *library_name,
+ const char *function_name, int nworkers);
+extern void InitializeParallelDSM(ParallelContext *pcxt);
+extern void ReinitializeParallelDSM(ParallelContext *pcxt);
+extern void ReinitializeParallelWorkers(ParallelContext *pcxt, int nworkers_to_launch);
+extern void LaunchParallelWorkers(ParallelContext *pcxt);
+extern void WaitForParallelWorkersToAttach(ParallelContext *pcxt);
+extern void WaitForParallelWorkersToFinish(ParallelContext *pcxt);
+extern void DestroyParallelContext(ParallelContext *pcxt);
+extern bool ParallelContextActive(void);
+
+extern void HandleParallelMessageInterrupt(void);
+extern void HandleParallelMessages(void);
+extern void AtEOXact_Parallel(bool isCommit);
+extern void AtEOSubXact_Parallel(bool isCommit, SubTransactionId mySubId);
+extern void ParallelWorkerReportLastRecEnd(XLogRecPtr last_xlog_end);
+
+extern void ParallelWorkerMain(Datum main_arg);
+
+#endif /* PARALLEL_H */
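
A hedged sketch of the usual lifecycle of this API; the library name "my_extension", entry point "my_worker_main", key MY_SHM_KEY, and payload size are purely illustrative. EnterParallelMode()/ExitParallelMode() come from access/xact.h and the shm_toc_* helpers from storage/shm_toc.h:

    EnterParallelMode();

    pcxt = CreateParallelContext("my_extension", "my_worker_main", 2);
    shm_toc_estimate_chunk(&pcxt->estimator, payload_size);
    shm_toc_estimate_keys(&pcxt->estimator, 1);
    InitializeParallelDSM(pcxt);

    payload = shm_toc_allocate(pcxt->toc, payload_size);
    /* ... fill payload with whatever my_worker_main expects to find ... */
    shm_toc_insert(pcxt->toc, MY_SHM_KEY, payload);

    LaunchParallelWorkers(pcxt);
    /* the leader may do its own share of the work here */
    WaitForParallelWorkersToFinish(pcxt);
    DestroyParallelContext(pcxt);

    ExitParallelMode();

Inside my_worker_main(dsm_segment *seg, shm_toc *toc), the worker would retrieve the same payload with shm_toc_lookup(toc, MY_SHM_KEY, false).
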
diff --git a/src/include/access/printsimple.h b/src/include/access/printsimple.h
new file mode 100644
index 0000000..67a9950
--- /dev/null
+++ b/src/include/access/printsimple.h
@@ -0,0 +1,23 @@
+/*-------------------------------------------------------------------------
+ *
+ * printsimple.h
+ * print simple tuples without catalog access
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/printsimple.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef PRINTSIMPLE_H
+#define PRINTSIMPLE_H
+
+#include "tcop/dest.h"
+
+extern bool printsimple(TupleTableSlot *slot, DestReceiver *self);
+extern void printsimple_startup(DestReceiver *self, int operation,
+ TupleDesc tupdesc);
+
+#endif /* PRINTSIMPLE_H */
diff --git a/src/include/access/printtup.h b/src/include/access/printtup.h
new file mode 100644
index 0000000..c9b3753
--- /dev/null
+++ b/src/include/access/printtup.h
@@ -0,0 +1,35 @@
+/*-------------------------------------------------------------------------
+ *
+ * printtup.h
+ *
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/printtup.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PRINTTUP_H
+#define PRINTTUP_H
+
+#include "utils/portal.h"
+
+extern DestReceiver *printtup_create_DR(CommandDest dest);
+
+extern void SetRemoteDestReceiverParams(DestReceiver *self, Portal portal);
+
+extern void SendRowDescriptionMessage(StringInfo buf,
+ TupleDesc typeinfo, List *targetlist, int16 *formats);
+
+extern void debugStartup(DestReceiver *self, int operation,
+ TupleDesc typeinfo);
+extern bool debugtup(TupleTableSlot *slot, DestReceiver *self);
+
+/* XXX these are really in executor/spi.c */
+extern void spi_dest_startup(DestReceiver *self, int operation,
+ TupleDesc typeinfo);
+extern bool spi_printtup(TupleTableSlot *slot, DestReceiver *self);
+
+#endif /* PRINTTUP_H */
diff --git a/src/include/access/relation.h b/src/include/access/relation.h
new file mode 100644
index 0000000..fd77a13
--- /dev/null
+++ b/src/include/access/relation.h
@@ -0,0 +1,28 @@
+/*-------------------------------------------------------------------------
+ *
+ * relation.h
+ * Generic relation related routines.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/relation.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef ACCESS_RELATION_H
+#define ACCESS_RELATION_H
+
+#include "nodes/primnodes.h"
+#include "storage/lockdefs.h"
+#include "utils/relcache.h"
+
+extern Relation relation_open(Oid relationId, LOCKMODE lockmode);
+extern Relation try_relation_open(Oid relationId, LOCKMODE lockmode);
+extern Relation relation_openrv(const RangeVar *relation, LOCKMODE lockmode);
+extern Relation relation_openrv_extended(const RangeVar *relation,
+ LOCKMODE lockmode, bool missing_ok);
+extern void relation_close(Relation relation, LOCKMODE lockmode);
+
+#endif /* ACCESS_RELATION_H */
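
A minimal usage sketch, assuming some relation OID relid is already at hand (AccessShareLock is declared in storage/lockdefs.h, which this header includes):

    Relation    rel = relation_open(relid, AccessShareLock);

    /* ... inspect rel, e.g. rel->rd_rel->relkind, or start a scan on it ... */

    relation_close(rel, AccessShareLock);   /* drops the relcache reference and the lock */
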
diff --git a/src/include/access/reloptions.h b/src/include/access/reloptions.h
new file mode 100644
index 0000000..7c5fbeb
--- /dev/null
+++ b/src/include/access/reloptions.h
@@ -0,0 +1,247 @@
+/*-------------------------------------------------------------------------
+ *
+ * reloptions.h
+ * Core support for relation and tablespace options (pg_class.reloptions
+ * and pg_tablespace.spcoptions)
+ *
+ * Note: the functions dealing with text-array reloptions values declare
+ * them as Datum, not ArrayType *, to avoid needing to include array.h
+ * into a lot of low-level code.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/reloptions.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef RELOPTIONS_H
+#define RELOPTIONS_H
+
+#include "access/amapi.h"
+#include "access/htup.h"
+#include "access/tupdesc.h"
+#include "nodes/pg_list.h"
+#include "storage/lock.h"
+
+/* types supported by reloptions */
+typedef enum relopt_type
+{
+ RELOPT_TYPE_BOOL,
+ RELOPT_TYPE_INT,
+ RELOPT_TYPE_REAL,
+ RELOPT_TYPE_ENUM,
+ RELOPT_TYPE_STRING
+} relopt_type;
+
+/* kinds supported by reloptions */
+typedef enum relopt_kind
+{
+ RELOPT_KIND_LOCAL = 0,
+ RELOPT_KIND_HEAP = (1 << 0),
+ RELOPT_KIND_TOAST = (1 << 1),
+ RELOPT_KIND_BTREE = (1 << 2),
+ RELOPT_KIND_HASH = (1 << 3),
+ RELOPT_KIND_GIN = (1 << 4),
+ RELOPT_KIND_GIST = (1 << 5),
+ RELOPT_KIND_ATTRIBUTE = (1 << 6),
+ RELOPT_KIND_TABLESPACE = (1 << 7),
+ RELOPT_KIND_SPGIST = (1 << 8),
+ RELOPT_KIND_VIEW = (1 << 9),
+ RELOPT_KIND_BRIN = (1 << 10),
+ RELOPT_KIND_PARTITIONED = (1 << 11),
+ /* if you add a new kind, make sure you update "last_default" too */
+ RELOPT_KIND_LAST_DEFAULT = RELOPT_KIND_PARTITIONED,
+ /* some compilers treat enums as signed ints, so we can't use 1 << 31 */
+ RELOPT_KIND_MAX = (1 << 30)
+} relopt_kind;
+
+/* reloption namespaces allowed for heaps -- currently only TOAST */
+#define HEAP_RELOPT_NAMESPACES { "toast", NULL }
+
+/* generic struct to hold shared data */
+typedef struct relopt_gen
+{
+ const char *name; /* must be first (used as list termination
+ * marker) */
+ const char *desc;
+ bits32 kinds;
+ LOCKMODE lockmode;
+ int namelen;
+ relopt_type type;
+} relopt_gen;
+
+/* holds a parsed value */
+typedef struct relopt_value
+{
+ relopt_gen *gen;
+ bool isset;
+ union
+ {
+ bool bool_val;
+ int int_val;
+ double real_val;
+ int enum_val;
+ char *string_val; /* allocated separately */
+ } values;
+} relopt_value;
+
+/* reloptions records for specific variable types */
+typedef struct relopt_bool
+{
+ relopt_gen gen;
+ bool default_val;
+} relopt_bool;
+
+typedef struct relopt_int
+{
+ relopt_gen gen;
+ int default_val;
+ int min;
+ int max;
+} relopt_int;
+
+typedef struct relopt_real
+{
+ relopt_gen gen;
+ double default_val;
+ double min;
+ double max;
+} relopt_real;
+
+/*
+ * relopt_enum_elt_def -- One member of the array of acceptable values
+ * of an enum reloption.
+ */
+typedef struct relopt_enum_elt_def
+{
+ const char *string_val;
+ int symbol_val;
+} relopt_enum_elt_def;
+
+typedef struct relopt_enum
+{
+ relopt_gen gen;
+ relopt_enum_elt_def *members; /* null-terminated array of members */
+ int default_val;
+ const char *detailmsg;
+} relopt_enum;
+
+/* validation routines for strings */
+typedef void (*validate_string_relopt) (const char *value);
+typedef Size (*fill_string_relopt) (const char *value, void *ptr);
+
+/* validation routine for the whole option set */
+typedef void (*relopts_validator) (void *parsed_options, relopt_value *vals, int nvals);
+
+typedef struct relopt_string
+{
+ relopt_gen gen;
+ int default_len;
+ bool default_isnull;
+ validate_string_relopt validate_cb;
+ fill_string_relopt fill_cb;
+ char *default_val;
+} relopt_string;
+
+/* This is the table datatype for build_reloptions() */
+typedef struct
+{
+ const char *optname; /* option's name */
+ relopt_type opttype; /* option's datatype */
+ int offset; /* offset of field in result struct */
+} relopt_parse_elt;
+
+/* Local reloption definition */
+typedef struct local_relopt
+{
+ relopt_gen *option; /* option definition */
+ int offset; /* offset of parsed value in bytea structure */
+} local_relopt;
+
+/* Structure to hold local reloption data for build_local_reloptions() */
+typedef struct local_relopts
+{
+ List *options; /* list of local_relopt definitions */
+ List *validators; /* list of relopts_validator callbacks */
+ Size relopt_struct_size; /* size of parsed bytea structure */
+} local_relopts;
+
+/*
+ * Utility macro to get a value for a string reloption once the options
+ * are parsed. This gets a pointer to the string value itself. "optstruct"
+ * is the StdRdOptions struct or equivalent, "member" is the struct member
+ * corresponding to the string option.
+ */
+#define GET_STRING_RELOPTION(optstruct, member) \
+ ((optstruct)->member == 0 ? NULL : \
+ (char *)(optstruct) + (optstruct)->member)
+
+extern relopt_kind add_reloption_kind(void);
+extern void add_bool_reloption(bits32 kinds, const char *name, const char *desc,
+ bool default_val, LOCKMODE lockmode);
+extern void add_int_reloption(bits32 kinds, const char *name, const char *desc,
+ int default_val, int min_val, int max_val,
+ LOCKMODE lockmode);
+extern void add_real_reloption(bits32 kinds, const char *name, const char *desc,
+ double default_val, double min_val, double max_val,
+ LOCKMODE lockmode);
+extern void add_enum_reloption(bits32 kinds, const char *name, const char *desc,
+ relopt_enum_elt_def *members, int default_val,
+ const char *detailmsg, LOCKMODE lockmode);
+extern void add_string_reloption(bits32 kinds, const char *name, const char *desc,
+ const char *default_val, validate_string_relopt validator,
+ LOCKMODE lockmode);
+
+extern void init_local_reloptions(local_relopts *opts, Size relopt_struct_size);
+extern void register_reloptions_validator(local_relopts *opts,
+ relopts_validator validator);
+extern void add_local_bool_reloption(local_relopts *opts, const char *name,
+ const char *desc, bool default_val,
+ int offset);
+extern void add_local_int_reloption(local_relopts *opts, const char *name,
+ const char *desc, int default_val,
+ int min_val, int max_val, int offset);
+extern void add_local_real_reloption(local_relopts *opts, const char *name,
+ const char *desc, double default_val,
+ double min_val, double max_val,
+ int offset);
+extern void add_local_enum_reloption(local_relopts *relopts,
+ const char *name, const char *desc,
+ relopt_enum_elt_def *members,
+ int default_val, const char *detailmsg,
+ int offset);
+extern void add_local_string_reloption(local_relopts *opts, const char *name,
+ const char *desc,
+ const char *default_val,
+ validate_string_relopt validator,
+ fill_string_relopt filler, int offset);
+
+extern Datum transformRelOptions(Datum oldOptions, List *defList,
+ const char *namspace, char *validnsps[],
+ bool acceptOidsOff, bool isReset);
+extern List *untransformRelOptions(Datum options);
+extern bytea *extractRelOptions(HeapTuple tuple, TupleDesc tupdesc,
+ amoptions_function amoptions);
+extern void *build_reloptions(Datum reloptions, bool validate,
+ relopt_kind kind,
+ Size relopt_struct_size,
+ const relopt_parse_elt *relopt_elems,
+ int num_relopt_elems);
+extern void *build_local_reloptions(local_relopts *relopts, Datum options,
+ bool validate);
+
+extern bytea *default_reloptions(Datum reloptions, bool validate,
+ relopt_kind kind);
+extern bytea *heap_reloptions(char relkind, Datum reloptions, bool validate);
+extern bytea *view_reloptions(Datum reloptions, bool validate);
+extern bytea *partitioned_table_reloptions(Datum reloptions, bool validate);
+extern bytea *index_reloptions(amoptions_function amoptions, Datum reloptions,
+ bool validate);
+extern bytea *attribute_reloptions(Datum reloptions, bool validate);
+extern bytea *tablespace_reloptions(Datum reloptions, bool validate);
+extern LOCKMODE AlterTableGetRelOptionsLockLevel(List *defList);
+
+#endif /* RELOPTIONS_H */
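
To show how these declarations fit together, here is a hedged sketch of the common pattern for an access method's options routine: reserve a kind once, register an option against it, and later parse reloptions into a bytea-headed struct via build_reloptions(). MyOptions, my_relopt_kind, and the "granularity" option are illustrative assumptions; lengthof() is from c.h.

    typedef struct MyOptions
    {
        int32       vl_len_;        /* varlena header, filled in by the reloptions code */
        int         granularity;    /* illustrative option */
    } MyOptions;

    static relopt_kind my_relopt_kind;

    static void
    my_define_reloptions(void)
    {
        my_relopt_kind = add_reloption_kind();
        add_int_reloption(my_relopt_kind, "granularity",
                          "Number of entries packed into one chunk",
                          128, 1, 1024, AccessExclusiveLock);
    }

    static bytea *
    my_amoptions(Datum reloptions, bool validate)
    {
        static const relopt_parse_elt tab[] = {
            {"granularity", RELOPT_TYPE_INT, offsetof(MyOptions, granularity)}
        };

        return (bytea *) build_reloptions(reloptions, validate, my_relopt_kind,
                                          sizeof(MyOptions), tab, lengthof(tab));
    }
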
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
new file mode 100644
index 0000000..74a07ef
--- /dev/null
+++ b/src/include/access/relscan.h
@@ -0,0 +1,191 @@
+/*-------------------------------------------------------------------------
+ *
+ * relscan.h
+ * POSTGRES relation scan descriptor definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/relscan.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef RELSCAN_H
+#define RELSCAN_H
+
+#include "access/htup_details.h"
+#include "access/itup.h"
+#include "port/atomics.h"
+#include "storage/buf.h"
+#include "storage/spin.h"
+#include "utils/relcache.h"
+
+
+struct ParallelTableScanDescData;
+
+/*
+ * Generic descriptor for table scans. This is the base class, which needs
+ * to be embedded in the scan descriptors of individual AMs.
+ */
+typedef struct TableScanDescData
+{
+ /* scan parameters */
+ Relation rs_rd; /* heap relation descriptor */
+ struct SnapshotData *rs_snapshot; /* snapshot to see */
+ int rs_nkeys; /* number of scan keys */
+ struct ScanKeyData *rs_key; /* array of scan key descriptors */
+
+ /* Range of ItemPointers for table_scan_getnextslot_tidrange() to scan. */
+ ItemPointerData rs_mintid;
+ ItemPointerData rs_maxtid;
+
+ /*
+ * Information about type and behaviour of the scan, a bitmask of members
+ * of the ScanOptions enum (see tableam.h).
+ */
+ uint32 rs_flags;
+
+ struct ParallelTableScanDescData *rs_parallel; /* parallel scan
+ * information */
+} TableScanDescData;
+typedef struct TableScanDescData *TableScanDesc;
+
+/*
+ * Shared state for parallel table scan.
+ *
+ * Each backend participating in a parallel table scan has its own
+ * TableScanDesc in backend-private memory, and those objects all contain a
+ * pointer to this structure. The information here must be sufficient to
+ * properly initialize each new TableScanDesc as workers join the scan, and it
+ * must serve as a source of information about what to scan for those workers.
+ */
+typedef struct ParallelTableScanDescData
+{
+ Oid phs_relid; /* OID of relation to scan */
+ bool phs_syncscan; /* report location to syncscan logic? */
+ bool phs_snapshot_any; /* SnapshotAny, not phs_snapshot_data? */
+ Size phs_snapshot_off; /* data for snapshot */
+} ParallelTableScanDescData;
+typedef struct ParallelTableScanDescData *ParallelTableScanDesc;
+
+/*
+ * Shared state for parallel table scans, for block oriented storage.
+ */
+typedef struct ParallelBlockTableScanDescData
+{
+ ParallelTableScanDescData base;
+
+ BlockNumber phs_nblocks; /* # blocks in relation at start of scan */
+ slock_t phs_mutex; /* mutual exclusion for setting startblock */
+ BlockNumber phs_startblock; /* starting block number */
+ pg_atomic_uint64 phs_nallocated; /* number of blocks allocated to
+ * workers so far. */
+} ParallelBlockTableScanDescData;
+typedef struct ParallelBlockTableScanDescData *ParallelBlockTableScanDesc;
+
+/*
+ * Per backend state for parallel table scan, for block-oriented storage.
+ */
+typedef struct ParallelBlockTableScanWorkerData
+{
+ uint64 phsw_nallocated; /* Current # of blocks into the scan */
+ uint32 phsw_chunk_remaining; /* # blocks left in this chunk */
+ uint32 phsw_chunk_size; /* The number of blocks to allocate in
+ * each I/O chunk for the scan */
+} ParallelBlockTableScanWorkerData;
+typedef struct ParallelBlockTableScanWorkerData *ParallelBlockTableScanWorker;
+
+/*
+ * Base class for fetches from a table via an index; it needs to be
+ * embedded in the respective struct for individual AMs.
+ */
+typedef struct IndexFetchTableData
+{
+ Relation rel;
+} IndexFetchTableData;
+
+/*
+ * We use the same IndexScanDescData structure for both amgettuple-based
+ * and amgetbitmap-based index scans. Some fields are only relevant in
+ * amgettuple-based scans.
+ */
+typedef struct IndexScanDescData
+{
+ /* scan parameters */
+ Relation heapRelation; /* heap relation descriptor, or NULL */
+ Relation indexRelation; /* index relation descriptor */
+ struct SnapshotData *xs_snapshot; /* snapshot to see */
+ int numberOfKeys; /* number of index qualifier conditions */
+ int numberOfOrderBys; /* number of ordering operators */
+ struct ScanKeyData *keyData; /* array of index qualifier descriptors */
+ struct ScanKeyData *orderByData; /* array of ordering op descriptors */
+ bool xs_want_itup; /* caller requests index tuples */
+ bool xs_temp_snap; /* unregister snapshot at scan end? */
+
+ /* signaling to index AM about killing index tuples */
+ bool kill_prior_tuple; /* last-returned tuple is dead */
+ bool ignore_killed_tuples; /* do not return killed entries */
+ bool xactStartedInRecovery; /* prevents killing/seeing killed
+ * tuples */
+
+ /* index access method's private state */
+ void *opaque; /* access-method-specific info */
+
+ /*
+ * In an index-only scan, a successful amgettuple call must fill either
+ * xs_itup (and xs_itupdesc) or xs_hitup (and xs_hitupdesc) to provide the
+ * data returned by the scan. It can fill both, in which case the heap
+ * format will be used.
+ */
+ IndexTuple xs_itup; /* index tuple returned by AM */
+ struct TupleDescData *xs_itupdesc; /* rowtype descriptor of xs_itup */
+ HeapTuple xs_hitup; /* index data returned by AM, as HeapTuple */
+ struct TupleDescData *xs_hitupdesc; /* rowtype descriptor of xs_hitup */
+
+ ItemPointerData xs_heaptid; /* result */
+ bool xs_heap_continue; /* T if must keep walking, potential
+ * further results */
+ IndexFetchTableData *xs_heapfetch;
+
+ bool xs_recheck; /* T means scan keys must be rechecked */
+
+ /*
+ * When fetching with an ordering operator, the values of the ORDER BY
+ * expressions of the last returned tuple, according to the index. If
+ * xs_recheckorderby is true, these need to be rechecked just like the
+ * scan keys, and the values returned here are a lower-bound on the actual
+ * values.
+ */
+ Datum *xs_orderbyvals;
+ bool *xs_orderbynulls;
+ bool xs_recheckorderby;
+
+ /* parallel index scan information, in shared memory */
+ struct ParallelIndexScanDescData *parallel_scan;
+} IndexScanDescData;
+
+/* Generic structure for parallel scans */
+typedef struct ParallelIndexScanDescData
+{
+ Oid ps_relid;
+ Oid ps_indexid;
+ Size ps_offset; /* Offset in bytes of the AM-specific structure */
+ char ps_snapshot_data[FLEXIBLE_ARRAY_MEMBER];
+} ParallelIndexScanDescData;
+
+struct TupleTableSlot;
+
+/* Struct for storage-or-index scans of system tables */
+typedef struct SysScanDescData
+{
+ Relation heap_rel; /* catalog being scanned */
+ Relation irel; /* NULL if doing heap scan */
+ struct TableScanDescData *scan; /* only valid in storage-scan case */
+ struct IndexScanDescData *iscan; /* only valid in index-scan case */
+ struct SnapshotData *snapshot; /* snapshot to unregister at end of scan */
+ struct TupleTableSlot *slot;
+} SysScanDescData;
+
+#endif /* RELSCAN_H */
diff --git a/src/include/access/rewriteheap.h b/src/include/access/rewriteheap.h
new file mode 100644
index 0000000..121f552
--- /dev/null
+++ b/src/include/access/rewriteheap.h
@@ -0,0 +1,57 @@
+/*-------------------------------------------------------------------------
+ *
+ * rewriteheap.h
+ * Declarations for heap rewrite support functions
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994-5, Regents of the University of California
+ *
+ * src/include/access/rewriteheap.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef REWRITE_HEAP_H
+#define REWRITE_HEAP_H
+
+#include "access/htup.h"
+#include "storage/itemptr.h"
+#include "storage/relfilenode.h"
+#include "utils/relcache.h"
+
+/* struct definition is private to rewriteheap.c */
+typedef struct RewriteStateData *RewriteState;
+
+extern RewriteState begin_heap_rewrite(Relation OldHeap, Relation NewHeap,
+ TransactionId OldestXmin, TransactionId FreezeXid,
+ MultiXactId MultiXactCutoff);
+extern void end_heap_rewrite(RewriteState state);
+extern void rewrite_heap_tuple(RewriteState state, HeapTuple oldTuple,
+ HeapTuple newTuple);
+extern bool rewrite_heap_dead_tuple(RewriteState state, HeapTuple oldTuple);
+
+/*
+ * On-Disk data format for an individual logical rewrite mapping.
+ */
+typedef struct LogicalRewriteMappingData
+{
+ RelFileNode old_node;
+ RelFileNode new_node;
+ ItemPointerData old_tid;
+ ItemPointerData new_tid;
+} LogicalRewriteMappingData;
+
+/* ---
+ * The filename consists of the following dash-separated components:
+ * 1) database oid or InvalidOid for shared relations
+ * 2) the oid of the relation
+ * 3) upper 32 bits of the LSN at which a rewrite started
+ * 4) lower 32 bits of the LSN at which a rewrite started
+ * 5) xid we are mapping for
+ * 6) xid of the xact performing the mapping
+ * ---
+ */
+#define LOGICAL_REWRITE_FORMAT "map-%x-%x-%X_%X-%x-%x"
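
Purely to illustrate how the six components above map onto the format string (every variable name here is an assumption, not the actual rewriteheap.c code; MAXPGPATH is from pg_config_manual.h):

    char        path[MAXPGPATH];

    snprintf(path, sizeof(path), LOGICAL_REWRITE_FORMAT,
             dboid,                          /* 1) database oid, or InvalidOid */
             relid,                          /* 2) relation oid */
             (uint32) (start_lsn >> 32),     /* 3) upper 32 bits of the start LSN */
             (uint32) start_lsn,             /* 4) lower 32 bits of the start LSN */
             mapped_xid,                     /* 5) xid being mapped for */
             mapping_xid);                   /* 6) xid performing the mapping */
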
+extern void CheckPointLogicalRewriteHeap(void);
+
+#endif /* REWRITE_HEAP_H */
diff --git a/src/include/access/rmgr.h b/src/include/access/rmgr.h
new file mode 100644
index 0000000..c9b5c56
--- /dev/null
+++ b/src/include/access/rmgr.h
@@ -0,0 +1,35 @@
+/*
+ * rmgr.h
+ *
+ * Resource managers definition
+ *
+ * src/include/access/rmgr.h
+ */
+#ifndef RMGR_H
+#define RMGR_H
+
+typedef uint8 RmgrId;
+
+/*
+ * Built-in resource managers
+ *
+ * The actual numerical values for each rmgr ID are defined by the order
+ * of entries in rmgrlist.h.
+ *
+ * Note: RM_MAX_ID must fit in RmgrId; widening that type will affect the XLOG
+ * file format.
+ */
+#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \
+ symname,
+
+typedef enum RmgrIds
+{
+#include "access/rmgrlist.h"
+ RM_NEXT_ID
+} RmgrIds;
+
+#undef PG_RMGR
+
+#define RM_MAX_ID (RM_NEXT_ID - 1)
+
+#endif /* RMGR_H */
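
Because PG_RMGR is left undefined afterwards, the same list can be expanded again with a different definition. A hedged sketch of a second expansion that builds a callback table, similar in spirit to the backend's rmgr table (RmgrData is declared in access/xlog_internal.h; MyRmgrTable is an illustrative name):

    #define PG_RMGR(symname, name, redo, desc, identify, startup, cleanup, mask) \
        { name, redo, desc, identify, startup, cleanup, mask },

    static const RmgrData MyRmgrTable[RM_MAX_ID + 1] = {
    #include "access/rmgrlist.h"
    };

    #undef PG_RMGR
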
diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h
new file mode 100644
index 0000000..f582cf5
--- /dev/null
+++ b/src/include/access/rmgrlist.h
@@ -0,0 +1,49 @@
+/*---------------------------------------------------------------------------
+ * rmgrlist.h
+ *
+ * The resource manager list is kept in its own source file for possible
+ * use by automatic tools. The exact representation of a rmgr is determined
+ * by the PG_RMGR macro, which is not defined in this file; it can be
+ * defined by the caller for special purposes.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/rmgrlist.h
+ *---------------------------------------------------------------------------
+ */
+
+/* there is deliberately not an #ifndef RMGRLIST_H here */
+
+/*
+ * List of resource manager entries. Note that order of entries defines the
+ * numerical values of each rmgr's ID, which is stored in WAL records. New
+ * entries should be added at the end, to avoid changing IDs of existing
+ * entries.
+ *
+ * Changes to this list possibly need an XLOG_PAGE_MAGIC bump.
+ */
+
+/* symbol name, textual name, redo, desc, identify, startup, cleanup, mask */
+PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL, NULL)
+PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL, NULL)
+PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL, NULL)
+PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL, NULL)
+PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL, NULL)
+PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL, NULL)
+PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL, NULL)
+PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL, NULL)
+PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL)
+PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask)
+PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask)
+PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, btree_xlog_startup, btree_xlog_cleanup, btree_mask)
+PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask)
+PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask)
+PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask)
+PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL, seq_mask)
+PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup, spg_mask)
+PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL, brin_mask)
+PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL, NULL)
+PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL)
+PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask)
+PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL)
diff --git a/src/include/access/sdir.h b/src/include/access/sdir.h
new file mode 100644
index 0000000..8154adf
--- /dev/null
+++ b/src/include/access/sdir.h
@@ -0,0 +1,58 @@
+/*-------------------------------------------------------------------------
+ *
+ * sdir.h
+ * POSTGRES scan direction definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/sdir.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SDIR_H
+#define SDIR_H
+
+
+/*
+ * ScanDirection was an int8 for no apparent reason. I kept the original
+ * values because I'm not sure if I'll break anything otherwise. -ay 2/95
+ */
+typedef enum ScanDirection
+{
+ BackwardScanDirection = -1,
+ NoMovementScanDirection = 0,
+ ForwardScanDirection = 1
+} ScanDirection;
+
+/*
+ * ScanDirectionIsValid
+ * True iff scan direction is valid.
+ */
+#define ScanDirectionIsValid(direction) \
+ ((bool) (BackwardScanDirection <= (direction) && \
+ (direction) <= ForwardScanDirection))
+
+/*
+ * ScanDirectionIsBackward
+ * True iff scan direction is backward.
+ */
+#define ScanDirectionIsBackward(direction) \
+ ((bool) ((direction) == BackwardScanDirection))
+
+/*
+ * ScanDirectionIsNoMovement
+ * True iff scan direction indicates no movement.
+ */
+#define ScanDirectionIsNoMovement(direction) \
+ ((bool) ((direction) == NoMovementScanDirection))
+
+/*
+ * ScanDirectionIsForward
+ * True iff scan direction is forward.
+ */
+#define ScanDirectionIsForward(direction) \
+ ((bool) ((direction) == ForwardScanDirection))
+
+#endif /* SDIR_H */
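
A small, assumption-level illustration of honoring the direction while stepping through items on an index page (OffsetNumberNext/OffsetNumberPrev come from storage/off.h; this is not taken from any particular AM):

    if (ScanDirectionIsForward(dir))
        offnum = OffsetNumberNext(offnum);
    else if (ScanDirectionIsBackward(dir))
        offnum = OffsetNumberPrev(offnum);
    else
        Assert(ScanDirectionIsNoMovement(dir));
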
diff --git a/src/include/access/session.h b/src/include/access/session.h
new file mode 100644
index 0000000..82cee5a
--- /dev/null
+++ b/src/include/access/session.h
@@ -0,0 +1,44 @@
+/*-------------------------------------------------------------------------
+ *
+ * session.h
+ * Encapsulation of user session.
+ *
+ * Copyright (c) 2017-2021, PostgreSQL Global Development Group
+ *
+ * src/include/access/session.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SESSION_H
+#define SESSION_H
+
+#include "lib/dshash.h"
+
+/* Avoid including typcache.h */
+struct SharedRecordTypmodRegistry;
+
+/*
+ * A struct encapsulating some elements of a user's session. For now this
+ * manages state that applies to parallel query, but in principle it could
+ * include other things that are currently global variables.
+ */
+typedef struct Session
+{
+ dsm_segment *segment; /* The session-scoped DSM segment. */
+ dsa_area *area; /* The session-scoped DSA area. */
+
+ /* State managed by typcache.c. */
+ struct SharedRecordTypmodRegistry *shared_typmod_registry;
+ dshash_table *shared_record_table;
+ dshash_table *shared_typmod_table;
+} Session;
+
+extern void InitializeSession(void);
+extern dsm_handle GetSessionDsmHandle(void);
+extern void AttachSession(dsm_handle handle);
+extern void DetachSession(void);
+
+/* The current session, or NULL for none. */
+extern Session *CurrentSession;
+
+#endif /* SESSION_H */
diff --git a/src/include/access/skey.h b/src/include/access/skey.h
new file mode 100644
index 0000000..92b7d09
--- /dev/null
+++ b/src/include/access/skey.h
@@ -0,0 +1,151 @@
+/*-------------------------------------------------------------------------
+ *
+ * skey.h
+ * POSTGRES scan key definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/skey.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SKEY_H
+#define SKEY_H
+
+#include "access/attnum.h"
+#include "access/stratnum.h"
+#include "fmgr.h"
+
+
+/*
+ * A ScanKey represents the application of a comparison operator between
+ * a table or index column and a constant. When it's part of an array of
+ * ScanKeys, the comparison conditions are implicitly ANDed. The index
+ * column is the left argument of the operator, if it's a binary operator.
+ * (The data structure can support unary indexable operators too; in that
+ * case sk_argument would go unused. This is not currently implemented.)
+ *
+ * For an index scan, sk_strategy and sk_subtype must be set correctly for
+ * the operator. When using a ScanKey in a heap scan, these fields are not
+ * used and may be set to InvalidStrategy/InvalidOid.
+ *
+ * If the operator is collation-sensitive, sk_collation must be set
+ * correctly as well.
+ *
+ * A ScanKey can also represent a ScalarArrayOpExpr, that is a condition
+ * "column op ANY(ARRAY[...])". This is signaled by the SK_SEARCHARRAY
+ * flag bit. The sk_argument is not a value of the operator's right-hand
+ * argument type, but rather an array of such values, and the per-element
+ * comparisons are to be ORed together.
+ *
+ * A ScanKey can also represent a condition "column IS NULL" or "column
+ * IS NOT NULL"; these cases are signaled by the SK_SEARCHNULL and
+ * SK_SEARCHNOTNULL flag bits respectively. The argument is always NULL,
+ * and the sk_strategy, sk_subtype, sk_collation, and sk_func fields are
+ * not used (unless set by the index AM).
+ *
+ * SK_SEARCHARRAY, SK_SEARCHNULL and SK_SEARCHNOTNULL are supported only
+ * for index scans, not heap scans; and not all index AMs support them,
+ * only those that set amsearcharray or amsearchnulls respectively.
+ *
+ * A ScanKey can also represent an ordering operator invocation, that is
+ * an ordering requirement "ORDER BY indexedcol op constant". This looks
+ * the same as a comparison operator, except that the operator doesn't
+ * (usually) yield boolean. We mark such ScanKeys with SK_ORDER_BY.
+ * SK_SEARCHARRAY, SK_SEARCHNULL, SK_SEARCHNOTNULL cannot be used here.
+ *
+ * Note: in some places, ScanKeys are used as a convenient representation
+ * for the invocation of an access method support procedure. In this case
+ * sk_strategy/sk_subtype are not meaningful (but sk_collation can be); and
+ * sk_func may refer to a function that returns something other than boolean.
+ */
+typedef struct ScanKeyData
+{
+ int sk_flags; /* flags, see below */
+ AttrNumber sk_attno; /* table or index column number */
+ StrategyNumber sk_strategy; /* operator strategy number */
+ Oid sk_subtype; /* strategy subtype */
+ Oid sk_collation; /* collation to use, if needed */
+ FmgrInfo sk_func; /* lookup info for function to call */
+ Datum sk_argument; /* data to compare */
+} ScanKeyData;
+
+typedef ScanKeyData *ScanKey;
+
+/*
+ * About row comparisons:
+ *
+ * The ScanKey data structure also supports row comparisons, that is ordered
+ * tuple comparisons like (x, y) > (c1, c2), having the SQL-spec semantics
+ * "x > c1 OR (x = c1 AND y > c2)". Note that this is currently only
+ * implemented for btree index searches, not for heapscans or any other index
+ * type. A row comparison is represented by a "header" ScanKey entry plus
+ * a separate array of ScanKeys, one for each column of the row comparison.
+ * The header entry has these properties:
+ * sk_flags = SK_ROW_HEADER
+ * sk_attno = index column number for leading column of row comparison
+ * sk_strategy = btree strategy code for semantics of row comparison
+ * (ie, < <= > or >=)
+ * sk_subtype, sk_collation, sk_func: not used
+ * sk_argument: pointer to subsidiary ScanKey array
+ * If the header is part of a ScanKey array that's sorted by attno, it
+ * must be sorted according to the leading column number.
+ *
+ * The subsidiary ScanKey array appears in logical column order of the row
+ * comparison, which may be different from index column order. The array
+ * elements are like a normal ScanKey array except that:
+ * sk_flags must include SK_ROW_MEMBER, plus SK_ROW_END in the last
+ * element (needed since row header does not include a count)
+ * sk_func points to the btree comparison support function for the
+ * opclass, NOT the operator's implementation function.
+ * sk_strategy must be the same in all elements of the subsidiary array,
+ * that is, the same as in the header entry.
+ * SK_SEARCHARRAY, SK_SEARCHNULL, SK_SEARCHNOTNULL cannot be used here.
+ */
+
+/*
+ * ScanKeyData sk_flags
+ *
+ * sk_flags bits 0-15 are reserved for system-wide use (symbols for those
+ * bits should be defined here). Bits 16-31 are reserved for use within
+ * individual index access methods.
+ */
+#define SK_ISNULL 0x0001 /* sk_argument is NULL */
+#define SK_UNARY 0x0002 /* unary operator (not supported!) */
+#define SK_ROW_HEADER 0x0004 /* row comparison header (see above) */
+#define SK_ROW_MEMBER 0x0008 /* row comparison member (see above) */
+#define SK_ROW_END 0x0010 /* last row comparison member */
+#define SK_SEARCHARRAY 0x0020 /* scankey represents ScalarArrayOp */
+#define SK_SEARCHNULL 0x0040 /* scankey represents "col IS NULL" */
+#define SK_SEARCHNOTNULL 0x0080 /* scankey represents "col IS NOT NULL" */
+#define SK_ORDER_BY 0x0100 /* scankey is for ORDER BY op */
+
+
+/*
+ * prototypes for functions in access/common/scankey.c
+ */
+extern void ScanKeyInit(ScanKey entry,
+ AttrNumber attributeNumber,
+ StrategyNumber strategy,
+ RegProcedure procedure,
+ Datum argument);
+extern void ScanKeyEntryInitialize(ScanKey entry,
+ int flags,
+ AttrNumber attributeNumber,
+ StrategyNumber strategy,
+ Oid subtype,
+ Oid collation,
+ RegProcedure procedure,
+ Datum argument);
+extern void ScanKeyEntryInitializeWithInfo(ScanKey entry,
+ int flags,
+ AttrNumber attributeNumber,
+ StrategyNumber strategy,
+ Oid subtype,
+ Oid collation,
+ FmgrInfo *finfo,
+ Datum argument);
+
+#endif /* SKEY_H */
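
A hedged example of the most common initialization path, an equality comparison for a catalog scan. The attribute number Anum_pg_class_relname, strategy BTEqualStrategyNumber, and support procedure F_NAMEEQ live in other headers and are used here only for illustration:

    ScanKeyData skey;

    ScanKeyInit(&skey,
                Anum_pg_class_relname,          /* column to compare */
                BTEqualStrategyNumber,          /* "=" semantics */
                F_NAMEEQ,                       /* name equality procedure */
                CStringGetDatum("pg_type"));    /* constant to compare against */
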
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
new file mode 100644
index 0000000..dd52e8c
--- /dev/null
+++ b/src/include/access/slru.h
@@ -0,0 +1,174 @@
+/*-------------------------------------------------------------------------
+ *
+ * slru.h
+ * Simple LRU buffering for transaction status logfiles
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/slru.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SLRU_H
+#define SLRU_H
+
+#include "access/xlogdefs.h"
+#include "storage/lwlock.h"
+#include "storage/sync.h"
+
+
+/*
+ * Define SLRU segment size. A page is the same BLCKSZ as is used everywhere
+ * else in Postgres. The segment size can be chosen somewhat arbitrarily;
+ * we make it 32 pages by default, or 256kB, i.e. 1M transactions for CLOG
+ * or 64K transactions for SUBTRANS.
+ *
+ * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF,
+ * page numbering also wraps around at 0xFFFFFFFF/xxxx_XACTS_PER_PAGE (where
+ * xxxx is CLOG or SUBTRANS, respectively), and segment numbering at
+ * 0xFFFFFFFF/xxxx_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need
+ * take no explicit notice of that fact in slru.c, except when comparing
+ * segment and page numbers in SimpleLruTruncate (see PagePrecedes()).
+ */
+#define SLRU_PAGES_PER_SEGMENT 32
+
+/*
+ * Page status codes. Note that these do not include the "dirty" bit.
+ * page_dirty can be true only in the VALID or WRITE_IN_PROGRESS states;
+ * in the latter case it implies that the page has been re-dirtied since
+ * the write started.
+ */
+typedef enum
+{
+ SLRU_PAGE_EMPTY, /* buffer is not in use */
+ SLRU_PAGE_READ_IN_PROGRESS, /* page is being read in */
+ SLRU_PAGE_VALID, /* page is valid and not being written */
+ SLRU_PAGE_WRITE_IN_PROGRESS /* page is being written out */
+} SlruPageStatus;
+
+/*
+ * Shared-memory state
+ */
+typedef struct SlruSharedData
+{
+ LWLock *ControlLock;
+
+ /* Number of buffers managed by this SLRU structure */
+ int num_slots;
+
+ /*
+ * Arrays holding info for each buffer slot. Page number is undefined
+ * when status is EMPTY, as is page_lru_count.
+ */
+ char **page_buffer;
+ SlruPageStatus *page_status;
+ bool *page_dirty;
+ int *page_number;
+ int *page_lru_count;
+ LWLockPadded *buffer_locks;
+
+ /*
+ * Optional array of WAL flush LSNs associated with entries in the SLRU
+ * pages. If not zero/NULL, we must flush WAL before writing pages (true
+ * for pg_xact, false for multixact, pg_subtrans, pg_notify). group_lsn[]
+ * has lsn_groups_per_page entries per buffer slot, each containing the
+ * highest LSN known for a contiguous group of SLRU entries on that slot's
+ * page.
+ */
+ XLogRecPtr *group_lsn;
+ int lsn_groups_per_page;
+
+ /*----------
+ * We mark a page "most recently used" by setting
+ * page_lru_count[slotno] = ++cur_lru_count;
+ * The oldest page is therefore the one with the highest value of
+ * cur_lru_count - page_lru_count[slotno]
+ * The counts will eventually wrap around, but this calculation still
+ * works as long as no page's age exceeds INT_MAX counts.
+ *----------
+ */
+ int cur_lru_count;
+
+ /*
+ * latest_page_number is the page number of the current end of the log;
+ * this is not critical data, since we use it only to avoid swapping out
+ * the latest page.
+ */
+ int latest_page_number;
+
+ /* SLRU's index for statistics purposes (might not be unique) */
+ int slru_stats_idx;
+} SlruSharedData;
+
+typedef SlruSharedData *SlruShared;
+
+/*
+ * SlruCtlData is an unshared structure that points to the active information
+ * in shared memory.
+ */
+typedef struct SlruCtlData
+{
+ SlruShared shared;
+
+ /*
+ * Which sync handler function to use when handing sync requests over to
+ * the checkpointer. SYNC_HANDLER_NONE to disable fsync (e.g. pg_notify).
+ */
+ SyncRequestHandler sync_handler;
+
+ /*
+ * Decide whether a page is "older" for truncation and as a hint for
+ * evicting pages in LRU order. Return true if every entry of the first
+ * argument is older than every entry of the second argument. Note that
+ * !PagePrecedes(a,b) && !PagePrecedes(b,a) need not imply a==b; it also
+ * arises when some entries are older and some are not. For SLRUs using
+ * SimpleLruTruncate(), this must use modular arithmetic. (For others,
+ * the behavior of this callback has no functional implications.) Use
+ * SlruPagePrecedesUnitTests() in SLRUs meeting its criteria.
+ */
+ bool (*PagePrecedes) (int, int);
+
+ /*
+ * Dir is set during SimpleLruInit and does not change thereafter. Since
+ * it's always the same, it doesn't need to be in shared memory.
+ */
+ char Dir[64];
+} SlruCtlData;
+
+typedef SlruCtlData *SlruCtl;
+
+
+extern Size SimpleLruShmemSize(int nslots, int nlsns);
+extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
+ LWLock *ctllock, const char *subdir, int tranche_id,
+ SyncRequestHandler sync_handler);
+extern int SimpleLruZeroPage(SlruCtl ctl, int pageno);
+extern int SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
+ TransactionId xid);
+extern int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno,
+ TransactionId xid);
+extern void SimpleLruWritePage(SlruCtl ctl, int slotno);
+extern void SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied);
+#ifdef USE_ASSERT_CHECKING
+extern void SlruPagePrecedesUnitTests(SlruCtl ctl, int per_page);
+#else
+#define SlruPagePrecedesUnitTests(ctl, per_page) do {} while (0)
+#endif
+extern void SimpleLruTruncate(SlruCtl ctl, int cutoffPage);
+extern bool SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno);
+
+typedef bool (*SlruScanCallback) (SlruCtl ctl, char *filename, int segpage,
+ void *data);
+extern bool SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data);
+extern void SlruDeleteSegment(SlruCtl ctl, int segno);
+
+extern int SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path);
+
+/* SlruScanDirectory public callbacks */
+extern bool SlruScanDirCbReportPresence(SlruCtl ctl, char *filename,
+ int segpage, void *data);
+extern bool SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int segpage,
+ void *data);
+
+#endif /* SLRU_H */
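As a reading aid for the LRU bookkeeping described in SlruSharedData above, here is a minimal sketch of the victim-selection arithmetic. The function name and structure are illustrative only; the real logic in slru.c additionally skips slots with I/O in progress, prefers empty slots, and avoids the latest page. The point is simply that the cur_lru_count - page_lru_count subtraction stays correct across counter wraparound.

#include "postgres.h"
#include "access/slru.h"

/*
 * Illustrative only: choose the least recently used buffer slot using the
 * age arithmetic documented above.  The subtraction is done in signed int,
 * so counter wraparound is harmless as long as no page's age exceeds
 * INT_MAX counts.
 */
static int
slru_pick_lru_slot(SlruShared shared)
{
	int			cur_count = shared->cur_lru_count;
	int			best_slot = 0;
	int			best_age = -1;

	for (int slotno = 0; slotno < shared->num_slots; slotno++)
	{
		int			age = cur_count - shared->page_lru_count[slotno];

		if (age > best_age)
		{
			best_age = age;
			best_slot = slotno;
		}
	}
	return best_slot;
}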
diff --git a/src/include/access/spgist.h b/src/include/access/spgist.h
new file mode 100644
index 0000000..2eb2f42
--- /dev/null
+++ b/src/include/access/spgist.h
@@ -0,0 +1,229 @@
+/*-------------------------------------------------------------------------
+ *
+ * spgist.h
+ * Public header file for SP-GiST access method.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/spgist.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SPGIST_H
+#define SPGIST_H
+
+#include "access/amapi.h"
+#include "access/xlogreader.h"
+#include "lib/stringinfo.h"
+
+
+/* SPGiST opclass support function numbers */
+#define SPGIST_CONFIG_PROC 1
+#define SPGIST_CHOOSE_PROC 2
+#define SPGIST_PICKSPLIT_PROC 3
+#define SPGIST_INNER_CONSISTENT_PROC 4
+#define SPGIST_LEAF_CONSISTENT_PROC 5
+#define SPGIST_COMPRESS_PROC 6
+#define SPGIST_OPTIONS_PROC 7
+#define SPGISTNRequiredProc 5
+#define SPGISTNProc 7
+
+/*
+ * Argument structs for spg_config method
+ */
+typedef struct spgConfigIn
+{
+ Oid attType; /* Data type to be indexed */
+} spgConfigIn;
+
+typedef struct spgConfigOut
+{
+ Oid prefixType; /* Data type of inner-tuple prefixes */
+ Oid labelType; /* Data type of inner-tuple node labels */
+ Oid leafType; /* Data type of leaf-tuple values */
+ bool canReturnData; /* Opclass can reconstruct original data */
+ bool longValuesOK; /* Opclass can cope with values > 1 page */
+} spgConfigOut;
+
+/*
+ * Argument structs for spg_choose method
+ */
+typedef struct spgChooseIn
+{
+ Datum datum; /* original datum to be indexed */
+ Datum leafDatum; /* current datum to be stored at leaf */
+ int level; /* current level (counting from zero) */
+
+ /* Data from current inner tuple */
+ bool allTheSame; /* tuple is marked all-the-same? */
+ bool hasPrefix; /* tuple has a prefix? */
+ Datum prefixDatum; /* if so, the prefix value */
+ int nNodes; /* number of nodes in the inner tuple */
+ Datum *nodeLabels; /* node label values (NULL if none) */
+} spgChooseIn;
+
+typedef enum spgChooseResultType
+{
+ spgMatchNode = 1, /* descend into existing node */
+ spgAddNode, /* add a node to the inner tuple */
+ spgSplitTuple /* split inner tuple (change its prefix) */
+} spgChooseResultType;
+
+typedef struct spgChooseOut
+{
+ spgChooseResultType resultType; /* action code, see above */
+ union
+ {
+ struct /* results for spgMatchNode */
+ {
+ int nodeN; /* descend to this node (index from 0) */
+ int levelAdd; /* increment level by this much */
+ Datum restDatum; /* new leaf datum */
+ } matchNode;
+ struct /* results for spgAddNode */
+ {
+ Datum nodeLabel; /* new node's label */
+ int nodeN; /* where to insert it (index from 0) */
+ } addNode;
+ struct /* results for spgSplitTuple */
+ {
+ /* Info to form new upper-level inner tuple with one child tuple */
+ bool prefixHasPrefix; /* tuple should have a prefix? */
+ Datum prefixPrefixDatum; /* if so, its value */
+ int prefixNNodes; /* number of nodes */
+ Datum *prefixNodeLabels; /* their labels (or NULL for no
+ * labels) */
+ int childNodeN; /* which node gets child tuple */
+
+ /* Info to form new lower-level inner tuple with all old nodes */
+ bool postfixHasPrefix; /* tuple should have a prefix? */
+ Datum postfixPrefixDatum; /* if so, its value */
+ } splitTuple;
+ } result;
+} spgChooseOut;
+
+/*
+ * Argument structs for spg_picksplit method
+ */
+typedef struct spgPickSplitIn
+{
+ int nTuples; /* number of leaf tuples */
+ Datum *datums; /* their datums (array of length nTuples) */
+ int level; /* current level (counting from zero) */
+} spgPickSplitIn;
+
+typedef struct spgPickSplitOut
+{
+ bool hasPrefix; /* new inner tuple should have a prefix? */
+ Datum prefixDatum; /* if so, its value */
+
+ int nNodes; /* number of nodes for new inner tuple */
+ Datum *nodeLabels; /* their labels (or NULL for no labels) */
+
+ int *mapTuplesToNodes; /* node index for each leaf tuple */
+ Datum *leafTupleDatums; /* datum to store in each new leaf tuple */
+} spgPickSplitOut;
+
+/*
+ * Argument structs for spg_inner_consistent method
+ */
+typedef struct spgInnerConsistentIn
+{
+ ScanKey scankeys; /* array of operators and comparison values */
+ ScanKey orderbys; /* array of ordering operators and comparison
+ * values */
+ int nkeys; /* length of scankeys array */
+ int norderbys; /* length of orderbys array */
+
+ Datum reconstructedValue; /* value reconstructed at parent */
+ void *traversalValue; /* opclass-specific traverse value */
+ MemoryContext traversalMemoryContext; /* put new traverse values here */
+ int level; /* current level (counting from zero) */
+ bool returnData; /* original data must be returned? */
+
+ /* Data from current inner tuple */
+ bool allTheSame; /* tuple is marked all-the-same? */
+ bool hasPrefix; /* tuple has a prefix? */
+ Datum prefixDatum; /* if so, the prefix value */
+ int nNodes; /* number of nodes in the inner tuple */
+ Datum *nodeLabels; /* node label values (NULL if none) */
+} spgInnerConsistentIn;
+
+typedef struct spgInnerConsistentOut
+{
+ int nNodes; /* number of child nodes to be visited */
+ int *nodeNumbers; /* their indexes in the node array */
+ int *levelAdds; /* increment level by this much for each */
+ Datum *reconstructedValues; /* associated reconstructed values */
+ void **traversalValues; /* opclass-specific traverse values */
+ double **distances; /* associated distances */
+} spgInnerConsistentOut;
+
+/*
+ * Argument structs for spg_leaf_consistent method
+ */
+typedef struct spgLeafConsistentIn
+{
+ ScanKey scankeys; /* array of operators and comparison values */
+ ScanKey orderbys; /* array of ordering operators and comparison
+ * values */
+ int nkeys; /* length of scankeys array */
+ int norderbys; /* length of orderbys array */
+
+ Datum reconstructedValue; /* value reconstructed at parent */
+ void *traversalValue; /* opclass-specific traverse value */
+ int level; /* current level (counting from zero) */
+ bool returnData; /* original data must be returned? */
+
+ Datum leafDatum; /* datum in leaf tuple */
+} spgLeafConsistentIn;
+
+typedef struct spgLeafConsistentOut
+{
+ Datum leafValue; /* reconstructed original data, if any */
+ bool recheck; /* set true if operator must be rechecked */
+ bool recheckDistances; /* set true if distances must be rechecked */
+ double *distances; /* associated distances */
+} spgLeafConsistentOut;
+
+
+/* spgutils.c */
+extern bytea *spgoptions(Datum reloptions, bool validate);
+
+/* spginsert.c */
+extern IndexBuildResult *spgbuild(Relation heap, Relation index,
+ struct IndexInfo *indexInfo);
+extern void spgbuildempty(Relation index);
+extern bool spginsert(Relation index, Datum *values, bool *isnull,
+ ItemPointer ht_ctid, Relation heapRel,
+ IndexUniqueCheck checkUnique,
+ bool indexUnchanged,
+ struct IndexInfo *indexInfo);
+
+/* spgscan.c */
+extern IndexScanDesc spgbeginscan(Relation rel, int keysz, int orderbysz);
+extern void spgendscan(IndexScanDesc scan);
+extern void spgrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
+ ScanKey orderbys, int norderbys);
+extern int64 spggetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
+extern bool spggettuple(IndexScanDesc scan, ScanDirection dir);
+extern bool spgcanreturn(Relation index, int attno);
+
+/* spgvacuum.c */
+extern IndexBulkDeleteResult *spgbulkdelete(IndexVacuumInfo *info,
+ IndexBulkDeleteResult *stats,
+ IndexBulkDeleteCallback callback,
+ void *callback_state);
+extern IndexBulkDeleteResult *spgvacuumcleanup(IndexVacuumInfo *info,
+ IndexBulkDeleteResult *stats);
+
+/* spgvalidate.c */
+extern bool spgvalidate(Oid opclassoid);
+extern void spgadjustmembers(Oid opfamilyoid,
+ Oid opclassoid,
+ List *operators,
+ List *functions);
+
+#endif /* SPGIST_H */
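To make the support-function numbering and the spgConfigIn/spgConfigOut contract above concrete, here is a hypothetical SPGIST_CONFIG_PROC implementation for an opclass that indexes text using text prefixes and int2 node labels. The function name and the particular type choices are invented for illustration; real examples live under src/backend/access/spgist/.

#include "postgres.h"
#include "fmgr.h"
#include "access/spgist.h"
#include "catalog/pg_type.h"

PG_FUNCTION_INFO_V1(my_text_spgist_config);

/* Hypothetical config function for an SP-GiST opclass over text */
Datum
my_text_spgist_config(PG_FUNCTION_ARGS)
{
	/* argument 0 is a spgConfigIn *; not needed for these fixed choices */
	spgConfigOut *cfg = (spgConfigOut *) PG_GETARG_POINTER(1);

	cfg->prefixType = TEXTOID;	/* inner-tuple prefixes are text */
	cfg->labelType = INT2OID;	/* node labels stored as int2 */
	cfg->leafType = TEXTOID;	/* leaf tuples store text suffixes */
	cfg->canReturnData = true;	/* prefixes + suffix reconstruct the value */
	cfg->longValuesOK = true;	/* suffixing copes with values > 1 page */
	PG_RETURN_VOID();
}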
diff --git a/src/include/access/spgist_private.h b/src/include/access/spgist_private.h
new file mode 100644
index 0000000..40d3b71
--- /dev/null
+++ b/src/include/access/spgist_private.h
@@ -0,0 +1,548 @@
+/*-------------------------------------------------------------------------
+ *
+ * spgist_private.h
+ * Private declarations for SP-GiST access method.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/spgist_private.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SPGIST_PRIVATE_H
+#define SPGIST_PRIVATE_H
+
+#include "access/itup.h"
+#include "access/spgist.h"
+#include "catalog/pg_am_d.h"
+#include "nodes/tidbitmap.h"
+#include "storage/buf.h"
+#include "utils/geo_decls.h"
+#include "utils/relcache.h"
+
+
+typedef struct SpGistOptions
+{
+ int32 varlena_header_; /* varlena header (do not touch directly!) */
+ int fillfactor; /* page fill factor in percent (0..100) */
+} SpGistOptions;
+
+#define SpGistGetFillFactor(relation) \
+ (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
+ relation->rd_rel->relam == SPGIST_AM_OID), \
+ (relation)->rd_options ? \
+ ((SpGistOptions *) (relation)->rd_options)->fillfactor : \
+ SPGIST_DEFAULT_FILLFACTOR)
+#define SpGistGetTargetPageFreeSpace(relation) \
+ (BLCKSZ * (100 - SpGistGetFillFactor(relation)) / 100)
+
+
+/* SPGiST leaf tuples have one key column, optionally have included columns */
+#define spgKeyColumn 0
+#define spgFirstIncludeColumn 1
+
+/* Page numbers of fixed-location pages */
+#define SPGIST_METAPAGE_BLKNO (0) /* metapage */
+#define SPGIST_ROOT_BLKNO (1) /* root for normal entries */
+#define SPGIST_NULL_BLKNO (2) /* root for null-value entries */
+#define SPGIST_LAST_FIXED_BLKNO SPGIST_NULL_BLKNO
+
+#define SpGistBlockIsRoot(blkno) \
+ ((blkno) == SPGIST_ROOT_BLKNO || (blkno) == SPGIST_NULL_BLKNO)
+#define SpGistBlockIsFixed(blkno) \
+ ((BlockNumber) (blkno) <= (BlockNumber) SPGIST_LAST_FIXED_BLKNO)
+
+/*
+ * Contents of page special space on SPGiST index pages
+ */
+typedef struct SpGistPageOpaqueData
+{
+ uint16 flags; /* see bit definitions below */
+ uint16 nRedirection; /* number of redirection tuples on page */
+ uint16 nPlaceholder; /* number of placeholder tuples on page */
+ /* note there's no count of either LIVE or DEAD tuples ... */
+ uint16 spgist_page_id; /* for identification of SP-GiST indexes */
+} SpGistPageOpaqueData;
+
+typedef SpGistPageOpaqueData *SpGistPageOpaque;
+
+/* Flag bits in page special space */
+#define SPGIST_META (1<<0)
+#define SPGIST_DELETED (1<<1) /* never set, but keep for backwards
+ * compatibility */
+#define SPGIST_LEAF (1<<2)
+#define SPGIST_NULLS (1<<3)
+
+#define SpGistPageGetOpaque(page) ((SpGistPageOpaque) PageGetSpecialPointer(page))
+#define SpGistPageIsMeta(page) (SpGistPageGetOpaque(page)->flags & SPGIST_META)
+#define SpGistPageIsDeleted(page) (SpGistPageGetOpaque(page)->flags & SPGIST_DELETED)
+#define SpGistPageIsLeaf(page) (SpGistPageGetOpaque(page)->flags & SPGIST_LEAF)
+#define SpGistPageStoresNulls(page) (SpGistPageGetOpaque(page)->flags & SPGIST_NULLS)
+
+/*
+ * The page ID is for the convenience of pg_filedump and similar utilities,
+ * which otherwise would have a hard time telling pages of different index
+ * types apart. It should be the last 2 bytes on the page. This is more or
+ * less "free" due to alignment considerations.
+ *
+ * See comments above GinPageOpaqueData.
+ */
+#define SPGIST_PAGE_ID 0xFF82
+
+/*
+ * Each backend keeps a cache of last-used page info in its index->rd_amcache
+ * area. This is initialized from, and occasionally written back to,
+ * shared storage in the index metapage.
+ */
+typedef struct SpGistLastUsedPage
+{
+ BlockNumber blkno; /* block number, or InvalidBlockNumber */
+ int freeSpace; /* page's free space (could be obsolete!) */
+} SpGistLastUsedPage;
+
+/* Note: indexes in cachedPage[] match flag assignments for SpGistGetBuffer */
+#define SPGIST_CACHED_PAGES 8
+
+typedef struct SpGistLUPCache
+{
+ SpGistLastUsedPage cachedPage[SPGIST_CACHED_PAGES];
+} SpGistLUPCache;
+
+/*
+ * metapage
+ */
+typedef struct SpGistMetaPageData
+{
+ uint32 magicNumber; /* for identity cross-check */
+ SpGistLUPCache lastUsedPages; /* shared storage of last-used info */
+} SpGistMetaPageData;
+
+#define SPGIST_MAGIC_NUMBER (0xBA0BABEE)
+
+#define SpGistPageGetMeta(p) \
+ ((SpGistMetaPageData *) PageGetContents(p))
+
+/*
+ * Private state of index AM. SpGistState is common to both insert and
+ * search code; SpGistScanOpaque is for searches only.
+ */
+
+typedef struct SpGistLeafTupleData *SpGistLeafTuple; /* forward reference */
+
+/* Per-datatype info needed in SpGistState */
+typedef struct SpGistTypeDesc
+{
+ Oid type;
+ int16 attlen;
+ bool attbyval;
+ char attalign;
+ char attstorage;
+} SpGistTypeDesc;
+
+typedef struct SpGistState
+{
+ Relation index; /* index we're working with */
+
+ spgConfigOut config; /* filled in by opclass config method */
+
+ SpGistTypeDesc attType; /* type of values to be indexed/restored */
+ SpGistTypeDesc attLeafType; /* type of leaf-tuple values */
+ SpGistTypeDesc attPrefixType; /* type of inner-tuple prefix values */
+ SpGistTypeDesc attLabelType; /* type of node label values */
+
+ /* leafTupDesc typically points to index's tupdesc, but not always */
+ TupleDesc leafTupDesc; /* descriptor for leaf-level tuples */
+
+ char *deadTupleStorage; /* workspace for spgFormDeadTuple */
+
+ TransactionId myXid; /* XID to use when creating a redirect tuple */
+ bool isBuild; /* true if doing index build */
+} SpGistState;
+
+/* Item to be re-examined later during a search */
+typedef struct SpGistSearchItem
+{
+ pairingheap_node phNode; /* pairing heap node */
+ Datum value; /* value reconstructed from parent, or
+ * leafValue if isLeaf */
+ SpGistLeafTuple leafTuple; /* whole leaf tuple, if needed */
+ void *traversalValue; /* opclass-specific traverse value */
+ int level; /* level of items on this page */
+ ItemPointerData heapPtr; /* heap info, if heap tuple */
+ bool isNull; /* SearchItem is NULL item */
+ bool isLeaf; /* SearchItem is heap item */
+ bool recheck; /* qual recheck is needed */
+ bool recheckDistances; /* distance recheck is needed */
+
+ /* array with numberOfOrderBys entries */
+ double distances[FLEXIBLE_ARRAY_MEMBER];
+} SpGistSearchItem;
+
+#define SizeOfSpGistSearchItem(n_distances) \
+ (offsetof(SpGistSearchItem, distances) + sizeof(double) * (n_distances))
+
+/*
+ * Private state of an index scan
+ */
+typedef struct SpGistScanOpaqueData
+{
+ SpGistState state; /* see above */
+ pairingheap *scanQueue; /* queue of to be visited items */
+ MemoryContext tempCxt; /* short-lived memory context */
+ MemoryContext traversalCxt; /* single scan lifetime memory context */
+
+ /* Control flags showing whether to search nulls and/or non-nulls */
+ bool searchNulls; /* scan matches (all) null entries */
+ bool searchNonNulls; /* scan matches (some) non-null entries */
+
+ /* Index quals to be passed to opclass (null-related quals removed) */
+ int numberOfKeys; /* number of index qualifier conditions */
+ ScanKey keyData; /* array of index qualifier descriptors */
+ int numberOfOrderBys; /* number of ordering operators */
+ int numberOfNonNullOrderBys; /* number of ordering operators
+ * with non-NULL arguments */
+ ScanKey orderByData; /* array of ordering op descriptors */
+ Oid *orderByTypes; /* array of ordering op return types */
+ int *nonNullOrderByOffsets; /* array of offsets of non-NULL
+ * ordering keys in the original array */
+ Oid indexCollation; /* collation of index column */
+
+ /* Opclass defined functions: */
+ FmgrInfo innerConsistentFn;
+ FmgrInfo leafConsistentFn;
+
+ /* Pre-allocated workspace arrays: */
+ double *zeroDistances;
+ double *infDistances;
+
+ /* These fields are only used in amgetbitmap scans: */
+ TIDBitmap *tbm; /* bitmap being filled */
+ int64 ntids; /* number of TIDs passed to bitmap */
+
+ /* These fields are only used in amgettuple scans: */
+ bool want_itup; /* are we reconstructing tuples? */
+ TupleDesc reconTupDesc; /* if so, descriptor for reconstructed tuples */
+ int nPtrs; /* number of TIDs found on current page */
+ int iPtr; /* index for scanning through same */
+ ItemPointerData heapPtrs[MaxIndexTuplesPerPage]; /* TIDs from cur page */
+ bool recheck[MaxIndexTuplesPerPage]; /* their recheck flags */
+ bool recheckDistances[MaxIndexTuplesPerPage]; /* distance recheck
+ * flags */
+ HeapTuple reconTups[MaxIndexTuplesPerPage]; /* reconstructed tuples */
+
+ /* distances (for recheck) */
+ IndexOrderByDistance *distances[MaxIndexTuplesPerPage];
+
+ /*
+ * Note: using MaxIndexTuplesPerPage above is a bit hokey since
+ * SpGistLeafTuples aren't exactly IndexTuples; however, they are larger,
+ * so this is safe.
+ */
+} SpGistScanOpaqueData;
+
+typedef SpGistScanOpaqueData *SpGistScanOpaque;
+
+/*
+ * This struct is what we actually keep in index->rd_amcache. It includes
+ * static configuration information as well as the lastUsedPages cache.
+ */
+typedef struct SpGistCache
+{
+ spgConfigOut config; /* filled in by opclass config method */
+
+ SpGistTypeDesc attType; /* type of values to be indexed/restored */
+ SpGistTypeDesc attLeafType; /* type of leaf-tuple values */
+ SpGistTypeDesc attPrefixType; /* type of inner-tuple prefix values */
+ SpGistTypeDesc attLabelType; /* type of node label values */
+
+ SpGistLUPCache lastUsedPages; /* local storage of last-used info */
+} SpGistCache;
+
+
+/*
+ * SPGiST tuple types. Note: inner, leaf, and dead tuple structs
+ * must have the same tupstate field in the same position! Real inner and
+ * leaf tuples always have tupstate = LIVE; if the state is something else,
+ * use the SpGistDeadTuple struct to inspect the tuple.
+ */
+
+/* values of tupstate (see README for more info) */
+#define SPGIST_LIVE 0 /* normal live tuple (either inner or leaf) */
+#define SPGIST_REDIRECT 1 /* temporary redirection placeholder */
+#define SPGIST_DEAD 2 /* dead, cannot be removed because of links */
+#define SPGIST_PLACEHOLDER 3 /* placeholder, used to preserve offsets */
+
+/*
+ * SPGiST inner tuple: list of "nodes" that subdivide a set of tuples
+ *
+ * Inner tuple layout:
+ * header/optional prefix/array of nodes, which are SpGistNodeTuples
+ *
+ * size and prefixSize must be multiples of MAXALIGN
+ *
+ * If the prefix datum is of a pass-by-value type, it is stored in its
+ * Datum representation, that is its on-disk representation is of length
+ * sizeof(Datum). This is a fairly unfortunate choice, because in no other
+ * place does Postgres use Datum as an on-disk representation; it creates
+ * an unnecessary incompatibility between 32-bit and 64-bit builds. But the
+ * compatibility loss is mostly theoretical since MAXIMUM_ALIGNOF typically
+ * differs between such builds, too. Anyway we're stuck with it now.
+ */
+typedef struct SpGistInnerTupleData
+{
+ unsigned int tupstate:2, /* LIVE/REDIRECT/DEAD/PLACEHOLDER */
+ allTheSame:1, /* all nodes in tuple are equivalent */
+ nNodes:13, /* number of nodes within inner tuple */
+ prefixSize:16; /* size of prefix, or 0 if none */
+ uint16 size; /* total size of inner tuple */
+ /* On most machines there will be a couple of wasted bytes here */
+ /* prefix datum follows, then nodes */
+} SpGistInnerTupleData;
+
+typedef SpGistInnerTupleData *SpGistInnerTuple;
+
+/* these must match largest values that fit in bit fields declared above */
+#define SGITMAXNNODES 0x1FFF
+#define SGITMAXPREFIXSIZE 0xFFFF
+#define SGITMAXSIZE 0xFFFF
+
+#define SGITHDRSZ MAXALIGN(sizeof(SpGistInnerTupleData))
+#define _SGITDATA(x) (((char *) (x)) + SGITHDRSZ)
+#define SGITDATAPTR(x) ((x)->prefixSize ? _SGITDATA(x) : NULL)
+#define SGITDATUM(x, s) ((x)->prefixSize ? \
+ ((s)->attPrefixType.attbyval ? \
+ *(Datum *) _SGITDATA(x) : \
+ PointerGetDatum(_SGITDATA(x))) \
+ : (Datum) 0)
+#define SGITNODEPTR(x) ((SpGistNodeTuple) (_SGITDATA(x) + (x)->prefixSize))
+
+/* Macro for iterating through the nodes of an inner tuple */
+#define SGITITERATE(x, i, nt) \
+ for ((i) = 0, (nt) = SGITNODEPTR(x); \
+ (i) < (x)->nNodes; \
+ (i)++, (nt) = (SpGistNodeTuple) (((char *) (nt)) + IndexTupleSize(nt)))
+
+/*
+ * SPGiST node tuple: one node within an inner tuple
+ *
+ * Node tuples use the same header as ordinary Postgres IndexTuples, but
+ * we do not use a null bitmap, because we know there is only one column
+ * so the INDEX_NULL_MASK bit suffices. Also, pass-by-value datums are
+ * stored in Datum form, the same convention as for inner tuple prefixes.
+ */
+
+typedef IndexTupleData SpGistNodeTupleData;
+
+typedef SpGistNodeTupleData *SpGistNodeTuple;
+
+#define SGNTHDRSZ MAXALIGN(sizeof(SpGistNodeTupleData))
+#define SGNTDATAPTR(x) (((char *) (x)) + SGNTHDRSZ)
+#define SGNTDATUM(x, s) ((s)->attLabelType.attbyval ? \
+ *(Datum *) SGNTDATAPTR(x) : \
+ PointerGetDatum(SGNTDATAPTR(x)))
+
+/*
+ * SPGiST leaf tuple: carries a leaf datum and a heap tuple TID,
+ * and optionally some "included" columns.
+ *
+ * In the simplest case, the leaf datum is the same as the indexed value;
+ * but it could also be a suffix or some other sort of delta that permits
+ * reconstruction given knowledge of the prefix path traversed to get here.
+ * Any included columns are stored without modification.
+ *
+ * A nulls bitmap is present if there are included columns AND any of the
+ * datums are NULL. We do not need a nulls bitmap for the case of a null
+ * leaf datum without included columns, as we can infer whether the leaf
+ * datum is null from whether the tuple is stored on a nulls page. (This
+ * provision is mostly for backwards compatibility, but it does save space
+ * on 32-bit machines.) As with other PG index tuple designs, if the nulls
+ * bitmap exists then it's of size INDEX_MAX_KEYS bits regardless of the
+ * actual number of attributes. For the usual choice of INDEX_MAX_KEYS,
+ * this costs nothing because of alignment considerations.
+ *
+ * The size field is wider than could possibly be needed for an on-disk leaf
+ * tuple, but this allows us to form leaf tuples even when the datum is too
+ * wide to be stored immediately, and it costs nothing because of alignment
+ * considerations.
+ *
+ * t_info holds the nextOffset field (14 bits wide, enough for supported
+ * page sizes) plus the has-nulls-bitmap flag bit; another flag bit is free.
+ *
+ * Normally, nextOffset links to the next tuple belonging to the same parent
+ * node (which must be on the same page), or it's 0 if there is no next tuple.
+ * But when the root page is a leaf page, we don't chain its tuples,
+ * so nextOffset is always 0 on the root.
+ *
+ * size must be a multiple of MAXALIGN; also, it must be at least SGDTSIZE
+ * so that the tuple can be converted to REDIRECT status later. (This
+ * restriction only adds bytes for a NULL leaf datum stored on a 32-bit
+ * machine; otherwise alignment restrictions force it anyway.)
+ */
+typedef struct SpGistLeafTupleData
+{
+ unsigned int tupstate:2, /* LIVE/REDIRECT/DEAD/PLACEHOLDER */
+ size:30; /* large enough for any palloc'able value */
+ uint16 t_info; /* nextOffset, which links to the next tuple
+ * in chain, plus two flag bits */
+ ItemPointerData heapPtr; /* TID of represented heap tuple */
+ /* nulls bitmap follows if the flag bit for it is set */
+ /* leaf datum, then any included datums, follows on a MAXALIGN boundary */
+} SpGistLeafTupleData;
+
+/* Macros to access nextOffset and bit fields inside t_info */
+#define SGLT_GET_NEXTOFFSET(spgLeafTuple) \
+ ((spgLeafTuple)->t_info & 0x3FFF)
+#define SGLT_GET_HASNULLMASK(spgLeafTuple) \
+ (((spgLeafTuple)->t_info & 0x8000) ? true : false)
+#define SGLT_SET_NEXTOFFSET(spgLeafTuple, offsetNumber) \
+ ((spgLeafTuple)->t_info = \
+ ((spgLeafTuple)->t_info & 0xC000) | ((offsetNumber) & 0x3FFF))
+#define SGLT_SET_HASNULLMASK(spgLeafTuple, hasnulls) \
+ ((spgLeafTuple)->t_info = \
+ ((spgLeafTuple)->t_info & 0x7FFF) | ((hasnulls) ? 0x8000 : 0))
+
+#define SGLTHDRSZ(hasnulls) \
+ ((hasnulls) ? MAXALIGN(sizeof(SpGistLeafTupleData) + \
+ sizeof(IndexAttributeBitMapData)) : \
+ MAXALIGN(sizeof(SpGistLeafTupleData)))
+#define SGLTDATAPTR(x) (((char *) (x)) + SGLTHDRSZ(SGLT_GET_HASNULLMASK(x)))
+#define SGLTDATUM(x, s) fetch_att(SGLTDATAPTR(x), \
+ (s)->attLeafType.attbyval, \
+ (s)->attLeafType.attlen)
+
+/*
+ * SPGiST dead tuple: declaration for examining non-live tuples
+ *
+ * The tupstate field of this struct must match those of regular inner and
+ * leaf tuples, and its size field must match a leaf tuple's.
+ * Also, the pointer field must be in the same place as a leaf tuple's heapPtr
+ * field, to satisfy some Asserts that we make when replacing a leaf tuple
+ * with a dead tuple.
+ * We don't use t_info, but it's needed to align the pointer field.
+ * pointer and xid are only valid when tupstate = REDIRECT.
+ */
+typedef struct SpGistDeadTupleData
+{
+ unsigned int tupstate:2, /* LIVE/REDIRECT/DEAD/PLACEHOLDER */
+ size:30;
+ uint16 t_info; /* not used in dead tuples */
+ ItemPointerData pointer; /* redirection inside index */
+ TransactionId xid; /* ID of xact that inserted this tuple */
+} SpGistDeadTupleData;
+
+typedef SpGistDeadTupleData *SpGistDeadTuple;
+
+#define SGDTSIZE MAXALIGN(sizeof(SpGistDeadTupleData))
+
+/*
+ * Macros for doing free-space calculations. Note that when adding up the
+ * space needed for tuples, we always consider each tuple to need the tuple's
+ * size plus sizeof(ItemIdData) (for the line pointer). This works correctly
+ * so long as tuple sizes are always maxaligned.
+ */
+
+/* Page capacity after allowing for fixed header and special space */
+#define SPGIST_PAGE_CAPACITY \
+ MAXALIGN_DOWN(BLCKSZ - \
+ SizeOfPageHeaderData - \
+ MAXALIGN(sizeof(SpGistPageOpaqueData)))
+
+/*
+ * Compute free space on page, assuming that up to n placeholders can be
+ * recycled if present (n should be the number of tuples to be inserted)
+ */
+#define SpGistPageGetFreeSpace(p, n) \
+ (PageGetExactFreeSpace(p) + \
+ Min(SpGistPageGetOpaque(p)->nPlaceholder, n) * \
+ (SGDTSIZE + sizeof(ItemIdData)))
+
+/*
+ * XLOG stuff
+ */
+
+#define STORE_STATE(s, d) \
+ do { \
+ (d).myXid = (s)->myXid; \
+ (d).isBuild = (s)->isBuild; \
+ } while(0)
+
+/*
+ * The "flags" argument for SpGistGetBuffer should be either GBUF_LEAF to
+ * get a leaf page, or GBUF_INNER_PARITY(blockNumber) to get an inner
+ * page in the same triple-parity group as the specified block number.
+ * (Typically, this should be GBUF_INNER_PARITY(parentBlockNumber + 1)
+ * to follow the rule described in spgist/README.)
+ * In addition, GBUF_NULLS can be OR'd in to get a page for storage of
+ * null-valued tuples.
+ *
+ * Note: these flag values are used as indexes into lastUsedPages.
+ */
+#define GBUF_LEAF 0x03
+#define GBUF_INNER_PARITY(x) ((x) % 3)
+#define GBUF_NULLS 0x04
+
+#define GBUF_PARITY_MASK 0x03
+#define GBUF_REQ_LEAF(flags) (((flags) & GBUF_PARITY_MASK) == GBUF_LEAF)
+#define GBUF_REQ_NULLS(flags) ((flags) & GBUF_NULLS)
+
+/* spgutils.c */
+
+/* reloption parameters */
+#define SPGIST_MIN_FILLFACTOR 10
+#define SPGIST_DEFAULT_FILLFACTOR 80
+
+extern SpGistCache *spgGetCache(Relation index);
+extern TupleDesc getSpGistTupleDesc(Relation index, SpGistTypeDesc *keyType);
+extern void initSpGistState(SpGistState *state, Relation index);
+extern Buffer SpGistNewBuffer(Relation index);
+extern void SpGistUpdateMetaPage(Relation index);
+extern Buffer SpGistGetBuffer(Relation index, int flags,
+ int needSpace, bool *isNew);
+extern void SpGistSetLastUsedPage(Relation index, Buffer buffer);
+extern void SpGistInitPage(Page page, uint16 f);
+extern void SpGistInitBuffer(Buffer b, uint16 f);
+extern void SpGistInitMetapage(Page page);
+extern unsigned int SpGistGetInnerTypeSize(SpGistTypeDesc *att, Datum datum);
+extern Size SpGistGetLeafTupleSize(TupleDesc tupleDescriptor,
+ Datum *datums, bool *isnulls);
+extern SpGistLeafTuple spgFormLeafTuple(SpGistState *state,
+ ItemPointer heapPtr,
+ Datum *datums, bool *isnulls);
+extern SpGistNodeTuple spgFormNodeTuple(SpGistState *state,
+ Datum label, bool isnull);
+extern SpGistInnerTuple spgFormInnerTuple(SpGistState *state,
+ bool hasPrefix, Datum prefix,
+ int nNodes, SpGistNodeTuple *nodes);
+extern SpGistDeadTuple spgFormDeadTuple(SpGistState *state, int tupstate,
+ BlockNumber blkno, OffsetNumber offnum);
+extern void spgDeformLeafTuple(SpGistLeafTuple tup, TupleDesc tupleDescriptor,
+ Datum *datums, bool *isnulls,
+ bool keyColumnIsNull);
+extern Datum *spgExtractNodeLabels(SpGistState *state,
+ SpGistInnerTuple innerTuple);
+extern OffsetNumber SpGistPageAddNewItem(SpGistState *state, Page page,
+ Item item, Size size,
+ OffsetNumber *startOffset,
+ bool errorOK);
+extern bool spgproperty(Oid index_oid, int attno,
+ IndexAMProperty prop, const char *propname,
+ bool *res, bool *isnull);
+
+/* spgdoinsert.c */
+extern void spgUpdateNodeLink(SpGistInnerTuple tup, int nodeN,
+ BlockNumber blkno, OffsetNumber offset);
+extern void spgPageIndexMultiDelete(SpGistState *state, Page page,
+ OffsetNumber *itemnos, int nitems,
+ int firststate, int reststate,
+ BlockNumber blkno, OffsetNumber offnum);
+extern bool spgdoinsert(Relation index, SpGistState *state,
+ ItemPointer heapPtr, Datum *datums, bool *isnulls);
+
+/* spgproc.c */
+extern double *spg_key_orderbys_distances(Datum key, bool isLeaf,
+ ScanKey orderbys, int norderbys);
+extern BOX *box_copy(BOX *orig);
+
+#endif /* SPGIST_PRIVATE_H */
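As a small usage sketch for the inner-tuple layout macros above, the hypothetical helper below walks an inner tuple's nodes with SGITITERATE and counts those whose downlink TID is set. SpGistNodeTuple shares IndexTupleData's t_tid field, which is what spgUpdateNodeLink fills in; the helper itself is not part of the SP-GiST code.

#include "postgres.h"
#include "access/spgist_private.h"
#include "storage/itemptr.h"

/* Hypothetical helper: count nodes of an inner tuple that have a downlink */
static int
spg_count_downlinks(SpGistInnerTuple innerTuple)
{
	SpGistNodeTuple node;
	int			i;
	int			count = 0;

	SGITITERATE(innerTuple, i, node)
	{
		if (ItemPointerIsValid(&node->t_tid))
			count++;
	}
	return count;
}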
diff --git a/src/include/access/spgxlog.h b/src/include/access/spgxlog.h
new file mode 100644
index 0000000..69405b5
--- /dev/null
+++ b/src/include/access/spgxlog.h
@@ -0,0 +1,257 @@
+/*-------------------------------------------------------------------------
+ *
+ * spgxlog.h
+ * xlog declarations for SP-GiST access method.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/spgxlog.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SPGXLOG_H
+#define SPGXLOG_H
+
+#include "access/xlogreader.h"
+#include "lib/stringinfo.h"
+#include "storage/off.h"
+
+/* XLOG record types for SPGiST */
+ /* #define XLOG_SPGIST_CREATE_INDEX 0x00 */ /* not used anymore */
+#define XLOG_SPGIST_ADD_LEAF 0x10
+#define XLOG_SPGIST_MOVE_LEAFS 0x20
+#define XLOG_SPGIST_ADD_NODE 0x30
+#define XLOG_SPGIST_SPLIT_TUPLE 0x40
+#define XLOG_SPGIST_PICKSPLIT 0x50
+#define XLOG_SPGIST_VACUUM_LEAF 0x60
+#define XLOG_SPGIST_VACUUM_ROOT 0x70
+#define XLOG_SPGIST_VACUUM_REDIRECT 0x80
+
+/*
+ * Some redo functions need an SpGistState, although only a few of its fields
+ * need to be valid. spgxlogState carries the required info in xlog records.
+ * (See fillFakeState in spgxlog.c for more comments.)
+ */
+typedef struct spgxlogState
+{
+ TransactionId myXid;
+ bool isBuild;
+} spgxlogState;
+
+/*
+ * Backup Blk 0: destination page for leaf tuple
+ * Backup Blk 1: parent page (if any)
+ */
+typedef struct spgxlogAddLeaf
+{
+ bool newPage; /* init dest page? */
+ bool storesNulls; /* page is in the nulls tree? */
+ OffsetNumber offnumLeaf; /* offset where leaf tuple gets placed */
+ OffsetNumber offnumHeadLeaf; /* offset of head tuple in chain, if any */
+
+ OffsetNumber offnumParent; /* where the parent downlink is, if any */
+ uint16 nodeI;
+
+ /* new leaf tuple follows (unaligned!) */
+} spgxlogAddLeaf;
+
+/*
+ * Backup Blk 0: source leaf page
+ * Backup Blk 1: destination leaf page
+ * Backup Blk 2: parent page
+ */
+typedef struct spgxlogMoveLeafs
+{
+ uint16 nMoves; /* number of tuples moved from source page */
+ bool newPage; /* init dest page? */
+ bool replaceDead; /* are we replacing a DEAD source tuple? */
+ bool storesNulls; /* pages are in the nulls tree? */
+
+ /* where the parent downlink is */
+ OffsetNumber offnumParent;
+ uint16 nodeI;
+
+ spgxlogState stateSrc;
+
+ /*----------
+ * data follows:
+ * array of deleted tuple numbers, length nMoves
+ * array of inserted tuple numbers, length nMoves + 1 or 1
+ * list of leaf tuples, length nMoves + 1 or 1 (unaligned!)
+ *
+ * Note: if replaceDead is true then there is only one inserted tuple
+ * number and only one leaf tuple in the data, because we are not copying
+ * the dead tuple from the source
+ *----------
+ */
+ OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER];
+} spgxlogMoveLeafs;
+
+#define SizeOfSpgxlogMoveLeafs offsetof(spgxlogMoveLeafs, offsets)
+
+/*
+ * Backup Blk 0: original page
+ * Backup Blk 1: where new tuple goes, if not same place
+ * Backup Blk 2: where parent downlink is, if updated and different from
+ * the old and new
+ */
+typedef struct spgxlogAddNode
+{
+ /*
+ * Offset of the original inner tuple, in the original page (on backup
+ * block 0).
+ */
+ OffsetNumber offnum;
+
+ /*
+ * Offset of the new tuple, on the new page (on backup block 1). Invalid
+ * if we overwrote the old tuple in the original page.
+ */
+ OffsetNumber offnumNew;
+ bool newPage; /* init new page? */
+
+ /*----
+ * Where is the parent downlink? parentBlk indicates which page it's on,
+ * and offnumParent is the offset within the page. The possible values for
+ * parentBlk are:
+ *
+ * 0: parent == original page
+ * 1: parent == new page
+ * 2: parent == different page (blk ref 2)
+ * -1: parent not updated
+ *----
+ */
+ int8 parentBlk;
+ OffsetNumber offnumParent; /* offset within the parent page */
+
+ uint16 nodeI;
+
+ spgxlogState stateSrc;
+
+ /*
+ * updated inner tuple follows (unaligned!)
+ */
+} spgxlogAddNode;
+
+/*
+ * Backup Blk 0: where the prefix tuple goes
+ * Backup Blk 1: where the postfix tuple goes (if different page)
+ */
+typedef struct spgxlogSplitTuple
+{
+ /* where the prefix tuple goes */
+ OffsetNumber offnumPrefix;
+
+ /* where the postfix tuple goes */
+ OffsetNumber offnumPostfix;
+ bool newPage; /* need to init that page? */
+ bool postfixBlkSame; /* was postfix tuple put on same page as
+ * prefix? */
+
+ /*
+ * new prefix inner tuple follows, then new postfix inner tuple (both are
+ * unaligned!)
+ */
+} spgxlogSplitTuple;
+
+/*
+ * Buffer references in the rdata array are:
+ * Backup Blk 0: Src page (only if not root)
+ * Backup Blk 1: Dest page (if used)
+ * Backup Blk 2: Inner page
+ * Backup Blk 3: Parent page (if any, and different from Inner)
+ */
+typedef struct spgxlogPickSplit
+{
+ bool isRootSplit;
+
+ uint16 nDelete; /* n to delete from Src */
+ uint16 nInsert; /* n to insert on Src and/or Dest */
+ bool initSrc; /* re-init the Src page? */
+ bool initDest; /* re-init the Dest page? */
+
+ /* where to put new inner tuple */
+ OffsetNumber offnumInner;
+ bool initInner; /* re-init the Inner page? */
+
+ bool storesNulls; /* pages are in the nulls tree? */
+
+ /* where the parent downlink is, if any */
+ bool innerIsParent; /* is parent the same as inner page? */
+ OffsetNumber offnumParent;
+ uint16 nodeI;
+
+ spgxlogState stateSrc;
+
+ /*----------
+ * data follows:
+ * array of deleted tuple numbers, length nDelete
+ * array of inserted tuple numbers, length nInsert
+ * array of page selector bytes for inserted tuples, length nInsert
+ * new inner tuple (unaligned!)
+ * list of leaf tuples, length nInsert (unaligned!)
+ *----------
+ */
+ OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER];
+} spgxlogPickSplit;
+
+#define SizeOfSpgxlogPickSplit offsetof(spgxlogPickSplit, offsets)
+
+typedef struct spgxlogVacuumLeaf
+{
+ uint16 nDead; /* number of tuples to become DEAD */
+ uint16 nPlaceholder; /* number of tuples to become PLACEHOLDER */
+ uint16 nMove; /* number of tuples to move */
+ uint16 nChain; /* number of tuples to re-chain */
+
+ spgxlogState stateSrc;
+
+ /*----------
+ * data follows:
+ * tuple numbers to become DEAD
+ * tuple numbers to become PLACEHOLDER
+ * tuple numbers to move from (and replace with PLACEHOLDER)
+ * tuple numbers to move to (replacing what is there)
+ * tuple numbers to update nextOffset links of
+ * tuple numbers to insert in nextOffset links
+ *----------
+ */
+ OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER];
+} spgxlogVacuumLeaf;
+
+#define SizeOfSpgxlogVacuumLeaf offsetof(spgxlogVacuumLeaf, offsets)
+
+typedef struct spgxlogVacuumRoot
+{
+ /* vacuum a root page when it is also a leaf */
+ uint16 nDelete; /* number of tuples to delete */
+
+ spgxlogState stateSrc;
+
+ /* offsets of tuples to delete follow */
+ OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER];
+} spgxlogVacuumRoot;
+
+#define SizeOfSpgxlogVacuumRoot offsetof(spgxlogVacuumRoot, offsets)
+
+typedef struct spgxlogVacuumRedirect
+{
+ uint16 nToPlaceholder; /* number of redirects to make placeholders */
+ OffsetNumber firstPlaceholder; /* first placeholder tuple to remove */
+ TransactionId newestRedirectXid; /* newest XID of removed redirects */
+
+ /* offsets of redirect tuples to make placeholders follow */
+ OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER];
+} spgxlogVacuumRedirect;
+
+#define SizeOfSpgxlogVacuumRedirect offsetof(spgxlogVacuumRedirect, offsets)
+
+extern void spg_redo(XLogReaderState *record);
+extern void spg_desc(StringInfo buf, XLogReaderState *record);
+extern const char *spg_identify(uint8 info);
+extern void spg_xlog_startup(void);
+extern void spg_xlog_cleanup(void);
+extern void spg_mask(char *pagedata, BlockNumber blkno);
+
+#endif /* SPGXLOG_H */
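To connect the XLOG_SPGIST_* record-type constants with the spg_redo entry point declared above, here is a sketch of the usual rmgr redo dispatch pattern. The spgRedo* helper names are only placeholders for the private routines in spgxlog.c and are not declared in this header.

#include "postgres.h"
#include "access/spgxlog.h"
#include "access/xlogreader.h"

/* Sketch of the typical redo dispatch on the record's info bits */
static void
spg_redo_sketch(XLogReaderState *record)
{
	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;

	switch (info)
	{
		case XLOG_SPGIST_ADD_LEAF:
			/* spgRedoAddLeaf(record); */
			break;
		case XLOG_SPGIST_MOVE_LEAFS:
			/* spgRedoMoveLeafs(record); */
			break;
		case XLOG_SPGIST_VACUUM_REDIRECT:
			/* spgRedoVacuumRedirect(record); */
			break;
		/* ... one case per XLOG_SPGIST_* constant defined above ... */
		default:
			elog(PANIC, "spg_redo: unknown op code %u", info);
	}
}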
diff --git a/src/include/access/stratnum.h b/src/include/access/stratnum.h
new file mode 100644
index 0000000..fad4b69
--- /dev/null
+++ b/src/include/access/stratnum.h
@@ -0,0 +1,85 @@
+/*-------------------------------------------------------------------------
+ *
+ * stratnum.h
+ * POSTGRES strategy number definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/stratnum.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef STRATNUM_H
+#define STRATNUM_H
+
+/*
+ * Strategy numbers identify the semantics that particular operators have
+ * with respect to particular operator classes. In some cases a strategy
+ * subtype (an OID) is used as further information.
+ */
+typedef uint16 StrategyNumber;
+
+#define InvalidStrategy ((StrategyNumber) 0)
+
+/*
+ * Strategy numbers for B-tree indexes.
+ */
+#define BTLessStrategyNumber 1
+#define BTLessEqualStrategyNumber 2
+#define BTEqualStrategyNumber 3
+#define BTGreaterEqualStrategyNumber 4
+#define BTGreaterStrategyNumber 5
+
+#define BTMaxStrategyNumber 5
+
+/*
+ * Strategy numbers for hash indexes. There's only one valid strategy for
+ * hashing: equality.
+ */
+#define HTEqualStrategyNumber 1
+
+#define HTMaxStrategyNumber 1
+
+/*
+ * Strategy numbers common to (some) GiST, SP-GiST and BRIN opclasses.
+ *
+ * The first few of these come from the R-Tree indexing method (hence the
+ * names); the others have been added over time as they have been needed.
+ */
+#define RTLeftStrategyNumber 1 /* for << */
+#define RTOverLeftStrategyNumber 2 /* for &< */
+#define RTOverlapStrategyNumber 3 /* for && */
+#define RTOverRightStrategyNumber 4 /* for &> */
+#define RTRightStrategyNumber 5 /* for >> */
+#define RTSameStrategyNumber 6 /* for ~= */
+#define RTContainsStrategyNumber 7 /* for @> */
+#define RTContainedByStrategyNumber 8 /* for <@ */
+#define RTOverBelowStrategyNumber 9 /* for &<| */
+#define RTBelowStrategyNumber 10 /* for <<| */
+#define RTAboveStrategyNumber 11 /* for |>> */
+#define RTOverAboveStrategyNumber 12 /* for |&> */
+#define RTOldContainsStrategyNumber 13 /* for old spelling of @> */
+#define RTOldContainedByStrategyNumber 14 /* for old spelling of <@ */
+#define RTKNNSearchStrategyNumber 15 /* for <-> (distance) */
+#define RTContainsElemStrategyNumber 16 /* for range types @> elem */
+#define RTAdjacentStrategyNumber 17 /* for -|- */
+#define RTEqualStrategyNumber 18 /* for = */
+#define RTNotEqualStrategyNumber 19 /* for != */
+#define RTLessStrategyNumber 20 /* for < */
+#define RTLessEqualStrategyNumber 21 /* for <= */
+#define RTGreaterStrategyNumber 22 /* for > */
+#define RTGreaterEqualStrategyNumber 23 /* for >= */
+#define RTSubStrategyNumber 24 /* for inet >> */
+#define RTSubEqualStrategyNumber 25 /* for inet <<= */
+#define RTSuperStrategyNumber 26 /* for inet << */
+#define RTSuperEqualStrategyNumber 27 /* for inet >>= */
+#define RTPrefixStrategyNumber 28 /* for text ^@ */
+#define RTOldBelowStrategyNumber 29 /* for old spelling of <<| */
+#define RTOldAboveStrategyNumber 30 /* for old spelling of |>> */
+
+#define RTMaxStrategyNumber 30
+
+
+#endif /* STRATNUM_H */
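As a worked example of what these numbers mean in practice, the sketch below translates a B-tree strategy number plus a three-way comparison result into the boolean an index qual of that strategy expects; this is the kind of mapping opclass-support and planner code performs. The helper name is invented for illustration.

#include "postgres.h"
#include "access/stratnum.h"

/* Illustrative: does a three-way comparison result satisfy a btree strategy? */
static bool
btree_strategy_satisfied(StrategyNumber strategy, int cmp)
{
	switch (strategy)
	{
		case BTLessStrategyNumber:
			return cmp < 0;
		case BTLessEqualStrategyNumber:
			return cmp <= 0;
		case BTEqualStrategyNumber:
			return cmp == 0;
		case BTGreaterEqualStrategyNumber:
			return cmp >= 0;
		case BTGreaterStrategyNumber:
			return cmp > 0;
		default:
			elog(ERROR, "unrecognized strategy number: %d", strategy);
			return false;		/* keep compiler quiet */
	}
}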
diff --git a/src/include/access/subtrans.h b/src/include/access/subtrans.h
new file mode 100644
index 0000000..d0ab44a
--- /dev/null
+++ b/src/include/access/subtrans.h
@@ -0,0 +1,29 @@
+/*
+ * subtrans.h
+ *
+ * PostgreSQL subtransaction-log manager
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/subtrans.h
+ */
+#ifndef SUBTRANS_H
+#define SUBTRANS_H
+
+/* Number of SLRU buffers to use for subtrans */
+#define NUM_SUBTRANS_BUFFERS 32
+
+extern void SubTransSetParent(TransactionId xid, TransactionId parent);
+extern TransactionId SubTransGetParent(TransactionId xid);
+extern TransactionId SubTransGetTopmostTransaction(TransactionId xid);
+
+extern Size SUBTRANSShmemSize(void);
+extern void SUBTRANSShmemInit(void);
+extern void BootStrapSUBTRANS(void);
+extern void StartupSUBTRANS(TransactionId oldestActiveXID);
+extern void CheckPointSUBTRANS(void);
+extern void ExtendSUBTRANS(TransactionId newestXact);
+extern void TruncateSUBTRANS(TransactionId oldestXact);
+
+#endif /* SUBTRANS_H */
diff --git a/src/include/access/syncscan.h b/src/include/access/syncscan.h
new file mode 100644
index 0000000..7947f3c
--- /dev/null
+++ b/src/include/access/syncscan.h
@@ -0,0 +1,25 @@
+/*-------------------------------------------------------------------------
+ *
+ * syncscan.h
+ * POSTGRES synchronous scan support functions.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/syncscan.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SYNCSCAN_H
+#define SYNCSCAN_H
+
+#include "storage/block.h"
+#include "utils/relcache.h"
+
+extern void ss_report_location(Relation rel, BlockNumber location);
+extern BlockNumber ss_get_location(Relation rel, BlockNumber relnblocks);
+extern void SyncScanShmemInit(void);
+extern Size SyncScanShmemSize(void);
+
+#endif
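A minimal sketch of how a scan might use this pair of functions: pick up the current position of any concurrent scans of the same relation, then report progress as blocks are consumed so later scans can join in. In the real heap code reports are throttled rather than issued for every block; the function below is illustrative only.

#include "postgres.h"
#include "access/syncscan.h"
#include "utils/rel.h"

/* Illustrative: circular scan that starts where other scans currently are */
static void
synchronized_seqscan_sketch(Relation rel, BlockNumber nblocks)
{
	BlockNumber start;
	BlockNumber blkno;

	if (nblocks == 0)
		return;					/* nothing to scan */

	start = ss_get_location(rel, nblocks);
	blkno = start;

	do
	{
		/* ... read and process block blkno here ... */

		ss_report_location(rel, blkno);

		blkno++;
		if (blkno >= nblocks)
			blkno = 0;			/* wrap around to the beginning */
	} while (blkno != start);
}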
diff --git a/src/include/access/sysattr.h b/src/include/access/sysattr.h
new file mode 100644
index 0000000..968257b
--- /dev/null
+++ b/src/include/access/sysattr.h
@@ -0,0 +1,29 @@
+/*-------------------------------------------------------------------------
+ *
+ * sysattr.h
+ * POSTGRES system attribute definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/sysattr.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SYSATTR_H
+#define SYSATTR_H
+
+
+/*
+ * Attribute numbers for the system-defined attributes
+ */
+#define SelfItemPointerAttributeNumber (-1)
+#define MinTransactionIdAttributeNumber (-2)
+#define MinCommandIdAttributeNumber (-3)
+#define MaxTransactionIdAttributeNumber (-4)
+#define MaxCommandIdAttributeNumber (-5)
+#define TableOidAttributeNumber (-6)
+#define FirstLowInvalidHeapAttributeNumber (-7)
+
+#endif /* SYSATTR_H */
diff --git a/src/include/access/table.h b/src/include/access/table.h
new file mode 100644
index 0000000..5e4d9dd
--- /dev/null
+++ b/src/include/access/table.h
@@ -0,0 +1,28 @@
+/*-------------------------------------------------------------------------
+ *
+ * table.h
+ * Generic routines for table related code.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/table.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef TABLE_H
+#define TABLE_H
+
+#include "nodes/primnodes.h"
+#include "storage/lockdefs.h"
+#include "utils/relcache.h"
+
+extern Relation table_open(Oid relationId, LOCKMODE lockmode);
+extern Relation table_openrv(const RangeVar *relation, LOCKMODE lockmode);
+extern Relation table_openrv_extended(const RangeVar *relation,
+ LOCKMODE lockmode, bool missing_ok);
+extern Relation try_table_open(Oid relationId, LOCKMODE lockmode);
+extern void table_close(Relation relation, LOCKMODE lockmode);
+
+#endif /* TABLE_H */
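A short usage sketch for the open/close pair above, with a hypothetical function name: take a lock when opening, and pass the same lock mode to table_close to release it (or NoLock to keep holding it until end of transaction).

#include "postgres.h"
#include "access/table.h"
#include "utils/rel.h"

/* Hypothetical example: open a table by OID, inspect it, close it again */
static void
report_attribute_count(Oid relid)
{
	Relation	rel = table_open(relid, AccessShareLock);

	elog(DEBUG1, "relation \"%s\" has %d attributes",
		 RelationGetRelationName(rel),
		 RelationGetNumberOfAttributes(rel));

	/* releases the lock too; pass NoLock instead to keep it until xact end */
	table_close(rel, AccessShareLock);
}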
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
new file mode 100644
index 0000000..9f1e4a1
--- /dev/null
+++ b/src/include/access/tableam.h
@@ -0,0 +1,2075 @@
+/*-------------------------------------------------------------------------
+ *
+ * tableam.h
+ * POSTGRES table access method definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/tableam.h
+ *
+ * NOTES
+ * See tableam.sgml for higher level documentation.
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef TABLEAM_H
+#define TABLEAM_H
+
+#include "access/relscan.h"
+#include "access/sdir.h"
+#include "access/xact.h"
+#include "utils/guc.h"
+#include "utils/rel.h"
+#include "utils/snapshot.h"
+
+
+#define DEFAULT_TABLE_ACCESS_METHOD "heap"
+
+/* GUCs */
+extern char *default_table_access_method;
+extern bool synchronize_seqscans;
+
+
+struct BulkInsertStateData;
+struct IndexInfo;
+struct SampleScanState;
+struct TBMIterateResult;
+struct VacuumParams;
+struct ValidateIndexState;
+
+/*
+ * Bitmask values for the flags argument to the scan_begin callback.
+ */
+typedef enum ScanOptions
+{
+ /* one of SO_TYPE_* may be specified */
+ SO_TYPE_SEQSCAN = 1 << 0,
+ SO_TYPE_BITMAPSCAN = 1 << 1,
+ SO_TYPE_SAMPLESCAN = 1 << 2,
+ SO_TYPE_TIDSCAN = 1 << 3,
+ SO_TYPE_TIDRANGESCAN = 1 << 4,
+ SO_TYPE_ANALYZE = 1 << 5,
+
+ /* several of SO_ALLOW_* may be specified */
+ /* allow or disallow use of access strategy */
+ SO_ALLOW_STRAT = 1 << 6,
+ /* report location to syncscan logic? */
+ SO_ALLOW_SYNC = 1 << 7,
+ /* verify visibility page-at-a-time? */
+ SO_ALLOW_PAGEMODE = 1 << 8,
+
+ /* unregister snapshot at scan end? */
+ SO_TEMP_SNAPSHOT = 1 << 9
+} ScanOptions;
+
+/*
+ * Result codes for table_{update,delete,lock_tuple}, and for visibility
+ * routines inside table AMs.
+ */
+typedef enum TM_Result
+{
+ /*
+ * Signals that the action succeeded (i.e. update/delete performed, lock
+ * was acquired)
+ */
+ TM_Ok,
+
+ /* The affected tuple wasn't visible to the relevant snapshot */
+ TM_Invisible,
+
+ /* The affected tuple was already modified by the calling backend */
+ TM_SelfModified,
+
+ /*
+ * The affected tuple was updated by another transaction. This includes
+ * the case where tuple was moved to another partition.
+ */
+ TM_Updated,
+
+ /* The affected tuple was deleted by another transaction */
+ TM_Deleted,
+
+ /*
+ * The affected tuple is currently being modified by another session. This
+ * will only be returned if table_(update/delete/lock_tuple) are
+ * instructed not to wait.
+ */
+ TM_BeingModified,
+
+ /* lock couldn't be acquired, action skipped. Only used by lock_tuple */
+ TM_WouldBlock
+} TM_Result;
+
+/*
+ * When table_tuple_update, table_tuple_delete, or table_tuple_lock fail
+ * because the target tuple is already outdated, they fill in this struct to
+ * provide information to the caller about what happened.
+ *
+ * ctid is the target's ctid link: it is the same as the target's TID if the
+ * target was deleted, or the location of the replacement tuple if the target
+ * was updated.
+ *
+ * xmax is the outdating transaction's XID. If the caller wants to visit the
+ * replacement tuple, it must check that this matches before believing the
+ * replacement is really a match.
+ *
+ * cmax is the outdating command's CID, but only when the failure code is
+ * TM_SelfModified (i.e., something in the current transaction outdated the
+ * tuple); otherwise cmax is zero. (We make this restriction because
+ * HeapTupleHeaderGetCmax doesn't work for tuples outdated in other
+ * transactions.)
+ */
+typedef struct TM_FailureData
+{
+ ItemPointerData ctid;
+ TransactionId xmax;
+ CommandId cmax;
+ bool traversed;
+} TM_FailureData;
+
+/*
+ * State used when calling table_index_delete_tuples().
+ *
+ * Represents the status of table tuples, referenced by table TID and taken by
+ * index AM from index tuples. State consists of high level parameters of the
+ * deletion operation, plus two mutable palloc()'d arrays for information
+ * about the status of individual table tuples. These are conceptually one
+ * single array. Using two arrays keeps the TM_IndexDelete struct small,
+ * which makes sorting the first array (the deltids array) fast.
+ *
+ * Some index AM callers perform simple index tuple deletion (by specifying
+ * bottomup = false), and include only known-dead deltids. These known-dead
+ * entries are all marked knowndeletable = true directly (typically these are
+ * TIDs from LP_DEAD-marked index tuples), but that isn't strictly required.
+ *
+ * Callers that specify bottomup = true are "bottom-up index deletion"
+ * callers. The considerations for the tableam are more subtle with these
+ * callers because they ask the tableam to perform highly speculative work,
+ * and might only expect the tableam to check a small fraction of all entries.
+ * Caller is not allowed to specify knowndeletable = true for any entry
+ * because everything is highly speculative. Bottom-up caller provides
+ * context and hints to tableam -- see comments below for details on how index
+ * AMs and tableams should coordinate during bottom-up index deletion.
+ *
+ * Simple index deletion callers may ask the tableam to perform speculative
+ * work, too. This is a little like bottom-up deletion, but not too much.
+ * The tableam will only perform speculative work when it's practically free
+ * to do so in passing for simple deletion caller (while always performing
+ * whatever work is needed to enable knowndeletable/LP_DEAD index tuples to
+ * be deleted within index AM). This is the real reason why it's possible for
+ * simple index deletion caller to specify knowndeletable = false up front
+ * (this means "check if it's possible for me to delete corresponding index
+ * tuple when it's cheap to do so in passing"). The index AM should only
+ * include "extra" entries for index tuples whose TIDs point to a table block
+ * that tableam is expected to have to visit anyway (in the event of a block
+ * orientated tableam). The tableam isn't strictly obligated to check these
+ * "extra" TIDs, but a block-based AM should always manage to do so in
+ * practice.
+ *
+ * The final contents of the deltids/status arrays are interesting to callers
+ * that ask tableam to perform speculative work (i.e. when _any_ items have
+ * knowndeletable set to false up front). These index AM callers will
+ * naturally need to consult final state to determine which index tuples are
+ * in fact deletable.
+ *
+ * The index AM can keep track of which index tuple relates to which deltid by
+ * setting idxoffnum (and/or relying on each entry being uniquely identifiable
+ * using tid), which is important when the final contents of the array will
+ * need to be interpreted -- the array can shrink from initial size after
+ * tableam processing and/or have entries in a new order (tableam may sort
+ * deltids array for its own reasons). Bottom-up callers may find that final
+ * ndeltids is 0 on return from call to tableam, in which case no index tuple
+ * deletions are possible. Simple deletion callers can rely on any entries
+ * they know to be deletable appearing in the final array as deletable.
+ */
+typedef struct TM_IndexDelete
+{
+ ItemPointerData tid; /* table TID from index tuple */
+ int16 id; /* Offset into TM_IndexStatus array */
+} TM_IndexDelete;
+
+typedef struct TM_IndexStatus
+{
+ OffsetNumber idxoffnum; /* Index am page offset number */
+ bool knowndeletable; /* Currently known to be deletable? */
+
+ /* Bottom-up index deletion specific fields follow */
+ bool promising; /* Promising (duplicate) index tuple? */
+ int16 freespace; /* Space freed in index if deleted */
+} TM_IndexStatus;
+
+/*
+ * Index AM/tableam coordination is central to the design of bottom-up index
+ * deletion. The index AM provides hints about where to look to the tableam
+ * by marking some entries as "promising". Index AM does this with duplicate
+ * index tuples that are strongly suspected to be old versions left behind by
+ * UPDATEs that did not logically modify indexed values. Index AM may find it
+ * helpful to only mark entries as promising when they're thought to have been
+ * affected by such an UPDATE in the recent past.
+ *
+ * Bottom-up index deletion casts a wide net at first, usually by including
+ * all TIDs on a target index page. It is up to the tableam to worry about
+ * the cost of checking transaction status information. The tableam is in
+ * control, but needs careful guidance from the index AM. Index AM requests
+ * that bottomupfreespace target be met, while tableam measures progress
+ * towards that goal by tallying the per-entry freespace value for known
+ * deletable entries. (All !bottomup callers can just set these space related
+ * fields to zero.)
+ */
+typedef struct TM_IndexDeleteOp
+{
+ bool bottomup; /* Bottom-up (not simple) deletion? */
+ int bottomupfreespace; /* Bottom-up space target */
+
+ /* Mutable per-TID information follows (index AM initializes entries) */
+ int ndeltids; /* Current # of deltids/status elements */
+ TM_IndexDelete *deltids;
+ TM_IndexStatus *status;
+} TM_IndexDeleteOp;
+
+/* "options" flag bits for table_tuple_insert */
+/* TABLE_INSERT_SKIP_WAL was 0x0001; RelationNeedsWAL() now governs */
+#define TABLE_INSERT_SKIP_FSM 0x0002
+#define TABLE_INSERT_FROZEN 0x0004
+#define TABLE_INSERT_NO_LOGICAL 0x0008
+
+/* flag bits for table_tuple_lock */
+/* Follow tuples whose update is in progress if lock modes don't conflict */
+#define TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS (1 << 0)
+/* Follow update chain and lock latest version of tuple */
+#define TUPLE_LOCK_FLAG_FIND_LAST_VERSION (1 << 1)
+
+
+/* Typedef for callback function for table_index_build_scan */
+typedef void (*IndexBuildCallback) (Relation index,
+ ItemPointer tid,
+ Datum *values,
+ bool *isnull,
+ bool tupleIsAlive,
+ void *state);
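+
+/*
+ * A sketch of the shape such a callback typically takes; the
+ * ExampleBuildState struct and the actual index insertion are hypothetical
+ * AM-private details:
+ *
+ *		static void
+ *		example_build_callback(Relation index, ItemPointer tid, Datum *values,
+ *							   bool *isnull, bool tupleIsAlive, void *state)
+ *		{
+ *			ExampleBuildState *buildstate = (ExampleBuildState *) state;
+ *
+ *			... form an index tuple from values/isnull and insert it,
+ *			using tid as its table TID ...
+ *
+ *			buildstate->indtuples += 1;
+ *		}
+ *
+ * table_index_build_scan() invokes the callback once per tuple that should
+ * be present in the index.
+ */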
+
+/*
+ * API struct for a table AM. Note this must be allocated in a
+ * server-lifetime manner, typically as a static const struct, which then gets
+ * returned by FormData_pg_am.amhandler.
+ *
+ * In most cases it's not appropriate to call the callbacks directly, use the
+ * table_* wrapper functions instead.
+ *
+ * GetTableAmRoutine() asserts that required callbacks are filled in; remember
+ * to update it when adding a callback.
+ */
+typedef struct TableAmRoutine
+{
+ /* this must be set to T_TableAmRoutine */
+ NodeTag type;
+
+
+ /* ------------------------------------------------------------------------
+ * Slot related callbacks.
+ * ------------------------------------------------------------------------
+ */
+
+ /*
+ * Return slot implementation suitable for storing a tuple of this AM.
+ */
+ const TupleTableSlotOps *(*slot_callbacks) (Relation rel);
+
+
+ /* ------------------------------------------------------------------------
+ * Table scan callbacks.
+ * ------------------------------------------------------------------------
+ */
+
+ /*
+ * Start a scan of `rel`. The callback has to return a TableScanDesc,
+ * which will typically be embedded in a larger, AM specific, struct.
+ *
+ * If nkeys != 0, the results need to be filtered by those scan keys.
+ *
+ * pscan, if not NULL, will have already been initialized with
+ * parallelscan_initialize(), and has to be for the same relation. Will
+ * only be set coming from table_beginscan_parallel().
+ *
+ * `flags` is a bitmask indicating the type of scan (ScanOptions's
+ * SO_TYPE_*, currently only one may be specified), options controlling
+ * the scan's behaviour (ScanOptions's SO_ALLOW_*, several may be
+ * specified, an AM may ignore unsupported ones) and whether the snapshot
+ * needs to be deallocated at scan_end (ScanOptions's SO_TEMP_SNAPSHOT).
+ */
+ TableScanDesc (*scan_begin) (Relation rel,
+ Snapshot snapshot,
+ int nkeys, struct ScanKeyData *key,
+ ParallelTableScanDesc pscan,
+ uint32 flags);
+
+ /*
+ * Release resources and deallocate scan. If TableScanDesc.temp_snap,
+ * TableScanDesc.rs_snapshot needs to be unregistered.
+ */
+ void (*scan_end) (TableScanDesc scan);
+
+ /*
+ * Restart relation scan. If set_params is set to true, allow_{strat,
+ * sync, pagemode} (see scan_begin) changes should be taken into account.
+ */
+ void (*scan_rescan) (TableScanDesc scan, struct ScanKeyData *key,
+ bool set_params, bool allow_strat,
+ bool allow_sync, bool allow_pagemode);
+
+ /*
+ * Return next tuple from `scan`, store in slot.
+ */
+ bool (*scan_getnextslot) (TableScanDesc scan,
+ ScanDirection direction,
+ TupleTableSlot *slot);
+
+ /*-----------
+ * Optional functions to provide scanning for ranges of ItemPointers.
+ * Implementations must either provide both of these functions, or neither
+ * of them.
+ *
+ * Implementations of scan_set_tidrange must themselves handle
+ * ItemPointers of any value; i.e., they must handle each of the following:
+ *
+ * 1) mintid or maxtid is beyond the end of the table; and
+ * 2) mintid is above maxtid; and
+ * 3) item offset for mintid or maxtid is beyond the maximum offset
+ * allowed by the AM.
+ *
+ * Implementations can assume that scan_set_tidrange is always called
+ * before scan_getnextslot_tidrange or after scan_rescan and before any
+ * further calls to scan_getnextslot_tidrange.
+ */
+ void (*scan_set_tidrange) (TableScanDesc scan,
+ ItemPointer mintid,
+ ItemPointer maxtid);
+
+ /*
+ * Return next tuple from `scan` that's in the range of TIDs defined by
+ * scan_set_tidrange.
+ */
+ bool (*scan_getnextslot_tidrange) (TableScanDesc scan,
+ ScanDirection direction,
+ TupleTableSlot *slot);
+
+ /* ------------------------------------------------------------------------
+ * Parallel table scan related functions.
+ * ------------------------------------------------------------------------
+ */
+
+ /*
+ * Estimate the size of shared memory needed for a parallel scan of this
+ * relation. The snapshot does not need to be accounted for.
+ */
+ Size (*parallelscan_estimate) (Relation rel);
+
+ /*
+ * Initialize ParallelTableScanDesc for a parallel scan of this relation.
+ * `pscan` will be sized according to parallelscan_estimate() for the same
+ * relation.
+ */
+ Size (*parallelscan_initialize) (Relation rel,
+ ParallelTableScanDesc pscan);
+
+ /*
+ * Reinitialize `pscan` for a new scan. `rel` will be the same relation as
+ * when `pscan` was initialized by parallelscan_initialize.
+ */
+ void (*parallelscan_reinitialize) (Relation rel,
+ ParallelTableScanDesc pscan);
+
+
+ /* ------------------------------------------------------------------------
+ * Index Scan Callbacks
+ * ------------------------------------------------------------------------
+ */
+
+ /*
+ * Prepare to fetch tuples from the relation, as needed when fetching
+ * tuples for an index scan. The callback has to return an
+ * IndexFetchTableData, which the AM will typically embed in a larger
+ * structure with additional information.
+ *
+ * Tuples for an index scan can then be fetched via index_fetch_tuple.
+ */
+ struct IndexFetchTableData *(*index_fetch_begin) (Relation rel);
+
+ /*
+ * Reset index fetch. Typically this will release cross index fetch
+ * resources held in IndexFetchTableData.
+ */
+ void (*index_fetch_reset) (struct IndexFetchTableData *data);
+
+ /*
+ * Release resources and deallocate index fetch.
+ */
+ void (*index_fetch_end) (struct IndexFetchTableData *data);
+
+ /*
+ * Fetch tuple at `tid` into `slot`, after doing a visibility test
+ * according to `snapshot`. If a tuple was found and passed the visibility
+ * test, return true, false otherwise.
+ *
+ * Note that AMs that do not necessarily update indexes when indexed
+ * columns do not change, need to return the current/correct version of
+ * the tuple that is visible to the snapshot, even if the tid points to an
+ * older version of the tuple.
+ *
+ * *call_again is false on the first call to index_fetch_tuple for a tid.
+ * If there potentially is another tuple matching the tid, *call_again
+ * needs to be set to true by index_fetch_tuple, signaling to the caller
+ * that index_fetch_tuple should be called again for the same tid.
+ *
+ * *all_dead, if all_dead is not NULL, should be set to true by
+ * index_fetch_tuple iff it is guaranteed that no backend needs to see
+ * that tuple. Index AMs can use that to avoid returning that tid in
+ * future searches.
+ */
+ bool (*index_fetch_tuple) (struct IndexFetchTableData *scan,
+ ItemPointer tid,
+ Snapshot snapshot,
+ TupleTableSlot *slot,
+ bool *call_again, bool *all_dead);
+
+
+ /* ------------------------------------------------------------------------
+ * Callbacks for non-modifying operations on individual tuples
+ * ------------------------------------------------------------------------
+ */
+
+ /*
+ * Fetch tuple at `tid` into `slot`, after doing a visibility test
+ * according to `snapshot`. If a tuple was found and passed the visibility
+ * test, returns true, false otherwise.
+ */
+ bool (*tuple_fetch_row_version) (Relation rel,
+ ItemPointer tid,
+ Snapshot snapshot,
+ TupleTableSlot *slot);
+
+ /*
+ * Is tid valid for a scan of this relation.
+ */
+ bool (*tuple_tid_valid) (TableScanDesc scan,
+ ItemPointer tid);
+
+ /*
+ * Return the latest version of the tuple at `tid`, by updating `tid` to
+ * point at the newest version.
+ */
+ void (*tuple_get_latest_tid) (TableScanDesc scan,
+ ItemPointer tid);
+
+ /*
+ * Does the tuple in `slot` satisfy `snapshot`? The slot needs to be of
+ * the appropriate type for the AM.
+ */
+ bool (*tuple_satisfies_snapshot) (Relation rel,
+ TupleTableSlot *slot,
+ Snapshot snapshot);
+
+ /* see table_index_delete_tuples() */
+ TransactionId (*index_delete_tuples) (Relation rel,
+ TM_IndexDeleteOp *delstate);
+
+
+ /* ------------------------------------------------------------------------
+ * Manipulations of physical tuples.
+ * ------------------------------------------------------------------------
+ */
+
+ /* see table_tuple_insert() for reference about parameters */
+ void (*tuple_insert) (Relation rel, TupleTableSlot *slot,
+ CommandId cid, int options,
+ struct BulkInsertStateData *bistate);
+
+ /* see table_tuple_insert_speculative() for reference about parameters */
+ void (*tuple_insert_speculative) (Relation rel,
+ TupleTableSlot *slot,
+ CommandId cid,
+ int options,
+ struct BulkInsertStateData *bistate,
+ uint32 specToken);
+
+ /* see table_tuple_complete_speculative() for reference about parameters */
+ void (*tuple_complete_speculative) (Relation rel,
+ TupleTableSlot *slot,
+ uint32 specToken,
+ bool succeeded);
+
+ /* see table_multi_insert() for reference about parameters */
+ void (*multi_insert) (Relation rel, TupleTableSlot **slots, int nslots,
+ CommandId cid, int options, struct BulkInsertStateData *bistate);
+
+ /* see table_tuple_delete() for reference about parameters */
+ TM_Result (*tuple_delete) (Relation rel,
+ ItemPointer tid,
+ CommandId cid,
+ Snapshot snapshot,
+ Snapshot crosscheck,
+ bool wait,
+ TM_FailureData *tmfd,
+ bool changingPart);
+
+ /* see table_tuple_update() for reference about parameters */
+ TM_Result (*tuple_update) (Relation rel,
+ ItemPointer otid,
+ TupleTableSlot *slot,
+ CommandId cid,
+ Snapshot snapshot,
+ Snapshot crosscheck,
+ bool wait,
+ TM_FailureData *tmfd,
+ LockTupleMode *lockmode,
+ bool *update_indexes);
+
+ /* see table_tuple_lock() for reference about parameters */
+ TM_Result (*tuple_lock) (Relation rel,
+ ItemPointer tid,
+ Snapshot snapshot,
+ TupleTableSlot *slot,
+ CommandId cid,
+ LockTupleMode mode,
+ LockWaitPolicy wait_policy,
+ uint8 flags,
+ TM_FailureData *tmfd);
+
+ /*
+ * Perform operations necessary to complete insertions made via
+ * tuple_insert and multi_insert with a BulkInsertState specified. In-tree
+ * access methods ceased to use this.
+ *
+ * Typically callers of tuple_insert and multi_insert will just pass all
+ * the flags that apply to them, and each AM has to decide which of them
+ * make sense for it, and then only take actions in finish_bulk_insert for
+ * those flags, and ignore others.
+ *
+ * Optional callback.
+ */
+ void (*finish_bulk_insert) (Relation rel, int options);
+
+
+ /* ------------------------------------------------------------------------
+ * DDL related functionality.
+ * ------------------------------------------------------------------------
+ */
+
+ /*
+ * This callback needs to create a new relation filenode for `rel`, with
+ * appropriate durability behaviour for `persistence`.
+ *
+ * Note that only the subset of the relcache filled by
+ * RelationBuildLocalRelation() can be relied upon and that the relation's
+ * catalog entries will either not yet exist (new relation), or will still
+ * reference the old relfilenode.
+ *
+ * As output *freezeXid, *minmulti must be set to the values appropriate
+ * for pg_class.{relfrozenxid, relminmxid}. For AMs that don't need those
+ * fields to be filled they can be set to InvalidTransactionId and
+ * InvalidMultiXactId, respectively.
+ *
+ * See also table_relation_set_new_filenode().
+ */
+ void (*relation_set_new_filenode) (Relation rel,
+ const RelFileNode *newrnode,
+ char persistence,
+ TransactionId *freezeXid,
+ MultiXactId *minmulti);
+
+ /*
+ * This callback needs to remove all contents from `rel`'s current
+ * relfilenode. No provisions for transactional behaviour need to be made.
+ * Often this can be implemented by truncating the underlying storage to
+ * its minimal size.
+ *
+ * See also table_relation_nontransactional_truncate().
+ */
+ void (*relation_nontransactional_truncate) (Relation rel);
+
+ /*
+ * See table_relation_copy_data().
+ *
+ * This can typically be implemented by directly copying the underlying
+ * storage, unless it contains references to the tablespace internally.
+ */
+ void (*relation_copy_data) (Relation rel,
+ const RelFileNode *newrnode);
+
+ /* See table_relation_copy_for_cluster() */
+ void (*relation_copy_for_cluster) (Relation NewTable,
+ Relation OldTable,
+ Relation OldIndex,
+ bool use_sort,
+ TransactionId OldestXmin,
+ TransactionId *xid_cutoff,
+ MultiXactId *multi_cutoff,
+ double *num_tuples,
+ double *tups_vacuumed,
+ double *tups_recently_dead);
+
+ /*
+ * React to VACUUM command on the relation. The VACUUM can be triggered by
+ * a user or by autovacuum. The specific actions performed by the AM will
+ * depend heavily on the individual AM.
+ *
+ * On entry a transaction is already established, and the relation is
+ * locked with a ShareUpdateExclusive lock.
+ *
+ * Note that neither VACUUM FULL (and CLUSTER) nor ANALYZE goes through
+ * this routine, even if (for ANALYZE) it is part of the same VACUUM
+ * command.
+ *
+ * There probably, in the future, needs to be a separate callback to
+ * integrate with autovacuum's scheduling.
+ */
+ void (*relation_vacuum) (Relation rel,
+ struct VacuumParams *params,
+ BufferAccessStrategy bstrategy);
+
+ /*
+ * Prepare to analyze block `blockno` of `scan`. The scan has been started
+ * with table_beginscan_analyze(). See also
+ * table_scan_analyze_next_block().
+ *
+ * The callback may acquire resources like locks that are held until
+ * table_scan_analyze_next_tuple() returns false. It e.g. can make sense
+ * to hold a lock until all tuples on a block have been analyzed by
+ * scan_analyze_next_tuple.
+ *
+ * The callback can return false if the block is not suitable for
+ * sampling, e.g. because it's a metapage that could never contain tuples.
+ *
+ * XXX: This obviously is primarily suited for block-based AMs. It's not
+ * clear what a good interface for non block based AMs would be, so there
+ * isn't one yet.
+ */
+ bool (*scan_analyze_next_block) (TableScanDesc scan,
+ BlockNumber blockno,
+ BufferAccessStrategy bstrategy);
+
+ /*
+ * See table_scan_analyze_next_tuple().
+ *
+ * Not every AM might have a meaningful concept of dead rows, in which
+ * case it's OK to not increment *deadrows - but note that that may
+ * influence autovacuum scheduling (see comment for relation_vacuum
+ * callback).
+ */
+ bool (*scan_analyze_next_tuple) (TableScanDesc scan,
+ TransactionId OldestXmin,
+ double *liverows,
+ double *deadrows,
+ TupleTableSlot *slot);
+
+ /* see table_index_build_range_scan for reference about parameters */
+ double (*index_build_range_scan) (Relation table_rel,
+ Relation index_rel,
+ struct IndexInfo *index_info,
+ bool allow_sync,
+ bool anyvisible,
+ bool progress,
+ BlockNumber start_blockno,
+ BlockNumber numblocks,
+ IndexBuildCallback callback,
+ void *callback_state,
+ TableScanDesc scan);
+
+ /* see table_index_validate_scan for reference about parameters */
+ void (*index_validate_scan) (Relation table_rel,
+ Relation index_rel,
+ struct IndexInfo *index_info,
+ Snapshot snapshot,
+ struct ValidateIndexState *state);
+
+
+ /* ------------------------------------------------------------------------
+ * Miscellaneous functions.
+ * ------------------------------------------------------------------------
+ */
+
+ /*
+ * See table_relation_size().
+ *
+ * Note that currently a few callers use the MAIN_FORKNUM size to figure
+ * out the range of potentially interesting blocks (brin, analyze). It's
+ * probable that we'll need to revise the interface for those at some
+ * point.
+ */
+ uint64 (*relation_size) (Relation rel, ForkNumber forkNumber);
+
+
+ /*
+ * This callback should return true if the relation requires a TOAST table
+ * and false if it does not. It may wish to examine the relation's tuple
+ * descriptor before making a decision, but if it uses some other method
+ * of storing large values (or if it does not support them) it can simply
+ * return false.
+ */
+ bool (*relation_needs_toast_table) (Relation rel);
+
+ /*
+ * This callback should return the OID of the table AM that implements
+ * TOAST tables for this AM. If the relation_needs_toast_table callback
+ * always returns false, this callback is not required.
+ */
+ Oid (*relation_toast_am) (Relation rel);
+
+ /*
+ * This callback is invoked when detoasting a value stored in a toast
+ * table implemented by this AM. See table_relation_fetch_toast_slice()
+ * for more details.
+ */
+ void (*relation_fetch_toast_slice) (Relation toastrel, Oid valueid,
+ int32 attrsize,
+ int32 sliceoffset,
+ int32 slicelength,
+ struct varlena *result);
+
+
+ /* ------------------------------------------------------------------------
+ * Planner related functions.
+ * ------------------------------------------------------------------------
+ */
+
+ /*
+ * See table_relation_estimate_size().
+ *
+ * While the interface is block oriented, it shouldn't be too hard for an AM
+ * that doesn't internally use blocks to convert into a usable representation.
+ *
+ * This differs from the relation_size callback by returning size
+ * estimates (both relation size and tuple count) for planning purposes,
+ * rather than returning a currently correct estimate.
+ */
+ void (*relation_estimate_size) (Relation rel, int32 *attr_widths,
+ BlockNumber *pages, double *tuples,
+ double *allvisfrac);
+
+
+ /* ------------------------------------------------------------------------
+ * Executor related functions.
+ * ------------------------------------------------------------------------
+ */
+
+ /*
+ * Prepare to fetch / check / return tuples from `tbmres->blockno` as part
+ * of a bitmap table scan. `scan` was started via table_beginscan_bm().
+ * Return false if there are no tuples to be found on the page, true
+ * otherwise.
+ *
+ * This will typically read and pin the target block, and do the necessary
+ * work to allow scan_bitmap_next_tuple() to return tuples (e.g. it might
+ * make sense to perform tuple visibility checks at this time). For some
+ * AMs it will make more sense to do all the work referencing `tbmres`
+ * contents here, for others it might be better to defer more work to
+ * scan_bitmap_next_tuple.
+ *
+ * If `tbmres->blockno` is -1, this is a lossy scan and all visible tuples
+ * on the page have to be returned, otherwise the tuples at offsets in
+ * `tbmres->offsets` need to be returned.
+ *
+ * XXX: Currently this may only be implemented if the AM uses md.c as its
+ * storage manager, and uses ItemPointer->ip_blkid in a manner that maps
+ * blockids directly to the underlying storage. nodeBitmapHeapscan.c
+ * performs prefetching directly using that interface. This probably
+ * needs to be rectified at a later point.
+ *
+ * XXX: Currently this may only be implemented if the AM uses the
+ * visibilitymap, as nodeBitmapHeapscan.c unconditionally accesses it to
+ * perform prefetching. This probably needs to be rectified at a later
+ * point.
+ *
+ * Optional callback, but either both scan_bitmap_next_block and
+ * scan_bitmap_next_tuple need to exist, or neither.
+ */
+ bool (*scan_bitmap_next_block) (TableScanDesc scan,
+ struct TBMIterateResult *tbmres);
+
+ /*
+ * Fetch the next tuple of a bitmap table scan into `slot` and return true
+ * if a visible tuple was found, false otherwise.
+ *
+ * For some AMs it will make more sense to do all the work referencing
+ * `tbmres` contents in scan_bitmap_next_block, for others it might be
+ * better to defer more work to this callback.
+ *
+ * Optional callback, but either both scan_bitmap_next_block and
+ * scan_bitmap_next_tuple need to exist, or neither.
+ */
+ bool (*scan_bitmap_next_tuple) (TableScanDesc scan,
+ struct TBMIterateResult *tbmres,
+ TupleTableSlot *slot);
+
+ /*
+ * Prepare to fetch tuples from the next block in a sample scan. Return
+ * false if the sample scan is finished, true otherwise. `scan` was
+ * started via table_beginscan_sampling().
+ *
+ * Typically this will first determine the target block by calling the
+ * TsmRoutine's NextSampleBlock() callback if not NULL, or alternatively
+ * perform a sequential scan over all blocks. The determined block is
+ * then typically read and pinned.
+ *
+ * As the TsmRoutine interface is block based, a block needs to be passed
+ * to NextSampleBlock(). If that's not appropriate for an AM, it
+ * internally needs to perform mapping between the internal and a block
+ * based representation.
+ *
+ * Note that it's not acceptable to hold deadlock prone resources such as
+ * lwlocks until scan_sample_next_tuple() has exhausted the tuples on the
+ * block - the tuple is likely to be returned to an upper query node, and
+ * the next call could be off a long while. Holding buffer pins and such
+ * is obviously OK.
+ *
+ * Currently it is required to implement this interface, as there's no
+ * alternative way (contrary e.g. to bitmap scans) to implement sample
+ * scans. If infeasible to implement, the AM may raise an error.
+ */
+ bool (*scan_sample_next_block) (TableScanDesc scan,
+ struct SampleScanState *scanstate);
+
+ /*
+ * This callback, only called after scan_sample_next_block has returned
+ * true, should determine the next tuple to be returned from the selected
+ * block using the TsmRoutine's NextSampleTuple() callback.
+ *
+ * The callback needs to perform visibility checks, and only return
+ * visible tuples. That obviously can mean calling NextSampleTuple()
+ * multiple times.
+ *
+ * The TsmRoutine interface assumes that there's a maximum offset on a
+ * given page, so if that doesn't apply to an AM, it needs to emulate that
+ * assumption somehow.
+ */
+ bool (*scan_sample_next_tuple) (TableScanDesc scan,
+ struct SampleScanState *scanstate,
+ TupleTableSlot *slot);
+
+} TableAmRoutine;
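+
+/*
+ * The usual pattern for providing this struct is a static const variable
+ * returned from the AM's handler function, along these lines (the example_*
+ * callbacks are hypothetical):
+ *
+ *		static const TableAmRoutine example_methods = {
+ *			.type = T_TableAmRoutine,
+ *			.slot_callbacks = example_slot_callbacks,
+ *			.scan_begin = example_scan_begin,
+ *			... all other required callbacks ...
+ *		};
+ *
+ *		Datum
+ *		example_tableam_handler(PG_FUNCTION_ARGS)
+ *		{
+ *			PG_RETURN_POINTER(&example_methods);
+ *		}
+ */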
+
+
+/* ----------------------------------------------------------------------------
+ * Slot functions.
+ * ----------------------------------------------------------------------------
+ */
+
+/*
+ * Returns slot callbacks suitable for holding tuples of the appropriate type
+ * for the relation. Works for tables, views, foreign tables and partitioned
+ * tables.
+ */
+extern const TupleTableSlotOps *table_slot_callbacks(Relation rel);
+
+/*
+ * Returns slot using the callbacks returned by table_slot_callbacks(), and
+ * registers it on *reglist.
+ */
+extern TupleTableSlot *table_slot_create(Relation rel, List **reglist);
+
+
+/* ----------------------------------------------------------------------------
+ * Table scan functions.
+ * ----------------------------------------------------------------------------
+ */
+
+/*
+ * Start a scan of `rel`. Returned tuples pass a visibility test of
+ * `snapshot`, and if nkeys != 0, the results are filtered by those scan keys.
+ */
+static inline TableScanDesc
+table_beginscan(Relation rel, Snapshot snapshot,
+ int nkeys, struct ScanKeyData *key)
+{
+ uint32 flags = SO_TYPE_SEQSCAN |
+ SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE;
+
+ return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags);
+}
+
+/*
+ * Like table_beginscan(), but for scanning catalog. It'll automatically use a
+ * snapshot appropriate for scanning catalog relations.
+ */
+extern TableScanDesc table_beginscan_catalog(Relation rel, int nkeys,
+ struct ScanKeyData *key);
+
+/*
+ * Like table_beginscan(), but table_beginscan_strat() offers an extended API
+ * that lets the caller control whether a nondefault buffer access strategy
+ * can be used, and whether syncscan can be chosen (possibly resulting in the
+ * scan not starting from block zero). Both of these default to true with
+ * plain table_beginscan.
+ */
+static inline TableScanDesc
+table_beginscan_strat(Relation rel, Snapshot snapshot,
+ int nkeys, struct ScanKeyData *key,
+ bool allow_strat, bool allow_sync)
+{
+ uint32 flags = SO_TYPE_SEQSCAN | SO_ALLOW_PAGEMODE;
+
+ if (allow_strat)
+ flags |= SO_ALLOW_STRAT;
+ if (allow_sync)
+ flags |= SO_ALLOW_SYNC;
+
+ return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags);
+}
+
+/*
+ * table_beginscan_bm is an alternative entry point for setting up a
+ * TableScanDesc for a bitmap heap scan. Although that scan technology is
+ * really quite unlike a standard seqscan, there is just enough commonality to
+ * make it worth using the same data structure.
+ */
+static inline TableScanDesc
+table_beginscan_bm(Relation rel, Snapshot snapshot,
+ int nkeys, struct ScanKeyData *key)
+{
+ uint32 flags = SO_TYPE_BITMAPSCAN | SO_ALLOW_PAGEMODE;
+
+ return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags);
+}
+
+/*
+ * table_beginscan_sampling is an alternative entry point for setting up a
+ * TableScanDesc for a TABLESAMPLE scan. As with bitmap scans, it's worth
+ * using the same data structure although the behavior is rather different.
+ * In addition to the options offered by table_beginscan_strat, this call
+ * also allows control of whether page-mode visibility checking is used.
+ */
+static inline TableScanDesc
+table_beginscan_sampling(Relation rel, Snapshot snapshot,
+ int nkeys, struct ScanKeyData *key,
+ bool allow_strat, bool allow_sync,
+ bool allow_pagemode)
+{
+ uint32 flags = SO_TYPE_SAMPLESCAN;
+
+ if (allow_strat)
+ flags |= SO_ALLOW_STRAT;
+ if (allow_sync)
+ flags |= SO_ALLOW_SYNC;
+ if (allow_pagemode)
+ flags |= SO_ALLOW_PAGEMODE;
+
+ return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags);
+}
+
+/*
+ * table_beginscan_tid is an alternative entry point for setting up a
+ * TableScanDesc for a Tid scan. As with bitmap scans, it's worth using
+ * the same data structure although the behavior is rather different.
+ */
+static inline TableScanDesc
+table_beginscan_tid(Relation rel, Snapshot snapshot)
+{
+ uint32 flags = SO_TYPE_TIDSCAN;
+
+ return rel->rd_tableam->scan_begin(rel, snapshot, 0, NULL, NULL, flags);
+}
+
+/*
+ * table_beginscan_analyze is an alternative entry point for setting up a
+ * TableScanDesc for an ANALYZE scan. As with bitmap scans, it's worth using
+ * the same data structure although the behavior is rather different.
+ */
+static inline TableScanDesc
+table_beginscan_analyze(Relation rel)
+{
+ uint32 flags = SO_TYPE_ANALYZE;
+
+ return rel->rd_tableam->scan_begin(rel, NULL, 0, NULL, NULL, flags);
+}
+
+/*
+ * End relation scan.
+ */
+static inline void
+table_endscan(TableScanDesc scan)
+{
+ scan->rs_rd->rd_tableam->scan_end(scan);
+}
+
+/*
+ * Restart a relation scan.
+ */
+static inline void
+table_rescan(TableScanDesc scan,
+ struct ScanKeyData *key)
+{
+ scan->rs_rd->rd_tableam->scan_rescan(scan, key, false, false, false, false);
+}
+
+/*
+ * Restart a relation scan after changing params.
+ *
+ * This call allows changing the buffer strategy, syncscan, and pagemode
+ * options before starting a fresh scan. Note that although the actual use of
+ * syncscan might change (effectively, enabling or disabling reporting), the
+ * previously selected startblock will be kept.
+ */
+static inline void
+table_rescan_set_params(TableScanDesc scan, struct ScanKeyData *key,
+ bool allow_strat, bool allow_sync, bool allow_pagemode)
+{
+ scan->rs_rd->rd_tableam->scan_rescan(scan, key, true,
+ allow_strat, allow_sync,
+ allow_pagemode);
+}
+
+/*
+ * Update snapshot used by the scan.
+ */
+extern void table_scan_update_snapshot(TableScanDesc scan, Snapshot snapshot);
+
+/*
+ * Return next tuple from `scan`, store in slot.
+ */
+static inline bool
+table_scan_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
+{
+ slot->tts_tableOid = RelationGetRelid(sscan->rs_rd);
+
+ /*
+ * We don't expect direct calls to table_scan_getnextslot with valid
+ * CheckXidAlive for catalog or regular tables. See detailed comments in
+ * xact.c where these variables are declared.
+ */
+ if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
+ elog(ERROR, "unexpected table_scan_getnextslot call during logical decoding");
+
+ return sscan->rs_rd->rd_tableam->scan_getnextslot(sscan, direction, slot);
+}
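+
+/*
+ * A minimal sketch of a whole-table scan using the wrappers above, assuming
+ * `rel` has already been opened and locked and `snapshot` is a registered
+ * snapshot (error handling omitted):
+ *
+ *		TupleTableSlot *slot = table_slot_create(rel, NULL);
+ *		TableScanDesc scan = table_beginscan(rel, snapshot, 0, NULL);
+ *
+ *		while (table_scan_getnextslot(scan, ForwardScanDirection, slot))
+ *		{
+ *			... process the tuple currently stored in slot ...
+ *		}
+ *
+ *		table_endscan(scan);
+ *		ExecDropSingleTupleTableSlot(slot);
+ */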
+
+/* ----------------------------------------------------------------------------
+ * TID Range scanning related functions.
+ * ----------------------------------------------------------------------------
+ */
+
+/*
+ * table_beginscan_tidrange is the entry point for setting up a TableScanDesc
+ * for a TID range scan.
+ */
+static inline TableScanDesc
+table_beginscan_tidrange(Relation rel, Snapshot snapshot,
+ ItemPointer mintid,
+ ItemPointer maxtid)
+{
+ TableScanDesc sscan;
+ uint32 flags = SO_TYPE_TIDRANGESCAN | SO_ALLOW_PAGEMODE;
+
+ sscan = rel->rd_tableam->scan_begin(rel, snapshot, 0, NULL, NULL, flags);
+
+ /* Set the range of TIDs to scan */
+ sscan->rs_rd->rd_tableam->scan_set_tidrange(sscan, mintid, maxtid);
+
+ return sscan;
+}
+
+/*
+ * table_rescan_tidrange resets the scan position and sets the minimum and
+ * maximum TID range to scan for a TableScanDesc created by
+ * table_beginscan_tidrange.
+ */
+static inline void
+table_rescan_tidrange(TableScanDesc sscan, ItemPointer mintid,
+ ItemPointer maxtid)
+{
+ /* Ensure table_beginscan_tidrange() was used. */
+ Assert((sscan->rs_flags & SO_TYPE_TIDRANGESCAN) != 0);
+
+ sscan->rs_rd->rd_tableam->scan_rescan(sscan, NULL, false, false, false, false);
+ sscan->rs_rd->rd_tableam->scan_set_tidrange(sscan, mintid, maxtid);
+}
+
+/*
+ * Fetch the next tuple from `sscan` for a TID range scan created by
+ * table_beginscan_tidrange(). Stores the tuple in `slot` and returns true,
+ * or returns false if no more tuples exist in the range.
+ */
+static inline bool
+table_scan_getnextslot_tidrange(TableScanDesc sscan, ScanDirection direction,
+ TupleTableSlot *slot)
+{
+ /* Ensure table_beginscan_tidrange() was used. */
+ Assert((sscan->rs_flags & SO_TYPE_TIDRANGESCAN) != 0);
+
+ return sscan->rs_rd->rd_tableam->scan_getnextslot_tidrange(sscan,
+ direction,
+ slot);
+}
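+
+/*
+ * For example, scanning every tuple whose TID lies in a given block range
+ * might look like this (start_block and end_block are hypothetical, and rel,
+ * snapshot and slot are set up as for a plain scan):
+ *
+ *		ItemPointerData mintid;
+ *		ItemPointerData maxtid;
+ *		TableScanDesc scan;
+ *
+ *		ItemPointerSet(&mintid, start_block, 1);
+ *		ItemPointerSet(&maxtid, end_block, MaxOffsetNumber);
+ *
+ *		scan = table_beginscan_tidrange(rel, snapshot, &mintid, &maxtid);
+ *		while (table_scan_getnextslot_tidrange(scan, ForwardScanDirection, slot))
+ *		{
+ *			... process tuple ...
+ *		}
+ *		table_endscan(scan);
+ */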
+
+
+/* ----------------------------------------------------------------------------
+ * Parallel table scan related functions.
+ * ----------------------------------------------------------------------------
+ */
+
+/*
+ * Estimate the size of shared memory needed for a parallel scan of this
+ * relation.
+ */
+extern Size table_parallelscan_estimate(Relation rel, Snapshot snapshot);
+
+/*
+ * Initialize ParallelTableScanDesc for a parallel scan of this
+ * relation. `pscan` needs to be sized according to parallelscan_estimate()
+ * for the same relation. Call this just once in the leader process; then,
+ * individual workers attach via table_beginscan_parallel.
+ */
+extern void table_parallelscan_initialize(Relation rel,
+ ParallelTableScanDesc pscan,
+ Snapshot snapshot);
+
+/*
+ * Begin a parallel scan. `pscan` needs to have been initialized with
+ * table_parallelscan_initialize(), for the same relation. The initialization
+ * does not need to have happened in this backend.
+ *
+ * Caller must hold a suitable lock on the relation.
+ */
+extern TableScanDesc table_beginscan_parallel(Relation rel,
+ ParallelTableScanDesc pscan);
+
+/*
+ * Restart a parallel scan. Call this in the leader process. Caller is
+ * responsible for making sure that all workers have finished the scan
+ * beforehand.
+ */
+static inline void
+table_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
+{
+ rel->rd_tableam->parallelscan_reinitialize(rel, pscan);
+}
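+
+/*
+ * A rough sketch of how these pieces fit together; the allocation of the
+ * shared `pscan` area (e.g. in a DSM segment) is left out:
+ *
+ *		in the leader, once:
+ *			Size sz = table_parallelscan_estimate(rel, snapshot);
+ *			... allocate pscan of size sz in shared memory ...
+ *			table_parallelscan_initialize(rel, pscan, snapshot);
+ *
+ *		in every participant (leader and workers):
+ *			TableScanDesc scan = table_beginscan_parallel(rel, pscan);
+ *			while (table_scan_getnextslot(scan, ForwardScanDirection, slot))
+ *				... each tuple is returned to exactly one participant ...
+ *			table_endscan(scan);
+ */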
+
+
+/* ----------------------------------------------------------------------------
+ * Index scan related functions.
+ * ----------------------------------------------------------------------------
+ */
+
+/*
+ * Prepare to fetch tuples from the relation, as needed when fetching tuples
+ * for an index scan.
+ *
+ * Tuples for an index scan can then be fetched via table_index_fetch_tuple().
+ */
+static inline IndexFetchTableData *
+table_index_fetch_begin(Relation rel)
+{
+ return rel->rd_tableam->index_fetch_begin(rel);
+}
+
+/*
+ * Reset index fetch. Typically this will release cross index fetch resources
+ * held in IndexFetchTableData.
+ */
+static inline void
+table_index_fetch_reset(struct IndexFetchTableData *scan)
+{
+ scan->rel->rd_tableam->index_fetch_reset(scan);
+}
+
+/*
+ * Release resources and deallocate index fetch.
+ */
+static inline void
+table_index_fetch_end(struct IndexFetchTableData *scan)
+{
+ scan->rel->rd_tableam->index_fetch_end(scan);
+}
+
+/*
+ * Fetches, as part of an index scan, tuple at `tid` into `slot`, after doing
+ * a visibility test according to `snapshot`. If a tuple was found and passed
+ * the visibility test, returns true, false otherwise. Note that *tid may be
+ * modified when we return true (see later remarks on multiple row versions
+ * reachable via a single index entry).
+ *
+ * *call_again needs to be false on the first call to table_index_fetch_tuple() for
+ * a tid. If there potentially is another tuple matching the tid, *call_again
+ * will be set to true, signaling that table_index_fetch_tuple() should be called
+ * again for the same tid.
+ *
+ * *all_dead, if all_dead is not NULL, will be set to true by
+ * table_index_fetch_tuple() iff it is guaranteed that no backend needs to see
+ * that tuple. Index AMs can use that to avoid returning that tid in future
+ * searches.
+ *
+ * The difference between this function and table_tuple_fetch_row_version()
+ * is that this function returns the currently visible version of a row if
+ * the AM supports storing multiple row versions reachable via a single index
+ * entry (like heap's HOT). Whereas table_tuple_fetch_row_version() only
+ * evaluates the tuple exactly at `tid`. Outside of index entry->table tuple
+ * lookups, table_tuple_fetch_row_version() is what's usually needed.
+ */
+static inline bool
+table_index_fetch_tuple(struct IndexFetchTableData *scan,
+ ItemPointer tid,
+ Snapshot snapshot,
+ TupleTableSlot *slot,
+ bool *call_again, bool *all_dead)
+{
+ /*
+ * We don't expect direct calls to table_index_fetch_tuple with valid
+ * CheckXidAlive for catalog or regular tables. See detailed comments in
+ * xact.c where these variables are declared.
+ */
+ if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
+ elog(ERROR, "unexpected table_index_fetch_tuple call during logical decoding");
+
+ return scan->rel->rd_tableam->index_fetch_tuple(scan, tid, snapshot,
+ slot, call_again,
+ all_dead);
+}
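+
+/*
+ * A sketch of the typical calling pattern for one index TID, assuming rel,
+ * tid, snapshot and slot are provided by the surrounding index scan code:
+ *
+ *		IndexFetchTableData *fetch = table_index_fetch_begin(rel);
+ *		bool		call_again = false;
+ *		bool		all_dead = false;
+ *		bool		found;
+ *
+ *		do
+ *		{
+ *			found = table_index_fetch_tuple(fetch, &tid, snapshot, slot,
+ *											&call_again, &all_dead);
+ *		} while (!found && call_again);
+ *
+ *		table_index_fetch_end(fetch);
+ *
+ * If all_dead came back true the index AM may mark its entry as killed, so
+ * that the TID isn't returned by future scans.
+ */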
+
+/*
+ * This is a convenience wrapper around table_index_fetch_tuple() which
+ * returns whether there are table tuple items corresponding to an index
+ * entry. This likely is only useful to verify if there's a conflict in a
+ * unique index.
+ */
+extern bool table_index_fetch_tuple_check(Relation rel,
+ ItemPointer tid,
+ Snapshot snapshot,
+ bool *all_dead);
+
+
+/* ------------------------------------------------------------------------
+ * Functions for non-modifying operations on individual tuples
+ * ------------------------------------------------------------------------
+ */
+
+
+/*
+ * Fetch tuple at `tid` into `slot`, after doing a visibility test according to
+ * `snapshot`. If a tuple was found and passed the visibility test, returns
+ * true, false otherwise.
+ *
+ * See table_index_fetch_tuple's comment about what the difference between
+ * these functions is. It is correct to use this function outside of index
+ * entry->table tuple lookups.
+ */
+static inline bool
+table_tuple_fetch_row_version(Relation rel,
+ ItemPointer tid,
+ Snapshot snapshot,
+ TupleTableSlot *slot)
+{
+ /*
+ * We don't expect direct calls to table_tuple_fetch_row_version with
+ * valid CheckXidAlive for catalog or regular tables. See detailed
+ * comments in xact.c where these variables are declared.
+ */
+ if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
+ elog(ERROR, "unexpected table_tuple_fetch_row_version call during logical decoding");
+
+ return rel->rd_tableam->tuple_fetch_row_version(rel, tid, snapshot, slot);
+}
+
+/*
+ * Verify that `tid` is a potentially valid tuple identifier. That doesn't
+ * mean that the pointed to row needs to exist or be visible, but that
+ * attempting to fetch the row (e.g. with table_tuple_get_latest_tid() or
+ * table_tuple_fetch_row_version()) should not error out if called with that
+ * tid.
+ *
+ * `scan` needs to have been started via table_beginscan().
+ */
+static inline bool
+table_tuple_tid_valid(TableScanDesc scan, ItemPointer tid)
+{
+ return scan->rs_rd->rd_tableam->tuple_tid_valid(scan, tid);
+}
+
+/*
+ * Return the latest version of the tuple at `tid`, by updating `tid` to
+ * point at the newest version.
+ */
+extern void table_tuple_get_latest_tid(TableScanDesc scan, ItemPointer tid);
+
+/*
+ * Return true iff tuple in slot satisfies the snapshot.
+ *
+ * This assumes the slot's tuple is valid, and of the appropriate type for the
+ * AM.
+ *
+ * Some AMs might modify the data underlying the tuple as a side-effect. If so
+ * they ought to mark the relevant buffer dirty.
+ */
+static inline bool
+table_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot,
+ Snapshot snapshot)
+{
+ return rel->rd_tableam->tuple_satisfies_snapshot(rel, slot, snapshot);
+}
+
+/*
+ * Determine which index tuples are safe to delete based on their table TID.
+ *
+ * Determines which entries from index AM caller's TM_IndexDeleteOp state
+ * point to vacuumable table tuples. Entries that are found by tableam to be
+ * vacuumable are naturally safe for index AM to delete, and so get directly
+ * marked as deletable. See comments above TM_IndexDelete and comments above
+ * TM_IndexDeleteOp for full details.
+ *
+ * Returns a latestRemovedXid transaction ID that caller generally places in
+ * its index deletion WAL record. This might be used during subsequent REDO
+ * of the WAL record when in Hot Standby mode -- a recovery conflict for the
+ * index deletion operation might be required on the standby.
+ */
+static inline TransactionId
+table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
+{
+ return rel->rd_tableam->index_delete_tuples(rel, delstate);
+}
+
+
+/* ----------------------------------------------------------------------------
+ * Functions for manipulations of physical tuples.
+ * ----------------------------------------------------------------------------
+ */
+
+/*
+ * Insert a tuple from a slot into a table, via the table's AM routine.
+ *
+ * The options bitmask allows the caller to specify options that may change the
+ * behaviour of the AM. The AM will ignore options that it does not support.
+ *
+ * If the TABLE_INSERT_SKIP_FSM option is specified, AMs are free to not reuse
+ * free space in the relation. This can save some cycles when we know the
+ * relation is new and doesn't contain useful amounts of free space.
+ * TABLE_INSERT_SKIP_FSM is commonly passed directly to
+ * RelationGetBufferForTuple. See that method for more information.
+ *
+ * TABLE_INSERT_FROZEN should only be specified for inserts into
+ * relfilenodes created during the current subtransaction and when
+ * there are no prior snapshots or pre-existing portals open.
+ * This causes rows to be frozen, which is an MVCC violation and
+ * requires explicit options chosen by the user.
+ *
+ * TABLE_INSERT_NO_LOGICAL force-disables the emitting of logical decoding
+ * information for the tuple. This should solely be used during table rewrites
+ * where RelationIsLogicallyLogged(relation) is not yet accurate for the new
+ * relation.
+ *
+ * Note that most of these options will be applied when inserting into the
+ * heap's TOAST table, too, if the tuple requires any out-of-line data.
+ *
+ * The BulkInsertState object (if any; bistate can be NULL for default
+ * behavior) is also just passed through to RelationGetBufferForTuple. If
+ * `bistate` is provided, table_finish_bulk_insert() needs to be called.
+ *
+ * On return the slot's tts_tid and tts_tableOid are updated to reflect the
+ * insertion. But note that any toasting of fields within the slot is NOT
+ * reflected in the slot's contents.
+ */
+static inline void
+table_tuple_insert(Relation rel, TupleTableSlot *slot, CommandId cid,
+ int options, struct BulkInsertStateData *bistate)
+{
+ rel->rd_tableam->tuple_insert(rel, slot, cid, options,
+ bistate);
+}
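+
+/*
+ * A minimal sketch of inserting one row built up as a virtual tuple (the
+ * column values are hypothetical, and rel is assumed to be opened and
+ * locked):
+ *
+ *		TupleTableSlot *slot = table_slot_create(rel, NULL);
+ *
+ *		ExecClearTuple(slot);
+ *		slot->tts_values[0] = Int32GetDatum(42);
+ *		slot->tts_isnull[0] = false;
+ *		ExecStoreVirtualTuple(slot);
+ *
+ *		table_tuple_insert(rel, slot, GetCurrentCommandId(true),
+ *						   0, NULL);
+ *
+ *		ExecDropSingleTupleTableSlot(slot);
+ */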
+
+/*
+ * Perform a "speculative insertion". These can be backed out afterwards
+ * without aborting the whole transaction. Other sessions can wait for the
+ * speculative insertion to be confirmed, turning it into a regular tuple, or
+ * aborted, as if it never existed. Speculatively inserted tuples behave as
+ * "value locks" of short duration, used to implement INSERT .. ON CONFLICT.
+ *
+ * A transaction having performed a speculative insertion has to either abort,
+ * or finish the speculative insertion with
+ * table_tuple_complete_speculative(succeeded = ...).
+ */
+static inline void
+table_tuple_insert_speculative(Relation rel, TupleTableSlot *slot,
+ CommandId cid, int options,
+ struct BulkInsertStateData *bistate,
+ uint32 specToken)
+{
+ rel->rd_tableam->tuple_insert_speculative(rel, slot, cid, options,
+ bistate, specToken);
+}
+
+/*
+ * Complete "speculative insertion" started in the same transaction. If
+ * succeeded is true, the tuple is fully inserted, if false, it's removed.
+ */
+static inline void
+table_tuple_complete_speculative(Relation rel, TupleTableSlot *slot,
+ uint32 specToken, bool succeeded)
+{
+ rel->rd_tableam->tuple_complete_speculative(rel, slot, specToken,
+ succeeded);
+}
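+
+/*
+ * Roughly, the full dance looks like this; conflict detection itself is
+ * elided and `conflicted` is a hypothetical flag computed by the caller:
+ *
+ *		uint32 specToken =
+ *			SpeculativeInsertionLockAcquire(GetCurrentTransactionId());
+ *
+ *		table_tuple_insert_speculative(rel, slot, GetCurrentCommandId(true),
+ *									   0, NULL, specToken);
+ *
+ *		... recheck unique indexes for conflicting tuples ...
+ *
+ *		table_tuple_complete_speculative(rel, slot, specToken, !conflicted);
+ *		SpeculativeInsertionLockRelease(GetCurrentTransactionId());
+ */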
+
+/*
+ * Insert multiple tuples into a table.
+ *
+ * This is like table_tuple_insert(), but inserts multiple tuples in one
+ * operation. That's often faster than calling table_tuple_insert() in a loop,
+ * because e.g. the AM can reduce WAL logging and page locking overhead.
+ *
+ * Except for taking `nslots` tuples as input, and an array of TupleTableSlots
+ * in `slots`, the parameters for table_multi_insert() are the same as for
+ * table_tuple_insert().
+ *
+ * Note: this leaks memory into the current memory context. You can create a
+ * temporary context before calling this, if that's a problem.
+ */
+static inline void
+table_multi_insert(Relation rel, TupleTableSlot **slots, int nslots,
+ CommandId cid, int options, struct BulkInsertStateData *bistate)
+{
+ rel->rd_tableam->multi_insert(rel, slots, nslots,
+ cid, options, bistate);
+}
+
+/*
+ * Delete a tuple.
+ *
+ * NB: do not call this directly unless prepared to deal with
+ * concurrent-update conditions. Use simple_table_tuple_delete instead.
+ *
+ * Input parameters:
+ * relation - table to be modified (caller must hold suitable lock)
+ * tid - TID of tuple to be deleted
+ * cid - delete command ID (used for visibility test, and stored into
+ * cmax if successful)
+ * crosscheck - if not InvalidSnapshot, also check tuple against this
+ * wait - true if should wait for any conflicting update to commit/abort
+ * Output parameters:
+ * tmfd - filled in failure cases (see below)
+ * changingPart - true iff the tuple is being moved to another partition
+ * table due to an update of the partition key. Otherwise, false.
+ *
+ * Normal, successful return value is TM_Ok, which means we did actually
+ * delete it. Failure return codes are TM_SelfModified, TM_Updated, and
+ * TM_BeingModified (the last only possible if wait == false).
+ *
+ * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
+ * t_xmax, and, if possible, t_cmax. See comments for
+ * struct TM_FailureData for additional info.
+ */
+static inline TM_Result
+table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid,
+ Snapshot snapshot, Snapshot crosscheck, bool wait,
+ TM_FailureData *tmfd, bool changingPart)
+{
+ return rel->rd_tableam->tuple_delete(rel, tid, cid,
+ snapshot, crosscheck,
+ wait, tmfd, changingPart);
+}
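+
+/*
+ * A sketch of the result handling a caller prepared for concurrency is
+ * expected to do (rel, tid and snapshot assumed; only a few cases shown):
+ *
+ *		TM_FailureData tmfd;
+ *		TM_Result	result;
+ *
+ *		result = table_tuple_delete(rel, tid, GetCurrentCommandId(true),
+ *									snapshot, InvalidSnapshot,
+ *									true, &tmfd, false);
+ *		switch (result)
+ *		{
+ *			case TM_Ok:
+ *				break;
+ *			case TM_SelfModified:
+ *				... already deleted by this command, usually nothing to do ...
+ *				break;
+ *			case TM_Updated:
+ *				... concurrent update; re-fetch via tmfd.ctid, retry or give up ...
+ *				break;
+ *			default:
+ *				elog(ERROR, "unexpected table_tuple_delete status: %d", result);
+ *		}
+ */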
+
+/*
+ * Update a tuple.
+ *
+ * NB: do not call this directly unless you are prepared to deal with
+ * concurrent-update conditions. Use simple_table_tuple_update instead.
+ *
+ * Input parameters:
+ * relation - table to be modified (caller must hold suitable lock)
+ * otid - TID of old tuple to be replaced
+ * slot - newly constructed tuple data to store
+ * cid - update command ID (used for visibility test, and stored into
+ * cmax/cmin if successful)
+ * crosscheck - if not InvalidSnapshot, also check old tuple against this
+ * wait - true if should wait for any conflicting update to commit/abort
+ * Output parameters:
+ * tmfd - filled in failure cases (see below)
+ * lockmode - filled with lock mode acquired on tuple
+ * update_indexes - in success cases this is set to true if new index entries
+ * are required for this tuple
+ *
+ * Normal, successful return value is TM_Ok, which means we did actually
+ * update it. Failure return codes are TM_SelfModified, TM_Updated, and
+ * TM_BeingModified (the last only possible if wait == false).
+ *
+ * On success, the slot's tts_tid and tts_tableOid are updated to match the new
+ * stored tuple; in particular, slot->tts_tid is set to the TID where the
+ * new tuple was inserted, and its HEAP_ONLY_TUPLE flag is set iff a HOT
+ * update was done. However, any TOAST changes in the new tuple's
+ * data are not reflected in the slot's contents.
+ *
+ * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
+ * t_xmax, and, if possible, t_cmax. See comments for struct TM_FailureData
+ * for additional info.
+ */
+static inline TM_Result
+table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot,
+ CommandId cid, Snapshot snapshot, Snapshot crosscheck,
+ bool wait, TM_FailureData *tmfd, LockTupleMode *lockmode,
+ bool *update_indexes)
+{
+ return rel->rd_tableam->tuple_update(rel, otid, slot,
+ cid, snapshot, crosscheck,
+ wait, tmfd,
+ lockmode, update_indexes);
+}
+
+/*
+ * Lock a tuple in the specified mode.
+ *
+ * Input parameters:
+ * relation: relation containing tuple (caller must hold suitable lock)
+ * tid: TID of tuple to lock
+ * snapshot: snapshot to use for visibility determinations
+ * cid: current command ID (used for visibility test, and stored into
+ * tuple's cmax if lock is successful)
+ * mode: lock mode desired
+ * wait_policy: what to do if tuple lock is not available
+ * flags:
+ * If TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS, follow the update chain to
+ * also lock descendant tuples if lock modes don't conflict.
+ * If TUPLE_LOCK_FLAG_FIND_LAST_VERSION, follow the update chain and lock
+ * latest version.
+ *
+ * Output parameters:
+ * *slot: contains the target tuple
+ * *tmfd: filled in failure cases (see below)
+ *
+ * Function result may be:
+ * TM_Ok: lock was successfully acquired
+ * TM_Invisible: lock failed because tuple was never visible to us
+ * TM_SelfModified: lock failed because tuple updated by self
+ * TM_Updated: lock failed because tuple updated by other xact
+ * TM_Deleted: lock failed because tuple deleted by other xact
+ * TM_WouldBlock: lock couldn't be acquired and wait_policy is skip
+ *
+ * In the failure cases other than TM_Invisible and TM_Deleted, the routine
+ * fills *tmfd with the tuple's t_ctid, t_xmax, and, if possible, t_cmax. See
+ * comments for struct TM_FailureData for additional info.
+ */
+static inline TM_Result
+table_tuple_lock(Relation rel, ItemPointer tid, Snapshot snapshot,
+ TupleTableSlot *slot, CommandId cid, LockTupleMode mode,
+ LockWaitPolicy wait_policy, uint8 flags,
+ TM_FailureData *tmfd)
+{
+ return rel->rd_tableam->tuple_lock(rel, tid, snapshot, slot,
+ cid, mode, wait_policy,
+ flags, tmfd);
+}
+
+/*
+ * Perform operations necessary to complete insertions made via
+ * tuple_insert and multi_insert with a BulkInsertState specified.
+ */
+static inline void
+table_finish_bulk_insert(Relation rel, int options)
+{
+ /* optional callback */
+ if (rel->rd_tableam && rel->rd_tableam->finish_bulk_insert)
+ rel->rd_tableam->finish_bulk_insert(rel, options);
+}
+
+
+/* ------------------------------------------------------------------------
+ * DDL related functionality.
+ * ------------------------------------------------------------------------
+ */
+
+/*
+ * Create storage for `rel` in `newrnode`, with persistence set to
+ * `persistence`.
+ *
+ * This is used both during relation creation and various DDL operations to
+ * create a new relfilenode that can be filled from scratch. When creating
+ * new storage for an existing relfilenode, this should be called before the
+ * relcache entry has been updated.
+ *
+ * *freezeXid, *minmulti are set to the xid / multixact horizon for the table
+ * that pg_class.{relfrozenxid, relminmxid} have to be set to.
+ */
+static inline void
+table_relation_set_new_filenode(Relation rel,
+ const RelFileNode *newrnode,
+ char persistence,
+ TransactionId *freezeXid,
+ MultiXactId *minmulti)
+{
+ rel->rd_tableam->relation_set_new_filenode(rel, newrnode, persistence,
+ freezeXid, minmulti);
+}
+
+/*
+ * Remove all table contents from `rel`, in a non-transactional manner.
+ * Non-transactional meaning that there's no need to support rollbacks. This
+ * is commonly only used to perform truncations for relfilenodes created in the
+ * current transaction.
+ */
+static inline void
+table_relation_nontransactional_truncate(Relation rel)
+{
+ rel->rd_tableam->relation_nontransactional_truncate(rel);
+}
+
+/*
+ * Copy data from `rel` into the new relfilenode `newrnode`. The new
+ * relfilenode may not have storage associated before this function is
+ * called. This is only supposed to be used for low level operations like
+ * changing a relation's tablespace.
+ */
+static inline void
+table_relation_copy_data(Relation rel, const RelFileNode *newrnode)
+{
+ rel->rd_tableam->relation_copy_data(rel, newrnode);
+}
+
+/*
+ * Copy data from `OldTable` into `NewTable`, as part of a CLUSTER or VACUUM
+ * FULL.
+ *
+ * Additional Input parameters:
+ * - use_sort - if true, the table contents are sorted appropriate for
+ * `OldIndex`; if false and OldIndex is not InvalidOid, the data is copied
+ * in that index's order; if false and OldIndex is InvalidOid, no sorting is
+ * performed
+ * - OldIndex - see use_sort
+ * - OldestXmin - computed by vacuum_set_xid_limits(), even when
+ * not needed for the relation's AM
+ * - *xid_cutoff - ditto
+ * - *multi_cutoff - ditto
+ *
+ * Output parameters:
+ * - *xid_cutoff - rel's new relfrozenxid value, may be invalid
+ * - *multi_cutoff - rel's new relminmxid value, may be invalid
+ * - *tups_vacuumed - stats, for logging, if appropriate for AM
+ * - *tups_recently_dead - stats, for logging, if appropriate for AM
+ */
+static inline void
+table_relation_copy_for_cluster(Relation OldTable, Relation NewTable,
+ Relation OldIndex,
+ bool use_sort,
+ TransactionId OldestXmin,
+ TransactionId *xid_cutoff,
+ MultiXactId *multi_cutoff,
+ double *num_tuples,
+ double *tups_vacuumed,
+ double *tups_recently_dead)
+{
+ OldTable->rd_tableam->relation_copy_for_cluster(OldTable, NewTable, OldIndex,
+ use_sort, OldestXmin,
+ xid_cutoff, multi_cutoff,
+ num_tuples, tups_vacuumed,
+ tups_recently_dead);
+}
+
+/*
+ * Perform VACUUM on the relation. The VACUUM can be triggered by a user or by
+ * autovacuum. The specific actions performed by the AM will depend heavily on
+ * the individual AM.
+ *
+ * On entry a transaction needs to already have been established, and the
+ * table is locked with a ShareUpdateExclusive lock.
+ *
+ * Note that neither VACUUM FULL (and CLUSTER) nor ANALYZE goes through this
+ * routine, even if (for ANALYZE) it is part of the same VACUUM command.
+ */
+static inline void
+table_relation_vacuum(Relation rel, struct VacuumParams *params,
+ BufferAccessStrategy bstrategy)
+{
+ rel->rd_tableam->relation_vacuum(rel, params, bstrategy);
+}
+
+/*
+ * Prepare to analyze block `blockno` of `scan`. The scan needs to have been
+ * started with table_beginscan_analyze(). Note that this routine might
+ * acquire resources like locks that are held until
+ * table_scan_analyze_next_tuple() returns false.
+ *
+ * Returns false if block is unsuitable for sampling, true otherwise.
+ */
+static inline bool
+table_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno,
+ BufferAccessStrategy bstrategy)
+{
+ return scan->rs_rd->rd_tableam->scan_analyze_next_block(scan, blockno,
+ bstrategy);
+}
+
+/*
+ * Iterate over tuples in the block selected with
+ * table_scan_analyze_next_block() (which needs to have returned true, and
+ * this routine may not have returned false for the same block before). If a
+ * tuple that's suitable for sampling is found, true is returned and a tuple
+ * is stored in `slot`.
+ *
+ * *liverows and *deadrows are incremented according to the encountered
+ * tuples.
+ */
+static inline bool
+table_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
+ double *liverows, double *deadrows,
+ TupleTableSlot *slot)
+{
+ return scan->rs_rd->rd_tableam->scan_analyze_next_tuple(scan, OldestXmin,
+ liverows, deadrows,
+ slot);
+}
+
+/*
+ * table_index_build_scan - scan the table to find tuples to be indexed
+ *
+ * This is called back from an access-method-specific index build procedure
+ * after the AM has done whatever setup it needs. The parent table relation
+ * is scanned to find tuples that should be entered into the index. Each
+ * such tuple is passed to the AM's callback routine, which does the right
+ * things to add it to the new index. After we return, the AM's index
+ * build procedure does whatever cleanup it needs.
+ *
+ * The total count of live tuples is returned. This is for updating pg_class
+ * statistics. (It's annoying not to be able to do that here, but we want to
+ * merge that update with others; see index_update_stats.) Note that the
+ * index AM itself must keep track of the number of index tuples; we don't do
+ * so here because the AM might reject some of the tuples for its own reasons,
+ * such as being unable to store NULLs.
+ *
+ * If 'progress', the PROGRESS_SCAN_BLOCKS_TOTAL counter is updated when
+ * starting the scan, and PROGRESS_SCAN_BLOCKS_DONE is updated as we go along.
+ *
+ * A side effect is to set indexInfo->ii_BrokenHotChain to true if we detect
+ * any potentially broken HOT chains. Currently, we set this if there are any
+ * RECENTLY_DEAD or DELETE_IN_PROGRESS entries in a HOT chain, without trying
+ * very hard to detect whether they're really incompatible with the chain tip.
+ * This only really makes sense for heap AM, it might need to be generalized
+ * for other AMs later.
+ */
+static inline double
+table_index_build_scan(Relation table_rel,
+ Relation index_rel,
+ struct IndexInfo *index_info,
+ bool allow_sync,
+ bool progress,
+ IndexBuildCallback callback,
+ void *callback_state,
+ TableScanDesc scan)
+{
+ return table_rel->rd_tableam->index_build_range_scan(table_rel,
+ index_rel,
+ index_info,
+ allow_sync,
+ false,
+ progress,
+ 0,
+ InvalidBlockNumber,
+ callback,
+ callback_state,
+ scan);
+}
+
+/*
+ * As table_index_build_scan(), except that instead of scanning the complete
+ * table, only the given number of blocks are scanned. Scan to end-of-rel can
+ * be signaled by passing InvalidBlockNumber as numblocks. Note that
+ * restricting the range to scan cannot be done when requesting syncscan.
+ *
+ * When "anyvisible" mode is requested, all tuples visible to any transaction
+ * are indexed and counted as live, including those inserted or deleted by
+ * transactions that are still in progress.
+ */
+static inline double
+table_index_build_range_scan(Relation table_rel,
+ Relation index_rel,
+ struct IndexInfo *index_info,
+ bool allow_sync,
+ bool anyvisible,
+ bool progress,
+ BlockNumber start_blockno,
+ BlockNumber numblocks,
+ IndexBuildCallback callback,
+ void *callback_state,
+ TableScanDesc scan)
+{
+ return table_rel->rd_tableam->index_build_range_scan(table_rel,
+ index_rel,
+ index_info,
+ allow_sync,
+ anyvisible,
+ progress,
+ start_blockno,
+ numblocks,
+ callback,
+ callback_state,
+ scan);
+}
+
+/*
+ * table_index_validate_scan - second table scan for concurrent index build
+ *
+ * See validate_index() for an explanation.
+ */
+static inline void
+table_index_validate_scan(Relation table_rel,
+ Relation index_rel,
+ struct IndexInfo *index_info,
+ Snapshot snapshot,
+ struct ValidateIndexState *state)
+{
+ table_rel->rd_tableam->index_validate_scan(table_rel,
+ index_rel,
+ index_info,
+ snapshot,
+ state);
+}
+
+
+/* ----------------------------------------------------------------------------
+ * Miscellaneous functionality
+ * ----------------------------------------------------------------------------
+ */
+
+/*
+ * Return the current size of `rel` in bytes. If `forkNumber` is
+ * InvalidForkNumber, return the relation's overall size, otherwise the size
+ * for the indicated fork.
+ *
+ * Note that the overall size might not be the equivalent of the sum of sizes
+ * for the individual forks for some AMs, e.g. because the AM's storage does
+ * not neatly map onto the built-in types of forks.
+ */
+static inline uint64
+table_relation_size(Relation rel, ForkNumber forkNumber)
+{
+ return rel->rd_tableam->relation_size(rel, forkNumber);
+}
+
+/*
+ * table_relation_needs_toast_table - does this relation need a toast table?
+ */
+static inline bool
+table_relation_needs_toast_table(Relation rel)
+{
+ return rel->rd_tableam->relation_needs_toast_table(rel);
+}
+
+/*
+ * Return the OID of the AM that should be used to implement the TOAST table
+ * for this relation.
+ */
+static inline Oid
+table_relation_toast_am(Relation rel)
+{
+ return rel->rd_tableam->relation_toast_am(rel);
+}
+
+/*
+ * Fetch all or part of a TOAST value from a TOAST table.
+ *
+ * If this AM is never used to implement a TOAST table, then this callback
+ * is not needed. But, if toasted values are ever stored in a table of this
+ * type, then you will need this callback.
+ *
+ * toastrel is the relation in which the toasted value is stored.
+ *
+ * valueid identifies which toast value is to be fetched. For the heap,
+ * this corresponds to the values stored in the chunk_id column.
+ *
+ * attrsize is the total size of the toast value to be fetched.
+ *
+ * sliceoffset is the offset within the toast value of the first byte that
+ * should be fetched.
+ *
+ * slicelength is the number of bytes from the toast value that should be
+ * fetched.
+ *
+ * result is caller-allocated space into which the fetched bytes should be
+ * stored.
+ */
+static inline void
+table_relation_fetch_toast_slice(Relation toastrel, Oid valueid,
+ int32 attrsize, int32 sliceoffset,
+ int32 slicelength, struct varlena *result)
+{
+ toastrel->rd_tableam->relation_fetch_toast_slice(toastrel, valueid,
+ attrsize,
+ sliceoffset, slicelength,
+ result);
+}
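For illustration only, a hedged sketch of a caller (along the lines of detoast.c) that allocates the result varlena and asks the toast relation's AM for a slice; demo_fetch_slice and the variable names are assumptions.

static struct varlena *
demo_fetch_slice(Relation toastrel, Oid valueid, int32 attrsize,
				 int32 sliceoffset, int32 slicelength)
{
	struct varlena *result;

	/* The caller allocates the space that the AM fills in. */
	result = (struct varlena *) palloc(slicelength + VARHDRSZ);
	SET_VARSIZE(result, slicelength + VARHDRSZ);

	table_relation_fetch_toast_slice(toastrel, valueid, attrsize,
									 sliceoffset, slicelength, result);
	return result;
}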
+
+
+/* ----------------------------------------------------------------------------
+ * Planner related functionality
+ * ----------------------------------------------------------------------------
+ */
+
+/*
+ * Estimate the current size of the relation, as an AM specific workhorse for
+ * estimate_rel_size(). Look there for an explanation of the parameters.
+ */
+static inline void
+table_relation_estimate_size(Relation rel, int32 *attr_widths,
+ BlockNumber *pages, double *tuples,
+ double *allvisfrac)
+{
+ rel->rd_tableam->relation_estimate_size(rel, attr_widths, pages, tuples,
+ allvisfrac);
+}
+
+
+/* ----------------------------------------------------------------------------
+ * Executor related functionality
+ * ----------------------------------------------------------------------------
+ */
+
+/*
+ * Prepare to fetch / check / return tuples from `tbmres->blockno` as part of
+ * a bitmap table scan. `scan` needs to have been started via
+ * table_beginscan_bm(). Returns false if there are no tuples to be found on
+ * the page, true otherwise.
+ *
+ * Note that this is an optional callback, so it should only be used after
+ * verifying that the table AM provides it (at plan time or the like).
+ */
+static inline bool
+table_scan_bitmap_next_block(TableScanDesc scan,
+ struct TBMIterateResult *tbmres)
+{
+ /*
+ * We don't expect direct calls to table_scan_bitmap_next_block with valid
+ * CheckXidAlive for catalog or regular tables. See detailed comments in
+ * xact.c where these variables are declared.
+ */
+ if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
+ elog(ERROR, "unexpected table_scan_bitmap_next_block call during logical decoding");
+
+ return scan->rs_rd->rd_tableam->scan_bitmap_next_block(scan,
+ tbmres);
+}
+
+/*
+ * Fetch the next tuple of a bitmap table scan into `slot` and return true if
+ * a visible tuple was found, false otherwise.
+ * table_scan_bitmap_next_block() needs to previously have selected a
+ * block (i.e. returned true), and no previous
+ * table_scan_bitmap_next_tuple() for the same block may have
+ * returned false.
+ */
+static inline bool
+table_scan_bitmap_next_tuple(TableScanDesc scan,
+ struct TBMIterateResult *tbmres,
+ TupleTableSlot *slot)
+{
+ /*
+ * We don't expect direct calls to table_scan_bitmap_next_tuple with valid
+ * CheckXidAlive for catalog or regular tables. See detailed comments in
+ * xact.c where these variables are declared.
+ */
+ if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
+ elog(ERROR, "unexpected table_scan_bitmap_next_tuple call during logical decoding");
+
+ return scan->rs_rd->rd_tableam->scan_bitmap_next_tuple(scan,
+ tbmres,
+ slot);
+}
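A minimal sketch of the driving loop these two callbacks expect (compare nodeBitmapHeapscan.c), assuming `iterator` came from tbm_begin_iterate(), `scan` from table_beginscan_bm(), and `slot` is a table-AM-compatible TupleTableSlot.

for (;;)
{
	TBMIterateResult *tbmres = tbm_iterate(iterator);

	if (tbmres == NULL)
		break;					/* bitmap exhausted */

	/* Skip blocks on which the AM found nothing to return. */
	if (!table_scan_bitmap_next_block(scan, tbmres))
		continue;

	/* Emit every visible tuple on the selected block. */
	while (table_scan_bitmap_next_tuple(scan, tbmres, slot))
	{
		/* ... process `slot` ... */
	}
}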
+
+/*
+ * Prepare to fetch tuples from the next block in a sample scan. Returns false
+ * if the sample scan is finished, true otherwise. `scan` needs to have been
+ * started via table_beginscan_sampling().
+ *
+ * This will call the TsmRoutine's NextSampleBlock() callback if necessary
+ * (i.e. NextSampleBlock is not NULL), or perform a sequential scan over the
+ * underlying relation.
+ */
+static inline bool
+table_scan_sample_next_block(TableScanDesc scan,
+ struct SampleScanState *scanstate)
+{
+ /*
+ * We don't expect direct calls to table_scan_sample_next_block with valid
+ * CheckXidAlive for catalog or regular tables. See detailed comments in
+ * xact.c where these variables are declared.
+ */
+ if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
+ elog(ERROR, "unexpected table_scan_sample_next_block call during logical decoding");
+ return scan->rs_rd->rd_tableam->scan_sample_next_block(scan, scanstate);
+}
+
+/*
+ * Fetch the next sample tuple into `slot` and return true if a visible tuple
+ * was found, false otherwise. table_scan_sample_next_block() needs to
+ * previously have selected a block (i.e. returned true), and no previous
+ * table_scan_sample_next_tuple() for the same block may have returned false.
+ *
+ * This will call the TsmRoutine's NextSampleTuple() callback.
+ */
+static inline bool
+table_scan_sample_next_tuple(TableScanDesc scan,
+ struct SampleScanState *scanstate,
+ TupleTableSlot *slot)
+{
+ /*
+ * We don't expect direct calls to table_scan_sample_next_tuple with valid
+ * CheckXidAlive for catalog or regular tables. See detailed comments in
+ * xact.c where these variables are declared.
+ */
+ if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
+ elog(ERROR, "unexpected table_scan_sample_next_tuple call during logical decoding");
+ return scan->rs_rd->rd_tableam->scan_sample_next_tuple(scan, scanstate,
+ slot);
+}
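The sample-scan pair is driven the same way (compare nodeSamplescan.c); a sketch, assuming `scan`, `scanstate`, and `slot` are set up by the executor node.

while (table_scan_sample_next_block(scan, scanstate))
{
	while (table_scan_sample_next_tuple(scan, scanstate, slot))
	{
		/* ... process the sampled tuple in `slot` ... */
	}
}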
+
+
+/* ----------------------------------------------------------------------------
+ * Functions to make modifications a bit simpler.
+ * ----------------------------------------------------------------------------
+ */
+
+extern void simple_table_tuple_insert(Relation rel, TupleTableSlot *slot);
+extern void simple_table_tuple_delete(Relation rel, ItemPointer tid,
+ Snapshot snapshot);
+extern void simple_table_tuple_update(Relation rel, ItemPointer otid,
+ TupleTableSlot *slot, Snapshot snapshot,
+ bool *update_indexes);
+
+
+/* ----------------------------------------------------------------------------
+ * Helper functions to implement parallel scans for block oriented AMs.
+ * ----------------------------------------------------------------------------
+ */
+
+extern Size table_block_parallelscan_estimate(Relation rel);
+extern Size table_block_parallelscan_initialize(Relation rel,
+ ParallelTableScanDesc pscan);
+extern void table_block_parallelscan_reinitialize(Relation rel,
+ ParallelTableScanDesc pscan);
+extern BlockNumber table_block_parallelscan_nextpage(Relation rel,
+ ParallelBlockTableScanWorker pbscanwork,
+ ParallelBlockTableScanDesc pbscan);
+extern void table_block_parallelscan_startblock_init(Relation rel,
+ ParallelBlockTableScanWorker pbscanwork,
+ ParallelBlockTableScanDesc pbscan);
+
+
+/* ----------------------------------------------------------------------------
+ * Helper functions to implement relation sizing for block oriented AMs.
+ * ----------------------------------------------------------------------------
+ */
+
+extern uint64 table_block_relation_size(Relation rel, ForkNumber forkNumber);
+extern void table_block_relation_estimate_size(Relation rel,
+ int32 *attr_widths,
+ BlockNumber *pages,
+ double *tuples,
+ double *allvisfrac,
+ Size overhead_bytes_per_tuple,
+ Size usable_bytes_per_page);
+
+/* ----------------------------------------------------------------------------
+ * Functions in tableamapi.c
+ * ----------------------------------------------------------------------------
+ */
+
+extern const TableAmRoutine *GetTableAmRoutine(Oid amhandler);
+extern const TableAmRoutine *GetHeapamTableAmRoutine(void);
+extern bool check_default_table_access_method(char **newval, void **extra,
+ GucSource source);
+
+#endif /* TABLEAM_H */
diff --git a/src/include/access/timeline.h b/src/include/access/timeline.h
new file mode 100644
index 0000000..ce3586c
--- /dev/null
+++ b/src/include/access/timeline.h
@@ -0,0 +1,44 @@
+/*
+ * timeline.h
+ *
+ * Functions for reading and writing timeline history files.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/timeline.h
+ */
+#ifndef TIMELINE_H
+#define TIMELINE_H
+
+#include "access/xlogdefs.h"
+#include "nodes/pg_list.h"
+
+/*
+ * A list of these structs describes the timeline history of the server. Each
+ * TimeLineHistoryEntry represents a piece of WAL belonging to the history,
+ * from newest to oldest. All WAL locations between 'begin' and 'end' belong to
+ * the timeline represented by the entry. Together the 'begin' and 'end'
+ * pointers of all the entries form a contiguous line from beginning of time
+ * to infinity.
+ */
+typedef struct
+{
+ TimeLineID tli;
+ XLogRecPtr begin; /* inclusive */
+ XLogRecPtr end; /* exclusive, InvalidXLogRecPtr means infinity */
+} TimeLineHistoryEntry;
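As a sketch of how the entries are meant to be read (this mirrors what tliOfPointInHistory() does; demo_tli_of_point is a hypothetical name, and elog() is assumed to be available via postgres.h):

static TimeLineID
demo_tli_of_point(XLogRecPtr ptr, List *history)
{
	ListCell   *cell;

	foreach(cell, history)
	{
		TimeLineHistoryEntry *entry = (TimeLineHistoryEntry *) lfirst(cell);

		if ((XLogRecPtrIsInvalid(entry->begin) || entry->begin <= ptr) &&
			(XLogRecPtrIsInvalid(entry->end) || ptr < entry->end))
			return entry->tli;	/* ptr falls inside this entry's range */
	}

	elog(ERROR, "%X/%X is not in the timeline history",
		 (uint32) (ptr >> 32), (uint32) ptr);
	return 0;					/* keep compiler quiet */
}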
+
+extern List *readTimeLineHistory(TimeLineID targetTLI);
+extern bool existsTimeLineHistory(TimeLineID probeTLI);
+extern TimeLineID findNewestTimeLine(TimeLineID startTLI);
+extern void writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
+ XLogRecPtr switchpoint, char *reason);
+extern void writeTimeLineHistoryFile(TimeLineID tli, char *content, int size);
+extern void restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end);
+extern bool tliInHistory(TimeLineID tli, List *expectedTLEs);
+extern TimeLineID tliOfPointInHistory(XLogRecPtr ptr, List *history);
+extern XLogRecPtr tliSwitchPoint(TimeLineID tli, List *history,
+ TimeLineID *nextTLI);
+
+#endif /* TIMELINE_H */
diff --git a/src/include/access/toast_compression.h b/src/include/access/toast_compression.h
new file mode 100644
index 0000000..c992ece
--- /dev/null
+++ b/src/include/access/toast_compression.h
@@ -0,0 +1,73 @@
+/*-------------------------------------------------------------------------
+ *
+ * toast_compression.h
+ * Functions for toast compression.
+ *
+ * Copyright (c) 2021, PostgreSQL Global Development Group
+ *
+ * src/include/access/toast_compression.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef TOAST_COMPRESSION_H
+#define TOAST_COMPRESSION_H
+
+/*
+ * GUC support.
+ *
+ * default_toast_compression is an integer for purposes of the GUC machinery,
+ * but the value is one of the char values defined below, as they appear in
+ * pg_attribute.attcompression, e.g. TOAST_PGLZ_COMPRESSION.
+ */
+extern int default_toast_compression;
+
+/*
+ * Built-in compression method ID. The toast compression header will store
+ * this in the first 2 bits of the raw length. These built-in compression
+ * method IDs are directly mapped to the built-in compression methods.
+ *
+ * Don't use these values for anything other than understanding the meaning
+ * of the raw bits from a varlena; in particular, if the goal is to identify
+ * a compression method, use the constants TOAST_PGLZ_COMPRESSION, etc.
+ * below. We might someday support more than 4 compression methods, but
+ * we can never have more than 4 values in this enum, because there are
+ * only 2 bits available in the places where this is stored.
+ */
+typedef enum ToastCompressionId
+{
+ TOAST_PGLZ_COMPRESSION_ID = 0,
+ TOAST_LZ4_COMPRESSION_ID = 1,
+ TOAST_INVALID_COMPRESSION_ID = 2
+} ToastCompressionId;
+
+/*
+ * Built-in compression methods. pg_attribute will store these in the
+ * attcompression column. In attcompression, InvalidCompressionMethod
+ * denotes the default behavior.
+ */
+#define TOAST_PGLZ_COMPRESSION 'p'
+#define TOAST_LZ4_COMPRESSION 'l'
+#define InvalidCompressionMethod '\0'
+
+#define CompressionMethodIsValid(cm) ((cm) != InvalidCompressionMethod)
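A small sketch (not part of the API here) of how the attcompression byte relates to the 2-bit on-disk ID above; the function name is hypothetical.

static inline ToastCompressionId
demo_compression_id_for_method(char cmethod)
{
	switch (cmethod)
	{
		case TOAST_PGLZ_COMPRESSION:
			return TOAST_PGLZ_COMPRESSION_ID;
		case TOAST_LZ4_COMPRESSION:
			return TOAST_LZ4_COMPRESSION_ID;
		default:
			return TOAST_INVALID_COMPRESSION_ID;
	}
}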
+
+
+/* pglz compression/decompression routines */
+extern struct varlena *pglz_compress_datum(const struct varlena *value);
+extern struct varlena *pglz_decompress_datum(const struct varlena *value);
+extern struct varlena *pglz_decompress_datum_slice(const struct varlena *value,
+ int32 slicelength);
+
+/* lz4 compression/decompression routines */
+extern struct varlena *lz4_compress_datum(const struct varlena *value);
+extern struct varlena *lz4_decompress_datum(const struct varlena *value);
+extern struct varlena *lz4_decompress_datum_slice(const struct varlena *value,
+ int32 slicelength);
+
+/* other stuff */
+extern ToastCompressionId toast_get_compression_id(struct varlena *attr);
+extern char CompressionNameToMethod(const char *compression);
+extern const char *GetCompressionMethodName(char method);
+
+#endif /* TOAST_COMPRESSION_H */
diff --git a/src/include/access/toast_helper.h b/src/include/access/toast_helper.h
new file mode 100644
index 0000000..05104ce
--- /dev/null
+++ b/src/include/access/toast_helper.h
@@ -0,0 +1,116 @@
+/*-------------------------------------------------------------------------
+ *
+ * toast_helper.h
+ * Helper functions for table AMs implementing compressed or
+ * out-of-line storage of varlena attributes.
+ *
+ * Copyright (c) 2000-2021, PostgreSQL Global Development Group
+ *
+ * src/include/access/toast_helper.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef TOAST_HELPER_H
+#define TOAST_HELPER_H
+
+#include "utils/rel.h"
+
+/*
+ * Information about one column of a tuple being toasted.
+ *
+ * NOTE: toast_action[i] can have these values:
+ * ' ' default handling
+ * TYPSTORAGE_PLAIN already processed --- don't touch it
+ * TYPSTORAGE_EXTENDED incompressible, but OK to move off
+ *
+ * NOTE: toast_attr[i].tai_size is only made valid for varlena attributes with
+ * toast_action[i] different from TYPSTORAGE_PLAIN.
+ */
+typedef struct
+{
+ struct varlena *tai_oldexternal;
+ int32 tai_size;
+ uint8 tai_colflags;
+ char tai_compression;
+} ToastAttrInfo;
+
+/*
+ * Information about one tuple being toasted.
+ */
+typedef struct
+{
+ /*
+ * Before calling toast_tuple_init, the caller must initialize the
+ * following fields. Each array must have a length equal to
+ * ttc_rel->rd_att->natts. The ttc_oldvalues and ttc_oldisnull fields
+ * should be NULL in the case of an insert.
+ */
+ Relation ttc_rel; /* the relation that contains the tuple */
+ Datum *ttc_values; /* values from the tuple columns */
+ bool *ttc_isnull; /* null flags for the tuple columns */
+ Datum *ttc_oldvalues; /* values from previous tuple */
+ bool *ttc_oldisnull; /* null flags from previous tuple */
+
+ /*
+ * Before calling toast_tuple_init, the caller should set ttc_attr to
+ * point to an array of ToastAttrInfo structures of a length equal to
+ * ttc_rel->rd_att->natts. The contents of the array need not be
+ * initialized. ttc_flags also does not need to be initialized.
+ */
+ uint8 ttc_flags;
+ ToastAttrInfo *ttc_attr;
+} ToastTupleContext;
+
+/*
+ * Flags indicating the overall state of a TOAST operation.
+ *
+ * TOAST_NEEDS_DELETE_OLD indicates that one or more old TOAST datums need
+ * to be deleted.
+ *
+ * TOAST_NEEDS_FREE indicates that one or more TOAST values need to be freed.
+ *
+ * TOAST_HAS_NULLS indicates that nulls were found in the tuple being toasted.
+ *
+ * TOAST_NEEDS_CHANGE indicates that a new tuple needs to be built; in other
+ * words, the toaster did something.
+ */
+#define TOAST_NEEDS_DELETE_OLD 0x0001
+#define TOAST_NEEDS_FREE 0x0002
+#define TOAST_HAS_NULLS 0x0004
+#define TOAST_NEEDS_CHANGE 0x0008
+
+/*
+ * Flags indicating the status of a TOAST operation with respect to a
+ * particular column.
+ *
+ * TOASTCOL_NEEDS_DELETE_OLD indicates that the old TOAST datums for this
+ * column need to be deleted.
+ *
+ * TOASTCOL_NEEDS_FREE indicates that the value for this column needs to
+ * be freed.
+ *
+ * TOASTCOL_IGNORE indicates that the toaster should not further process
+ * this column.
+ *
+ * TOASTCOL_INCOMPRESSIBLE indicates that this column has been found to
+ * be incompressible, but could be moved out-of-line.
+ */
+#define TOASTCOL_NEEDS_DELETE_OLD TOAST_NEEDS_DELETE_OLD
+#define TOASTCOL_NEEDS_FREE TOAST_NEEDS_FREE
+#define TOASTCOL_IGNORE 0x0010
+#define TOASTCOL_INCOMPRESSIBLE 0x0020
+
+extern void toast_tuple_init(ToastTupleContext *ttc);
+extern int toast_tuple_find_biggest_attribute(ToastTupleContext *ttc,
+ bool for_compression,
+ bool check_main);
+extern void toast_tuple_try_compression(ToastTupleContext *ttc, int attribute);
+extern void toast_tuple_externalize(ToastTupleContext *ttc, int attribute,
+ int options);
+extern void toast_tuple_cleanup(ToastTupleContext *ttc);
+
+extern void toast_delete_external(Relation rel, Datum *values, bool *isnull,
+ bool is_speculative);
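A greatly simplified sketch of the intended calling sequence (compare heap_toast_insert_or_update()); the size test is reduced to a placeholder `tuple_too_big`, and `ttc` and `options` are assumed to have been set up by the caller as described above.

toast_tuple_init(&ttc);

/* First pass: compress the largest eligible attributes. */
while (tuple_too_big)
{
	int			biggest = toast_tuple_find_biggest_attribute(&ttc, true, false);

	if (biggest < 0)
		break;					/* nothing left that can be compressed */
	toast_tuple_try_compression(&ttc, biggest);
	/* ... recompute tuple_too_big from the new column sizes ... */
}

/* Second pass: move attributes out of line until the tuple fits. */
while (tuple_too_big)
{
	int			biggest = toast_tuple_find_biggest_attribute(&ttc, false, false);

	if (biggest < 0)
		break;
	toast_tuple_externalize(&ttc, biggest, options);
	/* ... recompute tuple_too_big ... */
}

/* Free any copied values flagged TOASTCOL_NEEDS_FREE. */
toast_tuple_cleanup(&ttc);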
+
+#endif
diff --git a/src/include/access/toast_internals.h b/src/include/access/toast_internals.h
new file mode 100644
index 0000000..1c28b07
--- /dev/null
+++ b/src/include/access/toast_internals.h
@@ -0,0 +1,63 @@
+/*-------------------------------------------------------------------------
+ *
+ * toast_internals.h
+ * Internal definitions for the TOAST system.
+ *
+ * Copyright (c) 2000-2021, PostgreSQL Global Development Group
+ *
+ * src/include/access/toast_internals.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef TOAST_INTERNALS_H
+#define TOAST_INTERNALS_H
+
+#include "access/toast_compression.h"
+#include "storage/lockdefs.h"
+#include "utils/relcache.h"
+#include "utils/snapshot.h"
+
+/*
+ * The information at the start of the compressed toast data.
+ */
+typedef struct toast_compress_header
+{
+ int32 vl_len_; /* varlena header (do not touch directly!) */
+ uint32 tcinfo; /* 2 bits for compression method and 30 bits
+ * external size; see va_extinfo */
+} toast_compress_header;
+
+/*
+ * Utilities for manipulation of header information for compressed
+ * toast entries.
+ */
+#define TOAST_COMPRESS_EXTSIZE(ptr) \
+ (((toast_compress_header *) (ptr))->tcinfo & VARLENA_EXTSIZE_MASK)
+#define TOAST_COMPRESS_METHOD(ptr) \
+ (((toast_compress_header *) (ptr))->tcinfo >> VARLENA_EXTSIZE_BITS)
+
+#define TOAST_COMPRESS_SET_SIZE_AND_COMPRESS_METHOD(ptr, len, cm_method) \
+ do { \
+ Assert((len) > 0 && (len) <= VARLENA_EXTSIZE_MASK); \
+ Assert((cm_method) == TOAST_PGLZ_COMPRESSION_ID || \
+ (cm_method) == TOAST_LZ4_COMPRESSION_ID); \
+ ((toast_compress_header *) (ptr))->tcinfo = \
+ (len) | ((uint32) (cm_method) << VARLENA_EXTSIZE_BITS); \
+ } while (0)
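For example, after toast_compress_datum() succeeds, the header can be decoded with the macros above; `cval`, a pointer to the compressed varlena, is an assumption.

if (TOAST_COMPRESS_METHOD(cval) == TOAST_LZ4_COMPRESSION_ID)
	elog(DEBUG1, "lz4-compressed datum, uncompressed size %u",
		 TOAST_COMPRESS_EXTSIZE(cval));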
+
+extern Datum toast_compress_datum(Datum value, char cmethod);
+extern Oid toast_get_valid_index(Oid toastoid, LOCKMODE lock);
+
+extern void toast_delete_datum(Relation rel, Datum value, bool is_speculative);
+extern Datum toast_save_datum(Relation rel, Datum value,
+ struct varlena *oldexternal, int options);
+
+extern int toast_open_indexes(Relation toastrel,
+ LOCKMODE lock,
+ Relation **toastidxs,
+ int *num_indexes);
+extern void toast_close_indexes(Relation *toastidxs, int num_indexes,
+ LOCKMODE lock);
+extern void init_toast_snapshot(Snapshot toast_snapshot);
+
+#endif /* TOAST_INTERNALS_H */
diff --git a/src/include/access/transam.h b/src/include/access/transam.h
new file mode 100644
index 0000000..2fe8a59
--- /dev/null
+++ b/src/include/access/transam.h
@@ -0,0 +1,370 @@
+/*-------------------------------------------------------------------------
+ *
+ * transam.h
+ * postgres transaction access method support code
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/transam.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef TRANSAM_H
+#define TRANSAM_H
+
+#include "access/xlogdefs.h"
+
+
+/* ----------------
+ * Special transaction ID values
+ *
+ * BootstrapTransactionId is the XID for "bootstrap" operations, and
+ * FrozenTransactionId is used for very old tuples. Both should
+ * always be considered valid.
+ *
+ * FirstNormalTransactionId is the first "normal" transaction id.
+ * Note: if you need to change it, you must change pg_class.h as well.
+ * ----------------
+ */
+#define InvalidTransactionId ((TransactionId) 0)
+#define BootstrapTransactionId ((TransactionId) 1)
+#define FrozenTransactionId ((TransactionId) 2)
+#define FirstNormalTransactionId ((TransactionId) 3)
+#define MaxTransactionId ((TransactionId) 0xFFFFFFFF)
+
+/* ----------------
+ * transaction ID manipulation macros
+ * ----------------
+ */
+#define TransactionIdIsValid(xid) ((xid) != InvalidTransactionId)
+#define TransactionIdIsNormal(xid) ((xid) >= FirstNormalTransactionId)
+#define TransactionIdEquals(id1, id2) ((id1) == (id2))
+#define TransactionIdStore(xid, dest) (*(dest) = (xid))
+#define StoreInvalidTransactionId(dest) (*(dest) = InvalidTransactionId)
+
+#define EpochFromFullTransactionId(x) ((uint32) ((x).value >> 32))
+#define XidFromFullTransactionId(x) ((uint32) (x).value)
+#define U64FromFullTransactionId(x) ((x).value)
+#define FullTransactionIdEquals(a, b) ((a).value == (b).value)
+#define FullTransactionIdPrecedes(a, b) ((a).value < (b).value)
+#define FullTransactionIdPrecedesOrEquals(a, b) ((a).value <= (b).value)
+#define FullTransactionIdFollows(a, b) ((a).value > (b).value)
+#define FullTransactionIdFollowsOrEquals(a, b) ((a).value >= (b).value)
+#define FullTransactionIdIsValid(x) TransactionIdIsValid(XidFromFullTransactionId(x))
+#define InvalidFullTransactionId FullTransactionIdFromEpochAndXid(0, InvalidTransactionId)
+#define FirstNormalFullTransactionId FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId)
+#define FullTransactionIdIsNormal(x) FullTransactionIdFollowsOrEquals(x, FirstNormalFullTransactionId)
+
+/*
+ * A 64 bit value that contains an epoch and a TransactionId. This is
+ * wrapped in a struct to prevent implicit conversion to/from TransactionId.
+ * Not all values represent valid normal XIDs.
+ */
+typedef struct FullTransactionId
+{
+ uint64 value;
+} FullTransactionId;
+
+static inline FullTransactionId
+FullTransactionIdFromEpochAndXid(uint32 epoch, TransactionId xid)
+{
+ FullTransactionId result;
+
+ result.value = ((uint64) epoch) << 32 | xid;
+
+ return result;
+}
+
+static inline FullTransactionId
+FullTransactionIdFromU64(uint64 value)
+{
+ FullTransactionId result;
+
+ result.value = value;
+
+ return result;
+}
+
+/* advance a transaction ID variable, handling wraparound correctly */
+#define TransactionIdAdvance(dest) \
+ do { \
+ (dest)++; \
+ if ((dest) < FirstNormalTransactionId) \
+ (dest) = FirstNormalTransactionId; \
+ } while(0)
+
+/*
+ * Retreat a FullTransactionId variable, stepping over xids that would appear
+ * to be special only when viewed as 32bit XIDs.
+ */
+static inline void
+FullTransactionIdRetreat(FullTransactionId *dest)
+{
+ dest->value--;
+
+ /*
+ * In contrast to 32bit XIDs don't step over the "actual" special xids.
+ * For 64bit xids these can't be reached as part of a wraparound as they
+ * can in the 32bit case.
+ */
+ if (FullTransactionIdPrecedes(*dest, FirstNormalFullTransactionId))
+ return;
+
+ /*
+ * But we do need to step over XIDs that'd appear special only for 32bit
+ * XIDs.
+ */
+ while (XidFromFullTransactionId(*dest) < FirstNormalTransactionId)
+ dest->value--;
+}
+
+/*
+ * Advance a FullTransactionId variable, stepping over xids that would appear
+ * to be special only when viewed as 32bit XIDs.
+ */
+static inline void
+FullTransactionIdAdvance(FullTransactionId *dest)
+{
+ dest->value++;
+
+ /* see FullTransactionIdRetreat() */
+ if (FullTransactionIdPrecedes(*dest, FirstNormalFullTransactionId))
+ return;
+
+ while (XidFromFullTransactionId(*dest) < FirstNormalTransactionId)
+ dest->value++;
+}
+
+/* back up a transaction ID variable, handling wraparound correctly */
+#define TransactionIdRetreat(dest) \
+ do { \
+ (dest)--; \
+ } while ((dest) < FirstNormalTransactionId)
+
+/* compare two XIDs already known to be normal; this is a macro for speed */
+#define NormalTransactionIdPrecedes(id1, id2) \
+ (AssertMacro(TransactionIdIsNormal(id1) && TransactionIdIsNormal(id2)), \
+ (int32) ((id1) - (id2)) < 0)
+
+/* compare two XIDs already known to be normal; this is a macro for speed */
+#define NormalTransactionIdFollows(id1, id2) \
+ (AssertMacro(TransactionIdIsNormal(id1) && TransactionIdIsNormal(id2)), \
+ (int32) ((id1) - (id2)) > 0)
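A short illustration of the modulo-2^32 ordering these macros (and TransactionIdPrecedes() for normal XIDs) implement: a numerically larger XID can still precede a smaller one across a wraparound.

TransactionId before_wrap = MaxTransactionId - 10;		/* assigned just before wraparound */
TransactionId after_wrap = FirstNormalTransactionId + 5;	/* assigned just after */

Assert(NormalTransactionIdPrecedes(before_wrap, after_wrap));
Assert(NormalTransactionIdFollows(after_wrap, before_wrap));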
+
+/* ----------
+ * Object ID (OID) zero is InvalidOid.
+ *
+ * OIDs 1-9999 are reserved for manual assignment (see .dat files in
+ * src/include/catalog/). Of these, 8000-9999 are reserved for
+ * development purposes (such as in-progress patches and forks);
+ * they should not appear in released versions.
+ *
+ * OIDs 10000-11999 are reserved for assignment by genbki.pl, for use
+ * when the .dat files in src/include/catalog/ do not specify an OID
+ * for a catalog entry that requires one. Note that genbki.pl assigns
+ * these OIDs independently in each catalog, so they're not guaranteed
+ * to be globally unique.
+ *
+ * OIDs 12000-16383 are reserved for assignment during initdb
+ * using the OID generator. (We start the generator at 12000.)
+ *
+ * OIDs beginning at 16384 are assigned from the OID generator
+ * during normal multiuser operation. (We force the generator up to
+ * 16384 as soon as we are in normal operation.)
+ *
+ * The choices of 8000, 10000 and 12000 are completely arbitrary, and can be
+ * moved if we run low on OIDs in any category. Changing the macros below,
+ * and updating relevant documentation (see bki.sgml and RELEASE_CHANGES),
+ * should be sufficient to do this. Moving the 16384 boundary between
+ * initdb-assigned OIDs and user-defined objects would be substantially
+ * more painful, however, since some user-defined OIDs will appear in
+ * on-disk data; such a change would probably break pg_upgrade.
+ *
+ * NOTE: if the OID generator wraps around, we skip over OIDs 0-16383
+ * and resume with 16384. This minimizes the odds of OID conflict, by not
+ * reassigning OIDs that might have been assigned during initdb.
+ * ----------
+ */
+#define FirstGenbkiObjectId 10000
+#define FirstBootstrapObjectId 12000
+#define FirstNormalObjectId 16384
+
+/*
+ * VariableCache is a data structure in shared memory that is used to track
+ * OID and XID assignment state. For largely historical reasons, there is
+ * just one struct with different fields that are protected by different
+ * LWLocks.
+ *
+ * Note: xidWrapLimit and oldestXidDB are not "active" values, but are
+ * used just to generate useful messages when xidWarnLimit or xidStopLimit
+ * are exceeded.
+ */
+typedef struct VariableCacheData
+{
+ /*
+ * These fields are protected by OidGenLock.
+ */
+ Oid nextOid; /* next OID to assign */
+ uint32 oidCount; /* OIDs available before we must do XLOG work */
+
+ /*
+ * These fields are protected by XidGenLock.
+ */
+ FullTransactionId nextXid; /* next XID to assign */
+
+ TransactionId oldestXid; /* cluster-wide minimum datfrozenxid */
+ TransactionId xidVacLimit; /* start forcing autovacuums here */
+ TransactionId xidWarnLimit; /* start complaining here */
+ TransactionId xidStopLimit; /* refuse to advance nextXid beyond here */
+ TransactionId xidWrapLimit; /* where the world ends */
+ Oid oldestXidDB; /* database with minimum datfrozenxid */
+
+ /*
+ * These fields are protected by CommitTsLock
+ */
+ TransactionId oldestCommitTsXid;
+ TransactionId newestCommitTsXid;
+
+ /*
+ * These fields are protected by ProcArrayLock.
+ */
+ FullTransactionId latestCompletedXid; /* newest full XID that has
+ * committed or aborted */
+
+ /*
+ * Number of top-level transactions with xids (i.e. which may have
+ * modified the database) that completed in some form since the start of
+ * the server. This currently is solely used to check whether
+ * GetSnapshotData() needs to recompute the contents of the snapshot, or
+ * not. There are likely other users of this. Always above 1.
+ */
+ uint64 xactCompletionCount;
+
+ /*
+ * These fields are protected by XactTruncationLock
+ */
+ TransactionId oldestClogXid; /* oldest it's safe to look up in clog */
+
+} VariableCacheData;
+
+typedef VariableCacheData *VariableCache;
+
+
+/* ----------------
+ * extern declarations
+ * ----------------
+ */
+
+/* in transam/xact.c */
+extern bool TransactionStartedDuringRecovery(void);
+
+/* in transam/varsup.c */
+extern PGDLLIMPORT VariableCache ShmemVariableCache;
+
+/*
+ * prototypes for functions in transam/transam.c
+ */
+extern bool TransactionIdDidCommit(TransactionId transactionId);
+extern bool TransactionIdDidAbort(TransactionId transactionId);
+extern bool TransactionIdIsKnownCompleted(TransactionId transactionId);
+extern void TransactionIdCommitTree(TransactionId xid, int nxids, TransactionId *xids);
+extern void TransactionIdAsyncCommitTree(TransactionId xid, int nxids, TransactionId *xids, XLogRecPtr lsn);
+extern void TransactionIdAbortTree(TransactionId xid, int nxids, TransactionId *xids);
+extern bool TransactionIdPrecedes(TransactionId id1, TransactionId id2);
+extern bool TransactionIdPrecedesOrEquals(TransactionId id1, TransactionId id2);
+extern bool TransactionIdFollows(TransactionId id1, TransactionId id2);
+extern bool TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2);
+extern TransactionId TransactionIdLatest(TransactionId mainxid,
+ int nxids, const TransactionId *xids);
+extern XLogRecPtr TransactionIdGetCommitLSN(TransactionId xid);
+
+/* in transam/varsup.c */
+extern FullTransactionId GetNewTransactionId(bool isSubXact);
+extern void AdvanceNextFullTransactionIdPastXid(TransactionId xid);
+extern FullTransactionId ReadNextFullTransactionId(void);
+extern void SetTransactionIdLimit(TransactionId oldest_datfrozenxid,
+ Oid oldest_datoid);
+extern void AdvanceOldestClogXid(TransactionId oldest_datfrozenxid);
+extern bool ForceTransactionIdLimitUpdate(void);
+extern Oid GetNewObjectId(void);
+
+#ifdef USE_ASSERT_CHECKING
+extern void AssertTransactionIdInAllowableRange(TransactionId xid);
+#else
+#define AssertTransactionIdInAllowableRange(xid) ((void)true)
+#endif
+
+/*
+ * Some frontend programs include this header. For compilers that emit static
+ * inline functions even when they're unused, that leads to unsatisfied
+ * external references; hence hide them with #ifndef FRONTEND.
+ */
+#ifndef FRONTEND
+
+/*
+ * For callers that just need the XID part of the next transaction ID.
+ */
+static inline TransactionId
+ReadNextTransactionId(void)
+{
+ return XidFromFullTransactionId(ReadNextFullTransactionId());
+}
+
+/* return transaction ID backed up by amount, handling wraparound correctly */
+static inline TransactionId
+TransactionIdRetreatedBy(TransactionId xid, uint32 amount)
+{
+ xid -= amount;
+
+ while (xid < FirstNormalTransactionId)
+ xid--;
+
+ return xid;
+}
+
+/* return the older of the two IDs */
+static inline TransactionId
+TransactionIdOlder(TransactionId a, TransactionId b)
+{
+ if (!TransactionIdIsValid(a))
+ return b;
+
+ if (!TransactionIdIsValid(b))
+ return a;
+
+ if (TransactionIdPrecedes(a, b))
+ return a;
+ return b;
+}
+
+/* return the older of the two IDs, assuming they're both normal */
+static inline TransactionId
+NormalTransactionIdOlder(TransactionId a, TransactionId b)
+{
+ Assert(TransactionIdIsNormal(a));
+ Assert(TransactionIdIsNormal(b));
+ if (NormalTransactionIdPrecedes(a, b))
+ return a;
+ return b;
+}
+
+/* return the newer of the two IDs */
+static inline FullTransactionId
+FullTransactionIdNewer(FullTransactionId a, FullTransactionId b)
+{
+ if (!FullTransactionIdIsValid(a))
+ return b;
+
+ if (!FullTransactionIdIsValid(b))
+ return a;
+
+ if (FullTransactionIdFollows(a, b))
+ return a;
+ return b;
+}
+
+#endif /* FRONTEND */
+
+#endif /* TRANSAM_H */
diff --git a/src/include/access/tsmapi.h b/src/include/access/tsmapi.h
new file mode 100644
index 0000000..2dc848c
--- /dev/null
+++ b/src/include/access/tsmapi.h
@@ -0,0 +1,82 @@
+/*-------------------------------------------------------------------------
+ *
+ * tsmapi.h
+ * API for tablesample methods
+ *
+ * Copyright (c) 2015-2021, PostgreSQL Global Development Group
+ *
+ * src/include/access/tsmapi.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef TSMAPI_H
+#define TSMAPI_H
+
+#include "nodes/execnodes.h"
+#include "nodes/pathnodes.h"
+
+
+/*
+ * Callback function signatures --- see tablesample-method.sgml for more info.
+ */
+
+typedef void (*SampleScanGetSampleSize_function) (PlannerInfo *root,
+ RelOptInfo *baserel,
+ List *paramexprs,
+ BlockNumber *pages,
+ double *tuples);
+
+typedef void (*InitSampleScan_function) (SampleScanState *node,
+ int eflags);
+
+typedef void (*BeginSampleScan_function) (SampleScanState *node,
+ Datum *params,
+ int nparams,
+ uint32 seed);
+
+typedef BlockNumber (*NextSampleBlock_function) (SampleScanState *node,
+ BlockNumber nblocks);
+
+typedef OffsetNumber (*NextSampleTuple_function) (SampleScanState *node,
+ BlockNumber blockno,
+ OffsetNumber maxoffset);
+
+typedef void (*EndSampleScan_function) (SampleScanState *node);
+
+/*
+ * TsmRoutine is the struct returned by a tablesample method's handler
+ * function. It provides pointers to the callback functions needed by the
+ * planner and executor, as well as additional information about the method.
+ *
+ * More function pointers are likely to be added in the future.
+ * Therefore it's recommended that the handler initialize the struct with
+ * makeNode(TsmRoutine) so that all fields are set to NULL. This will
+ * ensure that no fields are accidentally left undefined.
+ */
+typedef struct TsmRoutine
+{
+ NodeTag type;
+
+ /* List of datatype OIDs for the arguments of the TABLESAMPLE clause */
+ List *parameterTypes;
+
+ /* Can method produce repeatable samples across, or even within, queries? */
+ bool repeatable_across_queries;
+ bool repeatable_across_scans;
+
+ /* Functions for planning a SampleScan on a physical table */
+ SampleScanGetSampleSize_function SampleScanGetSampleSize;
+
+ /* Functions for executing a SampleScan on a physical table */
+ InitSampleScan_function InitSampleScan; /* can be NULL */
+ BeginSampleScan_function BeginSampleScan;
+ NextSampleBlock_function NextSampleBlock; /* can be NULL */
+ NextSampleTuple_function NextSampleTuple;
+ EndSampleScan_function EndSampleScan; /* can be NULL */
+} TsmRoutine;
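A sketch of a handler function returning this struct (the demo_* names are hypothetical; compare src/backend/access/tablesample/system.c). Building it with makeNode(), as recommended above, leaves any unset callbacks NULL; fmgr.h and catalog/pg_type_d.h are assumed to be included.

Datum
demo_tsm_handler(PG_FUNCTION_ARGS)
{
	TsmRoutine *tsm = makeNode(TsmRoutine);

	tsm->parameterTypes = list_make1_oid(FLOAT4OID);	/* e.g. a percentage */
	tsm->repeatable_across_queries = true;
	tsm->repeatable_across_scans = true;

	tsm->SampleScanGetSampleSize = demo_samplescangetsamplesize;
	tsm->InitSampleScan = NULL;			/* optional, left unset */
	tsm->BeginSampleScan = demo_beginsamplescan;
	tsm->NextSampleBlock = demo_nextsampleblock;
	tsm->NextSampleTuple = demo_nextsampletuple;
	tsm->EndSampleScan = NULL;			/* optional, left unset */

	PG_RETURN_POINTER(tsm);
}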
+
+
+/* Functions in access/tablesample/tablesample.c */
+extern TsmRoutine *GetTsmRoutine(Oid tsmhandler);
+
+#endif /* TSMAPI_H */
diff --git a/src/include/access/tupconvert.h b/src/include/access/tupconvert.h
new file mode 100644
index 0000000..a2cc4b3
--- /dev/null
+++ b/src/include/access/tupconvert.h
@@ -0,0 +1,51 @@
+/*-------------------------------------------------------------------------
+ *
+ * tupconvert.h
+ * Tuple conversion support.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/tupconvert.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef TUPCONVERT_H
+#define TUPCONVERT_H
+
+#include "access/attmap.h"
+#include "access/htup.h"
+#include "access/tupdesc.h"
+#include "executor/tuptable.h"
+#include "nodes/bitmapset.h"
+
+
+typedef struct TupleConversionMap
+{
+ TupleDesc indesc; /* tupdesc for source rowtype */
+ TupleDesc outdesc; /* tupdesc for result rowtype */
+ AttrMap *attrMap; /* indexes of input fields, or 0 for null */
+ Datum *invalues; /* workspace for deconstructing source */
+ bool *inisnull;
+ Datum *outvalues; /* workspace for constructing result */
+ bool *outisnull;
+} TupleConversionMap;
+
+
+extern TupleConversionMap *convert_tuples_by_position(TupleDesc indesc,
+ TupleDesc outdesc,
+ const char *msg);
+
+extern TupleConversionMap *convert_tuples_by_name(TupleDesc indesc,
+ TupleDesc outdesc);
+
+extern HeapTuple execute_attr_map_tuple(HeapTuple tuple, TupleConversionMap *map);
+extern TupleTableSlot *execute_attr_map_slot(AttrMap *attrMap,
+ TupleTableSlot *in_slot,
+ TupleTableSlot *out_slot);
+extern Bitmapset *execute_attr_map_cols(AttrMap *attrMap, Bitmapset *inbitmap);
+
+extern void free_conversion_map(TupleConversionMap *map);
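A minimal usage sketch, assuming `indesc`, `outdesc`, and `tuple` come from the caller; convert_tuples_by_name() returns NULL when no conversion is needed.

TupleConversionMap *map = convert_tuples_by_name(indesc, outdesc);

if (map != NULL)
{
	HeapTuple	converted = execute_attr_map_tuple(tuple, map);

	/* ... use `converted` instead of `tuple` ... */
	free_conversion_map(map);
}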
+
+#endif /* TUPCONVERT_H */
diff --git a/src/include/access/tupdesc.h b/src/include/access/tupdesc.h
new file mode 100644
index 0000000..f45d47a
--- /dev/null
+++ b/src/include/access/tupdesc.h
@@ -0,0 +1,154 @@
+/*-------------------------------------------------------------------------
+ *
+ * tupdesc.h
+ * POSTGRES tuple descriptor definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/tupdesc.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef TUPDESC_H
+#define TUPDESC_H
+
+#include "access/attnum.h"
+#include "catalog/pg_attribute.h"
+#include "nodes/pg_list.h"
+
+
+typedef struct AttrDefault
+{
+ AttrNumber adnum;
+ char *adbin; /* nodeToString representation of expr */
+} AttrDefault;
+
+typedef struct ConstrCheck
+{
+ char *ccname;
+ char *ccbin; /* nodeToString representation of expr */
+ bool ccvalid;
+ bool ccnoinherit; /* this is a non-inheritable constraint */
+} ConstrCheck;
+
+/* This structure contains constraints of a tuple */
+typedef struct TupleConstr
+{
+ AttrDefault *defval; /* array */
+ ConstrCheck *check; /* array */
+ struct AttrMissing *missing; /* missing attribute values, NULL if none */
+ uint16 num_defval;
+ uint16 num_check;
+ bool has_not_null;
+ bool has_generated_stored;
+} TupleConstr;
+
+/*
+ * This struct is passed around within the backend to describe the structure
+ * of tuples. For tuples coming from on-disk relations, the information is
+ * collected from the pg_attribute, pg_attrdef, and pg_constraint catalogs.
+ * Transient row types (such as the result of a join query) have anonymous
+ * TupleDesc structs that generally omit any constraint info; therefore the
+ * structure is designed to let the constraints be omitted efficiently.
+ *
+ * Note that only user attributes, not system attributes, are mentioned in
+ * TupleDesc.
+ *
+ * If the tupdesc is known to correspond to a named rowtype (such as a table's
+ * rowtype) then tdtypeid identifies that type and tdtypmod is -1. Otherwise
+ * tdtypeid is RECORDOID, and tdtypmod can be either -1 for a fully anonymous
+ * row type, or a value >= 0 to allow the rowtype to be looked up in the
+ * typcache.c type cache.
+ *
+ * Note that tdtypeid is never the OID of a domain over composite, even if
+ * we are dealing with values that are known (at some higher level) to be of
+ * a domain-over-composite type. This is because tdtypeid/tdtypmod need to
+ * match up with the type labeling of composite Datums, and those are never
+ * explicitly marked as being of a domain type, either.
+ *
+ * Tuple descriptors that live in caches (relcache or typcache, at present)
+ * are reference-counted: they can be deleted when their reference count goes
+ * to zero. Tuple descriptors created by the executor need no reference
+ * counting, however: they are simply created in the appropriate memory
+ * context and go away when the context is freed. We set the tdrefcount
+ * field of such a descriptor to -1, while reference-counted descriptors
+ * always have tdrefcount >= 0.
+ */
+typedef struct TupleDescData
+{
+ int natts; /* number of attributes in the tuple */
+ Oid tdtypeid; /* composite type ID for tuple type */
+ int32 tdtypmod; /* typmod for tuple type */
+ int tdrefcount; /* reference count, or -1 if not counting */
+ TupleConstr *constr; /* constraints, or NULL if none */
+ /* attrs[N] is the description of Attribute Number N+1 */
+ FormData_pg_attribute attrs[FLEXIBLE_ARRAY_MEMBER];
+} TupleDescData;
+typedef struct TupleDescData *TupleDesc;
+
+/* Accessor for the i'th attribute of tupdesc. */
+#define TupleDescAttr(tupdesc, i) (&(tupdesc)->attrs[(i)])
+
+extern TupleDesc CreateTemplateTupleDesc(int natts);
+
+extern TupleDesc CreateTupleDesc(int natts, Form_pg_attribute *attrs);
+
+extern TupleDesc CreateTupleDescCopy(TupleDesc tupdesc);
+
+extern TupleDesc CreateTupleDescCopyConstr(TupleDesc tupdesc);
+
+#define TupleDescSize(src) \
+ (offsetof(struct TupleDescData, attrs) + \
+ (src)->natts * sizeof(FormData_pg_attribute))
+
+extern void TupleDescCopy(TupleDesc dst, TupleDesc src);
+
+extern void TupleDescCopyEntry(TupleDesc dst, AttrNumber dstAttno,
+ TupleDesc src, AttrNumber srcAttno);
+
+extern void FreeTupleDesc(TupleDesc tupdesc);
+
+extern void IncrTupleDescRefCount(TupleDesc tupdesc);
+extern void DecrTupleDescRefCount(TupleDesc tupdesc);
+
+#define PinTupleDesc(tupdesc) \
+ do { \
+ if ((tupdesc)->tdrefcount >= 0) \
+ IncrTupleDescRefCount(tupdesc); \
+ } while (0)
+
+#define ReleaseTupleDesc(tupdesc) \
+ do { \
+ if ((tupdesc)->tdrefcount >= 0) \
+ DecrTupleDescRefCount(tupdesc); \
+ } while (0)
+
+extern bool equalTupleDescs(TupleDesc tupdesc1, TupleDesc tupdesc2);
+
+extern uint32 hashTupleDesc(TupleDesc tupdesc);
+
+extern void TupleDescInitEntry(TupleDesc desc,
+ AttrNumber attributeNumber,
+ const char *attributeName,
+ Oid oidtypeid,
+ int32 typmod,
+ int attdim);
+
+extern void TupleDescInitBuiltinEntry(TupleDesc desc,
+ AttrNumber attributeNumber,
+ const char *attributeName,
+ Oid oidtypeid,
+ int32 typmod,
+ int attdim);
+
+extern void TupleDescInitEntryCollation(TupleDesc desc,
+ AttrNumber attributeNumber,
+ Oid collationid);
+
+extern TupleDesc BuildDescForRelation(List *schema);
+
+extern TupleDesc BuildDescFromLists(List *names, List *types, List *typmods, List *collations);
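A sketch of building a transient (non-refcounted) descriptor with the functions above; INT4OID and TEXTOID are assumed to come from catalog/pg_type_d.h.

TupleDesc	tupdesc = CreateTemplateTupleDesc(2);

TupleDescInitEntry(tupdesc, (AttrNumber) 1, "id", INT4OID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 2, "name", TEXTOID, -1, 0);

/* ... form tuples against tupdesc ... */

FreeTupleDesc(tupdesc);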
+
+#endif /* TUPDESC_H */
diff --git a/src/include/access/tupdesc_details.h b/src/include/access/tupdesc_details.h
new file mode 100644
index 0000000..d0d2c99
--- /dev/null
+++ b/src/include/access/tupdesc_details.h
@@ -0,0 +1,28 @@
+/*-------------------------------------------------------------------------
+ *
+ * tupdesc_details.h
+ * POSTGRES tuple descriptor definitions we can't include everywhere
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/tupdesc_details.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef TUPDESC_DETAILS_H
+#define TUPDESC_DETAILS_H
+
+/*
+ * Structure used to represent value to be used when the attribute is not
+ * present at all in a tuple, i.e. when the column was added after the tuple
+ * was created.
+ */
+typedef struct AttrMissing
+{
+ bool am_present; /* true if non-NULL missing value exists */
+ Datum am_value; /* value when attribute is missing */
+} AttrMissing;
+
+#endif /* TUPDESC_DETAILS_H */
diff --git a/src/include/access/tupmacs.h b/src/include/access/tupmacs.h
new file mode 100644
index 0000000..65ac1ef
--- /dev/null
+++ b/src/include/access/tupmacs.h
@@ -0,0 +1,247 @@
+/*-------------------------------------------------------------------------
+ *
+ * tupmacs.h
+ * Tuple macros used by both index tuples and heap tuples.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/tupmacs.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef TUPMACS_H
+#define TUPMACS_H
+
+#include "catalog/pg_type_d.h" /* for TYPALIGN macros */
+
+
+/*
+ * Check a tuple's null bitmap to determine whether the attribute is null.
+ * Note that a 0 in the null bitmap indicates a null, while 1 indicates
+ * non-null.
+ */
+#define att_isnull(ATT, BITS) (!((BITS)[(ATT) >> 3] & (1 << ((ATT) & 0x07))))
+
+/*
+ * Given a Form_pg_attribute and a pointer into a tuple's data area,
+ * return the correct value or pointer.
+ *
+ * We return a Datum value in all cases. If the attribute has "byval" false,
+ * we return the same pointer into the tuple data area that we're passed.
+ * Otherwise, we return the correct number of bytes fetched from the data
+ * area and extended to Datum form.
+ *
+ * On machines where Datum is 8 bytes, we support fetching 8-byte byval
+ * attributes; otherwise, only 1, 2, and 4-byte values are supported.
+ *
+ * Note that T must already be properly aligned for this to work correctly.
+ */
+#define fetchatt(A,T) fetch_att(T, (A)->attbyval, (A)->attlen)
+
+/*
+ * Same, but work from byval/len parameters rather than Form_pg_attribute.
+ */
+#if SIZEOF_DATUM == 8
+
+#define fetch_att(T,attbyval,attlen) \
+( \
+ (attbyval) ? \
+ ( \
+ (attlen) == (int) sizeof(Datum) ? \
+ *((Datum *)(T)) \
+ : \
+ ( \
+ (attlen) == (int) sizeof(int32) ? \
+ Int32GetDatum(*((int32 *)(T))) \
+ : \
+ ( \
+ (attlen) == (int) sizeof(int16) ? \
+ Int16GetDatum(*((int16 *)(T))) \
+ : \
+ ( \
+ AssertMacro((attlen) == 1), \
+ CharGetDatum(*((char *)(T))) \
+ ) \
+ ) \
+ ) \
+ ) \
+ : \
+ PointerGetDatum((char *) (T)) \
+)
+#else /* SIZEOF_DATUM != 8 */
+
+#define fetch_att(T,attbyval,attlen) \
+( \
+ (attbyval) ? \
+ ( \
+ (attlen) == (int) sizeof(int32) ? \
+ Int32GetDatum(*((int32 *)(T))) \
+ : \
+ ( \
+ (attlen) == (int) sizeof(int16) ? \
+ Int16GetDatum(*((int16 *)(T))) \
+ : \
+ ( \
+ AssertMacro((attlen) == 1), \
+ CharGetDatum(*((char *)(T))) \
+ ) \
+ ) \
+ ) \
+ : \
+ PointerGetDatum((char *) (T)) \
+)
+#endif /* SIZEOF_DATUM == 8 */
+
+/*
+ * att_align_datum aligns the given offset as needed for a datum of alignment
+ * requirement attalign and typlen attlen. attdatum is the Datum variable
+ * we intend to pack into a tuple (it's only accessed if we are dealing with
+ * a varlena type). Note that this assumes the Datum will be stored as-is;
+ * callers that are intending to convert non-short varlena datums to short
+ * format have to account for that themselves.
+ */
+#define att_align_datum(cur_offset, attalign, attlen, attdatum) \
+( \
+ ((attlen) == -1 && VARATT_IS_SHORT(DatumGetPointer(attdatum))) ? \
+ (uintptr_t) (cur_offset) : \
+ att_align_nominal(cur_offset, attalign) \
+)
+
+/*
+ * att_align_pointer performs the same calculation as att_align_datum,
+ * but is used when walking a tuple. attptr is the current actual data
+ * pointer; when accessing a varlena field we have to "peek" to see if we
+ * are looking at a pad byte or the first byte of a 1-byte-header datum.
+ * (A zero byte must be either a pad byte, or the first byte of a correctly
+ * aligned 4-byte length word; in either case we can align safely. A non-zero
+ * byte must be either a 1-byte length word, or the first byte of a correctly
+ * aligned 4-byte length word; in either case we need not align.)
+ *
+ * Note: some callers pass a "char *" pointer for cur_offset. This is
+ * a bit of a hack but should work all right as long as uintptr_t is the
+ * correct width.
+ */
+#define att_align_pointer(cur_offset, attalign, attlen, attptr) \
+( \
+ ((attlen) == -1 && VARATT_NOT_PAD_BYTE(attptr)) ? \
+ (uintptr_t) (cur_offset) : \
+ att_align_nominal(cur_offset, attalign) \
+)
+
+/*
+ * att_align_nominal aligns the given offset as needed for a datum of alignment
+ * requirement attalign, ignoring any consideration of packed varlena datums.
+ * There are three main use cases for using this macro directly:
+ * * we know that the att in question is not varlena (attlen != -1);
+ * in this case it is cheaper than the above macros and just as good.
+ * * we need to estimate alignment padding cost abstractly, ie without
+ * reference to a real tuple. We must assume the worst case that
+ * all varlenas are aligned.
+ * * within arrays and multiranges, we unconditionally align varlenas (XXX this
+ * should be revisited, probably).
+ *
+ * The attalign cases are tested in what is hopefully something like their
+ * frequency of occurrence.
+ */
+#define att_align_nominal(cur_offset, attalign) \
+( \
+ ((attalign) == TYPALIGN_INT) ? INTALIGN(cur_offset) : \
+ (((attalign) == TYPALIGN_CHAR) ? (uintptr_t) (cur_offset) : \
+ (((attalign) == TYPALIGN_DOUBLE) ? DOUBLEALIGN(cur_offset) : \
+ ( \
+ AssertMacro((attalign) == TYPALIGN_SHORT), \
+ SHORTALIGN(cur_offset) \
+ ))) \
+)
+
+/*
+ * att_addlength_datum increments the given offset by the space needed for
+ * the given Datum variable. attdatum is only accessed if we are dealing
+ * with a variable-length attribute.
+ */
+#define att_addlength_datum(cur_offset, attlen, attdatum) \
+ att_addlength_pointer(cur_offset, attlen, DatumGetPointer(attdatum))
+
+/*
+ * att_addlength_pointer performs the same calculation as att_addlength_datum,
+ * but is used when walking a tuple --- attptr is the pointer to the field
+ * within the tuple.
+ *
+ * Note: some callers pass a "char *" pointer for cur_offset. This is
+ * actually perfectly OK, but probably should be cleaned up along with
+ * the same practice for att_align_pointer.
+ */
+#define att_addlength_pointer(cur_offset, attlen, attptr) \
+( \
+ ((attlen) > 0) ? \
+ ( \
+ (cur_offset) + (attlen) \
+ ) \
+ : (((attlen) == -1) ? \
+ ( \
+ (cur_offset) + VARSIZE_ANY(attptr) \
+ ) \
+ : \
+ ( \
+ AssertMacro((attlen) == -2), \
+ (cur_offset) + (strlen((char *) (attptr)) + 1) \
+ )) \
+)
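These alignment and length macros are meant to be used together when walking a tuple's data area; below is a simplified sketch of the standard loop (compare heap_deform_tuple(), minus the attcacheoff fast path), with the surrounding variables (tupdesc, natts, attnum, tp, bp, off, values, isnull, hasnulls) assumed from the caller.

for (attnum = 0; attnum < natts; attnum++)
{
	Form_pg_attribute thisatt = TupleDescAttr(tupdesc, attnum);

	if (hasnulls && att_isnull(attnum, bp))
	{
		values[attnum] = (Datum) 0;
		isnull[attnum] = true;
		continue;
	}
	isnull[attnum] = false;

	/* align (peeking at the data for packed varlenas), fetch, then advance */
	off = att_align_pointer(off, thisatt->attalign, thisatt->attlen, tp + off);
	values[attnum] = fetchatt(thisatt, tp + off);
	off = att_addlength_pointer(off, thisatt->attlen, tp + off);
}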
+
+/*
+ * store_att_byval is a partial inverse of fetch_att: store a given Datum
+ * value into a tuple data area at the specified address. However, it only
+ * handles the byval case, because in typical usage the caller needs to
+ * distinguish by-val and by-ref cases anyway, and so a do-it-all macro
+ * wouldn't be convenient.
+ */
+#if SIZEOF_DATUM == 8
+
+#define store_att_byval(T,newdatum,attlen) \
+ do { \
+ switch (attlen) \
+ { \
+ case sizeof(char): \
+ *(char *) (T) = DatumGetChar(newdatum); \
+ break; \
+ case sizeof(int16): \
+ *(int16 *) (T) = DatumGetInt16(newdatum); \
+ break; \
+ case sizeof(int32): \
+ *(int32 *) (T) = DatumGetInt32(newdatum); \
+ break; \
+ case sizeof(Datum): \
+ *(Datum *) (T) = (newdatum); \
+ break; \
+ default: \
+ elog(ERROR, "unsupported byval length: %d", \
+ (int) (attlen)); \
+ break; \
+ } \
+ } while (0)
+#else /* SIZEOF_DATUM != 8 */
+
+#define store_att_byval(T,newdatum,attlen) \
+ do { \
+ switch (attlen) \
+ { \
+ case sizeof(char): \
+ *(char *) (T) = DatumGetChar(newdatum); \
+ break; \
+ case sizeof(int16): \
+ *(int16 *) (T) = DatumGetInt16(newdatum); \
+ break; \
+ case sizeof(int32): \
+ *(int32 *) (T) = DatumGetInt32(newdatum); \
+ break; \
+ default: \
+ elog(ERROR, "unsupported byval length: %d", \
+ (int) (attlen)); \
+ break; \
+ } \
+ } while (0)
+#endif /* SIZEOF_DATUM == 8 */
+
+#endif
diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h
new file mode 100644
index 0000000..edb797b
--- /dev/null
+++ b/src/include/access/twophase.h
@@ -0,0 +1,63 @@
+/*-------------------------------------------------------------------------
+ *
+ * twophase.h
+ * Two-phase-commit related declarations.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/twophase.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef TWOPHASE_H
+#define TWOPHASE_H
+
+#include "access/xact.h"
+#include "access/xlogdefs.h"
+#include "datatype/timestamp.h"
+#include "storage/lock.h"
+
+/*
+ * GlobalTransactionData is defined in twophase.c; other places have no
+ * business knowing the internal definition.
+ */
+typedef struct GlobalTransactionData *GlobalTransaction;
+
+/* GUC variable */
+extern PGDLLIMPORT int max_prepared_xacts;
+
+extern Size TwoPhaseShmemSize(void);
+extern void TwoPhaseShmemInit(void);
+
+extern void AtAbort_Twophase(void);
+extern void PostPrepare_Twophase(void);
+
+extern TransactionId TwoPhaseGetXidByVirtualXID(VirtualTransactionId vxid,
+ bool *have_more);
+extern PGPROC *TwoPhaseGetDummyProc(TransactionId xid, bool lock_held);
+extern BackendId TwoPhaseGetDummyBackendId(TransactionId xid, bool lock_held);
+
+extern GlobalTransaction MarkAsPreparing(TransactionId xid, const char *gid,
+ TimestampTz prepared_at,
+ Oid owner, Oid databaseid);
+
+extern void StartPrepare(GlobalTransaction gxact);
+extern void EndPrepare(GlobalTransaction gxact);
+extern bool StandbyTransactionIdIsPrepared(TransactionId xid);
+
+extern TransactionId PrescanPreparedTransactions(TransactionId **xids_p,
+ int *nxids_p);
+extern void StandbyRecoverPreparedTransactions(void);
+extern void RecoverPreparedTransactions(void);
+
+extern void CheckPointTwoPhase(XLogRecPtr redo_horizon);
+
+extern void FinishPreparedTransaction(const char *gid, bool isCommit);
+
+extern void PrepareRedoAdd(char *buf, XLogRecPtr start_lsn,
+ XLogRecPtr end_lsn, RepOriginId origin_id);
+extern void PrepareRedoRemove(TransactionId xid, bool giveWarning);
+extern void restoreTwoPhaseData(void);
+#endif /* TWOPHASE_H */
diff --git a/src/include/access/twophase_rmgr.h b/src/include/access/twophase_rmgr.h
new file mode 100644
index 0000000..2709d72
--- /dev/null
+++ b/src/include/access/twophase_rmgr.h
@@ -0,0 +1,40 @@
+/*-------------------------------------------------------------------------
+ *
+ * twophase_rmgr.h
+ * Two-phase-commit resource managers definition
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/twophase_rmgr.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef TWOPHASE_RMGR_H
+#define TWOPHASE_RMGR_H
+
+typedef void (*TwoPhaseCallback) (TransactionId xid, uint16 info,
+ void *recdata, uint32 len);
+typedef uint8 TwoPhaseRmgrId;
+
+/*
+ * Built-in resource managers
+ */
+#define TWOPHASE_RM_END_ID 0
+#define TWOPHASE_RM_LOCK_ID 1
+#define TWOPHASE_RM_PGSTAT_ID 2
+#define TWOPHASE_RM_MULTIXACT_ID 3
+#define TWOPHASE_RM_PREDICATELOCK_ID 4
+#define TWOPHASE_RM_MAX_ID TWOPHASE_RM_PREDICATELOCK_ID
+
+extern const TwoPhaseCallback twophase_recover_callbacks[];
+extern const TwoPhaseCallback twophase_postcommit_callbacks[];
+extern const TwoPhaseCallback twophase_postabort_callbacks[];
+extern const TwoPhaseCallback twophase_standby_recover_callbacks[];
+
+
+extern void RegisterTwoPhaseRecord(TwoPhaseRmgrId rmid, uint16 info,
+ const void *data, uint32 len);
+
+#endif /* TWOPHASE_RMGR_H */
diff --git a/src/include/access/valid.h b/src/include/access/valid.h
new file mode 100644
index 0000000..a462113
--- /dev/null
+++ b/src/include/access/valid.h
@@ -0,0 +1,69 @@
+/*-------------------------------------------------------------------------
+ *
+ * valid.h
+ * POSTGRES tuple qualification validity definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/valid.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef VALID_H
+#define VALID_H
+
+/*
+ * HeapKeyTest
+ *
+ * Test a heap tuple to see if it satisfies a scan key.
+ */
+#define HeapKeyTest(tuple, \
+ tupdesc, \
+ nkeys, \
+ keys, \
+ result) \
+do \
+{ \
+ /* Use underscores to protect the variables passed in as parameters */ \
+ int __cur_nkeys = (nkeys); \
+ ScanKey __cur_keys = (keys); \
+ \
+ (result) = true; /* may change */ \
+ for (; __cur_nkeys--; __cur_keys++) \
+ { \
+ Datum __atp; \
+ bool __isnull; \
+ Datum __test; \
+ \
+ if (__cur_keys->sk_flags & SK_ISNULL) \
+ { \
+ (result) = false; \
+ break; \
+ } \
+ \
+ __atp = heap_getattr((tuple), \
+ __cur_keys->sk_attno, \
+ (tupdesc), \
+ &__isnull); \
+ \
+ if (__isnull) \
+ { \
+ (result) = false; \
+ break; \
+ } \
+ \
+ __test = FunctionCall2Coll(&__cur_keys->sk_func, \
+ __cur_keys->sk_collation, \
+ __atp, __cur_keys->sk_argument); \
+ \
+ if (!DatumGetBool(__test)) \
+ { \
+ (result) = false; \
+ break; \
+ } \
+ } \
+} while (0)
+
+#endif /* VALID_H */
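+
+/*
+ * Illustrative sketch appended to this diff (not part of the header): typical
+ * use of HeapKeyTest() from a scan loop.  The caller is assumed to have the
+ * usual backend includes (postgres.h, access/htup_details.h for heap_getattr,
+ * access/skey.h for ScanKey); the wrapper name is hypothetical.
+ */
+#ifdef VALID_H_EXAMPLE
+static bool
+demo_tuple_matches_keys(HeapTuple tuple, TupleDesc tupdesc,
+						int nkeys, ScanKey keys)
+{
+	bool		valid;
+
+	/* HeapKeyTest assigns its verdict into the last argument */
+	HeapKeyTest(tuple, tupdesc, nkeys, keys, valid);
+	return valid;
+}
+#endif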
diff --git a/src/include/access/visibilitymap.h b/src/include/access/visibilitymap.h
new file mode 100644
index 0000000..0981b21
--- /dev/null
+++ b/src/include/access/visibilitymap.h
@@ -0,0 +1,42 @@
+/*-------------------------------------------------------------------------
+ *
+ * visibilitymap.h
+ * visibility map interface
+ *
+ *
+ * Portions Copyright (c) 2007-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/visibilitymap.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef VISIBILITYMAP_H
+#define VISIBILITYMAP_H
+
+#include "access/visibilitymapdefs.h"
+#include "access/xlogdefs.h"
+#include "storage/block.h"
+#include "storage/buf.h"
+#include "utils/relcache.h"
+
+/* Macros for visibilitymap test */
+#define VM_ALL_VISIBLE(r, b, v) \
+ ((visibilitymap_get_status((r), (b), (v)) & VISIBILITYMAP_ALL_VISIBLE) != 0)
+#define VM_ALL_FROZEN(r, b, v) \
+ ((visibilitymap_get_status((r), (b), (v)) & VISIBILITYMAP_ALL_FROZEN) != 0)
+
+extern bool visibilitymap_clear(Relation rel, BlockNumber heapBlk,
+ Buffer vmbuf, uint8 flags);
+extern void visibilitymap_pin(Relation rel, BlockNumber heapBlk,
+ Buffer *vmbuf);
+extern bool visibilitymap_pin_ok(BlockNumber heapBlk, Buffer vmbuf);
+extern void visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
+ XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid,
+ uint8 flags);
+extern uint8 visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf);
+extern void visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_frozen);
+extern BlockNumber visibilitymap_prepare_truncate(Relation rel,
+ BlockNumber nheapblocks);
+
+#endif /* VISIBILITYMAP_H */
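+
+/*
+ * Illustrative sketch appended to this diff (not part of the header): testing
+ * whether a heap block is marked all-visible.  ReleaseBuffer()/BufferIsValid()
+ * come from storage/bufmgr.h; the function name is hypothetical.
+ */
+#ifdef VISIBILITYMAP_EXAMPLE
+#include "storage/bufmgr.h"
+
+static bool
+demo_block_is_all_visible(Relation rel, BlockNumber blkno)
+{
+	Buffer		vmbuffer = InvalidBuffer;
+	bool		all_visible;
+
+	/* make sure the right visibility-map page is pinned, then test the bit */
+	visibilitymap_pin(rel, blkno, &vmbuffer);
+	all_visible = VM_ALL_VISIBLE(rel, blkno, &vmbuffer);
+
+	if (BufferIsValid(vmbuffer))
+		ReleaseBuffer(vmbuffer);
+	return all_visible;
+}
+#endif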
diff --git a/src/include/access/visibilitymapdefs.h b/src/include/access/visibilitymapdefs.h
new file mode 100644
index 0000000..58be5a4
--- /dev/null
+++ b/src/include/access/visibilitymapdefs.h
@@ -0,0 +1,25 @@
+/*-------------------------------------------------------------------------
+ *
+ * visibilitymapdefs.h
+ * macros for accessing contents of visibility map pages
+ *
+ *
+ * Copyright (c) 2021, PostgreSQL Global Development Group
+ *
+ * src/include/access/visibilitymapdefs.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef VISIBILITYMAPDEFS_H
+#define VISIBILITYMAPDEFS_H
+
+/* Number of bits for one heap page */
+#define BITS_PER_HEAPBLOCK 2
+
+/* Flags for bit map */
+#define VISIBILITYMAP_ALL_VISIBLE 0x01
+#define VISIBILITYMAP_ALL_FROZEN 0x02
+#define VISIBILITYMAP_VALID_BITS 0x03 /* OR of all valid visibilitymap
+ * flags bits */
+
+#endif /* VISIBILITYMAPDEFS_H */
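+
+/*
+ * Illustrative sketch appended to this diff (not part of the header): with
+ * BITS_PER_HEAPBLOCK = 2, one visibility-map byte covers 8/2 = 4 heap blocks.
+ * The macros below only mirror the arithmetic that visibilitymap.c uses
+ * internally (the real lookup macros live there, not in this header);
+ * BITS_PER_BYTE comes from c.h.
+ */
+#ifdef VISIBILITYMAPDEFS_EXAMPLE
+#define DEMO_HEAPBLOCKS_PER_BYTE	(BITS_PER_BYTE / BITS_PER_HEAPBLOCK)	/* 4 */
+#define DEMO_MAPBYTE(blkno)		((blkno) / DEMO_HEAPBLOCKS_PER_BYTE)
+#define DEMO_MAPBIT(blkno)		(BITS_PER_HEAPBLOCK * ((blkno) % DEMO_HEAPBLOCKS_PER_BYTE))
+/* e.g. heap block 10 -> map byte 2, starting at bit 4 within that byte */
+#endif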
diff --git a/src/include/access/xact.h b/src/include/access/xact.h
new file mode 100644
index 0000000..c538758
--- /dev/null
+++ b/src/include/access/xact.h
@@ -0,0 +1,476 @@
+/*-------------------------------------------------------------------------
+ *
+ * xact.h
+ * postgres transaction system definitions
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/xact.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef XACT_H
+#define XACT_H
+
+#include "access/transam.h"
+#include "access/xlogreader.h"
+#include "datatype/timestamp.h"
+#include "lib/stringinfo.h"
+#include "nodes/pg_list.h"
+#include "storage/relfilenode.h"
+#include "storage/sinval.h"
+
+/*
+ * Maximum size of Global Transaction ID (including '\0').
+ *
+ * Note that the max value of GIDSIZE must fit in the uint16 gidlen,
+ * specified in TwoPhaseFileHeader.
+ */
+#define GIDSIZE 200
+
+/*
+ * Xact isolation levels
+ */
+#define XACT_READ_UNCOMMITTED 0
+#define XACT_READ_COMMITTED 1
+#define XACT_REPEATABLE_READ 2
+#define XACT_SERIALIZABLE 3
+
+extern int DefaultXactIsoLevel;
+extern PGDLLIMPORT int XactIsoLevel;
+
+/*
+ * We implement three isolation levels internally.
+ * The two stronger ones use one snapshot per database transaction;
+ * the others use one snapshot per statement.
+ * Serializable uses predicate locks in addition to snapshots.
+ * These macros should be used to check which isolation level is selected.
+ */
+#define IsolationUsesXactSnapshot() (XactIsoLevel >= XACT_REPEATABLE_READ)
+#define IsolationIsSerializable() (XactIsoLevel == XACT_SERIALIZABLE)
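+
+/*
+ * Illustrative sketch (not part of this header): code that must behave
+ * differently per isolation level branches on the macros above rather than
+ * comparing XactIsoLevel directly; the function name is hypothetical.
+ */
+#ifdef XACT_ISOLATION_EXAMPLE
+static void
+demo_isolation_behavior(void)
+{
+	if (IsolationUsesXactSnapshot())
+	{
+		/* REPEATABLE READ or SERIALIZABLE: one snapshot for the whole xact */
+	}
+	else
+	{
+		/* READ COMMITTED: each statement may take a fresh snapshot */
+	}
+
+	if (IsolationIsSerializable())
+	{
+		/* SERIALIZABLE additionally uses predicate locks (SSI) */
+	}
+}
+#endif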
+
+/* Xact read-only state */
+extern bool DefaultXactReadOnly;
+extern bool XactReadOnly;
+
+/* flag for logging statements in this transaction */
+extern bool xact_is_sampled;
+
+/*
+ * Xact is deferrable -- only meaningful (currently) for read only
+ * SERIALIZABLE transactions
+ */
+extern bool DefaultXactDeferrable;
+extern bool XactDeferrable;
+
+typedef enum
+{
+ SYNCHRONOUS_COMMIT_OFF, /* asynchronous commit */
+ SYNCHRONOUS_COMMIT_LOCAL_FLUSH, /* wait for local flush only */
+ SYNCHRONOUS_COMMIT_REMOTE_WRITE, /* wait for local flush and remote
+ * write */
+ SYNCHRONOUS_COMMIT_REMOTE_FLUSH, /* wait for local and remote flush */
+ SYNCHRONOUS_COMMIT_REMOTE_APPLY /* wait for local and remote flush and
+ * remote apply */
+} SyncCommitLevel;
+
+/* Define the default setting for synchronous_commit */
+#define SYNCHRONOUS_COMMIT_ON SYNCHRONOUS_COMMIT_REMOTE_FLUSH
+
+/* Synchronous commit level */
+extern int synchronous_commit;
+
+/* used during logical streaming of a transaction */
+extern PGDLLIMPORT TransactionId CheckXidAlive;
+extern PGDLLIMPORT bool bsysscan;
+
+/*
+ * Miscellaneous flag bits to record events which occur on the top level
+ * transaction. These flags are only persisted in MyXactFlags and serve as
+ * reminders to do certain things later in the transaction. The variable is
+ * globally accessible, so it can be set from anywhere in the code that needs
+ * to record such a flag.
+ */
+extern int MyXactFlags;
+
+/*
+ * XACT_FLAGS_ACCESSEDTEMPNAMESPACE - set when a temporary object is accessed.
+ * We don't allow PREPARE TRANSACTION in that case.
+ */
+#define XACT_FLAGS_ACCESSEDTEMPNAMESPACE (1U << 0)
+
+/*
+ * XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK - records whether the top level xact
+ * logged any Access Exclusive Locks.
+ */
+#define XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK (1U << 1)
+
+/*
+ * XACT_FLAGS_NEEDIMMEDIATECOMMIT - records whether the top level statement
+ * is one that requires immediate commit, such as CREATE DATABASE.
+ */
+#define XACT_FLAGS_NEEDIMMEDIATECOMMIT (1U << 2)
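+
+/*
+ * Illustrative sketch (not part of this header): code that touches a
+ * temporary object records that fact in MyXactFlags, so that a later
+ * PREPARE TRANSACTION can be rejected; the function name is hypothetical.
+ */
+#ifdef XACT_FLAGS_EXAMPLE
+static void
+demo_note_temp_namespace_access(void)
+{
+	MyXactFlags |= XACT_FLAGS_ACCESSEDTEMPNAMESPACE;
+}
+#endif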
+
+/*
+ * start- and end-of-transaction callbacks for dynamically loaded modules
+ */
+typedef enum
+{
+ XACT_EVENT_COMMIT,
+ XACT_EVENT_PARALLEL_COMMIT,
+ XACT_EVENT_ABORT,
+ XACT_EVENT_PARALLEL_ABORT,
+ XACT_EVENT_PREPARE,
+ XACT_EVENT_PRE_COMMIT,
+ XACT_EVENT_PARALLEL_PRE_COMMIT,
+ XACT_EVENT_PRE_PREPARE
+} XactEvent;
+
+typedef void (*XactCallback) (XactEvent event, void *arg);
+
+typedef enum
+{
+ SUBXACT_EVENT_START_SUB,
+ SUBXACT_EVENT_COMMIT_SUB,
+ SUBXACT_EVENT_ABORT_SUB,
+ SUBXACT_EVENT_PRE_COMMIT_SUB
+} SubXactEvent;
+
+typedef void (*SubXactCallback) (SubXactEvent event, SubTransactionId mySubid,
+ SubTransactionId parentSubid, void *arg);
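+
+/*
+ * Illustrative sketch (not part of this header): a loadable module hooking
+ * transaction end via RegisterXactCallback(), which is declared further down
+ * in this file.  The callback body and _PG_init() wiring follow the usual
+ * extension conventions; the names are hypothetical.
+ */
+#ifdef XACT_CALLBACK_EXAMPLE
+static void
+demo_xact_callback(XactEvent event, void *arg)
+{
+	switch (event)
+	{
+		case XACT_EVENT_PRE_COMMIT:
+			/* flush module-private state while we can still ERROR out */
+			break;
+		case XACT_EVENT_ABORT:
+			/* discard module-private state */
+			break;
+		default:
+			break;
+	}
+}
+
+void
+_PG_init(void)
+{
+	RegisterXactCallback(demo_xact_callback, NULL);
+}
+#endif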
+
+
+/* ----------------
+ * transaction-related XLOG entries
+ * ----------------
+ */
+
+/*
+ * XLOG allows some information to be stored in the high 4 bits of the log
+ * record's xl_info field.  We use three of those bits for the opcode, and one
+ * to indicate whether an optional xinfo field is present.
+ */
+#define XLOG_XACT_COMMIT 0x00
+#define XLOG_XACT_PREPARE 0x10
+#define XLOG_XACT_ABORT 0x20
+#define XLOG_XACT_COMMIT_PREPARED 0x30
+#define XLOG_XACT_ABORT_PREPARED 0x40
+#define XLOG_XACT_ASSIGNMENT 0x50
+#define XLOG_XACT_INVALIDATIONS 0x60
+/* free opcode 0x70 */
+
+/* mask for filtering opcodes out of xl_info */
+#define XLOG_XACT_OPMASK 0x70
+
+/* does this record have a 'xinfo' field or not */
+#define XLOG_XACT_HAS_INFO 0x80
+
+/*
+ * The following flags, stored in xinfo, determine which information is
+ * contained in commit/abort records.
+ */
+#define XACT_XINFO_HAS_DBINFO (1U << 0)
+#define XACT_XINFO_HAS_SUBXACTS (1U << 1)
+#define XACT_XINFO_HAS_RELFILENODES (1U << 2)
+#define XACT_XINFO_HAS_INVALS (1U << 3)
+#define XACT_XINFO_HAS_TWOPHASE (1U << 4)
+#define XACT_XINFO_HAS_ORIGIN (1U << 5)
+#define XACT_XINFO_HAS_AE_LOCKS (1U << 6)
+#define XACT_XINFO_HAS_GID (1U << 7)
+
+/*
+ * Also stored in xinfo, these indicate a variety of additional actions that
+ * need to occur when emulating transaction effects during recovery.
+ *
+ * They are named XactCompletion... to differentiate them from
+ * EOXact... routines which run at the end of the original transaction
+ * completion.
+ */
+#define XACT_COMPLETION_APPLY_FEEDBACK (1U << 29)
+#define XACT_COMPLETION_UPDATE_RELCACHE_FILE (1U << 30)
+#define XACT_COMPLETION_FORCE_SYNC_COMMIT (1U << 31)
+
+/* Access macros for above flags */
+#define XactCompletionApplyFeedback(xinfo) \
+ ((xinfo & XACT_COMPLETION_APPLY_FEEDBACK) != 0)
+#define XactCompletionRelcacheInitFileInval(xinfo) \
+ ((xinfo & XACT_COMPLETION_UPDATE_RELCACHE_FILE) != 0)
+#define XactCompletionForceSyncCommit(xinfo) \
+ ((xinfo & XACT_COMPLETION_FORCE_SYNC_COMMIT) != 0)
+
+typedef struct xl_xact_assignment
+{
+ TransactionId xtop; /* assigned XID's top-level XID */
+ int nsubxacts; /* number of subtransaction XIDs */
+ TransactionId xsub[FLEXIBLE_ARRAY_MEMBER]; /* assigned subxids */
+} xl_xact_assignment;
+
+#define MinSizeOfXactAssignment offsetof(xl_xact_assignment, xsub)
+
+/*
+ * Commit and abort records can contain a lot of information. But a large
+ * portion of the records won't need all possible pieces of information. So we
+ * only include what's needed.
+ *
+ * A minimal commit/abort record only consists of a xl_xact_commit/abort
+ * struct. The presence of additional information is indicated by bits set in
+ * 'xl_xact_xinfo->xinfo'. The presence of the xinfo field itself is signaled
+ * by a set XLOG_XACT_HAS_INFO bit in the xl_info field.
+ *
+ * NB: All the individual data chunks should be sized to multiples of
+ * sizeof(int) and only require int32 alignment. If they require bigger
+ * alignment, they need to be copied upon reading.
+ */
+
+/* sub-records for commit/abort */
+
+typedef struct xl_xact_xinfo
+{
+ /*
+ * Even though we currently require only 1 byte of space in xinfo, we use
+ * four so that following records don't have to care about alignment. Commit
+ * records can be large, so copying large portions isn't attractive.
+ */
+ uint32 xinfo;
+} xl_xact_xinfo;
+
+typedef struct xl_xact_dbinfo
+{
+ Oid dbId; /* MyDatabaseId */
+ Oid tsId; /* MyDatabaseTableSpace */
+} xl_xact_dbinfo;
+
+typedef struct xl_xact_subxacts
+{
+ int nsubxacts; /* number of subtransaction XIDs */
+ TransactionId subxacts[FLEXIBLE_ARRAY_MEMBER];
+} xl_xact_subxacts;
+#define MinSizeOfXactSubxacts offsetof(xl_xact_subxacts, subxacts)
+
+typedef struct xl_xact_relfilenodes
+{
+ int nrels; /* number of relations */
+ RelFileNode xnodes[FLEXIBLE_ARRAY_MEMBER];
+} xl_xact_relfilenodes;
+#define MinSizeOfXactRelfilenodes offsetof(xl_xact_relfilenodes, xnodes)
+
+typedef struct xl_xact_invals
+{
+ int nmsgs; /* number of shared inval msgs */
+ SharedInvalidationMessage msgs[FLEXIBLE_ARRAY_MEMBER];
+} xl_xact_invals;
+#define MinSizeOfXactInvals offsetof(xl_xact_invals, msgs)
+
+typedef struct xl_xact_twophase
+{
+ TransactionId xid;
+} xl_xact_twophase;
+
+typedef struct xl_xact_origin
+{
+ XLogRecPtr origin_lsn;
+ TimestampTz origin_timestamp;
+} xl_xact_origin;
+
+typedef struct xl_xact_commit
+{
+ TimestampTz xact_time; /* time of commit */
+
+ /* xl_xact_xinfo follows if XLOG_XACT_HAS_INFO */
+ /* xl_xact_dbinfo follows if XINFO_HAS_DBINFO */
+ /* xl_xact_subxacts follows if XINFO_HAS_SUBXACT */
+ /* xl_xact_relfilenodes follows if XINFO_HAS_RELFILENODES */
+ /* xl_xact_invals follows if XINFO_HAS_INVALS */
+ /* xl_xact_twophase follows if XINFO_HAS_TWOPHASE */
+ /* twophase_gid follows if XINFO_HAS_GID. As a null-terminated string. */
+ /* xl_xact_origin follows if XINFO_HAS_ORIGIN, stored unaligned! */
+} xl_xact_commit;
+#define MinSizeOfXactCommit (offsetof(xl_xact_commit, xact_time) + sizeof(TimestampTz))
+
+typedef struct xl_xact_abort
+{
+ TimestampTz xact_time; /* time of abort */
+
+ /* xl_xact_xinfo follows if XLOG_XACT_HAS_INFO */
+ /* xl_xact_dbinfo follows if XINFO_HAS_DBINFO */
+ /* xl_xact_subxacts follows if XINFO_HAS_SUBXACT */
+ /* xl_xact_relfilenodes follows if XINFO_HAS_RELFILENODES */
+ /* No invalidation messages needed. */
+ /* xl_xact_twophase follows if XINFO_HAS_TWOPHASE */
+ /* twophase_gid follows if XINFO_HAS_GID. As a null-terminated string. */
+ /* xl_xact_origin follows if XINFO_HAS_ORIGIN, stored unaligned! */
+} xl_xact_abort;
+#define MinSizeOfXactAbort sizeof(xl_xact_abort)
+
+typedef struct xl_xact_prepare
+{
+ uint32 magic; /* format identifier */
+ uint32 total_len; /* actual file length */
+ TransactionId xid; /* original transaction XID */
+ Oid database; /* OID of database it was in */
+ TimestampTz prepared_at; /* time of preparation */
+ Oid owner; /* user running the transaction */
+ int32 nsubxacts; /* number of following subxact XIDs */
+ int32 ncommitrels; /* number of delete-on-commit rels */
+ int32 nabortrels; /* number of delete-on-abort rels */
+ int32 ninvalmsgs; /* number of cache invalidation messages */
+ bool initfileinval; /* does relcache init file need invalidation? */
+ uint16 gidlen; /* length of the GID - GID follows the header */
+ XLogRecPtr origin_lsn; /* lsn of this record at origin node */
+ TimestampTz origin_timestamp; /* time of prepare at origin node */
+} xl_xact_prepare;
+
+/*
+ * Commit/Abort records in the above form are a bit verbose to parse, so
+ * there are deconstructed versions generated by ParseCommit/AbortRecord() for
+ * easier consumption.
+ */
+typedef struct xl_xact_parsed_commit
+{
+ TimestampTz xact_time;
+ uint32 xinfo;
+
+ Oid dbId; /* MyDatabaseId */
+ Oid tsId; /* MyDatabaseTableSpace */
+
+ int nsubxacts;
+ TransactionId *subxacts;
+
+ int nrels;
+ RelFileNode *xnodes;
+
+ int nmsgs;
+ SharedInvalidationMessage *msgs;
+
+ TransactionId twophase_xid; /* only for 2PC */
+ char twophase_gid[GIDSIZE]; /* only for 2PC */
+ int nabortrels; /* only for 2PC */
+ RelFileNode *abortnodes; /* only for 2PC */
+
+ XLogRecPtr origin_lsn;
+ TimestampTz origin_timestamp;
+} xl_xact_parsed_commit;
+
+typedef xl_xact_parsed_commit xl_xact_parsed_prepare;
+
+typedef struct xl_xact_parsed_abort
+{
+ TimestampTz xact_time;
+ uint32 xinfo;
+
+ Oid dbId; /* MyDatabaseId */
+ Oid tsId; /* MyDatabaseTableSpace */
+
+ int nsubxacts;
+ TransactionId *subxacts;
+
+ int nrels;
+ RelFileNode *xnodes;
+
+ TransactionId twophase_xid; /* only for 2PC */
+ char twophase_gid[GIDSIZE]; /* only for 2PC */
+
+ XLogRecPtr origin_lsn;
+ TimestampTz origin_timestamp;
+} xl_xact_parsed_abort;
+
+
+/* ----------------
+ * extern definitions
+ * ----------------
+ */
+extern bool IsTransactionState(void);
+extern bool IsAbortedTransactionBlockState(void);
+extern TransactionId GetTopTransactionId(void);
+extern TransactionId GetTopTransactionIdIfAny(void);
+extern TransactionId GetCurrentTransactionId(void);
+extern TransactionId GetCurrentTransactionIdIfAny(void);
+extern TransactionId GetStableLatestTransactionId(void);
+extern SubTransactionId GetCurrentSubTransactionId(void);
+extern FullTransactionId GetTopFullTransactionId(void);
+extern FullTransactionId GetTopFullTransactionIdIfAny(void);
+extern FullTransactionId GetCurrentFullTransactionId(void);
+extern FullTransactionId GetCurrentFullTransactionIdIfAny(void);
+extern void MarkCurrentTransactionIdLoggedIfAny(void);
+extern bool SubTransactionIsActive(SubTransactionId subxid);
+extern CommandId GetCurrentCommandId(bool used);
+extern void SetParallelStartTimestamps(TimestampTz xact_ts, TimestampTz stmt_ts);
+extern TimestampTz GetCurrentTransactionStartTimestamp(void);
+extern TimestampTz GetCurrentStatementStartTimestamp(void);
+extern TimestampTz GetCurrentTransactionStopTimestamp(void);
+extern void SetCurrentStatementStartTimestamp(void);
+extern int GetCurrentTransactionNestLevel(void);
+extern bool TransactionIdIsCurrentTransactionId(TransactionId xid);
+extern void CommandCounterIncrement(void);
+extern void ForceSyncCommit(void);
+extern void StartTransactionCommand(void);
+extern void SaveTransactionCharacteristics(void);
+extern void RestoreTransactionCharacteristics(void);
+extern void CommitTransactionCommand(void);
+extern void AbortCurrentTransaction(void);
+extern void BeginTransactionBlock(void);
+extern bool EndTransactionBlock(bool chain);
+extern bool PrepareTransactionBlock(const char *gid);
+extern void UserAbortTransactionBlock(bool chain);
+extern void BeginImplicitTransactionBlock(void);
+extern void EndImplicitTransactionBlock(void);
+extern void ReleaseSavepoint(const char *name);
+extern void DefineSavepoint(const char *name);
+extern void RollbackToSavepoint(const char *name);
+extern void BeginInternalSubTransaction(const char *name);
+extern void ReleaseCurrentSubTransaction(void);
+extern void RollbackAndReleaseCurrentSubTransaction(void);
+extern bool IsSubTransaction(void);
+extern Size EstimateTransactionStateSpace(void);
+extern void SerializeTransactionState(Size maxsize, char *start_address);
+extern void StartParallelWorkerTransaction(char *tstatespace);
+extern void EndParallelWorkerTransaction(void);
+extern bool IsTransactionBlock(void);
+extern bool IsTransactionOrTransactionBlock(void);
+extern char TransactionBlockStatusCode(void);
+extern void AbortOutOfAnyTransaction(void);
+extern void PreventInTransactionBlock(bool isTopLevel, const char *stmtType);
+extern void RequireTransactionBlock(bool isTopLevel, const char *stmtType);
+extern void WarnNoTransactionBlock(bool isTopLevel, const char *stmtType);
+extern bool IsInTransactionBlock(bool isTopLevel);
+extern void RegisterXactCallback(XactCallback callback, void *arg);
+extern void UnregisterXactCallback(XactCallback callback, void *arg);
+extern void RegisterSubXactCallback(SubXactCallback callback, void *arg);
+extern void UnregisterSubXactCallback(SubXactCallback callback, void *arg);
+
+extern bool IsSubTransactionAssignmentPending(void);
+extern void MarkSubTransactionAssigned(void);
+
+extern int xactGetCommittedChildren(TransactionId **ptr);
+
+extern XLogRecPtr XactLogCommitRecord(TimestampTz commit_time,
+ int nsubxacts, TransactionId *subxacts,
+ int nrels, RelFileNode *rels,
+ int nmsgs, SharedInvalidationMessage *msgs,
+ bool relcacheInval,
+ int xactflags,
+ TransactionId twophase_xid,
+ const char *twophase_gid);
+
+extern XLogRecPtr XactLogAbortRecord(TimestampTz abort_time,
+ int nsubxacts, TransactionId *subxacts,
+ int nrels, RelFileNode *rels,
+ int xactflags, TransactionId twophase_xid,
+ const char *twophase_gid);
+extern void xact_redo(XLogReaderState *record);
+
+/* xactdesc.c */
+extern void xact_desc(StringInfo buf, XLogReaderState *record);
+extern const char *xact_identify(uint8 info);
+
+/* also in xactdesc.c, so they can be shared between front/backend code */
+extern void ParseCommitRecord(uint8 info, xl_xact_commit *xlrec, xl_xact_parsed_commit *parsed);
+extern void ParseAbortRecord(uint8 info, xl_xact_abort *xlrec, xl_xact_parsed_abort *parsed);
+extern void ParsePrepareRecord(uint8 info, xl_xact_prepare *xlrec, xl_xact_parsed_prepare *parsed);
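+
+/*
+ * Illustrative sketch (not part of this header): how redo/describe code
+ * typically consumes a commit record through the deconstructed form.
+ * XLogRecGetInfo()/XLogRecGetData() are the record-accessor macros from
+ * xlogreader.h; the function name is hypothetical.
+ */
+#ifdef XACT_PARSE_EXAMPLE
+static void
+demo_handle_commit(XLogReaderState *record)
+{
+	uint8		info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
+
+	if (info == XLOG_XACT_COMMIT || info == XLOG_XACT_COMMIT_PREPARED)
+	{
+		xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
+		xl_xact_parsed_commit parsed;
+
+		ParseCommitRecord(XLogRecGetInfo(record), xlrec, &parsed);
+
+		/* parsed.nsubxacts, parsed.xnodes, parsed.nmsgs, ... are now usable */
+		if (parsed.xinfo & XACT_XINFO_HAS_TWOPHASE)
+		{
+			/* parsed.twophase_xid / parsed.twophase_gid identify the 2PC xact */
+		}
+	}
+}
+#endif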
+
+extern void EnterParallelMode(void);
+extern void ExitParallelMode(void);
+extern bool IsInParallelMode(void);
+
+#endif /* XACT_H */
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
new file mode 100644
index 0000000..ee3e369
--- /dev/null
+++ b/src/include/access/xlog.h
@@ -0,0 +1,406 @@
+/*
+ * xlog.h
+ *
+ * PostgreSQL write-ahead log manager
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/xlog.h
+ */
+#ifndef XLOG_H
+#define XLOG_H
+
+#include "access/rmgr.h"
+#include "access/xlogdefs.h"
+#include "access/xloginsert.h"
+#include "access/xlogreader.h"
+#include "datatype/timestamp.h"
+#include "lib/stringinfo.h"
+#include "nodes/pg_list.h"
+#include "storage/fd.h"
+
+
+/* Sync methods */
+#define SYNC_METHOD_FSYNC 0
+#define SYNC_METHOD_FDATASYNC 1
+#define SYNC_METHOD_OPEN 2 /* for O_SYNC */
+#define SYNC_METHOD_FSYNC_WRITETHROUGH 3
+#define SYNC_METHOD_OPEN_DSYNC 4 /* for O_DSYNC */
+extern int sync_method;
+
+extern PGDLLIMPORT TimeLineID ThisTimeLineID; /* current TLI */
+
+/*
+ * Prior to 8.4, all activity during recovery was carried out by the startup
+ * process. This local variable continues to be used in many parts of the
+ * code to indicate actions taken by RecoveryManagers. Other processes that
+ * potentially perform work during recovery should check RecoveryInProgress().
+ * See XLogCtl notes in xlog.c.
+ */
+extern bool InRecovery;
+
+/*
+ * Like InRecovery, standbyState is only valid in the startup process.
+ * In all other processes it will have the value STANDBY_DISABLED (so
+ * InHotStandby will read as false).
+ *
+ * In DISABLED state, we're performing crash recovery or hot standby was
+ * disabled in postgresql.conf.
+ *
+ * In INITIALIZED state, we've run InitRecoveryTransactionEnvironment, but
+ * we haven't yet processed a RUNNING_XACTS or shutdown-checkpoint WAL record
+ * to initialize our primary-transaction tracking system.
+ *
+ * When the transaction tracking is initialized, we enter the SNAPSHOT_PENDING
+ * state. The tracked information might still be incomplete, so we can't allow
+ * connections yet, but redo functions must update the in-memory state when
+ * appropriate.
+ *
+ * In SNAPSHOT_READY mode, we have full knowledge of transactions that are
+ * (or were) running on the primary at the current WAL location. Snapshots
+ * can be taken, and read-only queries can be run.
+ */
+typedef enum
+{
+ STANDBY_DISABLED,
+ STANDBY_INITIALIZED,
+ STANDBY_SNAPSHOT_PENDING,
+ STANDBY_SNAPSHOT_READY
+} HotStandbyState;
+
+extern HotStandbyState standbyState;
+
+#define InHotStandby (standbyState >= STANDBY_SNAPSHOT_PENDING)
+
+/*
+ * Recovery target type.
+ * Only set during a Point in Time recovery, not when in standby mode.
+ */
+typedef enum
+{
+ RECOVERY_TARGET_UNSET,
+ RECOVERY_TARGET_XID,
+ RECOVERY_TARGET_TIME,
+ RECOVERY_TARGET_NAME,
+ RECOVERY_TARGET_LSN,
+ RECOVERY_TARGET_IMMEDIATE
+} RecoveryTargetType;
+
+/*
+ * Recovery target TimeLine goal
+ */
+typedef enum
+{
+ RECOVERY_TARGET_TIMELINE_CONTROLFILE,
+ RECOVERY_TARGET_TIMELINE_LATEST,
+ RECOVERY_TARGET_TIMELINE_NUMERIC
+} RecoveryTargetTimeLineGoal;
+
+extern XLogRecPtr ProcLastRecPtr;
+extern XLogRecPtr XactLastRecEnd;
+extern PGDLLIMPORT XLogRecPtr XactLastCommitEnd;
+
+extern bool reachedConsistency;
+
+/* these variables are GUC parameters related to XLOG */
+extern int wal_segment_size;
+extern int min_wal_size_mb;
+extern int max_wal_size_mb;
+extern int wal_keep_size_mb;
+extern int max_slot_wal_keep_size_mb;
+extern int XLOGbuffers;
+extern int XLogArchiveTimeout;
+extern int wal_retrieve_retry_interval;
+extern char *XLogArchiveCommand;
+extern bool EnableHotStandby;
+extern bool fullPageWrites;
+extern bool wal_log_hints;
+extern bool wal_compression;
+extern bool wal_init_zero;
+extern bool wal_recycle;
+extern bool *wal_consistency_checking;
+extern char *wal_consistency_checking_string;
+extern bool log_checkpoints;
+extern char *recoveryRestoreCommand;
+extern char *recoveryEndCommand;
+extern char *archiveCleanupCommand;
+extern bool recoveryTargetInclusive;
+extern int recoveryTargetAction;
+extern int recovery_min_apply_delay;
+extern char *PrimaryConnInfo;
+extern char *PrimarySlotName;
+extern bool wal_receiver_create_temp_slot;
+extern bool track_wal_io_timing;
+
+/* indirectly set via GUC system */
+extern TransactionId recoveryTargetXid;
+extern char *recovery_target_time_string;
+extern const char *recoveryTargetName;
+extern XLogRecPtr recoveryTargetLSN;
+extern RecoveryTargetType recoveryTarget;
+extern char *PromoteTriggerFile;
+extern RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal;
+extern TimeLineID recoveryTargetTLIRequested;
+extern TimeLineID recoveryTargetTLI;
+
+extern int CheckPointSegments;
+
+/* option set locally in startup process only when signal files exist */
+extern bool StandbyModeRequested;
+extern bool StandbyMode;
+
+/* Archive modes */
+typedef enum ArchiveMode
+{
+ ARCHIVE_MODE_OFF = 0, /* disabled */
+ ARCHIVE_MODE_ON, /* enabled while server is running normally */
+ ARCHIVE_MODE_ALWAYS /* enabled always (even during recovery) */
+} ArchiveMode;
+extern int XLogArchiveMode;
+
+/* WAL levels */
+typedef enum WalLevel
+{
+ WAL_LEVEL_MINIMAL = 0,
+ WAL_LEVEL_REPLICA,
+ WAL_LEVEL_LOGICAL
+} WalLevel;
+
+/* Recovery states */
+typedef enum RecoveryState
+{
+ RECOVERY_STATE_CRASH = 0, /* crash recovery */
+ RECOVERY_STATE_ARCHIVE, /* archive recovery */
+ RECOVERY_STATE_DONE /* currently in production */
+} RecoveryState;
+
+/* Recovery pause states */
+typedef enum RecoveryPauseState
+{
+ RECOVERY_NOT_PAUSED, /* pause not requested */
+ RECOVERY_PAUSE_REQUESTED, /* pause requested, but not yet paused */
+ RECOVERY_PAUSED /* recovery is paused */
+} RecoveryPauseState;
+
+extern PGDLLIMPORT int wal_level;
+
+/* Is WAL archiving enabled (always or only while server is running normally)? */
+#define XLogArchivingActive() \
+ (AssertMacro(XLogArchiveMode == ARCHIVE_MODE_OFF || wal_level >= WAL_LEVEL_REPLICA), XLogArchiveMode > ARCHIVE_MODE_OFF)
+/* Is WAL archiving enabled always (even during recovery)? */
+#define XLogArchivingAlways() \
+ (AssertMacro(XLogArchiveMode == ARCHIVE_MODE_OFF || wal_level >= WAL_LEVEL_REPLICA), XLogArchiveMode == ARCHIVE_MODE_ALWAYS)
+#define XLogArchiveCommandSet() (XLogArchiveCommand[0] != '\0')
+
+/*
+ * Is WAL-logging necessary for archival or log-shipping, or can we skip
+ * WAL-logging if we fsync() the data before committing instead?
+ */
+#define XLogIsNeeded() (wal_level >= WAL_LEVEL_REPLICA)
+
+/*
+ * Is a full-page image needed for hint bit updates?
+ *
+ * Normally, we don't WAL-log hint bit updates, but if checksums are enabled,
+ * we have to protect them against torn page writes. When you only set
+ * individual bits on a page, it's still consistent no matter what combination
+ * of the bits makes it to disk, but the checksum wouldn't match. Also WAL-log
+ * them if forced by wal_log_hints=on.
+ */
+#define XLogHintBitIsNeeded() (DataChecksumsEnabled() || wal_log_hints)
+
+/* Do we need to WAL-log information required only for Hot Standby and logical replication? */
+#define XLogStandbyInfoActive() (wal_level >= WAL_LEVEL_REPLICA)
+
+/* Do we need to WAL-log information required only for logical replication? */
+#define XLogLogicalInfoActive() (wal_level >= WAL_LEVEL_LOGICAL)
+
+#ifdef WAL_DEBUG
+extern bool XLOG_DEBUG;
+#endif
+
+/*
+ * OR-able request flag bits for checkpoints. The "cause" bits are used only
+ * for logging purposes. Note: the flags must be defined so that it's
+ * sensible to OR together request flags arising from different requestors.
+ */
+
+/* These directly affect the behavior of CreateCheckPoint and subsidiaries */
+#define CHECKPOINT_IS_SHUTDOWN 0x0001 /* Checkpoint is for shutdown */
+#define CHECKPOINT_END_OF_RECOVERY 0x0002 /* Like shutdown checkpoint, but
+ * issued at end of WAL recovery */
+#define CHECKPOINT_IMMEDIATE 0x0004 /* Do it without delays */
+#define CHECKPOINT_FORCE 0x0008 /* Force even if no activity */
+#define CHECKPOINT_FLUSH_ALL 0x0010 /* Flush all pages, including those
+ * belonging to unlogged tables */
+/* These are important to RequestCheckpoint */
+#define CHECKPOINT_WAIT 0x0020 /* Wait for completion */
+#define CHECKPOINT_REQUESTED 0x0040 /* Checkpoint request has been made */
+/* These indicate the cause of a checkpoint request */
+#define CHECKPOINT_CAUSE_XLOG 0x0080 /* XLOG consumption */
+#define CHECKPOINT_CAUSE_TIME 0x0100 /* Elapsed time */
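+
+/*
+ * Illustrative sketch (not part of this header): the flag bits are meant to
+ * be OR-ed together by different requestors.  RequestCheckpoint() itself is
+ * declared in postmaster/bgwriter.h, not here; the function name below is
+ * hypothetical.
+ */
+#ifdef CHECKPOINT_FLAGS_EXAMPLE
+#include "postmaster/bgwriter.h"
+
+static void
+demo_force_immediate_checkpoint(void)
+{
+	/* roughly what the SQL CHECKPOINT command asks for */
+	RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT);
+}
+#endif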
+
+/*
+ * Flag bits for the record being inserted, set using XLogSetRecordFlags().
+ */
+#define XLOG_INCLUDE_ORIGIN 0x01 /* include the replication origin */
+#define XLOG_MARK_UNIMPORTANT 0x02 /* record not important for durability */
+#define XLOG_INCLUDE_XID 0x04 /* WAL-internal message-passing hack */
+
+
+/* Checkpoint statistics */
+typedef struct CheckpointStatsData
+{
+ TimestampTz ckpt_start_t; /* start of checkpoint */
+ TimestampTz ckpt_write_t; /* start of flushing buffers */
+ TimestampTz ckpt_sync_t; /* start of fsyncs */
+ TimestampTz ckpt_sync_end_t; /* end of fsyncs */
+ TimestampTz ckpt_end_t; /* end of checkpoint */
+
+ int ckpt_bufs_written; /* # of buffers written */
+
+ int ckpt_segs_added; /* # of new xlog segments created */
+ int ckpt_segs_removed; /* # of xlog segments deleted */
+ int ckpt_segs_recycled; /* # of xlog segments recycled */
+
+ int ckpt_sync_rels; /* # of relations synced */
+ uint64 ckpt_longest_sync; /* Longest sync for one relation */
+ uint64 ckpt_agg_sync_time; /* The sum of all the individual sync
+ * times, which is not necessarily the
+ * same as the total elapsed time for the
+ * entire sync phase. */
+} CheckpointStatsData;
+
+extern CheckpointStatsData CheckpointStats;
+
+/*
+ * GetWALAvailability return codes
+ */
+typedef enum WALAvailability
+{
+ WALAVAIL_INVALID_LSN, /* parameter error */
+ WALAVAIL_RESERVED, /* WAL segment is within max_wal_size */
+ WALAVAIL_EXTENDED, /* WAL segment is reserved by a slot or
+ * wal_keep_size */
+ WALAVAIL_UNRESERVED, /* no longer reserved, but not removed yet */
+ WALAVAIL_REMOVED /* WAL segment has been removed */
+} WALAvailability;
+
+struct XLogRecData;
+
+extern XLogRecPtr XLogInsertRecord(struct XLogRecData *rdata,
+ XLogRecPtr fpw_lsn,
+ uint8 flags,
+ int num_fpi);
+extern void XLogFlush(XLogRecPtr RecPtr);
+extern bool XLogBackgroundFlush(void);
+extern bool XLogNeedsFlush(XLogRecPtr RecPtr);
+extern int XLogFileInit(XLogSegNo segno, bool *use_existent, bool use_lock);
+extern int XLogFileOpen(XLogSegNo segno);
+
+extern void CheckXLogRemoved(XLogSegNo segno, TimeLineID tli);
+extern XLogSegNo XLogGetLastRemovedSegno(void);
+extern void XLogSetAsyncXactLSN(XLogRecPtr record);
+extern void XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn);
+
+extern void xlog_redo(XLogReaderState *record);
+extern void xlog_desc(StringInfo buf, XLogReaderState *record);
+extern const char *xlog_identify(uint8 info);
+
+extern void issue_xlog_fsync(int fd, XLogSegNo segno);
+
+extern bool RecoveryInProgress(void);
+extern RecoveryState GetRecoveryState(void);
+extern bool HotStandbyActive(void);
+extern bool HotStandbyActiveInReplay(void);
+extern bool XLogInsertAllowed(void);
+extern void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream);
+extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI);
+extern XLogRecPtr GetXLogInsertRecPtr(void);
+extern XLogRecPtr GetXLogWriteRecPtr(void);
+extern RecoveryPauseState GetRecoveryPauseState(void);
+extern void SetRecoveryPause(bool recoveryPause);
+extern TimestampTz GetLatestXTime(void);
+extern TimestampTz GetCurrentChunkReplayStartTime(void);
+
+extern void UpdateControlFile(void);
+extern uint64 GetSystemIdentifier(void);
+extern char *GetMockAuthenticationNonce(void);
+extern bool DataChecksumsEnabled(void);
+extern XLogRecPtr GetFakeLSNForUnloggedRel(void);
+extern Size XLOGShmemSize(void);
+extern void XLOGShmemInit(void);
+extern void BootStrapXLOG(void);
+extern void LocalProcessControlFile(bool reset);
+extern void StartupXLOG(void);
+extern void ShutdownXLOG(int code, Datum arg);
+extern void InitXLOGAccess(void);
+extern void CreateCheckPoint(int flags);
+extern bool CreateRestartPoint(int flags);
+extern WALAvailability GetWALAvailability(XLogRecPtr targetLSN);
+extern XLogRecPtr CalculateMaxmumSafeLSN(void);
+extern void XLogPutNextOid(Oid nextOid);
+extern XLogRecPtr XLogRestorePoint(const char *rpName);
+extern void UpdateFullPageWrites(void);
+extern void GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p);
+extern XLogRecPtr GetRedoRecPtr(void);
+extern XLogRecPtr GetInsertRecPtr(void);
+extern XLogRecPtr GetFlushRecPtr(void);
+extern XLogRecPtr GetLastImportantRecPtr(void);
+extern void RemovePromoteSignalFiles(void);
+
+extern bool PromoteIsTriggered(void);
+extern bool CheckPromoteSignal(void);
+extern void WakeupRecovery(void);
+extern void SetWalWriterSleeping(bool sleeping);
+
+extern void StartupRequestWalReceiverRestart(void);
+extern void XLogRequestWalReceiverReply(void);
+
+extern void assign_max_wal_size(int newval, void *extra);
+extern void assign_checkpoint_completion_target(double newval, void *extra);
+
+/*
+ * Routines to start, stop, and get status of a base backup.
+ */
+
+/*
+ * Session-level status of base backups
+ *
+ * This is used in parallel with the shared memory status to control parallel
+ * execution of base backup functions for a given session, be it a backend
+ * dedicated to replication or a normal backend connected to a database. The
+ * update of the session-level status happens at the same time as the shared
+ * memory counters to keep a consistent global and local state of the backups
+ * running.
+ */
+typedef enum SessionBackupState
+{
+ SESSION_BACKUP_NONE,
+ SESSION_BACKUP_EXCLUSIVE,
+ SESSION_BACKUP_NON_EXCLUSIVE
+} SessionBackupState;
+
+extern XLogRecPtr do_pg_start_backup(const char *backupidstr, bool fast,
+ TimeLineID *starttli_p, StringInfo labelfile,
+ List **tablespaces, StringInfo tblspcmapfile);
+extern XLogRecPtr do_pg_stop_backup(char *labelfile, bool waitforarchive,
+ TimeLineID *stoptli_p);
+extern void do_pg_abort_backup(int code, Datum arg);
+extern void register_persistent_abort_backup_handler(void);
+extern SessionBackupState get_backup_status(void);
+
+/* File path names (all relative to $PGDATA) */
+#define RECOVERY_SIGNAL_FILE "recovery.signal"
+#define STANDBY_SIGNAL_FILE "standby.signal"
+#define BACKUP_LABEL_FILE "backup_label"
+#define BACKUP_LABEL_OLD "backup_label.old"
+
+#define TABLESPACE_MAP "tablespace_map"
+#define TABLESPACE_MAP_OLD "tablespace_map.old"
+
+/* files to signal promotion to primary */
+#define PROMOTE_SIGNAL_FILE "promote"
+
+#endif /* XLOG_H */
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
new file mode 100644
index 0000000..dcf41e9
--- /dev/null
+++ b/src/include/access/xlog_internal.h
@@ -0,0 +1,336 @@
+/*
+ * xlog_internal.h
+ *
+ * PostgreSQL write-ahead log internal declarations
+ *
+ * NOTE: this file is intended to contain declarations useful for
+ * manipulating the XLOG files directly, but it is not supposed to be
+ * needed by rmgr routines (redo support for individual record types).
+ * So the XLogRecord typedef and associated stuff appear in xlogrecord.h.
+ *
+ * Note: This file must be includable in both frontend and backend contexts,
+ * to allow stand-alone tools like pg_receivewal to deal with WAL files.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/xlog_internal.h
+ */
+#ifndef XLOG_INTERNAL_H
+#define XLOG_INTERNAL_H
+
+#include "access/xlogdefs.h"
+#include "access/xlogreader.h"
+#include "datatype/timestamp.h"
+#include "lib/stringinfo.h"
+#include "pgtime.h"
+#include "storage/block.h"
+#include "storage/relfilenode.h"
+
+
+/*
+ * Each page of XLOG file has a header like this:
+ */
+#define XLOG_PAGE_MAGIC 0xD10D /* can be used as WAL version indicator */
+
+typedef struct XLogPageHeaderData
+{
+ uint16 xlp_magic; /* magic value for correctness checks */
+ uint16 xlp_info; /* flag bits, see below */
+ TimeLineID xlp_tli; /* TimeLineID of first record on page */
+ XLogRecPtr xlp_pageaddr; /* XLOG address of this page */
+
+ /*
+ * When there is not enough space on the current page for a whole record, we
+ * continue on the next page. xlp_rem_len is the number of bytes
+ * remaining from a previous page; it tracks xl_tot_len in the initial
+ * header. Note that the continuation data isn't necessarily aligned.
+ */
+ uint32 xlp_rem_len; /* total len of remaining data for record */
+} XLogPageHeaderData;
+
+#define SizeOfXLogShortPHD MAXALIGN(sizeof(XLogPageHeaderData))
+
+typedef XLogPageHeaderData *XLogPageHeader;
+
+/*
+ * When the XLP_LONG_HEADER flag is set, we store additional fields in the
+ * page header. (This is ordinarily done just in the first page of an
+ * XLOG file.) The additional fields serve to identify the file accurately.
+ */
+typedef struct XLogLongPageHeaderData
+{
+ XLogPageHeaderData std; /* standard header fields */
+ uint64 xlp_sysid; /* system identifier from pg_control */
+ uint32 xlp_seg_size; /* just as a cross-check */
+ uint32 xlp_xlog_blcksz; /* just as a cross-check */
+} XLogLongPageHeaderData;
+
+#define SizeOfXLogLongPHD MAXALIGN(sizeof(XLogLongPageHeaderData))
+
+typedef XLogLongPageHeaderData *XLogLongPageHeader;
+
+/* When record crosses page boundary, set this flag in new page's header */
+#define XLP_FIRST_IS_CONTRECORD 0x0001
+/* This flag indicates a "long" page header */
+#define XLP_LONG_HEADER 0x0002
+/* This flag indicates backup blocks starting in this page are optional */
+#define XLP_BKP_REMOVABLE 0x0004
+/* Replaces a missing contrecord; see CreateOverwriteContrecordRecord */
+#define XLP_FIRST_IS_OVERWRITE_CONTRECORD 0x0008
+/* All defined flag bits in xlp_info (used for validity checking of header) */
+#define XLP_ALL_FLAGS 0x000F
+
+#define XLogPageHeaderSize(hdr) \
+ (((hdr)->xlp_info & XLP_LONG_HEADER) ? SizeOfXLogLongPHD : SizeOfXLogShortPHD)
+
+/* wal_segment_size can range from 1MB to 1GB */
+#define WalSegMinSize 1024 * 1024
+#define WalSegMaxSize 1024 * 1024 * 1024
+/* default number of min and max wal segments */
+#define DEFAULT_MIN_WAL_SEGS 5
+#define DEFAULT_MAX_WAL_SEGS 64
+
+/* check that the given size is a valid wal_segment_size */
+#define IsPowerOf2(x) (x > 0 && ((x) & ((x)-1)) == 0)
+#define IsValidWalSegSize(size) \
+ (IsPowerOf2(size) && \
+ ((size) >= WalSegMinSize && (size) <= WalSegMaxSize))
+
+#define XLogSegmentsPerXLogId(wal_segsz_bytes) \
+ (UINT64CONST(0x100000000) / (wal_segsz_bytes))
+
+#define XLogSegNoOffsetToRecPtr(segno, offset, wal_segsz_bytes, dest) \
+ (dest) = (segno) * (wal_segsz_bytes) + (offset)
+
+#define XLogSegmentOffset(xlogptr, wal_segsz_bytes) \
+ ((xlogptr) & ((wal_segsz_bytes) - 1))
+
+/*
+ * Compute a segment number from an XLogRecPtr.
+ *
+ * For XLByteToSeg, do the computation at face value. For XLByteToPrevSeg,
+ * a boundary byte is taken to be in the previous segment. This is suitable
+ * for deciding which segment to write given a pointer to a record end,
+ * for example.
+ */
+#define XLByteToSeg(xlrp, logSegNo, wal_segsz_bytes) \
+ logSegNo = (xlrp) / (wal_segsz_bytes)
+
+#define XLByteToPrevSeg(xlrp, logSegNo, wal_segsz_bytes) \
+ logSegNo = ((xlrp) - 1) / (wal_segsz_bytes)
+
+/*
+ * Convert values of GUCs measured in megabytes to equiv. segment count.
+ * Rounds down.
+ */
+#define XLogMBVarToSegs(mbvar, wal_segsz_bytes) \
+ ((mbvar) / ((wal_segsz_bytes) / (1024 * 1024)))
+
+/*
+ * Is an XLogRecPtr within a particular XLOG segment?
+ *
+ * For XLByteInSeg, do the computation at face value. For XLByteInPrevSeg,
+ * a boundary byte is taken to be in the previous segment.
+ */
+#define XLByteInSeg(xlrp, logSegNo, wal_segsz_bytes) \
+ (((xlrp) / (wal_segsz_bytes)) == (logSegNo))
+
+#define XLByteInPrevSeg(xlrp, logSegNo, wal_segsz_bytes) \
+ ((((xlrp) - 1) / (wal_segsz_bytes)) == (logSegNo))
+
+/* Check if an XLogRecPtr value is in a plausible range */
+#define XRecOffIsValid(xlrp) \
+ ((xlrp) % XLOG_BLCKSZ >= SizeOfXLogShortPHD)
+
+/*
+ * The XLog directory and control file (relative to $PGDATA)
+ */
+#define XLOGDIR "pg_wal"
+#define XLOG_CONTROL_FILE "global/pg_control"
+
+/*
+ * These macros encapsulate knowledge about the exact layout of XLog file
+ * names, timeline history file names, and archive-status file names.
+ */
+#define MAXFNAMELEN 64
+
+/* Length of XLog file name */
+#define XLOG_FNAME_LEN 24
+
+/*
+ * Generate a WAL segment file name. Do not use this macro in a helper
+ * function allocating the result generated.
+ */
+#define XLogFileName(fname, tli, logSegNo, wal_segsz_bytes) \
+ snprintf(fname, MAXFNAMELEN, "%08X%08X%08X", tli, \
+ (uint32) ((logSegNo) / XLogSegmentsPerXLogId(wal_segsz_bytes)), \
+ (uint32) ((logSegNo) % XLogSegmentsPerXLogId(wal_segsz_bytes)))
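+
+/*
+ * Illustrative sketch (not part of this header): converting an LSN into a
+ * segment number, an intra-segment offset and the on-disk file name, using
+ * the macros above; the function name is hypothetical.
+ */
+#ifdef XLOG_SEGNAME_EXAMPLE
+static void
+demo_segment_name(XLogRecPtr lsn, TimeLineID tli, int wal_segsz_bytes)
+{
+	XLogSegNo	segno;
+	uint32		offset;
+	char		fname[MAXFNAMELEN];
+
+	XLByteToSeg(lsn, segno, wal_segsz_bytes);
+	offset = (uint32) XLogSegmentOffset(lsn, wal_segsz_bytes);
+	XLogFileName(fname, tli, segno, wal_segsz_bytes);
+
+	/*
+	 * e.g. LSN 0/2000100 with 16MB segments on timeline 1 gives segno 2,
+	 * offset 0x100 and fname "000000010000000000000002".
+	 */
+	(void) offset;
+	(void) fname;
+}
+#endif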
+
+#define XLogFileNameById(fname, tli, log, seg) \
+ snprintf(fname, MAXFNAMELEN, "%08X%08X%08X", tli, log, seg)
+
+#define IsXLogFileName(fname) \
+ (strlen(fname) == XLOG_FNAME_LEN && \
+ strspn(fname, "0123456789ABCDEF") == XLOG_FNAME_LEN)
+
+/*
+ * XLOG segment with .partial suffix. Used by pg_receivewal and at end of
+ * archive recovery, when we want to archive a WAL segment but it might not
+ * be complete yet.
+ */
+#define IsPartialXLogFileName(fname) \
+ (strlen(fname) == XLOG_FNAME_LEN + strlen(".partial") && \
+ strspn(fname, "0123456789ABCDEF") == XLOG_FNAME_LEN && \
+ strcmp((fname) + XLOG_FNAME_LEN, ".partial") == 0)
+
+#define XLogFromFileName(fname, tli, logSegNo, wal_segsz_bytes) \
+ do { \
+ uint32 log; \
+ uint32 seg; \
+ sscanf(fname, "%08X%08X%08X", tli, &log, &seg); \
+ *logSegNo = (uint64) log * XLogSegmentsPerXLogId(wal_segsz_bytes) + seg; \
+ } while (0)
+
+#define XLogFilePath(path, tli, logSegNo, wal_segsz_bytes) \
+ snprintf(path, MAXPGPATH, XLOGDIR "/%08X%08X%08X", tli, \
+ (uint32) ((logSegNo) / XLogSegmentsPerXLogId(wal_segsz_bytes)), \
+ (uint32) ((logSegNo) % XLogSegmentsPerXLogId(wal_segsz_bytes)))
+
+#define TLHistoryFileName(fname, tli) \
+ snprintf(fname, MAXFNAMELEN, "%08X.history", tli)
+
+#define IsTLHistoryFileName(fname) \
+ (strlen(fname) == 8 + strlen(".history") && \
+ strspn(fname, "0123456789ABCDEF") == 8 && \
+ strcmp((fname) + 8, ".history") == 0)
+
+#define TLHistoryFilePath(path, tli) \
+ snprintf(path, MAXPGPATH, XLOGDIR "/%08X.history", tli)
+
+#define StatusFilePath(path, xlog, suffix) \
+ snprintf(path, MAXPGPATH, XLOGDIR "/archive_status/%s%s", xlog, suffix)
+
+#define BackupHistoryFileName(fname, tli, logSegNo, startpoint, wal_segsz_bytes) \
+ snprintf(fname, MAXFNAMELEN, "%08X%08X%08X.%08X.backup", tli, \
+ (uint32) ((logSegNo) / XLogSegmentsPerXLogId(wal_segsz_bytes)), \
+ (uint32) ((logSegNo) % XLogSegmentsPerXLogId(wal_segsz_bytes)), \
+ (uint32) (XLogSegmentOffset(startpoint, wal_segsz_bytes)))
+
+#define IsBackupHistoryFileName(fname) \
+ (strlen(fname) > XLOG_FNAME_LEN && \
+ strspn(fname, "0123456789ABCDEF") == XLOG_FNAME_LEN && \
+ strcmp((fname) + strlen(fname) - strlen(".backup"), ".backup") == 0)
+
+#define BackupHistoryFilePath(path, tli, logSegNo, startpoint, wal_segsz_bytes) \
+ snprintf(path, MAXPGPATH, XLOGDIR "/%08X%08X%08X.%08X.backup", tli, \
+ (uint32) ((logSegNo) / XLogSegmentsPerXLogId(wal_segsz_bytes)), \
+ (uint32) ((logSegNo) % XLogSegmentsPerXLogId(wal_segsz_bytes)), \
+ (uint32) (XLogSegmentOffset((startpoint), wal_segsz_bytes)))
+
+/*
+ * Information logged when we detect a change in one of the parameters
+ * important for Hot Standby.
+ */
+typedef struct xl_parameter_change
+{
+ int MaxConnections;
+ int max_worker_processes;
+ int max_wal_senders;
+ int max_prepared_xacts;
+ int max_locks_per_xact;
+ int wal_level;
+ bool wal_log_hints;
+ bool track_commit_timestamp;
+} xl_parameter_change;
+
+/* logs restore point */
+typedef struct xl_restore_point
+{
+ TimestampTz rp_time;
+ char rp_name[MAXFNAMELEN];
+} xl_restore_point;
+
+/* Overwrite of prior contrecord */
+typedef struct xl_overwrite_contrecord
+{
+ XLogRecPtr overwritten_lsn;
+ TimestampTz overwrite_time;
+} xl_overwrite_contrecord;
+
+/* End of recovery mark, when we don't do an END_OF_RECOVERY checkpoint */
+typedef struct xl_end_of_recovery
+{
+ TimestampTz end_time;
+ TimeLineID ThisTimeLineID; /* new TLI */
+ TimeLineID PrevTimeLineID; /* previous TLI we forked off from */
+} xl_end_of_recovery;
+
+/*
+ * The functions in xloginsert.c construct a chain of XLogRecData structs
+ * to represent the final WAL record.
+ */
+typedef struct XLogRecData
+{
+ struct XLogRecData *next; /* next struct in chain, or NULL */
+ char *data; /* start of rmgr data to include */
+ uint32 len; /* length of rmgr data to include */
+} XLogRecData;
+
+/*
+ * Recovery target action.
+ */
+typedef enum
+{
+ RECOVERY_TARGET_ACTION_PAUSE,
+ RECOVERY_TARGET_ACTION_PROMOTE,
+ RECOVERY_TARGET_ACTION_SHUTDOWN
+} RecoveryTargetAction;
+
+/*
+ * Method table for resource managers.
+ *
+ * This struct must be kept in sync with the PG_RMGR definition in
+ * rmgr.c.
+ *
+ * rm_identify must return a name for the record based on xl_info (without
+ * reference to the rmid). For example, XLOG_BTREE_VACUUM would be named
+ * "VACUUM". rm_desc can then be called to obtain additional detail for the
+ * record, if available (e.g. the last block).
+ *
+ * rm_mask takes as input a page modified by the resource manager and masks
+ * out bits that shouldn't be flagged by wal_consistency_checking.
+ *
+ * RmgrTable[] is indexed by RmgrId values (see rmgrlist.h).
+ */
+typedef struct RmgrData
+{
+ const char *rm_name;
+ void (*rm_redo) (XLogReaderState *record);
+ void (*rm_desc) (StringInfo buf, XLogReaderState *record);
+ const char *(*rm_identify) (uint8 info);
+ void (*rm_startup) (void);
+ void (*rm_cleanup) (void);
+ void (*rm_mask) (char *pagedata, BlockNumber blkno);
+} RmgrData;
+
+extern const RmgrData RmgrTable[];
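+
+/*
+ * Illustrative sketch (not part of this header): the shape of one RmgrTable
+ * entry.  The real table is built in rmgr.c from the PG_RMGR list in
+ * rmgrlist.h; the heap functions named here are declared in
+ * access/heapam_xlog.h, and the entry is shown only to make the struct
+ * concrete.
+ */
+#ifdef RMGRDATA_EXAMPLE
+#include "access/heapam_xlog.h"
+
+static const RmgrData demo_heap_rmgr = {
+	.rm_name = "Heap",
+	.rm_redo = heap_redo,
+	.rm_desc = heap_desc,
+	.rm_identify = heap_identify,
+	.rm_startup = NULL,
+	.rm_cleanup = NULL,
+	.rm_mask = heap_mask,
+};
+#endif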
+
+/*
+ * Exported to support xlog switching from checkpointer
+ */
+extern pg_time_t GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN);
+extern XLogRecPtr RequestXLogSwitch(bool mark_unimportant);
+
+extern void GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli);
+
+/*
+ * Exported for the functions in timeline.c and xlogarchive.c. Only valid
+ * in the startup process.
+ */
+extern bool ArchiveRecoveryRequested;
+extern bool InArchiveRecovery;
+extern bool StandbyMode;
+extern char *recoveryRestoreCommand;
+
+#endif /* XLOG_INTERNAL_H */
diff --git a/src/include/access/xlogarchive.h b/src/include/access/xlogarchive.h
new file mode 100644
index 0000000..3edd1a9
--- /dev/null
+++ b/src/include/access/xlogarchive.h
@@ -0,0 +1,35 @@
+/*------------------------------------------------------------------------
+ *
+ * xlogarchive.h
+ * Prototypes for WAL archives in the backend
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/include/access/xlogarchive.h
+ *
+ *------------------------------------------------------------------------
+ */
+
+#ifndef XLOG_ARCHIVE_H
+#define XLOG_ARCHIVE_H
+
+#include "access/xlogdefs.h"
+
+extern bool RestoreArchivedFile(char *path, const char *xlogfname,
+ const char *recovername, off_t expectedSize,
+ bool cleanupEnabled);
+extern void ExecuteRecoveryCommand(const char *command, const char *commandName,
+ bool failOnSignal);
+extern void KeepFileRestoredFromArchive(const char *path, const char *xlogfname);
+extern void XLogArchiveNotify(const char *xlog);
+extern void XLogArchiveNotifySeg(XLogSegNo segno);
+extern void XLogArchiveForceDone(const char *xlog);
+extern bool XLogArchiveCheckDone(const char *xlog);
+extern bool XLogArchiveIsBusy(const char *xlog);
+extern bool XLogArchiveIsReady(const char *xlog);
+extern bool XLogArchiveIsReadyOrDone(const char *xlog);
+extern void XLogArchiveCleanup(const char *xlog);
+
+#endif /* XLOG_ARCHIVE_H */
diff --git a/src/include/access/xlogdefs.h b/src/include/access/xlogdefs.h
new file mode 100644
index 0000000..0940b64
--- /dev/null
+++ b/src/include/access/xlogdefs.h
@@ -0,0 +1,116 @@
+/*
+ * xlogdefs.h
+ *
+ * Postgres write-ahead log manager record pointer and
+ * timeline number definitions
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/xlogdefs.h
+ */
+#ifndef XLOG_DEFS_H
+#define XLOG_DEFS_H
+
+#include <fcntl.h> /* need open() flags */
+
+/*
+ * Pointer to a location in the XLOG. These pointers are 64 bits wide,
+ * because we don't want them ever to overflow.
+ */
+typedef uint64 XLogRecPtr;
+
+/*
+ * Zero is used to indicate an invalid pointer. Bootstrap skips the first
+ * possible
+ * WAL segment, initializing the first WAL page at WAL segment size, so no XLOG
+ * record can begin at zero.
+ */
+#define InvalidXLogRecPtr 0
+#define XLogRecPtrIsInvalid(r) ((r) == InvalidXLogRecPtr)
+
+/*
+ * First LSN to use for "fake" LSNs.
+ *
+ * Values smaller than this can be used for special per-AM purposes.
+ */
+#define FirstNormalUnloggedLSN ((XLogRecPtr) 1000)
+
+/*
+ * Handy macro for printing XLogRecPtr in conventional format, e.g.,
+ *
+ * printf("%X/%X", LSN_FORMAT_ARGS(lsn));
+ */
+#define LSN_FORMAT_ARGS(lsn) (AssertVariableIsOfTypeMacro((lsn), XLogRecPtr), (uint32) ((lsn) >> 32)), ((uint32) (lsn))
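+
+/*
+ * Illustrative sketch (not part of this header): formatting an LSN with
+ * LSN_FORMAT_ARGS(); the function name is hypothetical.
+ */
+#ifdef LSN_FORMAT_EXAMPLE
+static void
+demo_format_lsn(XLogRecPtr lsn)
+{
+	char		buf[32];
+
+	/* e.g. lsn 0x00000002_0000A1B2 renders as "2/A1B2" */
+	snprintf(buf, sizeof(buf), "%X/%X", LSN_FORMAT_ARGS(lsn));
+}
+#endif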
+
+/*
+ * XLogSegNo - physical log file sequence number.
+ */
+typedef uint64 XLogSegNo;
+
+/*
+ * TimeLineID (TLI) - identifies different database histories to prevent
+ * confusion after restoring a prior state of a database installation.
+ * TLI does not change in a normal stop/restart of the database (including
+ * crash-and-recover cases); but we must assign a new TLI after doing
+ * a recovery to a prior state, a/k/a point-in-time recovery. This makes
+ * the new WAL logfile sequence we generate distinguishable from the
+ * sequence that was generated in the previous incarnation.
+ */
+typedef uint32 TimeLineID;
+
+/*
+ * Replication origin id - this is located in this file to avoid having to
+ * include origin.h in a bunch of xlog related places.
+ */
+typedef uint16 RepOriginId;
+
+/*
+ * Because O_DIRECT bypasses the kernel buffers, and because we never
+ * read those buffers except during crash recovery or if wal_level != minimal,
+ * it is a win to use it in all cases where we sync on each write(). We could
+ * allow O_DIRECT with fsync(), but it is unclear if fsync() could process
+ * writes not buffered in the kernel. Also, O_DIRECT is never enough to force
+ * data to the drives; it merely tries to bypass the kernel cache, so we still
+ * need O_SYNC/O_DSYNC.
+ */
+#ifdef O_DIRECT
+#define PG_O_DIRECT O_DIRECT
+#else
+#define PG_O_DIRECT 0
+#endif
+
+/*
+ * This chunk of hackery attempts to determine which file sync methods
+ * are available on the current platform, and to choose an appropriate
+ * default method. We assume that fsync() is always available, and that
+ * configure determined whether fdatasync() is.
+ */
+#if defined(O_SYNC)
+#define OPEN_SYNC_FLAG O_SYNC
+#elif defined(O_FSYNC)
+#define OPEN_SYNC_FLAG O_FSYNC
+#endif
+
+#if defined(O_DSYNC)
+#if defined(OPEN_SYNC_FLAG)
+/* O_DSYNC is distinct? */
+#if O_DSYNC != OPEN_SYNC_FLAG
+#define OPEN_DATASYNC_FLAG O_DSYNC
+#endif
+#else /* !defined(OPEN_SYNC_FLAG) */
+/* Win32 only has O_DSYNC */
+#define OPEN_DATASYNC_FLAG O_DSYNC
+#endif
+#endif
+
+#if defined(PLATFORM_DEFAULT_SYNC_METHOD)
+#define DEFAULT_SYNC_METHOD PLATFORM_DEFAULT_SYNC_METHOD
+#elif defined(OPEN_DATASYNC_FLAG)
+#define DEFAULT_SYNC_METHOD SYNC_METHOD_OPEN_DSYNC
+#elif defined(HAVE_FDATASYNC)
+#define DEFAULT_SYNC_METHOD SYNC_METHOD_FDATASYNC
+#else
+#define DEFAULT_SYNC_METHOD SYNC_METHOD_FSYNC
+#endif
+
+#endif /* XLOG_DEFS_H */
diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h
new file mode 100644
index 0000000..f1d8c39
--- /dev/null
+++ b/src/include/access/xloginsert.h
@@ -0,0 +1,66 @@
+/*
+ * xloginsert.h
+ *
+ * Functions for generating WAL records
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/xloginsert.h
+ */
+#ifndef XLOGINSERT_H
+#define XLOGINSERT_H
+
+#include "access/rmgr.h"
+#include "access/xlogdefs.h"
+#include "storage/block.h"
+#include "storage/buf.h"
+#include "storage/relfilenode.h"
+#include "utils/relcache.h"
+
+/*
+ * The minimum size of the WAL construction working area. If you need to
+ * register more than XLR_NORMAL_MAX_BLOCK_ID block references or have more
+ * than XLR_NORMAL_RDATAS data chunks in a single WAL record, you must call
+ * XLogEnsureRecordSpace() first to allocate more working memory.
+ */
+#define XLR_NORMAL_MAX_BLOCK_ID 4
+#define XLR_NORMAL_RDATAS 20
+
+/* flags for XLogRegisterBuffer */
+#define REGBUF_FORCE_IMAGE 0x01 /* force a full-page image */
+#define REGBUF_NO_IMAGE 0x02 /* don't take a full-page image */
+#define REGBUF_WILL_INIT (0x04 | 0x02) /* page will be re-initialized at
+ * replay (implies NO_IMAGE) */
+#define REGBUF_STANDARD 0x08 /* page follows "standard" page layout,
+ * (data between pd_lower and pd_upper
+ * will be skipped) */
+#define REGBUF_KEEP_DATA 0x10 /* include data even if a full-page image
+ * is taken */
+
+/* prototypes for public functions in xloginsert.c: */
+extern void XLogBeginInsert(void);
+extern void XLogSetRecordFlags(uint8 flags);
+extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info);
+extern void XLogEnsureRecordSpace(int max_block_id, int ndatas);
+extern void XLogRegisterData(char *data, int len);
+extern void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags);
+extern void XLogRegisterBlock(uint8 block_id, RelFileNode *rnode,
+ ForkNumber forknum, BlockNumber blknum, char *page,
+ uint8 flags);
+extern void XLogRegisterBufData(uint8 block_id, char *data, int len);
+extern void XLogResetInsertion(void);
+extern bool XLogCheckBufferNeedsBackup(Buffer buffer);
+
+extern XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum,
+ BlockNumber blk, char *page, bool page_std);
+extern void log_newpages(RelFileNode *rnode, ForkNumber forkNum, int num_pages,
+ BlockNumber *blknos, char **pages, bool page_std);
+extern XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std);
+extern void log_newpage_range(Relation rel, ForkNumber forkNum,
+ BlockNumber startblk, BlockNumber endblk, bool page_std);
+extern XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std);
+
+extern void InitXLogInsert(void);
+
+#endif /* XLOGINSERT_H */
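+
+/*
+ * Illustrative sketch appended to this diff (not part of the header): the
+ * canonical sequence for emitting a WAL record that touches one buffer.  The
+ * record struct, rmgr id and info code (DemoXlogRec, RM_DEMO_ID,
+ * XLOG_DEMO_CHANGE) are placeholders, not real constants; real callers also
+ * wrap the page modification, MarkBufferDirty() and this sequence in a
+ * critical section.
+ */
+#ifdef XLOGINSERT_EXAMPLE
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+#include "storage/off.h"
+
+typedef struct DemoXlogRec
+{
+	OffsetNumber offnum;		/* hypothetical payload */
+} DemoXlogRec;
+
+static void
+demo_log_change(Buffer buffer, OffsetNumber offnum)
+{
+	DemoXlogRec xlrec;
+	XLogRecPtr	recptr;
+
+	xlrec.offnum = offnum;
+
+	XLogBeginInsert();
+	XLogRegisterData((char *) &xlrec, sizeof(xlrec));
+	XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+
+	recptr = XLogInsert(RM_DEMO_ID, XLOG_DEMO_CHANGE);
+
+	PageSetLSN(BufferGetPage(buffer), recptr);
+}
+#endif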
diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h
new file mode 100644
index 0000000..10458c2
--- /dev/null
+++ b/src/include/access/xlogreader.h
@@ -0,0 +1,340 @@
+/*-------------------------------------------------------------------------
+ *
+ * xlogreader.h
+ * Definitions for the generic XLog reading facility
+ *
+ * Portions Copyright (c) 2013-2021, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/include/access/xlogreader.h
+ *
+ * NOTES
+ * See the definition of the XLogReaderState struct for instructions on
+ * how to use the XLogReader infrastructure.
+ *
+ * The basic idea is to allocate an XLogReaderState via
+ * XLogReaderAllocate(), position the reader to the first record with
+ * XLogBeginRead() or XLogFindNextRecord(), and call XLogReadRecord()
+ * until it returns NULL.
+ *
+ * Callers supply a page_read callback if they want to call
+ * XLogReadRecord or XLogFindNextRecord; it can be passed in as NULL
+ * otherwise. The WALRead function can be used as a helper to write
+ * page_read callbacks, but it is not mandatory; callers that use it
+ * must supply segment_open callbacks. The segment_close callback
+ * must always be supplied.
+ *
+ * After reading a record with XLogReadRecord(), it's decomposed into
+ * the per-block and main data parts, and the parts can be accessed
+ * with the XLogRec* macros and functions. You can also decode a
+ * record that's already constructed in memory, without reading from
+ * disk, by calling the DecodeXLogRecord() function.
+ *-------------------------------------------------------------------------
+ */
+#ifndef XLOGREADER_H
+#define XLOGREADER_H
+
+#ifndef FRONTEND
+#include "access/transam.h"
+#endif
+
+#include "access/xlogrecord.h"
+
+/* WALOpenSegment represents a WAL segment being read. */
+typedef struct WALOpenSegment
+{
+ int ws_file; /* segment file descriptor */
+ XLogSegNo ws_segno; /* segment number */
+ TimeLineID ws_tli; /* timeline ID of the currently open file */
+} WALOpenSegment;
+
+/* WALSegmentContext carries context information about WAL segments to read */
+typedef struct WALSegmentContext
+{
+ char ws_dir[MAXPGPATH];
+ int ws_segsize;
+} WALSegmentContext;
+
+typedef struct XLogReaderState XLogReaderState;
+
+/* Function type definitions for various xlogreader interactions */
+typedef int (*XLogPageReadCB) (XLogReaderState *xlogreader,
+ XLogRecPtr targetPagePtr,
+ int reqLen,
+ XLogRecPtr targetRecPtr,
+ char *readBuf);
+typedef void (*WALSegmentOpenCB) (XLogReaderState *xlogreader,
+ XLogSegNo nextSegNo,
+ TimeLineID *tli_p);
+typedef void (*WALSegmentCloseCB) (XLogReaderState *xlogreader);
+
+typedef struct XLogReaderRoutine
+{
+ /*
+ * Data input callback
+ *
+ * This callback shall read at least reqLen valid bytes of the xlog page
+ * starting at targetPagePtr, and store them in readBuf. The callback
+ * shall return the number of bytes read (never more than XLOG_BLCKSZ), or
+ * -1 on failure. The callback shall sleep, if necessary, to wait for the
+ * requested bytes to become available. The callback will not be invoked
+ * again for the same page unless more than the returned number of bytes
+ * are needed.
+ *
+ * targetRecPtr is the position of the WAL record we're reading. Usually
+ * it is equal to targetPagePtr + reqLen, but sometimes xlogreader needs
+ * to read and verify the page or segment header before it reads the
+ * actual WAL record it's interested in. In that case, targetRecPtr can
+ * be used to determine which timeline to read the page from.
+ *
+ * The callback shall set ->seg.ws_tli to the TLI of the file the page was
+ * read from.
+ */
+ XLogPageReadCB page_read;
+
+ /*
+ * Callback to open the specified WAL segment for reading. ->seg.ws_file
+ * shall be set to the file descriptor of the opened segment. In case of
+ * failure, an error shall be raised by the callback and it shall not
+ * return.
+ *
+ * "nextSegNo" is the number of the segment to be opened.
+ *
+ * "tli_p" is an input/output argument. WALRead() uses it to pass the
+ * timeline in which the new segment should be found, but the callback can
+ * use it to return the TLI that it actually opened.
+ */
+ WALSegmentOpenCB segment_open;
+
+ /*
+ * WAL segment close callback. ->seg.ws_file shall be set to a negative
+ * number.
+ */
+ WALSegmentCloseCB segment_close;
+} XLogReaderRoutine;
+
+#define XL_ROUTINE(...) &(XLogReaderRoutine){__VA_ARGS__}
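+
+/*
+ * Illustrative sketch: a caller might allocate a reader by pairing its own
+ * callbacks with XL_ROUTINE and XLogReaderAllocate() (declared below). The
+ * callback names and the private-data pointer are placeholders supplied by
+ * the caller, not part of this API.
+ *
+ *		state = XLogReaderAllocate(wal_segment_size, waldir,
+ *								   XL_ROUTINE(.page_read = my_page_read,
+ *											  .segment_open = my_segment_open,
+ *											  .segment_close = my_segment_close),
+ *								   my_private_data);
+ *		if (state == NULL)
+ *			(report out-of-memory)
+ */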
+
+typedef struct
+{
+ /* Is this block ref in use? */
+ bool in_use;
+
+ /* Identify the block this refers to */
+ RelFileNode rnode;
+ ForkNumber forknum;
+ BlockNumber blkno;
+
+ /* copy of the fork_flags field from the XLogRecordBlockHeader */
+ uint8 flags;
+
+ /* Information on full-page image, if any */
+ bool has_image; /* has image, even for consistency checking */
+ bool apply_image; /* has image that should be restored */
+ char *bkp_image;
+ uint16 hole_offset;
+ uint16 hole_length;
+ uint16 bimg_len;
+ uint8 bimg_info;
+
+ /* Buffer holding the rmgr-specific data associated with this block */
+ bool has_data;
+ char *data;
+ uint16 data_len;
+ uint16 data_bufsz;
+} DecodedBkpBlock;
+
+struct XLogReaderState
+{
+ /*
+ * Operational callbacks
+ */
+ XLogReaderRoutine routine;
+
+ /* ----------------------------------------
+ * Public parameters
+ * ----------------------------------------
+ */
+
+ /*
+ * System identifier of the xlog files we're about to read. Set to zero
+ * (the default value) if unknown or unimportant.
+ */
+ uint64 system_identifier;
+
+ /*
+ * Opaque data for callbacks to use. Not used by XLogReader.
+ */
+ void *private_data;
+
+ /*
+ * Start and end point of last record read. EndRecPtr is also used as the
+ * position to read next. Calling XLogBeginRead() sets EndRecPtr to the
+ * starting position and ReadRecPtr to invalid.
+ */
+ XLogRecPtr ReadRecPtr; /* start of last record read */
+ XLogRecPtr EndRecPtr; /* end+1 of last record read */
+
+ /* ----------------------------------------
+ * Decoded representation of current record
+ *
+ * Use XLogRecGet* functions to investigate the record; these fields
+ * should not be accessed directly.
+ * ----------------------------------------
+ */
+ XLogRecord *decoded_record; /* currently decoded record */
+
+ char *main_data; /* record's main data portion */
+ uint32 main_data_len; /* main data portion's length */
+ uint32 main_data_bufsz; /* allocated size of the buffer */
+
+ RepOriginId record_origin;
+
+ TransactionId toplevel_xid; /* XID of top-level transaction */
+
+ /* information about blocks referenced by the record. */
+ DecodedBkpBlock blocks[XLR_MAX_BLOCK_ID + 1];
+
+ int max_block_id; /* highest block_id in use (-1 if none) */
+
+ /* ----------------------------------------
+ * private/internal state
+ * ----------------------------------------
+ */
+
+ /*
+ * Buffer for currently read page (XLOG_BLCKSZ bytes, valid up to at least
+ * readLen bytes)
+ */
+ char *readBuf;
+ uint32 readLen;
+
+ /* last read XLOG position for data currently in readBuf */
+ WALSegmentContext segcxt;
+ WALOpenSegment seg;
+ uint32 segoff;
+
+ /*
+ * beginning of prior page read, and its TLI. Doesn't necessarily
+ * correspond to what's in readBuf; used for timeline sanity checks.
+ */
+ XLogRecPtr latestPagePtr;
+ TimeLineID latestPageTLI;
+
+ /* beginning of the WAL record being read. */
+ XLogRecPtr currRecPtr;
+ /* timeline to read it from, 0 if a lookup is required */
+ TimeLineID currTLI;
+
+ /*
+ * Safe point to read to in currTLI if current TLI is historical
+ * (tliSwitchPoint) or InvalidXLogRecPtr if on current timeline.
+ *
+ * Actually set to the start of the segment containing the timeline switch
+ * that ends currTLI's validity, not the LSN of the switch itself, since
+ * we can't assume the old segment will be present.
+ */
+ XLogRecPtr currTLIValidUntil;
+
+ /*
+ * If currTLI is not the most recent known timeline, the next timeline to
+ * read from when currTLIValidUntil is reached.
+ */
+ TimeLineID nextTLI;
+
+ /*
+ * Buffer for current ReadRecord result (expandable), used when a record
+ * crosses a page boundary.
+ */
+ char *readRecordBuf;
+ uint32 readRecordBufSize;
+
+ /* Buffer to hold error message */
+ char *errormsg_buf;
+
+ /*
+ * Set at the end of recovery: the start point of a partial record at the
+ * end of WAL (InvalidXLogRecPtr if there wasn't one), and the start
+ * location of its first contrecord that went missing.
+ */
+ XLogRecPtr abortedRecPtr;
+ XLogRecPtr missingContrecPtr;
+ /* Set when XLP_FIRST_IS_OVERWRITE_CONTRECORD is found */
+ XLogRecPtr overwrittenRecPtr;
+};
+
+/* Get a new XLogReader */
+extern XLogReaderState *XLogReaderAllocate(int wal_segment_size,
+ const char *waldir,
+ XLogReaderRoutine *routine,
+ void *private_data);
+extern XLogReaderRoutine *LocalXLogReaderRoutine(void);
+
+/* Free an XLogReader */
+extern void XLogReaderFree(XLogReaderState *state);
+
+/* Position the XLogReader to given record */
+extern void XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr);
+#ifdef FRONTEND
+extern XLogRecPtr XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr);
+#endif /* FRONTEND */
+
+/* Read the next XLog record. Returns NULL on end-of-WAL or failure */
+extern struct XLogRecord *XLogReadRecord(XLogReaderState *state,
+ char **errormsg);
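+
+/*
+ * Illustrative read loop (start_lsn and errormsg are caller variables):
+ *
+ *		XLogBeginRead(state, start_lsn);
+ *		for (;;)
+ *		{
+ *			XLogRecord *record = XLogReadRecord(state, &errormsg);
+ *
+ *			if (record == NULL)
+ *				break;			(end of WAL, or errormsg is set on failure)
+ *
+ *			(examine the record with the XLogRecGet* macros below)
+ *		}
+ */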
+
+/* Validate a page */
+extern bool XLogReaderValidatePageHeader(XLogReaderState *state,
+ XLogRecPtr recptr, char *phdr);
+
+/*
+ * Error information from WALRead that both backend and frontend callers can
+ * process. Currently only errors from pg_pread can be reported.
+ */
+typedef struct WALReadError
+{
+ int wre_errno; /* errno set by the last pg_pread() */
+ int wre_off; /* Offset we tried to read from. */
+ int wre_req; /* Bytes requested to be read. */
+ int wre_read; /* Bytes read by the last read(). */
+ WALOpenSegment wre_seg; /* Segment we tried to read from. */
+} WALReadError;
+
+extern bool WALRead(XLogReaderState *state,
+ char *buf, XLogRecPtr startptr, Size count,
+ TimeLineID tli, WALReadError *errinfo);
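+
+/*
+ * Sketch of how a backend page_read callback might use WALRead(); on failure
+ * it can report the error with WALReadRaiseError() from xlogutils.h:
+ *
+ *		if (!WALRead(state, readBuf, startptr, count, tli, &errinfo))
+ *			WALReadRaiseError(&errinfo);
+ */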
+
+/* Functions for decoding an XLogRecord */
+
+extern bool DecodeXLogRecord(XLogReaderState *state, XLogRecord *record,
+ char **errmsg);
+
+#define XLogRecGetTotalLen(decoder) ((decoder)->decoded_record->xl_tot_len)
+#define XLogRecGetPrev(decoder) ((decoder)->decoded_record->xl_prev)
+#define XLogRecGetInfo(decoder) ((decoder)->decoded_record->xl_info)
+#define XLogRecGetRmid(decoder) ((decoder)->decoded_record->xl_rmid)
+#define XLogRecGetXid(decoder) ((decoder)->decoded_record->xl_xid)
+#define XLogRecGetOrigin(decoder) ((decoder)->record_origin)
+#define XLogRecGetTopXid(decoder) ((decoder)->toplevel_xid)
+#define XLogRecGetData(decoder) ((decoder)->main_data)
+#define XLogRecGetDataLen(decoder) ((decoder)->main_data_len)
+#define XLogRecHasAnyBlockRefs(decoder) ((decoder)->max_block_id >= 0)
+#define XLogRecHasBlockRef(decoder, block_id) \
+ ((decoder)->blocks[block_id].in_use)
+#define XLogRecHasBlockImage(decoder, block_id) \
+ ((decoder)->blocks[block_id].has_image)
+#define XLogRecBlockImageApply(decoder, block_id) \
+ ((decoder)->blocks[block_id].apply_image)
+
+#ifndef FRONTEND
+extern FullTransactionId XLogRecGetFullXid(XLogReaderState *record);
+#endif
+
+extern bool RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page);
+extern char *XLogRecGetBlockData(XLogReaderState *record, uint8 block_id, Size *len);
+extern bool XLogRecGetBlockTag(XLogReaderState *record, uint8 block_id,
+ RelFileNode *rnode, ForkNumber *forknum,
+ BlockNumber *blknum);
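+
+/*
+ * Illustrative sketch: after XLogReadRecord() succeeds, the block references
+ * of the decoded record can be walked like this (all variables are the
+ * caller's):
+ *
+ *		for (block_id = 0; block_id <= state->max_block_id; block_id++)
+ *		{
+ *			RelFileNode rnode;
+ *			ForkNumber	forknum;
+ *			BlockNumber blkno;
+ *
+ *			if (!XLogRecHasBlockRef(state, block_id))
+ *				continue;
+ *			XLogRecGetBlockTag(state, block_id, &rnode, &forknum, &blkno);
+ *			(use the block tag, image, or data here)
+ *		}
+ */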
+
+#endif /* XLOGREADER_H */
diff --git a/src/include/access/xlogrecord.h b/src/include/access/xlogrecord.h
new file mode 100644
index 0000000..f68cb18
--- /dev/null
+++ b/src/include/access/xlogrecord.h
@@ -0,0 +1,229 @@
+/*
+ * xlogrecord.h
+ *
+ * Definitions for the WAL record format.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/xlogrecord.h
+ */
+#ifndef XLOGRECORD_H
+#define XLOGRECORD_H
+
+#include "access/rmgr.h"
+#include "access/xlogdefs.h"
+#include "port/pg_crc32c.h"
+#include "storage/block.h"
+#include "storage/relfilenode.h"
+
+/*
+ * The overall layout of an XLOG record is:
+ * Fixed-size header (XLogRecord struct)
+ * XLogRecordBlockHeader struct
+ * XLogRecordBlockHeader struct
+ * ...
+ * XLogRecordDataHeader[Short|Long] struct
+ * block data
+ * block data
+ * ...
+ * main data
+ *
+ * There can be zero or more XLogRecordBlockHeaders, and zero or more bytes of
+ * rmgr-specific data not associated with a block. XLogRecord structs
+ * always start on MAXALIGN boundaries in the WAL files, but the rest of
+ * the fields are not aligned.
+ *
+ * The XLogRecordBlockHeader, XLogRecordDataHeaderShort and
+ * XLogRecordDataHeaderLong structs all begin with a single 'id' byte. It's
+ * used to distinguish between block references and the main data structs.
+ */
+typedef struct XLogRecord
+{
+ uint32 xl_tot_len; /* total len of entire record */
+ TransactionId xl_xid; /* xact id */
+ XLogRecPtr xl_prev; /* ptr to previous record in log */
+ uint8 xl_info; /* flag bits, see below */
+ RmgrId xl_rmid; /* resource manager for this record */
+ /* 2 bytes of padding here, initialize to zero */
+ pg_crc32c xl_crc; /* CRC for this record */
+
+ /* XLogRecordBlockHeaders and XLogRecordDataHeader follow, no padding */
+
+} XLogRecord;
+
+#define SizeOfXLogRecord (offsetof(XLogRecord, xl_crc) + sizeof(pg_crc32c))
+
+/*
+ * The high 4 bits in xl_info may be used freely by rmgr. The
+ * XLR_SPECIAL_REL_UPDATE and XLR_CHECK_CONSISTENCY bits can be passed by
+ * XLogInsert caller. The rest are set internally by XLogInsert.
+ */
+#define XLR_INFO_MASK 0x0F
+#define XLR_RMGR_INFO_MASK 0xF0
+
+/*
+ * If a WAL record modifies any relation files, in ways not covered by the
+ * usual block references, this flag is set. This is not used for anything
+ * by PostgreSQL itself, but it allows external tools that read WAL and keep
+ * track of modified blocks to recognize such special record types.
+ */
+#define XLR_SPECIAL_REL_UPDATE 0x01
+
+/*
+ * Enforces consistency checks of replayed WAL at recovery. If enabled,
+ * each record will log a full-page write for each block modified by the
+ * record and will reuse it afterwards for consistency checks. The caller
+ * of XLogInsert can set this flag if necessary, but if
+ * wal_consistency_checking is enabled for the record's rmgr, it is set
+ * unconditionally.
+ */
+#define XLR_CHECK_CONSISTENCY 0x02
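+
+/*
+ * Illustrative sketch: redo and decoding routines conventionally mask out
+ * these bits to recover the rmgr-specific info, e.g. (XLOG_MY_ACTION is a
+ * placeholder):
+ *
+ *		uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+ *
+ *		switch (info)
+ *		{
+ *			case XLOG_MY_ACTION:
+ *				...
+ *		}
+ */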
+
+/*
+ * Header info for block data appended to an XLOG record.
+ *
+ * 'data_length' is the length of the rmgr-specific payload data associated
+ * with this block. It does not include the possible full page image, nor
+ * XLogRecordBlockHeader struct itself.
+ *
+ * Note that we don't attempt to align the XLogRecordBlockHeader struct!
+ * So, the struct must be copied to aligned local storage before use.
+ */
+typedef struct XLogRecordBlockHeader
+{
+ uint8 id; /* block reference ID */
+ uint8 fork_flags; /* fork within the relation, and flags */
+ uint16 data_length; /* number of payload bytes (not including page
+ * image) */
+
+ /* If BKPBLOCK_HAS_IMAGE, an XLogRecordBlockImageHeader struct follows */
+ /* If BKPBLOCK_SAME_REL is not set, a RelFileNode follows */
+ /* BlockNumber follows */
+} XLogRecordBlockHeader;
+
+#define SizeOfXLogRecordBlockHeader (offsetof(XLogRecordBlockHeader, data_length) + sizeof(uint16))
+
+/*
+ * Additional header information when a full-page image is included
+ * (i.e. when BKPBLOCK_HAS_IMAGE is set).
+ *
+ * The XLOG code is aware that PG data pages usually contain an unused "hole"
+ * in the middle, which contains only zero bytes. Since we know that the
+ * "hole" is all zeros, we remove it from the stored data (and it's not counted
+ * in the XLOG record's CRC, either). Hence, the amount of block data actually
+ * present is (BLCKSZ - <length of "hole" bytes>).
+ *
+ * Additionally, when wal_compression is enabled, we will try to compress full
+ * page images using the PGLZ compression algorithm, after removing the
+ * "hole". This can reduce the WAL volume, at some extra CPU cost during WAL
+ * logging. In this case, since the "hole" length can no longer be derived by
+ * subtracting the number of page image bytes from BLCKSZ, it must be stored
+ * explicitly as extra information. When no "hole" exists, the "hole" length
+ * is assumed to be zero and no such extra information is stored. Note that
+ * the original page image is stored in WAL instead of the compressed one if
+ * the number of bytes saved by compression is less than the length of the
+ * extra information. Hence, when a page image is successfully compressed,
+ * the amount of block data actually present is less than
+ * BLCKSZ - the length of the "hole" - the length of the extra information.
+ */
+typedef struct XLogRecordBlockImageHeader
+{
+ uint16 length; /* number of page image bytes */
+ uint16 hole_offset; /* number of bytes before "hole" */
+ uint8 bimg_info; /* flag bits, see below */
+
+ /*
+ * If BKPIMAGE_HAS_HOLE and BKPIMAGE_IS_COMPRESSED, an
+ * XLogRecordBlockCompressHeader struct follows.
+ */
+} XLogRecordBlockImageHeader;
+
+#define SizeOfXLogRecordBlockImageHeader \
+ (offsetof(XLogRecordBlockImageHeader, bimg_info) + sizeof(uint8))
+
+/* Information stored in bimg_info */
+#define BKPIMAGE_HAS_HOLE 0x01 /* page image has "hole" */
+#define BKPIMAGE_IS_COMPRESSED 0x02 /* page image is compressed */
+#define BKPIMAGE_APPLY 0x04 /* page image should be restored during
+ * replay */
+
+/*
+ * Extra header information used when page image has "hole" and
+ * is compressed.
+ */
+typedef struct XLogRecordBlockCompressHeader
+{
+ uint16 hole_length; /* number of bytes in "hole" */
+} XLogRecordBlockCompressHeader;
+
+#define SizeOfXLogRecordBlockCompressHeader \
+ sizeof(XLogRecordBlockCompressHeader)
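+
+/*
+ * Sketch of how a stored image with a "hole" is expanded back into a BLCKSZ
+ * page in the uncompressed case (RestoreBlockImage() in xlogreader.c is the
+ * real implementation and also handles decompression):
+ *
+ *		memcpy(page, bkp_image, hole_offset);
+ *		memset(page + hole_offset, 0, hole_length);
+ *		memcpy(page + hole_offset + hole_length,
+ *			   bkp_image + hole_offset,
+ *			   BLCKSZ - (hole_offset + hole_length));
+ */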
+
+/*
+ * Maximum size of the header for a block reference. This is used to size a
+ * temporary buffer for constructing the header.
+ */
+#define MaxSizeOfXLogRecordBlockHeader \
+ (SizeOfXLogRecordBlockHeader + \
+ SizeOfXLogRecordBlockImageHeader + \
+ SizeOfXLogRecordBlockCompressHeader + \
+ sizeof(RelFileNode) + \
+ sizeof(BlockNumber))
+
+/*
+ * The fork number fits in the lower 4 bits in the fork_flags field. The upper
+ * bits are used for flags.
+ */
+#define BKPBLOCK_FORK_MASK 0x0F
+#define BKPBLOCK_FLAG_MASK 0xF0
+#define BKPBLOCK_HAS_IMAGE 0x10 /* block data is an XLogRecordBlockImage */
+#define BKPBLOCK_HAS_DATA 0x20
+#define BKPBLOCK_WILL_INIT 0x40 /* redo will re-init the page */
+#define BKPBLOCK_SAME_REL 0x80 /* RelFileNode omitted, same as previous */
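+
+/*
+ * Illustrative decoding of fork_flags (bkpb is a local XLogRecordBlockHeader
+ * copy, per the alignment note above):
+ *
+ *		ForkNumber	forknum = (ForkNumber) (bkpb.fork_flags & BKPBLOCK_FORK_MASK);
+ *		bool		has_image = (bkpb.fork_flags & BKPBLOCK_HAS_IMAGE) != 0;
+ *		bool		has_data = (bkpb.fork_flags & BKPBLOCK_HAS_DATA) != 0;
+ */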
+
+/*
+ * XLogRecordDataHeaderShort/Long are used for the "main data" portion of
+ * the record. If the length of the data is less than 256 bytes, the short
+ * form is used, with a single byte to hold the length. Otherwise the long
+ * form is used.
+ *
+ * (These structs are currently not used in the code; they are here just for
+ * documentation purposes.)
+ */
+typedef struct XLogRecordDataHeaderShort
+{
+ uint8 id; /* XLR_BLOCK_ID_DATA_SHORT */
+ uint8 data_length; /* number of payload bytes */
+} XLogRecordDataHeaderShort;
+
+#define SizeOfXLogRecordDataHeaderShort (sizeof(uint8) * 2)
+
+typedef struct XLogRecordDataHeaderLong
+{
+ uint8 id; /* XLR_BLOCK_ID_DATA_LONG */
+ /* followed by uint32 data_length, unaligned */
+} XLogRecordDataHeaderLong;
+
+#define SizeOfXLogRecordDataHeaderLong (sizeof(uint8) + sizeof(uint32))
+
+/*
+ * Block IDs used to distinguish different kinds of record fragments. Block
+ * references are numbered from 0 to XLR_MAX_BLOCK_ID. A rmgr is free to use
+ * any ID number in that range (although you should stick to small numbers,
+ * because the WAL machinery is optimized for that case). A few ID
+ * numbers are reserved to denote the "main" data portion of the record,
+ * as well as replication-supporting transaction metadata.
+ *
+ * The maximum is currently set at 32, quite arbitrarily. Most records only
+ * need a handful of block references, but there are a few exceptions that
+ * need more.
+ */
+#define XLR_MAX_BLOCK_ID 32
+
+#define XLR_BLOCK_ID_DATA_SHORT 255
+#define XLR_BLOCK_ID_DATA_LONG 254
+#define XLR_BLOCK_ID_ORIGIN 253
+#define XLR_BLOCK_ID_TOPLEVEL_XID 252
+
+#endif /* XLOGRECORD_H */
diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h
new file mode 100644
index 0000000..9ac602b
--- /dev/null
+++ b/src/include/access/xlogutils.h
@@ -0,0 +1,63 @@
+/*
+ * xlogutils.h
+ *
+ * Utilities for replaying WAL records.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/xlogutils.h
+ */
+#ifndef XLOG_UTILS_H
+#define XLOG_UTILS_H
+
+#include "access/xlogreader.h"
+#include "storage/bufmgr.h"
+
+
+extern bool XLogHaveInvalidPages(void);
+extern void XLogCheckInvalidPages(void);
+
+extern void XLogDropRelation(RelFileNode rnode, ForkNumber forknum);
+extern void XLogDropDatabase(Oid dbid);
+extern void XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum,
+ BlockNumber nblocks);
+
+/* Result codes for XLogReadBufferForRedo[Extended] */
+typedef enum
+{
+ BLK_NEEDS_REDO, /* changes from WAL record need to be applied */
+ BLK_DONE, /* block is already up-to-date */
+ BLK_RESTORED, /* block was restored from a full-page image */
+ BLK_NOTFOUND /* block was not found (and hence does not
+ * need to be replayed) */
+} XLogRedoAction;
+
+extern XLogRedoAction XLogReadBufferForRedo(XLogReaderState *record,
+ uint8 buffer_id, Buffer *buf);
+extern Buffer XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id);
+extern XLogRedoAction XLogReadBufferForRedoExtended(XLogReaderState *record,
+ uint8 buffer_id,
+ ReadBufferMode mode, bool get_cleanup_lock,
+ Buffer *buf);
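+
+/*
+ * Illustrative redo-routine sketch ("lsn" would typically be the record's end
+ * LSN; the page modification itself is rmgr-specific):
+ *
+ *		if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+ *		{
+ *			Page		page = BufferGetPage(buffer);
+ *
+ *			(apply the change described by the record to "page")
+ *			PageSetLSN(page, lsn);
+ *			MarkBufferDirty(buffer);
+ *		}
+ *		if (BufferIsValid(buffer))
+ *			UnlockReleaseBuffer(buffer);
+ */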
+
+extern Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
+ BlockNumber blkno, ReadBufferMode mode);
+
+extern Relation CreateFakeRelcacheEntry(RelFileNode rnode);
+extern void FreeFakeRelcacheEntry(Relation fakerel);
+
+extern int read_local_xlog_page(XLogReaderState *state,
+ XLogRecPtr targetPagePtr, int reqLen,
+ XLogRecPtr targetRecPtr, char *cur_page);
+extern void wal_segment_open(XLogReaderState *state,
+ XLogSegNo nextSegNo,
+ TimeLineID *tli_p);
+extern void wal_segment_close(XLogReaderState *state);
+
+extern void XLogReadDetermineTimeline(XLogReaderState *state,
+ XLogRecPtr wantPage, uint32 wantLength);
+
+extern void WALReadRaiseError(WALReadError *errinfo);
+
+#endif