Diffstat (limited to 'src/include/access')
88 files changed, 16892 insertions, 0 deletions
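For orientation before the diff body: amapi.h (the first header below) defines IndexAmRoutine, the struct that an index access method's SQL-callable handler function builds and returns. The following sketch is illustrative only and is not part of the diff; it is loosely modeled on how core and contrib AMs fill in IndexAmRoutine, and the myam_* callback names are hypothetical, assumed to be implemented elsewhere.

/*
 * Illustrative sketch (not part of the diff): a minimal index AM handler.
 * The myam_* callbacks are hypothetical and assumed to be defined elsewhere.
 */
#include "postgres.h"
#include "access/amapi.h"
#include "fmgr.h"

PG_MODULE_MAGIC;

PG_FUNCTION_INFO_V1(myam_handler);

Datum
myam_handler(PG_FUNCTION_ARGS)
{
	/* must be a single palloc'd chunk; makeNode() zero-fills it */
	IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

	amroutine->amstrategies = 1;	/* fixed set of operator strategies */
	amroutine->amsupport = 1;		/* one required support function */
	amroutine->amoptsprocnum = 0;	/* no opclass options support proc */
	amroutine->amcanorder = false;
	amroutine->amcanorderbyop = false;
	amroutine->amcanbackward = false;
	amroutine->amcanunique = false;
	amroutine->amcanmulticol = true;
	amroutine->amoptionalkey = true;
	amroutine->amsearcharray = false;
	amroutine->amsearchnulls = false;
	amroutine->amstorage = false;
	amroutine->amclusterable = false;
	amroutine->ampredlocks = false;
	amroutine->amcanparallel = false;
	amroutine->amcaninclude = false;
	amroutine->amusemaintenanceworkmem = false;
	amroutine->amsummarizing = false;
	amroutine->amparallelvacuumoptions = 0;
	amroutine->amkeytype = InvalidOid;

	/* required callbacks (hypothetical implementations) */
	amroutine->ambuild = myam_build;
	amroutine->ambuildempty = myam_buildempty;
	amroutine->aminsert = myam_insert;
	amroutine->ambulkdelete = myam_bulkdelete;
	amroutine->amvacuumcleanup = myam_vacuumcleanup;
	amroutine->amcostestimate = myam_costestimate;
	amroutine->amoptions = myam_options;
	amroutine->amvalidate = myam_validate;
	amroutine->ambeginscan = myam_beginscan;
	amroutine->amrescan = myam_rescan;
	amroutine->amgetbitmap = myam_getbitmap;
	amroutine->amendscan = myam_endscan;

	/* optional callbacks left NULL (amcanreturn, amgettuple, etc.) */

	PG_RETURN_POINTER(amroutine);
}

The handler is then referenced by CREATE ACCESS METHOD; optional callbacks marked "can be NULL" in the struct below may simply be left unset, since makeNode() zeroes the allocation.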
diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h new file mode 100644 index 0000000..4476ff7 --- /dev/null +++ b/src/include/access/amapi.h @@ -0,0 +1,292 @@ +/*------------------------------------------------------------------------- + * + * amapi.h + * API for Postgres index access methods. + * + * Copyright (c) 2015-2023, PostgreSQL Global Development Group + * + * src/include/access/amapi.h + * + *------------------------------------------------------------------------- + */ +#ifndef AMAPI_H +#define AMAPI_H + +#include "access/genam.h" + +/* + * We don't wish to include planner header files here, since most of an index + * AM's implementation isn't concerned with those data structures. To allow + * declaring amcostestimate_function here, use forward struct references. + */ +struct PlannerInfo; +struct IndexPath; + +/* Likewise, this file shouldn't depend on execnodes.h. */ +struct IndexInfo; + + +/* + * Properties for amproperty API. This list covers properties known to the + * core code, but an index AM can define its own properties, by matching the + * string property name. + */ +typedef enum IndexAMProperty +{ + AMPROP_UNKNOWN = 0, /* anything not known to core code */ + AMPROP_ASC, /* column properties */ + AMPROP_DESC, + AMPROP_NULLS_FIRST, + AMPROP_NULLS_LAST, + AMPROP_ORDERABLE, + AMPROP_DISTANCE_ORDERABLE, + AMPROP_RETURNABLE, + AMPROP_SEARCH_ARRAY, + AMPROP_SEARCH_NULLS, + AMPROP_CLUSTERABLE, /* index properties */ + AMPROP_INDEX_SCAN, + AMPROP_BITMAP_SCAN, + AMPROP_BACKWARD_SCAN, + AMPROP_CAN_ORDER, /* AM properties */ + AMPROP_CAN_UNIQUE, + AMPROP_CAN_MULTI_COL, + AMPROP_CAN_EXCLUDE, + AMPROP_CAN_INCLUDE +} IndexAMProperty; + +/* + * We use lists of this struct type to keep track of both operators and + * support functions while building or adding to an opclass or opfamily. + * amadjustmembers functions receive lists of these structs, and are allowed + * to alter their "ref" fields. + * + * The "ref" fields define how the pg_amop or pg_amproc entry should depend + * on the associated objects (that is, which dependency type to use, and + * which opclass or opfamily it should depend on). + * + * If ref_is_hard is true, the entry will have a NORMAL dependency on the + * operator or support func, and an INTERNAL dependency on the opclass or + * opfamily. This forces the opclass or opfamily to be dropped if the + * operator or support func is dropped, and requires the CASCADE option + * to do so. Nor will ALTER OPERATOR FAMILY DROP be allowed. This is + * the right behavior for objects that are essential to an opclass. + * + * If ref_is_hard is false, the entry will have an AUTO dependency on the + * operator or support func, and also an AUTO dependency on the opclass or + * opfamily. This allows ALTER OPERATOR FAMILY DROP, and causes that to + * happen automatically if the operator or support func is dropped. This + * is the right behavior for inessential ("loose") objects. + */ +typedef struct OpFamilyMember +{ + bool is_func; /* is this an operator, or support func? */ + Oid object; /* operator or support func's OID */ + int number; /* strategy or support func number */ + Oid lefttype; /* lefttype */ + Oid righttype; /* righttype */ + Oid sortfamily; /* ordering operator's sort opfamily, or 0 */ + bool ref_is_hard; /* hard or soft dependency? */ + bool ref_is_family; /* is dependency on opclass or opfamily? */ + Oid refobjid; /* OID of opclass or opfamily */ +} OpFamilyMember; + + +/* + * Callback function signatures --- see indexam.sgml for more info. 
+ */ + +/* build new index */ +typedef IndexBuildResult *(*ambuild_function) (Relation heapRelation, + Relation indexRelation, + struct IndexInfo *indexInfo); + +/* build empty index */ +typedef void (*ambuildempty_function) (Relation indexRelation); + +/* insert this tuple */ +typedef bool (*aminsert_function) (Relation indexRelation, + Datum *values, + bool *isnull, + ItemPointer heap_tid, + Relation heapRelation, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + struct IndexInfo *indexInfo); + +/* bulk delete */ +typedef IndexBulkDeleteResult *(*ambulkdelete_function) (IndexVacuumInfo *info, + IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, + void *callback_state); + +/* post-VACUUM cleanup */ +typedef IndexBulkDeleteResult *(*amvacuumcleanup_function) (IndexVacuumInfo *info, + IndexBulkDeleteResult *stats); + +/* can indexscan return IndexTuples? */ +typedef bool (*amcanreturn_function) (Relation indexRelation, int attno); + +/* estimate cost of an indexscan */ +typedef void (*amcostestimate_function) (struct PlannerInfo *root, + struct IndexPath *path, + double loop_count, + Cost *indexStartupCost, + Cost *indexTotalCost, + Selectivity *indexSelectivity, + double *indexCorrelation, + double *indexPages); + +/* parse index reloptions */ +typedef bytea *(*amoptions_function) (Datum reloptions, + bool validate); + +/* report AM, index, or index column property */ +typedef bool (*amproperty_function) (Oid index_oid, int attno, + IndexAMProperty prop, const char *propname, + bool *res, bool *isnull); + +/* name of phase as used in progress reporting */ +typedef char *(*ambuildphasename_function) (int64 phasenum); + +/* validate definition of an opclass for this AM */ +typedef bool (*amvalidate_function) (Oid opclassoid); + +/* validate operators and support functions to be added to an opclass/family */ +typedef void (*amadjustmembers_function) (Oid opfamilyoid, + Oid opclassoid, + List *operators, + List *functions); + +/* prepare for index scan */ +typedef IndexScanDesc (*ambeginscan_function) (Relation indexRelation, + int nkeys, + int norderbys); + +/* (re)start index scan */ +typedef void (*amrescan_function) (IndexScanDesc scan, + ScanKey keys, + int nkeys, + ScanKey orderbys, + int norderbys); + +/* next valid tuple */ +typedef bool (*amgettuple_function) (IndexScanDesc scan, + ScanDirection direction); + +/* fetch all valid tuples */ +typedef int64 (*amgetbitmap_function) (IndexScanDesc scan, + TIDBitmap *tbm); + +/* end index scan */ +typedef void (*amendscan_function) (IndexScanDesc scan); + +/* mark current scan position */ +typedef void (*ammarkpos_function) (IndexScanDesc scan); + +/* restore marked scan position */ +typedef void (*amrestrpos_function) (IndexScanDesc scan); + +/* + * Callback function signatures - for parallel index scans. + */ + +/* estimate size of parallel scan descriptor */ +typedef Size (*amestimateparallelscan_function) (void); + +/* prepare for parallel index scan */ +typedef void (*aminitparallelscan_function) (void *target); + +/* (re)start parallel index scan */ +typedef void (*amparallelrescan_function) (IndexScanDesc scan); + +/* + * API struct for an index AM. Note this must be stored in a single palloc'd + * chunk of memory. + */ +typedef struct IndexAmRoutine +{ + NodeTag type; + + /* + * Total number of strategies (operators) by which we can traverse/search + * this AM. Zero if AM does not have a fixed set of strategy assignments. 
+ */ + uint16 amstrategies; + /* total number of support functions that this AM uses */ + uint16 amsupport; + /* opclass options support function number or 0 */ + uint16 amoptsprocnum; + /* does AM support ORDER BY indexed column's value? */ + bool amcanorder; + /* does AM support ORDER BY result of an operator on indexed column? */ + bool amcanorderbyop; + /* does AM support backward scanning? */ + bool amcanbackward; + /* does AM support UNIQUE indexes? */ + bool amcanunique; + /* does AM support multi-column indexes? */ + bool amcanmulticol; + /* does AM require scans to have a constraint on the first index column? */ + bool amoptionalkey; + /* does AM handle ScalarArrayOpExpr quals? */ + bool amsearcharray; + /* does AM handle IS NULL/IS NOT NULL quals? */ + bool amsearchnulls; + /* can index storage data type differ from column data type? */ + bool amstorage; + /* can an index of this type be clustered on? */ + bool amclusterable; + /* does AM handle predicate locks? */ + bool ampredlocks; + /* does AM support parallel scan? */ + bool amcanparallel; + /* does AM support columns included with clause INCLUDE? */ + bool amcaninclude; + /* does AM use maintenance_work_mem? */ + bool amusemaintenanceworkmem; + /* does AM store tuple information only at block granularity? */ + bool amsummarizing; + /* OR of parallel vacuum flags. See vacuum.h for flags. */ + uint8 amparallelvacuumoptions; + /* type of data stored in index, or InvalidOid if variable */ + Oid amkeytype; + + /* + * If you add new properties to either the above or the below lists, then + * they should also (usually) be exposed via the property API (see + * IndexAMProperty at the top of the file, and utils/adt/amutils.c). + */ + + /* interface functions */ + ambuild_function ambuild; + ambuildempty_function ambuildempty; + aminsert_function aminsert; + ambulkdelete_function ambulkdelete; + amvacuumcleanup_function amvacuumcleanup; + amcanreturn_function amcanreturn; /* can be NULL */ + amcostestimate_function amcostestimate; + amoptions_function amoptions; + amproperty_function amproperty; /* can be NULL */ + ambuildphasename_function ambuildphasename; /* can be NULL */ + amvalidate_function amvalidate; + amadjustmembers_function amadjustmembers; /* can be NULL */ + ambeginscan_function ambeginscan; + amrescan_function amrescan; + amgettuple_function amgettuple; /* can be NULL */ + amgetbitmap_function amgetbitmap; /* can be NULL */ + amendscan_function amendscan; + ammarkpos_function ammarkpos; /* can be NULL */ + amrestrpos_function amrestrpos; /* can be NULL */ + + /* interface functions to support parallel index scans */ + amestimateparallelscan_function amestimateparallelscan; /* can be NULL */ + aminitparallelscan_function aminitparallelscan; /* can be NULL */ + amparallelrescan_function amparallelrescan; /* can be NULL */ +} IndexAmRoutine; + + +/* Functions in access/index/amapi.c */ +extern IndexAmRoutine *GetIndexAmRoutine(Oid amhandler); +extern IndexAmRoutine *GetIndexAmRoutineByAmId(Oid amoid, bool noerror); + +#endif /* AMAPI_H */ diff --git a/src/include/access/amvalidate.h b/src/include/access/amvalidate.h new file mode 100644 index 0000000..63d0e49 --- /dev/null +++ b/src/include/access/amvalidate.h @@ -0,0 +1,40 @@ +/*------------------------------------------------------------------------- + * + * amvalidate.h + * Support routines for index access methods' amvalidate and + * amadjustmembers functions. 
+ * + * Copyright (c) 2016-2023, PostgreSQL Global Development Group + * + * src/include/access/amvalidate.h + * + *------------------------------------------------------------------------- + */ +#ifndef AMVALIDATE_H +#define AMVALIDATE_H + +#include "utils/catcache.h" + + +/* Struct returned (in a list) by identify_opfamily_groups() */ +typedef struct OpFamilyOpFuncGroup +{ + Oid lefttype; /* amoplefttype/amproclefttype */ + Oid righttype; /* amoprighttype/amprocrighttype */ + uint64 operatorset; /* bitmask of operators with these types */ + uint64 functionset; /* bitmask of support funcs with these types */ +} OpFamilyOpFuncGroup; + + +/* Functions in access/index/amvalidate.c */ +extern List *identify_opfamily_groups(CatCList *oprlist, CatCList *proclist); +extern bool check_amproc_signature(Oid funcid, Oid restype, bool exact, + int minargs, int maxargs,...); +extern bool check_amoptsproc_signature(Oid funcid); +extern bool check_amop_signature(Oid opno, Oid restype, + Oid lefttype, Oid righttype); +extern Oid opclass_for_family_datatype(Oid amoid, Oid opfamilyoid, + Oid datatypeoid); +extern bool opfamily_can_sort_type(Oid opfamilyoid, Oid datatypeoid); + +#endif /* AMVALIDATE_H */ diff --git a/src/include/access/attmap.h b/src/include/access/attmap.h new file mode 100644 index 0000000..9156a12 --- /dev/null +++ b/src/include/access/attmap.h @@ -0,0 +1,54 @@ +/*------------------------------------------------------------------------- + * + * attmap.h + * Definitions for PostgreSQL attribute mappings + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/attmap.h + * + *------------------------------------------------------------------------- + */ +#ifndef ATTMAP_H +#define ATTMAP_H + +#include "access/attnum.h" +#include "access/tupdesc.h" + +/* + * Attribute mapping structure + * + * This maps attribute numbers between a pair of relations, designated + * 'input' and 'output' (most typically inheritance parent and child + * relations), whose common columns may have different attribute numbers. + * Such difference may arise due to the columns being ordered differently + * in the two relations or the two relations having dropped columns at + * different positions. + * + * 'maplen' is set to the number of attributes of the 'output' relation, + * taking into account any of its dropped attributes, with the corresponding + * elements of the 'attnums' array set to 0. + */ +typedef struct AttrMap +{ + AttrNumber *attnums; + int maplen; +} AttrMap; + +extern AttrMap *make_attrmap(int maplen); +extern void free_attrmap(AttrMap *map); + +/* Conversion routines to build mappings */ +extern AttrMap *build_attrmap_by_name(TupleDesc indesc, + TupleDesc outdesc, + bool missing_ok); +extern AttrMap *build_attrmap_by_name_if_req(TupleDesc indesc, + TupleDesc outdesc, + bool missing_ok); +extern AttrMap *build_attrmap_by_position(TupleDesc indesc, + TupleDesc outdesc, + const char *msg); + +#endif /* ATTMAP_H */ diff --git a/src/include/access/attnum.h b/src/include/access/attnum.h new file mode 100644 index 0000000..4c43295 --- /dev/null +++ b/src/include/access/attnum.h @@ -0,0 +1,64 @@ +/*------------------------------------------------------------------------- + * + * attnum.h + * POSTGRES attribute number definitions. 
+ * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/attnum.h + * + *------------------------------------------------------------------------- + */ +#ifndef ATTNUM_H +#define ATTNUM_H + + +/* + * user defined attribute numbers start at 1. -ay 2/95 + */ +typedef int16 AttrNumber; + +#define InvalidAttrNumber 0 +#define MaxAttrNumber 32767 + +/* ---------------- + * support macros + * ---------------- + */ +/* + * AttributeNumberIsValid + * True iff the attribute number is valid. + */ +#define AttributeNumberIsValid(attributeNumber) \ + ((bool) ((attributeNumber) != InvalidAttrNumber)) + +/* + * AttrNumberIsForUserDefinedAttr + * True iff the attribute number corresponds to a user defined attribute. + */ +#define AttrNumberIsForUserDefinedAttr(attributeNumber) \ + ((bool) ((attributeNumber) > 0)) + +/* + * AttrNumberGetAttrOffset + * Returns the attribute offset for an attribute number. + * + * Note: + * Assumes the attribute number is for a user defined attribute. + */ +#define AttrNumberGetAttrOffset(attNum) \ +( \ + AssertMacro(AttrNumberIsForUserDefinedAttr(attNum)), \ + ((attNum) - 1) \ +) + +/* + * AttrOffsetGetAttrNumber + * Returns the attribute number for an attribute offset. + */ +#define AttrOffsetGetAttrNumber(attributeOffset) \ + ((AttrNumber) (1 + (attributeOffset))) + +#endif /* ATTNUM_H */ diff --git a/src/include/access/brin.h b/src/include/access/brin.h new file mode 100644 index 0000000..ed66f1b --- /dev/null +++ b/src/include/access/brin.h @@ -0,0 +1,55 @@ +/* + * AM-callable functions for BRIN indexes + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/include/access/brin.h + */ +#ifndef BRIN_H +#define BRIN_H + +#include "nodes/execnodes.h" +#include "utils/relcache.h" + + +/* + * Storage type for BRIN's reloptions + */ +typedef struct BrinOptions +{ + int32 vl_len_; /* varlena header (do not touch directly!) */ + BlockNumber pagesPerRange; + bool autosummarize; +} BrinOptions; + + +/* + * BrinStatsData represents stats data for planner use + */ +typedef struct BrinStatsData +{ + BlockNumber pagesPerRange; + BlockNumber revmapNumPages; +} BrinStatsData; + + +#define BRIN_DEFAULT_PAGES_PER_RANGE 128 +#define BrinGetPagesPerRange(relation) \ + (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \ + relation->rd_rel->relam == BRIN_AM_OID), \ + (relation)->rd_options ? \ + ((BrinOptions *) (relation)->rd_options)->pagesPerRange : \ + BRIN_DEFAULT_PAGES_PER_RANGE) +#define BrinGetAutoSummarize(relation) \ + (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \ + relation->rd_rel->relam == BRIN_AM_OID), \ + (relation)->rd_options ? 
\ + ((BrinOptions *) (relation)->rd_options)->autosummarize : \ + false) + + +extern void brinGetStats(Relation index, BrinStatsData *stats); + +#endif /* BRIN_H */ diff --git a/src/include/access/brin_internal.h b/src/include/access/brin_internal.h new file mode 100644 index 0000000..97ddc92 --- /dev/null +++ b/src/include/access/brin_internal.h @@ -0,0 +1,115 @@ +/* + * brin_internal.h + * internal declarations for BRIN indexes + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/include/access/brin_internal.h + */ +#ifndef BRIN_INTERNAL_H +#define BRIN_INTERNAL_H + +#include "access/amapi.h" +#include "storage/bufpage.h" +#include "utils/typcache.h" + + +/* + * A BrinDesc is a struct designed to enable decoding a BRIN tuple from the + * on-disk format to an in-memory tuple and vice-versa. + */ + +/* struct returned by "OpcInfo" amproc */ +typedef struct BrinOpcInfo +{ + /* Number of columns stored in an index column of this opclass */ + uint16 oi_nstored; + + /* Regular processing of NULLs in BrinValues? */ + bool oi_regular_nulls; + + /* Opaque pointer for the opclass' private use */ + void *oi_opaque; + + /* Type cache entries of the stored columns */ + TypeCacheEntry *oi_typcache[FLEXIBLE_ARRAY_MEMBER]; +} BrinOpcInfo; + +/* the size of a BrinOpcInfo for the given number of columns */ +#define SizeofBrinOpcInfo(ncols) \ + (offsetof(BrinOpcInfo, oi_typcache) + sizeof(TypeCacheEntry *) * ncols) + +typedef struct BrinDesc +{ + /* Containing memory context */ + MemoryContext bd_context; + + /* the index relation itself */ + Relation bd_index; + + /* tuple descriptor of the index relation */ + TupleDesc bd_tupdesc; + + /* cached copy for on-disk tuples; generated at first use */ + TupleDesc bd_disktdesc; + + /* total number of Datum entries that are stored on-disk for all columns */ + int bd_totalstored; + + /* per-column info; bd_tupdesc->natts entries long */ + BrinOpcInfo *bd_info[FLEXIBLE_ARRAY_MEMBER]; +} BrinDesc; + +/* + * Globally-known function support numbers for BRIN indexes. Individual + * opclasses can define more function support numbers, which must fall into + * BRIN_FIRST_OPTIONAL_PROCNUM .. BRIN_LAST_OPTIONAL_PROCNUM. 
+ */ +#define BRIN_PROCNUM_OPCINFO 1 +#define BRIN_PROCNUM_ADDVALUE 2 +#define BRIN_PROCNUM_CONSISTENT 3 +#define BRIN_PROCNUM_UNION 4 +#define BRIN_MANDATORY_NPROCS 4 +#define BRIN_PROCNUM_OPTIONS 5 /* optional */ +/* procedure numbers up to 10 are reserved for BRIN future expansion */ +#define BRIN_FIRST_OPTIONAL_PROCNUM 11 +#define BRIN_LAST_OPTIONAL_PROCNUM 15 + +#undef BRIN_DEBUG + +#ifdef BRIN_DEBUG +#define BRIN_elog(args) elog args +#else +#define BRIN_elog(args) ((void) 0) +#endif + +/* brin.c */ +extern BrinDesc *brin_build_desc(Relation rel); +extern void brin_free_desc(BrinDesc *bdesc); +extern IndexBuildResult *brinbuild(Relation heap, Relation index, + struct IndexInfo *indexInfo); +extern void brinbuildempty(Relation index); +extern bool brininsert(Relation idxRel, Datum *values, bool *nulls, + ItemPointer heaptid, Relation heapRel, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + struct IndexInfo *indexInfo); +extern IndexScanDesc brinbeginscan(Relation r, int nkeys, int norderbys); +extern int64 bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm); +extern void brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, + ScanKey orderbys, int norderbys); +extern void brinendscan(IndexScanDesc scan); +extern IndexBulkDeleteResult *brinbulkdelete(IndexVacuumInfo *info, + IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, + void *callback_state); +extern IndexBulkDeleteResult *brinvacuumcleanup(IndexVacuumInfo *info, + IndexBulkDeleteResult *stats); +extern bytea *brinoptions(Datum reloptions, bool validate); + +/* brin_validate.c */ +extern bool brinvalidate(Oid opclassoid); + +#endif /* BRIN_INTERNAL_H */ diff --git a/src/include/access/brin_page.h b/src/include/access/brin_page.h new file mode 100644 index 0000000..3670ca6 --- /dev/null +++ b/src/include/access/brin_page.h @@ -0,0 +1,96 @@ +/* + * brin_page.h + * Prototypes and definitions for BRIN page layouts + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/include/access/brin_page.h + * + * NOTES + * + * These structs should really be private to specific BRIN files, but it's + * useful to have them here so that they can be used by pageinspect and similar + * tools. + */ +#ifndef BRIN_PAGE_H +#define BRIN_PAGE_H + +#include "storage/block.h" +#include "storage/itemptr.h" + +/* + * Special area of BRIN pages. + * + * We define it in this odd way so that it always occupies the last + * MAXALIGN-sized element of each page. + */ +typedef struct BrinSpecialSpace +{ + uint16 vector[MAXALIGN(1) / sizeof(uint16)]; +} BrinSpecialSpace; + +/* + * Make the page type be the last half-word in the page, for consumption by + * pg_filedump and similar utilities. We don't really care much about the + * position of the "flags" half-word, but it's simpler to apply a consistent + * rule to both. + * + * See comments above GinPageOpaqueData. 
+ */ +#define BrinPageType(page) \ + (((BrinSpecialSpace *) \ + PageGetSpecialPointer(page))->vector[MAXALIGN(1) / sizeof(uint16) - 1]) + +#define BrinPageFlags(page) \ + (((BrinSpecialSpace *) \ + PageGetSpecialPointer(page))->vector[MAXALIGN(1) / sizeof(uint16) - 2]) + +/* special space on all BRIN pages stores a "type" identifier */ +#define BRIN_PAGETYPE_META 0xF091 +#define BRIN_PAGETYPE_REVMAP 0xF092 +#define BRIN_PAGETYPE_REGULAR 0xF093 + +#define BRIN_IS_META_PAGE(page) (BrinPageType(page) == BRIN_PAGETYPE_META) +#define BRIN_IS_REVMAP_PAGE(page) (BrinPageType(page) == BRIN_PAGETYPE_REVMAP) +#define BRIN_IS_REGULAR_PAGE(page) (BrinPageType(page) == BRIN_PAGETYPE_REGULAR) + +/* flags for BrinSpecialSpace */ +#define BRIN_EVACUATE_PAGE (1 << 0) + + +/* Metapage definitions */ +typedef struct BrinMetaPageData +{ + uint32 brinMagic; + uint32 brinVersion; + BlockNumber pagesPerRange; + BlockNumber lastRevmapPage; +} BrinMetaPageData; + +#define BRIN_CURRENT_VERSION 1 +#define BRIN_META_MAGIC 0xA8109CFA + +#define BRIN_METAPAGE_BLKNO 0 + +/* Definitions for revmap pages */ +typedef struct RevmapContents +{ + /* + * This array will fill all available space on the page. It should be + * declared [FLEXIBLE_ARRAY_MEMBER], but for some reason you can't do that + * in an otherwise-empty struct. + */ + ItemPointerData rm_tids[1]; +} RevmapContents; + +#define REVMAP_CONTENT_SIZE \ + (BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - \ + offsetof(RevmapContents, rm_tids) - \ + MAXALIGN(sizeof(BrinSpecialSpace))) +/* max num of items in the array */ +#define REVMAP_PAGE_MAXITEMS \ + (REVMAP_CONTENT_SIZE / sizeof(ItemPointerData)) + +#endif /* BRIN_PAGE_H */ diff --git a/src/include/access/brin_pageops.h b/src/include/access/brin_pageops.h new file mode 100644 index 0000000..041cc32 --- /dev/null +++ b/src/include/access/brin_pageops.h @@ -0,0 +1,38 @@ +/* + * brin_pageops.h + * Prototypes for operating on BRIN pages. 
+ * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/include/access/brin_pageops.h + */ +#ifndef BRIN_PAGEOPS_H +#define BRIN_PAGEOPS_H + +#include "access/brin_revmap.h" + +extern bool brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, + BrinRevmap *revmap, BlockNumber heapBlk, + Buffer oldbuf, OffsetNumber oldoff, + const BrinTuple *origtup, Size origsz, + const BrinTuple *newtup, Size newsz, + bool samepage); +extern bool brin_can_do_samepage_update(Buffer buffer, Size origsz, + Size newsz); +extern OffsetNumber brin_doinsert(Relation idxrel, BlockNumber pagesPerRange, + BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk, + BrinTuple *tup, Size itemsz); + +extern void brin_page_init(Page page, uint16 type); +extern void brin_metapage_init(Page page, BlockNumber pagesPerRange, + uint16 version); + +extern bool brin_start_evacuating_page(Relation idxRel, Buffer buf); +extern void brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange, + BrinRevmap *revmap, Buffer buf); + +extern void brin_page_cleanup(Relation idxrel, Buffer buf); + +#endif /* BRIN_PAGEOPS_H */ diff --git a/src/include/access/brin_revmap.h b/src/include/access/brin_revmap.h new file mode 100644 index 0000000..7532363 --- /dev/null +++ b/src/include/access/brin_revmap.h @@ -0,0 +1,41 @@ +/* + * brin_revmap.h + * Prototypes for BRIN reverse range maps + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/include/access/brin_revmap.h + */ + +#ifndef BRIN_REVMAP_H +#define BRIN_REVMAP_H + +#include "access/brin_tuple.h" +#include "storage/block.h" +#include "storage/buf.h" +#include "storage/itemptr.h" +#include "storage/off.h" +#include "utils/relcache.h" +#include "utils/snapshot.h" + +/* struct definition lives in brin_revmap.c */ +typedef struct BrinRevmap BrinRevmap; + +extern BrinRevmap *brinRevmapInitialize(Relation idxrel, + BlockNumber *pagesPerRange, Snapshot snapshot); +extern void brinRevmapTerminate(BrinRevmap *revmap); + +extern void brinRevmapExtend(BrinRevmap *revmap, + BlockNumber heapBlk); +extern Buffer brinLockRevmapPageForUpdate(BrinRevmap *revmap, + BlockNumber heapBlk); +extern void brinSetHeapBlockItemptr(Buffer buf, BlockNumber pagesPerRange, + BlockNumber heapBlk, ItemPointerData tid); +extern BrinTuple *brinGetTupleForHeapBlock(BrinRevmap *revmap, + BlockNumber heapBlk, Buffer *buf, OffsetNumber *off, + Size *size, int mode, Snapshot snapshot); +extern bool brinRevmapDesummarizeRange(Relation idxrel, BlockNumber heapBlk); + +#endif /* BRIN_REVMAP_H */ diff --git a/src/include/access/brin_tuple.h b/src/include/access/brin_tuple.h new file mode 100644 index 0000000..6f33ba6 --- /dev/null +++ b/src/include/access/brin_tuple.h @@ -0,0 +1,112 @@ +/* + * brin_tuple.h + * Declarations for dealing with BRIN-specific tuples. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/include/access/brin_tuple.h + */ +#ifndef BRIN_TUPLE_H +#define BRIN_TUPLE_H + +#include "access/brin_internal.h" +#include "access/tupdesc.h" + +/* + * The BRIN opclasses may register serialization callback, in case the on-disk + * and in-memory representations differ (e.g. for performance reasons). 
+ */ +typedef void (*brin_serialize_callback_type) (BrinDesc *bdesc, Datum src, Datum *dst); + +/* + * A BRIN index stores one index tuple per page range. Each index tuple + * has one BrinValues struct for each indexed column; in turn, each BrinValues + * has (besides the null flags) an array of Datum whose size is determined by + * the opclass. + */ +typedef struct BrinValues +{ + AttrNumber bv_attno; /* index attribute number */ + bool bv_hasnulls; /* are there any nulls in the page range? */ + bool bv_allnulls; /* are all values nulls in the page range? */ + Datum *bv_values; /* current accumulated values */ + Datum bv_mem_value; /* expanded accumulated values */ + MemoryContext bv_context; + brin_serialize_callback_type bv_serialize; +} BrinValues; + +/* + * This struct is used to represent an in-memory index tuple. The values can + * only be meaningfully decoded with an appropriate BrinDesc. + */ +typedef struct BrinMemTuple +{ + bool bt_placeholder; /* this is a placeholder tuple */ + bool bt_empty_range; /* range represents no tuples */ + BlockNumber bt_blkno; /* heap blkno that the tuple is for */ + MemoryContext bt_context; /* memcxt holding the bt_columns values */ + /* output arrays for brin_deform_tuple: */ + Datum *bt_values; /* values array */ + bool *bt_allnulls; /* allnulls array */ + bool *bt_hasnulls; /* hasnulls array */ + /* not an output array, but must be last */ + BrinValues bt_columns[FLEXIBLE_ARRAY_MEMBER]; +} BrinMemTuple; + +/* + * An on-disk BRIN tuple. This is possibly followed by a nulls bitmask, with + * room for 2 null bits (two bits for each indexed column); an opclass-defined + * number of Datum values for each column follow. + */ +typedef struct BrinTuple +{ + /* heap block number that the tuple is for */ + BlockNumber bt_blkno; + + /* --------------- + * bt_info is laid out in the following fashion: + * + * 7th (high) bit: has nulls + * 6th bit: is placeholder tuple + * 5th bit: range is empty + * 4-0 bit: offset of data + * --------------- + */ + uint8 bt_info; +} BrinTuple; + +#define SizeOfBrinTuple (offsetof(BrinTuple, bt_info) + sizeof(uint8)) + +/* + * bt_info manipulation macros + */ +#define BRIN_OFFSET_MASK 0x1F +#define BRIN_EMPTY_RANGE_MASK 0x20 +#define BRIN_PLACEHOLDER_MASK 0x40 +#define BRIN_NULLS_MASK 0x80 + +#define BrinTupleDataOffset(tup) ((Size) (((BrinTuple *) (tup))->bt_info & BRIN_OFFSET_MASK)) +#define BrinTupleHasNulls(tup) (((((BrinTuple *) (tup))->bt_info & BRIN_NULLS_MASK)) != 0) +#define BrinTupleIsPlaceholder(tup) (((((BrinTuple *) (tup))->bt_info & BRIN_PLACEHOLDER_MASK)) != 0) +#define BrinTupleIsEmptyRange(tup) (((((BrinTuple *) (tup))->bt_info & BRIN_EMPTY_RANGE_MASK)) != 0) + + +extern BrinTuple *brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, + BrinMemTuple *tuple, Size *size); +extern BrinTuple *brin_form_placeholder_tuple(BrinDesc *brdesc, + BlockNumber blkno, Size *size); +extern void brin_free_tuple(BrinTuple *tuple); +extern BrinTuple *brin_copy_tuple(BrinTuple *tuple, Size len, + BrinTuple *dest, Size *destsz); +extern bool brin_tuples_equal(const BrinTuple *a, Size alen, + const BrinTuple *b, Size blen); + +extern BrinMemTuple *brin_new_memtuple(BrinDesc *brdesc); +extern BrinMemTuple *brin_memtuple_initialize(BrinMemTuple *dtuple, + BrinDesc *brdesc); +extern BrinMemTuple *brin_deform_tuple(BrinDesc *brdesc, + BrinTuple *tuple, BrinMemTuple *dMemtuple); + +#endif /* BRIN_TUPLE_H */ diff --git a/src/include/access/brin_xlog.h b/src/include/access/brin_xlog.h new file mode 100644 index 0000000..ae263a3 --- 
/dev/null +++ b/src/include/access/brin_xlog.h @@ -0,0 +1,151 @@ +/*------------------------------------------------------------------------- + * + * brin_xlog.h + * POSTGRES BRIN access XLOG definitions. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/brin_xlog.h + * + *------------------------------------------------------------------------- + */ +#ifndef BRIN_XLOG_H +#define BRIN_XLOG_H + +#include "access/xlogreader.h" +#include "lib/stringinfo.h" +#include "storage/bufpage.h" +#include "storage/itemptr.h" +#include "storage/relfilelocator.h" +#include "utils/relcache.h" + + +/* + * WAL record definitions for BRIN's WAL operations + * + * XLOG allows to store some information in high 4 bits of log + * record xl_info field. + */ +#define XLOG_BRIN_CREATE_INDEX 0x00 +#define XLOG_BRIN_INSERT 0x10 +#define XLOG_BRIN_UPDATE 0x20 +#define XLOG_BRIN_SAMEPAGE_UPDATE 0x30 +#define XLOG_BRIN_REVMAP_EXTEND 0x40 +#define XLOG_BRIN_DESUMMARIZE 0x50 + +#define XLOG_BRIN_OPMASK 0x70 +/* + * When we insert the first item on a new page, we restore the entire page in + * redo. + */ +#define XLOG_BRIN_INIT_PAGE 0x80 + +/* + * This is what we need to know about a BRIN index create. + * + * Backup block 0: metapage + */ +typedef struct xl_brin_createidx +{ + BlockNumber pagesPerRange; + uint16 version; +} xl_brin_createidx; +#define SizeOfBrinCreateIdx (offsetof(xl_brin_createidx, version) + sizeof(uint16)) + +/* + * This is what we need to know about a BRIN tuple insert + * + * Backup block 0: main page, block data is the new BrinTuple. + * Backup block 1: revmap page + */ +typedef struct xl_brin_insert +{ + BlockNumber heapBlk; + + /* extra information needed to update the revmap */ + BlockNumber pagesPerRange; + + /* offset number in the main page to insert the tuple to. */ + OffsetNumber offnum; +} xl_brin_insert; + +#define SizeOfBrinInsert (offsetof(xl_brin_insert, offnum) + sizeof(OffsetNumber)) + +/* + * A cross-page update is the same as an insert, but also stores information + * about the old tuple. + * + * Like in xl_brin_insert: + * Backup block 0: new page, block data includes the new BrinTuple. + * Backup block 1: revmap page + * + * And in addition: + * Backup block 2: old page + */ +typedef struct xl_brin_update +{ + /* offset number of old tuple on old page */ + OffsetNumber oldOffnum; + + xl_brin_insert insert; +} xl_brin_update; + +#define SizeOfBrinUpdate (offsetof(xl_brin_update, insert) + SizeOfBrinInsert) + +/* + * This is what we need to know about a BRIN tuple samepage update + * + * Backup block 0: updated page, with new BrinTuple as block data + */ +typedef struct xl_brin_samepage_update +{ + OffsetNumber offnum; +} xl_brin_samepage_update; + +#define SizeOfBrinSamepageUpdate (sizeof(OffsetNumber)) + +/* + * This is what we need to know about a revmap extension + * + * Backup block 0: metapage + * Backup block 1: new revmap page + */ +typedef struct xl_brin_revmap_extend +{ + /* + * XXX: This is actually redundant - the block number is stored as part of + * backup block 1. 
+ */ + BlockNumber targetBlk; +} xl_brin_revmap_extend; + +#define SizeOfBrinRevmapExtend (offsetof(xl_brin_revmap_extend, targetBlk) + \ + sizeof(BlockNumber)) + +/* + * This is what we need to know about a range de-summarization + * + * Backup block 0: revmap page + * Backup block 1: regular page + */ +typedef struct xl_brin_desummarize +{ + BlockNumber pagesPerRange; + /* page number location to set to invalid */ + BlockNumber heapBlk; + /* offset of item to delete in regular index page */ + OffsetNumber regOffset; +} xl_brin_desummarize; + +#define SizeOfBrinDesummarize (offsetof(xl_brin_desummarize, regOffset) + \ + sizeof(OffsetNumber)) + + +extern void brin_redo(XLogReaderState *record); +extern void brin_desc(StringInfo buf, XLogReaderState *record); +extern const char *brin_identify(uint8 info); +extern void brin_mask(char *pagedata, BlockNumber blkno); + +#endif /* BRIN_XLOG_H */ diff --git a/src/include/access/bufmask.h b/src/include/access/bufmask.h new file mode 100644 index 0000000..cf29ba4 --- /dev/null +++ b/src/include/access/bufmask.h @@ -0,0 +1,32 @@ +/*------------------------------------------------------------------------- + * + * bufmask.h + * Definitions for buffer masking routines, used to mask certain bits + * in a page which can be different when the WAL is generated + * and when the WAL is applied. This is really the job of each + * individual rmgr, but we make things easier by providing some + * common routines to handle cases which occur in multiple rmgrs. + * + * Portions Copyright (c) 2016-2023, PostgreSQL Global Development Group + * + * src/include/access/bufmask.h + * + *------------------------------------------------------------------------- + */ + +#ifndef BUFMASK_H +#define BUFMASK_H + +#include "storage/block.h" +#include "storage/bufmgr.h" + +/* Marker used to mask pages consistently */ +#define MASK_MARKER 0 + +extern void mask_page_lsn_and_checksum(Page page); +extern void mask_page_hint_bits(Page page); +extern void mask_unused_space(Page page); +extern void mask_lp_flags(Page page); +extern void mask_page_content(Page page); + +#endif diff --git a/src/include/access/clog.h b/src/include/access/clog.h new file mode 100644 index 0000000..d99444f --- /dev/null +++ b/src/include/access/clog.h @@ -0,0 +1,63 @@ +/* + * clog.h + * + * PostgreSQL transaction-commit-log manager + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/clog.h + */ +#ifndef CLOG_H +#define CLOG_H + +#include "access/xlogreader.h" +#include "storage/sync.h" +#include "lib/stringinfo.h" + +/* + * Possible transaction statuses --- note that all-zeroes is the initial + * state. + * + * A "subcommitted" transaction is a committed subtransaction whose parent + * hasn't committed or aborted yet. 
+ */ +typedef int XidStatus; + +#define TRANSACTION_STATUS_IN_PROGRESS 0x00 +#define TRANSACTION_STATUS_COMMITTED 0x01 +#define TRANSACTION_STATUS_ABORTED 0x02 +#define TRANSACTION_STATUS_SUB_COMMITTED 0x03 + +typedef struct xl_clog_truncate +{ + int pageno; + TransactionId oldestXact; + Oid oldestXactDb; +} xl_clog_truncate; + +extern void TransactionIdSetTreeStatus(TransactionId xid, int nsubxids, + TransactionId *subxids, XidStatus status, XLogRecPtr lsn); +extern XidStatus TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn); + +extern Size CLOGShmemBuffers(void); +extern Size CLOGShmemSize(void); +extern void CLOGShmemInit(void); +extern void BootStrapCLOG(void); +extern void StartupCLOG(void); +extern void TrimCLOG(void); +extern void CheckPointCLOG(void); +extern void ExtendCLOG(TransactionId newestXact); +extern void TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid); + +extern int clogsyncfiletag(const FileTag *ftag, char *path); + +/* XLOG stuff */ +#define CLOG_ZEROPAGE 0x00 +#define CLOG_TRUNCATE 0x10 + +extern void clog_redo(XLogReaderState *record); +extern void clog_desc(StringInfo buf, XLogReaderState *record); +extern const char *clog_identify(uint8 info); + +#endif /* CLOG_H */ diff --git a/src/include/access/commit_ts.h b/src/include/access/commit_ts.h new file mode 100644 index 0000000..5087cdc --- /dev/null +++ b/src/include/access/commit_ts.h @@ -0,0 +1,74 @@ +/* + * commit_ts.h + * + * PostgreSQL commit timestamp manager + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/commit_ts.h + */ +#ifndef COMMIT_TS_H +#define COMMIT_TS_H + +#include "access/xlog.h" +#include "datatype/timestamp.h" +#include "replication/origin.h" +#include "storage/sync.h" + + +extern PGDLLIMPORT bool track_commit_timestamp; + +extern void TransactionTreeSetCommitTsData(TransactionId xid, int nsubxids, + TransactionId *subxids, TimestampTz timestamp, + RepOriginId nodeid); +extern bool TransactionIdGetCommitTsData(TransactionId xid, + TimestampTz *ts, RepOriginId *nodeid); +extern TransactionId GetLatestCommitTsData(TimestampTz *ts, + RepOriginId *nodeid); + +extern Size CommitTsShmemBuffers(void); +extern Size CommitTsShmemSize(void); +extern void CommitTsShmemInit(void); +extern void BootStrapCommitTs(void); +extern void StartupCommitTs(void); +extern void CommitTsParameterChange(bool newvalue, bool oldvalue); +extern void CompleteCommitTsInitialization(void); +extern void CheckPointCommitTs(void); +extern void ExtendCommitTs(TransactionId newestXact); +extern void TruncateCommitTs(TransactionId oldestXact); +extern void SetCommitTsLimit(TransactionId oldestXact, + TransactionId newestXact); +extern void AdvanceOldestCommitTsXid(TransactionId oldestXact); + +extern int committssyncfiletag(const FileTag *ftag, char *path); + +/* XLOG stuff */ +#define COMMIT_TS_ZEROPAGE 0x00 +#define COMMIT_TS_TRUNCATE 0x10 + +typedef struct xl_commit_ts_set +{ + TimestampTz timestamp; + RepOriginId nodeid; + TransactionId mainxid; + /* subxact Xids follow */ +} xl_commit_ts_set; + +#define SizeOfCommitTsSet (offsetof(xl_commit_ts_set, mainxid) + \ + sizeof(TransactionId)) + +typedef struct xl_commit_ts_truncate +{ + int pageno; + TransactionId oldestXid; +} xl_commit_ts_truncate; + +#define SizeOfCommitTsTruncate (offsetof(xl_commit_ts_truncate, oldestXid) + \ + sizeof(TransactionId)) + +extern void commit_ts_redo(XLogReaderState *record); +extern void 
commit_ts_desc(StringInfo buf, XLogReaderState *record); +extern const char *commit_ts_identify(uint8 info); + +#endif /* COMMIT_TS_H */ diff --git a/src/include/access/detoast.h b/src/include/access/detoast.h new file mode 100644 index 0000000..908e1fc --- /dev/null +++ b/src/include/access/detoast.h @@ -0,0 +1,82 @@ +/*------------------------------------------------------------------------- + * + * detoast.h + * Access to compressed and external varlena values. + * + * Copyright (c) 2000-2023, PostgreSQL Global Development Group + * + * src/include/access/detoast.h + * + *------------------------------------------------------------------------- + */ +#ifndef DETOAST_H +#define DETOAST_H + +/* + * Macro to fetch the possibly-unaligned contents of an EXTERNAL datum + * into a local "struct varatt_external" toast pointer. This should be + * just a memcpy, but some versions of gcc seem to produce broken code + * that assumes the datum contents are aligned. Introducing an explicit + * intermediate "varattrib_1b_e *" variable seems to fix it. + */ +#define VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr) \ +do { \ + varattrib_1b_e *attre = (varattrib_1b_e *) (attr); \ + Assert(VARATT_IS_EXTERNAL(attre)); \ + Assert(VARSIZE_EXTERNAL(attre) == sizeof(toast_pointer) + VARHDRSZ_EXTERNAL); \ + memcpy(&(toast_pointer), VARDATA_EXTERNAL(attre), sizeof(toast_pointer)); \ +} while (0) + +/* Size of an EXTERNAL datum that contains a standard TOAST pointer */ +#define TOAST_POINTER_SIZE (VARHDRSZ_EXTERNAL + sizeof(varatt_external)) + +/* Size of an EXTERNAL datum that contains an indirection pointer */ +#define INDIRECT_POINTER_SIZE (VARHDRSZ_EXTERNAL + sizeof(varatt_indirect)) + +/* ---------- + * detoast_external_attr() - + * + * Fetches an external stored attribute from the toast + * relation. Does NOT decompress it, if stored external + * in compressed format. + * ---------- + */ +extern struct varlena *detoast_external_attr(struct varlena *attr); + +/* ---------- + * detoast_attr() - + * + * Fully detoasts one attribute, fetching and/or decompressing + * it as needed. + * ---------- + */ +extern struct varlena *detoast_attr(struct varlena *attr); + +/* ---------- + * detoast_attr_slice() - + * + * Fetches only the specified portion of an attribute. + * (Handles all cases for attribute storage) + * ---------- + */ +extern struct varlena *detoast_attr_slice(struct varlena *attr, + int32 sliceoffset, + int32 slicelength); + +/* ---------- + * toast_raw_datum_size - + * + * Return the raw (detoasted) size of a varlena datum + * ---------- + */ +extern Size toast_raw_datum_size(Datum value); + +/* ---------- + * toast_datum_size - + * + * Return the storage size of a varlena datum + * ---------- + */ +extern Size toast_datum_size(Datum value); + +#endif /* DETOAST_H */ diff --git a/src/include/access/genam.h b/src/include/access/genam.h new file mode 100644 index 0000000..b071ced --- /dev/null +++ b/src/include/access/genam.h @@ -0,0 +1,234 @@ +/*------------------------------------------------------------------------- + * + * genam.h + * POSTGRES generalized index access method definitions. 
+ * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/genam.h + * + *------------------------------------------------------------------------- + */ +#ifndef GENAM_H +#define GENAM_H + +#include "access/sdir.h" +#include "access/skey.h" +#include "nodes/tidbitmap.h" +#include "storage/lockdefs.h" +#include "utils/relcache.h" +#include "utils/snapshot.h" + +/* We don't want this file to depend on execnodes.h. */ +struct IndexInfo; + +/* + * Struct for statistics returned by ambuild + */ +typedef struct IndexBuildResult +{ + double heap_tuples; /* # of tuples seen in parent table */ + double index_tuples; /* # of tuples inserted into index */ +} IndexBuildResult; + +/* + * Struct for input arguments passed to ambulkdelete and amvacuumcleanup + * + * num_heap_tuples is accurate only when estimated_count is false; + * otherwise it's just an estimate (currently, the estimate is the + * prior value of the relation's pg_class.reltuples field, so it could + * even be -1). It will always just be an estimate during ambulkdelete. + */ +typedef struct IndexVacuumInfo +{ + Relation index; /* the index being vacuumed */ + Relation heaprel; /* the heap relation the index belongs to */ + bool analyze_only; /* ANALYZE (without any actual vacuum) */ + bool report_progress; /* emit progress.h status reports */ + bool estimated_count; /* num_heap_tuples is an estimate */ + int message_level; /* ereport level for progress messages */ + double num_heap_tuples; /* tuples remaining in heap */ + BufferAccessStrategy strategy; /* access strategy for reads */ +} IndexVacuumInfo; + +/* + * Struct for statistics returned by ambulkdelete and amvacuumcleanup + * + * This struct is normally allocated by the first ambulkdelete call and then + * passed along through subsequent ones until amvacuumcleanup; however, + * amvacuumcleanup must be prepared to allocate it in the case where no + * ambulkdelete calls were made (because no tuples needed deletion). + * Note that an index AM could choose to return a larger struct + * of which this is just the first field; this provides a way for ambulkdelete + * to communicate additional private data to amvacuumcleanup. + * + * Note: pages_newly_deleted is the number of pages in the index that were + * deleted by the current vacuum operation. pages_deleted and pages_free + * refer to free space within the index file. + * + * Note: Some index AMs may compute num_index_tuples by reference to + * num_heap_tuples, in which case they should copy the estimated_count field + * from IndexVacuumInfo. 
+ */ +typedef struct IndexBulkDeleteResult +{ + BlockNumber num_pages; /* pages remaining in index */ + bool estimated_count; /* num_index_tuples is an estimate */ + double num_index_tuples; /* tuples remaining */ + double tuples_removed; /* # removed during vacuum operation */ + BlockNumber pages_newly_deleted; /* # pages marked deleted by us */ + BlockNumber pages_deleted; /* # pages marked deleted (could be by us) */ + BlockNumber pages_free; /* # pages available for reuse */ +} IndexBulkDeleteResult; + +/* Typedef for callback function to determine if a tuple is bulk-deletable */ +typedef bool (*IndexBulkDeleteCallback) (ItemPointer itemptr, void *state); + +/* struct definitions appear in relscan.h */ +typedef struct IndexScanDescData *IndexScanDesc; +typedef struct SysScanDescData *SysScanDesc; + +typedef struct ParallelIndexScanDescData *ParallelIndexScanDesc; + +/* + * Enumeration specifying the type of uniqueness check to perform in + * index_insert(). + * + * UNIQUE_CHECK_YES is the traditional Postgres immediate check, possibly + * blocking to see if a conflicting transaction commits. + * + * For deferrable unique constraints, UNIQUE_CHECK_PARTIAL is specified at + * insertion time. The index AM should test if the tuple is unique, but + * should not throw error, block, or prevent the insertion if the tuple + * appears not to be unique. We'll recheck later when it is time for the + * constraint to be enforced. The AM must return true if the tuple is + * known unique, false if it is possibly non-unique. In the "true" case + * it is safe to omit the later recheck. + * + * When it is time to recheck the deferred constraint, a pseudo-insertion + * call is made with UNIQUE_CHECK_EXISTING. The tuple is already in the + * index in this case, so it should not be inserted again. Rather, just + * check for conflicting live tuples (possibly blocking). + */ +typedef enum IndexUniqueCheck +{ + UNIQUE_CHECK_NO, /* Don't do any uniqueness checking */ + UNIQUE_CHECK_YES, /* Enforce uniqueness at insertion time */ + UNIQUE_CHECK_PARTIAL, /* Test uniqueness, but no error */ + UNIQUE_CHECK_EXISTING /* Check if existing tuple is unique */ +} IndexUniqueCheck; + + +/* Nullable "ORDER BY col op const" distance */ +typedef struct IndexOrderByDistance +{ + double value; + bool isnull; +} IndexOrderByDistance; + +/* + * generalized index_ interface routines (in indexam.c) + */ + +/* + * IndexScanIsValid + * True iff the index scan is valid. 
+ */ +#define IndexScanIsValid(scan) PointerIsValid(scan) + +extern Relation index_open(Oid relationId, LOCKMODE lockmode); +extern Relation try_index_open(Oid relationId, LOCKMODE lockmode); +extern void index_close(Relation relation, LOCKMODE lockmode); + +extern bool index_insert(Relation indexRelation, + Datum *values, bool *isnull, + ItemPointer heap_t_ctid, + Relation heapRelation, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + struct IndexInfo *indexInfo); + +extern IndexScanDesc index_beginscan(Relation heapRelation, + Relation indexRelation, + Snapshot snapshot, + int nkeys, int norderbys); +extern IndexScanDesc index_beginscan_bitmap(Relation indexRelation, + Snapshot snapshot, + int nkeys); +extern void index_rescan(IndexScanDesc scan, + ScanKey keys, int nkeys, + ScanKey orderbys, int norderbys); +extern void index_endscan(IndexScanDesc scan); +extern void index_markpos(IndexScanDesc scan); +extern void index_restrpos(IndexScanDesc scan); +extern Size index_parallelscan_estimate(Relation indexRelation, Snapshot snapshot); +extern void index_parallelscan_initialize(Relation heapRelation, + Relation indexRelation, Snapshot snapshot, + ParallelIndexScanDesc target); +extern void index_parallelrescan(IndexScanDesc scan); +extern IndexScanDesc index_beginscan_parallel(Relation heaprel, + Relation indexrel, int nkeys, int norderbys, + ParallelIndexScanDesc pscan); +extern ItemPointer index_getnext_tid(IndexScanDesc scan, + ScanDirection direction); +struct TupleTableSlot; +extern bool index_fetch_heap(IndexScanDesc scan, struct TupleTableSlot *slot); +extern bool index_getnext_slot(IndexScanDesc scan, ScanDirection direction, + struct TupleTableSlot *slot); +extern int64 index_getbitmap(IndexScanDesc scan, TIDBitmap *bitmap); + +extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info, + IndexBulkDeleteResult *istat, + IndexBulkDeleteCallback callback, + void *callback_state); +extern IndexBulkDeleteResult *index_vacuum_cleanup(IndexVacuumInfo *info, + IndexBulkDeleteResult *istat); +extern bool index_can_return(Relation indexRelation, int attno); +extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum, + uint16 procnum); +extern FmgrInfo *index_getprocinfo(Relation irel, AttrNumber attnum, + uint16 procnum); +extern void index_store_float8_orderby_distances(IndexScanDesc scan, + Oid *orderByTypes, + IndexOrderByDistance *distances, + bool recheckOrderBy); +extern bytea *index_opclass_options(Relation indrel, AttrNumber attnum, + Datum attoptions, bool validate); + + +/* + * index access method support routines (in genam.c) + */ +extern IndexScanDesc RelationGetIndexScan(Relation indexRelation, + int nkeys, int norderbys); +extern void IndexScanEnd(IndexScanDesc scan); +extern char *BuildIndexValueDescription(Relation indexRelation, + Datum *values, bool *isnull); +extern TransactionId index_compute_xid_horizon_for_tuples(Relation irel, + Relation hrel, + Buffer ibuf, + OffsetNumber *itemnos, + int nitems); + +/* + * heap-or-index access to system catalogs (in genam.c) + */ +extern SysScanDesc systable_beginscan(Relation heapRelation, + Oid indexId, + bool indexOK, + Snapshot snapshot, + int nkeys, ScanKey key); +extern HeapTuple systable_getnext(SysScanDesc sysscan); +extern bool systable_recheck_tuple(SysScanDesc sysscan, HeapTuple tup); +extern void systable_endscan(SysScanDesc sysscan); +extern SysScanDesc systable_beginscan_ordered(Relation heapRelation, + Relation indexRelation, + Snapshot snapshot, + int nkeys, ScanKey key); +extern 
HeapTuple systable_getnext_ordered(SysScanDesc sysscan, + ScanDirection direction); +extern void systable_endscan_ordered(SysScanDesc sysscan); + +#endif /* GENAM_H */ diff --git a/src/include/access/generic_xlog.h b/src/include/access/generic_xlog.h new file mode 100644 index 0000000..66941f9 --- /dev/null +++ b/src/include/access/generic_xlog.h @@ -0,0 +1,45 @@ +/*------------------------------------------------------------------------- + * + * generic_xlog.h + * Generic xlog API definition. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/generic_xlog.h + * + *------------------------------------------------------------------------- + */ +#ifndef GENERIC_XLOG_H +#define GENERIC_XLOG_H + +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "access/xloginsert.h" +#include "storage/bufpage.h" +#include "utils/rel.h" + +#define MAX_GENERIC_XLOG_PAGES XLR_NORMAL_MAX_BLOCK_ID + +/* Flag bits for GenericXLogRegisterBuffer */ +#define GENERIC_XLOG_FULL_IMAGE 0x0001 /* write full-page image */ + +/* state of generic xlog record construction */ +struct GenericXLogState; +typedef struct GenericXLogState GenericXLogState; + +/* API for construction of generic xlog records */ +extern GenericXLogState *GenericXLogStart(Relation relation); +extern Page GenericXLogRegisterBuffer(GenericXLogState *state, Buffer buffer, + int flags); +extern XLogRecPtr GenericXLogFinish(GenericXLogState *state); +extern void GenericXLogAbort(GenericXLogState *state); + +/* functions defined for rmgr */ +extern void generic_redo(XLogReaderState *record); +extern const char *generic_identify(uint8 info); +extern void generic_desc(StringInfo buf, XLogReaderState *record); +extern void generic_mask(char *page, BlockNumber blkno); + +#endif /* GENERIC_XLOG_H */ diff --git a/src/include/access/gin.h b/src/include/access/gin.h new file mode 100644 index 0000000..f165506 --- /dev/null +++ b/src/include/access/gin.h @@ -0,0 +1,91 @@ +/*-------------------------------------------------------------------------- + * gin.h + * Public header file for Generalized Inverted Index access method. + * + * Copyright (c) 2006-2023, PostgreSQL Global Development Group + * + * src/include/access/gin.h + *-------------------------------------------------------------------------- + */ +#ifndef GIN_H +#define GIN_H + +#include "access/xlogreader.h" +#include "lib/stringinfo.h" +#include "storage/block.h" +#include "utils/relcache.h" + + +/* + * amproc indexes for inverted indexes. + */ +#define GIN_COMPARE_PROC 1 +#define GIN_EXTRACTVALUE_PROC 2 +#define GIN_EXTRACTQUERY_PROC 3 +#define GIN_CONSISTENT_PROC 4 +#define GIN_COMPARE_PARTIAL_PROC 5 +#define GIN_TRICONSISTENT_PROC 6 +#define GIN_OPTIONS_PROC 7 +#define GINNProcs 7 + +/* + * searchMode settings for extractQueryFn. + */ +#define GIN_SEARCH_MODE_DEFAULT 0 +#define GIN_SEARCH_MODE_INCLUDE_EMPTY 1 +#define GIN_SEARCH_MODE_ALL 2 +#define GIN_SEARCH_MODE_EVERYTHING 3 /* for internal use only */ + +/* + * GinStatsData represents stats data for planner use + */ +typedef struct GinStatsData +{ + BlockNumber nPendingPages; + BlockNumber nTotalPages; + BlockNumber nEntryPages; + BlockNumber nDataPages; + int64 nEntries; + int32 ginVersion; +} GinStatsData; + +/* + * A ternary value used by tri-consistent functions. + * + * This must be of the same size as a bool because some code will cast a + * pointer to a bool to a pointer to a GinTernaryValue. 
+ */ +typedef char GinTernaryValue; + +StaticAssertDecl(sizeof(GinTernaryValue) == sizeof(bool), + "sizes of GinTernaryValue and bool are not equal"); + +#define GIN_FALSE 0 /* item is not present / does not match */ +#define GIN_TRUE 1 /* item is present / matches */ +#define GIN_MAYBE 2 /* don't know if item is present / don't know + * if matches */ + +static inline GinTernaryValue +DatumGetGinTernaryValue(Datum X) +{ + return (GinTernaryValue) X; +} + +static inline Datum +GinTernaryValueGetDatum(GinTernaryValue X) +{ + return (Datum) X; +} + +#define PG_RETURN_GIN_TERNARY_VALUE(x) return GinTernaryValueGetDatum(x) + +/* GUC parameters */ +extern PGDLLIMPORT int GinFuzzySearchLimit; +extern PGDLLIMPORT int gin_pending_list_limit; + +/* ginutil.c */ +extern void ginGetStats(Relation index, GinStatsData *stats); +extern void ginUpdateStats(Relation index, const GinStatsData *stats, + bool is_build); + +#endif /* GIN_H */ diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h new file mode 100644 index 0000000..6da6492 --- /dev/null +++ b/src/include/access/gin_private.h @@ -0,0 +1,502 @@ +/*-------------------------------------------------------------------------- + * gin_private.h + * header file for postgres inverted index access method implementation. + * + * Copyright (c) 2006-2023, PostgreSQL Global Development Group + * + * src/include/access/gin_private.h + *-------------------------------------------------------------------------- + */ +#ifndef GIN_PRIVATE_H +#define GIN_PRIVATE_H + +#include "access/amapi.h" +#include "access/gin.h" +#include "access/ginblock.h" +#include "access/itup.h" +#include "catalog/pg_am_d.h" +#include "fmgr.h" +#include "lib/rbtree.h" +#include "storage/bufmgr.h" + +/* + * Storage type for GIN's reloptions + */ +typedef struct GinOptions +{ + int32 vl_len_; /* varlena header (do not touch directly!) */ + bool useFastUpdate; /* use fast updates? */ + int pendingListCleanupSize; /* maximum size of pending list */ +} GinOptions; + +#define GIN_DEFAULT_USE_FASTUPDATE true +#define GinGetUseFastUpdate(relation) \ + (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \ + relation->rd_rel->relam == GIN_AM_OID), \ + (relation)->rd_options ? \ + ((GinOptions *) (relation)->rd_options)->useFastUpdate : GIN_DEFAULT_USE_FASTUPDATE) +#define GinGetPendingListCleanupSize(relation) \ + (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \ + relation->rd_rel->relam == GIN_AM_OID), \ + (relation)->rd_options && \ + ((GinOptions *) (relation)->rd_options)->pendingListCleanupSize != -1 ? \ + ((GinOptions *) (relation)->rd_options)->pendingListCleanupSize : \ + gin_pending_list_limit) + + +/* Macros for buffer lock/unlock operations */ +#define GIN_UNLOCK BUFFER_LOCK_UNLOCK +#define GIN_SHARE BUFFER_LOCK_SHARE +#define GIN_EXCLUSIVE BUFFER_LOCK_EXCLUSIVE + + +/* + * GinState: working data structure describing the index being worked on + */ +typedef struct GinState +{ + Relation index; + bool oneCol; /* true if single-column index */ + + /* + * origTupdesc is the nominal tuple descriptor of the index, ie, the i'th + * attribute shows the key type (not the input data type!) of the i'th + * index column. In a single-column index this describes the actual leaf + * index tuples. In a multi-column index, the actual leaf tuples contain + * a smallint column number followed by a key datum of the appropriate + * type for that column. 
We set up tupdesc[i] to describe the actual + * rowtype of the index tuples for the i'th column, ie, (int2, keytype). + * Note that in any case, leaf tuples contain more data than is known to + * the TupleDesc; see access/gin/README for details. + */ + TupleDesc origTupdesc; + TupleDesc tupdesc[INDEX_MAX_KEYS]; + + /* + * Per-index-column opclass support functions + */ + FmgrInfo compareFn[INDEX_MAX_KEYS]; + FmgrInfo extractValueFn[INDEX_MAX_KEYS]; + FmgrInfo extractQueryFn[INDEX_MAX_KEYS]; + FmgrInfo consistentFn[INDEX_MAX_KEYS]; + FmgrInfo triConsistentFn[INDEX_MAX_KEYS]; + FmgrInfo comparePartialFn[INDEX_MAX_KEYS]; /* optional method */ + /* canPartialMatch[i] is true if comparePartialFn[i] is valid */ + bool canPartialMatch[INDEX_MAX_KEYS]; + /* Collations to pass to the support functions */ + Oid supportCollation[INDEX_MAX_KEYS]; +} GinState; + + +/* ginutil.c */ +extern bytea *ginoptions(Datum reloptions, bool validate); +extern void initGinState(GinState *state, Relation index); +extern Buffer GinNewBuffer(Relation index); +extern void GinInitBuffer(Buffer b, uint32 f); +extern void GinInitPage(Page page, uint32 f, Size pageSize); +extern void GinInitMetabuffer(Buffer b); +extern int ginCompareEntries(GinState *ginstate, OffsetNumber attnum, + Datum a, GinNullCategory categorya, + Datum b, GinNullCategory categoryb); +extern int ginCompareAttEntries(GinState *ginstate, + OffsetNumber attnuma, Datum a, GinNullCategory categorya, + OffsetNumber attnumb, Datum b, GinNullCategory categoryb); +extern Datum *ginExtractEntries(GinState *ginstate, OffsetNumber attnum, + Datum value, bool isNull, + int32 *nentries, GinNullCategory **categories); + +extern OffsetNumber gintuple_get_attrnum(GinState *ginstate, IndexTuple tuple); +extern Datum gintuple_get_key(GinState *ginstate, IndexTuple tuple, + GinNullCategory *category); + +/* gininsert.c */ +extern IndexBuildResult *ginbuild(Relation heap, Relation index, + struct IndexInfo *indexInfo); +extern void ginbuildempty(Relation index); +extern bool gininsert(Relation index, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + struct IndexInfo *indexInfo); +extern void ginEntryInsert(GinState *ginstate, + OffsetNumber attnum, Datum key, GinNullCategory category, + ItemPointerData *items, uint32 nitem, + GinStatsData *buildStats); + +/* ginbtree.c */ + +typedef struct GinBtreeStack +{ + BlockNumber blkno; + Buffer buffer; + OffsetNumber off; + ItemPointerData iptr; + /* predictNumber contains predicted number of pages on current level */ + uint32 predictNumber; + struct GinBtreeStack *parent; +} GinBtreeStack; + +typedef struct GinBtreeData *GinBtree; + +/* Return codes for GinBtreeData.beginPlaceToPage method */ +typedef enum +{ + GPTP_NO_WORK, + GPTP_INSERT, + GPTP_SPLIT +} GinPlaceToPageRC; + +typedef struct GinBtreeData +{ + /* search methods */ + BlockNumber (*findChildPage) (GinBtree, GinBtreeStack *); + BlockNumber (*getLeftMostChild) (GinBtree, Page); + bool (*isMoveRight) (GinBtree, Page); + bool (*findItem) (GinBtree, GinBtreeStack *); + + /* insert methods */ + OffsetNumber (*findChildPtr) (GinBtree, Page, BlockNumber, OffsetNumber); + GinPlaceToPageRC (*beginPlaceToPage) (GinBtree, Buffer, GinBtreeStack *, void *, BlockNumber, void **, Page *, Page *); + void (*execPlaceToPage) (GinBtree, Buffer, GinBtreeStack *, void *, BlockNumber, void *); + void *(*prepareDownlink) (GinBtree, Buffer); + void (*fillRoot) (GinBtree, Page, BlockNumber, Page, BlockNumber, 
Page); + + bool isData; + + Relation index; + BlockNumber rootBlkno; + GinState *ginstate; /* not valid in a data scan */ + bool fullScan; + bool isBuild; + + /* Search key for Entry tree */ + OffsetNumber entryAttnum; + Datum entryKey; + GinNullCategory entryCategory; + + /* Search key for data tree (posting tree) */ + ItemPointerData itemptr; +} GinBtreeData; + +/* This represents a tuple to be inserted to entry tree. */ +typedef struct +{ + IndexTuple entry; /* tuple to insert */ + bool isDelete; /* delete old tuple at same offset? */ +} GinBtreeEntryInsertData; + +/* + * This represents an itempointer, or many itempointers, to be inserted to + * a data (posting tree) leaf page + */ +typedef struct +{ + ItemPointerData *items; + uint32 nitem; + uint32 curitem; +} GinBtreeDataLeafInsertData; + +/* + * For internal data (posting tree) pages, the insertion payload is a + * PostingItem + */ + +extern GinBtreeStack *ginFindLeafPage(GinBtree btree, bool searchMode, + bool rootConflictCheck, Snapshot snapshot); +extern Buffer ginStepRight(Buffer buffer, Relation index, int lockmode); +extern void freeGinBtreeStack(GinBtreeStack *stack); +extern void ginInsertValue(GinBtree btree, GinBtreeStack *stack, + void *insertdata, GinStatsData *buildStats); + +/* ginentrypage.c */ +extern IndexTuple GinFormTuple(GinState *ginstate, + OffsetNumber attnum, Datum key, GinNullCategory category, + Pointer data, Size dataSize, int nipd, bool errorTooBig); +extern void ginPrepareEntryScan(GinBtree btree, OffsetNumber attnum, + Datum key, GinNullCategory category, + GinState *ginstate); +extern void ginEntryFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage); +extern ItemPointer ginReadTuple(GinState *ginstate, OffsetNumber attnum, + IndexTuple itup, int *nitems); + +/* gindatapage.c */ +extern ItemPointer GinDataLeafPageGetItems(Page page, int *nitems, ItemPointerData advancePast); +extern int GinDataLeafPageGetItemsToTbm(Page page, TIDBitmap *tbm); +extern BlockNumber createPostingTree(Relation index, + ItemPointerData *items, uint32 nitems, + GinStatsData *buildStats, Buffer entrybuffer); +extern void GinDataPageAddPostingItem(Page page, PostingItem *data, OffsetNumber offset); +extern void GinPageDeletePostingItem(Page page, OffsetNumber offset); +extern void ginInsertItemPointers(Relation index, BlockNumber rootBlkno, + ItemPointerData *items, uint32 nitem, + GinStatsData *buildStats); +extern GinBtreeStack *ginScanBeginPostingTree(GinBtree btree, Relation index, BlockNumber rootBlkno, Snapshot snapshot); +extern void ginDataFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage); + +/* + * This is declared in ginvacuum.c, but is passed between ginVacuumItemPointers + * and ginVacuumPostingTreeLeaf and as an opaque struct, so we need a forward + * declaration for it. + */ +typedef struct GinVacuumState GinVacuumState; + +extern void ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, + GinVacuumState *gvs); + +/* ginscan.c */ + +/* + * GinScanKeyData describes a single GIN index qualifier expression. + * + * From each qual expression, we extract one or more specific index search + * conditions, which are represented by GinScanEntryData. 
It's quite + * possible for identical search conditions to be requested by more than + * one qual expression, in which case we merge such conditions to have just + * one unique GinScanEntry --- this is particularly important for efficiency + * when dealing with full-index-scan entries. So there can be multiple + * GinScanKeyData.scanEntry pointers to the same GinScanEntryData. + * + * In each GinScanKeyData, nentries is the true number of entries, while + * nuserentries is the number that extractQueryFn returned (which is what + * we report to consistentFn). The "user" entries must come first. + */ +typedef struct GinScanKeyData *GinScanKey; + +typedef struct GinScanEntryData *GinScanEntry; + +typedef struct GinScanKeyData +{ + /* Real number of entries in scanEntry[] (always > 0) */ + uint32 nentries; + /* Number of entries that extractQueryFn and consistentFn know about */ + uint32 nuserentries; + + /* array of GinScanEntry pointers, one per extracted search condition */ + GinScanEntry *scanEntry; + + /* + * At least one of the entries in requiredEntries must be present for a + * tuple to match the overall qual. + * + * additionalEntries contains entries that are needed by the consistent + * function to decide if an item matches, but are not sufficient to + * satisfy the qual without entries from requiredEntries. + */ + GinScanEntry *requiredEntries; + int nrequired; + GinScanEntry *additionalEntries; + int nadditional; + + /* array of check flags, reported to consistentFn */ + GinTernaryValue *entryRes; + bool (*boolConsistentFn) (GinScanKey key); + GinTernaryValue (*triConsistentFn) (GinScanKey key); + FmgrInfo *consistentFmgrInfo; + FmgrInfo *triConsistentFmgrInfo; + Oid collation; + + /* other data needed for calling consistentFn */ + Datum query; + /* NB: these three arrays have only nuserentries elements! */ + Datum *queryValues; + GinNullCategory *queryCategories; + Pointer *extra_data; + StrategyNumber strategy; + int32 searchMode; + OffsetNumber attnum; + + /* + * An excludeOnly scan key is not able to enumerate all matching tuples. + * That is, to be semantically correct on its own, it would need to have a + * GIN_CAT_EMPTY_QUERY scanEntry, but it doesn't. Such a key can still be + * used to filter tuples returned by other scan keys, so we will get the + * right answers as long as there's at least one non-excludeOnly scan key + * for each index attribute considered by the search. For efficiency + * reasons we don't want to have unnecessary GIN_CAT_EMPTY_QUERY entries, + * so we will convert an excludeOnly scan key to non-excludeOnly (by + * adding a GIN_CAT_EMPTY_QUERY scanEntry) only if there are no other + * non-excludeOnly scan keys. + */ + bool excludeOnly; + + /* + * Match status data. curItem is the TID most recently tested (could be a + * lossy-page pointer). curItemMatches is true if it passes the + * consistentFn test; if so, recheckCurItem is the recheck flag. + * isFinished means that all the input entry streams are finished, so this + * key cannot succeed for any later TIDs. 
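The entryRes array and boolConsistentFn hook above are the scan-side half of a contract whose opclass-side half is the consistent support function: the per-entry match flags reach that function as its check[] argument. A hedged sketch of such a function with AND semantics follows; the name is hypothetical, and the "never recheck" policy is only an assumption about the operator being indexed.

#include "postgres.h"
#include "access/gin.h"
#include "fmgr.h"

PG_FUNCTION_INFO_V1(my_consistent);

/* Illustrative boolean consistent function: all extracted keys must be present. */
Datum
my_consistent(PG_FUNCTION_ARGS)
{
    bool       *check = (bool *) PG_GETARG_POINTER(0);
    int32       nkeys = PG_GETARG_INT32(3);
    bool       *recheck = (bool *) PG_GETARG_POINTER(5);
    bool        res = true;
    int32       i;

    *recheck = false;           /* assume the index is exact for this operator */

    for (i = 0; i < nkeys; i++)
    {
        if (!check[i])
        {
            res = false;
            break;
        }
    }

    PG_RETURN_BOOL(res);
}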
+ */ + ItemPointerData curItem; + bool curItemMatches; + bool recheckCurItem; + bool isFinished; +} GinScanKeyData; + +typedef struct GinScanEntryData +{ + /* query key and other information from extractQueryFn */ + Datum queryKey; + GinNullCategory queryCategory; + bool isPartialMatch; + Pointer extra_data; + StrategyNumber strategy; + int32 searchMode; + OffsetNumber attnum; + + /* Current page in posting tree */ + Buffer buffer; + + /* current ItemPointer to heap */ + ItemPointerData curItem; + + /* for a partial-match or full-scan query, we accumulate all TIDs here */ + TIDBitmap *matchBitmap; + TBMIterator *matchIterator; + TBMIterateResult *matchResult; + + /* used for Posting list and one page in Posting tree */ + ItemPointerData *list; + int nlist; + OffsetNumber offset; + + bool isFinished; + bool reduceResult; + uint32 predictNumberResult; + GinBtreeData btree; +} GinScanEntryData; + +typedef struct GinScanOpaqueData +{ + MemoryContext tempCtx; + GinState ginstate; + + GinScanKey keys; /* one per scan qualifier expr */ + uint32 nkeys; + + GinScanEntry *entries; /* one per index search condition */ + uint32 totalentries; + uint32 allocentries; /* allocated length of entries[] */ + + MemoryContext keyCtx; /* used to hold key and entry data */ + + bool isVoidRes; /* true if query is unsatisfiable */ +} GinScanOpaqueData; + +typedef GinScanOpaqueData *GinScanOpaque; + +extern IndexScanDesc ginbeginscan(Relation rel, int nkeys, int norderbys); +extern void ginendscan(IndexScanDesc scan); +extern void ginrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, + ScanKey orderbys, int norderbys); +extern void ginNewScanKey(IndexScanDesc scan); +extern void ginFreeScanKeys(GinScanOpaque so); + +/* ginget.c */ +extern int64 gingetbitmap(IndexScanDesc scan, TIDBitmap *tbm); + +/* ginlogic.c */ +extern void ginInitConsistentFunction(GinState *ginstate, GinScanKey key); + +/* ginvacuum.c */ +extern IndexBulkDeleteResult *ginbulkdelete(IndexVacuumInfo *info, + IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, + void *callback_state); +extern IndexBulkDeleteResult *ginvacuumcleanup(IndexVacuumInfo *info, + IndexBulkDeleteResult *stats); +extern ItemPointer ginVacuumItemPointers(GinVacuumState *gvs, + ItemPointerData *items, int nitem, int *nremaining); + +/* ginvalidate.c */ +extern bool ginvalidate(Oid opclassoid); +extern void ginadjustmembers(Oid opfamilyoid, + Oid opclassoid, + List *operators, + List *functions); + +/* ginbulk.c */ +typedef struct GinEntryAccumulator +{ + RBTNode rbtnode; + Datum key; + GinNullCategory category; + OffsetNumber attnum; + bool shouldSort; + ItemPointerData *list; + uint32 maxcount; /* allocated size of list[] */ + uint32 count; /* current number of list[] entries */ +} GinEntryAccumulator; + +typedef struct +{ + GinState *ginstate; + Size allocatedMemory; + GinEntryAccumulator *entryallocator; + uint32 eas_used; + RBTree *tree; + RBTreeIterator tree_walk; +} BuildAccumulator; + +extern void ginInitBA(BuildAccumulator *accum); +extern void ginInsertBAEntries(BuildAccumulator *accum, + ItemPointer heapptr, OffsetNumber attnum, + Datum *entries, GinNullCategory *categories, + int32 nentries); +extern void ginBeginBAScan(BuildAccumulator *accum); +extern ItemPointerData *ginGetBAEntry(BuildAccumulator *accum, + OffsetNumber *attnum, Datum *key, GinNullCategory *category, + uint32 *n); + +/* ginfast.c */ + +typedef struct GinTupleCollector +{ + IndexTuple *tuples; + uint32 ntuples; + uint32 lentuples; + uint32 sumsize; +} GinTupleCollector; + 
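The ginbulk.c declarations above implement a simple accumulate-then-walk pattern used while building an index: entries are collected into the red-black tree, then read back in key order. The sketch below illustrates only that flow; the entry arrays are assumed to come from ginExtractEntries, the single-column attnum is hard-coded, and the step that would actually form index tuples is elided.

#include "postgres.h"
#include "access/gin_private.h"

/* Illustrative only: accumulate extracted entries, then walk them in key order. */
static void
accumulate_and_walk(GinState *ginstate, ItemPointer heapptr,
                    Datum *entries, GinNullCategory *categories, int32 nentries)
{
    BuildAccumulator accum;
    ItemPointerData *list;
    Datum       key;
    GinNullCategory category;
    OffsetNumber attnum;
    uint32      nlist;

    accum.ginstate = ginstate;
    ginInitBA(&accum);

    /* in a real build this is called once per heap tuple and index column */
    ginInsertBAEntries(&accum, heapptr, (OffsetNumber) 1,
                       entries, categories, nentries);

    /* emit the accumulated entries in key order */
    ginBeginBAScan(&accum);
    while ((list = ginGetBAEntry(&accum, &attnum, &key, &category, &nlist)) != NULL)
    {
        /* ginbuild would form and insert an index tuple for (attnum, key, list) here */
    }
}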
+extern void ginHeapTupleFastInsert(GinState *ginstate, + GinTupleCollector *collector); +extern void ginHeapTupleFastCollect(GinState *ginstate, + GinTupleCollector *collector, + OffsetNumber attnum, Datum value, bool isNull, + ItemPointer ht_ctid); +extern void ginInsertCleanup(GinState *ginstate, bool full_clean, + bool fill_fsm, bool forceCleanup, IndexBulkDeleteResult *stats); + +/* ginpostinglist.c */ + +extern GinPostingList *ginCompressPostingList(const ItemPointer ipd, int nipd, + int maxsize, int *nwritten); +extern int ginPostingListDecodeAllSegmentsToTbm(GinPostingList *ptr, int len, TIDBitmap *tbm); + +extern ItemPointer ginPostingListDecodeAllSegments(GinPostingList *segment, int len, + int *ndecoded_out); +extern ItemPointer ginPostingListDecode(GinPostingList *plist, int *ndecoded_out); +extern ItemPointer ginMergeItemPointers(ItemPointerData *a, uint32 na, + ItemPointerData *b, uint32 nb, + int *nmerged); + +/* + * Merging the results of several gin scans compares item pointers a lot, + * so we want this to be inlined. + */ +static inline int +ginCompareItemPointers(ItemPointer a, ItemPointer b) +{ + uint64 ia = (uint64) GinItemPointerGetBlockNumber(a) << 32 | GinItemPointerGetOffsetNumber(a); + uint64 ib = (uint64) GinItemPointerGetBlockNumber(b) << 32 | GinItemPointerGetOffsetNumber(b); + + if (ia == ib) + return 0; + else if (ia > ib) + return 1; + else + return -1; +} + +extern int ginTraverseLock(Buffer buffer, bool searchMode); + +#endif /* GIN_PRIVATE_H */ diff --git a/src/include/access/ginblock.h b/src/include/access/ginblock.h new file mode 100644 index 0000000..c59790e --- /dev/null +++ b/src/include/access/ginblock.h @@ -0,0 +1,346 @@ +/*-------------------------------------------------------------------------- + * ginblock.h + * details of structures stored in GIN index blocks + * + * Copyright (c) 2006-2023, PostgreSQL Global Development Group + * + * src/include/access/ginblock.h + *-------------------------------------------------------------------------- + */ +#ifndef GINBLOCK_H +#define GINBLOCK_H + +#include "access/transam.h" +#include "storage/block.h" +#include "storage/bufpage.h" +#include "storage/itemptr.h" +#include "storage/off.h" + +/* + * Page opaque data in an inverted index page. + * + * Note: GIN does not include a page ID word as do the other index types. + * This is OK because the opaque data is only 8 bytes and so can be reliably + * distinguished by size. Revisit this if the size ever increases. + * Further note: as of 9.2, SP-GiST also uses 8-byte special space, as does + * BRIN as of 9.5. This is still OK, as long as GIN isn't using all of the + * high-order bits in its flags word, because that way the flags word cannot + * match the page IDs used by SP-GiST and BRIN. + */ +typedef struct GinPageOpaqueData +{ + BlockNumber rightlink; /* next page if any */ + OffsetNumber maxoff; /* number of PostingItems on GIN_DATA & + * ~GIN_LEAF page. On GIN_LIST page, number of + * heap tuples. 
*/ + uint16 flags; /* see bit definitions below */ +} GinPageOpaqueData; + +typedef GinPageOpaqueData *GinPageOpaque; + +#define GIN_DATA (1 << 0) +#define GIN_LEAF (1 << 1) +#define GIN_DELETED (1 << 2) +#define GIN_META (1 << 3) +#define GIN_LIST (1 << 4) +#define GIN_LIST_FULLROW (1 << 5) /* makes sense only on GIN_LIST page */ +#define GIN_INCOMPLETE_SPLIT (1 << 6) /* page was split, but parent not + * updated */ +#define GIN_COMPRESSED (1 << 7) + +/* Page numbers of fixed-location pages */ +#define GIN_METAPAGE_BLKNO (0) +#define GIN_ROOT_BLKNO (1) + +typedef struct GinMetaPageData +{ + /* + * Pointers to head and tail of pending list, which consists of GIN_LIST + * pages. These store fast-inserted entries that haven't yet been moved + * into the regular GIN structure. + */ + BlockNumber head; + BlockNumber tail; + + /* + * Free space in bytes in the pending list's tail page. + */ + uint32 tailFreeSize; + + /* + * We store both number of pages and number of heap tuples that are in the + * pending list. + */ + BlockNumber nPendingPages; + int64 nPendingHeapTuples; + + /* + * Statistics for planner use (accurate as of last VACUUM) + */ + BlockNumber nTotalPages; + BlockNumber nEntryPages; + BlockNumber nDataPages; + int64 nEntries; + + /* + * GIN version number (ideally this should have been at the front, but too + * late now. Don't move it!) + * + * Currently 2 (for indexes initialized in 9.4 or later) + * + * Version 1 (indexes initialized in version 9.1, 9.2 or 9.3), is + * compatible, but may contain uncompressed posting tree (leaf) pages and + * posting lists. They will be converted to compressed format when + * modified. + * + * Version 0 (indexes initialized in 9.0 or before) is compatible but may + * be missing null entries, including both null keys and placeholders. + * Reject full-index-scan attempts on such indexes. 
+ */ + int32 ginVersion; +} GinMetaPageData; + +#define GIN_CURRENT_VERSION 2 + +#define GinPageGetMeta(p) \ + ((GinMetaPageData *) PageGetContents(p)) + +/* + * Macros for accessing a GIN index page's opaque data + */ +#define GinPageGetOpaque(page) ( (GinPageOpaque) PageGetSpecialPointer(page) ) + +#define GinPageIsLeaf(page) ( (GinPageGetOpaque(page)->flags & GIN_LEAF) != 0 ) +#define GinPageSetLeaf(page) ( GinPageGetOpaque(page)->flags |= GIN_LEAF ) +#define GinPageSetNonLeaf(page) ( GinPageGetOpaque(page)->flags &= ~GIN_LEAF ) +#define GinPageIsData(page) ( (GinPageGetOpaque(page)->flags & GIN_DATA) != 0 ) +#define GinPageSetData(page) ( GinPageGetOpaque(page)->flags |= GIN_DATA ) +#define GinPageIsList(page) ( (GinPageGetOpaque(page)->flags & GIN_LIST) != 0 ) +#define GinPageSetList(page) ( GinPageGetOpaque(page)->flags |= GIN_LIST ) +#define GinPageHasFullRow(page) ( (GinPageGetOpaque(page)->flags & GIN_LIST_FULLROW) != 0 ) +#define GinPageSetFullRow(page) ( GinPageGetOpaque(page)->flags |= GIN_LIST_FULLROW ) +#define GinPageIsCompressed(page) ( (GinPageGetOpaque(page)->flags & GIN_COMPRESSED) != 0 ) +#define GinPageSetCompressed(page) ( GinPageGetOpaque(page)->flags |= GIN_COMPRESSED ) + +#define GinPageIsDeleted(page) ( (GinPageGetOpaque(page)->flags & GIN_DELETED) != 0 ) +#define GinPageSetDeleted(page) ( GinPageGetOpaque(page)->flags |= GIN_DELETED) +#define GinPageSetNonDeleted(page) ( GinPageGetOpaque(page)->flags &= ~GIN_DELETED) +#define GinPageIsIncompleteSplit(page) ( (GinPageGetOpaque(page)->flags & GIN_INCOMPLETE_SPLIT) != 0 ) + +#define GinPageRightMost(page) ( GinPageGetOpaque(page)->rightlink == InvalidBlockNumber) + +/* + * We should reclaim deleted page only once every transaction started before + * its deletion is over. + */ +#define GinPageGetDeleteXid(page) ( ((PageHeader) (page))->pd_prune_xid ) +#define GinPageSetDeleteXid(page, xid) ( ((PageHeader) (page))->pd_prune_xid = xid) +extern bool GinPageIsRecyclable(Page page); + +/* + * We use our own ItemPointerGet(BlockNumber|OffsetNumber) + * to avoid Asserts, since sometimes the ip_posid isn't "valid" + */ +#define GinItemPointerGetBlockNumber(pointer) \ + (ItemPointerGetBlockNumberNoCheck(pointer)) + +#define GinItemPointerGetOffsetNumber(pointer) \ + (ItemPointerGetOffsetNumberNoCheck(pointer)) + +#define GinItemPointerSetBlockNumber(pointer, blkno) \ + (ItemPointerSetBlockNumber((pointer), (blkno))) + +#define GinItemPointerSetOffsetNumber(pointer, offnum) \ + (ItemPointerSetOffsetNumber((pointer), (offnum))) + + +/* + * Special-case item pointer values needed by the GIN search logic. + * MIN: sorts less than any valid item pointer + * MAX: sorts greater than any valid item pointer + * LOSSY PAGE: indicates a whole heap page, sorts after normal item + * pointers for that page + * Note that these are all distinguishable from an "invalid" item pointer + * (which is InvalidBlockNumber/0) as well as from all normal item + * pointers (which have item numbers in the range 1..MaxHeapTuplesPerPage). 
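GinPageGetMeta together with the fixed GIN_METAPAGE_BLKNO is all that is needed to read the statistics fields described above. The sketch below mirrors what ginGetStats does and is shown only to make the metapage layout concrete, using the usual share-lock discipline; the function name is illustrative.

#include "postgres.h"
#include "access/gin_private.h"
#include "storage/bufmgr.h"

/* Illustrative: copy planner statistics out of the GIN metapage. */
static void
read_gin_meta(Relation index, GinStatsData *stats)
{
    Buffer      metabuffer;
    GinMetaPageData *metadata;

    metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO);
    LockBuffer(metabuffer, GIN_SHARE);
    metadata = GinPageGetMeta(BufferGetPage(metabuffer));

    stats->nPendingPages = metadata->nPendingPages;
    stats->nTotalPages = metadata->nTotalPages;
    stats->nEntryPages = metadata->nEntryPages;
    stats->nDataPages = metadata->nDataPages;
    stats->nEntries = metadata->nEntries;
    stats->ginVersion = metadata->ginVersion;

    UnlockReleaseBuffer(metabuffer);
}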
+ */ +#define ItemPointerSetMin(p) \ + ItemPointerSet((p), (BlockNumber)0, (OffsetNumber)0) +#define ItemPointerIsMin(p) \ + (GinItemPointerGetOffsetNumber(p) == (OffsetNumber)0 && \ + GinItemPointerGetBlockNumber(p) == (BlockNumber)0) +#define ItemPointerSetMax(p) \ + ItemPointerSet((p), InvalidBlockNumber, (OffsetNumber)0xffff) +#define ItemPointerSetLossyPage(p, b) \ + ItemPointerSet((p), (b), (OffsetNumber)0xffff) +#define ItemPointerIsLossyPage(p) \ + (GinItemPointerGetOffsetNumber(p) == (OffsetNumber)0xffff && \ + GinItemPointerGetBlockNumber(p) != InvalidBlockNumber) + +/* + * Posting item in a non-leaf posting-tree page + */ +typedef struct +{ + /* We use BlockIdData not BlockNumber to avoid padding space wastage */ + BlockIdData child_blkno; + ItemPointerData key; +} PostingItem; + +#define PostingItemGetBlockNumber(pointer) \ + BlockIdGetBlockNumber(&(pointer)->child_blkno) + +#define PostingItemSetBlockNumber(pointer, blockNumber) \ + BlockIdSet(&((pointer)->child_blkno), (blockNumber)) + +/* + * Category codes to distinguish placeholder nulls from ordinary NULL keys. + * + * The first two code values were chosen to be compatible with the usual usage + * of bool isNull flags. However, casting between bool and GinNullCategory is + * risky because of the possibility of different bit patterns and type sizes, + * so it is no longer done. + * + * GIN_CAT_EMPTY_QUERY is never stored in the index; and notice that it is + * chosen to sort before not after regular key values. + */ +typedef signed char GinNullCategory; + +#define GIN_CAT_NORM_KEY 0 /* normal, non-null key value */ +#define GIN_CAT_NULL_KEY 1 /* null key value */ +#define GIN_CAT_EMPTY_ITEM 2 /* placeholder for zero-key item */ +#define GIN_CAT_NULL_ITEM 3 /* placeholder for null item */ +#define GIN_CAT_EMPTY_QUERY (-1) /* placeholder for full-scan query */ + +/* + * Access macros for null category byte in entry tuples + */ +#define GinCategoryOffset(itup,ginstate) \ + (IndexInfoFindDataOffset((itup)->t_info) + \ + ((ginstate)->oneCol ? 0 : sizeof(int16))) +#define GinGetNullCategory(itup,ginstate) \ + (*((GinNullCategory *) ((char*)(itup) + GinCategoryOffset(itup,ginstate)))) +#define GinSetNullCategory(itup,ginstate,c) \ + (*((GinNullCategory *) ((char*)(itup) + GinCategoryOffset(itup,ginstate))) = (c)) + +/* + * Access macros for leaf-page entry tuples (see discussion in README) + */ +#define GinGetNPosting(itup) GinItemPointerGetOffsetNumber(&(itup)->t_tid) +#define GinSetNPosting(itup,n) ItemPointerSetOffsetNumber(&(itup)->t_tid,n) +#define GIN_TREE_POSTING ((OffsetNumber)0xffff) +#define GinIsPostingTree(itup) (GinGetNPosting(itup) == GIN_TREE_POSTING) +#define GinSetPostingTree(itup, blkno) ( GinSetNPosting((itup),GIN_TREE_POSTING), ItemPointerSetBlockNumber(&(itup)->t_tid, blkno) ) +#define GinGetPostingTree(itup) GinItemPointerGetBlockNumber(&(itup)->t_tid) + +#define GIN_ITUP_COMPRESSED (1U << 31) +#define GinGetPostingOffset(itup) (GinItemPointerGetBlockNumber(&(itup)->t_tid) & (~GIN_ITUP_COMPRESSED)) +#define GinSetPostingOffset(itup,n) ItemPointerSetBlockNumber(&(itup)->t_tid,(n)|GIN_ITUP_COMPRESSED) +#define GinGetPosting(itup) ((Pointer) ((char*)(itup) + GinGetPostingOffset(itup))) +#define GinItupIsCompressed(itup) ((GinItemPointerGetBlockNumber(&(itup)->t_tid) & GIN_ITUP_COMPRESSED) != 0) + +/* + * Maximum size of an item on entry tree page. Make sure that we fit at least + * three items on each page. (On regular B-tree indexes, we must fit at least + * three items: two data items and the "high key". 
In GIN entry tree, we don't + * currently store the high key explicitly, we just use the rightmost item on + * the page, so it would actually be enough to fit two items.) + */ +#define GinMaxItemSize \ + Min(INDEX_SIZE_MASK, \ + MAXALIGN_DOWN(((BLCKSZ - \ + MAXALIGN(SizeOfPageHeaderData + 3 * sizeof(ItemIdData)) - \ + MAXALIGN(sizeof(GinPageOpaqueData))) / 3))) + +/* + * Access macros for non-leaf entry tuples + */ +#define GinGetDownlink(itup) GinItemPointerGetBlockNumber(&(itup)->t_tid) +#define GinSetDownlink(itup,blkno) ItemPointerSet(&(itup)->t_tid, blkno, InvalidOffsetNumber) + + +/* + * Data (posting tree) pages + * + * Posting tree pages don't store regular tuples. Non-leaf pages contain + * PostingItems, which are pairs of ItemPointers and child block numbers. + * Leaf pages contain GinPostingLists and an uncompressed array of item + * pointers. + * + * In a leaf page, the compressed posting lists are stored after the regular + * page header, one after each other. Although we don't store regular tuples, + * pd_lower is used to indicate the end of the posting lists. After that, free + * space follows. This layout is compatible with the "standard" heap and + * index page layout described in bufpage.h, so that we can e.g set buffer_std + * when writing WAL records. + * + * In the special space is the GinPageOpaque struct. + */ +#define GinDataLeafPageGetPostingList(page) \ + (GinPostingList *) ((PageGetContents(page) + MAXALIGN(sizeof(ItemPointerData)))) +#define GinDataLeafPageGetPostingListSize(page) \ + (((PageHeader) page)->pd_lower - MAXALIGN(SizeOfPageHeaderData) - MAXALIGN(sizeof(ItemPointerData))) + +#define GinDataLeafPageIsEmpty(page) \ + (GinPageIsCompressed(page) ? (GinDataLeafPageGetPostingListSize(page) == 0) : (GinPageGetOpaque(page)->maxoff < FirstOffsetNumber)) + +#define GinDataLeafPageGetFreeSpace(page) PageGetExactFreeSpace(page) + +#define GinDataPageGetRightBound(page) ((ItemPointer) PageGetContents(page)) +/* + * Pointer to the data portion of a posting tree page. For internal pages, + * that's the beginning of the array of PostingItems. For compressed leaf + * pages, the first compressed posting list. For uncompressed (pre-9.4) leaf + * pages, it's the beginning of the ItemPointer array. + */ +#define GinDataPageGetData(page) \ + (PageGetContents(page) + MAXALIGN(sizeof(ItemPointerData))) +/* non-leaf pages contain PostingItems */ +#define GinDataPageGetPostingItem(page, i) \ + ((PostingItem *) (GinDataPageGetData(page) + ((i)-1) * sizeof(PostingItem))) + +/* + * Note: there is no GinDataPageGetDataSize macro, because before version + * 9.4, we didn't set pd_lower on data pages. There can be pages in the index + * that were binary-upgraded from earlier versions and still have an invalid + * pd_lower, so we cannot trust it in general. Compressed posting tree leaf + * pages are new in 9.4, however, so we can trust them; see + * GinDataLeafPageGetPostingListSize. 
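GinDataPageGetPostingItem and GinPageGetOpaque(page)->maxoff are how internal posting-tree pages are walked. The sketch below shows a descent step in its simplest form; it is a hedged, linear-scan illustration only (the real code binary-searches and also consults the page's right bound), and the helper name is made up.

#include "postgres.h"
#include "access/gin_private.h"

/* Illustrative: pick the child of an internal posting-tree page for a given TID. */
static BlockNumber
choose_posting_tree_child(Page page, ItemPointer item)
{
    OffsetNumber i;
    OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
    PostingItem *pitem = NULL;

    Assert(GinPageIsData(page) && !GinPageIsLeaf(page));
    Assert(maxoff >= FirstOffsetNumber);

    for (i = FirstOffsetNumber; i <= maxoff; i++)
    {
        pitem = GinDataPageGetPostingItem(page, i);

        /* pitem->key is an upper bound for the TIDs stored under this child */
        if (ginCompareItemPointers(item, &pitem->key) <= 0)
            break;
    }

    /* past the last key we simply fall back to the rightmost child */
    return PostingItemGetBlockNumber(pitem);
}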
+ */ +#define GinDataPageSetDataSize(page, size) \ + { \ + Assert(size <= GinDataPageMaxDataSize); \ + ((PageHeader) page)->pd_lower = (size) + MAXALIGN(SizeOfPageHeaderData) + MAXALIGN(sizeof(ItemPointerData)); \ + } + +#define GinNonLeafDataPageGetFreeSpace(page) \ + (GinDataPageMaxDataSize - \ + GinPageGetOpaque(page)->maxoff * sizeof(PostingItem)) + +#define GinDataPageMaxDataSize \ + (BLCKSZ - MAXALIGN(SizeOfPageHeaderData) \ + - MAXALIGN(sizeof(ItemPointerData)) \ + - MAXALIGN(sizeof(GinPageOpaqueData))) + +/* + * List pages + */ +#define GinListPageSize \ + ( BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(GinPageOpaqueData)) ) + +/* + * A compressed posting list. + * + * Note: This requires 2-byte alignment. + */ +typedef struct +{ + ItemPointerData first; /* first item in this posting list (unpacked) */ + uint16 nbytes; /* number of bytes that follow */ + unsigned char bytes[FLEXIBLE_ARRAY_MEMBER]; /* varbyte encoded items */ +} GinPostingList; + +#define SizeOfGinPostingList(plist) (offsetof(GinPostingList, bytes) + SHORTALIGN((plist)->nbytes) ) +#define GinNextPostingListSegment(cur) ((GinPostingList *) (((char *) (cur)) + SizeOfGinPostingList((cur)))) + +#endif /* GINBLOCK_H */ diff --git a/src/include/access/ginxlog.h b/src/include/access/ginxlog.h new file mode 100644 index 0000000..37095e5 --- /dev/null +++ b/src/include/access/ginxlog.h @@ -0,0 +1,216 @@ +/*-------------------------------------------------------------------------- + * ginxlog.h + * header file for postgres inverted index xlog implementation. + * + * Copyright (c) 2006-2023, PostgreSQL Global Development Group + * + * src/include/access/ginxlog.h + *-------------------------------------------------------------------------- + */ +#ifndef GINXLOG_H +#define GINXLOG_H + +#include "access/ginblock.h" +#include "access/itup.h" +#include "access/xlogreader.h" +#include "lib/stringinfo.h" +#include "storage/off.h" + +#define XLOG_GIN_CREATE_PTREE 0x10 + +typedef struct ginxlogCreatePostingTree +{ + uint32 size; + /* A compressed posting list follows */ +} ginxlogCreatePostingTree; + +/* + * The format of the insertion record varies depending on the page type. + * ginxlogInsert is the common part between all variants. + * + * Backup Blk 0: target page + * Backup Blk 1: left child, if this insertion finishes an incomplete split + */ + +#define XLOG_GIN_INSERT 0x20 + +typedef struct +{ + uint16 flags; /* GIN_INSERT_ISLEAF and/or GIN_INSERT_ISDATA */ + + /* + * FOLLOWS: + * + * 1. if not leaf page, block numbers of the left and right child pages + * whose split this insertion finishes, as BlockIdData[2] (beware of + * adding fields in this struct that would make them not 16-bit aligned) + * + * 2. a ginxlogInsertEntry or ginxlogRecompressDataLeaf struct, depending + * on tree type. + * + * NB: the below structs are only 16-bit aligned when appended to a + * ginxlogInsert struct! Beware of adding fields to them that require + * stricter alignment. + */ +} ginxlogInsert; + +typedef struct +{ + OffsetNumber offset; + bool isDelete; + IndexTupleData tuple; /* variable length */ +} ginxlogInsertEntry; + + +typedef struct +{ + uint16 nactions; + + /* Variable number of 'actions' follow */ +} ginxlogRecompressDataLeaf; + +/* + * Note: this struct is currently not used in code, and only acts as + * documentation. The WAL record format is as specified here, but the code + * uses straight access through a Pointer and memcpy to read/write these. 
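Stepping back to the GinPostingList segment format defined just above in ginblock.h: the encode and decode entry points declared in gin_private.h make a simple round trip, sketched below under the assumption that the input TIDs are already sorted (which the real callers guarantee); the 256-byte segment limit and the function name are illustrative.

#include "postgres.h"
#include "access/gin_private.h"

/* Illustrative round trip through the varbyte posting-list encoding. */
static void
posting_list_roundtrip(ItemPointerData *sorted_tids, int ntids)
{
    GinPostingList *seg;
    ItemPointer decoded;
    int         nwritten;
    int         ndecoded;

    /* encode as many leading TIDs as fit into one segment of at most 256 bytes */
    seg = ginCompressPostingList(sorted_tids, ntids, 256, &nwritten);

    /* decode the segment back into a palloc'd TID array */
    decoded = ginPostingListDecode(seg, &ndecoded);
    Assert(ndecoded == nwritten);

    pfree(decoded);
    pfree(seg);
}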
+ */ +typedef struct +{ + uint8 segno; /* segment this action applies to */ + char type; /* action type (see below) */ + + /* + * Action-specific data follows. For INSERT and REPLACE actions that is a + * GinPostingList struct. For ADDITEMS, a uint16 for the number of items + * added, followed by the items themselves as ItemPointers. DELETE actions + * have no further data. + */ +} ginxlogSegmentAction; + +/* Action types */ +#define GIN_SEGMENT_UNMODIFIED 0 /* no action (not used in WAL records) */ +#define GIN_SEGMENT_DELETE 1 /* a whole segment is removed */ +#define GIN_SEGMENT_INSERT 2 /* a whole segment is added */ +#define GIN_SEGMENT_REPLACE 3 /* a segment is replaced */ +#define GIN_SEGMENT_ADDITEMS 4 /* items are added to existing segment */ + +typedef struct +{ + OffsetNumber offset; + PostingItem newitem; +} ginxlogInsertDataInternal; + +/* + * Backup Blk 0: new left page (= original page, if not root split) + * Backup Blk 1: new right page + * Backup Blk 2: original page / new root page, if root split + * Backup Blk 3: left child, if this insertion completes an earlier split + */ +#define XLOG_GIN_SPLIT 0x30 + +typedef struct ginxlogSplit +{ + RelFileLocator locator; + BlockNumber rrlink; /* right link, or root's blocknumber if root + * split */ + BlockNumber leftChildBlkno; /* valid on a non-leaf split */ + BlockNumber rightChildBlkno; + uint16 flags; /* see below */ +} ginxlogSplit; + +/* + * Flags used in ginxlogInsert and ginxlogSplit records + */ +#define GIN_INSERT_ISDATA 0x01 /* for both insert and split records */ +#define GIN_INSERT_ISLEAF 0x02 /* ditto */ +#define GIN_SPLIT_ROOT 0x04 /* only for split records */ + +/* + * Vacuum simply WAL-logs the whole page, when anything is modified. This + * is functionally identical to XLOG_FPI records, but is kept separate for + * debugging purposes. (When inspecting the WAL stream, it's easier to see + * what's going on when GIN vacuum records are marked as such, not as heap + * records.) This is currently only used for entry tree leaf pages. + */ +#define XLOG_GIN_VACUUM_PAGE 0x40 + +/* + * Vacuuming posting tree leaf page is WAL-logged like recompression caused + * by insertion. 
+ */ +#define XLOG_GIN_VACUUM_DATA_LEAF_PAGE 0x90 + +typedef struct ginxlogVacuumDataLeafPage +{ + ginxlogRecompressDataLeaf data; +} ginxlogVacuumDataLeafPage; + +/* + * Backup Blk 0: deleted page + * Backup Blk 1: parent + * Backup Blk 2: left sibling + */ +#define XLOG_GIN_DELETE_PAGE 0x50 + +typedef struct ginxlogDeletePage +{ + OffsetNumber parentOffset; + BlockNumber rightLink; + TransactionId deleteXid; /* last Xid which could see this page in scan */ +} ginxlogDeletePage; + +#define XLOG_GIN_UPDATE_META_PAGE 0x60 + +/* + * Backup Blk 0: metapage + * Backup Blk 1: tail page + */ +typedef struct ginxlogUpdateMeta +{ + RelFileLocator locator; + GinMetaPageData metadata; + BlockNumber prevTail; + BlockNumber newRightlink; + int32 ntuples; /* if ntuples > 0 then metadata.tail was + * updated with that many tuples; else new sub + * list was inserted */ + /* array of inserted tuples follows */ +} ginxlogUpdateMeta; + +#define XLOG_GIN_INSERT_LISTPAGE 0x70 + +typedef struct ginxlogInsertListPage +{ + BlockNumber rightlink; + int32 ntuples; + /* array of inserted tuples follows */ +} ginxlogInsertListPage; + +/* + * Backup Blk 0: metapage + * Backup Blk 1 to (ndeleted + 1): deleted pages + */ + +#define XLOG_GIN_DELETE_LISTPAGE 0x80 + +/* + * The WAL record for deleting list pages must contain a block reference to + * all the deleted pages, so the number of pages that can be deleted in one + * record is limited by XLR_MAX_BLOCK_ID. (block_id 0 is used for the + * metapage.) + */ +#define GIN_NDELETE_AT_ONCE Min(16, XLR_MAX_BLOCK_ID - 1) +typedef struct ginxlogDeleteListPages +{ + GinMetaPageData metadata; + int32 ndeleted; +} ginxlogDeleteListPages; + +extern void gin_redo(XLogReaderState *record); +extern void gin_desc(StringInfo buf, XLogReaderState *record); +extern const char *gin_identify(uint8 info); +extern void gin_xlog_startup(void); +extern void gin_xlog_cleanup(void); +extern void gin_mask(char *pagedata, BlockNumber blkno); + +#endif /* GINXLOG_H */ diff --git a/src/include/access/gist.h b/src/include/access/gist.h new file mode 100644 index 0000000..0235716 --- /dev/null +++ b/src/include/access/gist.h @@ -0,0 +1,248 @@ +/*------------------------------------------------------------------------- + * + * gist.h + * The public API for GiST indexes. This API is exposed to + * individuals implementing GiST indexes, so backward-incompatible + * changes should be made with care. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/gist.h + * + *------------------------------------------------------------------------- + */ +#ifndef GIST_H +#define GIST_H + +#include "access/itup.h" +#include "access/transam.h" +#include "access/xlog.h" +#include "access/xlogdefs.h" +#include "storage/block.h" +#include "storage/bufpage.h" +#include "utils/relcache.h" + +/* + * amproc indexes for GiST indexes. + */ +#define GIST_CONSISTENT_PROC 1 +#define GIST_UNION_PROC 2 +#define GIST_COMPRESS_PROC 3 +#define GIST_DECOMPRESS_PROC 4 +#define GIST_PENALTY_PROC 5 +#define GIST_PICKSPLIT_PROC 6 +#define GIST_EQUAL_PROC 7 +#define GIST_DISTANCE_PROC 8 +#define GIST_FETCH_PROC 9 +#define GIST_OPTIONS_PROC 10 +#define GIST_SORTSUPPORT_PROC 11 +#define GISTNProcs 11 + +/* + * Page opaque data in a GiST index page. 
+ */ +#define F_LEAF (1 << 0) /* leaf page */ +#define F_DELETED (1 << 1) /* the page has been deleted */ +#define F_TUPLES_DELETED (1 << 2) /* some tuples on the page were + * deleted */ +#define F_FOLLOW_RIGHT (1 << 3) /* page to the right has no downlink */ +#define F_HAS_GARBAGE (1 << 4) /* some tuples on the page are dead, + * but not deleted yet */ + +/* + * NSN (node sequence number) is a special-purpose LSN which is stored on each + * index page in GISTPageOpaqueData and updated only during page splits. By + * recording the parent's LSN in GISTSearchItem.parentlsn, it is possible to + * detect concurrent child page splits by checking if parentlsn < child's NSN, + * and handle them properly. The child page's LSN is insufficient for this + * purpose since it is updated for every page change. + */ +typedef XLogRecPtr GistNSN; + +/* + * A fake LSN / NSN value used during index builds. Must be smaller than any + * real or fake (unlogged) LSN generated after the index build completes so + * that all splits are considered complete. + */ +#define GistBuildLSN ((XLogRecPtr) 1) + +/* + * For on-disk compatibility with pre-9.3 servers, NSN is stored as two + * 32-bit fields on disk, same as LSNs. + */ +typedef PageXLogRecPtr PageGistNSN; + +typedef struct GISTPageOpaqueData +{ + PageGistNSN nsn; /* this value must change on page split */ + BlockNumber rightlink; /* next page if any */ + uint16 flags; /* see bit definitions above */ + uint16 gist_page_id; /* for identification of GiST indexes */ +} GISTPageOpaqueData; + +typedef GISTPageOpaqueData *GISTPageOpaque; + +/* + * Maximum possible sizes for GiST index tuple and index key. Calculation is + * based on assumption that GiST page should fit at least 4 tuples. In theory, + * GiST index can be functional when page can fit 3 tuples. But that seems + * rather inefficient, so we use a bit conservative estimate. + * + * The maximum size of index key is true for unicolumn index. Therefore, this + * estimation should be used to figure out which maximum size of GiST index key + * makes sense at all. For multicolumn indexes, user might be able to tune + * key size using opclass parameters. + */ +#define GISTMaxIndexTupleSize \ + MAXALIGN_DOWN((BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)) / \ + 4 - sizeof(ItemIdData)) + +#define GISTMaxIndexKeySize \ + (GISTMaxIndexTupleSize - MAXALIGN(sizeof(IndexTupleData))) + +/* + * The page ID is for the convenience of pg_filedump and similar utilities, + * which otherwise would have a hard time telling pages of different index + * types apart. It should be the last 2 bytes on the page. This is more or + * less "free" due to alignment considerations. + */ +#define GIST_PAGE_ID 0xFF81 + +/* + * This is the Split Vector to be returned by the PickSplit method. + * PickSplit should fill the indexes of tuples to go to the left side into + * spl_left[], and those to go to the right into spl_right[] (note the method + * is responsible for palloc'ing both of these arrays!). The tuple counts + * go into spl_nleft/spl_nright, and spl_ldatum/spl_rdatum must be set to + * the union keys for each side. + * + * If spl_ldatum_exists and spl_rdatum_exists are true, then we are performing + * a "secondary split" using a non-first index column. In this case some + * decisions have already been made about a page split, and the set of tuples + * being passed to PickSplit is just the tuples about which we are undecided. 
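The NSN scheme described above boils down to one comparison, combined with the F_FOLLOW_RIGHT flag: if either fires, tuples that belong under the downlink we followed may have moved to a right sibling. A hedged helper form of that check follows, using the GistFollowRight and GistPageGetNSN accessors defined later in this header; the real scan and insert code embeds the test inline with extra guards, and the function name here is made up.

#include "postgres.h"
#include "access/gist.h"

/* Illustrative: did 'page' split after we read its downlink from the parent? */
static bool
page_split_since_parent_read(Page page, GistNSN parentlsn)
{
    /*
     * F_FOLLOW_RIGHT means the new right sibling has no downlink yet;
     * an NSN newer than the parent's LSN (as we saw it) means a completed
     * split that the parent page we read did not yet reflect.
     */
    return GistFollowRight(page) || GistPageGetNSN(page) > parentlsn;
}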
+ * spl_ldatum/spl_rdatum then contain the union keys for the tuples already + * chosen to go left or right. Ideally the PickSplit method should take those + * keys into account while deciding what to do with the remaining tuples, ie + * it should try to "build out" from those unions so as to minimally expand + * them. If it does so, it should union the given tuples' keys into the + * existing spl_ldatum/spl_rdatum values rather than just setting those values + * from scratch, and then set spl_ldatum_exists/spl_rdatum_exists to false to + * show it has done this. + * + * If the PickSplit method fails to clear spl_ldatum_exists/spl_rdatum_exists, + * the core GiST code will make its own decision about how to merge the + * secondary-split results with the previously-chosen tuples, and will then + * recompute the union keys from scratch. This is a workable though often not + * optimal approach. + */ +typedef struct GIST_SPLITVEC +{ + OffsetNumber *spl_left; /* array of entries that go left */ + int spl_nleft; /* size of this array */ + Datum spl_ldatum; /* Union of keys in spl_left */ + bool spl_ldatum_exists; /* true, if spl_ldatum already exists. */ + + OffsetNumber *spl_right; /* array of entries that go right */ + int spl_nright; /* size of the array */ + Datum spl_rdatum; /* Union of keys in spl_right */ + bool spl_rdatum_exists; /* true, if spl_rdatum already exists. */ +} GIST_SPLITVEC; + +/* + * An entry on a GiST node. Contains the key, as well as its own + * location (rel,page,offset) which can supply the matching pointer. + * leafkey is a flag to tell us if the entry is in a leaf node. + */ +typedef struct GISTENTRY +{ + Datum key; + Relation rel; + Page page; + OffsetNumber offset; + bool leafkey; +} GISTENTRY; + +#define GistPageGetOpaque(page) ( (GISTPageOpaque) PageGetSpecialPointer(page) ) + +#define GistPageIsLeaf(page) ( GistPageGetOpaque(page)->flags & F_LEAF) +#define GIST_LEAF(entry) (GistPageIsLeaf((entry)->page)) + +#define GistPageIsDeleted(page) ( GistPageGetOpaque(page)->flags & F_DELETED) + +#define GistTuplesDeleted(page) ( GistPageGetOpaque(page)->flags & F_TUPLES_DELETED) +#define GistMarkTuplesDeleted(page) ( GistPageGetOpaque(page)->flags |= F_TUPLES_DELETED) +#define GistClearTuplesDeleted(page) ( GistPageGetOpaque(page)->flags &= ~F_TUPLES_DELETED) + +#define GistPageHasGarbage(page) ( GistPageGetOpaque(page)->flags & F_HAS_GARBAGE) +#define GistMarkPageHasGarbage(page) ( GistPageGetOpaque(page)->flags |= F_HAS_GARBAGE) +#define GistClearPageHasGarbage(page) ( GistPageGetOpaque(page)->flags &= ~F_HAS_GARBAGE) + +#define GistFollowRight(page) ( GistPageGetOpaque(page)->flags & F_FOLLOW_RIGHT) +#define GistMarkFollowRight(page) ( GistPageGetOpaque(page)->flags |= F_FOLLOW_RIGHT) +#define GistClearFollowRight(page) ( GistPageGetOpaque(page)->flags &= ~F_FOLLOW_RIGHT) + +#define GistPageGetNSN(page) ( PageXLogRecPtrGet(GistPageGetOpaque(page)->nsn)) +#define GistPageSetNSN(page, val) ( PageXLogRecPtrSet(GistPageGetOpaque(page)->nsn, val)) + + +/* + * On a deleted page, we store this struct. A deleted page doesn't contain any + * tuples, so we don't use the normal page layout with line pointers. Instead, + * this struct is stored right after the standard page header. pd_lower points + * to the end of this struct. If we add fields to this struct in the future, we + * can distinguish the old and new formats by pd_lower. 
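Every GiST support function works in terms of the GISTENTRY struct above. As a minimal, concrete example of that calling convention, here is a hedged sketch of a penalty method; the name is hypothetical and the zero result is a placeholder where a real opclass would compute a data-type-specific growth measure (such as the increase in bounding-box area).

#include "postgres.h"
#include "access/gist.h"
#include "fmgr.h"

PG_FUNCTION_INFO_V1(my_penalty);

/* Illustrative penalty method: how much must origentry grow to absorb newentry? */
Datum
my_penalty(PG_FUNCTION_ARGS)
{
    GISTENTRY  *origentry = (GISTENTRY *) PG_GETARG_POINTER(0);
    GISTENTRY  *newentry = (GISTENTRY *) PG_GETARG_POINTER(1);
    float      *result = (float *) PG_GETARG_POINTER(2);

    /* origentry->key and newentry->key hold the (already decompressed) keys */
    (void) origentry;
    (void) newentry;

    *result = 0.0f;             /* placeholder: a real metric goes here */

    PG_RETURN_POINTER(result);
}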
+ */ +typedef struct GISTDeletedPageContents +{ + /* last xid which could see the page in a scan */ + FullTransactionId deleteXid; +} GISTDeletedPageContents; + +static inline void +GistPageSetDeleted(Page page, FullTransactionId deletexid) +{ + Assert(PageIsEmpty(page)); + + GistPageGetOpaque(page)->flags |= F_DELETED; + ((PageHeader) page)->pd_lower = MAXALIGN(SizeOfPageHeaderData) + sizeof(GISTDeletedPageContents); + + ((GISTDeletedPageContents *) PageGetContents(page))->deleteXid = deletexid; +} + +static inline FullTransactionId +GistPageGetDeleteXid(Page page) +{ + Assert(GistPageIsDeleted(page)); + + /* Is the deleteXid field present? */ + if (((PageHeader) page)->pd_lower >= MAXALIGN(SizeOfPageHeaderData) + + offsetof(GISTDeletedPageContents, deleteXid) + sizeof(FullTransactionId)) + { + return ((GISTDeletedPageContents *) PageGetContents(page))->deleteXid; + } + else + return FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId); +} + +/* + * Vector of GISTENTRY structs; user-defined methods union and picksplit + * take it as one of their arguments + */ +typedef struct +{ + int32 n; /* number of elements */ + GISTENTRY vector[FLEXIBLE_ARRAY_MEMBER]; +} GistEntryVector; + +#define GEVHDRSZ (offsetof(GistEntryVector, vector)) + +/* + * macro to initialize a GISTENTRY + */ +#define gistentryinit(e, k, r, pg, o, l) \ + do { (e).key = (k); (e).rel = (r); (e).page = (pg); \ + (e).offset = (o); (e).leafkey = (l); } while (0) + +#endif /* GIST_H */ diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h new file mode 100644 index 0000000..3edc740 --- /dev/null +++ b/src/include/access/gist_private.h @@ -0,0 +1,571 @@ +/*------------------------------------------------------------------------- + * + * gist_private.h + * private declarations for GiST -- declarations related to the + * internal implementation of GiST, not the public API + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/gist_private.h + * + *------------------------------------------------------------------------- + */ +#ifndef GIST_PRIVATE_H +#define GIST_PRIVATE_H + +#include "access/amapi.h" +#include "access/gist.h" +#include "access/itup.h" +#include "lib/pairingheap.h" +#include "storage/bufmgr.h" +#include "storage/buffile.h" +#include "utils/hsearch.h" +#include "access/genam.h" + +/* + * Maximum number of "halves" a page can be split into in one operation. + * Typically a split produces 2 halves, but can be more if keys have very + * different lengths, or when inserting multiple keys in one operation (as + * when inserting downlinks to an internal node). There is no theoretical + * limit on this, but in practice if you get more than a handful page halves + * in one split, there's something wrong with the opclass implementation. + * GIST_MAX_SPLIT_PAGES is an arbitrary limit on that, used to size some + * local arrays used during split. Note that there is also a limit on the + * number of buffers that can be held locked at a time, MAX_SIMUL_LWLOCKS, + * so if you raise this higher than that limit, you'll just get a different + * error. 
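gistentryinit above is mostly exercised from an opclass's compress, decompress and fetch methods. A hedged compress sketch follows for the common case where the leaf datum is already stored as-is, so the method only re-wraps it with leafkey set to false; an opclass with a distinct on-disk representation would convert entry->key before calling gistentryinit. The function name is hypothetical.

#include "postgres.h"
#include "access/gist.h"
#include "fmgr.h"

PG_FUNCTION_INFO_V1(my_compress);

/* Illustrative compress method for a pass-through key representation. */
Datum
my_compress(PG_FUNCTION_ARGS)
{
    GISTENTRY  *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
    GISTENTRY  *retval = entry;

    if (entry->leafkey)
    {
        retval = (GISTENTRY *) palloc(sizeof(GISTENTRY));
        gistentryinit(*retval, entry->key,
                      entry->rel, entry->page, entry->offset, false);
    }

    PG_RETURN_POINTER(retval);
}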
+ */ +#define GIST_MAX_SPLIT_PAGES 75 + +/* Buffer lock modes */ +#define GIST_SHARE BUFFER_LOCK_SHARE +#define GIST_EXCLUSIVE BUFFER_LOCK_EXCLUSIVE +#define GIST_UNLOCK BUFFER_LOCK_UNLOCK + +typedef struct +{ + BlockNumber prev; + uint32 freespace; + char tupledata[FLEXIBLE_ARRAY_MEMBER]; +} GISTNodeBufferPage; + +#define BUFFER_PAGE_DATA_OFFSET MAXALIGN(offsetof(GISTNodeBufferPage, tupledata)) +/* Returns free space in node buffer page */ +#define PAGE_FREE_SPACE(nbp) (nbp->freespace) +/* Checks if node buffer page is empty */ +#define PAGE_IS_EMPTY(nbp) (nbp->freespace == BLCKSZ - BUFFER_PAGE_DATA_OFFSET) +/* Checks if node buffers page don't contain sufficient space for index tuple */ +#define PAGE_NO_SPACE(nbp, itup) (PAGE_FREE_SPACE(nbp) < \ + MAXALIGN(IndexTupleSize(itup))) + +/* + * GISTSTATE: information needed for any GiST index operation + * + * This struct retains call info for the index's opclass-specific support + * functions (per index column), plus the index's tuple descriptor. + * + * scanCxt holds the GISTSTATE itself as well as any data that lives for the + * lifetime of the index operation. We pass this to the support functions + * via fn_mcxt, so that they can store scan-lifespan data in it. The + * functions are invoked in tempCxt, which is typically short-lifespan + * (that is, it's reset after each tuple). However, tempCxt can be the same + * as scanCxt if we're not bothering with per-tuple context resets. + */ +typedef struct GISTSTATE +{ + MemoryContext scanCxt; /* context for scan-lifespan data */ + MemoryContext tempCxt; /* short-term context for calling functions */ + + TupleDesc leafTupdesc; /* index's tuple descriptor */ + TupleDesc nonLeafTupdesc; /* truncated tuple descriptor for non-leaf + * pages */ + TupleDesc fetchTupdesc; /* tuple descriptor for tuples returned in an + * index-only scan */ + + FmgrInfo consistentFn[INDEX_MAX_KEYS]; + FmgrInfo unionFn[INDEX_MAX_KEYS]; + FmgrInfo compressFn[INDEX_MAX_KEYS]; + FmgrInfo decompressFn[INDEX_MAX_KEYS]; + FmgrInfo penaltyFn[INDEX_MAX_KEYS]; + FmgrInfo picksplitFn[INDEX_MAX_KEYS]; + FmgrInfo equalFn[INDEX_MAX_KEYS]; + FmgrInfo distanceFn[INDEX_MAX_KEYS]; + FmgrInfo fetchFn[INDEX_MAX_KEYS]; + + /* Collations to pass to the support functions */ + Oid supportCollation[INDEX_MAX_KEYS]; +} GISTSTATE; + + +/* + * During a GiST index search, we must maintain a queue of unvisited items, + * which can be either individual heap tuples or whole index pages. If it + * is an ordered search, the unvisited items should be visited in distance + * order. Unvisited items at the same distance should be visited in + * depth-first order, that is heap items first, then lower index pages, then + * upper index pages; this rule avoids doing extra work during a search that + * ends early due to LIMIT. + * + * To perform an ordered search, we use a pairing heap to manage the + * distance-order queue. In a non-ordered search (no order-by operators), + * we use it to return heap tuples before unvisited index pages, to + * ensure depth-first order, but all entries are otherwise considered + * equal. 
+ */ + +/* Individual heap tuple to be visited */ +typedef struct GISTSearchHeapItem +{ + ItemPointerData heapPtr; + bool recheck; /* T if quals must be rechecked */ + bool recheckDistances; /* T if distances must be rechecked */ + HeapTuple recontup; /* data reconstructed from the index, used in + * index-only scans */ + OffsetNumber offnum; /* track offset in page to mark tuple as + * LP_DEAD */ +} GISTSearchHeapItem; + +/* Unvisited item, either index page or heap tuple */ +typedef struct GISTSearchItem +{ + pairingheap_node phNode; + BlockNumber blkno; /* index page number, or InvalidBlockNumber */ + union + { + GistNSN parentlsn; /* parent page's LSN, if index page */ + /* we must store parentlsn to detect whether a split occurred */ + GISTSearchHeapItem heap; /* heap info, if heap tuple */ + } data; + + /* numberOfOrderBys entries */ + IndexOrderByDistance distances[FLEXIBLE_ARRAY_MEMBER]; +} GISTSearchItem; + +#define GISTSearchItemIsHeap(item) ((item).blkno == InvalidBlockNumber) + +#define SizeOfGISTSearchItem(n_distances) \ + (offsetof(GISTSearchItem, distances) + \ + sizeof(IndexOrderByDistance) * (n_distances)) + +/* + * GISTScanOpaqueData: private state for a scan of a GiST index + */ +typedef struct GISTScanOpaqueData +{ + GISTSTATE *giststate; /* index information, see above */ + Oid *orderByTypes; /* datatypes of ORDER BY expressions */ + + pairingheap *queue; /* queue of unvisited items */ + MemoryContext queueCxt; /* context holding the queue */ + bool qual_ok; /* false if qual can never be satisfied */ + bool firstCall; /* true until first gistgettuple call */ + + /* pre-allocated workspace arrays */ + IndexOrderByDistance *distances; /* output area for gistindex_keytest */ + + /* info about killed items if any (killedItems is NULL if never used) */ + OffsetNumber *killedItems; /* offset numbers of killed items */ + int numKilled; /* number of currently stored items */ + BlockNumber curBlkno; /* current number of block */ + GistNSN curPageLSN; /* pos in the WAL stream when page was read */ + + /* In a non-ordered search, returnable heap items are stored here: */ + GISTSearchHeapItem pageData[BLCKSZ / sizeof(IndexTupleData)]; + OffsetNumber nPageData; /* number of valid items in array */ + OffsetNumber curPageData; /* next item to return */ + MemoryContext pageDataCxt; /* context holding the fetched tuples, for + * index-only scans */ +} GISTScanOpaqueData; + +typedef GISTScanOpaqueData *GISTScanOpaque; + +/* despite the name, gistxlogPage is not part of any xlog record */ +typedef struct gistxlogPage +{ + BlockNumber blkno; + int num; /* number of index tuples following */ +} gistxlogPage; + +/* SplitedPageLayout - gistSplit function result */ +typedef struct SplitedPageLayout +{ + gistxlogPage block; + IndexTupleData *list; + int lenlist; + IndexTuple itup; /* union key for page */ + Page page; /* to operate */ + Buffer buffer; /* to write after all proceed */ + + struct SplitedPageLayout *next; +} SplitedPageLayout; + +/* + * GISTInsertStack used for locking buffers and transfer arguments during + * insertion + */ +typedef struct GISTInsertStack +{ + /* current page */ + BlockNumber blkno; + Buffer buffer; + Page page; + + /* + * log sequence number from page->lsn to recognize page update and compare + * it with page's nsn to recognize page split + */ + GistNSN lsn; + + /* + * If set, we split the page while descending the tree to find an + * insertion target. 
It means that we need to retry from the parent, + * because the downlink of this page might no longer cover the new key. + */ + bool retry_from_parent; + + /* offset of the downlink in the parent page, that points to this page */ + OffsetNumber downlinkoffnum; + + /* pointer to parent */ + struct GISTInsertStack *parent; +} GISTInsertStack; + +/* Working state and results for multi-column split logic in gistsplit.c */ +typedef struct GistSplitVector +{ + GIST_SPLITVEC splitVector; /* passed to/from user PickSplit method */ + + Datum spl_lattr[INDEX_MAX_KEYS]; /* Union of subkeys in + * splitVector.spl_left */ + bool spl_lisnull[INDEX_MAX_KEYS]; + + Datum spl_rattr[INDEX_MAX_KEYS]; /* Union of subkeys in + * splitVector.spl_right */ + bool spl_risnull[INDEX_MAX_KEYS]; + + bool *spl_dontcare; /* flags tuples which could go to either side + * of the split for zero penalty */ +} GistSplitVector; + +typedef struct +{ + Relation r; + Relation heapRel; + Size freespace; /* free space to be left */ + bool is_build; + + GISTInsertStack *stack; +} GISTInsertState; + +/* root page of a gist index */ +#define GIST_ROOT_BLKNO 0 + +/* + * Before PostgreSQL 9.1, we used to rely on so-called "invalid tuples" on + * inner pages to finish crash recovery of incomplete page splits. If a crash + * happened in the middle of a page split, so that the downlink pointers were + * not yet inserted, crash recovery inserted a special downlink pointer. The + * semantics of an invalid tuple was that it if you encounter one in a scan, + * it must always be followed, because we don't know if the tuples on the + * child page match or not. + * + * We no longer create such invalid tuples, we now mark the left-half of such + * an incomplete split with the F_FOLLOW_RIGHT flag instead, and finish the + * split properly the next time we need to insert on that page. To retain + * on-disk compatibility for the sake of pg_upgrade, we still store 0xffff as + * the offset number of all inner tuples. If we encounter any invalid tuples + * with 0xfffe during insertion, we throw an error, though scans still handle + * them. You should only encounter invalid tuples if you pg_upgrade a pre-9.1 + * gist index which already has invalid tuples in it because of a crash. That + * should be rare, and you are recommended to REINDEX anyway if you have any + * invalid tuples in an index, so throwing an error is as far as we go with + * supporting that. + */ +#define TUPLE_IS_VALID 0xffff +#define TUPLE_IS_INVALID 0xfffe + +#define GistTupleIsInvalid(itup) ( ItemPointerGetOffsetNumber( &((itup)->t_tid) ) == TUPLE_IS_INVALID ) +#define GistTupleSetValid(itup) ItemPointerSetOffsetNumber( &((itup)->t_tid), TUPLE_IS_VALID ) + + + + +/* + * A buffer attached to an internal node, used when building an index in + * buffering mode. + */ +typedef struct +{ + BlockNumber nodeBlocknum; /* index block # this buffer is for */ + int32 blocksCount; /* current # of blocks occupied by buffer */ + + BlockNumber pageBlocknum; /* temporary file block # */ + GISTNodeBufferPage *pageBuffer; /* in-memory buffer page */ + + /* is this buffer queued for emptying? */ + bool queuedForEmptying; + + /* is this a temporary copy, not in the hash table? */ + bool isTemp; + + int level; /* 0 == leaf */ +} GISTNodeBuffer; + +/* + * Does specified level have buffers? (Beware of multiple evaluation of + * arguments.) 
+ */ +#define LEVEL_HAS_BUFFERS(nlevel, gfbb) \ + ((nlevel) != 0 && (nlevel) % (gfbb)->levelStep == 0 && \ + (nlevel) != (gfbb)->rootlevel) + +/* Is specified buffer at least half-filled (should be queued for emptying)? */ +#define BUFFER_HALF_FILLED(nodeBuffer, gfbb) \ + ((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer / 2) + +/* + * Is specified buffer full? Our buffers can actually grow indefinitely, + * beyond the "maximum" size, so this just means whether the buffer has grown + * beyond the nominal maximum size. + */ +#define BUFFER_OVERFLOWED(nodeBuffer, gfbb) \ + ((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer) + +/* + * Data structure with general information about build buffers. + */ +typedef struct GISTBuildBuffers +{ + /* Persistent memory context for the buffers and metadata. */ + MemoryContext context; + + BufFile *pfile; /* Temporary file to store buffers in */ + long nFileBlocks; /* Current size of the temporary file */ + + /* + * resizable array of free blocks. + */ + long *freeBlocks; + int nFreeBlocks; /* # of currently free blocks in the array */ + int freeBlocksLen; /* current allocated length of the array */ + + /* Hash for buffers by block number */ + HTAB *nodeBuffersTab; + + /* List of buffers scheduled for emptying */ + List *bufferEmptyingQueue; + + /* + * Parameters to the buffering build algorithm. levelStep determines which + * levels in the tree have buffers, and pagesPerBuffer determines how + * large each buffer is. + */ + int levelStep; + int pagesPerBuffer; + + /* Array of lists of buffers on each level, for final emptying */ + List **buffersOnLevels; + int buffersOnLevelsLen; + + /* + * Dynamically-sized array of buffers that currently have their last page + * loaded in main memory. + */ + GISTNodeBuffer **loadedBuffers; + int loadedBuffersCount; /* # of entries in loadedBuffers */ + int loadedBuffersLen; /* allocated size of loadedBuffers */ + + /* Level of the current root node (= height of the index tree - 1) */ + int rootlevel; +} GISTBuildBuffers; + +/* GiSTOptions->buffering_mode values */ +typedef enum GistOptBufferingMode +{ + GIST_OPTION_BUFFERING_AUTO, + GIST_OPTION_BUFFERING_ON, + GIST_OPTION_BUFFERING_OFF +} GistOptBufferingMode; + +/* + * Storage type for GiST's reloptions + */ +typedef struct GiSTOptions +{ + int32 vl_len_; /* varlena header (do not touch directly!) */ + int fillfactor; /* page fill factor in percent (0..100) */ + GistOptBufferingMode buffering_mode; /* buffering build mode */ +} GiSTOptions; + +/* gist.c */ +extern void gistbuildempty(Relation index); +extern bool gistinsert(Relation r, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + struct IndexInfo *indexInfo); +extern MemoryContext createTempGistContext(void); +extern GISTSTATE *initGISTstate(Relation index); +extern void freeGISTstate(GISTSTATE *giststate); +extern void gistdoinsert(Relation r, + IndexTuple itup, + Size freespace, + GISTSTATE *giststate, + Relation heapRel, + bool is_build); + +/* A List of these is returned from gistplacetopage() in *splitinfo */ +typedef struct +{ + Buffer buf; /* the split page "half" */ + IndexTuple downlink; /* downlink for this half. 
*/ +} GISTPageSplitInfo; + +extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, + Buffer buffer, + IndexTuple *itup, int ntup, + OffsetNumber oldoffnum, BlockNumber *newblkno, + Buffer leftchildbuf, + List **splitinfo, + bool markfollowright, + Relation heapRel, + bool is_build); + +extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup, + int len, GISTSTATE *giststate); + +/* gistxlog.c */ +extern XLogRecPtr gistXLogPageDelete(Buffer buffer, + FullTransactionId xid, Buffer parentBuffer, + OffsetNumber downlinkOffset); + +extern void gistXLogPageReuse(Relation rel, Relation heaprel, BlockNumber blkno, + FullTransactionId deleteXid); + +extern XLogRecPtr gistXLogUpdate(Buffer buffer, + OffsetNumber *todelete, int ntodelete, + IndexTuple *itup, int ituplen, + Buffer leftchildbuf); + +extern XLogRecPtr gistXLogDelete(Buffer buffer, OffsetNumber *todelete, + int ntodelete, TransactionId snapshotConflictHorizon, + Relation heaprel); + +extern XLogRecPtr gistXLogSplit(bool page_is_leaf, + SplitedPageLayout *dist, + BlockNumber origrlink, GistNSN orignsn, + Buffer leftchildbuf, bool markfollowright); + +extern XLogRecPtr gistXLogAssignLSN(void); + +/* gistget.c */ +extern bool gistgettuple(IndexScanDesc scan, ScanDirection dir); +extern int64 gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm); +extern bool gistcanreturn(Relation index, int attno); + +/* gistvalidate.c */ +extern bool gistvalidate(Oid opclassoid); +extern void gistadjustmembers(Oid opfamilyoid, + Oid opclassoid, + List *operators, + List *functions); + +/* gistutil.c */ + +#define GiSTPageSize \ + ( BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(GISTPageOpaqueData)) ) + +#define GIST_MIN_FILLFACTOR 10 +#define GIST_DEFAULT_FILLFACTOR 90 + +extern bytea *gistoptions(Datum reloptions, bool validate); +extern bool gistproperty(Oid index_oid, int attno, + IndexAMProperty prop, const char *propname, + bool *res, bool *isnull); +extern bool gistfitpage(IndexTuple *itvec, int len); +extern bool gistnospace(Page page, IndexTuple *itvec, int len, OffsetNumber todelete, Size freespace); +extern void gistcheckpage(Relation rel, Buffer buf); +extern Buffer gistNewBuffer(Relation r, Relation heaprel); +extern bool gistPageRecyclable(Page page); +extern void gistfillbuffer(Page page, IndexTuple *itup, int len, + OffsetNumber off); +extern IndexTuple *gistextractpage(Page page, int *len /* out */ ); +extern IndexTuple *gistjoinvector(IndexTuple *itvec, int *len, + IndexTuple *additvec, int addlen); +extern IndexTupleData *gistfillitupvec(IndexTuple *vec, int veclen, int *memlen); + +extern IndexTuple gistunion(Relation r, IndexTuple *itvec, + int len, GISTSTATE *giststate); +extern IndexTuple gistgetadjusted(Relation r, + IndexTuple oldtup, + IndexTuple addtup, + GISTSTATE *giststate); +extern IndexTuple gistFormTuple(GISTSTATE *giststate, + Relation r, Datum *attdata, bool *isnull, bool isleaf); +extern void gistCompressValues(GISTSTATE *giststate, Relation r, + Datum *attdata, bool *isnull, bool isleaf, Datum *compatt); + +extern OffsetNumber gistchoose(Relation r, Page p, + IndexTuple it, + GISTSTATE *giststate); + +extern void GISTInitBuffer(Buffer b, uint32 f); +extern void gistinitpage(Page page, uint32 f); +extern void gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e, + Datum k, Relation r, Page pg, OffsetNumber o, + bool l, bool isNull); + +extern float gistpenalty(GISTSTATE *giststate, int attno, + GISTENTRY *orig, bool isNullOrig, + GISTENTRY *add, bool isNullAdd); +extern 
void gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len, + Datum *attr, bool *isnull); +extern bool gistKeyIsEQ(GISTSTATE *giststate, int attno, Datum a, Datum b); +extern void gistDeCompressAtt(GISTSTATE *giststate, Relation r, IndexTuple tuple, Page p, + OffsetNumber o, GISTENTRY *attdata, bool *isnull); +extern HeapTuple gistFetchTuple(GISTSTATE *giststate, Relation r, + IndexTuple tuple); +extern void gistMakeUnionKey(GISTSTATE *giststate, int attno, + GISTENTRY *entry1, bool isnull1, + GISTENTRY *entry2, bool isnull2, + Datum *dst, bool *dstisnull); + +extern XLogRecPtr gistGetFakeLSN(Relation rel); + +/* gistvacuum.c */ +extern IndexBulkDeleteResult *gistbulkdelete(IndexVacuumInfo *info, + IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, + void *callback_state); +extern IndexBulkDeleteResult *gistvacuumcleanup(IndexVacuumInfo *info, + IndexBulkDeleteResult *stats); + +/* gistsplit.c */ +extern void gistSplitByKey(Relation r, Page page, IndexTuple *itup, + int len, GISTSTATE *giststate, + GistSplitVector *v, + int attno); + +/* gistbuild.c */ +extern IndexBuildResult *gistbuild(Relation heap, Relation index, + struct IndexInfo *indexInfo); + +/* gistbuildbuffers.c */ +extern GISTBuildBuffers *gistInitBuildBuffers(int pagesPerBuffer, int levelStep, + int maxLevel); +extern GISTNodeBuffer *gistGetNodeBuffer(GISTBuildBuffers *gfbb, + GISTSTATE *giststate, + BlockNumber nodeBlocknum, int level); +extern void gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb, + GISTNodeBuffer *nodeBuffer, IndexTuple itup); +extern bool gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb, + GISTNodeBuffer *nodeBuffer, IndexTuple *itup); +extern void gistFreeBuildBuffers(GISTBuildBuffers *gfbb); +extern void gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, + GISTSTATE *giststate, Relation r, + int level, Buffer buffer, + List *splitinfo); +extern void gistUnloadNodeBuffers(GISTBuildBuffers *gfbb); + +#endif /* GIST_PRIVATE_H */ diff --git a/src/include/access/gistscan.h b/src/include/access/gistscan.h new file mode 100644 index 0000000..6591124 --- /dev/null +++ b/src/include/access/gistscan.h @@ -0,0 +1,24 @@ +/*------------------------------------------------------------------------- + * + * gistscan.h + * routines defined in access/gist/gistscan.c + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/gistscan.h + * + *------------------------------------------------------------------------- + */ +#ifndef GISTSCAN_H +#define GISTSCAN_H + +#include "access/amapi.h" + +extern IndexScanDesc gistbeginscan(Relation r, int nkeys, int norderbys); +extern void gistrescan(IndexScanDesc scan, ScanKey key, int nkeys, + ScanKey orderbys, int norderbys); +extern void gistendscan(IndexScanDesc scan); + +#endif /* GISTSCAN_H */ diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h new file mode 100644 index 0000000..aff2ffb --- /dev/null +++ b/src/include/access/gistxlog.h @@ -0,0 +1,117 @@ +/*------------------------------------------------------------------------- + * + * gistxlog.h + * gist xlog routines + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/gistxlog.h + * + *------------------------------------------------------------------------- + */ +#ifndef GIST_XLOG_H +#define GIST_XLOG_H + +#include 
"access/gist.h" +#include "access/xlogreader.h" +#include "lib/stringinfo.h" + +#define XLOG_GIST_PAGE_UPDATE 0x00 +#define XLOG_GIST_DELETE 0x10 /* delete leaf index tuples for a + * page */ +#define XLOG_GIST_PAGE_REUSE 0x20 /* old page is about to be reused + * from FSM */ +#define XLOG_GIST_PAGE_SPLIT 0x30 + /* #define XLOG_GIST_INSERT_COMPLETE 0x40 */ /* not used anymore */ + /* #define XLOG_GIST_CREATE_INDEX 0x50 */ /* not used anymore */ +#define XLOG_GIST_PAGE_DELETE 0x60 +#define XLOG_GIST_ASSIGN_LSN 0x70 /* nop, assign new LSN */ + +/* + * Backup Blk 0: updated page. + * Backup Blk 1: If this operation completes a page split, by inserting a + * downlink for the split page, the left half of the split + */ +typedef struct gistxlogPageUpdate +{ + /* number of deleted offsets */ + uint16 ntodelete; + uint16 ntoinsert; + + /* + * In payload of blk 0 : 1. todelete OffsetNumbers 2. tuples to insert + */ +} gistxlogPageUpdate; + +/* + * Backup Blk 0: Leaf page, whose index tuples are deleted. + */ +typedef struct gistxlogDelete +{ + TransactionId snapshotConflictHorizon; + uint16 ntodelete; /* number of deleted offsets */ + bool isCatalogRel; /* to handle recovery conflict during logical + * decoding on standby */ + + /* TODELETE OFFSET NUMBERS */ + OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; +} gistxlogDelete; + +#define SizeOfGistxlogDelete offsetof(gistxlogDelete, offsets) + +/* + * Backup Blk 0: If this operation completes a page split, by inserting a + * downlink for the split page, the left half of the split + * Backup Blk 1 - npage: split pages (1 is the original page) + */ +typedef struct gistxlogPageSplit +{ + BlockNumber origrlink; /* rightlink of the page before split */ + GistNSN orignsn; /* NSN of the page before split */ + bool origleaf; /* was splitted page a leaf page? */ + + uint16 npage; /* # of pages in the split */ + bool markfollowright; /* set F_FOLLOW_RIGHT flags */ + + /* + * follow: 1. gistxlogPage and array of IndexTupleData per page + */ +} gistxlogPageSplit; + +/* + * Backup Blk 0: page that was deleted. + * Backup Blk 1: parent page, containing the downlink to the deleted page. + */ +typedef struct gistxlogPageDelete +{ + FullTransactionId deleteXid; /* last Xid which could see page in scan */ + OffsetNumber downlinkOffset; /* Offset of downlink referencing this + * page */ +} gistxlogPageDelete; + +#define SizeOfGistxlogPageDelete (offsetof(gistxlogPageDelete, downlinkOffset) + sizeof(OffsetNumber)) + + +/* + * This is what we need to know about page reuse, for hot standby. 
+ */ +typedef struct gistxlogPageReuse +{ + RelFileLocator locator; + BlockNumber block; + FullTransactionId snapshotConflictHorizon; + bool isCatalogRel; /* to handle recovery conflict during logical + * decoding on standby */ +} gistxlogPageReuse; + +#define SizeOfGistxlogPageReuse (offsetof(gistxlogPageReuse, isCatalogRel) + sizeof(bool)) + +extern void gist_redo(XLogReaderState *record); +extern void gist_desc(StringInfo buf, XLogReaderState *record); +extern const char *gist_identify(uint8 info); +extern void gist_xlog_startup(void); +extern void gist_xlog_cleanup(void); +extern void gist_mask(char *pagedata, BlockNumber blkno); + +#endif diff --git a/src/include/access/hash.h b/src/include/access/hash.h new file mode 100644 index 0000000..9e03527 --- /dev/null +++ b/src/include/access/hash.h @@ -0,0 +1,487 @@ +/*------------------------------------------------------------------------- + * + * hash.h + * header file for postgres hash access method implementation + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/hash.h + * + * NOTES + * modeled after Margo Seltzer's hash implementation for unix. + * + *------------------------------------------------------------------------- + */ +#ifndef HASH_H +#define HASH_H + +#include "access/amapi.h" +#include "access/itup.h" +#include "access/sdir.h" +#include "catalog/pg_am_d.h" +#include "common/hashfn.h" +#include "lib/stringinfo.h" +#include "storage/bufmgr.h" +#include "storage/lockdefs.h" +#include "utils/hsearch.h" +#include "utils/relcache.h" + +/* + * Mapping from hash bucket number to physical block number of bucket's + * starting page. Beware of multiple evaluations of argument! + */ +typedef uint32 Bucket; + +#define InvalidBucket ((Bucket) 0xFFFFFFFF) + +#define BUCKET_TO_BLKNO(metap,B) \ + ((BlockNumber) ((B) + ((B) ? (metap)->hashm_spares[_hash_spareindex((B)+1)-1] : 0)) + 1) + +/* + * Special space for hash index pages. + * + * hasho_flag's LH_PAGE_TYPE bits tell us which type of page we're looking at. + * Additional bits in the flag word are used for more transient purposes. + * + * To test a page's type, do (hasho_flag & LH_PAGE_TYPE) == LH_xxx_PAGE. + * However, we ensure that each used page type has a distinct bit so that + * we can OR together page types for uses such as the allowable-page-types + * argument of _hash_checkpage(). + */ +#define LH_UNUSED_PAGE (0) +#define LH_OVERFLOW_PAGE (1 << 0) +#define LH_BUCKET_PAGE (1 << 1) +#define LH_BITMAP_PAGE (1 << 2) +#define LH_META_PAGE (1 << 3) +#define LH_BUCKET_BEING_POPULATED (1 << 4) +#define LH_BUCKET_BEING_SPLIT (1 << 5) +#define LH_BUCKET_NEEDS_SPLIT_CLEANUP (1 << 6) +#define LH_PAGE_HAS_DEAD_TUPLES (1 << 7) + +#define LH_PAGE_TYPE \ + (LH_OVERFLOW_PAGE | LH_BUCKET_PAGE | LH_BITMAP_PAGE | LH_META_PAGE) + +/* + * In an overflow page, hasho_prevblkno stores the block number of the previous + * page in the bucket chain; in a bucket page, hasho_prevblkno stores the + * hashm_maxbucket value as of the last time the bucket was last split, or + * else as of the time the bucket was created. The latter convention is used + * to determine whether a cached copy of the metapage is too stale to be used + * without needing to lock or pin the metapage. + * + * hasho_nextblkno is always the block number of the next page in the + * bucket chain, or InvalidBlockNumber if there are no more such pages. 
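To make the page-type convention above concrete: each used page type is a distinct bit, an exact test masks hasho_flag with LH_PAGE_TYPE so transient flag bits do not interfere, and several acceptable types can be OR-ed together in the style of _hash_checkpage()'s allowable-page-types argument. The following standalone sketch copies the flag values from above; the example hasho_flag is made up.

#include <stdio.h>
#include <stdint.h>

#define LH_OVERFLOW_PAGE        (1 << 0)
#define LH_BUCKET_PAGE          (1 << 1)
#define LH_BITMAP_PAGE          (1 << 2)
#define LH_META_PAGE            (1 << 3)
#define LH_PAGE_HAS_DEAD_TUPLES (1 << 7)    /* a transient flag, not a type bit */

#define LH_PAGE_TYPE \
    (LH_OVERFLOW_PAGE | LH_BUCKET_PAGE | LH_BITMAP_PAGE | LH_META_PAGE)

int main(void)
{
    uint16_t hasho_flag = LH_BUCKET_PAGE | LH_PAGE_HAS_DEAD_TUPLES;

    /* exact type test: mask off transient bits first */
    if ((hasho_flag & LH_PAGE_TYPE) == LH_BUCKET_PAGE)
        printf("this is a bucket page\n");

    /* "one of several allowed types" test, with an OR-ed mask */
    uint16_t allowed = LH_BUCKET_PAGE | LH_OVERFLOW_PAGE;
    printf("page type allowed? %s\n", (hasho_flag & allowed) ? "yes" : "no");
    return 0;
}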
+ */ +typedef struct HashPageOpaqueData +{ + BlockNumber hasho_prevblkno; /* see above */ + BlockNumber hasho_nextblkno; /* see above */ + Bucket hasho_bucket; /* bucket number this pg belongs to */ + uint16 hasho_flag; /* page type code + flag bits, see above */ + uint16 hasho_page_id; /* for identification of hash indexes */ +} HashPageOpaqueData; + +typedef HashPageOpaqueData *HashPageOpaque; + +#define HashPageGetOpaque(page) ((HashPageOpaque) PageGetSpecialPointer(page)) + +#define H_NEEDS_SPLIT_CLEANUP(opaque) (((opaque)->hasho_flag & LH_BUCKET_NEEDS_SPLIT_CLEANUP) != 0) +#define H_BUCKET_BEING_SPLIT(opaque) (((opaque)->hasho_flag & LH_BUCKET_BEING_SPLIT) != 0) +#define H_BUCKET_BEING_POPULATED(opaque) (((opaque)->hasho_flag & LH_BUCKET_BEING_POPULATED) != 0) +#define H_HAS_DEAD_TUPLES(opaque) (((opaque)->hasho_flag & LH_PAGE_HAS_DEAD_TUPLES) != 0) + +/* + * The page ID is for the convenience of pg_filedump and similar utilities, + * which otherwise would have a hard time telling pages of different index + * types apart. It should be the last 2 bytes on the page. This is more or + * less "free" due to alignment considerations. + */ +#define HASHO_PAGE_ID 0xFF80 + +typedef struct HashScanPosItem /* what we remember about each match */ +{ + ItemPointerData heapTid; /* TID of referenced heap item */ + OffsetNumber indexOffset; /* index item's location within page */ +} HashScanPosItem; + +typedef struct HashScanPosData +{ + Buffer buf; /* if valid, the buffer is pinned */ + BlockNumber currPage; /* current hash index page */ + BlockNumber nextPage; /* next overflow page */ + BlockNumber prevPage; /* prev overflow or bucket page */ + + /* + * The items array is always ordered in index order (ie, increasing + * indexoffset). When scanning backwards it is convenient to fill the + * array back-to-front, so we start at the last slot and fill downwards. + * Hence we need both a first-valid-entry and a last-valid-entry counter. + * itemIndex is a cursor showing which entry was last returned to caller. + */ + int firstItem; /* first valid index in items[] */ + int lastItem; /* last valid index in items[] */ + int itemIndex; /* current index in items[] */ + + HashScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */ +} HashScanPosData; + +#define HashScanPosIsPinned(scanpos) \ +( \ + AssertMacro(BlockNumberIsValid((scanpos).currPage) || \ + !BufferIsValid((scanpos).buf)), \ + BufferIsValid((scanpos).buf) \ +) + +#define HashScanPosIsValid(scanpos) \ +( \ + AssertMacro(BlockNumberIsValid((scanpos).currPage) || \ + !BufferIsValid((scanpos).buf)), \ + BlockNumberIsValid((scanpos).currPage) \ +) + +#define HashScanPosInvalidate(scanpos) \ + do { \ + (scanpos).buf = InvalidBuffer; \ + (scanpos).currPage = InvalidBlockNumber; \ + (scanpos).nextPage = InvalidBlockNumber; \ + (scanpos).prevPage = InvalidBlockNumber; \ + (scanpos).firstItem = 0; \ + (scanpos).lastItem = 0; \ + (scanpos).itemIndex = 0; \ + } while (0) + +/* + * HashScanOpaqueData is private state for a hash index scan. + */ +typedef struct HashScanOpaqueData +{ + /* Hash value of the scan key, ie, the hash key we seek */ + uint32 hashso_sk_hash; + + /* remember the buffer associated with primary bucket */ + Buffer hashso_bucket_buf; + + /* + * remember the buffer associated with primary bucket page of bucket being + * split. it is required during the scan of the bucket which is being + * populated during split operation. 
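A standalone sketch of the items[] bookkeeping described above for HashScanPosData: the array stays in increasing index order, a backward scan fills it from the last slot downwards, and itemIndex then walks from lastItem back to firstItem. The capacity and offsets below are arbitrary stand-ins.

#include <stdio.h>

#define MAXITEMS 8      /* arbitrary stand-in for MaxIndexTuplesPerPage */

typedef struct DemoScanPos
{
    int items[MAXITEMS];    /* stand-in for HashScanPosItem entries */
    int firstItem;
    int lastItem;
    int itemIndex;
} DemoScanPos;

int main(void)
{
    int matches[] = {11, 22, 33};   /* matching offsets, in increasing order */
    int n = 3;
    DemoScanPos pos;

    /* backward scan: fill back-to-front so items[] stays in index order */
    pos.lastItem = MAXITEMS - 1;
    pos.firstItem = MAXITEMS - n;
    for (int i = 0; i < n; i++)
        pos.items[pos.lastItem - i] = matches[n - 1 - i];

    /* ... then walk itemIndex downwards to return tuples in reverse order;
     * a forward scan would fill slots 0..n-1 and walk upwards instead */
    for (pos.itemIndex = pos.lastItem; pos.itemIndex >= pos.firstItem; pos.itemIndex--)
        printf("return offset %d\n", pos.items[pos.itemIndex]);
    return 0;
}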
+ */ + Buffer hashso_split_bucket_buf; + + /* Whether scan starts on bucket being populated due to split */ + bool hashso_buc_populated; + + /* + * Whether scanning bucket being split? The value of this parameter is + * referred only when hashso_buc_populated is true. + */ + bool hashso_buc_split; + /* info about killed items if any (killedItems is NULL if never used) */ + int *killedItems; /* currPos.items indexes of killed items */ + int numKilled; /* number of currently stored items */ + + /* + * Identify all the matching items on a page and save them in + * HashScanPosData + */ + HashScanPosData currPos; /* current position data */ +} HashScanOpaqueData; + +typedef HashScanOpaqueData *HashScanOpaque; + +/* + * Definitions for metapage. + */ + +#define HASH_METAPAGE 0 /* metapage is always block 0 */ + +#define HASH_MAGIC 0x6440640 +#define HASH_VERSION 4 + +/* + * spares[] holds the number of overflow pages currently allocated at or + * before a certain splitpoint. For example, if spares[3] = 7 then there are + * 7 ovflpages before splitpoint 3 (compare BUCKET_TO_BLKNO macro). The + * value in spares[ovflpoint] increases as overflow pages are added at the + * end of the index. Once ovflpoint increases (ie, we have actually allocated + * the bucket pages belonging to that splitpoint) the number of spares at the + * prior splitpoint cannot change anymore. + * + * ovflpages that have been recycled for reuse can be found by looking at + * bitmaps that are stored within ovflpages dedicated for the purpose. + * The blknos of these bitmap pages are kept in mapp[]; nmaps is the + * number of currently existing bitmaps. + * + * The limitation on the size of spares[] comes from the fact that there's + * no point in having more than 2^32 buckets with only uint32 hashcodes. + * (Note: The value of HASH_MAX_SPLITPOINTS which is the size of spares[] is + * adjusted in such a way to accommodate multi phased allocation of buckets + * after HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE). + * + * There is no particular upper limit on the size of mapp[], other than + * needing to fit into the metapage. (With 8K block size, 1024 bitmaps + * limit us to 256 GB of overflow space...). For smaller block size we + * can not use 1024 bitmaps as it will lead to the meta page data crossing + * the block size boundary. So we use BLCKSZ to determine the maximum number + * of bitmaps. + */ +#define HASH_MAX_BITMAPS Min(BLCKSZ / 8, 1024) + +#define HASH_SPLITPOINT_PHASE_BITS 2 +#define HASH_SPLITPOINT_PHASES_PER_GRP (1 << HASH_SPLITPOINT_PHASE_BITS) +#define HASH_SPLITPOINT_PHASE_MASK (HASH_SPLITPOINT_PHASES_PER_GRP - 1) +#define HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE 10 + +/* defines max number of splitpoint phases a hash index can have */ +#define HASH_MAX_SPLITPOINT_GROUP 32 +#define HASH_MAX_SPLITPOINTS \ + (((HASH_MAX_SPLITPOINT_GROUP - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) * \ + HASH_SPLITPOINT_PHASES_PER_GRP) + \ + HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) + +typedef struct HashMetaPageData +{ + uint32 hashm_magic; /* magic no. 
for hash tables */ + uint32 hashm_version; /* version ID */ + double hashm_ntuples; /* number of tuples stored in the table */ + uint16 hashm_ffactor; /* target fill factor (tuples/bucket) */ + uint16 hashm_bsize; /* index page size (bytes) */ + uint16 hashm_bmsize; /* bitmap array size (bytes) - must be a power + * of 2 */ + uint16 hashm_bmshift; /* log2(bitmap array size in BITS) */ + uint32 hashm_maxbucket; /* ID of maximum bucket in use */ + uint32 hashm_highmask; /* mask to modulo into entire table */ + uint32 hashm_lowmask; /* mask to modulo into lower half of table */ + uint32 hashm_ovflpoint; /* splitpoint from which ovflpage being + * allocated */ + uint32 hashm_firstfree; /* lowest-number free ovflpage (bit#) */ + uint32 hashm_nmaps; /* number of bitmap pages */ + RegProcedure hashm_procid; /* hash function id from pg_proc */ + uint32 hashm_spares[HASH_MAX_SPLITPOINTS]; /* spare pages before each + * splitpoint */ + BlockNumber hashm_mapp[HASH_MAX_BITMAPS]; /* blknos of ovfl bitmaps */ +} HashMetaPageData; + +typedef HashMetaPageData *HashMetaPage; + +typedef struct HashOptions +{ + int32 varlena_header_; /* varlena header (do not touch directly!) */ + int fillfactor; /* page fill factor in percent (0..100) */ +} HashOptions; + +#define HashGetFillFactor(relation) \ + (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \ + relation->rd_rel->relam == HASH_AM_OID), \ + (relation)->rd_options ? \ + ((HashOptions *) (relation)->rd_options)->fillfactor : \ + HASH_DEFAULT_FILLFACTOR) +#define HashGetTargetPageUsage(relation) \ + (BLCKSZ * HashGetFillFactor(relation) / 100) + +/* + * Maximum size of a hash index item (it's okay to have only one per page) + */ +#define HashMaxItemSize(page) \ + MAXALIGN_DOWN(PageGetPageSize(page) - \ + SizeOfPageHeaderData - \ + sizeof(ItemIdData) - \ + MAXALIGN(sizeof(HashPageOpaqueData))) + +#define INDEX_MOVED_BY_SPLIT_MASK INDEX_AM_RESERVED_BIT + +#define HASH_MIN_FILLFACTOR 10 +#define HASH_DEFAULT_FILLFACTOR 75 + +/* + * Constants + */ +#define BYTE_TO_BIT 3 /* 2^3 bits/byte */ +#define ALL_SET ((uint32) ~0) + +/* + * Bitmap pages do not contain tuples. They do contain the standard + * page headers and trailers; however, everything in between is a + * giant bit array. The number of bits that fit on a page obviously + * depends on the page size and the header/trailer overhead. We require + * the number of bits per page to be a power of 2. + */ +#define BMPGSZ_BYTE(metap) ((metap)->hashm_bmsize) +#define BMPGSZ_BIT(metap) ((metap)->hashm_bmsize << BYTE_TO_BIT) +#define BMPG_SHIFT(metap) ((metap)->hashm_bmshift) +#define BMPG_MASK(metap) (BMPGSZ_BIT(metap) - 1) + +#define HashPageGetBitmap(page) \ + ((uint32 *) PageGetContents(page)) + +#define HashGetMaxBitmapSize(page) \ + (PageGetPageSize((Page) page) - \ + (MAXALIGN(SizeOfPageHeaderData) + MAXALIGN(sizeof(HashPageOpaqueData)))) + +#define HashPageGetMeta(page) \ + ((HashMetaPage) PageGetContents(page)) + +/* + * The number of bits in an ovflpage bitmap word. 
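Because hashm_bmsize must be a power of 2, the BMPG_SHIFT/BMPG_MASK macros above let a global overflow-bit number be split into (bitmap page, bit within page) with one shift and one mask. A rough standalone sketch with made-up metapage values (DemoMeta stands in for the relevant HashMetaPageData fields):

#include <stdio.h>
#include <stdint.h>

#define BYTE_TO_BIT 3

typedef struct DemoMeta
{
    uint16_t hashm_bmsize;      /* bitmap array size in bytes, a power of 2 */
    uint16_t hashm_bmshift;     /* log2(bitmap array size in bits) */
} DemoMeta;

#define BMPGSZ_BIT(m)  ((m)->hashm_bmsize << BYTE_TO_BIT)
#define BMPG_SHIFT(m)  ((m)->hashm_bmshift)
#define BMPG_MASK(m)   (BMPGSZ_BIT(m) - 1)

int main(void)
{
    DemoMeta meta = {.hashm_bmsize = 4096, .hashm_bmshift = 15};    /* 32768 bits per bitmap page */
    uint32_t ovflbitno = 40000;     /* some overflow page's global bit number */

    unsigned bitmappage = ovflbitno >> BMPG_SHIFT(&meta);
    unsigned bitinpage = ovflbitno & BMPG_MASK(&meta);

    printf("bit %u lives on bitmap page %u, at bit %u of %u\n",
           (unsigned) ovflbitno, bitmappage, bitinpage,
           (unsigned) BMPGSZ_BIT(&meta));
    return 0;
}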
+ */ +#define BITS_PER_MAP 32 /* Number of bits in uint32 */ + +/* Given the address of the beginning of a bit map, clear/set the nth bit */ +#define CLRBIT(A, N) ((A)[(N)/BITS_PER_MAP] &= ~(1<<((N)%BITS_PER_MAP))) +#define SETBIT(A, N) ((A)[(N)/BITS_PER_MAP] |= (1<<((N)%BITS_PER_MAP))) +#define ISSET(A, N) ((A)[(N)/BITS_PER_MAP] & (1<<((N)%BITS_PER_MAP))) + +/* + * page-level and high-level locking modes (see README) + */ +#define HASH_READ BUFFER_LOCK_SHARE +#define HASH_WRITE BUFFER_LOCK_EXCLUSIVE +#define HASH_NOLOCK (-1) + +/* + * When a new operator class is declared, we require that the user supply + * us with an amproc function for hashing a key of the new type, returning + * a 32-bit hash value. We call this the "standard" hash function. We + * also allow an optional "extended" hash function which accepts a salt and + * returns a 64-bit hash value. This is highly recommended but, for reasons + * of backward compatibility, optional. + * + * When the salt is 0, the low 32 bits of the value returned by the extended + * hash function should match the value that would have been returned by the + * standard hash function. + */ +#define HASHSTANDARD_PROC 1 +#define HASHEXTENDED_PROC 2 +#define HASHOPTIONS_PROC 3 +#define HASHNProcs 3 + + +/* public routines */ + +extern IndexBuildResult *hashbuild(Relation heap, Relation index, + struct IndexInfo *indexInfo); +extern void hashbuildempty(Relation index); +extern bool hashinsert(Relation rel, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + struct IndexInfo *indexInfo); +extern bool hashgettuple(IndexScanDesc scan, ScanDirection dir); +extern int64 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm); +extern IndexScanDesc hashbeginscan(Relation rel, int nkeys, int norderbys); +extern void hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, + ScanKey orderbys, int norderbys); +extern void hashendscan(IndexScanDesc scan); +extern IndexBulkDeleteResult *hashbulkdelete(IndexVacuumInfo *info, + IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, + void *callback_state); +extern IndexBulkDeleteResult *hashvacuumcleanup(IndexVacuumInfo *info, + IndexBulkDeleteResult *stats); +extern bytea *hashoptions(Datum reloptions, bool validate); +extern bool hashvalidate(Oid opclassoid); +extern void hashadjustmembers(Oid opfamilyoid, + Oid opclassoid, + List *operators, + List *functions); + +/* private routines */ + +/* hashinsert.c */ +extern void _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel, + bool sorted); +extern OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf, + Size itemsize, IndexTuple itup, + bool appendtup); +extern void _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups, + OffsetNumber *itup_offsets, uint16 nitups); + +/* hashovfl.c */ +extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin); +extern BlockNumber _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf, + Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets, + Size *tups_size, uint16 nitups, BufferAccessStrategy bstrategy); +extern void _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage); +extern void _hash_squeezebucket(Relation rel, + Bucket bucket, BlockNumber bucket_blkno, + Buffer bucket_buf, + BufferAccessStrategy bstrategy); +extern uint32 _hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno); + +/* hashpage.c */ +extern Buffer _hash_getbuf(Relation rel, 
BlockNumber blkno, + int access, int flags); +extern Buffer _hash_getbuf_with_condlock_cleanup(Relation rel, + BlockNumber blkno, int flags); +extern HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf, + bool force_refresh); +extern Buffer _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, + int access, + HashMetaPage *cachedmetap); +extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno); +extern void _hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket, + uint32 flag, bool initpage); +extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno, + ForkNumber forkNum); +extern Buffer _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno, + int access, int flags, + BufferAccessStrategy bstrategy); +extern void _hash_relbuf(Relation rel, Buffer buf); +extern void _hash_dropbuf(Relation rel, Buffer buf); +extern void _hash_dropscanbuf(Relation rel, HashScanOpaque so); +extern uint32 _hash_init(Relation rel, double num_tuples, + ForkNumber forkNum); +extern void _hash_init_metabuffer(Buffer buf, double num_tuples, + RegProcedure procid, uint16 ffactor, bool initpage); +extern void _hash_pageinit(Page page, Size size); +extern void _hash_expandtable(Relation rel, Buffer metabuf); +extern void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, + Bucket obucket, uint32 maxbucket, uint32 highmask, + uint32 lowmask); + +/* hashsearch.c */ +extern bool _hash_next(IndexScanDesc scan, ScanDirection dir); +extern bool _hash_first(IndexScanDesc scan, ScanDirection dir); + +/* hashsort.c */ +typedef struct HSpool HSpool; /* opaque struct in hashsort.c */ + +extern HSpool *_h_spoolinit(Relation heap, Relation index, uint32 num_buckets); +extern void _h_spooldestroy(HSpool *hspool); +extern void _h_spool(HSpool *hspool, ItemPointer self, + Datum *values, bool *isnull); +extern void _h_indexbuild(HSpool *hspool, Relation heapRel); + +/* hashutil.c */ +extern bool _hash_checkqual(IndexScanDesc scan, IndexTuple itup); +extern uint32 _hash_datum2hashkey(Relation rel, Datum key); +extern uint32 _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype); +extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket, + uint32 highmask, uint32 lowmask); +extern uint32 _hash_spareindex(uint32 num_bucket); +extern uint32 _hash_get_totalbuckets(uint32 splitpoint_phase); +extern void _hash_checkpage(Relation rel, Buffer buf, int flags); +extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup); +extern bool _hash_convert_tuple(Relation index, + Datum *user_values, bool *user_isnull, + Datum *index_values, bool *index_isnull); +extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value); +extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value); +extern BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket); +extern BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket); +extern Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket, + uint32 lowmask, uint32 maxbucket); +extern void _hash_kill_items(IndexScanDesc scan); + +/* hash.c */ +extern void hashbucketcleanup(Relation rel, Bucket cur_bucket, + Buffer bucket_buf, BlockNumber bucket_blkno, + BufferAccessStrategy bstrategy, + uint32 maxbucket, uint32 highmask, uint32 lowmask, + double *tuples_removed, double *num_index_tuples, + bool split_cleanup, + IndexBulkDeleteCallback callback, void *callback_state); + +#endif /* HASH_H */ diff --git a/src/include/access/hash_xlog.h b/src/include/access/hash_xlog.h 
new file mode 100644 index 0000000..b93619d --- /dev/null +++ b/src/include/access/hash_xlog.h @@ -0,0 +1,269 @@ +/*------------------------------------------------------------------------- + * + * hash_xlog.h + * header file for Postgres hash AM implementation + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/hash_xlog.h + * + *------------------------------------------------------------------------- + */ +#ifndef HASH_XLOG_H +#define HASH_XLOG_H + +#include "access/xlogreader.h" +#include "lib/stringinfo.h" +#include "storage/off.h" + +/* Number of buffers required for XLOG_HASH_SQUEEZE_PAGE operation */ +#define HASH_XLOG_FREE_OVFL_BUFS 6 + +/* + * XLOG records for hash operations + */ +#define XLOG_HASH_INIT_META_PAGE 0x00 /* initialize the meta page */ +#define XLOG_HASH_INIT_BITMAP_PAGE 0x10 /* initialize the bitmap page */ +#define XLOG_HASH_INSERT 0x20 /* add index tuple without split */ +#define XLOG_HASH_ADD_OVFL_PAGE 0x30 /* add overflow page */ +#define XLOG_HASH_SPLIT_ALLOCATE_PAGE 0x40 /* allocate new page for split */ +#define XLOG_HASH_SPLIT_PAGE 0x50 /* split page */ +#define XLOG_HASH_SPLIT_COMPLETE 0x60 /* completion of split operation */ +#define XLOG_HASH_MOVE_PAGE_CONTENTS 0x70 /* remove tuples from one page + * and add to another page */ +#define XLOG_HASH_SQUEEZE_PAGE 0x80 /* add tuples to one of the previous + * pages in chain and free the ovfl + * page */ +#define XLOG_HASH_DELETE 0x90 /* delete index tuples from a page */ +#define XLOG_HASH_SPLIT_CLEANUP 0xA0 /* clear split-cleanup flag in primary + * bucket page after deleting tuples + * that are moved due to split */ +#define XLOG_HASH_UPDATE_META_PAGE 0xB0 /* update meta page after vacuum */ + +#define XLOG_HASH_VACUUM_ONE_PAGE 0xC0 /* remove dead tuples from index + * page */ + +/* + * xl_hash_split_allocate_page flag values, 8 bits are available. + */ +#define XLH_SPLIT_META_UPDATE_MASKS (1<<0) +#define XLH_SPLIT_META_UPDATE_SPLITPOINT (1<<1) + +/* + * This is what we need to know about simple (without split) insert. + * + * This data record is used for XLOG_HASH_INSERT + * + * Backup Blk 0: original page (data contains the inserted tuple) + * Backup Blk 1: metapage (HashMetaPageData) + */ +typedef struct xl_hash_insert +{ + OffsetNumber offnum; +} xl_hash_insert; + +#define SizeOfHashInsert (offsetof(xl_hash_insert, offnum) + sizeof(OffsetNumber)) + +/* + * This is what we need to know about addition of overflow page. + * + * This data record is used for XLOG_HASH_ADD_OVFL_PAGE + * + * Backup Blk 0: newly allocated overflow page + * Backup Blk 1: page before new overflow page in the bucket chain + * Backup Blk 2: bitmap page + * Backup Blk 3: new bitmap page + * Backup Blk 4: metapage + */ +typedef struct xl_hash_add_ovfl_page +{ + uint16 bmsize; + bool bmpage_found; +} xl_hash_add_ovfl_page; + +#define SizeOfHashAddOvflPage \ + (offsetof(xl_hash_add_ovfl_page, bmpage_found) + sizeof(bool)) + +/* + * This is what we need to know about allocating a page for split. 
+ * + * This data record is used for XLOG_HASH_SPLIT_ALLOCATE_PAGE + * + * Backup Blk 0: page for old bucket + * Backup Blk 1: page for new bucket + * Backup Blk 2: metapage + */ +typedef struct xl_hash_split_allocate_page +{ + uint32 new_bucket; + uint16 old_bucket_flag; + uint16 new_bucket_flag; + uint8 flags; +} xl_hash_split_allocate_page; + +#define SizeOfHashSplitAllocPage \ + (offsetof(xl_hash_split_allocate_page, flags) + sizeof(uint8)) + +/* + * This is what we need to know about completing the split operation. + * + * This data record is used for XLOG_HASH_SPLIT_COMPLETE + * + * Backup Blk 0: page for old bucket + * Backup Blk 1: page for new bucket + */ +typedef struct xl_hash_split_complete +{ + uint16 old_bucket_flag; + uint16 new_bucket_flag; +} xl_hash_split_complete; + +#define SizeOfHashSplitComplete \ + (offsetof(xl_hash_split_complete, new_bucket_flag) + sizeof(uint16)) + +/* + * This is what we need to know about move page contents required during + * squeeze operation. + * + * This data record is used for XLOG_HASH_MOVE_PAGE_CONTENTS + * + * Backup Blk 0: bucket page + * Backup Blk 1: page containing moved tuples + * Backup Blk 2: page from which tuples will be removed + */ +typedef struct xl_hash_move_page_contents +{ + uint16 ntups; + bool is_prim_bucket_same_wrt; /* true if the page to which + * tuples are moved is same as + * primary bucket page */ +} xl_hash_move_page_contents; + +#define SizeOfHashMovePageContents \ + (offsetof(xl_hash_move_page_contents, is_prim_bucket_same_wrt) + sizeof(bool)) + +/* + * This is what we need to know about the squeeze page operation. + * + * This data record is used for XLOG_HASH_SQUEEZE_PAGE + * + * Backup Blk 0: page containing tuples moved from freed overflow page + * Backup Blk 1: freed overflow page + * Backup Blk 2: page previous to the freed overflow page + * Backup Blk 3: page next to the freed overflow page + * Backup Blk 4: bitmap page containing info of freed overflow page + * Backup Blk 5: meta page + */ +typedef struct xl_hash_squeeze_page +{ + BlockNumber prevblkno; + BlockNumber nextblkno; + uint16 ntups; + bool is_prim_bucket_same_wrt; /* true if the page to which + * tuples are moved is same as + * primary bucket page */ + bool is_prev_bucket_same_wrt; /* true if the page to which + * tuples are moved is the page + * previous to the freed overflow + * page */ +} xl_hash_squeeze_page; + +#define SizeOfHashSqueezePage \ + (offsetof(xl_hash_squeeze_page, is_prev_bucket_same_wrt) + sizeof(bool)) + +/* + * This is what we need to know about the deletion of index tuples from a page. + * + * This data record is used for XLOG_HASH_DELETE + * + * Backup Blk 0: primary bucket page + * Backup Blk 1: page from which tuples are deleted + */ +typedef struct xl_hash_delete +{ + bool clear_dead_marking; /* true if this operation clears + * LH_PAGE_HAS_DEAD_TUPLES flag */ + bool is_primary_bucket_page; /* true if the operation is for + * primary bucket page */ +} xl_hash_delete; + +#define SizeOfHashDelete (offsetof(xl_hash_delete, is_primary_bucket_page) + sizeof(bool)) + +/* + * This is what we need for metapage update operation. + * + * This data record is used for XLOG_HASH_UPDATE_META_PAGE + * + * Backup Blk 0: meta page + */ +typedef struct xl_hash_update_meta_page +{ + double ntuples; +} xl_hash_update_meta_page; + +#define SizeOfHashUpdateMetaPage \ + (offsetof(xl_hash_update_meta_page, ntuples) + sizeof(double)) + +/* + * This is what we need to initialize metapage. 
+ * + * This data record is used for XLOG_HASH_INIT_META_PAGE + * + * Backup Blk 0: meta page + */ +typedef struct xl_hash_init_meta_page +{ + double num_tuples; + RegProcedure procid; + uint16 ffactor; +} xl_hash_init_meta_page; + +#define SizeOfHashInitMetaPage \ + (offsetof(xl_hash_init_meta_page, ffactor) + sizeof(uint16)) + +/* + * This is what we need to initialize bitmap page. + * + * This data record is used for XLOG_HASH_INIT_BITMAP_PAGE + * + * Backup Blk 0: bitmap page + * Backup Blk 1: meta page + */ +typedef struct xl_hash_init_bitmap_page +{ + uint16 bmsize; +} xl_hash_init_bitmap_page; + +#define SizeOfHashInitBitmapPage \ + (offsetof(xl_hash_init_bitmap_page, bmsize) + sizeof(uint16)) + +/* + * This is what we need for index tuple deletion and to + * update the meta page. + * + * This data record is used for XLOG_HASH_VACUUM_ONE_PAGE + * + * Backup Blk 0: bucket page + * Backup Blk 1: meta page + */ +typedef struct xl_hash_vacuum_one_page +{ + TransactionId snapshotConflictHorizon; + uint16 ntuples; + bool isCatalogRel; /* to handle recovery conflict during logical + * decoding on standby */ + + /* TARGET OFFSET NUMBERS */ + OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; +} xl_hash_vacuum_one_page; + +#define SizeOfHashVacuumOnePage offsetof(xl_hash_vacuum_one_page, offsets) + +extern void hash_redo(XLogReaderState *record); +extern void hash_desc(StringInfo buf, XLogReaderState *record); +extern const char *hash_identify(uint8 info); +extern void hash_mask(char *pagedata, BlockNumber blkno); + +#endif /* HASH_XLOG_H */ diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h new file mode 100644 index 0000000..faf5026 --- /dev/null +++ b/src/include/access/heapam.h @@ -0,0 +1,332 @@ +/*------------------------------------------------------------------------- + * + * heapam.h + * POSTGRES heap access method definitions. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/heapam.h + * + *------------------------------------------------------------------------- + */ +#ifndef HEAPAM_H +#define HEAPAM_H + +#include "access/relation.h" /* for backward compatibility */ +#include "access/relscan.h" +#include "access/sdir.h" +#include "access/skey.h" +#include "access/table.h" /* for backward compatibility */ +#include "access/tableam.h" +#include "nodes/lockoptions.h" +#include "nodes/primnodes.h" +#include "storage/bufpage.h" +#include "storage/dsm.h" +#include "storage/lockdefs.h" +#include "storage/shm_toc.h" +#include "utils/relcache.h" +#include "utils/snapshot.h" + + +/* "options" flag bits for heap_insert */ +#define HEAP_INSERT_SKIP_FSM TABLE_INSERT_SKIP_FSM +#define HEAP_INSERT_FROZEN TABLE_INSERT_FROZEN +#define HEAP_INSERT_NO_LOGICAL TABLE_INSERT_NO_LOGICAL +#define HEAP_INSERT_SPECULATIVE 0x0010 + +typedef struct BulkInsertStateData *BulkInsertState; +struct TupleTableSlot; +struct VacuumCutoffs; + +#define MaxLockTupleMode LockTupleExclusive + +/* + * Descriptor for heap table scans. 
+ */ +typedef struct HeapScanDescData +{ + TableScanDescData rs_base; /* AM independent part of the descriptor */ + + /* state set up at initscan time */ + BlockNumber rs_nblocks; /* total number of blocks in rel */ + BlockNumber rs_startblock; /* block # to start at */ + BlockNumber rs_numblocks; /* max number of blocks to scan */ + /* rs_numblocks is usually InvalidBlockNumber, meaning "scan whole rel" */ + + /* scan current state */ + bool rs_inited; /* false = scan not init'd yet */ + OffsetNumber rs_coffset; /* current offset # in non-page-at-a-time mode */ + BlockNumber rs_cblock; /* current block # in scan, if any */ + Buffer rs_cbuf; /* current buffer in scan, if any */ + /* NB: if rs_cbuf is not InvalidBuffer, we hold a pin on that buffer */ + + BufferAccessStrategy rs_strategy; /* access strategy for reads */ + + HeapTupleData rs_ctup; /* current tuple in scan, if any */ + + /* + * For parallel scans to store page allocation data. NULL when not + * performing a parallel scan. + */ + ParallelBlockTableScanWorkerData *rs_parallelworkerdata; + + /* these fields only used in page-at-a-time mode and for bitmap scans */ + int rs_cindex; /* current tuple's index in vistuples */ + int rs_ntuples; /* number of visible tuples on page */ + OffsetNumber rs_vistuples[MaxHeapTuplesPerPage]; /* their offsets */ +} HeapScanDescData; +typedef struct HeapScanDescData *HeapScanDesc; + +/* + * Descriptor for fetches from heap via an index. + */ +typedef struct IndexFetchHeapData +{ + IndexFetchTableData xs_base; /* AM independent part of the descriptor */ + + Buffer xs_cbuf; /* current heap buffer in scan, if any */ + /* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */ +} IndexFetchHeapData; + +/* Result codes for HeapTupleSatisfiesVacuum */ +typedef enum +{ + HEAPTUPLE_DEAD, /* tuple is dead and deletable */ + HEAPTUPLE_LIVE, /* tuple is live (committed, no deleter) */ + HEAPTUPLE_RECENTLY_DEAD, /* tuple is dead, but not deletable yet */ + HEAPTUPLE_INSERT_IN_PROGRESS, /* inserting xact is still in progress */ + HEAPTUPLE_DELETE_IN_PROGRESS /* deleting xact is still in progress */ +} HTSV_Result; + +/* + * heap_prepare_freeze_tuple may request that heap_freeze_execute_prepared + * check any tuple's to-be-frozen xmin and/or xmax status using pg_xact + */ +#define HEAP_FREEZE_CHECK_XMIN_COMMITTED 0x01 +#define HEAP_FREEZE_CHECK_XMAX_ABORTED 0x02 + +/* heap_prepare_freeze_tuple state describing how to freeze a tuple */ +typedef struct HeapTupleFreeze +{ + /* Fields describing how to process tuple */ + TransactionId xmax; + uint16 t_infomask2; + uint16 t_infomask; + uint8 frzflags; + + /* xmin/xmax check flags */ + uint8 checkflags; + /* Page offset number for tuple */ + OffsetNumber offset; +} HeapTupleFreeze; + +/* + * State used by VACUUM to track the details of freezing all eligible tuples + * on a given heap page. + * + * VACUUM prepares freeze plans for each page via heap_prepare_freeze_tuple + * calls (every tuple with storage gets its own call). This page-level freeze + * state is updated across each call, which ultimately determines whether or + * not freezing the page is required. + * + * Aside from the basic question of whether or not freezing will go ahead, the + * state also tracks the oldest extant XID/MXID in the table as a whole, for + * the purposes of advancing relfrozenxid/relminmxid values in pg_class later + * on. Each heap_prepare_freeze_tuple call pushes NewRelfrozenXid and/or + * NewRelminMxid back as required to avoid unsafe final pg_class values. 
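For the page-at-a-time fields of HeapScanDescData above (rs_cindex, rs_ntuples, rs_vistuples), here is a simplified sketch of consuming such a visibility window in the forward direction; DemoScan and its contents are invented, and the array bound is only illustrative, not the real MaxHeapTuplesPerPage computation.

#include <stdio.h>
#include <stdint.h>

typedef uint16_t OffsetNumber;      /* stand-in */
#define DEMO_MAX_TUPLES 16          /* illustrative bound, not MaxHeapTuplesPerPage */

typedef struct DemoScan
{
    int          rs_cindex;                     /* current index in rs_vistuples */
    int          rs_ntuples;                    /* number of visible tuples on page */
    OffsetNumber rs_vistuples[DEMO_MAX_TUPLES]; /* their offsets, in order */
} DemoScan;

int main(void)
{
    DemoScan scan = {.rs_ntuples = 3, .rs_vistuples = {1, 4, 9}};

    for (scan.rs_cindex = 0; scan.rs_cindex < scan.rs_ntuples; scan.rs_cindex++)
        printf("visible tuple at offset %u\n",
               (unsigned) scan.rs_vistuples[scan.rs_cindex]);
    return 0;
}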
Any + * and all unfrozen XIDs or MXIDs that remain after VACUUM finishes _must_ + * have values >= the final relfrozenxid/relminmxid values in pg_class. This + * includes XIDs that remain as MultiXact members from any tuple's xmax. + * + * When 'freeze_required' flag isn't set after all tuples are examined, the + * final choice on freezing is made by vacuumlazy.c. It can decide to trigger + * freezing based on whatever criteria it deems appropriate. However, it is + * recommended that vacuumlazy.c avoid early freezing when freezing does not + * enable setting the target page all-frozen in the visibility map afterwards. + */ +typedef struct HeapPageFreeze +{ + /* Is heap_prepare_freeze_tuple caller required to freeze page? */ + bool freeze_required; + + /* + * "Freeze" NewRelfrozenXid/NewRelminMxid trackers. + * + * Trackers used when heap_freeze_execute_prepared freezes, or when there + * are zero freeze plans for a page. It is always valid for vacuumlazy.c + * to freeze any page, by definition. This even includes pages that have + * no tuples with storage to consider in the first place. That way the + * 'totally_frozen' results from heap_prepare_freeze_tuple can always be + * used in the same way, even when no freeze plans need to be executed to + * "freeze the page". Only the "freeze" path needs to consider the need + * to set pages all-frozen in the visibility map under this scheme. + * + * When we freeze a page, we generally freeze all XIDs < OldestXmin, only + * leaving behind XIDs that are ineligible for freezing, if any. And so + * you might wonder why these trackers are necessary at all; why should + * _any_ page that VACUUM freezes _ever_ be left with XIDs/MXIDs that + * ratchet back the top-level NewRelfrozenXid/NewRelminMxid trackers? + * + * It is useful to use a definition of "freeze the page" that does not + * overspecify how MultiXacts are affected. heap_prepare_freeze_tuple + * generally prefers to remove Multis eagerly, but lazy processing is used + * in cases where laziness allows VACUUM to avoid allocating a new Multi. + * The "freeze the page" trackers enable this flexibility. + */ + TransactionId FreezePageRelfrozenXid; + MultiXactId FreezePageRelminMxid; + + /* + * "No freeze" NewRelfrozenXid/NewRelminMxid trackers. + * + * These trackers are maintained in the same way as the trackers used when + * VACUUM scans a page that isn't cleanup locked. Both code paths are + * based on the same general idea (do less work for this page during the + * ongoing VACUUM, at the cost of having to accept older final values). + */ + TransactionId NoFreezePageRelfrozenXid; + MultiXactId NoFreezePageRelminMxid; + +} HeapPageFreeze; + +/* ---------------- + * function prototypes for heap access method + * + * heap_create, heap_create_with_catalog, and heap_drop_with_catalog + * are declared in catalog/heap.h + * ---------------- + */ + + +/* + * HeapScanIsValid + * True iff the heap scan is valid. 
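The tracker "ratchet" described above can be sketched on its own: every XID that will remain unfrozen pushes the candidate NewRelfrozenXid back, so the final relfrozenxid can never exceed anything left behind. The sketch uses a plain integer comparison in place of PostgreSQL's wraparound-aware TransactionId ordering, so it is conceptual only; all values are invented.

#include <stdio.h>
#include <stdint.h>

typedef uint32_t TransactionId;     /* stand-in */

static void
track_unfrozen_xid(TransactionId xid, TransactionId *NewRelfrozenXid)
{
    /* simplified: plain "<" instead of TransactionIdPrecedes() */
    if (xid < *NewRelfrozenXid)
        *NewRelfrozenXid = xid;
}

int main(void)
{
    TransactionId NewRelfrozenXid = 5000;           /* initial candidate, e.g. a freeze cutoff */
    TransactionId leftover[] = {5200, 4800, 5100};  /* XIDs that will stay unfrozen */

    for (int i = 0; i < 3; i++)
        track_unfrozen_xid(leftover[i], &NewRelfrozenXid);

    /* prints 4800: no unfrozen XID is older than the value recorded */
    printf("NewRelfrozenXid = %u\n", (unsigned) NewRelfrozenXid);
    return 0;
}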
+ */ +#define HeapScanIsValid(scan) PointerIsValid(scan) + +extern TableScanDesc heap_beginscan(Relation relation, Snapshot snapshot, + int nkeys, ScanKey key, + ParallelTableScanDesc parallel_scan, + uint32 flags); +extern void heap_setscanlimits(TableScanDesc sscan, BlockNumber startBlk, + BlockNumber numBlks); +extern void heapgetpage(TableScanDesc sscan, BlockNumber block); +extern void heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params, + bool allow_strat, bool allow_sync, bool allow_pagemode); +extern void heap_endscan(TableScanDesc sscan); +extern HeapTuple heap_getnext(TableScanDesc sscan, ScanDirection direction); +extern bool heap_getnextslot(TableScanDesc sscan, + ScanDirection direction, struct TupleTableSlot *slot); +extern void heap_set_tidrange(TableScanDesc sscan, ItemPointer mintid, + ItemPointer maxtid); +extern bool heap_getnextslot_tidrange(TableScanDesc sscan, + ScanDirection direction, + TupleTableSlot *slot); +extern bool heap_fetch(Relation relation, Snapshot snapshot, + HeapTuple tuple, Buffer *userbuf, bool keep_buf); +extern bool heap_hot_search_buffer(ItemPointer tid, Relation relation, + Buffer buffer, Snapshot snapshot, HeapTuple heapTuple, + bool *all_dead, bool first_call); + +extern void heap_get_latest_tid(TableScanDesc sscan, ItemPointer tid); + +extern BulkInsertState GetBulkInsertState(void); +extern void FreeBulkInsertState(BulkInsertState); +extern void ReleaseBulkInsertStatePin(BulkInsertState bistate); + +extern void heap_insert(Relation relation, HeapTuple tup, CommandId cid, + int options, BulkInsertState bistate); +extern void heap_multi_insert(Relation relation, struct TupleTableSlot **slots, + int ntuples, CommandId cid, int options, + BulkInsertState bistate); +extern TM_Result heap_delete(Relation relation, ItemPointer tid, + CommandId cid, Snapshot crosscheck, bool wait, + struct TM_FailureData *tmfd, bool changingPart); +extern void heap_finish_speculative(Relation relation, ItemPointer tid); +extern void heap_abort_speculative(Relation relation, ItemPointer tid); +extern TM_Result heap_update(Relation relation, ItemPointer otid, + HeapTuple newtup, + CommandId cid, Snapshot crosscheck, bool wait, + struct TM_FailureData *tmfd, LockTupleMode *lockmode, + TU_UpdateIndexes *update_indexes); +extern TM_Result heap_lock_tuple(Relation relation, HeapTuple tuple, + CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, + bool follow_updates, + Buffer *buffer, struct TM_FailureData *tmfd); + +extern void heap_inplace_update(Relation relation, HeapTuple tuple); +extern bool heap_prepare_freeze_tuple(HeapTupleHeader tuple, + const struct VacuumCutoffs *cutoffs, + HeapPageFreeze *pagefrz, + HeapTupleFreeze *frz, bool *totally_frozen); +extern void heap_freeze_execute_prepared(Relation rel, Buffer buffer, + TransactionId snapshotConflictHorizon, + HeapTupleFreeze *tuples, int ntuples); +extern bool heap_freeze_tuple(HeapTupleHeader tuple, + TransactionId relfrozenxid, TransactionId relminmxid, + TransactionId FreezeLimit, TransactionId MultiXactCutoff); +extern bool heap_tuple_should_freeze(HeapTupleHeader tuple, + const struct VacuumCutoffs *cutoffs, + TransactionId *NoFreezePageRelfrozenXid, + MultiXactId *NoFreezePageRelminMxid); +extern bool heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple); + +extern void simple_heap_insert(Relation relation, HeapTuple tup); +extern void simple_heap_delete(Relation relation, ItemPointer tid); +extern void simple_heap_update(Relation relation, ItemPointer otid, + HeapTuple tup, 
TU_UpdateIndexes *update_indexes); + +extern TransactionId heap_index_delete_tuples(Relation rel, + TM_IndexDeleteOp *delstate); + +/* in heap/pruneheap.c */ +struct GlobalVisState; +extern void heap_page_prune_opt(Relation relation, Buffer buffer); +extern int heap_page_prune(Relation relation, Buffer buffer, + struct GlobalVisState *vistest, + TransactionId old_snap_xmin, + TimestampTz old_snap_ts, + int *nnewlpdead, + OffsetNumber *off_loc); +extern void heap_page_prune_execute(Buffer buffer, + OffsetNumber *redirected, int nredirected, + OffsetNumber *nowdead, int ndead, + OffsetNumber *nowunused, int nunused); +extern void heap_get_root_tuples(Page page, OffsetNumber *root_offsets); + +/* in heap/vacuumlazy.c */ +struct VacuumParams; +extern void heap_vacuum_rel(Relation rel, + struct VacuumParams *params, BufferAccessStrategy bstrategy); + +/* in heap/heapam_visibility.c */ +extern bool HeapTupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, + Buffer buffer); +extern TM_Result HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, + Buffer buffer); +extern HTSV_Result HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, + Buffer buffer); +extern HTSV_Result HeapTupleSatisfiesVacuumHorizon(HeapTuple htup, Buffer buffer, + TransactionId *dead_after); +extern void HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer, + uint16 infomask, TransactionId xid); +extern bool HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple); +extern bool HeapTupleIsSurelyDead(HeapTuple htup, + struct GlobalVisState *vistest); + +/* + * To avoid leaking too much knowledge about reorderbuffer implementation + * details this is implemented in reorderbuffer.c not heapam_visibility.c + */ +struct HTAB; +extern bool ResolveCminCmaxDuringDecoding(struct HTAB *tuplecid_data, + Snapshot snapshot, + HeapTuple htup, + Buffer buffer, + CommandId *cmin, CommandId *cmax); +extern void HeapCheckForSerializableConflictOut(bool visible, Relation relation, HeapTuple tuple, + Buffer buffer, Snapshot snapshot); + +#endif /* HEAPAM_H */ diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h new file mode 100644 index 0000000..a038450 --- /dev/null +++ b/src/include/access/heapam_xlog.h @@ -0,0 +1,421 @@ +/*------------------------------------------------------------------------- + * + * heapam_xlog.h + * POSTGRES heap access XLOG definitions. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/heapam_xlog.h + * + *------------------------------------------------------------------------- + */ +#ifndef HEAPAM_XLOG_H +#define HEAPAM_XLOG_H + +#include "access/htup.h" +#include "access/xlogreader.h" +#include "lib/stringinfo.h" +#include "storage/buf.h" +#include "storage/bufpage.h" +#include "storage/relfilelocator.h" +#include "utils/relcache.h" + + +/* + * WAL record definitions for heapam.c's WAL operations + * + * XLOG allows to store some information in high 4 bits of log + * record xl_info field. We use 3 for opcode and one for init bit. 
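Given that convention (three high bits for the opcode, one for the init flag), a record's info byte is decoded by masking, as in the standalone sketch below; it copies a few of the mask values defined just after this comment and invents an example info byte.

#include <stdio.h>
#include <stdint.h>

#define XLOG_HEAP_UPDATE    0x20
#define XLOG_HEAP_OPMASK    0x70
#define XLOG_HEAP_INIT_PAGE 0x80

int main(void)
{
    uint8_t xl_info = XLOG_HEAP_UPDATE | XLOG_HEAP_INIT_PAGE;      /* invented example */

    uint8_t opcode = xl_info & XLOG_HEAP_OPMASK;                   /* three opcode bits */
    int     init_page = (xl_info & XLOG_HEAP_INIT_PAGE) != 0;      /* the init bit */

    printf("opcode = 0x%02x, init_page = %d\n", opcode, init_page);
    return 0;
}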
+ */ +#define XLOG_HEAP_INSERT 0x00 +#define XLOG_HEAP_DELETE 0x10 +#define XLOG_HEAP_UPDATE 0x20 +#define XLOG_HEAP_TRUNCATE 0x30 +#define XLOG_HEAP_HOT_UPDATE 0x40 +#define XLOG_HEAP_CONFIRM 0x50 +#define XLOG_HEAP_LOCK 0x60 +#define XLOG_HEAP_INPLACE 0x70 + +#define XLOG_HEAP_OPMASK 0x70 +/* + * When we insert 1st item on new page in INSERT, UPDATE, HOT_UPDATE, + * or MULTI_INSERT, we can (and we do) restore entire page in redo + */ +#define XLOG_HEAP_INIT_PAGE 0x80 +/* + * We ran out of opcodes, so heapam.c now has a second RmgrId. These opcodes + * are associated with RM_HEAP2_ID, but are not logically different from + * the ones above associated with RM_HEAP_ID. XLOG_HEAP_OPMASK applies to + * these, too. + */ +#define XLOG_HEAP2_REWRITE 0x00 +#define XLOG_HEAP2_PRUNE 0x10 +#define XLOG_HEAP2_VACUUM 0x20 +#define XLOG_HEAP2_FREEZE_PAGE 0x30 +#define XLOG_HEAP2_VISIBLE 0x40 +#define XLOG_HEAP2_MULTI_INSERT 0x50 +#define XLOG_HEAP2_LOCK_UPDATED 0x60 +#define XLOG_HEAP2_NEW_CID 0x70 + +/* + * xl_heap_insert/xl_heap_multi_insert flag values, 8 bits are available. + */ +/* PD_ALL_VISIBLE was cleared */ +#define XLH_INSERT_ALL_VISIBLE_CLEARED (1<<0) +#define XLH_INSERT_LAST_IN_MULTI (1<<1) +#define XLH_INSERT_IS_SPECULATIVE (1<<2) +#define XLH_INSERT_CONTAINS_NEW_TUPLE (1<<3) +#define XLH_INSERT_ON_TOAST_RELATION (1<<4) + +/* all_frozen_set always implies all_visible_set */ +#define XLH_INSERT_ALL_FROZEN_SET (1<<5) + +/* + * xl_heap_update flag values, 8 bits are available. + */ +/* PD_ALL_VISIBLE was cleared */ +#define XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED (1<<0) +/* PD_ALL_VISIBLE was cleared in the 2nd page */ +#define XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED (1<<1) +#define XLH_UPDATE_CONTAINS_OLD_TUPLE (1<<2) +#define XLH_UPDATE_CONTAINS_OLD_KEY (1<<3) +#define XLH_UPDATE_CONTAINS_NEW_TUPLE (1<<4) +#define XLH_UPDATE_PREFIX_FROM_OLD (1<<5) +#define XLH_UPDATE_SUFFIX_FROM_OLD (1<<6) + +/* convenience macro for checking whether any form of old tuple was logged */ +#define XLH_UPDATE_CONTAINS_OLD \ + (XLH_UPDATE_CONTAINS_OLD_TUPLE | XLH_UPDATE_CONTAINS_OLD_KEY) + +/* + * xl_heap_delete flag values, 8 bits are available. + */ +/* PD_ALL_VISIBLE was cleared */ +#define XLH_DELETE_ALL_VISIBLE_CLEARED (1<<0) +#define XLH_DELETE_CONTAINS_OLD_TUPLE (1<<1) +#define XLH_DELETE_CONTAINS_OLD_KEY (1<<2) +#define XLH_DELETE_IS_SUPER (1<<3) +#define XLH_DELETE_IS_PARTITION_MOVE (1<<4) + +/* convenience macro for checking whether any form of old tuple was logged */ +#define XLH_DELETE_CONTAINS_OLD \ + (XLH_DELETE_CONTAINS_OLD_TUPLE | XLH_DELETE_CONTAINS_OLD_KEY) + +/* This is what we need to know about delete */ +typedef struct xl_heap_delete +{ + TransactionId xmax; /* xmax of the deleted tuple */ + OffsetNumber offnum; /* deleted tuple's offset */ + uint8 infobits_set; /* infomask bits */ + uint8 flags; +} xl_heap_delete; + +#define SizeOfHeapDelete (offsetof(xl_heap_delete, flags) + sizeof(uint8)) + +/* + * xl_heap_truncate flag values, 8 bits are available. + */ +#define XLH_TRUNCATE_CASCADE (1<<0) +#define XLH_TRUNCATE_RESTART_SEQS (1<<1) + +/* + * For truncate we list all truncated relids in an array, followed by all + * sequence relids that need to be restarted, if any. + * All rels are always within the same database, so we just list dbid once. 
+ */ +typedef struct xl_heap_truncate +{ + Oid dbId; + uint32 nrelids; + uint8 flags; + Oid relids[FLEXIBLE_ARRAY_MEMBER]; +} xl_heap_truncate; + +#define SizeOfHeapTruncate (offsetof(xl_heap_truncate, relids)) + +/* + * We don't store the whole fixed part (HeapTupleHeaderData) of an inserted + * or updated tuple in WAL; we can save a few bytes by reconstructing the + * fields that are available elsewhere in the WAL record, or perhaps just + * plain needn't be reconstructed. These are the fields we must store. + */ +typedef struct xl_heap_header +{ + uint16 t_infomask2; + uint16 t_infomask; + uint8 t_hoff; +} xl_heap_header; + +#define SizeOfHeapHeader (offsetof(xl_heap_header, t_hoff) + sizeof(uint8)) + +/* This is what we need to know about insert */ +typedef struct xl_heap_insert +{ + OffsetNumber offnum; /* inserted tuple's offset */ + uint8 flags; + + /* xl_heap_header & TUPLE DATA in backup block 0 */ +} xl_heap_insert; + +#define SizeOfHeapInsert (offsetof(xl_heap_insert, flags) + sizeof(uint8)) + +/* + * This is what we need to know about a multi-insert. + * + * The main data of the record consists of this xl_heap_multi_insert header. + * 'offsets' array is omitted if the whole page is reinitialized + * (XLOG_HEAP_INIT_PAGE). + * + * In block 0's data portion, there is an xl_multi_insert_tuple struct, + * followed by the tuple data for each tuple. There is padding to align + * each xl_multi_insert_tuple struct. + */ +typedef struct xl_heap_multi_insert +{ + uint8 flags; + uint16 ntuples; + OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; +} xl_heap_multi_insert; + +#define SizeOfHeapMultiInsert offsetof(xl_heap_multi_insert, offsets) + +typedef struct xl_multi_insert_tuple +{ + uint16 datalen; /* size of tuple data that follows */ + uint16 t_infomask2; + uint16 t_infomask; + uint8 t_hoff; + /* TUPLE DATA FOLLOWS AT END OF STRUCT */ +} xl_multi_insert_tuple; + +#define SizeOfMultiInsertTuple (offsetof(xl_multi_insert_tuple, t_hoff) + sizeof(uint8)) + +/* + * This is what we need to know about update|hot_update + * + * Backup blk 0: new page + * + * If XLH_UPDATE_PREFIX_FROM_OLD or XLH_UPDATE_SUFFIX_FROM_OLD flags are set, + * the prefix and/or suffix come first, as one or two uint16s. + * + * After that, xl_heap_header and new tuple data follow. The new tuple + * data doesn't include the prefix and suffix, which are copied from the + * old tuple on replay. + * + * If XLH_UPDATE_CONTAINS_NEW_TUPLE flag is given, the tuple data is + * included even if a full-page image was taken. + * + * Backup blk 1: old page, if different. (no data, just a reference to the blk) + */ +typedef struct xl_heap_update +{ + TransactionId old_xmax; /* xmax of the old tuple */ + OffsetNumber old_offnum; /* old tuple's offset */ + uint8 old_infobits_set; /* infomask bits to set on old tuple */ + uint8 flags; + TransactionId new_xmax; /* xmax of the new tuple */ + OffsetNumber new_offnum; /* new tuple's offset */ + + /* + * If XLH_UPDATE_CONTAINS_OLD_TUPLE or XLH_UPDATE_CONTAINS_OLD_KEY flags + * are set, xl_heap_header and tuple data for the old tuple follow. 
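The prefix/suffix arrangement described for xl_heap_update is essentially a small delta encoding: replay copies the shared prefix and suffix from the old tuple and takes only the changed middle from WAL. A standalone byte-string sketch of that reconstruction (lengths and contents invented; real tuples are of course not NUL-terminated strings):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

int main(void)
{
    const char old_tuple[] = "AAAA-old-middle-ZZZZ";
    const char logged_mid[] = "NEWMID";     /* the only tuple data carried in the record */
    uint16_t prefixlen = 5;                 /* "AAAA-" shared with the old tuple */
    uint16_t suffixlen = 5;                 /* "-ZZZZ" shared with the old tuple */

    size_t oldlen = strlen(old_tuple);
    size_t midlen = strlen(logged_mid);
    char   newtuple[64];

    memcpy(newtuple, old_tuple, prefixlen);                         /* prefix from old */
    memcpy(newtuple + prefixlen, logged_mid, midlen);               /* middle from WAL */
    memcpy(newtuple + prefixlen + midlen,
           old_tuple + oldlen - suffixlen, suffixlen);              /* suffix from old */
    newtuple[prefixlen + midlen + suffixlen] = '\0';

    printf("%s\n", newtuple);   /* AAAA-NEWMID-ZZZZ */
    return 0;
}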
+ */ +} xl_heap_update; + +#define SizeOfHeapUpdate (offsetof(xl_heap_update, new_offnum) + sizeof(OffsetNumber)) + +/* + * This is what we need to know about page pruning (both during VACUUM and + * during opportunistic pruning) + * + * The array of OffsetNumbers following the fixed part of the record contains: + * * for each redirected item: the item offset, then the offset redirected to + * * for each now-dead item: the item offset + * * for each now-unused item: the item offset + * The total number of OffsetNumbers is therefore 2*nredirected+ndead+nunused. + * Note that nunused is not explicitly stored, but may be found by reference + * to the total record length. + * + * Acquires a full cleanup lock. + */ +typedef struct xl_heap_prune +{ + TransactionId snapshotConflictHorizon; + uint16 nredirected; + uint16 ndead; + bool isCatalogRel; /* to handle recovery conflict during logical + * decoding on standby */ + /* OFFSET NUMBERS are in the block reference 0 */ +} xl_heap_prune; + +#define SizeOfHeapPrune (offsetof(xl_heap_prune, isCatalogRel) + sizeof(bool)) + +/* + * The vacuum page record is similar to the prune record, but can only mark + * already LP_DEAD items LP_UNUSED (during VACUUM's second heap pass) + * + * Acquires an ordinary exclusive lock only. + */ +typedef struct xl_heap_vacuum +{ + uint16 nunused; + /* OFFSET NUMBERS are in the block reference 0 */ +} xl_heap_vacuum; + +#define SizeOfHeapVacuum (offsetof(xl_heap_vacuum, nunused) + sizeof(uint16)) + +/* flags for infobits_set */ +#define XLHL_XMAX_IS_MULTI 0x01 +#define XLHL_XMAX_LOCK_ONLY 0x02 +#define XLHL_XMAX_EXCL_LOCK 0x04 +#define XLHL_XMAX_KEYSHR_LOCK 0x08 +#define XLHL_KEYS_UPDATED 0x10 + +/* flag bits for xl_heap_lock / xl_heap_lock_updated's flag field */ +#define XLH_LOCK_ALL_FROZEN_CLEARED 0x01 + +/* This is what we need to know about lock */ +typedef struct xl_heap_lock +{ + TransactionId xmax; /* might be a MultiXactId */ + OffsetNumber offnum; /* locked tuple's offset on page */ + uint8 infobits_set; /* infomask and infomask2 bits to set */ + uint8 flags; /* XLH_LOCK_* flag bits */ +} xl_heap_lock; + +#define SizeOfHeapLock (offsetof(xl_heap_lock, flags) + sizeof(uint8)) + +/* This is what we need to know about locking an updated version of a row */ +typedef struct xl_heap_lock_updated +{ + TransactionId xmax; + OffsetNumber offnum; + uint8 infobits_set; + uint8 flags; +} xl_heap_lock_updated; + +#define SizeOfHeapLockUpdated (offsetof(xl_heap_lock_updated, flags) + sizeof(uint8)) + +/* This is what we need to know about confirmation of speculative insertion */ +typedef struct xl_heap_confirm +{ + OffsetNumber offnum; /* confirmed tuple's offset on page */ +} xl_heap_confirm; + +#define SizeOfHeapConfirm (offsetof(xl_heap_confirm, offnum) + sizeof(OffsetNumber)) + +/* This is what we need to know about in-place update */ +typedef struct xl_heap_inplace +{ + OffsetNumber offnum; /* updated tuple's offset on page */ + /* TUPLE DATA FOLLOWS AT END OF STRUCT */ +} xl_heap_inplace; + +#define SizeOfHeapInplace (offsetof(xl_heap_inplace, offnum) + sizeof(OffsetNumber)) + +/* + * This struct represents a 'freeze plan', which describes how to freeze a + * group of one or more heap tuples (appears in xl_heap_freeze_page record) + */ +/* 0x01 was XLH_FREEZE_XMIN */ +#define XLH_FREEZE_XVAC 0x02 +#define XLH_INVALID_XVAC 0x04 + +typedef struct xl_heap_freeze_plan +{ + TransactionId xmax; + uint16 t_infomask2; + uint16 t_infomask; + uint8 frzflags; + + /* Length of individual page offset numbers array for this plan */ 
+ uint16 ntuples; +} xl_heap_freeze_plan; + +/* + * This is what we need to know about a block being frozen during vacuum + * + * Backup block 0's data contains an array of xl_heap_freeze_plan structs + * (with nplans elements), followed by one or more page offset number arrays. + * Each such page offset number array corresponds to a single freeze plan + * (REDO routine freezes corresponding heap tuples using freeze plan). + */ +typedef struct xl_heap_freeze_page +{ + TransactionId snapshotConflictHorizon; + uint16 nplans; + bool isCatalogRel; /* to handle recovery conflict during logical + * decoding on standby */ + + /* + * In payload of blk 0 : FREEZE PLANS and OFFSET NUMBER ARRAY + */ +} xl_heap_freeze_page; + +#define SizeOfHeapFreezePage (offsetof(xl_heap_freeze_page, isCatalogRel) + sizeof(bool)) + +/* + * This is what we need to know about setting a visibility map bit + * + * Backup blk 0: visibility map buffer + * Backup blk 1: heap buffer + */ +typedef struct xl_heap_visible +{ + TransactionId snapshotConflictHorizon; + uint8 flags; +} xl_heap_visible; + +#define SizeOfHeapVisible (offsetof(xl_heap_visible, flags) + sizeof(uint8)) + +typedef struct xl_heap_new_cid +{ + /* + * store toplevel xid so we don't have to merge cids from different + * transactions + */ + TransactionId top_xid; + CommandId cmin; + CommandId cmax; + CommandId combocid; /* just for debugging */ + + /* + * Store the relfilelocator/ctid pair to facilitate lookups. + */ + RelFileLocator target_locator; + ItemPointerData target_tid; +} xl_heap_new_cid; + +#define SizeOfHeapNewCid (offsetof(xl_heap_new_cid, target_tid) + sizeof(ItemPointerData)) + +/* logical rewrite xlog record header */ +typedef struct xl_heap_rewrite_mapping +{ + TransactionId mapped_xid; /* xid that might need to see the row */ + Oid mapped_db; /* DbOid or InvalidOid for shared rels */ + Oid mapped_rel; /* Oid of the mapped relation */ + off_t offset; /* How far have we written so far */ + uint32 num_mappings; /* Number of in-memory mappings */ + XLogRecPtr start_lsn; /* Insert LSN at begin of rewrite */ +} xl_heap_rewrite_mapping; + +extern void HeapTupleHeaderAdvanceConflictHorizon(HeapTupleHeader tuple, + TransactionId *snapshotConflictHorizon); + +extern void heap_redo(XLogReaderState *record); +extern void heap_desc(StringInfo buf, XLogReaderState *record); +extern const char *heap_identify(uint8 info); +extern void heap_mask(char *pagedata, BlockNumber blkno); +extern void heap2_redo(XLogReaderState *record); +extern void heap2_desc(StringInfo buf, XLogReaderState *record); +extern const char *heap2_identify(uint8 info); +extern void heap_xlog_logical_rewrite(XLogReaderState *r); + +extern XLogRecPtr log_heap_visible(Relation rel, Buffer heap_buffer, + Buffer vm_buffer, + TransactionId snapshotConflictHorizon, + uint8 vmflags); + +#endif /* HEAPAM_XLOG_H */ diff --git a/src/include/access/heaptoast.h b/src/include/access/heaptoast.h new file mode 100644 index 0000000..5c0a796 --- /dev/null +++ b/src/include/access/heaptoast.h @@ -0,0 +1,149 @@ +/*------------------------------------------------------------------------- + * + * heaptoast.h + * Heap-specific definitions for external and compressed storage + * of variable size attributes. 
+ * + * Copyright (c) 2000-2023, PostgreSQL Global Development Group + * + * src/include/access/heaptoast.h + * + *------------------------------------------------------------------------- + */ +#ifndef HEAPTOAST_H +#define HEAPTOAST_H + +#include "access/htup_details.h" +#include "storage/lockdefs.h" +#include "utils/relcache.h" + +/* + * Find the maximum size of a tuple if there are to be N tuples per page. + */ +#define MaximumBytesPerTuple(tuplesPerPage) \ + MAXALIGN_DOWN((BLCKSZ - \ + MAXALIGN(SizeOfPageHeaderData + (tuplesPerPage) * sizeof(ItemIdData))) \ + / (tuplesPerPage)) + +/* + * These symbols control toaster activation. If a tuple is larger than + * TOAST_TUPLE_THRESHOLD, we will try to toast it down to no more than + * TOAST_TUPLE_TARGET bytes through compressing compressible fields and + * moving EXTENDED and EXTERNAL data out-of-line. + * + * The numbers need not be the same, though they currently are. It doesn't + * make sense for TARGET to exceed THRESHOLD, but it could be useful to make + * it be smaller. + * + * Currently we choose both values to match the largest tuple size for which + * TOAST_TUPLES_PER_PAGE tuples can fit on a heap page. + * + * XXX while these can be modified without initdb, some thought needs to be + * given to needs_toast_table() in toasting.c before unleashing random + * changes. Also see LOBLKSIZE in large_object.h, which can *not* be + * changed without initdb. + */ +#define TOAST_TUPLES_PER_PAGE 4 + +#define TOAST_TUPLE_THRESHOLD MaximumBytesPerTuple(TOAST_TUPLES_PER_PAGE) + +#define TOAST_TUPLE_TARGET TOAST_TUPLE_THRESHOLD + +/* + * The code will also consider moving MAIN data out-of-line, but only as a + * last resort if the previous steps haven't reached the target tuple size. + * In this phase we use a different target size, currently equal to the + * largest tuple that will fit on a heap page. This is reasonable since + * the user has told us to keep the data in-line if at all possible. + */ +#define TOAST_TUPLES_PER_PAGE_MAIN 1 + +#define TOAST_TUPLE_TARGET_MAIN MaximumBytesPerTuple(TOAST_TUPLES_PER_PAGE_MAIN) + +/* + * If an index value is larger than TOAST_INDEX_TARGET, we will try to + * compress it (we can't move it out-of-line, however). Note that this + * number is per-datum, not per-tuple, for simplicity in index_form_tuple(). + */ +#define TOAST_INDEX_TARGET (MaxHeapTupleSize / 16) + +/* + * When we store an oversize datum externally, we divide it into chunks + * containing at most TOAST_MAX_CHUNK_SIZE data bytes. This number *must* + * be small enough that the completed toast-table tuple (including the + * ID and sequence fields and all overhead) will fit on a page. + * The coding here sets the size on the theory that we want to fit + * EXTERN_TUPLES_PER_PAGE tuples of maximum size onto a page. + * + * NB: Changing TOAST_MAX_CHUNK_SIZE requires an initdb. + */ +#define EXTERN_TUPLES_PER_PAGE 4 /* tweak only this */ + +#define EXTERN_TUPLE_MAX_SIZE MaximumBytesPerTuple(EXTERN_TUPLES_PER_PAGE) + +#define TOAST_MAX_CHUNK_SIZE \ + (EXTERN_TUPLE_MAX_SIZE - \ + MAXALIGN(SizeofHeapTupleHeader) - \ + sizeof(Oid) - \ + sizeof(int32) - \ + VARHDRSZ) + +/* ---------- + * heap_toast_insert_or_update - + * + * Called by heap_insert() and heap_update(). + * ---------- + */ +extern HeapTuple heap_toast_insert_or_update(Relation rel, HeapTuple newtup, + HeapTuple oldtup, int options); + +/* ---------- + * heap_toast_delete - + * + * Called by heap_delete(). 
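To make the threshold arithmetic above concrete, here is a standalone sketch that evaluates the same formulas under assumed default build parameters (8 kB blocks, 8-byte MAXALIGN, 24-byte page header, 4-byte line pointers, 23-byte heap tuple header, 4-byte varlena header); all of these are build-dependent, and the macro names below are local stand-ins rather than the real ones:

#include <stdio.h>

/* assumed defaults; the real values come from pg_config.h and related headers */
#define BLCKSZ                 8192
#define MAXIMUM_ALIGNOF        8
#define PAGE_HEADER_SIZE       24   /* SizeOfPageHeaderData on typical builds */
#define ITEMID_SIZE            4    /* sizeof(ItemIdData) */
#define HEAP_TUPLE_HEADER_SIZE 23   /* SizeofHeapTupleHeader */
#define VARHDRSZ               4

#define MAXALIGN(LEN)      (((LEN) + MAXIMUM_ALIGNOF - 1) & ~(MAXIMUM_ALIGNOF - 1))
#define MAXALIGN_DOWN(LEN) ((LEN) & ~(MAXIMUM_ALIGNOF - 1))

/* mirrors MaximumBytesPerTuple() above */
#define MAX_BYTES_PER_TUPLE(tuplesPerPage) \
    MAXALIGN_DOWN((BLCKSZ - \
                   MAXALIGN(PAGE_HEADER_SIZE + (tuplesPerPage) * ITEMID_SIZE)) \
                  / (tuplesPerPage))

int
main(void)
{
    int threshold = MAX_BYTES_PER_TUPLE(4);     /* TOAST_TUPLE_THRESHOLD */
    int chunk = MAX_BYTES_PER_TUPLE(4)          /* EXTERN_TUPLE_MAX_SIZE */
        - MAXALIGN(HEAP_TUPLE_HEADER_SIZE)
        - 4                                     /* Oid chunk_id */
        - 4                                     /* int32 chunk_seq */
        - VARHDRSZ;

    printf("TOAST_TUPLE_THRESHOLD = %d\n", threshold);  /* 2032 with these inputs */
    printf("TOAST_MAX_CHUNK_SIZE  = %d\n", chunk);      /* 1996 with these inputs */
    return 0;
}

With 8 kB pages this is where the familiar ~2 kB toasting threshold and the 1996-byte toast chunk size come from.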
+ * ---------- + */ +extern void heap_toast_delete(Relation rel, HeapTuple oldtup, + bool is_speculative); + +/* ---------- + * toast_flatten_tuple - + * + * "Flatten" a tuple to contain no out-of-line toasted fields. + * (This does not eliminate compressed or short-header datums.) + * ---------- + */ +extern HeapTuple toast_flatten_tuple(HeapTuple tup, TupleDesc tupleDesc); + +/* ---------- + * toast_flatten_tuple_to_datum - + * + * "Flatten" a tuple containing out-of-line toasted fields into a Datum. + * ---------- + */ +extern Datum toast_flatten_tuple_to_datum(HeapTupleHeader tup, + uint32 tup_len, + TupleDesc tupleDesc); + +/* ---------- + * toast_build_flattened_tuple - + * + * Build a tuple containing no out-of-line toasted fields. + * (This does not eliminate compressed or short-header datums.) + * ---------- + */ +extern HeapTuple toast_build_flattened_tuple(TupleDesc tupleDesc, + Datum *values, + bool *isnull); + +/* ---------- + * heap_fetch_toast_slice + * + * Fetch a slice from a toast value stored in a heap table. + * ---------- + */ +extern void heap_fetch_toast_slice(Relation toastrel, Oid valueid, + int32 attrsize, int32 sliceoffset, + int32 slicelength, struct varlena *result); + +#endif /* HEAPTOAST_H */ diff --git a/src/include/access/hio.h b/src/include/access/hio.h new file mode 100644 index 0000000..9bc563b --- /dev/null +++ b/src/include/access/hio.h @@ -0,0 +1,62 @@ +/*------------------------------------------------------------------------- + * + * hio.h + * POSTGRES heap access method input/output definitions. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/hio.h + * + *------------------------------------------------------------------------- + */ +#ifndef HIO_H +#define HIO_H + +#include "access/htup.h" +#include "storage/buf.h" +#include "utils/relcache.h" + +/* + * state for bulk inserts --- private to heapam.c and hio.c + * + * If current_buf isn't InvalidBuffer, then we are holding an extra pin + * on that buffer. + * + * "typedef struct BulkInsertStateData *BulkInsertState" is in heapam.h + */ +typedef struct BulkInsertStateData +{ + BufferAccessStrategy strategy; /* our BULKWRITE strategy object */ + Buffer current_buf; /* current insertion target page */ + + /* + * State for bulk extensions. + * + * last_free..next_free are further pages that were unused at the time of + * the last extension. They might be in use by the time we use them + * though, so rechecks are needed. + * + * XXX: Eventually these should probably live in RelationData instead, + * alongside targetblock. + * + * already_extended_by is the number of pages that this bulk inserted + * extended by. If we already extended by a significant number of pages, + * we can be more aggressive about extending going forward. 
+ */ + BlockNumber next_free; + BlockNumber last_free; + uint32 already_extended_by; +} BulkInsertStateData; + + +extern void RelationPutHeapTuple(Relation relation, Buffer buffer, + HeapTuple tuple, bool token); +extern Buffer RelationGetBufferForTuple(Relation relation, Size len, + Buffer otherBuffer, int options, + BulkInsertStateData *bistate, + Buffer *vmbuffer, Buffer *vmbuffer_other, + int num_pages); + +#endif /* HIO_H */ diff --git a/src/include/access/htup.h b/src/include/access/htup.h new file mode 100644 index 0000000..a8f7ff5 --- /dev/null +++ b/src/include/access/htup.h @@ -0,0 +1,89 @@ +/*------------------------------------------------------------------------- + * + * htup.h + * POSTGRES heap tuple definitions. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/htup.h + * + *------------------------------------------------------------------------- + */ +#ifndef HTUP_H +#define HTUP_H + +#include "storage/itemptr.h" + +/* typedefs and forward declarations for structs defined in htup_details.h */ + +typedef struct HeapTupleHeaderData HeapTupleHeaderData; + +typedef HeapTupleHeaderData *HeapTupleHeader; + +typedef struct MinimalTupleData MinimalTupleData; + +typedef MinimalTupleData *MinimalTuple; + + +/* + * HeapTupleData is an in-memory data structure that points to a tuple. + * + * There are several ways in which this data structure is used: + * + * * Pointer to a tuple in a disk buffer: t_data points directly into the + * buffer (which the code had better be holding a pin on, but this is not + * reflected in HeapTupleData itself). + * + * * Pointer to nothing: t_data is NULL. This is used as a failure indication + * in some functions. + * + * * Part of a palloc'd tuple: the HeapTupleData itself and the tuple + * form a single palloc'd chunk. t_data points to the memory location + * immediately following the HeapTupleData struct (at offset HEAPTUPLESIZE). + * This is the output format of heap_form_tuple and related routines. + * + * * Separately allocated tuple: t_data points to a palloc'd chunk that + * is not adjacent to the HeapTupleData. (This case is deprecated since + * it's difficult to tell apart from case #1. It should be used only in + * limited contexts where the code knows that case #1 will never apply.) + * + * * Separately allocated minimal tuple: t_data points MINIMAL_TUPLE_OFFSET + * bytes before the start of a MinimalTuple. As with the previous case, + * this can't be told apart from case #1 by inspection; code setting up + * or destroying this representation has to know what it's doing. + * + * t_len should always be valid, except in the pointer-to-nothing case. + * t_self and t_tableOid should be valid if the HeapTupleData points to + * a disk buffer, or if it represents a copy of a tuple on disk. They + * should be explicitly set invalid in manufactured tuples. + */ +typedef struct HeapTupleData +{ + uint32 t_len; /* length of *t_data */ + ItemPointerData t_self; /* SelfItemPointer */ + Oid t_tableOid; /* table the tuple came from */ +#define FIELDNO_HEAPTUPLEDATA_DATA 3 + HeapTupleHeader t_data; /* -> tuple header and data */ +} HeapTupleData; + +typedef HeapTupleData *HeapTuple; + +#define HEAPTUPLESIZE MAXALIGN(sizeof(HeapTupleData)) + +/* + * Accessor macros to be used with HeapTuple pointers. 
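The "single palloc'd chunk" case described above is just pointer arithmetic: the tuple body is placed immediately after the HeapTupleData struct, and t_data points at it. A simplified standalone sketch of that layout (DemoTuple is a stand-in for illustration, not the real struct, and plain malloc replaces palloc):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAXALIGN(LEN) (((LEN) + 7) & ~((size_t) 7)) /* assuming 8-byte MAXIMUM_ALIGNOF */

typedef struct DemoTuple
{
    size_t  t_len;      /* length of *t_data */
    void   *t_data;     /* points just past this struct */
} DemoTuple;

#define DEMOTUPLESIZE MAXALIGN(sizeof(DemoTuple))

int
main(void)
{
    const char *payload = "tuple body goes here";
    size_t      len = strlen(payload) + 1;

    /* header and data allocated as one chunk, as heap_form_tuple's output format does */
    DemoTuple  *tup = malloc(DEMOTUPLESIZE + len);

    tup->t_len = len;
    tup->t_data = (char *) tup + DEMOTUPLESIZE;
    memcpy(tup->t_data, payload, len);

    printf("data starts %zu bytes after the tuple struct: \"%s\"\n",
           (size_t) ((char *) tup->t_data - (char *) tup), (char *) tup->t_data);
    free(tup);
    return 0;
}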
+ */ +#define HeapTupleIsValid(tuple) PointerIsValid(tuple) + +/* HeapTupleHeader functions implemented in utils/time/combocid.c */ +extern CommandId HeapTupleHeaderGetCmin(HeapTupleHeader tup); +extern CommandId HeapTupleHeaderGetCmax(HeapTupleHeader tup); +extern void HeapTupleHeaderAdjustCmax(HeapTupleHeader tup, + CommandId *cmax, bool *iscombo); + +/* Prototype for HeapTupleHeader accessors in heapam.c */ +extern TransactionId HeapTupleGetUpdateXid(HeapTupleHeader tuple); + +#endif /* HTUP_H */ diff --git a/src/include/access/htup_details.h b/src/include/access/htup_details.h new file mode 100644 index 0000000..e01f4f3 --- /dev/null +++ b/src/include/access/htup_details.h @@ -0,0 +1,811 @@ +/*------------------------------------------------------------------------- + * + * htup_details.h + * POSTGRES heap tuple header definitions. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/htup_details.h + * + *------------------------------------------------------------------------- + */ +#ifndef HTUP_DETAILS_H +#define HTUP_DETAILS_H + +#include "access/htup.h" +#include "access/transam.h" +#include "access/tupdesc.h" +#include "access/tupmacs.h" +#include "storage/bufpage.h" +#include "varatt.h" + +/* + * MaxTupleAttributeNumber limits the number of (user) columns in a tuple. + * The key limit on this value is that the size of the fixed overhead for + * a tuple, plus the size of the null-values bitmap (at 1 bit per column), + * plus MAXALIGN alignment, must fit into t_hoff which is uint8. On most + * machines the upper limit without making t_hoff wider would be a little + * over 1700. We use round numbers here and for MaxHeapAttributeNumber + * so that alterations in HeapTupleHeaderData layout won't change the + * supported max number of columns. + */ +#define MaxTupleAttributeNumber 1664 /* 8 * 208 */ + +/* + * MaxHeapAttributeNumber limits the number of (user) columns in a table. + * This should be somewhat less than MaxTupleAttributeNumber. It must be + * at least one less, else we will fail to do UPDATEs on a maximal-width + * table (because UPDATE has to form working tuples that include CTID). + * In practice we want some additional daylight so that we can gracefully + * support operations that add hidden "resjunk" columns, for example + * SELECT * FROM wide_table ORDER BY foo, bar, baz. + * In any case, depending on column data types you will likely be running + * into the disk-block-based limit on overall tuple size if you have more + * than a thousand or so columns. TOAST won't help. + */ +#define MaxHeapAttributeNumber 1600 /* 8 * 200 */ + +/* + * Heap tuple header. To avoid wasting space, the fields should be + * laid out in such a way as to avoid structure padding. + * + * Datums of composite types (row types) share the same general structure + * as on-disk tuples, so that the same routines can be used to build and + * examine them. However the requirements are slightly different: a Datum + * does not need any transaction visibility information, and it does need + * a length word and some embedded type information. We can achieve this + * by overlaying the xmin/cmin/xmax/cmax/xvac fields of a heap tuple + * with the fields needed in the Datum case. 
Typically, all tuples built + * in-memory will be initialized with the Datum fields; but when a tuple is + * about to be inserted in a table, the transaction fields will be filled, + * overwriting the datum fields. + * + * The overall structure of a heap tuple looks like: + * fixed fields (HeapTupleHeaderData struct) + * nulls bitmap (if HEAP_HASNULL is set in t_infomask) + * alignment padding (as needed to make user data MAXALIGN'd) + * object ID (if HEAP_HASOID_OLD is set in t_infomask, not created + * anymore) + * user data fields + * + * We store five "virtual" fields Xmin, Cmin, Xmax, Cmax, and Xvac in three + * physical fields. Xmin and Xmax are always really stored, but Cmin, Cmax + * and Xvac share a field. This works because we know that Cmin and Cmax + * are only interesting for the lifetime of the inserting and deleting + * transaction respectively. If a tuple is inserted and deleted in the same + * transaction, we store a "combo" command id that can be mapped to the real + * cmin and cmax, but only by use of local state within the originating + * backend. See combocid.c for more details. Meanwhile, Xvac is only set by + * old-style VACUUM FULL, which does not have any command sub-structure and so + * does not need either Cmin or Cmax. (This requires that old-style VACUUM + * FULL never try to move a tuple whose Cmin or Cmax is still interesting, + * ie, an insert-in-progress or delete-in-progress tuple.) + * + * A word about t_ctid: whenever a new tuple is stored on disk, its t_ctid + * is initialized with its own TID (location). If the tuple is ever updated, + * its t_ctid is changed to point to the replacement version of the tuple. Or + * if the tuple is moved from one partition to another, due to an update of + * the partition key, t_ctid is set to a special value to indicate that + * (see ItemPointerSetMovedPartitions). Thus, a tuple is the latest version + * of its row iff XMAX is invalid or + * t_ctid points to itself (in which case, if XMAX is valid, the tuple is + * either locked or deleted). One can follow the chain of t_ctid links + * to find the newest version of the row, unless it was moved to a different + * partition. Beware however that VACUUM might + * erase the pointed-to (newer) tuple before erasing the pointing (older) + * tuple. Hence, when following a t_ctid link, it is necessary to check + * to see if the referenced slot is empty or contains an unrelated tuple. + * Check that the referenced tuple has XMIN equal to the referencing tuple's + * XMAX to verify that it is actually the descendant version and not an + * unrelated tuple stored into a slot recently freed by VACUUM. If either + * check fails, one may assume that there is no live descendant version. + * + * t_ctid is sometimes used to store a speculative insertion token, instead + * of a real TID. A speculative token is set on a tuple that's being + * inserted, until the inserter is sure that it wants to go ahead with the + * insertion. Hence a token should only be seen on a tuple with an XMAX + * that's still in-progress, or invalid/aborted. The token is replaced with + * the tuple's real TID when the insertion is confirmed. One should never + * see a speculative insertion token while following a chain of t_ctid links, + * because they are not used on updates, only insertions. + * + * Following the fixed header fields, the nulls bitmap is stored (beginning + * at t_bits). The bitmap is *not* stored if t_infomask shows that there + * are no nulls in the tuple. 
If an OID field is present (as indicated by + * t_infomask), then it is stored just before the user data, which begins at + * the offset shown by t_hoff. Note that t_hoff must be a multiple of + * MAXALIGN. + */ + +typedef struct HeapTupleFields +{ + TransactionId t_xmin; /* inserting xact ID */ + TransactionId t_xmax; /* deleting or locking xact ID */ + + union + { + CommandId t_cid; /* inserting or deleting command ID, or both */ + TransactionId t_xvac; /* old-style VACUUM FULL xact ID */ + } t_field3; +} HeapTupleFields; + +typedef struct DatumTupleFields +{ + int32 datum_len_; /* varlena header (do not touch directly!) */ + + int32 datum_typmod; /* -1, or identifier of a record type */ + + Oid datum_typeid; /* composite type OID, or RECORDOID */ + + /* + * datum_typeid cannot be a domain over composite, only plain composite, + * even if the datum is meant as a value of a domain-over-composite type. + * This is in line with the general principle that CoerceToDomain does not + * change the physical representation of the base type value. + * + * Note: field ordering is chosen with thought that Oid might someday + * widen to 64 bits. + */ +} DatumTupleFields; + +struct HeapTupleHeaderData +{ + union + { + HeapTupleFields t_heap; + DatumTupleFields t_datum; + } t_choice; + + ItemPointerData t_ctid; /* current TID of this or newer tuple (or a + * speculative insertion token) */ + + /* Fields below here must match MinimalTupleData! */ + +#define FIELDNO_HEAPTUPLEHEADERDATA_INFOMASK2 2 + uint16 t_infomask2; /* number of attributes + various flags */ + +#define FIELDNO_HEAPTUPLEHEADERDATA_INFOMASK 3 + uint16 t_infomask; /* various flag bits, see below */ + +#define FIELDNO_HEAPTUPLEHEADERDATA_HOFF 4 + uint8 t_hoff; /* sizeof header incl. bitmap, padding */ + + /* ^ - 23 bytes - ^ */ + +#define FIELDNO_HEAPTUPLEHEADERDATA_BITS 5 + bits8 t_bits[FLEXIBLE_ARRAY_MEMBER]; /* bitmap of NULLs */ + + /* MORE DATA FOLLOWS AT END OF STRUCT */ +}; + +/* typedef appears in htup.h */ + +#define SizeofHeapTupleHeader offsetof(HeapTupleHeaderData, t_bits) + +/* + * information stored in t_infomask: + */ +#define HEAP_HASNULL 0x0001 /* has null attribute(s) */ +#define HEAP_HASVARWIDTH 0x0002 /* has variable-width attribute(s) */ +#define HEAP_HASEXTERNAL 0x0004 /* has external stored attribute(s) */ +#define HEAP_HASOID_OLD 0x0008 /* has an object-id field */ +#define HEAP_XMAX_KEYSHR_LOCK 0x0010 /* xmax is a key-shared locker */ +#define HEAP_COMBOCID 0x0020 /* t_cid is a combo CID */ +#define HEAP_XMAX_EXCL_LOCK 0x0040 /* xmax is exclusive locker */ +#define HEAP_XMAX_LOCK_ONLY 0x0080 /* xmax, if valid, is only a locker */ + + /* xmax is a shared locker */ +#define HEAP_XMAX_SHR_LOCK (HEAP_XMAX_EXCL_LOCK | HEAP_XMAX_KEYSHR_LOCK) + +#define HEAP_LOCK_MASK (HEAP_XMAX_SHR_LOCK | HEAP_XMAX_EXCL_LOCK | \ + HEAP_XMAX_KEYSHR_LOCK) +#define HEAP_XMIN_COMMITTED 0x0100 /* t_xmin committed */ +#define HEAP_XMIN_INVALID 0x0200 /* t_xmin invalid/aborted */ +#define HEAP_XMIN_FROZEN (HEAP_XMIN_COMMITTED|HEAP_XMIN_INVALID) +#define HEAP_XMAX_COMMITTED 0x0400 /* t_xmax committed */ +#define HEAP_XMAX_INVALID 0x0800 /* t_xmax invalid/aborted */ +#define HEAP_XMAX_IS_MULTI 0x1000 /* t_xmax is a MultiXactId */ +#define HEAP_UPDATED 0x2000 /* this is UPDATEd version of row */ +#define HEAP_MOVED_OFF 0x4000 /* moved to another place by pre-9.0 + * VACUUM FULL; kept for binary + * upgrade support */ +#define HEAP_MOVED_IN 0x8000 /* moved from another place by pre-9.0 + * VACUUM FULL; kept for binary + * upgrade support */ 
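The xmin hint bits above encode four distinct states, and "frozen" is deliberately the combination of both bits so that no extra flag is needed. A standalone sketch of the decoding, with local copies of the masks (the header's own HeapTupleHeaderXminCommitted/XminInvalid/XminFrozen macros, defined further below, apply the same tests to t_infomask):

#include <stdio.h>
#include <stdint.h>

/* local mirrors of the t_infomask bits above */
#define HEAP_XMIN_COMMITTED 0x0100
#define HEAP_XMIN_INVALID   0x0200
#define HEAP_XMIN_FROZEN    (HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID)

static const char *
xmin_status(uint16_t infomask)
{
    if ((infomask & HEAP_XMIN_FROZEN) == HEAP_XMIN_FROZEN)
        return "frozen (visible to every snapshot)";
    if ((infomask & (HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID)) == HEAP_XMIN_INVALID)
        return "inserter aborted";
    if (infomask & HEAP_XMIN_COMMITTED)
        return "inserter committed";
    return "no hint yet; transaction status must be looked up";
}

int
main(void)
{
    printf("%s\n", xmin_status(0));
    printf("%s\n", xmin_status(HEAP_XMIN_COMMITTED));
    printf("%s\n", xmin_status(HEAP_XMIN_INVALID));
    printf("%s\n", xmin_status(HEAP_XMIN_FROZEN));
    return 0;
}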
+#define HEAP_MOVED (HEAP_MOVED_OFF | HEAP_MOVED_IN) + +#define HEAP_XACT_MASK 0xFFF0 /* visibility-related bits */ + +/* + * A tuple is only locked (i.e. not updated by its Xmax) if the + * HEAP_XMAX_LOCK_ONLY bit is set; or, for pg_upgrade's sake, if the Xmax is + * not a multi and the EXCL_LOCK bit is set. + * + * See also HeapTupleHeaderIsOnlyLocked, which also checks for a possible + * aborted updater transaction. + * + * Beware of multiple evaluations of the argument. + */ +#define HEAP_XMAX_IS_LOCKED_ONLY(infomask) \ + (((infomask) & HEAP_XMAX_LOCK_ONLY) || \ + (((infomask) & (HEAP_XMAX_IS_MULTI | HEAP_LOCK_MASK)) == HEAP_XMAX_EXCL_LOCK)) + +/* + * A tuple that has HEAP_XMAX_IS_MULTI and HEAP_XMAX_LOCK_ONLY but neither of + * HEAP_XMAX_EXCL_LOCK and HEAP_XMAX_KEYSHR_LOCK must come from a tuple that was + * share-locked in 9.2 or earlier and then pg_upgrade'd. + * + * In 9.2 and prior, HEAP_XMAX_IS_MULTI was only set when there were multiple + * FOR SHARE lockers of that tuple. That set HEAP_XMAX_LOCK_ONLY (with a + * different name back then) but neither of HEAP_XMAX_EXCL_LOCK and + * HEAP_XMAX_KEYSHR_LOCK. That combination is no longer possible in 9.3 and + * up, so if we see that combination we know for certain that the tuple was + * locked in an earlier release; since all such lockers are gone (they cannot + * survive through pg_upgrade), such tuples can safely be considered not + * locked. + * + * We must not resolve such multixacts locally, because the result would be + * bogus, regardless of where they stand with respect to the current valid + * multixact range. + */ +#define HEAP_LOCKED_UPGRADED(infomask) \ +( \ + ((infomask) & HEAP_XMAX_IS_MULTI) != 0 && \ + ((infomask) & HEAP_XMAX_LOCK_ONLY) != 0 && \ + (((infomask) & (HEAP_XMAX_EXCL_LOCK | HEAP_XMAX_KEYSHR_LOCK)) == 0) \ +) + +/* + * Use these to test whether a particular lock is applied to a tuple + */ +#define HEAP_XMAX_IS_SHR_LOCKED(infomask) \ + (((infomask) & HEAP_LOCK_MASK) == HEAP_XMAX_SHR_LOCK) +#define HEAP_XMAX_IS_EXCL_LOCKED(infomask) \ + (((infomask) & HEAP_LOCK_MASK) == HEAP_XMAX_EXCL_LOCK) +#define HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) \ + (((infomask) & HEAP_LOCK_MASK) == HEAP_XMAX_KEYSHR_LOCK) + +/* turn these all off when Xmax is to change */ +#define HEAP_XMAX_BITS (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | \ + HEAP_XMAX_IS_MULTI | HEAP_LOCK_MASK | HEAP_XMAX_LOCK_ONLY) + +/* + * information stored in t_infomask2: + */ +#define HEAP_NATTS_MASK 0x07FF /* 11 bits for number of attributes */ +/* bits 0x1800 are available */ +#define HEAP_KEYS_UPDATED 0x2000 /* tuple was updated and key cols + * modified, or tuple deleted */ +#define HEAP_HOT_UPDATED 0x4000 /* tuple was HOT-updated */ +#define HEAP_ONLY_TUPLE 0x8000 /* this is heap-only tuple */ + +#define HEAP2_XACT_MASK 0xE000 /* visibility-related bits */ + +/* + * HEAP_TUPLE_HAS_MATCH is a temporary flag used during hash joins. It is + * only used in tuples that are in the hash table, and those don't need + * any visibility information, so we can overlay it on a visibility flag + * instead of using up a dedicated bit. + */ +#define HEAP_TUPLE_HAS_MATCH HEAP_ONLY_TUPLE /* tuple has a join match */ + +/* + * HeapTupleHeader accessor macros + * + * Note: beware of multiple evaluations of "tup" argument. But the Set + * macros evaluate their other argument only once. + */ + +/* + * HeapTupleHeaderGetRawXmin returns the "raw" xmin field, which is the xid + * originally used to insert the tuple. 
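A standalone illustration of the two-armed HEAP_XMAX_IS_LOCKED_ONLY test above, with local copies of the bit definitions; the infomask values are chosen for illustration only:

#include <stdio.h>
#include <stdint.h>

/* local mirrors of the relevant t_infomask bits and macros above */
#define HEAP_XMAX_KEYSHR_LOCK 0x0010
#define HEAP_XMAX_EXCL_LOCK   0x0040
#define HEAP_XMAX_LOCK_ONLY   0x0080
#define HEAP_XMAX_SHR_LOCK    (HEAP_XMAX_EXCL_LOCK | HEAP_XMAX_KEYSHR_LOCK)
#define HEAP_LOCK_MASK        (HEAP_XMAX_SHR_LOCK | HEAP_XMAX_EXCL_LOCK | HEAP_XMAX_KEYSHR_LOCK)
#define HEAP_XMAX_IS_MULTI    0x1000

#define HEAP_XMAX_IS_LOCKED_ONLY(infomask) \
    (((infomask) & HEAP_XMAX_LOCK_ONLY) || \
     (((infomask) & (HEAP_XMAX_IS_MULTI | HEAP_LOCK_MASK)) == HEAP_XMAX_EXCL_LOCK))

int
main(void)
{
    /* a locker: lock-only bit plus the exclusive-lock bit */
    uint16_t locker = HEAP_XMAX_LOCK_ONLY | HEAP_XMAX_EXCL_LOCK;

    /* the pg_upgrade case described above: EXCL bit set, not a multi, no LOCK_ONLY */
    uint16_t old_locker = HEAP_XMAX_EXCL_LOCK;

    /* an xmax that represents an update sets none of these combinations (simplified) */
    uint16_t updater = 0;

    printf("%d %d %d\n",
           HEAP_XMAX_IS_LOCKED_ONLY(locker),        /* 1 */
           HEAP_XMAX_IS_LOCKED_ONLY(old_locker),    /* 1 */
           HEAP_XMAX_IS_LOCKED_ONLY(updater));      /* 0 */
    return 0;
}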
However, the tuple might actually + * be frozen (via HeapTupleHeaderSetXminFrozen) in which case the tuple's xmin + * is visible to every snapshot. Prior to PostgreSQL 9.4, we actually changed + * the xmin to FrozenTransactionId, and that value may still be encountered + * on disk. + */ +#define HeapTupleHeaderGetRawXmin(tup) \ +( \ + (tup)->t_choice.t_heap.t_xmin \ +) + +#define HeapTupleHeaderGetXmin(tup) \ +( \ + HeapTupleHeaderXminFrozen(tup) ? \ + FrozenTransactionId : HeapTupleHeaderGetRawXmin(tup) \ +) + +#define HeapTupleHeaderSetXmin(tup, xid) \ +( \ + (tup)->t_choice.t_heap.t_xmin = (xid) \ +) + +#define HeapTupleHeaderXminCommitted(tup) \ +( \ + ((tup)->t_infomask & HEAP_XMIN_COMMITTED) != 0 \ +) + +#define HeapTupleHeaderXminInvalid(tup) \ +( \ + ((tup)->t_infomask & (HEAP_XMIN_COMMITTED|HEAP_XMIN_INVALID)) == \ + HEAP_XMIN_INVALID \ +) + +#define HeapTupleHeaderXminFrozen(tup) \ +( \ + ((tup)->t_infomask & (HEAP_XMIN_FROZEN)) == HEAP_XMIN_FROZEN \ +) + +#define HeapTupleHeaderSetXminCommitted(tup) \ +( \ + AssertMacro(!HeapTupleHeaderXminInvalid(tup)), \ + ((tup)->t_infomask |= HEAP_XMIN_COMMITTED) \ +) + +#define HeapTupleHeaderSetXminInvalid(tup) \ +( \ + AssertMacro(!HeapTupleHeaderXminCommitted(tup)), \ + ((tup)->t_infomask |= HEAP_XMIN_INVALID) \ +) + +#define HeapTupleHeaderSetXminFrozen(tup) \ +( \ + AssertMacro(!HeapTupleHeaderXminInvalid(tup)), \ + ((tup)->t_infomask |= HEAP_XMIN_FROZEN) \ +) + +/* + * HeapTupleHeaderGetRawXmax gets you the raw Xmax field. To find out the Xid + * that updated a tuple, you might need to resolve the MultiXactId if certain + * bits are set. HeapTupleHeaderGetUpdateXid checks those bits and takes care + * to resolve the MultiXactId if necessary. This might involve multixact I/O, + * so it should only be used if absolutely necessary. + */ +#define HeapTupleHeaderGetUpdateXid(tup) \ +( \ + (!((tup)->t_infomask & HEAP_XMAX_INVALID) && \ + ((tup)->t_infomask & HEAP_XMAX_IS_MULTI) && \ + !((tup)->t_infomask & HEAP_XMAX_LOCK_ONLY)) ? \ + HeapTupleGetUpdateXid(tup) \ + : \ + HeapTupleHeaderGetRawXmax(tup) \ +) + +#define HeapTupleHeaderGetRawXmax(tup) \ +( \ + (tup)->t_choice.t_heap.t_xmax \ +) + +#define HeapTupleHeaderSetXmax(tup, xid) \ +( \ + (tup)->t_choice.t_heap.t_xmax = (xid) \ +) + +/* + * HeapTupleHeaderGetRawCommandId will give you what's in the header whether + * it is useful or not. Most code should use HeapTupleHeaderGetCmin or + * HeapTupleHeaderGetCmax instead, but note that those Assert that you can + * get a legitimate result, ie you are in the originating transaction! + */ +#define HeapTupleHeaderGetRawCommandId(tup) \ +( \ + (tup)->t_choice.t_heap.t_field3.t_cid \ +) + +/* SetCmin is reasonably simple since we never need a combo CID */ +#define HeapTupleHeaderSetCmin(tup, cid) \ +do { \ + Assert(!((tup)->t_infomask & HEAP_MOVED)); \ + (tup)->t_choice.t_heap.t_field3.t_cid = (cid); \ + (tup)->t_infomask &= ~HEAP_COMBOCID; \ +} while (0) + +/* SetCmax must be used after HeapTupleHeaderAdjustCmax; see combocid.c */ +#define HeapTupleHeaderSetCmax(tup, cid, iscombo) \ +do { \ + Assert(!((tup)->t_infomask & HEAP_MOVED)); \ + (tup)->t_choice.t_heap.t_field3.t_cid = (cid); \ + if (iscombo) \ + (tup)->t_infomask |= HEAP_COMBOCID; \ + else \ + (tup)->t_infomask &= ~HEAP_COMBOCID; \ +} while (0) + +#define HeapTupleHeaderGetXvac(tup) \ +( \ + ((tup)->t_infomask & HEAP_MOVED) ? 
\ + (tup)->t_choice.t_heap.t_field3.t_xvac \ + : \ + InvalidTransactionId \ +) + +#define HeapTupleHeaderSetXvac(tup, xid) \ +do { \ + Assert((tup)->t_infomask & HEAP_MOVED); \ + (tup)->t_choice.t_heap.t_field3.t_xvac = (xid); \ +} while (0) + +StaticAssertDecl(MaxOffsetNumber < SpecTokenOffsetNumber, + "invalid speculative token constant"); + +#define HeapTupleHeaderIsSpeculative(tup) \ +( \ + (ItemPointerGetOffsetNumberNoCheck(&(tup)->t_ctid) == SpecTokenOffsetNumber) \ +) + +#define HeapTupleHeaderGetSpeculativeToken(tup) \ +( \ + AssertMacro(HeapTupleHeaderIsSpeculative(tup)), \ + ItemPointerGetBlockNumber(&(tup)->t_ctid) \ +) + +#define HeapTupleHeaderSetSpeculativeToken(tup, token) \ +( \ + ItemPointerSet(&(tup)->t_ctid, token, SpecTokenOffsetNumber) \ +) + +#define HeapTupleHeaderIndicatesMovedPartitions(tup) \ + ItemPointerIndicatesMovedPartitions(&(tup)->t_ctid) + +#define HeapTupleHeaderSetMovedPartitions(tup) \ + ItemPointerSetMovedPartitions(&(tup)->t_ctid) + +#define HeapTupleHeaderGetDatumLength(tup) \ + VARSIZE(tup) + +#define HeapTupleHeaderSetDatumLength(tup, len) \ + SET_VARSIZE(tup, len) + +#define HeapTupleHeaderGetTypeId(tup) \ +( \ + (tup)->t_choice.t_datum.datum_typeid \ +) + +#define HeapTupleHeaderSetTypeId(tup, typeid) \ +( \ + (tup)->t_choice.t_datum.datum_typeid = (typeid) \ +) + +#define HeapTupleHeaderGetTypMod(tup) \ +( \ + (tup)->t_choice.t_datum.datum_typmod \ +) + +#define HeapTupleHeaderSetTypMod(tup, typmod) \ +( \ + (tup)->t_choice.t_datum.datum_typmod = (typmod) \ +) + +/* + * Note that we stop considering a tuple HOT-updated as soon as it is known + * aborted or the would-be updating transaction is known aborted. For best + * efficiency, check tuple visibility before using this macro, so that the + * INVALID bits will be as up to date as possible. + */ +#define HeapTupleHeaderIsHotUpdated(tup) \ +( \ + ((tup)->t_infomask2 & HEAP_HOT_UPDATED) != 0 && \ + ((tup)->t_infomask & HEAP_XMAX_INVALID) == 0 && \ + !HeapTupleHeaderXminInvalid(tup) \ +) + +#define HeapTupleHeaderSetHotUpdated(tup) \ +( \ + (tup)->t_infomask2 |= HEAP_HOT_UPDATED \ +) + +#define HeapTupleHeaderClearHotUpdated(tup) \ +( \ + (tup)->t_infomask2 &= ~HEAP_HOT_UPDATED \ +) + +#define HeapTupleHeaderIsHeapOnly(tup) \ +( \ + ((tup)->t_infomask2 & HEAP_ONLY_TUPLE) != 0 \ +) + +#define HeapTupleHeaderSetHeapOnly(tup) \ +( \ + (tup)->t_infomask2 |= HEAP_ONLY_TUPLE \ +) + +#define HeapTupleHeaderClearHeapOnly(tup) \ +( \ + (tup)->t_infomask2 &= ~HEAP_ONLY_TUPLE \ +) + +#define HeapTupleHeaderHasMatch(tup) \ +( \ + ((tup)->t_infomask2 & HEAP_TUPLE_HAS_MATCH) != 0 \ +) + +#define HeapTupleHeaderSetMatch(tup) \ +( \ + (tup)->t_infomask2 |= HEAP_TUPLE_HAS_MATCH \ +) + +#define HeapTupleHeaderClearMatch(tup) \ +( \ + (tup)->t_infomask2 &= ~HEAP_TUPLE_HAS_MATCH \ +) + +#define HeapTupleHeaderGetNatts(tup) \ + ((tup)->t_infomask2 & HEAP_NATTS_MASK) + +#define HeapTupleHeaderSetNatts(tup, natts) \ +( \ + (tup)->t_infomask2 = ((tup)->t_infomask2 & ~HEAP_NATTS_MASK) | (natts) \ +) + +#define HeapTupleHeaderHasExternal(tup) \ + (((tup)->t_infomask & HEAP_HASEXTERNAL) != 0) + + +/* + * BITMAPLEN(NATTS) - + * Computes size of null bitmap given number of data columns. + */ +#define BITMAPLEN(NATTS) (((int)(NATTS) + 7) / 8) + +/* + * MaxHeapTupleSize is the maximum allowed size of a heap tuple, including + * header and MAXALIGN alignment padding. Basically it's BLCKSZ minus the + * other stuff that has to be on a disk page. Since heap pages use no + * "special space", there's no deduction for that. 
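Two small pieces of arithmetic implied by the macros above, shown as a standalone sketch with local copies of HEAP_NATTS_MASK and BITMAPLEN: the attribute count shares t_infomask2 with flag bits, and the null bitmap occupies one bit per column, rounded up to whole bytes.

#include <stdio.h>
#include <stdint.h>

/* local mirrors of the definitions above */
#define HEAP_NATTS_MASK  0x07FF
#define BITMAPLEN(NATTS) (((int) (NATTS) + 7) / 8)

int
main(void)
{
    uint16_t t_infomask2 = 0x8000 | 3;  /* HEAP_ONLY_TUPLE flag plus natts = 3 */

    printf("natts = %d\n", t_infomask2 & HEAP_NATTS_MASK);              /* 3: flag bits masked away */
    printf("null bitmap for 3 columns: %d byte(s)\n", BITMAPLEN(3));    /* 1 */
    printf("null bitmap for 1600 columns: %d bytes\n", BITMAPLEN(1600));/* 200 */
    return 0;
}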
+ * + * NOTE: we allow for the ItemId that must point to the tuple, ensuring that + * an otherwise-empty page can indeed hold a tuple of this size. Because + * ItemIds and tuples have different alignment requirements, don't assume that + * you can, say, fit 2 tuples of size MaxHeapTupleSize/2 on the same page. + */ +#define MaxHeapTupleSize (BLCKSZ - MAXALIGN(SizeOfPageHeaderData + sizeof(ItemIdData))) +#define MinHeapTupleSize MAXALIGN(SizeofHeapTupleHeader) + +/* + * MaxHeapTuplesPerPage is an upper bound on the number of tuples that can + * fit on one heap page. (Note that indexes could have more, because they + * use a smaller tuple header.) We arrive at the divisor because each tuple + * must be maxaligned, and it must have an associated line pointer. + * + * Note: with HOT, there could theoretically be more line pointers (not actual + * tuples) than this on a heap page. However we constrain the number of line + * pointers to this anyway, to avoid excessive line-pointer bloat and not + * require increases in the size of work arrays. + */ +#define MaxHeapTuplesPerPage \ + ((int) ((BLCKSZ - SizeOfPageHeaderData) / \ + (MAXALIGN(SizeofHeapTupleHeader) + sizeof(ItemIdData)))) + +/* + * MaxAttrSize is a somewhat arbitrary upper limit on the declared size of + * data fields of char(n) and similar types. It need not have anything + * directly to do with the *actual* upper limit of varlena values, which + * is currently 1Gb (see TOAST structures in postgres.h). I've set it + * at 10Mb which seems like a reasonable number --- tgl 8/6/00. + */ +#define MaxAttrSize (10 * 1024 * 1024) + + +/* + * MinimalTuple is an alternative representation that is used for transient + * tuples inside the executor, in places where transaction status information + * is not required, the tuple rowtype is known, and shaving off a few bytes + * is worthwhile because we need to store many tuples. The representation + * is chosen so that tuple access routines can work with either full or + * minimal tuples via a HeapTupleData pointer structure. The access routines + * see no difference, except that they must not access the transaction status + * or t_ctid fields because those aren't there. + * + * For the most part, MinimalTuples should be accessed via TupleTableSlot + * routines. These routines will prevent access to the "system columns" + * and thereby prevent accidental use of the nonexistent fields. + * + * MinimalTupleData contains a length word, some padding, and fields matching + * HeapTupleHeaderData beginning with t_infomask2. The padding is chosen so + * that offsetof(t_infomask2) is the same modulo MAXIMUM_ALIGNOF in both + * structs. This makes data alignment rules equivalent in both cases. + * + * When a minimal tuple is accessed via a HeapTupleData pointer, t_data is + * set to point MINIMAL_TUPLE_OFFSET bytes before the actual start of the + * minimal tuple --- that is, where a full tuple matching the minimal tuple's + * data would start. This trick is what makes the structs seem equivalent. + * + * Note that t_hoff is computed the same as in a full tuple, hence it includes + * the MINIMAL_TUPLE_OFFSET distance. t_len does not include that, however. + * + * MINIMAL_TUPLE_DATA_OFFSET is the offset to the first useful (non-pad) data + * other than the length word. tuplesort.c and tuplestore.c use this to avoid + * writing the padding to disk. 
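Plugging the usual default build parameters into the page-capacity macros above (8 kB BLCKSZ, 8-byte MAXALIGN, 24-byte page header, 4-byte line pointers, 23-byte SizeofHeapTupleHeader; all build-dependent) gives the familiar limits. A standalone sketch of the same arithmetic:

#include <stdio.h>

/* assumed default build parameters; the real ones come from the build */
#define BLCKSZ                 8192
#define MAXIMUM_ALIGNOF        8
#define PAGE_HEADER_SIZE       24   /* SizeOfPageHeaderData */
#define ITEMID_SIZE            4    /* sizeof(ItemIdData) */
#define HEAP_TUPLE_HEADER_SIZE 23   /* offsetof(HeapTupleHeaderData, t_bits) */

#define MAXALIGN(LEN) (((LEN) + MAXIMUM_ALIGNOF - 1) & ~(MAXIMUM_ALIGNOF - 1))

int
main(void)
{
    int max_tuple_size = BLCKSZ - MAXALIGN(PAGE_HEADER_SIZE + ITEMID_SIZE);
    int max_tuples_per_page =
        (BLCKSZ - PAGE_HEADER_SIZE) /
        (MAXALIGN(HEAP_TUPLE_HEADER_SIZE) + ITEMID_SIZE);

    printf("MaxHeapTupleSize     = %d\n", max_tuple_size);      /* 8160 */
    printf("MaxHeapTuplesPerPage = %d\n", max_tuples_per_page); /* 291 */
    return 0;
}

Hence, with these inputs, an 8 kB heap page can never hold more than 291 tuples, and a single in-line tuple tops out at 8160 bytes.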
+ */ +#define MINIMAL_TUPLE_OFFSET \ + ((offsetof(HeapTupleHeaderData, t_infomask2) - sizeof(uint32)) / MAXIMUM_ALIGNOF * MAXIMUM_ALIGNOF) +#define MINIMAL_TUPLE_PADDING \ + ((offsetof(HeapTupleHeaderData, t_infomask2) - sizeof(uint32)) % MAXIMUM_ALIGNOF) +#define MINIMAL_TUPLE_DATA_OFFSET \ + offsetof(MinimalTupleData, t_infomask2) + +struct MinimalTupleData +{ + uint32 t_len; /* actual length of minimal tuple */ + + char mt_padding[MINIMAL_TUPLE_PADDING]; + + /* Fields below here must match HeapTupleHeaderData! */ + + uint16 t_infomask2; /* number of attributes + various flags */ + + uint16 t_infomask; /* various flag bits, see below */ + + uint8 t_hoff; /* sizeof header incl. bitmap, padding */ + + /* ^ - 23 bytes - ^ */ + + bits8 t_bits[FLEXIBLE_ARRAY_MEMBER]; /* bitmap of NULLs */ + + /* MORE DATA FOLLOWS AT END OF STRUCT */ +}; + +/* typedef appears in htup.h */ + +#define SizeofMinimalTupleHeader offsetof(MinimalTupleData, t_bits) + + +/* + * GETSTRUCT - given a HeapTuple pointer, return address of the user data + */ +#define GETSTRUCT(TUP) ((char *) ((TUP)->t_data) + (TUP)->t_data->t_hoff) + +/* + * Accessor macros to be used with HeapTuple pointers. + */ + +#define HeapTupleHasNulls(tuple) \ + (((tuple)->t_data->t_infomask & HEAP_HASNULL) != 0) + +#define HeapTupleNoNulls(tuple) \ + (!((tuple)->t_data->t_infomask & HEAP_HASNULL)) + +#define HeapTupleHasVarWidth(tuple) \ + (((tuple)->t_data->t_infomask & HEAP_HASVARWIDTH) != 0) + +#define HeapTupleAllFixed(tuple) \ + (!((tuple)->t_data->t_infomask & HEAP_HASVARWIDTH)) + +#define HeapTupleHasExternal(tuple) \ + (((tuple)->t_data->t_infomask & HEAP_HASEXTERNAL) != 0) + +#define HeapTupleIsHotUpdated(tuple) \ + HeapTupleHeaderIsHotUpdated((tuple)->t_data) + +#define HeapTupleSetHotUpdated(tuple) \ + HeapTupleHeaderSetHotUpdated((tuple)->t_data) + +#define HeapTupleClearHotUpdated(tuple) \ + HeapTupleHeaderClearHotUpdated((tuple)->t_data) + +#define HeapTupleIsHeapOnly(tuple) \ + HeapTupleHeaderIsHeapOnly((tuple)->t_data) + +#define HeapTupleSetHeapOnly(tuple) \ + HeapTupleHeaderSetHeapOnly((tuple)->t_data) + +#define HeapTupleClearHeapOnly(tuple) \ + HeapTupleHeaderClearHeapOnly((tuple)->t_data) + +/* prototypes for functions in common/heaptuple.c */ +extern Size heap_compute_data_size(TupleDesc tupleDesc, + Datum *values, bool *isnull); +extern void heap_fill_tuple(TupleDesc tupleDesc, + Datum *values, bool *isnull, + char *data, Size data_size, + uint16 *infomask, bits8 *bit); +extern bool heap_attisnull(HeapTuple tup, int attnum, TupleDesc tupleDesc); +extern Datum nocachegetattr(HeapTuple tup, int attnum, + TupleDesc tupleDesc); +extern Datum heap_getsysattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, + bool *isnull); +extern Datum getmissingattr(TupleDesc tupleDesc, + int attnum, bool *isnull); +extern HeapTuple heap_copytuple(HeapTuple tuple); +extern void heap_copytuple_with_tuple(HeapTuple src, HeapTuple dest); +extern Datum heap_copy_tuple_as_datum(HeapTuple tuple, TupleDesc tupleDesc); +extern HeapTuple heap_form_tuple(TupleDesc tupleDescriptor, + Datum *values, bool *isnull); +extern HeapTuple heap_modify_tuple(HeapTuple tuple, + TupleDesc tupleDesc, + Datum *replValues, + bool *replIsnull, + bool *doReplace); +extern HeapTuple heap_modify_tuple_by_cols(HeapTuple tuple, + TupleDesc tupleDesc, + int nCols, + int *replCols, + Datum *replValues, + bool *replIsnull); +extern void heap_deform_tuple(HeapTuple tuple, TupleDesc tupleDesc, + Datum *values, bool *isnull); +extern void heap_freetuple(HeapTuple htup); +extern 
MinimalTuple heap_form_minimal_tuple(TupleDesc tupleDescriptor, + Datum *values, bool *isnull); +extern void heap_free_minimal_tuple(MinimalTuple mtup); +extern MinimalTuple heap_copy_minimal_tuple(MinimalTuple mtup); +extern HeapTuple heap_tuple_from_minimal_tuple(MinimalTuple mtup); +extern MinimalTuple minimal_tuple_from_heap_tuple(HeapTuple htup); +extern size_t varsize_any(void *p); +extern HeapTuple heap_expand_tuple(HeapTuple sourceTuple, TupleDesc tupleDesc); +extern MinimalTuple minimal_expand_tuple(HeapTuple sourceTuple, TupleDesc tupleDesc); + +#ifndef FRONTEND +/* + * fastgetattr + * Fetch a user attribute's value as a Datum (might be either a + * value, or a pointer into the data area of the tuple). + * + * This must not be used when a system attribute might be requested. + * Furthermore, the passed attnum MUST be valid. Use heap_getattr() + * instead, if in doubt. + * + * This gets called many times, so we macro the cacheable and NULL + * lookups, and call nocachegetattr() for the rest. + */ +static inline Datum +fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull) +{ + Assert(attnum > 0); + + *isnull = false; + if (HeapTupleNoNulls(tup)) + { + Form_pg_attribute att; + + att = TupleDescAttr(tupleDesc, attnum - 1); + if (att->attcacheoff >= 0) + return fetchatt(att, (char *) tup->t_data + tup->t_data->t_hoff + + att->attcacheoff); + else + return nocachegetattr(tup, attnum, tupleDesc); + } + else + { + if (att_isnull(attnum - 1, tup->t_data->t_bits)) + { + *isnull = true; + return (Datum) NULL; + } + else + return nocachegetattr(tup, attnum, tupleDesc); + } +} + +/* + * heap_getattr + * Extract an attribute of a heap tuple and return it as a Datum. + * This works for either system or user attributes. The given attnum + * is properly range-checked. + * + * If the field in question has a NULL value, we return a zero Datum + * and set *isnull == true. Otherwise, we set *isnull == false. + * + * <tup> is the pointer to the heap tuple. <attnum> is the attribute + * number of the column (field) caller wants. <tupleDesc> is a + * pointer to the structure describing the row and all its fields. + * + */ +static inline Datum +heap_getattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull) +{ + if (attnum > 0) + { + if (attnum > (int) HeapTupleHeaderGetNatts(tup->t_data)) + return getmissingattr(tupleDesc, attnum, isnull); + else + return fastgetattr(tup, attnum, tupleDesc, isnull); + } + else + return heap_getsysattr(tup, attnum, tupleDesc, isnull); +} +#endif /* FRONTEND */ + +#endif /* HTUP_DETAILS_H */ diff --git a/src/include/access/itup.h b/src/include/access/itup.h new file mode 100644 index 0000000..2e2b8c7 --- /dev/null +++ b/src/include/access/itup.h @@ -0,0 +1,170 @@ +/*------------------------------------------------------------------------- + * + * itup.h + * POSTGRES index tuple definitions. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/itup.h + * + *------------------------------------------------------------------------- + */ +#ifndef ITUP_H +#define ITUP_H + +#include "access/tupdesc.h" +#include "access/tupmacs.h" +#include "storage/bufpage.h" +#include "storage/itemptr.h" + +/* + * Index tuple header structure + * + * All index tuples start with IndexTupleData. If the HasNulls bit is set, + * this is followed by an IndexAttributeBitMapData. 
The index attribute + * values follow, beginning at a MAXALIGN boundary. + * + * Note that the space allocated for the bitmap does not vary with the number + * of attributes; that is because we don't have room to store the number of + * attributes in the header. Given the MAXALIGN constraint there's no space + * savings to be had anyway, for usual values of INDEX_MAX_KEYS. + */ + +typedef struct IndexTupleData +{ + ItemPointerData t_tid; /* reference TID to heap tuple */ + + /* --------------- + * t_info is laid out in the following fashion: + * + * 15th (high) bit: has nulls + * 14th bit: has var-width attributes + * 13th bit: AM-defined meaning + * 12-0 bit: size of tuple + * --------------- + */ + + unsigned short t_info; /* various info about tuple */ + +} IndexTupleData; /* MORE DATA FOLLOWS AT END OF STRUCT */ + +typedef IndexTupleData *IndexTuple; + +typedef struct IndexAttributeBitMapData +{ + bits8 bits[(INDEX_MAX_KEYS + 8 - 1) / 8]; +} IndexAttributeBitMapData; + +typedef IndexAttributeBitMapData * IndexAttributeBitMap; + +/* + * t_info manipulation macros + */ +#define INDEX_SIZE_MASK 0x1FFF +#define INDEX_AM_RESERVED_BIT 0x2000 /* reserved for index-AM specific + * usage */ +#define INDEX_VAR_MASK 0x4000 +#define INDEX_NULL_MASK 0x8000 + +#define IndexTupleSize(itup) ((Size) ((itup)->t_info & INDEX_SIZE_MASK)) +#define IndexTupleHasNulls(itup) ((((IndexTuple) (itup))->t_info & INDEX_NULL_MASK)) +#define IndexTupleHasVarwidths(itup) ((((IndexTuple) (itup))->t_info & INDEX_VAR_MASK)) + + +/* routines in indextuple.c */ +extern IndexTuple index_form_tuple(TupleDesc tupleDescriptor, + Datum *values, bool *isnull); +extern IndexTuple index_form_tuple_context(TupleDesc tupleDescriptor, + Datum *values, bool *isnull, + MemoryContext context); +extern Datum nocache_index_getattr(IndexTuple tup, int attnum, + TupleDesc tupleDesc); +extern void index_deform_tuple(IndexTuple tup, TupleDesc tupleDescriptor, + Datum *values, bool *isnull); +extern void index_deform_tuple_internal(TupleDesc tupleDescriptor, + Datum *values, bool *isnull, + char *tp, bits8 *bp, int hasnulls); +extern IndexTuple CopyIndexTuple(IndexTuple source); +extern IndexTuple index_truncate_tuple(TupleDesc sourceDescriptor, + IndexTuple source, int leavenatts); + + +/* + * Takes an infomask as argument (primarily because this needs to be usable + * at index_form_tuple time so enough space is allocated). + */ +static inline Size +IndexInfoFindDataOffset(unsigned short t_info) +{ + if (!(t_info & INDEX_NULL_MASK)) + return MAXALIGN(sizeof(IndexTupleData)); + else + return MAXALIGN(sizeof(IndexTupleData) + sizeof(IndexAttributeBitMapData)); +} + +#ifndef FRONTEND + +/* ---------------- + * index_getattr + * + * This gets called many times, so we macro the cacheable and NULL + * lookups, and call nocache_index_getattr() for the rest. 
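A standalone sketch of packing and unpacking the t_info word laid out above, using local copies of the masks; the tuple size must fit in the low 13 bits, and the two high flag bits ride alongside it:

#include <stdio.h>
#include <stdint.h>

/* local copies of the t_info masks above */
#define INDEX_SIZE_MASK 0x1FFF
#define INDEX_VAR_MASK  0x4000
#define INDEX_NULL_MASK 0x8000

int
main(void)
{
    size_t   tupsize = 48;  /* hypothetical index tuple size; must fit in 13 bits */
    uint16_t t_info = (uint16_t) (tupsize & INDEX_SIZE_MASK);

    t_info |= INDEX_NULL_MASK;  /* pretend one of the key columns is NULL */

    printf("size = %d, hasnulls = %d, hasvarwidth = %d\n",
           t_info & INDEX_SIZE_MASK,
           (t_info & INDEX_NULL_MASK) != 0,
           (t_info & INDEX_VAR_MASK) != 0);
    return 0;
}

Because the null flag lives in t_info rather than in a per-tuple attribute count, IndexInfoFindDataOffset() above only needs t_info to decide whether the bitmap (and its alignment padding) precedes the attribute data.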
+ * + * ---------------- + */ +static inline Datum +index_getattr(IndexTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull) +{ + Assert(PointerIsValid(isnull)); + Assert(attnum > 0); + + *isnull = false; + + if (!IndexTupleHasNulls(tup)) + { + if (TupleDescAttr(tupleDesc, attnum - 1)->attcacheoff >= 0) + { + return fetchatt(TupleDescAttr(tupleDesc, attnum - 1), + (char *) tup + IndexInfoFindDataOffset(tup->t_info) + + TupleDescAttr(tupleDesc, attnum - 1)->attcacheoff); + } + else + return nocache_index_getattr(tup, attnum, tupleDesc); + } + else + { + if (att_isnull(attnum - 1, (bits8 *) tup + sizeof(IndexTupleData))) + { + *isnull = true; + return (Datum) NULL; + } + else + return nocache_index_getattr(tup, attnum, tupleDesc); + } +} + +#endif + +/* + * MaxIndexTuplesPerPage is an upper bound on the number of tuples that can + * fit on one index page. An index tuple must have either data or a null + * bitmap, so we can safely assume it's at least 1 byte bigger than a bare + * IndexTupleData struct. We arrive at the divisor because each tuple + * must be maxaligned, and it must have an associated line pointer. + * + * To be index-type-independent, this does not account for any special space + * on the page, and is thus conservative. + * + * Note: in btree non-leaf pages, the first tuple has no key (it's implicitly + * minus infinity), thus breaking the "at least 1 byte bigger" assumption. + * On such a page, N tuples could take one MAXALIGN quantum less space than + * estimated here, seemingly allowing one more tuple than estimated here. + * But such a page always has at least MAXALIGN special space, so we're safe. + */ +#define MaxIndexTuplesPerPage \ + ((int) ((BLCKSZ - SizeOfPageHeaderData) / \ + (MAXALIGN(sizeof(IndexTupleData) + 1) + sizeof(ItemIdData)))) + +#endif /* ITUP_H */ diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h new file mode 100644 index 0000000..246f757 --- /dev/null +++ b/src/include/access/multixact.h @@ -0,0 +1,165 @@ +/* + * multixact.h + * + * PostgreSQL multi-transaction-log manager + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/multixact.h + */ +#ifndef MULTIXACT_H +#define MULTIXACT_H + +#include "access/xlogreader.h" +#include "lib/stringinfo.h" +#include "storage/sync.h" + + +/* + * The first two MultiXactId values are reserved to store the truncation Xid + * and epoch of the first segment, so we start assigning multixact values from + * 2. + */ +#define InvalidMultiXactId ((MultiXactId) 0) +#define FirstMultiXactId ((MultiXactId) 1) +#define MaxMultiXactId ((MultiXactId) 0xFFFFFFFF) + +#define MultiXactIdIsValid(multi) ((multi) != InvalidMultiXactId) + +#define MaxMultiXactOffset ((MultiXactOffset) 0xFFFFFFFF) + +/* Number of SLRU buffers to use for multixact */ +#define NUM_MULTIXACTOFFSET_BUFFERS 8 +#define NUM_MULTIXACTMEMBER_BUFFERS 16 + +/* + * Possible multixact lock modes ("status"). The first four modes are for + * tuple locks (FOR KEY SHARE, FOR SHARE, FOR NO KEY UPDATE, FOR UPDATE); the + * next two are used for update and delete modes. 
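A standalone sketch of how the status codes in the enum defined just below divide into "pure lock" and "update" members, mirroring the ISUPDATE_from_mxstatus() test (local names only, not the real enum):

#include <stdio.h>

/* local mirror of the MultiXactStatus ordering defined just below */
typedef enum
{
    ForKeyShare = 0,
    ForShare,
    ForNoKeyUpdate,
    ForUpdate,
    NoKeyUpdate,    /* an update that leaves key columns alone */
    Update          /* other updates, and delete */
} DemoStatus;

/* mirrors ISUPDATE_from_mxstatus(): only the last two statuses change the tuple */
#define IS_UPDATE(status) ((status) > ForUpdate)

int
main(void)
{
    DemoStatus s;

    for (s = ForKeyShare; s <= Update; s++)
        printf("status %d: %s\n", (int) s, IS_UPDATE(s) ? "update/delete" : "lock only");
    return 0;
}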
+ */ +typedef enum +{ + MultiXactStatusForKeyShare = 0x00, + MultiXactStatusForShare = 0x01, + MultiXactStatusForNoKeyUpdate = 0x02, + MultiXactStatusForUpdate = 0x03, + /* an update that doesn't touch "key" columns */ + MultiXactStatusNoKeyUpdate = 0x04, + /* other updates, and delete */ + MultiXactStatusUpdate = 0x05 +} MultiXactStatus; + +#define MaxMultiXactStatus MultiXactStatusUpdate + +/* does a status value correspond to a tuple update? */ +#define ISUPDATE_from_mxstatus(status) \ + ((status) > MultiXactStatusForUpdate) + + +typedef struct MultiXactMember +{ + TransactionId xid; + MultiXactStatus status; +} MultiXactMember; + + +/* ---------------- + * multixact-related XLOG entries + * ---------------- + */ + +#define XLOG_MULTIXACT_ZERO_OFF_PAGE 0x00 +#define XLOG_MULTIXACT_ZERO_MEM_PAGE 0x10 +#define XLOG_MULTIXACT_CREATE_ID 0x20 +#define XLOG_MULTIXACT_TRUNCATE_ID 0x30 + +typedef struct xl_multixact_create +{ + MultiXactId mid; /* new MultiXact's ID */ + MultiXactOffset moff; /* its starting offset in members file */ + int32 nmembers; /* number of member XIDs */ + MultiXactMember members[FLEXIBLE_ARRAY_MEMBER]; +} xl_multixact_create; + +#define SizeOfMultiXactCreate (offsetof(xl_multixact_create, members)) + +typedef struct xl_multixact_truncate +{ + Oid oldestMultiDB; + + /* to-be-truncated range of multixact offsets */ + MultiXactId startTruncOff; /* just for completeness' sake */ + MultiXactId endTruncOff; + + /* to-be-truncated range of multixact members */ + MultiXactOffset startTruncMemb; + MultiXactOffset endTruncMemb; +} xl_multixact_truncate; + +#define SizeOfMultiXactTruncate (sizeof(xl_multixact_truncate)) + + +extern MultiXactId MultiXactIdCreate(TransactionId xid1, + MultiXactStatus status1, TransactionId xid2, + MultiXactStatus status2); +extern MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid, + MultiXactStatus status); +extern MultiXactId MultiXactIdCreateFromMembers(int nmembers, + MultiXactMember *members); + +extern MultiXactId ReadNextMultiXactId(void); +extern void ReadMultiXactIdRange(MultiXactId *oldest, MultiXactId *next); +extern bool MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly); +extern void MultiXactIdSetOldestMember(void); +extern int GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, + bool from_pgupgrade, bool isLockOnly); +extern bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2); +extern bool MultiXactIdPrecedesOrEquals(MultiXactId multi1, + MultiXactId multi2); + +extern int multixactoffsetssyncfiletag(const FileTag *ftag, char *path); +extern int multixactmemberssyncfiletag(const FileTag *ftag, char *path); + +extern void AtEOXact_MultiXact(void); +extern void AtPrepare_MultiXact(void); +extern void PostPrepare_MultiXact(TransactionId xid); + +extern Size MultiXactShmemSize(void); +extern void MultiXactShmemInit(void); +extern void BootStrapMultiXact(void); +extern void StartupMultiXact(void); +extern void TrimMultiXact(void); +extern void SetMultiXactIdLimit(MultiXactId oldest_datminmxid, + Oid oldest_datoid, + bool is_startup); +extern void MultiXactGetCheckptMulti(bool is_shutdown, + MultiXactId *nextMulti, + MultiXactOffset *nextMultiOffset, + MultiXactId *oldestMulti, + Oid *oldestMultiDB); +extern void CheckPointMultiXact(void); +extern MultiXactId GetOldestMultiXactId(void); +extern void TruncateMultiXact(MultiXactId newOldestMulti, + Oid newOldestMultiDB); +extern void MultiXactSetNextMXact(MultiXactId nextMulti, + MultiXactOffset nextMultiOffset); +extern void 
MultiXactAdvanceNextMXact(MultiXactId minMulti, + MultiXactOffset minMultiOffset); +extern void MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB); +extern int MultiXactMemberFreezeThreshold(void); + +extern void multixact_twophase_recover(TransactionId xid, uint16 info, + void *recdata, uint32 len); +extern void multixact_twophase_postcommit(TransactionId xid, uint16 info, + void *recdata, uint32 len); +extern void multixact_twophase_postabort(TransactionId xid, uint16 info, + void *recdata, uint32 len); + +extern void multixact_redo(XLogReaderState *record); +extern void multixact_desc(StringInfo buf, XLogReaderState *record); +extern const char *multixact_identify(uint8 info); +extern char *mxid_to_string(MultiXactId multi, int nmembers, + MultiXactMember *members); + +#endif /* MULTIXACT_H */ diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h new file mode 100644 index 0000000..9020abe --- /dev/null +++ b/src/include/access/nbtree.h @@ -0,0 +1,1298 @@ +/*------------------------------------------------------------------------- + * + * nbtree.h + * header file for postgres btree access method implementation. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/nbtree.h + * + *------------------------------------------------------------------------- + */ +#ifndef NBTREE_H +#define NBTREE_H + +#include "access/amapi.h" +#include "access/itup.h" +#include "access/sdir.h" +#include "access/tableam.h" +#include "access/xlogreader.h" +#include "catalog/pg_am_d.h" +#include "catalog/pg_index.h" +#include "lib/stringinfo.h" +#include "storage/bufmgr.h" +#include "storage/shm_toc.h" + +/* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */ +typedef uint16 BTCycleId; + +/* + * BTPageOpaqueData -- At the end of every page, we store a pointer + * to both siblings in the tree. This is used to do forward/backward + * index scans. The next-page link is also critical for recovery when + * a search has navigated to the wrong page due to concurrent page splits + * or deletions; see src/backend/access/nbtree/README for more info. + * + * In addition, we store the page's btree level (counting upwards from + * zero at a leaf page) as well as some flag bits indicating the page type + * and status. If the page is deleted, a BTDeletedPageData struct is stored + * in the page's tuple area, while a standard BTPageOpaqueData struct is + * stored in the page special area. + * + * We also store a "vacuum cycle ID". When a page is split while VACUUM is + * processing the index, a nonzero value associated with the VACUUM run is + * stored into both halves of the split page. (If VACUUM is not running, + * both pages receive zero cycleids.) This allows VACUUM to detect whether + * a page was split since it started, with a small probability of false match + * if the page was last split some exact multiple of MAX_BT_CYCLE_ID VACUUMs + * ago. Also, during a split, the BTP_SPLIT_END flag is cleared in the left + * (original) page, and set in the right page, but only if the next page + * to its right has a different cycleid. + * + * NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested + * instead. + * + * NOTE: the btpo_level field used to be a union type in order to allow + * deleted pages to store a 32-bit safexid in the same field. We now store + * 64-bit/full safexid values using BTDeletedPageData instead. 
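 *
 * A minimal sketch of how callers typically read this struct (not code from
 * nbtree itself; it relies on the BTPageGetOpaque() accessor and the P_*
 * macros defined further down, and assumes the page is pinned and locked):
 *
 *     BTPageOpaque opaque = BTPageGetOpaque(page);
 *
 *     if (P_ISLEAF(opaque))
 *         Assert(opaque->btpo_level == 0);   // BTP_LEAF is redundant with level
 *     if (opaque->btpo_next != P_NONE)
 *     {
 *         // a right sibling exists; a concurrent split may have moved the
 *         // keys we want there, so searches must be prepared to step right
 *     }
 *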
+ */ + +typedef struct BTPageOpaqueData +{ + BlockNumber btpo_prev; /* left sibling, or P_NONE if leftmost */ + BlockNumber btpo_next; /* right sibling, or P_NONE if rightmost */ + uint32 btpo_level; /* tree level --- zero for leaf pages */ + uint16 btpo_flags; /* flag bits, see below */ + BTCycleId btpo_cycleid; /* vacuum cycle ID of latest split */ +} BTPageOpaqueData; + +typedef BTPageOpaqueData *BTPageOpaque; + +#define BTPageGetOpaque(page) ((BTPageOpaque) PageGetSpecialPointer(page)) + +/* Bits defined in btpo_flags */ +#define BTP_LEAF (1 << 0) /* leaf page, i.e. not internal page */ +#define BTP_ROOT (1 << 1) /* root page (has no parent) */ +#define BTP_DELETED (1 << 2) /* page has been deleted from tree */ +#define BTP_META (1 << 3) /* meta-page */ +#define BTP_HALF_DEAD (1 << 4) /* empty, but still in tree */ +#define BTP_SPLIT_END (1 << 5) /* rightmost page of split group */ +#define BTP_HAS_GARBAGE (1 << 6) /* page has LP_DEAD tuples (deprecated) */ +#define BTP_INCOMPLETE_SPLIT (1 << 7) /* right sibling's downlink is missing */ +#define BTP_HAS_FULLXID (1 << 8) /* contains BTDeletedPageData */ + +/* + * The max allowed value of a cycle ID is a bit less than 64K. This is + * for convenience of pg_filedump and similar utilities: we want to use + * the last 2 bytes of special space as an index type indicator, and + * restricting cycle ID lets btree use that space for vacuum cycle IDs + * while still allowing index type to be identified. + */ +#define MAX_BT_CYCLE_ID 0xFF7F + + +/* + * The Meta page is always the first page in the btree index. + * Its primary purpose is to point to the location of the btree root page. + * We also point to the "fast" root, which is the current effective root; + * see README for discussion. + */ + +typedef struct BTMetaPageData +{ + uint32 btm_magic; /* should contain BTREE_MAGIC */ + uint32 btm_version; /* nbtree version (always <= BTREE_VERSION) */ + BlockNumber btm_root; /* current root location */ + uint32 btm_level; /* tree level of the root page */ + BlockNumber btm_fastroot; /* current "fast" root location */ + uint32 btm_fastlevel; /* tree level of the "fast" root page */ + /* remaining fields only valid when btm_version >= BTREE_NOVAC_VERSION */ + + /* number of deleted, non-recyclable pages during last cleanup */ + uint32 btm_last_cleanup_num_delpages; + /* number of heap tuples during last cleanup (deprecated) */ + float8 btm_last_cleanup_num_heap_tuples; + + bool btm_allequalimage; /* are all columns "equalimage"? */ +} BTMetaPageData; + +#define BTPageGetMeta(p) \ + ((BTMetaPageData *) PageGetContents(p)) + +/* + * The current Btree version is 4. That's what you'll get when you create + * a new index. + * + * Btree version 3 was used in PostgreSQL v11. It is mostly the same as + * version 4, but heap TIDs were not part of the keyspace. Index tuples + * with duplicate keys could be stored in any order. We continue to + * support reading and writing Btree versions 2 and 3, so that they don't + * need to be immediately re-indexed at pg_upgrade. In order to get the + * new heapkeyspace semantics, however, a REINDEX is needed. + * + * Deduplication is safe to use when the btm_allequalimage field is set to + * true. It's safe to read the btm_allequalimage field on version 3, but + * only version 4 indexes make use of deduplication. 
Even version 4 + * indexes created on PostgreSQL v12 will need a REINDEX to make use of + * deduplication, though, since there is no other way to set + * btm_allequalimage to true (pg_upgrade hasn't been taught to set the + * metapage field). + * + * Btree version 2 is mostly the same as version 3. There are two new + * fields in the metapage that were introduced in version 3. A version 2 + * metapage will be automatically upgraded to version 3 on the first + * insert to it. INCLUDE indexes cannot use version 2. + */ +#define BTREE_METAPAGE 0 /* first page is meta */ +#define BTREE_MAGIC 0x053162 /* magic number in metapage */ +#define BTREE_VERSION 4 /* current version number */ +#define BTREE_MIN_VERSION 2 /* minimum supported version */ +#define BTREE_NOVAC_VERSION 3 /* version with all meta fields set */ + +/* + * Maximum size of a btree index entry, including its tuple header. + * + * We actually need to be able to fit three items on every page, + * so restrict any one item to 1/3 the per-page available space. + * + * There are rare cases where _bt_truncate() will need to enlarge + * a heap index tuple to make space for a tiebreaker heap TID + * attribute, which we account for here. + */ +#define BTMaxItemSize(page) \ + (MAXALIGN_DOWN((PageGetPageSize(page) - \ + MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \ + MAXALIGN(sizeof(BTPageOpaqueData))) / 3) - \ + MAXALIGN(sizeof(ItemPointerData))) +#define BTMaxItemSizeNoHeapTid(page) \ + MAXALIGN_DOWN((PageGetPageSize(page) - \ + MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \ + MAXALIGN(sizeof(BTPageOpaqueData))) / 3) + +/* + * MaxTIDsPerBTreePage is an upper bound on the number of heap TIDs tuples + * that may be stored on a btree leaf page. It is used to size the + * per-page temporary buffers. + * + * Note: we don't bother considering per-tuple overheads here to keep + * things simple (value is based on how many elements a single array of + * heap TIDs must have to fill the space between the page header and + * special area). The value is slightly higher (i.e. more conservative) + * than necessary as a result, which is considered acceptable. + */ +#define MaxTIDsPerBTreePage \ + (int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \ + sizeof(ItemPointerData)) + +/* + * The leaf-page fillfactor defaults to 90% but is user-adjustable. + * For pages above the leaf level, we use a fixed 70% fillfactor. + * The fillfactor is applied during index build and when splitting + * a rightmost page; when splitting non-rightmost pages we try to + * divide the data equally. When splitting a page that's entirely + * filled with a single value (duplicates), the effective leaf-page + * fillfactor is 96%, regardless of whether the page is a rightmost + * page. + */ +#define BTREE_MIN_FILLFACTOR 10 +#define BTREE_DEFAULT_FILLFACTOR 90 +#define BTREE_NONLEAF_FILLFACTOR 70 +#define BTREE_SINGLEVAL_FILLFACTOR 96 + +/* + * In general, the btree code tries to localize its knowledge about + * page layout to a couple of routines. However, we need a special + * value to indicate "no page number" in those places where we expect + * page numbers. We can use zero for this because we never need to + * make a pointer to the metadata page. + */ + +#define P_NONE 0 + +/* + * Macros to test whether a page is leftmost or rightmost on its tree level, + * as well as other state info kept in the opaque data. 
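 *
 * As a hypothetical fragment (not nbtree code) showing how the tests defined
 * just below are usually combined while walking the tree:
 *
 *     BlockNumber  next = P_NONE;
 *     BTPageOpaque opaque = BTPageGetOpaque(page);
 *
 *     if (P_IGNORE(opaque))
 *     {
 *         // half-dead or deleted page: don't search it, just follow the
 *         // right sibling link and try again
 *         next = opaque->btpo_next;
 *     }
 *     else if (P_RIGHTMOST(opaque))
 *     {
 *         // no high key: every key not found further left belongs here
 *     }
 *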
+ */ +#define P_LEFTMOST(opaque) ((opaque)->btpo_prev == P_NONE) +#define P_RIGHTMOST(opaque) ((opaque)->btpo_next == P_NONE) +#define P_ISLEAF(opaque) (((opaque)->btpo_flags & BTP_LEAF) != 0) +#define P_ISROOT(opaque) (((opaque)->btpo_flags & BTP_ROOT) != 0) +#define P_ISDELETED(opaque) (((opaque)->btpo_flags & BTP_DELETED) != 0) +#define P_ISMETA(opaque) (((opaque)->btpo_flags & BTP_META) != 0) +#define P_ISHALFDEAD(opaque) (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0) +#define P_IGNORE(opaque) (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0) +#define P_HAS_GARBAGE(opaque) (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0) +#define P_INCOMPLETE_SPLIT(opaque) (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0) +#define P_HAS_FULLXID(opaque) (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0) + +/* + * BTDeletedPageData is the page contents of a deleted page + */ +typedef struct BTDeletedPageData +{ + FullTransactionId safexid; /* See BTPageIsRecyclable() */ +} BTDeletedPageData; + +static inline void +BTPageSetDeleted(Page page, FullTransactionId safexid) +{ + BTPageOpaque opaque; + PageHeader header; + BTDeletedPageData *contents; + + opaque = BTPageGetOpaque(page); + header = ((PageHeader) page); + + opaque->btpo_flags &= ~BTP_HALF_DEAD; + opaque->btpo_flags |= BTP_DELETED | BTP_HAS_FULLXID; + header->pd_lower = MAXALIGN(SizeOfPageHeaderData) + + sizeof(BTDeletedPageData); + header->pd_upper = header->pd_special; + + /* Set safexid in deleted page */ + contents = ((BTDeletedPageData *) PageGetContents(page)); + contents->safexid = safexid; +} + +static inline FullTransactionId +BTPageGetDeleteXid(Page page) +{ + BTPageOpaque opaque; + BTDeletedPageData *contents; + + /* We only expect to be called with a deleted page */ + Assert(!PageIsNew(page)); + opaque = BTPageGetOpaque(page); + Assert(P_ISDELETED(opaque)); + + /* pg_upgrade'd deleted page -- must be safe to delete now */ + if (!P_HAS_FULLXID(opaque)) + return FirstNormalFullTransactionId; + + /* Get safexid from deleted page */ + contents = ((BTDeletedPageData *) PageGetContents(page)); + return contents->safexid; +} + +/* + * Is an existing page recyclable? + * + * This exists to centralize the policy on which deleted pages are now safe to + * re-use. However, _bt_pendingfsm_finalize() duplicates some of the same + * logic because it doesn't work directly with pages -- keep the two in sync. + * + * Note: PageIsNew() pages are always safe to recycle, but we can't deal with + * them here (caller is responsible for that case themselves). Caller might + * well need special handling for new pages anyway. + */ +static inline bool +BTPageIsRecyclable(Page page, Relation heaprel) +{ + BTPageOpaque opaque; + + Assert(!PageIsNew(page)); + Assert(heaprel != NULL); + + /* Recycling okay iff page is deleted and safexid is old enough */ + opaque = BTPageGetOpaque(page); + if (P_ISDELETED(opaque)) + { + FullTransactionId safexid = BTPageGetDeleteXid(page); + + /* + * The page was deleted, but when? If it was just deleted, a scan + * might have seen the downlink to it, and will read the page later. + * As long as that can happen, we must keep the deleted page around as + * a tombstone. + * + * For that check if the deletion XID could still be visible to + * anyone. If not, then no scan that's still in progress could have + * seen its downlink, and we can recycle it. + */ + return GlobalVisCheckRemovableFullXid(heaprel, safexid); + } + + return false; +} + +/* + * BTVacState and BTPendingFSM are private nbtree.c state used during VACUUM. 
+ * They are exported for use by page deletion related code in nbtpage.c. + */ +typedef struct BTPendingFSM +{ + BlockNumber target; /* Page deleted by current VACUUM */ + FullTransactionId safexid; /* Page's BTDeletedPageData.safexid */ +} BTPendingFSM; + +typedef struct BTVacState +{ + IndexVacuumInfo *info; + IndexBulkDeleteResult *stats; + IndexBulkDeleteCallback callback; + void *callback_state; + BTCycleId cycleid; + MemoryContext pagedelcontext; + + /* + * _bt_pendingfsm_finalize() state + */ + int bufsize; /* pendingpages space (in # elements) */ + int maxbufsize; /* max bufsize that respects work_mem */ + BTPendingFSM *pendingpages; /* One entry per newly deleted page */ + int npendingpages; /* current # valid pendingpages */ +} BTVacState; + +/* + * Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost + * page. The high key is not a tuple that is used to visit the heap. It is + * a pivot tuple (see "Notes on B-Tree tuple format" below for definition). + * The high key on a page is required to be greater than or equal to any + * other key that appears on the page. If we find ourselves trying to + * insert a key that is strictly > high key, we know we need to move right + * (this should only happen if the page was split since we examined the + * parent page). + * + * Our insertion algorithm guarantees that we can use the initial least key + * on our right sibling as the high key. Once a page is created, its high + * key changes only if the page is split. + * + * On a non-rightmost page, the high key lives in item 1 and data items + * start in item 2. Rightmost pages have no high key, so we store data + * items beginning in item 1. + */ + +#define P_HIKEY ((OffsetNumber) 1) +#define P_FIRSTKEY ((OffsetNumber) 2) +#define P_FIRSTDATAKEY(opaque) (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY) + +/* + * Notes on B-Tree tuple format, and key and non-key attributes: + * + * INCLUDE B-Tree indexes have non-key attributes. These are extra + * attributes that may be returned by index-only scans, but do not influence + * the order of items in the index (formally, non-key attributes are not + * considered to be part of the key space). Non-key attributes are only + * present in leaf index tuples whose item pointers actually point to heap + * tuples (non-pivot tuples). _bt_check_natts() enforces the rules + * described here. + * + * Non-pivot tuple format (plain/non-posting variant): + * + * t_tid | t_info | key values | INCLUDE columns, if any + * + * t_tid points to the heap TID, which is a tiebreaker key column as of + * BTREE_VERSION 4. + * + * Non-pivot tuples complement pivot tuples, which only have key columns. + * The sole purpose of pivot tuples is to represent how the key space is + * separated. In general, any B-Tree index that has more than one level + * (i.e. any index that does not just consist of a metapage and a single + * leaf root page) must have some number of pivot tuples, since pivot + * tuples are used for traversing the tree. Suffix truncation can omit + * trailing key columns when a new pivot is formed, which makes minus + * infinity their logical value. Since BTREE_VERSION 4 indexes treat heap + * TID as a trailing key column that ensures that all index tuples are + * physically unique, it is necessary to represent heap TID as a trailing + * key column in pivot tuples, though very often this can be truncated + * away, just like any other key column. 
(Actually, the heap TID is + * omitted rather than truncated, since its representation is different to + * the non-pivot representation.) + * + * Pivot tuple format: + * + * t_tid | t_info | key values | [heap TID] + * + * We store the number of columns present inside pivot tuples by abusing + * their t_tid offset field, since pivot tuples never need to store a real + * offset (pivot tuples generally store a downlink in t_tid, though). The + * offset field only stores the number of columns/attributes when the + * INDEX_ALT_TID_MASK bit is set, which doesn't count the trailing heap + * TID column sometimes stored in pivot tuples -- that's represented by + * the presence of BT_PIVOT_HEAP_TID_ATTR. The INDEX_ALT_TID_MASK bit in + * t_info is always set on BTREE_VERSION 4 pivot tuples, since + * BTreeTupleIsPivot() must work reliably on heapkeyspace versions. + * + * In version 2 or version 3 (!heapkeyspace) indexes, INDEX_ALT_TID_MASK + * might not be set in pivot tuples. BTreeTupleIsPivot() won't work + * reliably as a result. The number of columns stored is implicitly the + * same as the number of columns in the index, just like any non-pivot + * tuple. (The number of columns stored should not vary, since suffix + * truncation of key columns is unsafe within any !heapkeyspace index.) + * + * The 12 least significant bits from t_tid's offset number are used to + * represent the number of key columns within a pivot tuple. This leaves 4 + * status bits (BT_STATUS_OFFSET_MASK bits), which are shared by all tuples + * that have the INDEX_ALT_TID_MASK bit set (set in t_info) to store basic + * tuple metadata. BTreeTupleIsPivot() and BTreeTupleIsPosting() use the + * BT_STATUS_OFFSET_MASK bits. + * + * Sometimes non-pivot tuples also use a representation that repurposes + * t_tid to store metadata rather than a TID. PostgreSQL v13 introduced a + * new non-pivot tuple format to support deduplication: posting list + * tuples. Deduplication merges together multiple equal non-pivot tuples + * into a logically equivalent, space efficient representation. A posting + * list is an array of ItemPointerData elements. Non-pivot tuples are + * merged together to form posting list tuples lazily, at the point where + * we'd otherwise have to split a leaf page. + * + * Posting tuple format (alternative non-pivot tuple representation): + * + * t_tid | t_info | key values | posting list (TID array) + * + * Posting list tuples are recognized as such by having the + * INDEX_ALT_TID_MASK status bit set in t_info and the BT_IS_POSTING status + * bit set in t_tid's offset number. These flags redefine the content of + * the posting tuple's t_tid to store the location of the posting list + * (instead of a block number), as well as the total number of heap TIDs + * present in the tuple (instead of a real offset number). + * + * The 12 least significant bits from t_tid's offset number are used to + * represent the number of heap TIDs present in the tuple, leaving 4 status + * bits (the BT_STATUS_OFFSET_MASK bits). Like any non-pivot tuple, the + * number of columns stored is always implicitly the total number in the + * index (in practice there can never be non-key columns stored, since + * deduplication is not supported with INCLUDE indexes). 
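 *
 * To make the layouts above concrete, a hypothetical fragment (using the
 * inline helpers defined just below) that visits every heap TID carried by a
 * leaf tuple could look like:
 *
 *     if (BTreeTupleIsPosting(itup))
 *     {
 *         int nhtids = BTreeTupleGetNPosting(itup);
 *
 *         for (int i = 0; i < nhtids; i++)
 *             visit_tid(BTreeTupleGetPostingN(itup, i));  // hypothetical callback
 *     }
 *     else
 *         visit_tid(&itup->t_tid);   // plain non-pivot tuple: t_tid is the heap TID
 *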
+ */ +#define INDEX_ALT_TID_MASK INDEX_AM_RESERVED_BIT + +/* Item pointer offset bit masks */ +#define BT_OFFSET_MASK 0x0FFF +#define BT_STATUS_OFFSET_MASK 0xF000 +/* BT_STATUS_OFFSET_MASK status bits */ +#define BT_PIVOT_HEAP_TID_ATTR 0x1000 +#define BT_IS_POSTING 0x2000 + +/* + * Mask allocated for number of keys in index tuple must be able to fit + * maximum possible number of index attributes + */ +StaticAssertDecl(BT_OFFSET_MASK >= INDEX_MAX_KEYS, + "BT_OFFSET_MASK can't fit INDEX_MAX_KEYS"); + +/* + * Note: BTreeTupleIsPivot() can have false negatives (but not false + * positives) when used with !heapkeyspace indexes + */ +static inline bool +BTreeTupleIsPivot(IndexTuple itup) +{ + if ((itup->t_info & INDEX_ALT_TID_MASK) == 0) + return false; + /* absence of BT_IS_POSTING in offset number indicates pivot tuple */ + if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) != 0) + return false; + + return true; +} + +static inline bool +BTreeTupleIsPosting(IndexTuple itup) +{ + if ((itup->t_info & INDEX_ALT_TID_MASK) == 0) + return false; + /* presence of BT_IS_POSTING in offset number indicates posting tuple */ + if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) == 0) + return false; + + return true; +} + +static inline void +BTreeTupleSetPosting(IndexTuple itup, uint16 nhtids, int postingoffset) +{ + Assert(nhtids > 1); + Assert((nhtids & BT_STATUS_OFFSET_MASK) == 0); + Assert((size_t) postingoffset == MAXALIGN(postingoffset)); + Assert(postingoffset < INDEX_SIZE_MASK); + Assert(!BTreeTupleIsPivot(itup)); + + itup->t_info |= INDEX_ALT_TID_MASK; + ItemPointerSetOffsetNumber(&itup->t_tid, (nhtids | BT_IS_POSTING)); + ItemPointerSetBlockNumber(&itup->t_tid, postingoffset); +} + +static inline uint16 +BTreeTupleGetNPosting(IndexTuple posting) +{ + OffsetNumber existing; + + Assert(BTreeTupleIsPosting(posting)); + + existing = ItemPointerGetOffsetNumberNoCheck(&posting->t_tid); + return (existing & BT_OFFSET_MASK); +} + +static inline uint32 +BTreeTupleGetPostingOffset(IndexTuple posting) +{ + Assert(BTreeTupleIsPosting(posting)); + + return ItemPointerGetBlockNumberNoCheck(&posting->t_tid); +} + +static inline ItemPointer +BTreeTupleGetPosting(IndexTuple posting) +{ + return (ItemPointer) ((char *) posting + + BTreeTupleGetPostingOffset(posting)); +} + +static inline ItemPointer +BTreeTupleGetPostingN(IndexTuple posting, int n) +{ + return BTreeTupleGetPosting(posting) + n; +} + +/* + * Get/set downlink block number in pivot tuple. + * + * Note: Cannot assert that tuple is a pivot tuple. If we did so then + * !heapkeyspace indexes would exhibit false positive assertion failures. + */ +static inline BlockNumber +BTreeTupleGetDownLink(IndexTuple pivot) +{ + return ItemPointerGetBlockNumberNoCheck(&pivot->t_tid); +} + +static inline void +BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno) +{ + ItemPointerSetBlockNumber(&pivot->t_tid, blkno); +} + +/* + * Get number of attributes within tuple. + * + * Note that this does not include an implicit tiebreaker heap TID + * attribute, if any. Note also that the number of key attributes must be + * explicitly represented in all heapkeyspace pivot tuples. + * + * Note: This is defined as a macro rather than an inline function to + * avoid including rel.h. + */ +#define BTreeTupleGetNAtts(itup, rel) \ + ( \ + (BTreeTupleIsPivot(itup)) ? 
\ + ( \ + ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_OFFSET_MASK \ + ) \ + : \ + IndexRelationGetNumberOfAttributes(rel) \ + ) + +/* + * Set number of key attributes in tuple. + * + * The heap TID tiebreaker attribute bit may also be set here, indicating that + * a heap TID value will be stored at the end of the tuple (i.e. using the + * special pivot tuple representation). + */ +static inline void +BTreeTupleSetNAtts(IndexTuple itup, uint16 nkeyatts, bool heaptid) +{ + Assert(nkeyatts <= INDEX_MAX_KEYS); + Assert((nkeyatts & BT_STATUS_OFFSET_MASK) == 0); + Assert(!heaptid || nkeyatts > 0); + Assert(!BTreeTupleIsPivot(itup) || nkeyatts == 0); + + itup->t_info |= INDEX_ALT_TID_MASK; + + if (heaptid) + nkeyatts |= BT_PIVOT_HEAP_TID_ATTR; + + /* BT_IS_POSTING bit is deliberately unset here */ + ItemPointerSetOffsetNumber(&itup->t_tid, nkeyatts); + Assert(BTreeTupleIsPivot(itup)); +} + +/* + * Get/set leaf page's "top parent" link from its high key. Used during page + * deletion. + * + * Note: Cannot assert that tuple is a pivot tuple. If we did so then + * !heapkeyspace indexes would exhibit false positive assertion failures. + */ +static inline BlockNumber +BTreeTupleGetTopParent(IndexTuple leafhikey) +{ + return ItemPointerGetBlockNumberNoCheck(&leafhikey->t_tid); +} + +static inline void +BTreeTupleSetTopParent(IndexTuple leafhikey, BlockNumber blkno) +{ + ItemPointerSetBlockNumber(&leafhikey->t_tid, blkno); + BTreeTupleSetNAtts(leafhikey, 0, false); +} + +/* + * Get tiebreaker heap TID attribute, if any. + * + * This returns the first/lowest heap TID in the case of a posting list tuple. + */ +static inline ItemPointer +BTreeTupleGetHeapTID(IndexTuple itup) +{ + if (BTreeTupleIsPivot(itup)) + { + /* Pivot tuple heap TID representation? */ + if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & + BT_PIVOT_HEAP_TID_ATTR) != 0) + return (ItemPointer) ((char *) itup + IndexTupleSize(itup) - + sizeof(ItemPointerData)); + + /* Heap TID attribute was truncated */ + return NULL; + } + else if (BTreeTupleIsPosting(itup)) + return BTreeTupleGetPosting(itup); + + return &itup->t_tid; +} + +/* + * Get maximum heap TID attribute, which could be the only TID in the case of + * a non-pivot tuple that does not have a posting list tuple. + * + * Works with non-pivot tuples only. + */ +static inline ItemPointer +BTreeTupleGetMaxHeapTID(IndexTuple itup) +{ + Assert(!BTreeTupleIsPivot(itup)); + + if (BTreeTupleIsPosting(itup)) + { + uint16 nposting = BTreeTupleGetNPosting(itup); + + return BTreeTupleGetPostingN(itup, nposting - 1); + } + + return &itup->t_tid; +} + +/* + * Operator strategy numbers for B-tree have been moved to access/stratnum.h, + * because many places need to use them in ScanKeyInit() calls. + * + * The strategy numbers are chosen so that we can commute them by + * subtraction, thus: + */ +#define BTCommuteStrategyNumber(strat) (BTMaxStrategyNumber + 1 - (strat)) + +/* + * When a new operator class is declared, we require that the user + * supply us with an amproc procedure (BTORDER_PROC) for determining + * whether, for two keys a and b, a < b, a = b, or a > b. This routine + * must return < 0, 0, > 0, respectively, in these three cases. + * + * To facilitate accelerated sorting, an operator class may choose to + * offer a second procedure (BTSORTSUPPORT_PROC). For full details, see + * src/include/utils/sortsupport.h. 
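 *
 * For illustration only, a minimal BTORDER_PROC-style comparison function for
 * a hypothetical fixed-width type might look like the fragment below (the
 * built-in comparison functions live under src/backend/utils/adt/):
 *
 *     PG_FUNCTION_INFO_V1(my_type_cmp);   // hypothetical extension function
 *
 *     Datum
 *     my_type_cmp(PG_FUNCTION_ARGS)
 *     {
 *         int32   a = PG_GETARG_INT32(0);
 *         int32   b = PG_GETARG_INT32(1);
 *
 *         PG_RETURN_INT32((a < b) ? -1 : ((a > b) ? 1 : 0));
 *     }
 *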
+ * + * To support window frames defined by "RANGE offset PRECEDING/FOLLOWING", + * an operator class may choose to offer a third amproc procedure + * (BTINRANGE_PROC), independently of whether it offers sortsupport. + * For full details, see doc/src/sgml/btree.sgml. + * + * To facilitate B-Tree deduplication, an operator class may choose to + * offer a forth amproc procedure (BTEQUALIMAGE_PROC). For full details, + * see doc/src/sgml/btree.sgml. + */ + +#define BTORDER_PROC 1 +#define BTSORTSUPPORT_PROC 2 +#define BTINRANGE_PROC 3 +#define BTEQUALIMAGE_PROC 4 +#define BTOPTIONS_PROC 5 +#define BTNProcs 5 + +/* + * We need to be able to tell the difference between read and write + * requests for pages, in order to do locking correctly. + */ + +#define BT_READ BUFFER_LOCK_SHARE +#define BT_WRITE BUFFER_LOCK_EXCLUSIVE + +/* + * BTStackData -- As we descend a tree, we push the location of pivot + * tuples whose downlink we are about to follow onto a private stack. If + * we split a leaf, we use this stack to walk back up the tree and insert + * data into its parent page at the correct location. We also have to + * recursively insert into the grandparent page if and when the parent page + * splits. Our private stack can become stale due to concurrent page + * splits and page deletions, but it should never give us an irredeemably + * bad picture. + */ +typedef struct BTStackData +{ + BlockNumber bts_blkno; + OffsetNumber bts_offset; + struct BTStackData *bts_parent; +} BTStackData; + +typedef BTStackData *BTStack; + +/* + * BTScanInsertData is the btree-private state needed to find an initial + * position for an indexscan, or to insert new tuples -- an "insertion + * scankey" (not to be confused with a search scankey). It's used to descend + * a B-Tree using _bt_search. + * + * heapkeyspace indicates if we expect all keys in the index to be physically + * unique because heap TID is used as a tiebreaker attribute, and if index may + * have truncated key attributes in pivot tuples. This is actually a property + * of the index relation itself (not an indexscan). heapkeyspace indexes are + * indexes whose version is >= version 4. It's convenient to keep this close + * by, rather than accessing the metapage repeatedly. + * + * allequalimage is set to indicate that deduplication is safe for the index. + * This is also a property of the index relation rather than an indexscan. + * + * anynullkeys indicates if any of the keys had NULL value when scankey was + * built from index tuple (note that already-truncated tuple key attributes + * set NULL as a placeholder key value, which also affects value of + * anynullkeys). This is a convenience for unique index non-pivot tuple + * insertion, which usually temporarily unsets scantid, but shouldn't iff + * anynullkeys is true. Value generally matches non-pivot tuple's HasNulls + * bit, but may not when inserting into an INCLUDE index (tuple header value + * is affected by the NULL-ness of both key and non-key attributes). + * + * When nextkey is false (the usual case), _bt_search and _bt_binsrch will + * locate the first item >= scankey. When nextkey is true, they will locate + * the first item > scan key. + * + * pivotsearch is set to true by callers that want to re-find a leaf page + * using a scankey built from a leaf page's high key. Most callers set this + * to false. + * + * scantid is the heap TID that is used as a final tiebreaker attribute. It + * is set to NULL when index scan doesn't need to find a position for a + * specific physical tuple. 
Must be set when inserting new tuples into + * heapkeyspace indexes, since every tuple in the tree unambiguously belongs + * in one exact position (it's never set with !heapkeyspace indexes, though). + * Despite the representational difference, nbtree search code considers + * scantid to be just another insertion scankey attribute. + * + * scankeys is an array of scan key entries for attributes that are compared + * before scantid (user-visible attributes). keysz is the size of the array. + * During insertion, there must be a scan key for every attribute, but when + * starting a regular index scan some can be omitted. The array is used as a + * flexible array member, though it's sized in a way that makes it possible to + * use stack allocations. See nbtree/README for full details. + */ +typedef struct BTScanInsertData +{ + bool heapkeyspace; + bool allequalimage; + bool anynullkeys; + bool nextkey; + bool pivotsearch; + ItemPointer scantid; /* tiebreaker for scankeys */ + int keysz; /* Size of scankeys array */ + ScanKeyData scankeys[INDEX_MAX_KEYS]; /* Must appear last */ +} BTScanInsertData; + +typedef BTScanInsertData *BTScanInsert; + +/* + * BTInsertStateData is a working area used during insertion. + * + * This is filled in after descending the tree to the first leaf page the new + * tuple might belong on. Tracks the current position while performing + * uniqueness check, before we have determined which exact page to insert + * to. + * + * (This should be private to nbtinsert.c, but it's also used by + * _bt_binsrch_insert) + */ +typedef struct BTInsertStateData +{ + IndexTuple itup; /* Item we're inserting */ + Size itemsz; /* Size of itup -- should be MAXALIGN()'d */ + BTScanInsert itup_key; /* Insertion scankey */ + + /* Buffer containing leaf page we're likely to insert itup on */ + Buffer buf; + + /* + * Cache of bounds within the current buffer. Only used for insertions + * where _bt_check_unique is called. See _bt_binsrch_insert and + * _bt_findinsertloc for details. + */ + bool bounds_valid; + OffsetNumber low; + OffsetNumber stricthigh; + + /* + * if _bt_binsrch_insert found the location inside existing posting list, + * save the position inside the list. -1 sentinel value indicates overlap + * with an existing posting list tuple that has its LP_DEAD bit set. + */ + int postingoff; +} BTInsertStateData; + +typedef BTInsertStateData *BTInsertState; + +/* + * State used to representing an individual pending tuple during + * deduplication. + */ +typedef struct BTDedupInterval +{ + OffsetNumber baseoff; + uint16 nitems; +} BTDedupInterval; + +/* + * BTDedupStateData is a working area used during deduplication. + * + * The status info fields track the state of a whole-page deduplication pass. + * State about the current pending posting list is also tracked. + * + * A pending posting list is comprised of a contiguous group of equal items + * from the page, starting from page offset number 'baseoff'. This is the + * offset number of the "base" tuple for new posting list. 'nitems' is the + * current total number of existing items from the page that will be merged to + * make a new posting list tuple, including the base tuple item. (Existing + * items may themselves be posting list tuples, or regular non-pivot tuples.) + * + * The total size of the existing tuples to be freed when pending posting list + * is processed gets tracked by 'phystupsize'. 
This information allows + * deduplication to calculate the space saving for each new posting list + * tuple, and for the entire pass over the page as a whole. + */ +typedef struct BTDedupStateData +{ + /* Deduplication status info for entire pass over page */ + bool deduplicate; /* Still deduplicating page? */ + int nmaxitems; /* Number of max-sized tuples so far */ + Size maxpostingsize; /* Limit on size of final tuple */ + + /* Metadata about base tuple of current pending posting list */ + IndexTuple base; /* Use to form new posting list */ + OffsetNumber baseoff; /* page offset of base */ + Size basetupsize; /* base size without original posting list */ + + /* Other metadata about pending posting list */ + ItemPointer htids; /* Heap TIDs in pending posting list */ + int nhtids; /* Number of heap TIDs in htids array */ + int nitems; /* Number of existing tuples/line pointers */ + Size phystupsize; /* Includes line pointer overhead */ + + /* + * Array of tuples to go on new version of the page. Contains one entry + * for each group of consecutive items. Note that existing tuples that + * will not become posting list tuples do not appear in the array (they + * are implicitly unchanged by deduplication pass). + */ + int nintervals; /* current number of intervals in array */ + BTDedupInterval intervals[MaxIndexTuplesPerPage]; +} BTDedupStateData; + +typedef BTDedupStateData *BTDedupState; + +/* + * BTVacuumPostingData is state that represents how to VACUUM (or delete) a + * posting list tuple when some (though not all) of its TIDs are to be + * deleted. + * + * Convention is that itup field is the original posting list tuple on input, + * and palloc()'d final tuple used to overwrite existing tuple on output. + */ +typedef struct BTVacuumPostingData +{ + /* Tuple that will be/was updated */ + IndexTuple itup; + OffsetNumber updatedoffset; + + /* State needed to describe final itup in WAL */ + uint16 ndeletedtids; + uint16 deletetids[FLEXIBLE_ARRAY_MEMBER]; +} BTVacuumPostingData; + +typedef BTVacuumPostingData *BTVacuumPosting; + +/* + * BTScanOpaqueData is the btree-private state needed for an indexscan. + * This consists of preprocessed scan keys (see _bt_preprocess_keys() for + * details of the preprocessing), information about the current location + * of the scan, and information about the marked location, if any. (We use + * BTScanPosData to represent the data needed for each of current and marked + * locations.) In addition we can remember some known-killed index entries + * that must be marked before we can move off the current page. + * + * Index scans work a page at a time: we pin and read-lock the page, identify + * all the matching items on the page and save them in BTScanPosData, then + * release the read-lock while returning the items to the caller for + * processing. This approach minimizes lock/unlock traffic. Note that we + * keep the pin on the index page until the caller is done with all the items + * (this is needed for VACUUM synchronization, see nbtree/README). When we + * are ready to step to the next page, if the caller has told us any of the + * items were killed, we re-lock the page to mark them killed, then unlock. + * Finally we drop the pin and step to the next page in the appropriate + * direction. + * + * If we are doing an index-only scan, we save the entire IndexTuple for each + * matched item, otherwise only its heap TID and offset. The IndexTuples go + * into a separate workspace array; each BTScanPosItem stores its tuple's + * offset within that array. 
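 *
 * Roughly, returning one matched item works like the sketch below (not the
 * actual btgettuple code; it assumes the BTScanOpaque and BTScanPosItem
 * fields declared further down):
 *
 *     BTScanOpaque   so = (BTScanOpaque) scan->opaque;
 *     BTScanPosItem *item = &so->currPos.items[so->currPos.itemIndex];
 *
 *     scan->xs_heaptid = item->heapTid;
 *     if (scan->xs_want_itup)
 *         scan->xs_itup = (IndexTuple) (so->currTuples + item->tupleOffset);
 *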
Posting list tuples store a "base" tuple once, + * allowing the same key to be returned for each TID in the posting list + * tuple. + */ + +typedef struct BTScanPosItem /* what we remember about each match */ +{ + ItemPointerData heapTid; /* TID of referenced heap item */ + OffsetNumber indexOffset; /* index item's location within page */ + LocationIndex tupleOffset; /* IndexTuple's offset in workspace, if any */ +} BTScanPosItem; + +typedef struct BTScanPosData +{ + Buffer buf; /* if valid, the buffer is pinned */ + + XLogRecPtr lsn; /* pos in the WAL stream when page was read */ + BlockNumber currPage; /* page referenced by items array */ + BlockNumber nextPage; /* page's right link when we scanned it */ + + /* + * moreLeft and moreRight track whether we think there may be matching + * index entries to the left and right of the current page, respectively. + * We can clear the appropriate one of these flags when _bt_checkkeys() + * returns continuescan = false. + */ + bool moreLeft; + bool moreRight; + + /* + * If we are doing an index-only scan, nextTupleOffset is the first free + * location in the associated tuple storage workspace. + */ + int nextTupleOffset; + + /* + * The items array is always ordered in index order (ie, increasing + * indexoffset). When scanning backwards it is convenient to fill the + * array back-to-front, so we start at the last slot and fill downwards. + * Hence we need both a first-valid-entry and a last-valid-entry counter. + * itemIndex is a cursor showing which entry was last returned to caller. + */ + int firstItem; /* first valid index in items[] */ + int lastItem; /* last valid index in items[] */ + int itemIndex; /* current index in items[] */ + + BTScanPosItem items[MaxTIDsPerBTreePage]; /* MUST BE LAST */ +} BTScanPosData; + +typedef BTScanPosData *BTScanPos; + +#define BTScanPosIsPinned(scanpos) \ +( \ + AssertMacro(BlockNumberIsValid((scanpos).currPage) || \ + !BufferIsValid((scanpos).buf)), \ + BufferIsValid((scanpos).buf) \ +) +#define BTScanPosUnpin(scanpos) \ + do { \ + ReleaseBuffer((scanpos).buf); \ + (scanpos).buf = InvalidBuffer; \ + } while (0) +#define BTScanPosUnpinIfPinned(scanpos) \ + do { \ + if (BTScanPosIsPinned(scanpos)) \ + BTScanPosUnpin(scanpos); \ + } while (0) + +#define BTScanPosIsValid(scanpos) \ +( \ + AssertMacro(BlockNumberIsValid((scanpos).currPage) || \ + !BufferIsValid((scanpos).buf)), \ + BlockNumberIsValid((scanpos).currPage) \ +) +#define BTScanPosInvalidate(scanpos) \ + do { \ + (scanpos).currPage = InvalidBlockNumber; \ + (scanpos).nextPage = InvalidBlockNumber; \ + (scanpos).buf = InvalidBuffer; \ + (scanpos).lsn = InvalidXLogRecPtr; \ + (scanpos).nextTupleOffset = 0; \ + } while (0) + +/* We need one of these for each equality-type SK_SEARCHARRAY scan key */ +typedef struct BTArrayKeyInfo +{ + int scan_key; /* index of associated key in arrayKeyData */ + int cur_elem; /* index of current element in elem_values */ + int mark_elem; /* index of marked element in elem_values */ + int num_elems; /* number of elems in current array value */ + Datum *elem_values; /* array of num_elems Datums */ +} BTArrayKeyInfo; + +typedef struct BTScanOpaqueData +{ + /* all fields (except arraysStarted) are set by _bt_preprocess_keys(): */ + bool qual_ok; /* false if qual can never be satisfied */ + bool arraysStarted; /* Started array keys, but have yet to "reach + * past the end" of all arrays? 
*/ + int numberOfKeys; /* number of preprocessed scan keys */ + ScanKey keyData; /* array of preprocessed scan keys */ + + /* workspace for SK_SEARCHARRAY support */ + ScanKey arrayKeyData; /* modified copy of scan->keyData */ + int numArrayKeys; /* number of equality-type array keys (-1 if + * there are any unsatisfiable array keys) */ + int arrayKeyCount; /* count indicating number of array scan keys + * processed */ + BTArrayKeyInfo *arrayKeys; /* info about each equality-type array key */ + MemoryContext arrayContext; /* scan-lifespan context for array data */ + + /* info about killed items if any (killedItems is NULL if never used) */ + int *killedItems; /* currPos.items indexes of killed items */ + int numKilled; /* number of currently stored items */ + + /* + * If we are doing an index-only scan, these are the tuple storage + * workspaces for the currPos and markPos respectively. Each is of size + * BLCKSZ, so it can hold as much as a full page's worth of tuples. + */ + char *currTuples; /* tuple storage for currPos */ + char *markTuples; /* tuple storage for markPos */ + + /* + * If the marked position is on the same page as current position, we + * don't use markPos, but just keep the marked itemIndex in markItemIndex + * (all the rest of currPos is valid for the mark position). Hence, to + * determine if there is a mark, first look at markItemIndex, then at + * markPos. + */ + int markItemIndex; /* itemIndex, or -1 if not valid */ + + /* keep these last in struct for efficiency */ + BTScanPosData currPos; /* current position data */ + BTScanPosData markPos; /* marked position, if any */ +} BTScanOpaqueData; + +typedef BTScanOpaqueData *BTScanOpaque; + +/* + * We use some private sk_flags bits in preprocessed scan keys. We're allowed + * to use bits 16-31 (see skey.h). The uppermost bits are copied from the + * index's indoption[] array entry for the index attribute. + */ +#define SK_BT_REQFWD 0x00010000 /* required to continue forward scan */ +#define SK_BT_REQBKWD 0x00020000 /* required to continue backward scan */ +#define SK_BT_INDOPTION_SHIFT 24 /* must clear the above bits */ +#define SK_BT_DESC (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT) +#define SK_BT_NULLS_FIRST (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT) + +typedef struct BTOptions +{ + int32 varlena_header_; /* varlena header (do not touch directly!) */ + int fillfactor; /* page fill factor in percent (0..100) */ + float8 vacuum_cleanup_index_scale_factor; /* deprecated */ + bool deduplicate_items; /* Try to deduplicate items? */ +} BTOptions; + +#define BTGetFillFactor(relation) \ + (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \ + relation->rd_rel->relam == BTREE_AM_OID), \ + (relation)->rd_options ? \ + ((BTOptions *) (relation)->rd_options)->fillfactor : \ + BTREE_DEFAULT_FILLFACTOR) +#define BTGetTargetPageFreeSpace(relation) \ + (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100) +#define BTGetDeduplicateItems(relation) \ + (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \ + relation->rd_rel->relam == BTREE_AM_OID), \ + ((relation)->rd_options ? \ + ((BTOptions *) (relation)->rd_options)->deduplicate_items : true)) + +/* + * Constant definition for progress reporting. Phase numbers must match + * btbuildphasename. 
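 *
 * For example, the build code advances the reported subphase with calls of
 * roughly this shape (a sketch; pgstat_progress_update_param() and
 * PROGRESS_CREATEIDX_SUBPHASE are declared in other headers):
 *
 *     pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
 *                                  PROGRESS_BTREE_PHASE_LEAF_LOAD);
 *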
+ */ +/* PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE is 1 (see progress.h) */ +#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN 2 +#define PROGRESS_BTREE_PHASE_PERFORMSORT_1 3 +#define PROGRESS_BTREE_PHASE_PERFORMSORT_2 4 +#define PROGRESS_BTREE_PHASE_LEAF_LOAD 5 + +/* + * external entry points for btree, in nbtree.c + */ +extern void btbuildempty(Relation index); +extern bool btinsert(Relation rel, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + struct IndexInfo *indexInfo); +extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys); +extern Size btestimateparallelscan(void); +extern void btinitparallelscan(void *target); +extern bool btgettuple(IndexScanDesc scan, ScanDirection dir); +extern int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm); +extern void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, + ScanKey orderbys, int norderbys); +extern void btparallelrescan(IndexScanDesc scan); +extern void btendscan(IndexScanDesc scan); +extern void btmarkpos(IndexScanDesc scan); +extern void btrestrpos(IndexScanDesc scan); +extern IndexBulkDeleteResult *btbulkdelete(IndexVacuumInfo *info, + IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, + void *callback_state); +extern IndexBulkDeleteResult *btvacuumcleanup(IndexVacuumInfo *info, + IndexBulkDeleteResult *stats); +extern bool btcanreturn(Relation index, int attno); + +/* + * prototypes for internal functions in nbtree.c + */ +extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno); +extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page); +extern void _bt_parallel_done(IndexScanDesc scan); +extern void _bt_parallel_advance_array_keys(IndexScanDesc scan); + +/* + * prototypes for functions in nbtdedup.c + */ +extern void _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, + Size newitemsz, bool bottomupdedup); +extern bool _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, + Size newitemsz); +extern void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, + OffsetNumber baseoff); +extern bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup); +extern Size _bt_dedup_finish_pending(Page newpage, BTDedupState state); +extern IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids, + int nhtids); +extern void _bt_update_posting(BTVacuumPosting vacposting); +extern IndexTuple _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, + int postingoff); + +/* + * prototypes for functions in nbtinsert.c + */ +extern bool _bt_doinsert(Relation rel, IndexTuple itup, + IndexUniqueCheck checkUnique, bool indexUnchanged, + Relation heapRel); +extern void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, + BTStack stack); +extern Buffer _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, + BlockNumber child); + +/* + * prototypes for functions in nbtsplitloc.c + */ +extern OffsetNumber _bt_findsplitloc(Relation rel, Page origpage, + OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, + bool *newitemonleft); + +/* + * prototypes for functions in nbtpage.c + */ +extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, + bool allequalimage); +extern bool _bt_vacuum_needs_cleanup(Relation rel); +extern void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages); +extern void _bt_upgrademetapage(Page page); +extern Buffer _bt_getroot(Relation rel, Relation heaprel, int access); +extern Buffer 
_bt_gettrueroot(Relation rel); +extern int _bt_getrootheight(Relation rel); +extern void _bt_metaversion(Relation rel, bool *heapkeyspace, + bool *allequalimage); +extern void _bt_checkpage(Relation rel, Buffer buf); +extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access); +extern Buffer _bt_allocbuf(Relation rel, Relation heaprel); +extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, + BlockNumber blkno, int access); +extern void _bt_relbuf(Relation rel, Buffer buf); +extern void _bt_lockbuf(Relation rel, Buffer buf, int access); +extern void _bt_unlockbuf(Relation rel, Buffer buf); +extern bool _bt_conditionallockbuf(Relation rel, Buffer buf); +extern void _bt_upgradelockbufcleanup(Relation rel, Buffer buf); +extern void _bt_pageinit(Page page, Size size); +extern void _bt_delitems_vacuum(Relation rel, Buffer buf, + OffsetNumber *deletable, int ndeletable, + BTVacuumPosting *updatable, int nupdatable); +extern void _bt_delitems_delete_check(Relation rel, Buffer buf, + Relation heapRel, + TM_IndexDeleteOp *delstate); +extern void _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate); +extern void _bt_pendingfsm_init(Relation rel, BTVacState *vstate, + bool cleanuponly); +extern void _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate); + +/* + * prototypes for functions in nbtsearch.c + */ +extern BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, + Buffer *bufP, int access, Snapshot snapshot); +extern Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, + Buffer buf, bool forupdate, BTStack stack, + int access, Snapshot snapshot); +extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate); +extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum); +extern bool _bt_first(IndexScanDesc scan, ScanDirection dir); +extern bool _bt_next(IndexScanDesc scan, ScanDirection dir); +extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, + Snapshot snapshot); + +/* + * prototypes for functions in nbtutils.c + */ +extern BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup); +extern void _bt_freestack(BTStack stack); +extern void _bt_preprocess_array_keys(IndexScanDesc scan); +extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir); +extern bool _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir); +extern void _bt_mark_array_keys(IndexScanDesc scan); +extern void _bt_restore_array_keys(IndexScanDesc scan); +extern void _bt_preprocess_keys(IndexScanDesc scan); +extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, + int tupnatts, ScanDirection dir, bool *continuescan); +extern void _bt_killitems(IndexScanDesc scan); +extern BTCycleId _bt_vacuum_cycleid(Relation rel); +extern BTCycleId _bt_start_vacuum(Relation rel); +extern void _bt_end_vacuum(Relation rel); +extern void _bt_end_vacuum_callback(int code, Datum arg); +extern Size BTreeShmemSize(void); +extern void BTreeShmemInit(void); +extern bytea *btoptions(Datum reloptions, bool validate); +extern bool btproperty(Oid index_oid, int attno, + IndexAMProperty prop, const char *propname, + bool *res, bool *isnull); +extern char *btbuildphasename(int64 phasenum); +extern IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft, + IndexTuple firstright, BTScanInsert itup_key); +extern int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, + IndexTuple firstright); +extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, + OffsetNumber offnum); +extern 
void _bt_check_third_page(Relation rel, Relation heap, + bool needheaptidspace, Page page, IndexTuple newtup); +extern bool _bt_allequalimage(Relation rel, bool debugmessage); + +/* + * prototypes for functions in nbtvalidate.c + */ +extern bool btvalidate(Oid opclassoid); +extern void btadjustmembers(Oid opfamilyoid, + Oid opclassoid, + List *operators, + List *functions); + +/* + * prototypes for functions in nbtsort.c + */ +extern IndexBuildResult *btbuild(Relation heap, Relation index, + struct IndexInfo *indexInfo); +extern void _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc); + +#endif /* NBTREE_H */ diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h new file mode 100644 index 0000000..7dd9fd0 --- /dev/null +++ b/src/include/access/nbtxlog.h @@ -0,0 +1,367 @@ +/*------------------------------------------------------------------------- + * + * nbtxlog.h + * header file for postgres btree xlog routines + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/nbtxlog.h + * + *------------------------------------------------------------------------- + */ +#ifndef NBTXLOG_H +#define NBTXLOG_H + +#include "access/transam.h" +#include "access/xlogreader.h" +#include "lib/stringinfo.h" +#include "storage/off.h" + +/* + * XLOG records for btree operations + * + * XLOG allows to store some information in high 4 bits of log + * record xl_info field + */ +#define XLOG_BTREE_INSERT_LEAF 0x00 /* add index tuple without split */ +#define XLOG_BTREE_INSERT_UPPER 0x10 /* same, on a non-leaf page */ +#define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */ +#define XLOG_BTREE_SPLIT_L 0x30 /* add index tuple with split */ +#define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */ +#define XLOG_BTREE_INSERT_POST 0x50 /* add index tuple with posting split */ +#define XLOG_BTREE_DEDUP 0x60 /* deduplicate tuples for a page */ +#define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuples for a page */ +#define XLOG_BTREE_UNLINK_PAGE 0x80 /* delete a half-dead page */ +#define XLOG_BTREE_UNLINK_PAGE_META 0x90 /* same, and update metapage */ +#define XLOG_BTREE_NEWROOT 0xA0 /* new root page */ +#define XLOG_BTREE_MARK_PAGE_HALFDEAD 0xB0 /* mark a leaf as half-dead */ +#define XLOG_BTREE_VACUUM 0xC0 /* delete entries on a page during + * vacuum */ +#define XLOG_BTREE_REUSE_PAGE 0xD0 /* old page is about to be reused from + * FSM */ +#define XLOG_BTREE_META_CLEANUP 0xE0 /* update cleanup-related data in the + * metapage */ + +/* + * All that we need to regenerate the meta-data page + */ +typedef struct xl_btree_metadata +{ + uint32 version; + BlockNumber root; + uint32 level; + BlockNumber fastroot; + uint32 fastlevel; + uint32 last_cleanup_num_delpages; + bool allequalimage; +} xl_btree_metadata; + +/* + * This is what we need to know about simple (without split) insert. + * + * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META, and + * INSERT_POST. Note that INSERT_META and INSERT_UPPER implies it's not a + * leaf page, while INSERT_POST and INSERT_LEAF imply that it must be a leaf + * page. + * + * Backup Blk 0: original page + * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META + * Backup Blk 2: xl_btree_metadata, if INSERT_META + * + * Note: The new tuple is actually the "original" new item in the posting + * list split insert case (i.e. the INSERT_POST case). 
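 *
 * As an illustrative fragment (not the actual REDO routine), the data
 * registered with backup block 0 can be consumed as sketched here, on the
 * assumption that an INSERT_POST record places a uint16 posting split offset
 * ahead of the new tuple, as described next:
 *
 *     xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record);
 *     OffsetNumber offnum = xlrec->offnum;   // where the tuple is inserted
 *     Size        datalen;
 *     char       *datapos = XLogRecGetBlockData(record, 0, &datalen);
 *     uint16      postingoff = 0;
 *
 *     if (is_insert_post)                    // hypothetical flag for INSERT_POST
 *     {
 *         memcpy(&postingoff, datapos, sizeof(uint16));
 *         datapos += sizeof(uint16);
 *         datalen -= sizeof(uint16);
 *     }
 *     // datapos now points at the new IndexTuple, datalen bytes long
 *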
A split offset for + * the posting list is logged before the original new item. Recovery needs + * both, since it must do an in-place update of the existing posting list + * that was split as an extra step. Also, recovery generates a "final" + * newitem. See _bt_swap_posting() for details on posting list splits. + */ +typedef struct xl_btree_insert +{ + OffsetNumber offnum; + + /* POSTING SPLIT OFFSET FOLLOWS (INSERT_POST case) */ + /* NEW TUPLE ALWAYS FOLLOWS AT THE END */ +} xl_btree_insert; + +#define SizeOfBtreeInsert (offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber)) + +/* + * On insert with split, we save all the items going into the right sibling + * so that we can restore it completely from the log record. This way takes + * less xlog space than the normal approach, because if we did it standardly, + * XLogInsert would almost always think the right page is new and store its + * whole page image. The left page, however, is handled in the normal + * incremental-update fashion. + * + * Note: XLOG_BTREE_SPLIT_L and XLOG_BTREE_SPLIT_R share this data record. + * There are two variants to indicate whether the inserted tuple went into the + * left or right split page (and thus, whether the new item is stored or not). + * We always log the left page high key because suffix truncation can generate + * a new leaf high key using user-defined code. This is also necessary on + * internal pages, since the firstright item that the left page's high key was + * based on will have been truncated to zero attributes in the right page (the + * separator key is unavailable from the right page). + * + * Backup Blk 0: original page / new left page + * + * The left page's data portion contains the new item, if it's the _L variant. + * _R variant split records generally do not have a newitem (_R variant leaf + * page split records that must deal with a posting list split will include an + * explicit newitem, though it is never used on the right page -- it is + * actually an orignewitem needed to update existing posting list). The new + * high key of the left/original page appears last of all (and must always be + * present). + * + * Page split records that need the REDO routine to deal with a posting list + * split directly will have an explicit newitem, which is actually an + * orignewitem (the newitem as it was before the posting list split, not + * after). A posting list split always has a newitem that comes immediately + * after the posting list being split (which would have overlapped with + * orignewitem prior to split). Usually REDO must deal with posting list + * splits with an _L variant page split record, and usually both the new + * posting list and the final newitem go on the left page (the existing + * posting list will be inserted instead of the old, and the final newitem + * will be inserted next to that). However, _R variant split records will + * include an orignewitem when the split point for the page happens to have a + * lastleft tuple that is also the posting list being split (leaving newitem + * as the page split's firstright tuple). The existence of this corner case + * does not change the basic fact about newitem/orignewitem for the REDO + * routine: it is always state used for the left page alone. 
(This is why the + * record's postingoff field isn't a reliable indicator of whether or not a + * posting list split occurred during the page split; a non-zero value merely + * indicates that the REDO routine must reconstruct a new posting list tuple + * that is needed for the left page.) + * + * This posting list split handling is equivalent to the xl_btree_insert REDO + * routine's INSERT_POST handling. While the details are more complicated + * here, the concept and goals are exactly the same. See _bt_swap_posting() + * for details on posting list splits. + * + * Backup Blk 1: new right page + * + * The right page's data portion contains the right page's tuples in the form + * used by _bt_restore_page. This includes the new item, if it's the _R + * variant. The right page's tuples also include the right page's high key + * with either variant (moved from the left/original page during the split), + * unless the split happened to be of the rightmost page on its level, where + * there is no high key for new right page. + * + * Backup Blk 2: next block (orig page's rightlink), if any + * Backup Blk 3: child's left sibling, if non-leaf split + */ +typedef struct xl_btree_split +{ + uint32 level; /* tree level of page being split */ + OffsetNumber firstrightoff; /* first origpage item on rightpage */ + OffsetNumber newitemoff; /* new item's offset */ + uint16 postingoff; /* offset inside orig posting tuple */ +} xl_btree_split; + +#define SizeOfBtreeSplit (offsetof(xl_btree_split, postingoff) + sizeof(uint16)) + +/* + * When page is deduplicated, consecutive groups of tuples with equal keys are + * merged together into posting list tuples. + * + * The WAL record represents a deduplication pass for a leaf page. An array + * of BTDedupInterval structs follows. + */ +typedef struct xl_btree_dedup +{ + uint16 nintervals; + + /* DEDUPLICATION INTERVALS FOLLOW */ +} xl_btree_dedup; + +#define SizeOfBtreeDedup (offsetof(xl_btree_dedup, nintervals) + sizeof(uint16)) + +/* + * This is what we need to know about page reuse within btree. This record + * only exists to generate a conflict point for Hot Standby. + * + * Note that we must include a RelFileLocator in the record because we don't + * actually register the buffer with the record. + */ +typedef struct xl_btree_reuse_page +{ + RelFileLocator locator; + BlockNumber block; + FullTransactionId snapshotConflictHorizon; + bool isCatalogRel; /* to handle recovery conflict during logical + * decoding on standby */ +} xl_btree_reuse_page; + +#define SizeOfBtreeReusePage (offsetof(xl_btree_reuse_page, isCatalogRel) + sizeof(bool)) + +/* + * xl_btree_vacuum and xl_btree_delete records describe deletion of index + * tuples on a leaf page. The former variant is used by VACUUM, while the + * latter variant is used by the ad-hoc deletions that sometimes take place + * when btinsert() is called. + * + * The records are very similar. The only difference is that xl_btree_delete + * have snapshotConflictHorizon/isCatalogRel fields for recovery conflicts. + * (VACUUM operations can just rely on earlier conflicts generated during + * pruning of the table whose TIDs the to-be-deleted index tuples point to. + * There are also small differences between each REDO routine that we don't go + * into here.) + * + * xl_btree_vacuum and xl_btree_delete both represent deletion of any number + * of index tuples on a single leaf page using page offset numbers. 
Both also + * support "updates" of index tuples, which is how deletes of a subset of TIDs + * contained in an existing posting list tuple are implemented. + * + * Updated posting list tuples are represented using xl_btree_update metadata. + * The REDO routines each use the xl_btree_update entries (plus each + * corresponding original index tuple from the target leaf page) to generate + * the final updated tuple. + * + * Updates are only used when there will be some remaining TIDs left by the + * REDO routine. Otherwise the posting list tuple just gets deleted outright. + */ +typedef struct xl_btree_vacuum +{ + uint16 ndeleted; + uint16 nupdated; + + /*---- + * In payload of blk 0 : + * - DELETED TARGET OFFSET NUMBERS + * - UPDATED TARGET OFFSET NUMBERS + * - UPDATED TUPLES METADATA (xl_btree_update) ITEMS + *---- + */ +} xl_btree_vacuum; + +#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, nupdated) + sizeof(uint16)) + +typedef struct xl_btree_delete +{ + TransactionId snapshotConflictHorizon; + uint16 ndeleted; + uint16 nupdated; + bool isCatalogRel; /* to handle recovery conflict during logical + * decoding on standby */ + + /*---- + * In payload of blk 0 : + * - DELETED TARGET OFFSET NUMBERS + * - UPDATED TARGET OFFSET NUMBERS + * - UPDATED TUPLES METADATA (xl_btree_update) ITEMS + *---- + */ +} xl_btree_delete; + +#define SizeOfBtreeDelete (offsetof(xl_btree_delete, isCatalogRel) + sizeof(bool)) + +/* + * The offsets that appear in xl_btree_update metadata are offsets into the + * original posting list from tuple, not page offset numbers. These are + * 0-based. The page offset number for the original posting list tuple comes + * from the main xl_btree_vacuum/xl_btree_delete record. + */ +typedef struct xl_btree_update +{ + uint16 ndeletedtids; + + /* POSTING LIST uint16 OFFSETS TO A DELETED TID FOLLOW */ +} xl_btree_update; + +#define SizeOfBtreeUpdate (offsetof(xl_btree_update, ndeletedtids) + sizeof(uint16)) + +/* + * This is what we need to know about marking an empty subtree for deletion. + * The target identifies the tuple removed from the parent page (note that we + * remove this tuple's downlink and the *following* tuple's key). Note that + * the leaf page is empty, so we don't need to store its content --- it is + * just reinitialized during recovery using the rest of the fields. + * + * Backup Blk 0: leaf block + * Backup Blk 1: top parent + */ +typedef struct xl_btree_mark_page_halfdead +{ + OffsetNumber poffset; /* deleted tuple id in parent page */ + + /* information needed to recreate the leaf page: */ + BlockNumber leafblk; /* leaf block ultimately being deleted */ + BlockNumber leftblk; /* leaf block's left sibling, if any */ + BlockNumber rightblk; /* leaf block's right sibling */ + BlockNumber topparent; /* topmost internal page in the subtree */ +} xl_btree_mark_page_halfdead; + +#define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber)) + +/* + * This is what we need to know about deletion of a btree page. Note that we + * only leave behind a small amount of bookkeeping information in deleted + * pages (deleted pages must be kept around as tombstones for a while). It is + * convenient for the REDO routine to regenerate its target page from scratch. + * This is why WAL record describes certain details that are actually directly + * available from the target page. 
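To make the block-0 payload layout sketched for xl_btree_vacuum above concrete: the fixed-size struct travels as the record's main data, while the deleted offsets, updated offsets, and xl_btree_update entries ride in block 0's data. A minimal reader sketch follows; the function name is illustrative, and real recovery code lives in nbtxlog.c.

#include "postgres.h"
#include "access/nbtxlog.h"
#include "access/xlogreader.h"

static void
example_read_btree_vacuum(XLogReaderState *record)
{
	xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record);
	Size		datalen;
	char	   *ptr;
	OffsetNumber *deletedoffsets;
	OffsetNumber *updatedoffsets;
	xl_btree_update *updates;

	ptr = XLogRecGetBlockData(record, 0, &datalen);

	/* deleted offsets come first, then updated offsets */
	deletedoffsets = (OffsetNumber *) ptr;
	updatedoffsets = deletedoffsets + xlrec->ndeleted;

	/* xl_btree_update entries follow; each one is variable-length */
	updates = (xl_btree_update *) (updatedoffsets + xlrec->nupdated);

	(void) deletedoffsets;
	(void) updates;
}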
+ * + * Backup Blk 0: target block being deleted + * Backup Blk 1: target block's left sibling, if any + * Backup Blk 2: target block's right sibling + * Backup Blk 3: leaf block (if different from target) + * Backup Blk 4: metapage (if rightsib becomes new fast root) + */ +typedef struct xl_btree_unlink_page +{ + BlockNumber leftsib; /* target block's left sibling, if any */ + BlockNumber rightsib; /* target block's right sibling */ + uint32 level; /* target block's level */ + FullTransactionId safexid; /* target block's BTPageSetDeleted() XID */ + + /* + * Information needed to recreate a half-dead leaf page with correct + * topparent link. The fields are only used when deletion operation's + * target page is an internal page. REDO routine creates half-dead page + * from scratch to keep things simple (this is the same convenient + * approach used for the target page itself). + */ + BlockNumber leafleftsib; + BlockNumber leafrightsib; + BlockNumber leaftopparent; /* next child down in the subtree */ + + /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */ +} xl_btree_unlink_page; + +#define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, leaftopparent) + sizeof(BlockNumber)) + +/* + * New root log record. There are zero tuples if this is to establish an + * empty root, or two if it is the result of splitting an old root. + * + * Note that although this implies rewriting the metadata page, we don't need + * an xl_btree_metadata record --- the rootblk and level are sufficient. + * + * Backup Blk 0: new root page (2 tuples as payload, if splitting old root) + * Backup Blk 1: left child (if splitting an old root) + * Backup Blk 2: metapage + */ +typedef struct xl_btree_newroot +{ + BlockNumber rootblk; /* location of new root (redundant with blk 0) */ + uint32 level; /* its tree level */ +} xl_btree_newroot; + +#define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, level) + sizeof(uint32)) + + +/* + * prototypes for functions in nbtxlog.c + */ +extern void btree_redo(XLogReaderState *record); +extern void btree_xlog_startup(void); +extern void btree_xlog_cleanup(void); +extern void btree_mask(char *pagedata, BlockNumber blkno); + +/* + * prototypes for functions in nbtdesc.c + */ +extern void btree_desc(StringInfo buf, XLogReaderState *record); +extern const char *btree_identify(uint8 info); + +#endif /* NBTXLOG_H */ diff --git a/src/include/access/parallel.h b/src/include/access/parallel.h new file mode 100644 index 0000000..061f8a4 --- /dev/null +++ b/src/include/access/parallel.h @@ -0,0 +1,82 @@ +/*------------------------------------------------------------------------- + * + * parallel.h + * Infrastructure for launching parallel workers + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/parallel.h + * + *------------------------------------------------------------------------- + */ + +#ifndef PARALLEL_H +#define PARALLEL_H + +#include "access/xlogdefs.h" +#include "lib/ilist.h" +#include "postmaster/bgworker.h" +#include "storage/shm_mq.h" +#include "storage/shm_toc.h" + +typedef void (*parallel_worker_main_type) (dsm_segment *seg, shm_toc *toc); + +typedef struct ParallelWorkerInfo +{ + BackgroundWorkerHandle *bgwhandle; + shm_mq_handle *error_mqh; + int32 pid; +} ParallelWorkerInfo; + +typedef struct ParallelContext +{ + dlist_node node; + SubTransactionId subid; + int nworkers; /* Maximum number of workers to launch */ + int 
nworkers_to_launch; /* Actual number of workers to launch */ + int nworkers_launched; + char *library_name; + char *function_name; + ErrorContextCallback *error_context_stack; + shm_toc_estimator estimator; + dsm_segment *seg; + void *private_memory; + shm_toc *toc; + ParallelWorkerInfo *worker; + int nknown_attached_workers; + bool *known_attached_workers; +} ParallelContext; + +typedef struct ParallelWorkerContext +{ + dsm_segment *seg; + shm_toc *toc; +} ParallelWorkerContext; + +extern PGDLLIMPORT volatile sig_atomic_t ParallelMessagePending; +extern PGDLLIMPORT int ParallelWorkerNumber; +extern PGDLLIMPORT bool InitializingParallelWorker; + +#define IsParallelWorker() (ParallelWorkerNumber >= 0) + +extern ParallelContext *CreateParallelContext(const char *library_name, + const char *function_name, int nworkers); +extern void InitializeParallelDSM(ParallelContext *pcxt); +extern void ReinitializeParallelDSM(ParallelContext *pcxt); +extern void ReinitializeParallelWorkers(ParallelContext *pcxt, int nworkers_to_launch); +extern void LaunchParallelWorkers(ParallelContext *pcxt); +extern void WaitForParallelWorkersToAttach(ParallelContext *pcxt); +extern void WaitForParallelWorkersToFinish(ParallelContext *pcxt); +extern void DestroyParallelContext(ParallelContext *pcxt); +extern bool ParallelContextActive(void); + +extern void HandleParallelMessageInterrupt(void); +extern void HandleParallelMessages(void); +extern void AtEOXact_Parallel(bool isCommit); +extern void AtEOSubXact_Parallel(bool isCommit, SubTransactionId mySubId); +extern void ParallelWorkerReportLastRecEnd(XLogRecPtr last_xlog_end); + +extern void ParallelWorkerMain(Datum main_arg); + +#endif /* PARALLEL_H */ diff --git a/src/include/access/printsimple.h b/src/include/access/printsimple.h new file mode 100644 index 0000000..9883e72 --- /dev/null +++ b/src/include/access/printsimple.h @@ -0,0 +1,23 @@ +/*------------------------------------------------------------------------- + * + * printsimple.h + * print simple tuples without catalog access + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/printsimple.h + * + *------------------------------------------------------------------------- + */ + +#ifndef PRINTSIMPLE_H +#define PRINTSIMPLE_H + +#include "tcop/dest.h" + +extern bool printsimple(TupleTableSlot *slot, DestReceiver *self); +extern void printsimple_startup(DestReceiver *self, int operation, + TupleDesc tupdesc); + +#endif /* PRINTSIMPLE_H */ diff --git a/src/include/access/printtup.h b/src/include/access/printtup.h new file mode 100644 index 0000000..747ecb8 --- /dev/null +++ b/src/include/access/printtup.h @@ -0,0 +1,35 @@ +/*------------------------------------------------------------------------- + * + * printtup.h + * + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/printtup.h + * + *------------------------------------------------------------------------- + */ +#ifndef PRINTTUP_H +#define PRINTTUP_H + +#include "utils/portal.h" + +extern DestReceiver *printtup_create_DR(CommandDest dest); + +extern void SetRemoteDestReceiverParams(DestReceiver *self, Portal portal); + +extern void SendRowDescriptionMessage(StringInfo buf, + TupleDesc typeinfo, List *targetlist, int16 *formats); + +extern void debugStartup(DestReceiver *self, int operation, + TupleDesc 
typeinfo); +extern bool debugtup(TupleTableSlot *slot, DestReceiver *self); + +/* XXX these are really in executor/spi.c */ +extern void spi_dest_startup(DestReceiver *self, int operation, + TupleDesc typeinfo); +extern bool spi_printtup(TupleTableSlot *slot, DestReceiver *self); + +#endif /* PRINTTUP_H */ diff --git a/src/include/access/relation.h b/src/include/access/relation.h new file mode 100644 index 0000000..66efd36 --- /dev/null +++ b/src/include/access/relation.h @@ -0,0 +1,28 @@ +/*------------------------------------------------------------------------- + * + * relation.h + * Generic relation related routines. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/relation.h + * + *------------------------------------------------------------------------- + */ +#ifndef ACCESS_RELATION_H +#define ACCESS_RELATION_H + +#include "nodes/primnodes.h" +#include "storage/lockdefs.h" +#include "utils/relcache.h" + +extern Relation relation_open(Oid relationId, LOCKMODE lockmode); +extern Relation try_relation_open(Oid relationId, LOCKMODE lockmode); +extern Relation relation_openrv(const RangeVar *relation, LOCKMODE lockmode); +extern Relation relation_openrv_extended(const RangeVar *relation, + LOCKMODE lockmode, bool missing_ok); +extern void relation_close(Relation relation, LOCKMODE lockmode); + +#endif /* ACCESS_RELATION_H */ diff --git a/src/include/access/reloptions.h b/src/include/access/reloptions.h new file mode 100644 index 0000000..1d5bfa6 --- /dev/null +++ b/src/include/access/reloptions.h @@ -0,0 +1,247 @@ +/*------------------------------------------------------------------------- + * + * reloptions.h + * Core support for relation and tablespace options (pg_class.reloptions + * and pg_tablespace.spcoptions) + * + * Note: the functions dealing with text-array reloptions values declare + * them as Datum, not ArrayType *, to avoid needing to include array.h + * into a lot of low-level code. 
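The relation.h functions above follow a simple open/lock/close discipline. A minimal usage sketch, assuming the caller already knows the relation's OID; the function name is a placeholder.

#include "postgres.h"
#include "access/relation.h"
#include "storage/lockdefs.h"

static void
example_touch_relation(Oid relid)
{
	Relation	rel;

	rel = relation_open(relid, AccessShareLock);

	/* ... read-only work against rel ... */

	/*
	 * Passing a real lock mode here releases the lock immediately; pass
	 * NoLock instead to hold it until end of transaction.
	 */
	relation_close(rel, AccessShareLock);
}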
+ * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/reloptions.h + * + *------------------------------------------------------------------------- + */ +#ifndef RELOPTIONS_H +#define RELOPTIONS_H + +#include "access/amapi.h" +#include "access/htup.h" +#include "access/tupdesc.h" +#include "nodes/pg_list.h" +#include "storage/lock.h" + +/* types supported by reloptions */ +typedef enum relopt_type +{ + RELOPT_TYPE_BOOL, + RELOPT_TYPE_INT, + RELOPT_TYPE_REAL, + RELOPT_TYPE_ENUM, + RELOPT_TYPE_STRING +} relopt_type; + +/* kinds supported by reloptions */ +typedef enum relopt_kind +{ + RELOPT_KIND_LOCAL = 0, + RELOPT_KIND_HEAP = (1 << 0), + RELOPT_KIND_TOAST = (1 << 1), + RELOPT_KIND_BTREE = (1 << 2), + RELOPT_KIND_HASH = (1 << 3), + RELOPT_KIND_GIN = (1 << 4), + RELOPT_KIND_GIST = (1 << 5), + RELOPT_KIND_ATTRIBUTE = (1 << 6), + RELOPT_KIND_TABLESPACE = (1 << 7), + RELOPT_KIND_SPGIST = (1 << 8), + RELOPT_KIND_VIEW = (1 << 9), + RELOPT_KIND_BRIN = (1 << 10), + RELOPT_KIND_PARTITIONED = (1 << 11), + /* if you add a new kind, make sure you update "last_default" too */ + RELOPT_KIND_LAST_DEFAULT = RELOPT_KIND_PARTITIONED, + /* some compilers treat enums as signed ints, so we can't use 1 << 31 */ + RELOPT_KIND_MAX = (1 << 30) +} relopt_kind; + +/* reloption namespaces allowed for heaps -- currently only TOAST */ +#define HEAP_RELOPT_NAMESPACES { "toast", NULL } + +/* generic struct to hold shared data */ +typedef struct relopt_gen +{ + const char *name; /* must be first (used as list termination + * marker) */ + const char *desc; + bits32 kinds; + LOCKMODE lockmode; + int namelen; + relopt_type type; +} relopt_gen; + +/* holds a parsed value */ +typedef struct relopt_value +{ + relopt_gen *gen; + bool isset; + union + { + bool bool_val; + int int_val; + double real_val; + int enum_val; + char *string_val; /* allocated separately */ + } values; +} relopt_value; + +/* reloptions records for specific variable types */ +typedef struct relopt_bool +{ + relopt_gen gen; + bool default_val; +} relopt_bool; + +typedef struct relopt_int +{ + relopt_gen gen; + int default_val; + int min; + int max; +} relopt_int; + +typedef struct relopt_real +{ + relopt_gen gen; + double default_val; + double min; + double max; +} relopt_real; + +/* + * relopt_enum_elt_def -- One member of the array of acceptable values + * of an enum reloption. 
+ */ +typedef struct relopt_enum_elt_def +{ + const char *string_val; + int symbol_val; +} relopt_enum_elt_def; + +typedef struct relopt_enum +{ + relopt_gen gen; + relopt_enum_elt_def *members; + int default_val; + const char *detailmsg; + /* null-terminated array of members */ +} relopt_enum; + +/* validation routines for strings */ +typedef void (*validate_string_relopt) (const char *value); +typedef Size (*fill_string_relopt) (const char *value, void *ptr); + +/* validation routine for the whole option set */ +typedef void (*relopts_validator) (void *parsed_options, relopt_value *vals, int nvals); + +typedef struct relopt_string +{ + relopt_gen gen; + int default_len; + bool default_isnull; + validate_string_relopt validate_cb; + fill_string_relopt fill_cb; + char *default_val; +} relopt_string; + +/* This is the table datatype for build_reloptions() */ +typedef struct +{ + const char *optname; /* option's name */ + relopt_type opttype; /* option's datatype */ + int offset; /* offset of field in result struct */ +} relopt_parse_elt; + +/* Local reloption definition */ +typedef struct local_relopt +{ + relopt_gen *option; /* option definition */ + int offset; /* offset of parsed value in bytea structure */ +} local_relopt; + +/* Structure to hold local reloption data for build_local_reloptions() */ +typedef struct local_relopts +{ + List *options; /* list of local_relopt definitions */ + List *validators; /* list of relopts_validator callbacks */ + Size relopt_struct_size; /* size of parsed bytea structure */ +} local_relopts; + +/* + * Utility macro to get a value for a string reloption once the options + * are parsed. This gets a pointer to the string value itself. "optstruct" + * is the StdRdOptions struct or equivalent, "member" is the struct member + * corresponding to the string option. + */ +#define GET_STRING_RELOPTION(optstruct, member) \ + ((optstruct)->member == 0 ? 
NULL : \ + (char *)(optstruct) + (optstruct)->member) + +extern relopt_kind add_reloption_kind(void); +extern void add_bool_reloption(bits32 kinds, const char *name, const char *desc, + bool default_val, LOCKMODE lockmode); +extern void add_int_reloption(bits32 kinds, const char *name, const char *desc, + int default_val, int min_val, int max_val, + LOCKMODE lockmode); +extern void add_real_reloption(bits32 kinds, const char *name, const char *desc, + double default_val, double min_val, double max_val, + LOCKMODE lockmode); +extern void add_enum_reloption(bits32 kinds, const char *name, const char *desc, + relopt_enum_elt_def *members, int default_val, + const char *detailmsg, LOCKMODE lockmode); +extern void add_string_reloption(bits32 kinds, const char *name, const char *desc, + const char *default_val, validate_string_relopt validator, + LOCKMODE lockmode); + +extern void init_local_reloptions(local_relopts *relopts, Size relopt_struct_size); +extern void register_reloptions_validator(local_relopts *relopts, + relopts_validator validator); +extern void add_local_bool_reloption(local_relopts *relopts, const char *name, + const char *desc, bool default_val, + int offset); +extern void add_local_int_reloption(local_relopts *relopts, const char *name, + const char *desc, int default_val, + int min_val, int max_val, int offset); +extern void add_local_real_reloption(local_relopts *relopts, const char *name, + const char *desc, double default_val, + double min_val, double max_val, + int offset); +extern void add_local_enum_reloption(local_relopts *relopts, + const char *name, const char *desc, + relopt_enum_elt_def *members, + int default_val, const char *detailmsg, + int offset); +extern void add_local_string_reloption(local_relopts *relopts, const char *name, + const char *desc, + const char *default_val, + validate_string_relopt validator, + fill_string_relopt filler, int offset); + +extern Datum transformRelOptions(Datum oldOptions, List *defList, + const char *namspace, char *validnsps[], + bool acceptOidsOff, bool isReset); +extern List *untransformRelOptions(Datum options); +extern bytea *extractRelOptions(HeapTuple tuple, TupleDesc tupdesc, + amoptions_function amoptions); +extern void *build_reloptions(Datum reloptions, bool validate, + relopt_kind kind, + Size relopt_struct_size, + const relopt_parse_elt *relopt_elems, + int num_relopt_elems); +extern void *build_local_reloptions(local_relopts *relopts, Datum options, + bool validate); + +extern bytea *default_reloptions(Datum reloptions, bool validate, + relopt_kind kind); +extern bytea *heap_reloptions(char relkind, Datum reloptions, bool validate); +extern bytea *view_reloptions(Datum reloptions, bool validate); +extern bytea *partitioned_table_reloptions(Datum reloptions, bool validate); +extern bytea *index_reloptions(amoptions_function amoptions, Datum reloptions, + bool validate); +extern bytea *attribute_reloptions(Datum reloptions, bool validate); +extern bytea *tablespace_reloptions(Datum reloptions, bool validate); +extern LOCKMODE AlterTableGetRelOptionsLockLevel(List *defList); + +#endif /* RELOPTIONS_H */ diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h new file mode 100644 index 0000000..d03360e --- /dev/null +++ b/src/include/access/relscan.h @@ -0,0 +1,191 @@ +/*------------------------------------------------------------------------- + * + * relscan.h + * POSTGRES relation scan descriptor definitions. 
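As a usage illustration of the reloptions API declared above (not tied to any particular access method): an extension typically registers a new option kind and its options once, then parses them in its amoptions-style callback with a relopt_parse_elt table and build_reloptions(). All names, defaults, and the option itself are placeholders for this sketch.

#include "postgres.h"
#include "access/reloptions.h"

/* Parsed-options struct; the varlena length word must come first. */
typedef struct DemoOptions
{
	int32		vl_len_;		/* varlena header (filled by reloptions code) */
	int			demo_fillfactor;
} DemoOptions;

static relopt_kind demo_relopt_kind;

/* One-time registration, e.g. from an extension's _PG_init(). */
static void
demo_register_reloptions(void)
{
	demo_relopt_kind = add_reloption_kind();
	add_int_reloption(demo_relopt_kind, "demo_fillfactor",
					  "Packing level for demo index pages",
					  90, 10, 100, AccessExclusiveLock);
}

/* amoptions-style callback: parse reloptions into a DemoOptions bytea. */
static bytea *
demo_options(Datum reloptions, bool validate)
{
	static const relopt_parse_elt tab[] = {
		{"demo_fillfactor", RELOPT_TYPE_INT,
		 offsetof(DemoOptions, demo_fillfactor)},
	};

	return (bytea *) build_reloptions(reloptions, validate,
									  demo_relopt_kind,
									  sizeof(DemoOptions),
									  tab, lengthof(tab));
}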
+ * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/relscan.h + * + *------------------------------------------------------------------------- + */ +#ifndef RELSCAN_H +#define RELSCAN_H + +#include "access/htup_details.h" +#include "access/itup.h" +#include "port/atomics.h" +#include "storage/buf.h" +#include "storage/spin.h" +#include "utils/relcache.h" + + +struct ParallelTableScanDescData; + +/* + * Generic descriptor for table scans. This is the base-class for table scans, + * which needs to be embedded in the scans of individual AMs. + */ +typedef struct TableScanDescData +{ + /* scan parameters */ + Relation rs_rd; /* heap relation descriptor */ + struct SnapshotData *rs_snapshot; /* snapshot to see */ + int rs_nkeys; /* number of scan keys */ + struct ScanKeyData *rs_key; /* array of scan key descriptors */ + + /* Range of ItemPointers for table_scan_getnextslot_tidrange() to scan. */ + ItemPointerData rs_mintid; + ItemPointerData rs_maxtid; + + /* + * Information about type and behaviour of the scan, a bitmask of members + * of the ScanOptions enum (see tableam.h). + */ + uint32 rs_flags; + + struct ParallelTableScanDescData *rs_parallel; /* parallel scan + * information */ +} TableScanDescData; +typedef struct TableScanDescData *TableScanDesc; + +/* + * Shared state for parallel table scan. + * + * Each backend participating in a parallel table scan has its own + * TableScanDesc in backend-private memory, and those objects all contain a + * pointer to this structure. The information here must be sufficient to + * properly initialize each new TableScanDesc as workers join the scan, and it + * must act as a information what to scan for those workers. + */ +typedef struct ParallelTableScanDescData +{ + Oid phs_relid; /* OID of relation to scan */ + bool phs_syncscan; /* report location to syncscan logic? */ + bool phs_snapshot_any; /* SnapshotAny, not phs_snapshot_data? */ + Size phs_snapshot_off; /* data for snapshot */ +} ParallelTableScanDescData; +typedef struct ParallelTableScanDescData *ParallelTableScanDesc; + +/* + * Shared state for parallel table scans, for block oriented storage. + */ +typedef struct ParallelBlockTableScanDescData +{ + ParallelTableScanDescData base; + + BlockNumber phs_nblocks; /* # blocks in relation at start of scan */ + slock_t phs_mutex; /* mutual exclusion for setting startblock */ + BlockNumber phs_startblock; /* starting block number */ + pg_atomic_uint64 phs_nallocated; /* number of blocks allocated to + * workers so far. */ +} ParallelBlockTableScanDescData; +typedef struct ParallelBlockTableScanDescData *ParallelBlockTableScanDesc; + +/* + * Per backend state for parallel table scan, for block-oriented storage. + */ +typedef struct ParallelBlockTableScanWorkerData +{ + uint64 phsw_nallocated; /* Current # of blocks into the scan */ + uint32 phsw_chunk_remaining; /* # blocks left in this chunk */ + uint32 phsw_chunk_size; /* The number of blocks to allocate in + * each I/O chunk for the scan */ +} ParallelBlockTableScanWorkerData; +typedef struct ParallelBlockTableScanWorkerData *ParallelBlockTableScanWorker; + +/* + * Base class for fetches from a table via an index. This is the base-class + * for such scans, which needs to be embedded in the respective struct for + * individual AMs. 
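The "embed the base class as the first member" convention described above for TableScanDescData (and, just below, for IndexFetchTableData) looks roughly like this in an AM's private header. The AM and its fields are hypothetical; heapam's real descriptors carry many more fields.

#include "postgres.h"
#include "access/relscan.h"
#include "storage/block.h"
#include "storage/buf.h"

/*
 * Hypothetical AM-specific scan descriptor: the shared TableScanDescData
 * must be the first member, so a pointer to this struct can be handed
 * around as a generic TableScanDesc and cast back by the AM.
 */
typedef struct DemoScanDescData
{
	TableScanDescData rs_base;	/* AM-independent part, must be first */

	/* AM-private scan state follows */
	BlockNumber rs_cblock;		/* current block being scanned */
	Buffer		rs_cbuf;		/* pinned buffer for rs_cblock, if any */
} DemoScanDescData;
typedef struct DemoScanDescData *DemoScanDesc;

/* Index fetches follow the same convention with IndexFetchTableData. */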
+ */ +typedef struct IndexFetchTableData +{ + Relation rel; +} IndexFetchTableData; + +/* + * We use the same IndexScanDescData structure for both amgettuple-based + * and amgetbitmap-based index scans. Some fields are only relevant in + * amgettuple-based scans. + */ +typedef struct IndexScanDescData +{ + /* scan parameters */ + Relation heapRelation; /* heap relation descriptor, or NULL */ + Relation indexRelation; /* index relation descriptor */ + struct SnapshotData *xs_snapshot; /* snapshot to see */ + int numberOfKeys; /* number of index qualifier conditions */ + int numberOfOrderBys; /* number of ordering operators */ + struct ScanKeyData *keyData; /* array of index qualifier descriptors */ + struct ScanKeyData *orderByData; /* array of ordering op descriptors */ + bool xs_want_itup; /* caller requests index tuples */ + bool xs_temp_snap; /* unregister snapshot at scan end? */ + + /* signaling to index AM about killing index tuples */ + bool kill_prior_tuple; /* last-returned tuple is dead */ + bool ignore_killed_tuples; /* do not return killed entries */ + bool xactStartedInRecovery; /* prevents killing/seeing killed + * tuples */ + + /* index access method's private state */ + void *opaque; /* access-method-specific info */ + + /* + * In an index-only scan, a successful amgettuple call must fill either + * xs_itup (and xs_itupdesc) or xs_hitup (and xs_hitupdesc) to provide the + * data returned by the scan. It can fill both, in which case the heap + * format will be used. + */ + IndexTuple xs_itup; /* index tuple returned by AM */ + struct TupleDescData *xs_itupdesc; /* rowtype descriptor of xs_itup */ + HeapTuple xs_hitup; /* index data returned by AM, as HeapTuple */ + struct TupleDescData *xs_hitupdesc; /* rowtype descriptor of xs_hitup */ + + ItemPointerData xs_heaptid; /* result */ + bool xs_heap_continue; /* T if must keep walking, potential + * further results */ + IndexFetchTableData *xs_heapfetch; + + bool xs_recheck; /* T means scan keys must be rechecked */ + + /* + * When fetching with an ordering operator, the values of the ORDER BY + * expressions of the last returned tuple, according to the index. If + * xs_recheckorderby is true, these need to be rechecked just like the + * scan keys, and the values returned here are a lower-bound on the actual + * values. 
+ */ + Datum *xs_orderbyvals; + bool *xs_orderbynulls; + bool xs_recheckorderby; + + /* parallel index scan information, in shared memory */ + struct ParallelIndexScanDescData *parallel_scan; +} IndexScanDescData; + +/* Generic structure for parallel scans */ +typedef struct ParallelIndexScanDescData +{ + Oid ps_relid; + Oid ps_indexid; + Size ps_offset; /* Offset in bytes of am specific structure */ + char ps_snapshot_data[FLEXIBLE_ARRAY_MEMBER]; +} ParallelIndexScanDescData; + +struct TupleTableSlot; + +/* Struct for storage-or-index scans of system tables */ +typedef struct SysScanDescData +{ + Relation heap_rel; /* catalog being scanned */ + Relation irel; /* NULL if doing heap scan */ + struct TableScanDescData *scan; /* only valid in storage-scan case */ + struct IndexScanDescData *iscan; /* only valid in index-scan case */ + struct SnapshotData *snapshot; /* snapshot to unregister at end of scan */ + struct TupleTableSlot *slot; +} SysScanDescData; + +#endif /* RELSCAN_H */ diff --git a/src/include/access/rewriteheap.h b/src/include/access/rewriteheap.h new file mode 100644 index 0000000..1125457 --- /dev/null +++ b/src/include/access/rewriteheap.h @@ -0,0 +1,57 @@ +/*------------------------------------------------------------------------- + * + * rewriteheap.h + * Declarations for heap rewrite support functions + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994-5, Regents of the University of California + * + * src/include/access/rewriteheap.h + * + *------------------------------------------------------------------------- + */ +#ifndef REWRITE_HEAP_H +#define REWRITE_HEAP_H + +#include "access/htup.h" +#include "storage/itemptr.h" +#include "storage/relfilelocator.h" +#include "utils/relcache.h" + +/* struct definition is private to rewriteheap.c */ +typedef struct RewriteStateData *RewriteState; + +extern RewriteState begin_heap_rewrite(Relation old_heap, Relation new_heap, + TransactionId oldest_xmin, TransactionId freeze_xid, + MultiXactId cutoff_multi); +extern void end_heap_rewrite(RewriteState state); +extern void rewrite_heap_tuple(RewriteState state, HeapTuple old_tuple, + HeapTuple new_tuple); +extern bool rewrite_heap_dead_tuple(RewriteState state, HeapTuple old_tuple); + +/* + * On-Disk data format for an individual logical rewrite mapping. + */ +typedef struct LogicalRewriteMappingData +{ + RelFileLocator old_locator; + RelFileLocator new_locator; + ItemPointerData old_tid; + ItemPointerData new_tid; +} LogicalRewriteMappingData; + +/* --- + * The filename consists of the following, dash separated, + * components: + * 1) database oid or InvalidOid for shared relations + * 2) the oid of the relation + * 3) upper 32bit of the LSN at which a rewrite started + * 4) lower 32bit of the LSN at which a rewrite started + * 5) xid we are mapping for + * 6) xid of the xact performing the mapping + * --- + */ +#define LOGICAL_REWRITE_FORMAT "map-%x-%x-%X_%X-%x-%x" +extern void CheckPointLogicalRewriteHeap(void); + +#endif /* REWRITE_HEAP_H */ diff --git a/src/include/access/rmgr.h b/src/include/access/rmgr.h new file mode 100644 index 0000000..3b6a497 --- /dev/null +++ b/src/include/access/rmgr.h @@ -0,0 +1,62 @@ +/* + * rmgr.h + * + * Resource managers definition + * + * src/include/access/rmgr.h + */ +#ifndef RMGR_H +#define RMGR_H + +typedef uint8 RmgrId; + +/* + * Built-in resource managers + * + * The actual numerical values for each rmgr ID are defined by the order + * of entries in rmgrlist.h. 
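Stepping back to rewriteheap.h above, the intended call sequence is begin / per-tuple rewrite / end. A minimal sketch, assuming the caller supplies the relations and cutoff values (as cluster.c does in reality); the function name is illustrative.

#include "postgres.h"
#include "access/rewriteheap.h"

static void
example_rewrite_heap(Relation OldHeap, Relation NewHeap,
					 TransactionId OldestXmin, TransactionId FreezeXid,
					 MultiXactId MultiXactCutoff)
{
	RewriteState rwstate;

	rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin,
								 FreezeXid, MultiXactCutoff);

	/*
	 * For each tuple read from OldHeap the caller builds the new version and
	 * calls rewrite_heap_tuple(rwstate, old_tuple, new_tuple); recently-dead
	 * tuples that must be retained go through rewrite_heap_dead_tuple().
	 */

	end_heap_rewrite(rwstate);
}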
+ * + * Note: RM_MAX_ID must fit in RmgrId; widening that type will affect the XLOG + * file format. + */ +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode) \ + symname, + +typedef enum RmgrIds +{ +#include "access/rmgrlist.h" + RM_NEXT_ID +} RmgrIds; + +#undef PG_RMGR + +#define RM_MAX_ID UINT8_MAX +#define RM_MAX_BUILTIN_ID (RM_NEXT_ID - 1) +#define RM_MIN_CUSTOM_ID 128 +#define RM_MAX_CUSTOM_ID UINT8_MAX +#define RM_N_IDS (UINT8_MAX + 1) +#define RM_N_BUILTIN_IDS (RM_MAX_BUILTIN_ID + 1) +#define RM_N_CUSTOM_IDS (RM_MAX_CUSTOM_ID - RM_MIN_CUSTOM_ID + 1) + +static inline bool +RmgrIdIsBuiltin(int rmid) +{ + return rmid <= RM_MAX_BUILTIN_ID; +} + +static inline bool +RmgrIdIsCustom(int rmid) +{ + return rmid >= RM_MIN_CUSTOM_ID && rmid <= RM_MAX_CUSTOM_ID; +} + +#define RmgrIdIsValid(rmid) (RmgrIdIsBuiltin((rmid)) || RmgrIdIsCustom((rmid))) + +/* + * RmgrId to use for extensions that require an RmgrId, but are still in + * development and have not reserved their own unique RmgrId yet. See: + * https://wiki.postgresql.org/wiki/CustomWALResourceManagers + */ +#define RM_EXPERIMENTAL_ID 128 + +#endif /* RMGR_H */ diff --git a/src/include/access/rmgrdesc_utils.h b/src/include/access/rmgrdesc_utils.h new file mode 100644 index 0000000..bd41469 --- /dev/null +++ b/src/include/access/rmgrdesc_utils.h @@ -0,0 +1,66 @@ +/*------------------------------------------------------------------------- + * + * rmgrdesc_utils.h + * Support functions for rmgrdesc routines + * + * Copyright (c) 2023, PostgreSQL Global Development Group + * + * src/include/access/rmgrdesc_utils.h + * + *------------------------------------------------------------------------- + */ +#ifndef RMGRDESC_UTILS_H_ +#define RMGRDESC_UTILS_H_ + +/* + * Guidelines for rmgrdesc routine authors: + * + * The goal of these guidelines is to avoid gratuitous inconsistencies across + * each rmgr, and to allow users to parse desc output strings without too much + * difficulty. This is not an API specification or an interchange format. + * (Only heapam and nbtree desc routines follow these guidelines at present, + * in any case.) + * + * Record descriptions are similar to JSON style key/value objects. However, + * there is no explicit "string" type/string escaping. Top-level { } brackets + * should be omitted. For example: + * + * snapshotConflictHorizon: 0, flags: 0x03 + * + * Record descriptions may contain variable-length arrays. For example: + * + * nunused: 5, unused: [1, 2, 3, 4, 5] + * + * Nested objects are supported via { } brackets. They generally appear + * inside variable-length arrays. For example: + * + * ndeleted: 0, nupdated: 1, deleted: [], updated: [{ off: 45, nptids: 1, ptids: [0] }] + * + * Try to output things in an order that faithfully represents the order of + * fields from the underlying physical WAL record struct. Key names should be + * unique (at the same nesting level) to make parsing easy. It's a good idea + * if the number of items in the array appears before the array. + * + * It's okay for individual WAL record types to invent their own conventions. + * For example, Heap2's PRUNE record descriptions use a custom array format + * for the record's "redirected" field: + * + * ... redirected: [1->4, 5->9], dead: [10, 11], unused: [3, 7, 8] + * + * Arguably the desc routine should be using object notation for this instead. + * However, there is value in using a custom format when it conveys useful + * information about the underlying physical data structures. 
+ * + * This ad-hoc format has the advantage of being close to the format used for + * the "dead" and "unused" arrays (which follow the standard desc convention + * for page offset number arrays). It suggests that the "redirected" elements + * shown are just pairs of page offset numbers (which is how it really works). + */ +extern void array_desc(StringInfo buf, void *array, size_t elem_size, int count, + void (*elem_desc) (StringInfo buf, void *elem, void *data), + void *data); +extern void offset_elem_desc(StringInfo buf, void *offset, void *data); +extern void redirect_elem_desc(StringInfo buf, void *offset, void *data); +extern void oid_elem_desc(StringInfo buf, void *relid, void *data); + +#endif /* RMGRDESC_UTILS_H_ */ diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h new file mode 100644 index 0000000..463bcb6 --- /dev/null +++ b/src/include/access/rmgrlist.h @@ -0,0 +1,49 @@ +/*--------------------------------------------------------------------------- + * rmgrlist.h + * + * The resource manager list is kept in its own source file for possible + * use by automatic tools. The exact representation of a rmgr is determined + * by the PG_RMGR macro, which is not defined in this file; it can be + * defined by the caller for special purposes. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/rmgrlist.h + *--------------------------------------------------------------------------- + */ + +/* there is deliberately not an #ifndef RMGRLIST_H here */ + +/* + * List of resource manager entries. Note that order of entries defines the + * numerical values of each rmgr's ID, which is stored in WAL records. New + * entries should be added at the end, to avoid changing IDs of existing + * entries. + * + * Changes to this list possibly need an XLOG_PAGE_MAGIC bump. 
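Because rmgrlist.h is an X-macro list, each consumer supplies its own PG_RMGR definition before including it, exactly as rmgr.h does above to build the RmgrIds enum. A sketch of a second, hypothetical expansion that builds a table of built-in rmgr names indexed by RmgrId:

#include "postgres.h"
#include "access/rmgr.h"

#define PG_RMGR(symname, name, redo, desc, identify, startup, cleanup, mask, decode) \
	[symname] = name,

static const char *const BuiltinRmgrNames[RM_MAX_BUILTIN_ID + 1] = {
#include "access/rmgrlist.h"
};

#undef PG_RMGR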
+ */ + +/* symbol name, textual name, redo, desc, identify, startup, cleanup, mask, decode */ +PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL, NULL, xlog_decode) +PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL, NULL, xact_decode) +PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL, NULL, NULL) +PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL, NULL, NULL) +PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL, NULL, NULL) +PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL, NULL, NULL) +PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL, NULL, NULL) +PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL, NULL, NULL) +PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL, standby_decode) +PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask, heap2_decode) +PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask, heap_decode) +PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, btree_xlog_startup, btree_xlog_cleanup, btree_mask, NULL) +PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask, NULL) +PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask, NULL) +PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask, NULL) +PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL, seq_mask, NULL) +PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup, spg_mask, NULL) +PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL, brin_mask, NULL) +PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL, NULL, NULL) +PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL, NULL) +PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask, NULL) +PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL, logicalmsg_decode) diff --git a/src/include/access/sdir.h b/src/include/access/sdir.h new file mode 100644 index 0000000..322aeb3 --- /dev/null +++ b/src/include/access/sdir.h @@ -0,0 +1,67 @@ +/*------------------------------------------------------------------------- + * + * sdir.h + * POSTGRES scan direction definitions. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/sdir.h + * + *------------------------------------------------------------------------- + */ +#ifndef SDIR_H +#define SDIR_H + + +/* + * Defines the direction for scanning a table or an index. Scans are never + * invoked using NoMovementScanDirectionScans. For convenience, we use the + * values -1 and 1 for backward and forward scans. This allows us to perform + * a few mathematical tricks such as what is done in ScanDirectionCombine. 
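The "mathematical trick" alluded to above is just multiplication of the -1/0/+1 values via the ScanDirectionCombine macro defined a few lines below. A tiny sketch of the idea; the function is illustrative.

#include "postgres.h"
#include "access/sdir.h"

static ScanDirection
example_effective_direction(ScanDirection plan_dir, ScanDirection exec_dir)
{
	/*
	 * Backward (-1) combined with Backward (-1) yields Forward (+1);
	 * combining anything with NoMovement (0) yields NoMovement.
	 */
	return (ScanDirection) ScanDirectionCombine(plan_dir, exec_dir);
}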
+ */ +typedef enum ScanDirection +{ + BackwardScanDirection = -1, + NoMovementScanDirection = 0, + ForwardScanDirection = 1 +} ScanDirection; + +/* + * Determine the net effect of two direction specifications. + * This relies on having ForwardScanDirection = +1, BackwardScanDirection = -1, + * and will probably not do what you want if applied to any other values. + */ +#define ScanDirectionCombine(a, b) ((a) * (b)) + +/* + * ScanDirectionIsValid + * True iff scan direction is valid. + */ +#define ScanDirectionIsValid(direction) \ + ((bool) (BackwardScanDirection <= (direction) && \ + (direction) <= ForwardScanDirection)) + +/* + * ScanDirectionIsBackward + * True iff scan direction is backward. + */ +#define ScanDirectionIsBackward(direction) \ + ((bool) ((direction) == BackwardScanDirection)) + +/* + * ScanDirectionIsNoMovement + * True iff scan direction indicates no movement. + */ +#define ScanDirectionIsNoMovement(direction) \ + ((bool) ((direction) == NoMovementScanDirection)) + +/* + * ScanDirectionIsForward + * True iff scan direction is forward. + */ +#define ScanDirectionIsForward(direction) \ + ((bool) ((direction) == ForwardScanDirection)) + +#endif /* SDIR_H */ diff --git a/src/include/access/session.h b/src/include/access/session.h new file mode 100644 index 0000000..f905fe4 --- /dev/null +++ b/src/include/access/session.h @@ -0,0 +1,44 @@ +/*------------------------------------------------------------------------- + * + * session.h + * Encapsulation of user session. + * + * Copyright (c) 2017-2023, PostgreSQL Global Development Group + * + * src/include/access/session.h + * + *------------------------------------------------------------------------- + */ +#ifndef SESSION_H +#define SESSION_H + +#include "lib/dshash.h" + +/* Avoid including typcache.h */ +struct SharedRecordTypmodRegistry; + +/* + * A struct encapsulating some elements of a user's session. For now this + * manages state that applies to parallel query, but in principle it could + * include other things that are currently global variables. + */ +typedef struct Session +{ + dsm_segment *segment; /* The session-scoped DSM segment. */ + dsa_area *area; /* The session-scoped DSA area. */ + + /* State managed by typcache.c. */ + struct SharedRecordTypmodRegistry *shared_typmod_registry; + dshash_table *shared_record_table; + dshash_table *shared_typmod_table; +} Session; + +extern void InitializeSession(void); +extern dsm_handle GetSessionDsmHandle(void); +extern void AttachSession(dsm_handle handle); +extern void DetachSession(void); + +/* The current session, or NULL for none. */ +extern PGDLLIMPORT Session *CurrentSession; + +#endif /* SESSION_H */ diff --git a/src/include/access/skey.h b/src/include/access/skey.h new file mode 100644 index 0000000..fbdb23c --- /dev/null +++ b/src/include/access/skey.h @@ -0,0 +1,151 @@ +/*------------------------------------------------------------------------- + * + * skey.h + * POSTGRES scan key definitions. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/skey.h + * + *------------------------------------------------------------------------- + */ +#ifndef SKEY_H +#define SKEY_H + +#include "access/attnum.h" +#include "access/stratnum.h" +#include "fmgr.h" + + +/* + * A ScanKey represents the application of a comparison operator between + * a table or index column and a constant. 
When it's part of an array of + * ScanKeys, the comparison conditions are implicitly ANDed. The index + * column is the left argument of the operator, if it's a binary operator. + * (The data structure can support unary indexable operators too; in that + * case sk_argument would go unused. This is not currently implemented.) + * + * For an index scan, sk_strategy and sk_subtype must be set correctly for + * the operator. When using a ScanKey in a heap scan, these fields are not + * used and may be set to InvalidStrategy/InvalidOid. + * + * If the operator is collation-sensitive, sk_collation must be set + * correctly as well. + * + * A ScanKey can also represent a ScalarArrayOpExpr, that is a condition + * "column op ANY(ARRAY[...])". This is signaled by the SK_SEARCHARRAY + * flag bit. The sk_argument is not a value of the operator's right-hand + * argument type, but rather an array of such values, and the per-element + * comparisons are to be ORed together. + * + * A ScanKey can also represent a condition "column IS NULL" or "column + * IS NOT NULL"; these cases are signaled by the SK_SEARCHNULL and + * SK_SEARCHNOTNULL flag bits respectively. The argument is always NULL, + * and the sk_strategy, sk_subtype, sk_collation, and sk_func fields are + * not used (unless set by the index AM). + * + * SK_SEARCHARRAY, SK_SEARCHNULL and SK_SEARCHNOTNULL are supported only + * for index scans, not heap scans; and not all index AMs support them, + * only those that set amsearcharray or amsearchnulls respectively. + * + * A ScanKey can also represent an ordering operator invocation, that is + * an ordering requirement "ORDER BY indexedcol op constant". This looks + * the same as a comparison operator, except that the operator doesn't + * (usually) yield boolean. We mark such ScanKeys with SK_ORDER_BY. + * SK_SEARCHARRAY, SK_SEARCHNULL, SK_SEARCHNOTNULL cannot be used here. + * + * Note: in some places, ScanKeys are used as a convenient representation + * for the invocation of an access method support procedure. In this case + * sk_strategy/sk_subtype are not meaningful (but sk_collation can be); and + * sk_func may refer to a function that returns something other than boolean. + */ +typedef struct ScanKeyData +{ + int sk_flags; /* flags, see below */ + AttrNumber sk_attno; /* table or index column number */ + StrategyNumber sk_strategy; /* operator strategy number */ + Oid sk_subtype; /* strategy subtype */ + Oid sk_collation; /* collation to use, if needed */ + FmgrInfo sk_func; /* lookup info for function to call */ + Datum sk_argument; /* data to compare */ +} ScanKeyData; + +typedef ScanKeyData *ScanKey; + +/* + * About row comparisons: + * + * The ScanKey data structure also supports row comparisons, that is ordered + * tuple comparisons like (x, y) > (c1, c2), having the SQL-spec semantics + * "x > c1 OR (x = c1 AND y > c2)". Note that this is currently only + * implemented for btree index searches, not for heapscans or any other index + * type. A row comparison is represented by a "header" ScanKey entry plus + * a separate array of ScanKeys, one for each column of the row comparison. 
+ * The header entry has these properties: + * sk_flags = SK_ROW_HEADER + * sk_attno = index column number for leading column of row comparison + * sk_strategy = btree strategy code for semantics of row comparison + * (ie, < <= > or >=) + * sk_subtype, sk_collation, sk_func: not used + * sk_argument: pointer to subsidiary ScanKey array + * If the header is part of a ScanKey array that's sorted by attno, it + * must be sorted according to the leading column number. + * + * The subsidiary ScanKey array appears in logical column order of the row + * comparison, which may be different from index column order. The array + * elements are like a normal ScanKey array except that: + * sk_flags must include SK_ROW_MEMBER, plus SK_ROW_END in the last + * element (needed since row header does not include a count) + * sk_func points to the btree comparison support function for the + * opclass, NOT the operator's implementation function. + * sk_strategy must be the same in all elements of the subsidiary array, + * that is, the same as in the header entry. + * SK_SEARCHARRAY, SK_SEARCHNULL, SK_SEARCHNOTNULL cannot be used here. + */ + +/* + * ScanKeyData sk_flags + * + * sk_flags bits 0-15 are reserved for system-wide use (symbols for those + * bits should be defined here). Bits 16-31 are reserved for use within + * individual index access methods. + */ +#define SK_ISNULL 0x0001 /* sk_argument is NULL */ +#define SK_UNARY 0x0002 /* unary operator (not supported!) */ +#define SK_ROW_HEADER 0x0004 /* row comparison header (see above) */ +#define SK_ROW_MEMBER 0x0008 /* row comparison member (see above) */ +#define SK_ROW_END 0x0010 /* last row comparison member */ +#define SK_SEARCHARRAY 0x0020 /* scankey represents ScalarArrayOp */ +#define SK_SEARCHNULL 0x0040 /* scankey represents "col IS NULL" */ +#define SK_SEARCHNOTNULL 0x0080 /* scankey represents "col IS NOT NULL" */ +#define SK_ORDER_BY 0x0100 /* scankey is for ORDER BY op */ + + +/* + * prototypes for functions in access/common/scankey.c + */ +extern void ScanKeyInit(ScanKey entry, + AttrNumber attributeNumber, + StrategyNumber strategy, + RegProcedure procedure, + Datum argument); +extern void ScanKeyEntryInitialize(ScanKey entry, + int flags, + AttrNumber attributeNumber, + StrategyNumber strategy, + Oid subtype, + Oid collation, + RegProcedure procedure, + Datum argument); +extern void ScanKeyEntryInitializeWithInfo(ScanKey entry, + int flags, + AttrNumber attributeNumber, + StrategyNumber strategy, + Oid subtype, + Oid collation, + FmgrInfo *finfo, + Datum argument); + +#endif /* SKEY_H */ diff --git a/src/include/access/slru.h b/src/include/access/slru.h new file mode 100644 index 0000000..a8a424d --- /dev/null +++ b/src/include/access/slru.h @@ -0,0 +1,174 @@ +/*------------------------------------------------------------------------- + * + * slru.h + * Simple LRU buffering for transaction status logfiles + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/slru.h + * + *------------------------------------------------------------------------- + */ +#ifndef SLRU_H +#define SLRU_H + +#include "access/xlogdefs.h" +#include "storage/lwlock.h" +#include "storage/sync.h" + + +/* + * Define SLRU segment size. A page is the same BLCKSZ as is used everywhere + * else in Postgres. The segment size can be chosen somewhat arbitrarily; + * we make it 32 pages by default, or 256Kb, i.e. 
1M transactions for CLOG + * or 64K transactions for SUBTRANS. + * + * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF, + * page numbering also wraps around at 0xFFFFFFFF/xxxx_XACTS_PER_PAGE (where + * xxxx is CLOG or SUBTRANS, respectively), and segment numbering at + * 0xFFFFFFFF/xxxx_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need + * take no explicit notice of that fact in slru.c, except when comparing + * segment and page numbers in SimpleLruTruncate (see PagePrecedes()). + */ +#define SLRU_PAGES_PER_SEGMENT 32 + +/* + * Page status codes. Note that these do not include the "dirty" bit. + * page_dirty can be true only in the VALID or WRITE_IN_PROGRESS states; + * in the latter case it implies that the page has been re-dirtied since + * the write started. + */ +typedef enum +{ + SLRU_PAGE_EMPTY, /* buffer is not in use */ + SLRU_PAGE_READ_IN_PROGRESS, /* page is being read in */ + SLRU_PAGE_VALID, /* page is valid and not being written */ + SLRU_PAGE_WRITE_IN_PROGRESS /* page is being written out */ +} SlruPageStatus; + +/* + * Shared-memory state + */ +typedef struct SlruSharedData +{ + LWLock *ControlLock; + + /* Number of buffers managed by this SLRU structure */ + int num_slots; + + /* + * Arrays holding info for each buffer slot. Page number is undefined + * when status is EMPTY, as is page_lru_count. + */ + char **page_buffer; + SlruPageStatus *page_status; + bool *page_dirty; + int *page_number; + int *page_lru_count; + LWLockPadded *buffer_locks; + + /* + * Optional array of WAL flush LSNs associated with entries in the SLRU + * pages. If not zero/NULL, we must flush WAL before writing pages (true + * for pg_xact, false for multixact, pg_subtrans, pg_notify). group_lsn[] + * has lsn_groups_per_page entries per buffer slot, each containing the + * highest LSN known for a contiguous group of SLRU entries on that slot's + * page. + */ + XLogRecPtr *group_lsn; + int lsn_groups_per_page; + + /*---------- + * We mark a page "most recently used" by setting + * page_lru_count[slotno] = ++cur_lru_count; + * The oldest page is therefore the one with the highest value of + * cur_lru_count - page_lru_count[slotno] + * The counts will eventually wrap around, but this calculation still + * works as long as no page's age exceeds INT_MAX counts. + *---------- + */ + int cur_lru_count; + + /* + * latest_page_number is the page number of the current end of the log; + * this is not critical data, since we use it only to avoid swapping out + * the latest page. + */ + int latest_page_number; + + /* SLRU's index for statistics purposes (might not be unique) */ + int slru_stats_idx; +} SlruSharedData; + +typedef SlruSharedData *SlruShared; + +/* + * SlruCtlData is an unshared structure that points to the active information + * in shared memory. + */ +typedef struct SlruCtlData +{ + SlruShared shared; + + /* + * Which sync handler function to use when handing sync requests over to + * the checkpointer. SYNC_HANDLER_NONE to disable fsync (eg pg_notify). + */ + SyncRequestHandler sync_handler; + + /* + * Decide whether a page is "older" for truncation and as a hint for + * evicting pages in LRU order. Return true if every entry of the first + * argument is older than every entry of the second argument. Note that + * !PagePrecedes(a,b) && !PagePrecedes(b,a) need not imply a==b; it also + * arises when some entries are older and some are not. For SLRUs using + * SimpleLruTruncate(), this must use modular arithmetic. 
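A modular-arithmetic PagePrecedes callback of the kind required above typically maps page numbers back to representative entry values and compares those with wraparound-aware helpers. A sketch patterned after the transaction-status SLRUs; the entries-per-page constant and function name are placeholders.

#include "postgres.h"
#include "access/slru.h"
#include "access/transam.h"

#define DEMO_ENTRIES_PER_PAGE	2048	/* placeholder; real SLRUs derive
										 * this from BLCKSZ */

static bool
demo_page_precedes(int page1, int page2)
{
	TransactionId xid1;
	TransactionId xid2;

	xid1 = ((TransactionId) page1) * DEMO_ENTRIES_PER_PAGE;
	xid1 += FirstNormalTransactionId + 1;
	xid2 = ((TransactionId) page2) * DEMO_ENTRIES_PER_PAGE;
	xid2 += FirstNormalTransactionId + 1;

	/* true only if every entry on page1 precedes every entry on page2 */
	return (TransactionIdPrecedes(xid1, xid2) &&
			TransactionIdPrecedes(xid1, xid2 + DEMO_ENTRIES_PER_PAGE - 1));
}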
(For others, + * the behavior of this callback has no functional implications.) Use + * SlruPagePrecedesUnitTests() in SLRUs meeting its criteria. + */ + bool (*PagePrecedes) (int, int); + + /* + * Dir is set during SimpleLruInit and does not change thereafter. Since + * it's always the same, it doesn't need to be in shared memory. + */ + char Dir[64]; +} SlruCtlData; + +typedef SlruCtlData *SlruCtl; + + +extern Size SimpleLruShmemSize(int nslots, int nlsns); +extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, + LWLock *ctllock, const char *subdir, int tranche_id, + SyncRequestHandler sync_handler); +extern int SimpleLruZeroPage(SlruCtl ctl, int pageno); +extern int SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok, + TransactionId xid); +extern int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, + TransactionId xid); +extern void SimpleLruWritePage(SlruCtl ctl, int slotno); +extern void SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied); +#ifdef USE_ASSERT_CHECKING +extern void SlruPagePrecedesUnitTests(SlruCtl ctl, int per_page); +#else +#define SlruPagePrecedesUnitTests(ctl, per_page) do {} while (0) +#endif +extern void SimpleLruTruncate(SlruCtl ctl, int cutoffPage); +extern bool SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno); + +typedef bool (*SlruScanCallback) (SlruCtl ctl, char *filename, int segpage, + void *data); +extern bool SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data); +extern void SlruDeleteSegment(SlruCtl ctl, int segno); + +extern int SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path); + +/* SlruScanDirectory public callbacks */ +extern bool SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, + int segpage, void *data); +extern bool SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int segpage, + void *data); + +#endif /* SLRU_H */ diff --git a/src/include/access/spgist.h b/src/include/access/spgist.h new file mode 100644 index 0000000..fe31d32 --- /dev/null +++ b/src/include/access/spgist.h @@ -0,0 +1,229 @@ +/*------------------------------------------------------------------------- + * + * spgist.h + * Public header file for SP-GiST access method. 
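Closing out the SLRU API above: a module that owns an SLRU usually exposes a shmem-size function and an init function wrapping SimpleLruShmemSize()/SimpleLruInit(). A sketch with placeholder names, slot count, directory, and lock; SYNC_HANDLER_NONE disables fsync hand-off, as described above for pg_notify.

#include "postgres.h"
#include "access/slru.h"
#include "storage/sync.h"

static SlruCtlData DemoSlruCtlData;
#define DemoSlruCtl (&DemoSlruCtlData)

static Size
DemoSlruShmemSize(void)
{
	return SimpleLruShmemSize(16 /* nslots */ , 0 /* nlsns */ );
}

static void
DemoSlruShmemInit(LWLock *ctllock, int tranche_id)
{
	SimpleLruInit(DemoSlruCtl, "DemoSLRU", 16, 0,
				  ctllock, "pg_demo", tranche_id,
				  SYNC_HANDLER_NONE);
}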
+ * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/spgist.h + * + *------------------------------------------------------------------------- + */ +#ifndef SPGIST_H +#define SPGIST_H + +#include "access/amapi.h" +#include "access/xlogreader.h" +#include "lib/stringinfo.h" + + +/* SPGiST opclass support function numbers */ +#define SPGIST_CONFIG_PROC 1 +#define SPGIST_CHOOSE_PROC 2 +#define SPGIST_PICKSPLIT_PROC 3 +#define SPGIST_INNER_CONSISTENT_PROC 4 +#define SPGIST_LEAF_CONSISTENT_PROC 5 +#define SPGIST_COMPRESS_PROC 6 +#define SPGIST_OPTIONS_PROC 7 +#define SPGISTNRequiredProc 5 +#define SPGISTNProc 7 + +/* + * Argument structs for spg_config method + */ +typedef struct spgConfigIn +{ + Oid attType; /* Data type to be indexed */ +} spgConfigIn; + +typedef struct spgConfigOut +{ + Oid prefixType; /* Data type of inner-tuple prefixes */ + Oid labelType; /* Data type of inner-tuple node labels */ + Oid leafType; /* Data type of leaf-tuple values */ + bool canReturnData; /* Opclass can reconstruct original data */ + bool longValuesOK; /* Opclass can cope with values > 1 page */ +} spgConfigOut; + +/* + * Argument structs for spg_choose method + */ +typedef struct spgChooseIn +{ + Datum datum; /* original datum to be indexed */ + Datum leafDatum; /* current datum to be stored at leaf */ + int level; /* current level (counting from zero) */ + + /* Data from current inner tuple */ + bool allTheSame; /* tuple is marked all-the-same? */ + bool hasPrefix; /* tuple has a prefix? */ + Datum prefixDatum; /* if so, the prefix value */ + int nNodes; /* number of nodes in the inner tuple */ + Datum *nodeLabels; /* node label values (NULL if none) */ +} spgChooseIn; + +typedef enum spgChooseResultType +{ + spgMatchNode = 1, /* descend into existing node */ + spgAddNode, /* add a node to the inner tuple */ + spgSplitTuple /* split inner tuple (change its prefix) */ +} spgChooseResultType; + +typedef struct spgChooseOut +{ + spgChooseResultType resultType; /* action code, see above */ + union + { + struct /* results for spgMatchNode */ + { + int nodeN; /* descend to this node (index from 0) */ + int levelAdd; /* increment level by this much */ + Datum restDatum; /* new leaf datum */ + } matchNode; + struct /* results for spgAddNode */ + { + Datum nodeLabel; /* new node's label */ + int nodeN; /* where to insert it (index from 0) */ + } addNode; + struct /* results for spgSplitTuple */ + { + /* Info to form new upper-level inner tuple with one child tuple */ + bool prefixHasPrefix; /* tuple should have a prefix? */ + Datum prefixPrefixDatum; /* if so, its value */ + int prefixNNodes; /* number of nodes */ + Datum *prefixNodeLabels; /* their labels (or NULL for no + * labels) */ + int childNodeN; /* which node gets child tuple */ + + /* Info to form new lower-level inner tuple with all old nodes */ + bool postfixHasPrefix; /* tuple should have a prefix? */ + Datum postfixPrefixDatum; /* if so, its value */ + } splitTuple; + } result; +} spgChooseOut; + +/* + * Argument structs for spg_picksplit method + */ +typedef struct spgPickSplitIn +{ + int nTuples; /* number of leaf tuples */ + Datum *datums; /* their datums (array of length nTuples) */ + int level; /* current level (counting from zero) */ +} spgPickSplitIn; + +typedef struct spgPickSplitOut +{ + bool hasPrefix; /* new inner tuple should have a prefix? 
*/ + Datum prefixDatum; /* if so, its value */ + + int nNodes; /* number of nodes for new inner tuple */ + Datum *nodeLabels; /* their labels (or NULL for no labels) */ + + int *mapTuplesToNodes; /* node index for each leaf tuple */ + Datum *leafTupleDatums; /* datum to store in each new leaf tuple */ +} spgPickSplitOut; + +/* + * Argument structs for spg_inner_consistent method + */ +typedef struct spgInnerConsistentIn +{ + ScanKey scankeys; /* array of operators and comparison values */ + ScanKey orderbys; /* array of ordering operators and comparison + * values */ + int nkeys; /* length of scankeys array */ + int norderbys; /* length of orderbys array */ + + Datum reconstructedValue; /* value reconstructed at parent */ + void *traversalValue; /* opclass-specific traverse value */ + MemoryContext traversalMemoryContext; /* put new traverse values here */ + int level; /* current level (counting from zero) */ + bool returnData; /* original data must be returned? */ + + /* Data from current inner tuple */ + bool allTheSame; /* tuple is marked all-the-same? */ + bool hasPrefix; /* tuple has a prefix? */ + Datum prefixDatum; /* if so, the prefix value */ + int nNodes; /* number of nodes in the inner tuple */ + Datum *nodeLabels; /* node label values (NULL if none) */ +} spgInnerConsistentIn; + +typedef struct spgInnerConsistentOut +{ + int nNodes; /* number of child nodes to be visited */ + int *nodeNumbers; /* their indexes in the node array */ + int *levelAdds; /* increment level by this much for each */ + Datum *reconstructedValues; /* associated reconstructed values */ + void **traversalValues; /* opclass-specific traverse values */ + double **distances; /* associated distances */ +} spgInnerConsistentOut; + +/* + * Argument structs for spg_leaf_consistent method + */ +typedef struct spgLeafConsistentIn +{ + ScanKey scankeys; /* array of operators and comparison values */ + ScanKey orderbys; /* array of ordering operators and comparison + * values */ + int nkeys; /* length of scankeys array */ + int norderbys; /* length of orderbys array */ + + Datum reconstructedValue; /* value reconstructed at parent */ + void *traversalValue; /* opclass-specific traverse value */ + int level; /* current level (counting from zero) */ + bool returnData; /* original data must be returned? 
*/ + + Datum leafDatum; /* datum in leaf tuple */ +} spgLeafConsistentIn; + +typedef struct spgLeafConsistentOut +{ + Datum leafValue; /* reconstructed original data, if any */ + bool recheck; /* set true if operator must be rechecked */ + bool recheckDistances; /* set true if distances must be rechecked */ + double *distances; /* associated distances */ +} spgLeafConsistentOut; + + +/* spgutils.c */ +extern bytea *spgoptions(Datum reloptions, bool validate); + +/* spginsert.c */ +extern IndexBuildResult *spgbuild(Relation heap, Relation index, + struct IndexInfo *indexInfo); +extern void spgbuildempty(Relation index); +extern bool spginsert(Relation index, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + struct IndexInfo *indexInfo); + +/* spgscan.c */ +extern IndexScanDesc spgbeginscan(Relation rel, int keysz, int orderbysz); +extern void spgendscan(IndexScanDesc scan); +extern void spgrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, + ScanKey orderbys, int norderbys); +extern int64 spggetbitmap(IndexScanDesc scan, TIDBitmap *tbm); +extern bool spggettuple(IndexScanDesc scan, ScanDirection dir); +extern bool spgcanreturn(Relation index, int attno); + +/* spgvacuum.c */ +extern IndexBulkDeleteResult *spgbulkdelete(IndexVacuumInfo *info, + IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, + void *callback_state); +extern IndexBulkDeleteResult *spgvacuumcleanup(IndexVacuumInfo *info, + IndexBulkDeleteResult *stats); + +/* spgvalidate.c */ +extern bool spgvalidate(Oid opclassoid); +extern void spgadjustmembers(Oid opfamilyoid, + Oid opclassoid, + List *operators, + List *functions); + +#endif /* SPGIST_H */ diff --git a/src/include/access/spgist_private.h b/src/include/access/spgist_private.h new file mode 100644 index 0000000..c6ef46f --- /dev/null +++ b/src/include/access/spgist_private.h @@ -0,0 +1,548 @@ +/*------------------------------------------------------------------------- + * + * spgist_private.h + * Private declarations for SP-GiST access method. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/spgist_private.h + * + *------------------------------------------------------------------------- + */ +#ifndef SPGIST_PRIVATE_H +#define SPGIST_PRIVATE_H + +#include "access/itup.h" +#include "access/spgist.h" +#include "catalog/pg_am_d.h" +#include "nodes/tidbitmap.h" +#include "storage/buf.h" +#include "utils/geo_decls.h" +#include "utils/relcache.h" + + +typedef struct SpGistOptions +{ + int32 varlena_header_; /* varlena header (do not touch directly!) */ + int fillfactor; /* page fill factor in percent (0..100) */ +} SpGistOptions; + +#define SpGistGetFillFactor(relation) \ + (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \ + relation->rd_rel->relam == SPGIST_AM_OID), \ + (relation)->rd_options ? 
\ + ((SpGistOptions *) (relation)->rd_options)->fillfactor : \ + SPGIST_DEFAULT_FILLFACTOR) +#define SpGistGetTargetPageFreeSpace(relation) \ + (BLCKSZ * (100 - SpGistGetFillFactor(relation)) / 100) + + +/* SPGiST leaf tuples have one key column, optionally have included columns */ +#define spgKeyColumn 0 +#define spgFirstIncludeColumn 1 + +/* Page numbers of fixed-location pages */ +#define SPGIST_METAPAGE_BLKNO (0) /* metapage */ +#define SPGIST_ROOT_BLKNO (1) /* root for normal entries */ +#define SPGIST_NULL_BLKNO (2) /* root for null-value entries */ +#define SPGIST_LAST_FIXED_BLKNO SPGIST_NULL_BLKNO + +#define SpGistBlockIsRoot(blkno) \ + ((blkno) == SPGIST_ROOT_BLKNO || (blkno) == SPGIST_NULL_BLKNO) +#define SpGistBlockIsFixed(blkno) \ + ((BlockNumber) (blkno) <= (BlockNumber) SPGIST_LAST_FIXED_BLKNO) + +/* + * Contents of page special space on SPGiST index pages + */ +typedef struct SpGistPageOpaqueData +{ + uint16 flags; /* see bit definitions below */ + uint16 nRedirection; /* number of redirection tuples on page */ + uint16 nPlaceholder; /* number of placeholder tuples on page */ + /* note there's no count of either LIVE or DEAD tuples ... */ + uint16 spgist_page_id; /* for identification of SP-GiST indexes */ +} SpGistPageOpaqueData; + +typedef SpGistPageOpaqueData *SpGistPageOpaque; + +/* Flag bits in page special space */ +#define SPGIST_META (1<<0) +#define SPGIST_DELETED (1<<1) /* never set, but keep for backwards + * compatibility */ +#define SPGIST_LEAF (1<<2) +#define SPGIST_NULLS (1<<3) + +#define SpGistPageGetOpaque(page) ((SpGistPageOpaque) PageGetSpecialPointer(page)) +#define SpGistPageIsMeta(page) (SpGistPageGetOpaque(page)->flags & SPGIST_META) +#define SpGistPageIsDeleted(page) (SpGistPageGetOpaque(page)->flags & SPGIST_DELETED) +#define SpGistPageIsLeaf(page) (SpGistPageGetOpaque(page)->flags & SPGIST_LEAF) +#define SpGistPageStoresNulls(page) (SpGistPageGetOpaque(page)->flags & SPGIST_NULLS) + +/* + * The page ID is for the convenience of pg_filedump and similar utilities, + * which otherwise would have a hard time telling pages of different index + * types apart. It should be the last 2 bytes on the page. This is more or + * less "free" due to alignment considerations. + * + * See comments above GinPageOpaqueData. + */ +#define SPGIST_PAGE_ID 0xFF82 + +/* + * Each backend keeps a cache of last-used page info in its index->rd_amcache + * area. This is initialized from, and occasionally written back to, + * shared storage in the index metapage. + */ +typedef struct SpGistLastUsedPage +{ + BlockNumber blkno; /* block number, or InvalidBlockNumber */ + int freeSpace; /* page's free space (could be obsolete!) */ +} SpGistLastUsedPage; + +/* Note: indexes in cachedPage[] match flag assignments for SpGistGetBuffer */ +#define SPGIST_CACHED_PAGES 8 + +typedef struct SpGistLUPCache +{ + SpGistLastUsedPage cachedPage[SPGIST_CACHED_PAGES]; +} SpGistLUPCache; + +/* + * metapage + */ +typedef struct SpGistMetaPageData +{ + uint32 magicNumber; /* for identity cross-check */ + SpGistLUPCache lastUsedPages; /* shared storage of last-used info */ +} SpGistMetaPageData; + +#define SPGIST_MAGIC_NUMBER (0xBA0BABEE) + +#define SpGistPageGetMeta(p) \ + ((SpGistMetaPageData *) PageGetContents(p)) + +/* + * Private state of index AM. SpGistState is common to both insert and + * search code; SpGistScanOpaque is for searches only. 
+ */ + +typedef struct SpGistLeafTupleData *SpGistLeafTuple; /* forward reference */ + +/* Per-datatype info needed in SpGistState */ +typedef struct SpGistTypeDesc +{ + Oid type; + int16 attlen; + bool attbyval; + char attalign; + char attstorage; +} SpGistTypeDesc; + +typedef struct SpGistState +{ + Relation index; /* index we're working with */ + + spgConfigOut config; /* filled in by opclass config method */ + + SpGistTypeDesc attType; /* type of values to be indexed/restored */ + SpGistTypeDesc attLeafType; /* type of leaf-tuple values */ + SpGistTypeDesc attPrefixType; /* type of inner-tuple prefix values */ + SpGistTypeDesc attLabelType; /* type of node label values */ + + /* leafTupDesc typically points to index's tupdesc, but not always */ + TupleDesc leafTupDesc; /* descriptor for leaf-level tuples */ + + char *deadTupleStorage; /* workspace for spgFormDeadTuple */ + + TransactionId myXid; /* XID to use when creating a redirect tuple */ + bool isBuild; /* true if doing index build */ +} SpGistState; + +/* Item to be re-examined later during a search */ +typedef struct SpGistSearchItem +{ + pairingheap_node phNode; /* pairing heap node */ + Datum value; /* value reconstructed from parent, or + * leafValue if isLeaf */ + SpGistLeafTuple leafTuple; /* whole leaf tuple, if needed */ + void *traversalValue; /* opclass-specific traverse value */ + int level; /* level of items on this page */ + ItemPointerData heapPtr; /* heap info, if heap tuple */ + bool isNull; /* SearchItem is NULL item */ + bool isLeaf; /* SearchItem is heap item */ + bool recheck; /* qual recheck is needed */ + bool recheckDistances; /* distance recheck is needed */ + + /* array with numberOfOrderBys entries */ + double distances[FLEXIBLE_ARRAY_MEMBER]; +} SpGistSearchItem; + +#define SizeOfSpGistSearchItem(n_distances) \ + (offsetof(SpGistSearchItem, distances) + sizeof(double) * (n_distances)) + +/* + * Private state of an index scan + */ +typedef struct SpGistScanOpaqueData +{ + SpGistState state; /* see above */ + pairingheap *scanQueue; /* queue of to be visited items */ + MemoryContext tempCxt; /* short-lived memory context */ + MemoryContext traversalCxt; /* single scan lifetime memory context */ + + /* Control flags showing whether to search nulls and/or non-nulls */ + bool searchNulls; /* scan matches (all) null entries */ + bool searchNonNulls; /* scan matches (some) non-null entries */ + + /* Index quals to be passed to opclass (null-related quals removed) */ + int numberOfKeys; /* number of index qualifier conditions */ + ScanKey keyData; /* array of index qualifier descriptors */ + int numberOfOrderBys; /* number of ordering operators */ + int numberOfNonNullOrderBys; /* number of ordering operators + * with non-NULL arguments */ + ScanKey orderByData; /* array of ordering op descriptors */ + Oid *orderByTypes; /* array of ordering op return types */ + int *nonNullOrderByOffsets; /* array of offset of non-NULL + * ordering keys in the original array */ + Oid indexCollation; /* collation of index column */ + + /* Opclass defined functions: */ + FmgrInfo innerConsistentFn; + FmgrInfo leafConsistentFn; + + /* Pre-allocated workspace arrays: */ + double *zeroDistances; + double *infDistances; + + /* These fields are only used in amgetbitmap scans: */ + TIDBitmap *tbm; /* bitmap being filled */ + int64 ntids; /* number of TIDs passed to bitmap */ + + /* These fields are only used in amgettuple scans: */ + bool want_itup; /* are we reconstructing tuples? 
*/ + TupleDesc reconTupDesc; /* if so, descriptor for reconstructed tuples */ + int nPtrs; /* number of TIDs found on current page */ + int iPtr; /* index for scanning through same */ + ItemPointerData heapPtrs[MaxIndexTuplesPerPage]; /* TIDs from cur page */ + bool recheck[MaxIndexTuplesPerPage]; /* their recheck flags */ + bool recheckDistances[MaxIndexTuplesPerPage]; /* distance recheck + * flags */ + HeapTuple reconTups[MaxIndexTuplesPerPage]; /* reconstructed tuples */ + + /* distances (for recheck) */ + IndexOrderByDistance *distances[MaxIndexTuplesPerPage]; + + /* + * Note: using MaxIndexTuplesPerPage above is a bit hokey since + * SpGistLeafTuples aren't exactly IndexTuples; however, they are larger, + * so this is safe. + */ +} SpGistScanOpaqueData; + +typedef SpGistScanOpaqueData *SpGistScanOpaque; + +/* + * This struct is what we actually keep in index->rd_amcache. It includes + * static configuration information as well as the lastUsedPages cache. + */ +typedef struct SpGistCache +{ + spgConfigOut config; /* filled in by opclass config method */ + + SpGistTypeDesc attType; /* type of values to be indexed/restored */ + SpGistTypeDesc attLeafType; /* type of leaf-tuple values */ + SpGistTypeDesc attPrefixType; /* type of inner-tuple prefix values */ + SpGistTypeDesc attLabelType; /* type of node label values */ + + SpGistLUPCache lastUsedPages; /* local storage of last-used info */ +} SpGistCache; + + +/* + * SPGiST tuple types. Note: inner, leaf, and dead tuple structs + * must have the same tupstate field in the same position! Real inner and + * leaf tuples always have tupstate = LIVE; if the state is something else, + * use the SpGistDeadTuple struct to inspect the tuple. + */ + +/* values of tupstate (see README for more info) */ +#define SPGIST_LIVE 0 /* normal live tuple (either inner or leaf) */ +#define SPGIST_REDIRECT 1 /* temporary redirection placeholder */ +#define SPGIST_DEAD 2 /* dead, cannot be removed because of links */ +#define SPGIST_PLACEHOLDER 3 /* placeholder, used to preserve offsets */ + +/* + * SPGiST inner tuple: list of "nodes" that subdivide a set of tuples + * + * Inner tuple layout: + * header/optional prefix/array of nodes, which are SpGistNodeTuples + * + * size and prefixSize must be multiples of MAXALIGN + * + * If the prefix datum is of a pass-by-value type, it is stored in its + * Datum representation, that is its on-disk representation is of length + * sizeof(Datum). This is a fairly unfortunate choice, because in no other + * place does Postgres use Datum as an on-disk representation; it creates + * an unnecessary incompatibility between 32-bit and 64-bit builds. But the + * compatibility loss is mostly theoretical since MAXIMUM_ALIGNOF typically + * differs between such builds, too. Anyway we're stuck with it now. 
+ */ +typedef struct SpGistInnerTupleData +{ + unsigned int tupstate:2, /* LIVE/REDIRECT/DEAD/PLACEHOLDER */ + allTheSame:1, /* all nodes in tuple are equivalent */ + nNodes:13, /* number of nodes within inner tuple */ + prefixSize:16; /* size of prefix, or 0 if none */ + uint16 size; /* total size of inner tuple */ + /* On most machines there will be a couple of wasted bytes here */ + /* prefix datum follows, then nodes */ +} SpGistInnerTupleData; + +typedef SpGistInnerTupleData *SpGistInnerTuple; + +/* these must match largest values that fit in bit fields declared above */ +#define SGITMAXNNODES 0x1FFF +#define SGITMAXPREFIXSIZE 0xFFFF +#define SGITMAXSIZE 0xFFFF + +#define SGITHDRSZ MAXALIGN(sizeof(SpGistInnerTupleData)) +#define _SGITDATA(x) (((char *) (x)) + SGITHDRSZ) +#define SGITDATAPTR(x) ((x)->prefixSize ? _SGITDATA(x) : NULL) +#define SGITDATUM(x, s) ((x)->prefixSize ? \ + ((s)->attPrefixType.attbyval ? \ + *(Datum *) _SGITDATA(x) : \ + PointerGetDatum(_SGITDATA(x))) \ + : (Datum) 0) +#define SGITNODEPTR(x) ((SpGistNodeTuple) (_SGITDATA(x) + (x)->prefixSize)) + +/* Macro for iterating through the nodes of an inner tuple */ +#define SGITITERATE(x, i, nt) \ + for ((i) = 0, (nt) = SGITNODEPTR(x); \ + (i) < (x)->nNodes; \ + (i)++, (nt) = (SpGistNodeTuple) (((char *) (nt)) + IndexTupleSize(nt))) + +/* + * SPGiST node tuple: one node within an inner tuple + * + * Node tuples use the same header as ordinary Postgres IndexTuples, but + * we do not use a null bitmap, because we know there is only one column + * so the INDEX_NULL_MASK bit suffices. Also, pass-by-value datums are + * stored in Datum form, the same convention as for inner tuple prefixes. + */ + +typedef IndexTupleData SpGistNodeTupleData; + +typedef SpGistNodeTupleData *SpGistNodeTuple; + +#define SGNTHDRSZ MAXALIGN(sizeof(SpGistNodeTupleData)) +#define SGNTDATAPTR(x) (((char *) (x)) + SGNTHDRSZ) +#define SGNTDATUM(x, s) ((s)->attLabelType.attbyval ? \ + *(Datum *) SGNTDATAPTR(x) : \ + PointerGetDatum(SGNTDATAPTR(x))) + +/* + * SPGiST leaf tuple: carries a leaf datum and a heap tuple TID, + * and optionally some "included" columns. + * + * In the simplest case, the leaf datum is the same as the indexed value; + * but it could also be a suffix or some other sort of delta that permits + * reconstruction given knowledge of the prefix path traversed to get here. + * Any included columns are stored without modification. + * + * A nulls bitmap is present if there are included columns AND any of the + * datums are NULL. We do not need a nulls bitmap for the case of a null + * leaf datum without included columns, as we can infer whether the leaf + * datum is null from whether the tuple is stored on a nulls page. (This + * provision is mostly for backwards compatibility, but it does save space + * on 32-bit machines.) As with other PG index tuple designs, if the nulls + * bitmap exists then it's of size INDEX_MAX_KEYS bits regardless of the + * actual number of attributes. For the usual choice of INDEX_MAX_KEYS, + * this costs nothing because of alignment considerations. + * + * The size field is wider than could possibly be needed for an on-disk leaf + * tuple, but this allows us to form leaf tuples even when the datum is too + * wide to be stored immediately, and it costs nothing because of alignment + * considerations. + * + * t_info holds the nextOffset field (14 bits wide, enough for supported + * page sizes) plus the has-nulls-bitmap flag bit; another flag bit is free. 
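As an illustrative aside (editor's sketch, not part of this patch): the SGITITERATE macro above is the intended way to walk the variable-length node array of an inner tuple. walk_inner_nodes is a hypothetical helper; the node's downlink TID lives in the ordinary IndexTupleData t_tid field, and its label (if the tuple has labels at all) can be fetched with SGNTDATUM given an SpGistState.

static void
walk_inner_nodes(SpGistState *state, SpGistInnerTuple innerTuple)
{
	int			i;
	SpGistNodeTuple node;

	SGITITERATE(innerTuple, i, node)
	{
		ItemPointer downlink = &node->t_tid;	/* child block/offset, if valid */
		Datum		label = SGNTDATUM(node, state); /* meaningful only when the
													 * tuple carries node labels */

		/* ... examine downlink / label here ... */
		(void) downlink;
		(void) label;
	}
}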
+ * + * Normally, nextOffset links to the next tuple belonging to the same parent + * node (which must be on the same page), or it's 0 if there is no next tuple. + * But when the root page is a leaf page, we don't chain its tuples, + * so nextOffset is always 0 on the root. + * + * size must be a multiple of MAXALIGN; also, it must be at least SGDTSIZE + * so that the tuple can be converted to REDIRECT status later. (This + * restriction only adds bytes for a NULL leaf datum stored on a 32-bit + * machine; otherwise alignment restrictions force it anyway.) + */ +typedef struct SpGistLeafTupleData +{ + unsigned int tupstate:2, /* LIVE/REDIRECT/DEAD/PLACEHOLDER */ + size:30; /* large enough for any palloc'able value */ + uint16 t_info; /* nextOffset, which links to the next tuple + * in chain, plus two flag bits */ + ItemPointerData heapPtr; /* TID of represented heap tuple */ + /* nulls bitmap follows if the flag bit for it is set */ + /* leaf datum, then any included datums, follows on a MAXALIGN boundary */ +} SpGistLeafTupleData; + +/* Macros to access nextOffset and bit fields inside t_info */ +#define SGLT_GET_NEXTOFFSET(spgLeafTuple) \ + ((spgLeafTuple)->t_info & 0x3FFF) +#define SGLT_GET_HASNULLMASK(spgLeafTuple) \ + (((spgLeafTuple)->t_info & 0x8000) ? true : false) +#define SGLT_SET_NEXTOFFSET(spgLeafTuple, offsetNumber) \ + ((spgLeafTuple)->t_info = \ + ((spgLeafTuple)->t_info & 0xC000) | ((offsetNumber) & 0x3FFF)) +#define SGLT_SET_HASNULLMASK(spgLeafTuple, hasnulls) \ + ((spgLeafTuple)->t_info = \ + ((spgLeafTuple)->t_info & 0x7FFF) | ((hasnulls) ? 0x8000 : 0)) + +#define SGLTHDRSZ(hasnulls) \ + ((hasnulls) ? MAXALIGN(sizeof(SpGistLeafTupleData) + \ + sizeof(IndexAttributeBitMapData)) : \ + MAXALIGN(sizeof(SpGistLeafTupleData))) +#define SGLTDATAPTR(x) (((char *) (x)) + SGLTHDRSZ(SGLT_GET_HASNULLMASK(x))) +#define SGLTDATUM(x, s) fetch_att(SGLTDATAPTR(x), \ + (s)->attLeafType.attbyval, \ + (s)->attLeafType.attlen) + +/* + * SPGiST dead tuple: declaration for examining non-live tuples + * + * The tupstate field of this struct must match those of regular inner and + * leaf tuples, and its size field must match a leaf tuple's. + * Also, the pointer field must be in the same place as a leaf tuple's heapPtr + * field, to satisfy some Asserts that we make when replacing a leaf tuple + * with a dead tuple. + * We don't use t_info, but it's needed to align the pointer field. + * pointer and xid are only valid when tupstate = REDIRECT. + */ +typedef struct SpGistDeadTupleData +{ + unsigned int tupstate:2, /* LIVE/REDIRECT/DEAD/PLACEHOLDER */ + size:30; + uint16 t_info; /* not used in dead tuples */ + ItemPointerData pointer; /* redirection inside index */ + TransactionId xid; /* ID of xact that inserted this tuple */ +} SpGistDeadTupleData; + +typedef SpGistDeadTupleData *SpGistDeadTuple; + +#define SGDTSIZE MAXALIGN(sizeof(SpGistDeadTupleData)) + +/* + * Macros for doing free-space calculations. Note that when adding up the + * space needed for tuples, we always consider each tuple to need the tuple's + * size plus sizeof(ItemIdData) (for the line pointer). This works correctly + * so long as tuple sizes are always maxaligned. 
+ */ + +/* Page capacity after allowing for fixed header and special space */ +#define SPGIST_PAGE_CAPACITY \ + MAXALIGN_DOWN(BLCKSZ - \ + SizeOfPageHeaderData - \ + MAXALIGN(sizeof(SpGistPageOpaqueData))) + +/* + * Compute free space on page, assuming that up to n placeholders can be + * recycled if present (n should be the number of tuples to be inserted) + */ +#define SpGistPageGetFreeSpace(p, n) \ + (PageGetExactFreeSpace(p) + \ + Min(SpGistPageGetOpaque(p)->nPlaceholder, n) * \ + (SGDTSIZE + sizeof(ItemIdData))) + +/* + * XLOG stuff + */ + +#define STORE_STATE(s, d) \ + do { \ + (d).myXid = (s)->myXid; \ + (d).isBuild = (s)->isBuild; \ + } while(0) + +/* + * The "flags" argument for SpGistGetBuffer should be either GBUF_LEAF to + * get a leaf page, or GBUF_INNER_PARITY(blockNumber) to get an inner + * page in the same triple-parity group as the specified block number. + * (Typically, this should be GBUF_INNER_PARITY(parentBlockNumber + 1) + * to follow the rule described in spgist/README.) + * In addition, GBUF_NULLS can be OR'd in to get a page for storage of + * null-valued tuples. + * + * Note: these flag values are used as indexes into lastUsedPages. + */ +#define GBUF_LEAF 0x03 +#define GBUF_INNER_PARITY(x) ((x) % 3) +#define GBUF_NULLS 0x04 + +#define GBUF_PARITY_MASK 0x03 +#define GBUF_REQ_LEAF(flags) (((flags) & GBUF_PARITY_MASK) == GBUF_LEAF) +#define GBUF_REQ_NULLS(flags) ((flags) & GBUF_NULLS) + +/* spgutils.c */ + +/* reloption parameters */ +#define SPGIST_MIN_FILLFACTOR 10 +#define SPGIST_DEFAULT_FILLFACTOR 80 + +extern SpGistCache *spgGetCache(Relation index); +extern TupleDesc getSpGistTupleDesc(Relation index, SpGistTypeDesc *keyType); +extern void initSpGistState(SpGistState *state, Relation index); +extern Buffer SpGistNewBuffer(Relation index); +extern void SpGistUpdateMetaPage(Relation index); +extern Buffer SpGistGetBuffer(Relation index, int flags, + int needSpace, bool *isNew); +extern void SpGistSetLastUsedPage(Relation index, Buffer buffer); +extern void SpGistInitPage(Page page, uint16 f); +extern void SpGistInitBuffer(Buffer b, uint16 f); +extern void SpGistInitMetapage(Page page); +extern unsigned int SpGistGetInnerTypeSize(SpGistTypeDesc *att, Datum datum); +extern Size SpGistGetLeafTupleSize(TupleDesc tupleDescriptor, + Datum *datums, bool *isnulls); +extern SpGistLeafTuple spgFormLeafTuple(SpGistState *state, + ItemPointer heapPtr, + Datum *datums, bool *isnulls); +extern SpGistNodeTuple spgFormNodeTuple(SpGistState *state, + Datum label, bool isnull); +extern SpGistInnerTuple spgFormInnerTuple(SpGistState *state, + bool hasPrefix, Datum prefix, + int nNodes, SpGistNodeTuple *nodes); +extern SpGistDeadTuple spgFormDeadTuple(SpGistState *state, int tupstate, + BlockNumber blkno, OffsetNumber offnum); +extern void spgDeformLeafTuple(SpGistLeafTuple tup, TupleDesc tupleDescriptor, + Datum *datums, bool *isnulls, + bool keyColumnIsNull); +extern Datum *spgExtractNodeLabels(SpGistState *state, + SpGistInnerTuple innerTuple); +extern OffsetNumber SpGistPageAddNewItem(SpGistState *state, Page page, + Item item, Size size, + OffsetNumber *startOffset, + bool errorOK); +extern bool spgproperty(Oid index_oid, int attno, + IndexAMProperty prop, const char *propname, + bool *res, bool *isnull); + +/* spgdoinsert.c */ +extern void spgUpdateNodeLink(SpGistInnerTuple tup, int nodeN, + BlockNumber blkno, OffsetNumber offset); +extern void spgPageIndexMultiDelete(SpGistState *state, Page page, + OffsetNumber *itemnos, int nitems, + int firststate, int reststate, + 
BlockNumber blkno, OffsetNumber offnum); +extern bool spgdoinsert(Relation index, SpGistState *state, + ItemPointer heapPtr, Datum *datums, bool *isnulls); + +/* spgproc.c */ +extern double *spg_key_orderbys_distances(Datum key, bool isLeaf, + ScanKey orderbys, int norderbys); +extern BOX *box_copy(BOX *orig); + +#endif /* SPGIST_PRIVATE_H */ diff --git a/src/include/access/spgxlog.h b/src/include/access/spgxlog.h new file mode 100644 index 0000000..a7f10bf --- /dev/null +++ b/src/include/access/spgxlog.h @@ -0,0 +1,259 @@ +/*------------------------------------------------------------------------- + * + * spgxlog.h + * xlog declarations for SP-GiST access method. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/spgxlog.h + * + *------------------------------------------------------------------------- + */ +#ifndef SPGXLOG_H +#define SPGXLOG_H + +#include "access/xlogreader.h" +#include "lib/stringinfo.h" +#include "storage/off.h" + +/* XLOG record types for SPGiST */ + /* #define XLOG_SPGIST_CREATE_INDEX 0x00 */ /* not used anymore */ +#define XLOG_SPGIST_ADD_LEAF 0x10 +#define XLOG_SPGIST_MOVE_LEAFS 0x20 +#define XLOG_SPGIST_ADD_NODE 0x30 +#define XLOG_SPGIST_SPLIT_TUPLE 0x40 +#define XLOG_SPGIST_PICKSPLIT 0x50 +#define XLOG_SPGIST_VACUUM_LEAF 0x60 +#define XLOG_SPGIST_VACUUM_ROOT 0x70 +#define XLOG_SPGIST_VACUUM_REDIRECT 0x80 + +/* + * Some redo functions need an SpGistState, although only a few of its fields + * need to be valid. spgxlogState carries the required info in xlog records. + * (See fillFakeState in spgxlog.c for more comments.) + */ +typedef struct spgxlogState +{ + TransactionId myXid; + bool isBuild; +} spgxlogState; + +/* + * Backup Blk 0: destination page for leaf tuple + * Backup Blk 1: parent page (if any) + */ +typedef struct spgxlogAddLeaf +{ + bool newPage; /* init dest page? */ + bool storesNulls; /* page is in the nulls tree? */ + OffsetNumber offnumLeaf; /* offset where leaf tuple gets placed */ + OffsetNumber offnumHeadLeaf; /* offset of head tuple in chain, if any */ + + OffsetNumber offnumParent; /* where the parent downlink is, if any */ + uint16 nodeI; + + /* new leaf tuple follows (unaligned!) */ +} spgxlogAddLeaf; + +/* + * Backup Blk 0: source leaf page + * Backup Blk 1: destination leaf page + * Backup Blk 2: parent page + */ +typedef struct spgxlogMoveLeafs +{ + uint16 nMoves; /* number of tuples moved from source page */ + bool newPage; /* init dest page? */ + bool replaceDead; /* are we replacing a DEAD source tuple? */ + bool storesNulls; /* pages are in the nulls tree? */ + + /* where the parent downlink is */ + OffsetNumber offnumParent; + uint16 nodeI; + + spgxlogState stateSrc; + + /*---------- + * data follows: + * array of deleted tuple numbers, length nMoves + * array of inserted tuple numbers, length nMoves + 1 or 1 + * list of leaf tuples, length nMoves + 1 or 1 (unaligned!) 
+ * + * Note: if replaceDead is true then there is only one inserted tuple + * number and only one leaf tuple in the data, because we are not copying + * the dead tuple from the source + *---------- + */ + OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; +} spgxlogMoveLeafs; + +#define SizeOfSpgxlogMoveLeafs offsetof(spgxlogMoveLeafs, offsets) + +/* + * Backup Blk 0: original page + * Backup Blk 1: where new tuple goes, if not same place + * Backup Blk 2: where parent downlink is, if updated and different from + * the old and new + */ +typedef struct spgxlogAddNode +{ + /* + * Offset of the original inner tuple, in the original page (on backup + * block 0). + */ + OffsetNumber offnum; + + /* + * Offset of the new tuple, on the new page (on backup block 1). Invalid, + * if we overwrote the old tuple in the original page). + */ + OffsetNumber offnumNew; + bool newPage; /* init new page? */ + + /*---- + * Where is the parent downlink? parentBlk indicates which page it's on, + * and offnumParent is the offset within the page. The possible values for + * parentBlk are: + * + * 0: parent == original page + * 1: parent == new page + * 2: parent == different page (blk ref 2) + * -1: parent not updated + *---- + */ + int8 parentBlk; + OffsetNumber offnumParent; /* offset within the parent page */ + + uint16 nodeI; + + spgxlogState stateSrc; + + /* + * updated inner tuple follows (unaligned!) + */ +} spgxlogAddNode; + +/* + * Backup Blk 0: where the prefix tuple goes + * Backup Blk 1: where the postfix tuple goes (if different page) + */ +typedef struct spgxlogSplitTuple +{ + /* where the prefix tuple goes */ + OffsetNumber offnumPrefix; + + /* where the postfix tuple goes */ + OffsetNumber offnumPostfix; + bool newPage; /* need to init that page? */ + bool postfixBlkSame; /* was postfix tuple put on same page as + * prefix? */ + + /* + * new prefix inner tuple follows, then new postfix inner tuple (both are + * unaligned!) + */ +} spgxlogSplitTuple; + +/* + * Buffer references in the rdata array are: + * Backup Blk 0: Src page (only if not root) + * Backup Blk 1: Dest page (if used) + * Backup Blk 2: Inner page + * Backup Blk 3: Parent page (if any, and different from Inner) + */ +typedef struct spgxlogPickSplit +{ + bool isRootSplit; + + uint16 nDelete; /* n to delete from Src */ + uint16 nInsert; /* n to insert on Src and/or Dest */ + bool initSrc; /* re-init the Src page? */ + bool initDest; /* re-init the Dest page? */ + + /* where to put new inner tuple */ + OffsetNumber offnumInner; + bool initInner; /* re-init the Inner page? */ + + bool storesNulls; /* pages are in the nulls tree? */ + + /* where the parent downlink is, if any */ + bool innerIsParent; /* is parent the same as inner page? */ + OffsetNumber offnumParent; + uint16 nodeI; + + spgxlogState stateSrc; + + /*---------- + * data follows: + * array of deleted tuple numbers, length nDelete + * array of inserted tuple numbers, length nInsert + * array of page selector bytes for inserted tuples, length nInsert + * new inner tuple (unaligned!) + * list of leaf tuples, length nInsert (unaligned!) 
+ *---------- + */ + OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; +} spgxlogPickSplit; + +#define SizeOfSpgxlogPickSplit offsetof(spgxlogPickSplit, offsets) + +typedef struct spgxlogVacuumLeaf +{ + uint16 nDead; /* number of tuples to become DEAD */ + uint16 nPlaceholder; /* number of tuples to become PLACEHOLDER */ + uint16 nMove; /* number of tuples to move */ + uint16 nChain; /* number of tuples to re-chain */ + + spgxlogState stateSrc; + + /*---------- + * data follows: + * tuple numbers to become DEAD + * tuple numbers to become PLACEHOLDER + * tuple numbers to move from (and replace with PLACEHOLDER) + * tuple numbers to move to (replacing what is there) + * tuple numbers to update nextOffset links of + * tuple numbers to insert in nextOffset links + *---------- + */ + OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; +} spgxlogVacuumLeaf; + +#define SizeOfSpgxlogVacuumLeaf offsetof(spgxlogVacuumLeaf, offsets) + +typedef struct spgxlogVacuumRoot +{ + /* vacuum a root page when it is also a leaf */ + uint16 nDelete; /* number of tuples to delete */ + + spgxlogState stateSrc; + + /* offsets of tuples to delete follow */ + OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; +} spgxlogVacuumRoot; + +#define SizeOfSpgxlogVacuumRoot offsetof(spgxlogVacuumRoot, offsets) + +typedef struct spgxlogVacuumRedirect +{ + uint16 nToPlaceholder; /* number of redirects to make placeholders */ + OffsetNumber firstPlaceholder; /* first placeholder tuple to remove */ + TransactionId snapshotConflictHorizon; /* newest XID of removed redirects */ + bool isCatalogRel; /* to handle recovery conflict during logical + * decoding on standby */ + + /* offsets of redirect tuples to make placeholders follow */ + OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; +} spgxlogVacuumRedirect; + +#define SizeOfSpgxlogVacuumRedirect offsetof(spgxlogVacuumRedirect, offsets) + +extern void spg_redo(XLogReaderState *record); +extern void spg_desc(StringInfo buf, XLogReaderState *record); +extern const char *spg_identify(uint8 info); +extern void spg_xlog_startup(void); +extern void spg_xlog_cleanup(void); +extern void spg_mask(char *pagedata, BlockNumber blkno); + +#endif /* SPGXLOG_H */ diff --git a/src/include/access/stratnum.h b/src/include/access/stratnum.h new file mode 100644 index 0000000..1b29653 --- /dev/null +++ b/src/include/access/stratnum.h @@ -0,0 +1,85 @@ +/*------------------------------------------------------------------------- + * + * stratnum.h + * POSTGRES strategy number definitions. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/stratnum.h + * + *------------------------------------------------------------------------- + */ +#ifndef STRATNUM_H +#define STRATNUM_H + +/* + * Strategy numbers identify the semantics that particular operators have + * with respect to particular operator classes. In some cases a strategy + * subtype (an OID) is used as further information. + */ +typedef uint16 StrategyNumber; + +#define InvalidStrategy ((StrategyNumber) 0) + +/* + * Strategy numbers for B-tree indexes. + */ +#define BTLessStrategyNumber 1 +#define BTLessEqualStrategyNumber 2 +#define BTEqualStrategyNumber 3 +#define BTGreaterEqualStrategyNumber 4 +#define BTGreaterStrategyNumber 5 + +#define BTMaxStrategyNumber 5 + +/* + * Strategy numbers for hash indexes. There's only one valid strategy for + * hashing: equality. 
+ */ +#define HTEqualStrategyNumber 1 + +#define HTMaxStrategyNumber 1 + +/* + * Strategy numbers common to (some) GiST, SP-GiST and BRIN opclasses. + * + * The first few of these come from the R-Tree indexing method (hence the + * names); the others have been added over time as they have been needed. + */ +#define RTLeftStrategyNumber 1 /* for << */ +#define RTOverLeftStrategyNumber 2 /* for &< */ +#define RTOverlapStrategyNumber 3 /* for && */ +#define RTOverRightStrategyNumber 4 /* for &> */ +#define RTRightStrategyNumber 5 /* for >> */ +#define RTSameStrategyNumber 6 /* for ~= */ +#define RTContainsStrategyNumber 7 /* for @> */ +#define RTContainedByStrategyNumber 8 /* for <@ */ +#define RTOverBelowStrategyNumber 9 /* for &<| */ +#define RTBelowStrategyNumber 10 /* for <<| */ +#define RTAboveStrategyNumber 11 /* for |>> */ +#define RTOverAboveStrategyNumber 12 /* for |&> */ +#define RTOldContainsStrategyNumber 13 /* for old spelling of @> */ +#define RTOldContainedByStrategyNumber 14 /* for old spelling of <@ */ +#define RTKNNSearchStrategyNumber 15 /* for <-> (distance) */ +#define RTContainsElemStrategyNumber 16 /* for range types @> elem */ +#define RTAdjacentStrategyNumber 17 /* for -|- */ +#define RTEqualStrategyNumber 18 /* for = */ +#define RTNotEqualStrategyNumber 19 /* for != */ +#define RTLessStrategyNumber 20 /* for < */ +#define RTLessEqualStrategyNumber 21 /* for <= */ +#define RTGreaterStrategyNumber 22 /* for > */ +#define RTGreaterEqualStrategyNumber 23 /* for >= */ +#define RTSubStrategyNumber 24 /* for inet >> */ +#define RTSubEqualStrategyNumber 25 /* for inet <<= */ +#define RTSuperStrategyNumber 26 /* for inet << */ +#define RTSuperEqualStrategyNumber 27 /* for inet >>= */ +#define RTPrefixStrategyNumber 28 /* for text ^@ */ +#define RTOldBelowStrategyNumber 29 /* for old spelling of <<| */ +#define RTOldAboveStrategyNumber 30 /* for old spelling of |>> */ + +#define RTMaxStrategyNumber 30 + + +#endif /* STRATNUM_H */ diff --git a/src/include/access/subtrans.h b/src/include/access/subtrans.h new file mode 100644 index 0000000..46a473c --- /dev/null +++ b/src/include/access/subtrans.h @@ -0,0 +1,29 @@ +/* + * subtrans.h + * + * PostgreSQL subtransaction-log manager + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/subtrans.h + */ +#ifndef SUBTRANS_H +#define SUBTRANS_H + +/* Number of SLRU buffers to use for subtrans */ +#define NUM_SUBTRANS_BUFFERS 32 + +extern void SubTransSetParent(TransactionId xid, TransactionId parent); +extern TransactionId SubTransGetParent(TransactionId xid); +extern TransactionId SubTransGetTopmostTransaction(TransactionId xid); + +extern Size SUBTRANSShmemSize(void); +extern void SUBTRANSShmemInit(void); +extern void BootStrapSUBTRANS(void); +extern void StartupSUBTRANS(TransactionId oldestActiveXID); +extern void CheckPointSUBTRANS(void); +extern void ExtendSUBTRANS(TransactionId newestXact); +extern void TruncateSUBTRANS(TransactionId oldestXact); + +#endif /* SUBTRANS_H */ diff --git a/src/include/access/syncscan.h b/src/include/access/syncscan.h new file mode 100644 index 0000000..01a8eec --- /dev/null +++ b/src/include/access/syncscan.h @@ -0,0 +1,25 @@ +/*------------------------------------------------------------------------- + * + * syncscan.h + * POSTGRES synchronous scan support functions. 
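As an illustrative aside (editor's sketch, not part of this patch): an opclass support function typically switches on these strategy numbers to decide which predicate to evaluate. my_consistent, my_overlaps and my_contains below are hypothetical names; only the RT*StrategyNumber constants and the StrategyNumber typedef come from stratnum.h above.

#include "access/stratnum.h"

static bool my_overlaps(const void *a, const void *b);	/* assumed helpers */
static bool my_contains(const void *outer, const void *inner);

static bool
my_consistent(StrategyNumber strategy, const void *key, const void *query)
{
	switch (strategy)
	{
		case RTOverlapStrategyNumber:	/* && */
			return my_overlaps(key, query);
		case RTContainsStrategyNumber:	/* @> */
			return my_contains(key, query);
		case RTContainedByStrategyNumber:	/* <@ */
			return my_contains(query, key);
		case RTSameStrategyNumber:	/* ~= */
			return my_contains(key, query) && my_contains(query, key);
		default:
			return false;		/* strategy not handled by this sketch */
	}
}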
+ * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/syncscan.h + * + *------------------------------------------------------------------------- + */ +#ifndef SYNCSCAN_H +#define SYNCSCAN_H + +#include "storage/block.h" +#include "utils/relcache.h" + +extern void ss_report_location(Relation rel, BlockNumber location); +extern BlockNumber ss_get_location(Relation rel, BlockNumber relnblocks); +extern void SyncScanShmemInit(void); +extern Size SyncScanShmemSize(void); + +#endif diff --git a/src/include/access/sysattr.h b/src/include/access/sysattr.h new file mode 100644 index 0000000..8f08682 --- /dev/null +++ b/src/include/access/sysattr.h @@ -0,0 +1,29 @@ +/*------------------------------------------------------------------------- + * + * sysattr.h + * POSTGRES system attribute definitions. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/sysattr.h + * + *------------------------------------------------------------------------- + */ +#ifndef SYSATTR_H +#define SYSATTR_H + + +/* + * Attribute numbers for the system-defined attributes + */ +#define SelfItemPointerAttributeNumber (-1) +#define MinTransactionIdAttributeNumber (-2) +#define MinCommandIdAttributeNumber (-3) +#define MaxTransactionIdAttributeNumber (-4) +#define MaxCommandIdAttributeNumber (-5) +#define TableOidAttributeNumber (-6) +#define FirstLowInvalidHeapAttributeNumber (-7) + +#endif /* SYSATTR_H */ diff --git a/src/include/access/table.h b/src/include/access/table.h new file mode 100644 index 0000000..e664c7b --- /dev/null +++ b/src/include/access/table.h @@ -0,0 +1,28 @@ +/*------------------------------------------------------------------------- + * + * table.h + * Generic routines for table related code. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/table.h + * + *------------------------------------------------------------------------- + */ +#ifndef TABLE_H +#define TABLE_H + +#include "nodes/primnodes.h" +#include "storage/lockdefs.h" +#include "utils/relcache.h" + +extern Relation table_open(Oid relationId, LOCKMODE lockmode); +extern Relation table_openrv(const RangeVar *relation, LOCKMODE lockmode); +extern Relation table_openrv_extended(const RangeVar *relation, + LOCKMODE lockmode, bool missing_ok); +extern Relation try_table_open(Oid relationId, LOCKMODE lockmode); +extern void table_close(Relation relation, LOCKMODE lockmode); + +#endif /* TABLE_H */ diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h new file mode 100644 index 0000000..ac7b279 --- /dev/null +++ b/src/include/access/tableam.h @@ -0,0 +1,2100 @@ +/*------------------------------------------------------------------------- + * + * tableam.h + * POSTGRES table access method definitions. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/tableam.h + * + * NOTES + * See tableam.sgml for higher level documentation. 
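As an illustrative aside (editor's sketch, not part of this patch): the table.h entry points above follow the usual open-with-lock / close-releasing-lock pattern. touch_relation is a hypothetical caller.

#include "access/table.h"
#include "storage/lockdefs.h"

static void
touch_relation(Oid relid)
{
	Relation	rel = table_open(relid, AccessShareLock);

	/* ... read whatever is needed from rel ... */

	table_close(rel, AccessShareLock);	/* also releases the lock */
}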
+ * + *------------------------------------------------------------------------- + */ +#ifndef TABLEAM_H +#define TABLEAM_H + +#include "access/relscan.h" +#include "access/sdir.h" +#include "access/xact.h" +#include "executor/tuptable.h" +#include "utils/rel.h" +#include "utils/snapshot.h" + + +#define DEFAULT_TABLE_ACCESS_METHOD "heap" + +/* GUCs */ +extern PGDLLIMPORT char *default_table_access_method; +extern PGDLLIMPORT bool synchronize_seqscans; + + +struct BulkInsertStateData; +struct IndexInfo; +struct SampleScanState; +struct TBMIterateResult; +struct VacuumParams; +struct ValidateIndexState; + +/* + * Bitmask values for the flags argument to the scan_begin callback. + */ +typedef enum ScanOptions +{ + /* one of SO_TYPE_* may be specified */ + SO_TYPE_SEQSCAN = 1 << 0, + SO_TYPE_BITMAPSCAN = 1 << 1, + SO_TYPE_SAMPLESCAN = 1 << 2, + SO_TYPE_TIDSCAN = 1 << 3, + SO_TYPE_TIDRANGESCAN = 1 << 4, + SO_TYPE_ANALYZE = 1 << 5, + + /* several of SO_ALLOW_* may be specified */ + /* allow or disallow use of access strategy */ + SO_ALLOW_STRAT = 1 << 6, + /* report location to syncscan logic? */ + SO_ALLOW_SYNC = 1 << 7, + /* verify visibility page-at-a-time? */ + SO_ALLOW_PAGEMODE = 1 << 8, + + /* unregister snapshot at scan end? */ + SO_TEMP_SNAPSHOT = 1 << 9 +} ScanOptions; + +/* + * Result codes for table_{update,delete,lock_tuple}, and for visibility + * routines inside table AMs. + */ +typedef enum TM_Result +{ + /* + * Signals that the action succeeded (i.e. update/delete performed, lock + * was acquired) + */ + TM_Ok, + + /* The affected tuple wasn't visible to the relevant snapshot */ + TM_Invisible, + + /* The affected tuple was already modified by the calling backend */ + TM_SelfModified, + + /* + * The affected tuple was updated by another transaction. This includes + * the case where tuple was moved to another partition. + */ + TM_Updated, + + /* The affected tuple was deleted by another transaction */ + TM_Deleted, + + /* + * The affected tuple is currently being modified by another session. This + * will only be returned if table_(update/delete/lock_tuple) are + * instructed not to wait. + */ + TM_BeingModified, + + /* lock couldn't be acquired, action skipped. Only used by lock_tuple */ + TM_WouldBlock +} TM_Result; + +/* + * Result codes for table_update(..., update_indexes*..). + * Used to determine which indexes to update. + */ +typedef enum TU_UpdateIndexes +{ + /* No indexed columns were updated (incl. TID addressing of tuple) */ + TU_None, + + /* A non-summarizing indexed column was updated, or the TID has changed */ + TU_All, + + /* Only summarized columns were updated, TID is unchanged */ + TU_Summarizing +} TU_UpdateIndexes; + +/* + * When table_tuple_update, table_tuple_delete, or table_tuple_lock fail + * because the target tuple is already outdated, they fill in this struct to + * provide information to the caller about what happened. + * + * ctid is the target's ctid link: it is the same as the target's TID if the + * target was deleted, or the location of the replacement tuple if the target + * was updated. + * + * xmax is the outdating transaction's XID. If the caller wants to visit the + * replacement tuple, it must check that this matches before believing the + * replacement is really a match. + * + * cmax is the outdating command's CID, but only when the failure code is + * TM_SelfModified (i.e., something in the current transaction outdated the + * tuple); otherwise cmax is zero. 
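As an illustrative aside (editor's sketch, not part of this patch): callers of the update/delete/lock paths usually branch on TM_Result roughly as below; report_modify_result is a hypothetical helper and the reactions shown are only the common ones.

static void
report_modify_result(TM_Result result, const TM_FailureData *tmfd)
{
	switch (result)
	{
		case TM_Ok:
			break;				/* change applied */
		case TM_SelfModified:
			/* already changed by this backend; see tmfd->cmax */
			break;
		case TM_Updated:
		case TM_Deleted:
			/* concurrently outdated; tmfd->ctid / tmfd->xmax describe it */
			break;
		case TM_BeingModified:
			/* a no-wait caller found a concurrent modifier still running */
			break;
		default:
			break;				/* TM_Invisible, TM_WouldBlock, ... */
	}
}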
(We make this restriction because + * HeapTupleHeaderGetCmax doesn't work for tuples outdated in other + * transactions.) + */ +typedef struct TM_FailureData +{ + ItemPointerData ctid; + TransactionId xmax; + CommandId cmax; + bool traversed; +} TM_FailureData; + +/* + * State used when calling table_index_delete_tuples(). + * + * Represents the status of table tuples, referenced by table TID and taken by + * index AM from index tuples. State consists of high level parameters of the + * deletion operation, plus two mutable palloc()'d arrays for information + * about the status of individual table tuples. These are conceptually one + * single array. Using two arrays keeps the TM_IndexDelete struct small, + * which makes sorting the first array (the deltids array) fast. + * + * Some index AM callers perform simple index tuple deletion (by specifying + * bottomup = false), and include only known-dead deltids. These known-dead + * entries are all marked knowndeletable = true directly (typically these are + * TIDs from LP_DEAD-marked index tuples), but that isn't strictly required. + * + * Callers that specify bottomup = true are "bottom-up index deletion" + * callers. The considerations for the tableam are more subtle with these + * callers because they ask the tableam to perform highly speculative work, + * and might only expect the tableam to check a small fraction of all entries. + * Caller is not allowed to specify knowndeletable = true for any entry + * because everything is highly speculative. Bottom-up caller provides + * context and hints to tableam -- see comments below for details on how index + * AMs and tableams should coordinate during bottom-up index deletion. + * + * Simple index deletion callers may ask the tableam to perform speculative + * work, too. This is a little like bottom-up deletion, but not too much. + * The tableam will only perform speculative work when it's practically free + * to do so in passing for simple deletion caller (while always performing + * whatever work is needed to enable knowndeletable/LP_DEAD index tuples to + * be deleted within index AM). This is the real reason why it's possible for + * simple index deletion caller to specify knowndeletable = false up front + * (this means "check if it's possible for me to delete corresponding index + * tuple when it's cheap to do so in passing"). The index AM should only + * include "extra" entries for index tuples whose TIDs point to a table block + * that tableam is expected to have to visit anyway (in the event of a block + * orientated tableam). The tableam isn't strictly obligated to check these + * "extra" TIDs, but a block-based AM should always manage to do so in + * practice. + * + * The final contents of the deltids/status arrays are interesting to callers + * that ask tableam to perform speculative work (i.e. when _any_ items have + * knowndeletable set to false up front). These index AM callers will + * naturally need to consult final state to determine which index tuples are + * in fact deletable. + * + * The index AM can keep track of which index tuple relates to which deltid by + * setting idxoffnum (and/or relying on each entry being uniquely identifiable + * using tid), which is important when the final contents of the array will + * need to be interpreted -- the array can shrink from initial size after + * tableam processing and/or have entries in a new order (tableam may sort + * deltids array for its own reasons). 
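As an illustrative aside (editor's sketch, not part of this patch): a simple-deletion caller would set up the TM_IndexDeleteOp defined just below along these lines before handing it to the index_delete_tuples callback; init_simple_delete and maxitems are assumed names, and the palloc sizing is only a sketch.

static void
init_simple_delete(TM_IndexDeleteOp *delstate, Relation irel,
				   BlockNumber iblknum, int maxitems)
{
	delstate->irel = irel;
	delstate->iblknum = iblknum;
	delstate->bottomup = false; /* simple, not bottom-up, deletion */
	delstate->bottomupfreespace = 0;	/* only used when bottomup */
	delstate->ndeltids = 0;
	delstate->deltids = palloc(maxitems * sizeof(TM_IndexDelete));
	delstate->status = palloc(maxitems * sizeof(TM_IndexStatus));
}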
Bottom-up callers may find that final + * ndeltids is 0 on return from call to tableam, in which case no index tuple + * deletions are possible. Simple deletion callers can rely on any entries + * they know to be deletable appearing in the final array as deletable. + */ +typedef struct TM_IndexDelete +{ + ItemPointerData tid; /* table TID from index tuple */ + int16 id; /* Offset into TM_IndexStatus array */ +} TM_IndexDelete; + +typedef struct TM_IndexStatus +{ + OffsetNumber idxoffnum; /* Index am page offset number */ + bool knowndeletable; /* Currently known to be deletable? */ + + /* Bottom-up index deletion specific fields follow */ + bool promising; /* Promising (duplicate) index tuple? */ + int16 freespace; /* Space freed in index if deleted */ +} TM_IndexStatus; + +/* + * Index AM/tableam coordination is central to the design of bottom-up index + * deletion. The index AM provides hints about where to look to the tableam + * by marking some entries as "promising". Index AM does this with duplicate + * index tuples that are strongly suspected to be old versions left behind by + * UPDATEs that did not logically modify indexed values. Index AM may find it + * helpful to only mark entries as promising when they're thought to have been + * affected by such an UPDATE in the recent past. + * + * Bottom-up index deletion casts a wide net at first, usually by including + * all TIDs on a target index page. It is up to the tableam to worry about + * the cost of checking transaction status information. The tableam is in + * control, but needs careful guidance from the index AM. Index AM requests + * that bottomupfreespace target be met, while tableam measures progress + * towards that goal by tallying the per-entry freespace value for known + * deletable entries. (All !bottomup callers can just set these space related + * fields to zero.) + */ +typedef struct TM_IndexDeleteOp +{ + Relation irel; /* Target index relation */ + BlockNumber iblknum; /* Index block number (for error reports) */ + bool bottomup; /* Bottom-up (not simple) deletion? */ + int bottomupfreespace; /* Bottom-up space target */ + + /* Mutable per-TID information follows (index AM initializes entries) */ + int ndeltids; /* Current # of deltids/status elements */ + TM_IndexDelete *deltids; + TM_IndexStatus *status; +} TM_IndexDeleteOp; + +/* "options" flag bits for table_tuple_insert */ +/* TABLE_INSERT_SKIP_WAL was 0x0001; RelationNeedsWAL() now governs */ +#define TABLE_INSERT_SKIP_FSM 0x0002 +#define TABLE_INSERT_FROZEN 0x0004 +#define TABLE_INSERT_NO_LOGICAL 0x0008 + +/* flag bits for table_tuple_lock */ +/* Follow tuples whose update is in progress if lock modes don't conflict */ +#define TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS (1 << 0) +/* Follow update chain and lock latest version of tuple */ +#define TUPLE_LOCK_FLAG_FIND_LAST_VERSION (1 << 1) + + +/* Typedef for callback function for table_index_build_scan */ +typedef void (*IndexBuildCallback) (Relation index, + ItemPointer tid, + Datum *values, + bool *isnull, + bool tupleIsAlive, + void *state); + +/* + * API struct for a table AM. Note this must be allocated in a + * server-lifetime manner, typically as a static const struct, which then gets + * returned by FormData_pg_am.amhandler. + * + * In most cases it's not appropriate to call the callbacks directly, use the + * table_* wrapper functions instead. + * + * GetTableAmRoutine() asserts that required callbacks are filled in, remember + * to update when adding a callback. 
+ */ +typedef struct TableAmRoutine +{ + /* this must be set to T_TableAmRoutine */ + NodeTag type; + + + /* ------------------------------------------------------------------------ + * Slot related callbacks. + * ------------------------------------------------------------------------ + */ + + /* + * Return slot implementation suitable for storing a tuple of this AM. + */ + const TupleTableSlotOps *(*slot_callbacks) (Relation rel); + + + /* ------------------------------------------------------------------------ + * Table scan callbacks. + * ------------------------------------------------------------------------ + */ + + /* + * Start a scan of `rel`. The callback has to return a TableScanDesc, + * which will typically be embedded in a larger, AM specific, struct. + * + * If nkeys != 0, the results need to be filtered by those scan keys. + * + * pscan, if not NULL, will have already been initialized with + * parallelscan_initialize(), and has to be for the same relation. Will + * only be set coming from table_beginscan_parallel(). + * + * `flags` is a bitmask indicating the type of scan (ScanOptions's + * SO_TYPE_*, currently only one may be specified), options controlling + * the scan's behaviour (ScanOptions's SO_ALLOW_*, several may be + * specified, an AM may ignore unsupported ones) and whether the snapshot + * needs to be deallocated at scan_end (ScanOptions's SO_TEMP_SNAPSHOT). + */ + TableScanDesc (*scan_begin) (Relation rel, + Snapshot snapshot, + int nkeys, struct ScanKeyData *key, + ParallelTableScanDesc pscan, + uint32 flags); + + /* + * Release resources and deallocate scan. If TableScanDesc.temp_snap, + * TableScanDesc.rs_snapshot needs to be unregistered. + */ + void (*scan_end) (TableScanDesc scan); + + /* + * Restart relation scan. If set_params is set to true, allow_{strat, + * sync, pagemode} (see scan_begin) changes should be taken into account. + */ + void (*scan_rescan) (TableScanDesc scan, struct ScanKeyData *key, + bool set_params, bool allow_strat, + bool allow_sync, bool allow_pagemode); + + /* + * Return next tuple from `scan`, store in slot. + */ + bool (*scan_getnextslot) (TableScanDesc scan, + ScanDirection direction, + TupleTableSlot *slot); + + /*----------- + * Optional functions to provide scanning for ranges of ItemPointers. + * Implementations must either provide both of these functions, or neither + * of them. + * + * Implementations of scan_set_tidrange must themselves handle + * ItemPointers of any value. i.e, they must handle each of the following: + * + * 1) mintid or maxtid is beyond the end of the table; and + * 2) mintid is above maxtid; and + * 3) item offset for mintid or maxtid is beyond the maximum offset + * allowed by the AM. + * + * Implementations can assume that scan_set_tidrange is always called + * before scan_getnextslot_tidrange or after scan_rescan and before any + * further calls to scan_getnextslot_tidrange. + */ + void (*scan_set_tidrange) (TableScanDesc scan, + ItemPointer mintid, + ItemPointer maxtid); + + /* + * Return next tuple from `scan` that's in the range of TIDs defined by + * scan_set_tidrange. + */ + bool (*scan_getnextslot_tidrange) (TableScanDesc scan, + ScanDirection direction, + TupleTableSlot *slot); + + /* ------------------------------------------------------------------------ + * Parallel table scan related functions. + * ------------------------------------------------------------------------ + */ + + /* + * Estimate the size of shared memory needed for a parallel scan of this + * relation. 
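As an illustrative aside (editor's sketch, not part of this patch): these scan callbacks are normally driven through the table_beginscan / table_scan_getnextslot / table_endscan wrappers declared further down in this header; scan_all_tuples is a hypothetical caller.

static void
scan_all_tuples(Relation rel, Snapshot snapshot)
{
	TupleTableSlot *slot = table_slot_create(rel, NULL);
	TableScanDesc scan = table_beginscan(rel, snapshot, 0, NULL);

	while (table_scan_getnextslot(scan, ForwardScanDirection, slot))
	{
		/* ... process the tuple now stored in slot ... */
	}

	table_endscan(scan);
	ExecDropSingleTupleTableSlot(slot);
}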
The snapshot does not need to be accounted for. + */ + Size (*parallelscan_estimate) (Relation rel); + + /* + * Initialize ParallelTableScanDesc for a parallel scan of this relation. + * `pscan` will be sized according to parallelscan_estimate() for the same + * relation. + */ + Size (*parallelscan_initialize) (Relation rel, + ParallelTableScanDesc pscan); + + /* + * Reinitialize `pscan` for a new scan. `rel` will be the same relation as + * when `pscan` was initialized by parallelscan_initialize. + */ + void (*parallelscan_reinitialize) (Relation rel, + ParallelTableScanDesc pscan); + + + /* ------------------------------------------------------------------------ + * Index Scan Callbacks + * ------------------------------------------------------------------------ + */ + + /* + * Prepare to fetch tuples from the relation, as needed when fetching + * tuples for an index scan. The callback has to return an + * IndexFetchTableData, which the AM will typically embed in a larger + * structure with additional information. + * + * Tuples for an index scan can then be fetched via index_fetch_tuple. + */ + struct IndexFetchTableData *(*index_fetch_begin) (Relation rel); + + /* + * Reset index fetch. Typically this will release cross index fetch + * resources held in IndexFetchTableData. + */ + void (*index_fetch_reset) (struct IndexFetchTableData *data); + + /* + * Release resources and deallocate index fetch. + */ + void (*index_fetch_end) (struct IndexFetchTableData *data); + + /* + * Fetch tuple at `tid` into `slot`, after doing a visibility test + * according to `snapshot`. If a tuple was found and passed the visibility + * test, return true, false otherwise. + * + * Note that AMs that do not necessarily update indexes when indexed + * columns do not change, need to return the current/correct version of + * the tuple that is visible to the snapshot, even if the tid points to an + * older version of the tuple. + * + * *call_again is false on the first call to index_fetch_tuple for a tid. + * If there potentially is another tuple matching the tid, *call_again + * needs to be set to true by index_fetch_tuple, signaling to the caller + * that index_fetch_tuple should be called again for the same tid. + * + * *all_dead, if all_dead is not NULL, should be set to true by + * index_fetch_tuple iff it is guaranteed that no backend needs to see + * that tuple. Index AMs can use that to avoid returning that tid in + * future searches. + */ + bool (*index_fetch_tuple) (struct IndexFetchTableData *scan, + ItemPointer tid, + Snapshot snapshot, + TupleTableSlot *slot, + bool *call_again, bool *all_dead); + + + /* ------------------------------------------------------------------------ + * Callbacks for non-modifying operations on individual tuples + * ------------------------------------------------------------------------ + */ + + /* + * Fetch tuple at `tid` into `slot`, after doing a visibility test + * according to `snapshot`. If a tuple was found and passed the visibility + * test, returns true, false otherwise. + */ + bool (*tuple_fetch_row_version) (Relation rel, + ItemPointer tid, + Snapshot snapshot, + TupleTableSlot *slot); + + /* + * Is tid valid for a scan of this relation. + */ + bool (*tuple_tid_valid) (TableScanDesc scan, + ItemPointer tid); + + /* + * Return the latest version of the tuple at `tid`, by updating `tid` to + * point at the newest version. + */ + void (*tuple_get_latest_tid) (TableScanDesc scan, + ItemPointer tid); + + /* + * Does the tuple in `slot` satisfy `snapshot`? 
The slot needs to be of + * the appropriate type for the AM. + */ + bool (*tuple_satisfies_snapshot) (Relation rel, + TupleTableSlot *slot, + Snapshot snapshot); + + /* see table_index_delete_tuples() */ + TransactionId (*index_delete_tuples) (Relation rel, + TM_IndexDeleteOp *delstate); + + + /* ------------------------------------------------------------------------ + * Manipulations of physical tuples. + * ------------------------------------------------------------------------ + */ + + /* see table_tuple_insert() for reference about parameters */ + void (*tuple_insert) (Relation rel, TupleTableSlot *slot, + CommandId cid, int options, + struct BulkInsertStateData *bistate); + + /* see table_tuple_insert_speculative() for reference about parameters */ + void (*tuple_insert_speculative) (Relation rel, + TupleTableSlot *slot, + CommandId cid, + int options, + struct BulkInsertStateData *bistate, + uint32 specToken); + + /* see table_tuple_complete_speculative() for reference about parameters */ + void (*tuple_complete_speculative) (Relation rel, + TupleTableSlot *slot, + uint32 specToken, + bool succeeded); + + /* see table_multi_insert() for reference about parameters */ + void (*multi_insert) (Relation rel, TupleTableSlot **slots, int nslots, + CommandId cid, int options, struct BulkInsertStateData *bistate); + + /* see table_tuple_delete() for reference about parameters */ + TM_Result (*tuple_delete) (Relation rel, + ItemPointer tid, + CommandId cid, + Snapshot snapshot, + Snapshot crosscheck, + bool wait, + TM_FailureData *tmfd, + bool changingPart); + + /* see table_tuple_update() for reference about parameters */ + TM_Result (*tuple_update) (Relation rel, + ItemPointer otid, + TupleTableSlot *slot, + CommandId cid, + Snapshot snapshot, + Snapshot crosscheck, + bool wait, + TM_FailureData *tmfd, + LockTupleMode *lockmode, + TU_UpdateIndexes *update_indexes); + + /* see table_tuple_lock() for reference about parameters */ + TM_Result (*tuple_lock) (Relation rel, + ItemPointer tid, + Snapshot snapshot, + TupleTableSlot *slot, + CommandId cid, + LockTupleMode mode, + LockWaitPolicy wait_policy, + uint8 flags, + TM_FailureData *tmfd); + + /* + * Perform operations necessary to complete insertions made via + * tuple_insert and multi_insert with a BulkInsertState specified. In-tree + * access methods ceased to use this. + * + * Typically callers of tuple_insert and multi_insert will just pass all + * the flags that apply to them, and each AM has to decide which of them + * make sense for it, and then only take actions in finish_bulk_insert for + * those flags, and ignore others. + * + * Optional callback. + */ + void (*finish_bulk_insert) (Relation rel, int options); + + + /* ------------------------------------------------------------------------ + * DDL related functionality. + * ------------------------------------------------------------------------ + */ + + /* + * This callback needs to create new relation storage for `rel`, with + * appropriate durability behaviour for `persistence`. + * + * Note that only the subset of the relcache filled by + * RelationBuildLocalRelation() can be relied upon and that the relation's + * catalog entries will either not yet exist (new relation), or will still + * reference the old relfilelocator. + * + * As output *freezeXid, *minmulti must be set to the values appropriate + * for pg_class.{relfrozenxid, relminmxid}. For AMs that don't need those + * fields to be filled they can be set to InvalidTransactionId and + * InvalidMultiXactId, respectively. 
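Not every AM supports all of the modification callbacks above. A hedged sketch of how a minimal or read-only AM (all example_* names are hypothetical) could stub out paths it does not implement, by raising a feature-not-supported error with signatures matching the callback declarations:

/* Sketch only: stub callbacks for a hypothetical AM without write support. */
static TM_Result
example_tuple_delete(Relation rel, ItemPointer tid, CommandId cid,
					 Snapshot snapshot, Snapshot crosscheck, bool wait,
					 TM_FailureData *tmfd, bool changingPart)
{
	ereport(ERROR,
			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
			 errmsg("example_am does not support DELETE")));
	return TM_Ok;				/* unreachable; keeps the compiler quiet */
}

static void
example_tuple_insert_speculative(Relation rel, TupleTableSlot *slot,
								 CommandId cid, int options,
								 struct BulkInsertStateData *bistate,
								 uint32 specToken)
{
	ereport(ERROR,
			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
			 errmsg("example_am does not support INSERT ... ON CONFLICT")));
}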
+ * + * See also table_relation_set_new_filelocator(). + */ + void (*relation_set_new_filelocator) (Relation rel, + const RelFileLocator *newrlocator, + char persistence, + TransactionId *freezeXid, + MultiXactId *minmulti); + + /* + * This callback needs to remove all contents from `rel`'s current + * relfilelocator. No provisions for transactional behaviour need to be + * made. Often this can be implemented by truncating the underlying + * storage to its minimal size. + * + * See also table_relation_nontransactional_truncate(). + */ + void (*relation_nontransactional_truncate) (Relation rel); + + /* + * See table_relation_copy_data(). + * + * This can typically be implemented by directly copying the underlying + * storage, unless it contains references to the tablespace internally. + */ + void (*relation_copy_data) (Relation rel, + const RelFileLocator *newrlocator); + + /* See table_relation_copy_for_cluster() */ + void (*relation_copy_for_cluster) (Relation NewTable, + Relation OldTable, + Relation OldIndex, + bool use_sort, + TransactionId OldestXmin, + TransactionId *xid_cutoff, + MultiXactId *multi_cutoff, + double *num_tuples, + double *tups_vacuumed, + double *tups_recently_dead); + + /* + * React to VACUUM command on the relation. The VACUUM can be triggered by + * a user or by autovacuum. The specific actions performed by the AM will + * depend heavily on the individual AM. + * + * On entry a transaction is already established, and the relation is + * locked with a ShareUpdateExclusive lock. + * + * Note that neither VACUUM FULL (and CLUSTER), nor ANALYZE go through + * this routine, even if (for ANALYZE) it is part of the same VACUUM + * command. + * + * There probably, in the future, needs to be a separate callback to + * integrate with autovacuum's scheduling. + */ + void (*relation_vacuum) (Relation rel, + struct VacuumParams *params, + BufferAccessStrategy bstrategy); + + /* + * Prepare to analyze block `blockno` of `scan`. The scan has been started + * with table_beginscan_analyze(). See also + * table_scan_analyze_next_block(). + * + * The callback may acquire resources like locks that are held until + * table_scan_analyze_next_tuple() returns false. It e.g. can make sense + * to hold a lock until all tuples on a block have been analyzed by + * scan_analyze_next_tuple. + * + * The callback can return false if the block is not suitable for + * sampling, e.g. because it's a metapage that could never contain tuples. + * + * XXX: This obviously is primarily suited for block-based AMs. It's not + * clear what a good interface for non block based AMs would be, so there + * isn't one yet. + */ + bool (*scan_analyze_next_block) (TableScanDesc scan, + BlockNumber blockno, + BufferAccessStrategy bstrategy); + + /* + * See table_scan_analyze_next_tuple(). + * + * Not every AM might have a meaningful concept of dead rows, in which + * case it's OK to not increment *deadrows - but note that that may + * influence autovacuum scheduling (see comment for relation_vacuum + * callback). 
+ */ + bool (*scan_analyze_next_tuple) (TableScanDesc scan, + TransactionId OldestXmin, + double *liverows, + double *deadrows, + TupleTableSlot *slot); + + /* see table_index_build_range_scan for reference about parameters */ + double (*index_build_range_scan) (Relation table_rel, + Relation index_rel, + struct IndexInfo *index_info, + bool allow_sync, + bool anyvisible, + bool progress, + BlockNumber start_blockno, + BlockNumber numblocks, + IndexBuildCallback callback, + void *callback_state, + TableScanDesc scan); + + /* see table_index_validate_scan for reference about parameters */ + void (*index_validate_scan) (Relation table_rel, + Relation index_rel, + struct IndexInfo *index_info, + Snapshot snapshot, + struct ValidateIndexState *state); + + + /* ------------------------------------------------------------------------ + * Miscellaneous functions. + * ------------------------------------------------------------------------ + */ + + /* + * See table_relation_size(). + * + * Note that currently a few callers use the MAIN_FORKNUM size to figure + * out the range of potentially interesting blocks (brin, analyze). It's + * probable that we'll need to revise the interface for those at some + * point. + */ + uint64 (*relation_size) (Relation rel, ForkNumber forkNumber); + + + /* + * This callback should return true if the relation requires a TOAST table + * and false if it does not. It may wish to examine the relation's tuple + * descriptor before making a decision, but if it uses some other method + * of storing large values (or if it does not support them) it can simply + * return false. + */ + bool (*relation_needs_toast_table) (Relation rel); + + /* + * This callback should return the OID of the table AM that implements + * TOAST tables for this AM. If the relation_needs_toast_table callback + * always returns false, this callback is not required. + */ + Oid (*relation_toast_am) (Relation rel); + + /* + * This callback is invoked when detoasting a value stored in a toast + * table implemented by this AM. See table_relation_fetch_toast_slice() + * for more details. + */ + void (*relation_fetch_toast_slice) (Relation toastrel, Oid valueid, + int32 attrsize, + int32 sliceoffset, + int32 slicelength, + struct varlena *result); + + + /* ------------------------------------------------------------------------ + * Planner related functions. + * ------------------------------------------------------------------------ + */ + + /* + * See table_relation_estimate_size(). + * + * While block oriented, it shouldn't be too hard for an AM that doesn't + * internally use blocks to convert into a usable representation. + * + * This differs from the relation_size callback by returning size + * estimates (both relation size and tuple count) for planning purposes, + * rather than returning a currently correct estimate. + */ + void (*relation_estimate_size) (Relation rel, int32 *attr_widths, + BlockNumber *pages, double *tuples, + double *allvisfrac); + + + /* ------------------------------------------------------------------------ + * Executor related functions. + * ------------------------------------------------------------------------ + */ + + /* + * Prepare to fetch / check / return tuples from `tbmres->blockno` as part + * of a bitmap table scan. `scan` was started via table_beginscan_bm(). + * Return false if there are no tuples to be found on the page, true + * otherwise. 
+ * + * This will typically read and pin the target block, and do the necessary + * work to allow scan_bitmap_next_tuple() to return tuples (e.g. it might + * make sense to perform tuple visibility checks at this time). For some + * AMs it will make more sense to do all the work referencing `tbmres` + * contents here, for others it might be better to defer more work to + * scan_bitmap_next_tuple. + * + * If `tbmres->blockno` is -1, this is a lossy scan and all visible tuples + * on the page have to be returned, otherwise the tuples at offsets in + * `tbmres->offsets` need to be returned. + * + * XXX: Currently this may only be implemented if the AM uses md.c as its + * storage manager, and uses ItemPointer->ip_blkid in a manner that maps + * blockids directly to the underlying storage. nodeBitmapHeapscan.c + * performs prefetching directly using that interface. This probably + * needs to be rectified at a later point. + * + * XXX: Currently this may only be implemented if the AM uses the + * visibilitymap, as nodeBitmapHeapscan.c unconditionally accesses it to + * perform prefetching. This probably needs to be rectified at a later + * point. + * + * Optional callback, but either both scan_bitmap_next_block and + * scan_bitmap_next_tuple need to exist, or neither. + */ + bool (*scan_bitmap_next_block) (TableScanDesc scan, + struct TBMIterateResult *tbmres); + + /* + * Fetch the next tuple of a bitmap table scan into `slot` and return true + * if a visible tuple was found, false otherwise. + * + * For some AMs it will make more sense to do all the work referencing + * `tbmres` contents in scan_bitmap_next_block, for others it might be + * better to defer more work to this callback. + * + * Optional callback, but either both scan_bitmap_next_block and + * scan_bitmap_next_tuple need to exist, or neither. + */ + bool (*scan_bitmap_next_tuple) (TableScanDesc scan, + struct TBMIterateResult *tbmres, + TupleTableSlot *slot); + + /* + * Prepare to fetch tuples from the next block in a sample scan. Return + * false if the sample scan is finished, true otherwise. `scan` was + * started via table_beginscan_sampling(). + * + * Typically this will first determine the target block by calling the + * TsmRoutine's NextSampleBlock() callback if not NULL, or alternatively + * perform a sequential scan over all blocks. The determined block is + * then typically read and pinned. + * + * As the TsmRoutine interface is block based, a block needs to be passed + * to NextSampleBlock(). If that's not appropriate for an AM, it + * internally needs to perform mapping between the internal and a block + * based representation. + * + * Note that it's not acceptable to hold deadlock prone resources such as + * lwlocks until scan_sample_next_tuple() has exhausted the tuples on the + * block - the tuple is likely to be returned to an upper query node, and + * the next call could be off a long while. Holding buffer pins and such + * is obviously OK. + * + * Currently it is required to implement this interface, as there's no + * alternative way (contrary e.g. to bitmap scans) to implement sample + * scans. If infeasible to implement, the AM may raise an error. + */ + bool (*scan_sample_next_block) (TableScanDesc scan, + struct SampleScanState *scanstate); + + /* + * This callback, only called after scan_sample_next_block has returned + * true, should determine the next tuple to be returned from the selected + * block using the TsmRoutine's NextSampleTuple() callback. 
+ * + * The callback needs to perform visibility checks, and only return + * visible tuples. That obviously can mean calling NextSampleTuple() + * multiple times. + * + * The TsmRoutine interface assumes that there's a maximum offset on a + * given page, so if that doesn't apply to an AM, it needs to emulate that + * assumption somehow. + */ + bool (*scan_sample_next_tuple) (TableScanDesc scan, + struct SampleScanState *scanstate, + TupleTableSlot *slot); + +} TableAmRoutine; + + +/* ---------------------------------------------------------------------------- + * Slot functions. + * ---------------------------------------------------------------------------- + */ + +/* + * Returns slot callbacks suitable for holding tuples of the appropriate type + * for the relation. Works for tables, views, foreign tables and partitioned + * tables. + */ +extern const TupleTableSlotOps *table_slot_callbacks(Relation relation); + +/* + * Returns slot using the callbacks returned by table_slot_callbacks(), and + * registers it on *reglist. + */ +extern TupleTableSlot *table_slot_create(Relation relation, List **reglist); + + +/* ---------------------------------------------------------------------------- + * Table scan functions. + * ---------------------------------------------------------------------------- + */ + +/* + * Start a scan of `rel`. Returned tuples pass a visibility test of + * `snapshot`, and if nkeys != 0, the results are filtered by those scan keys. + */ +static inline TableScanDesc +table_beginscan(Relation rel, Snapshot snapshot, + int nkeys, struct ScanKeyData *key) +{ + uint32 flags = SO_TYPE_SEQSCAN | + SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; + + return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); +} + +/* + * Like table_beginscan(), but for scanning catalog. It'll automatically use a + * snapshot appropriate for scanning catalog relations. + */ +extern TableScanDesc table_beginscan_catalog(Relation relation, int nkeys, + struct ScanKeyData *key); + +/* + * Like table_beginscan(), but table_beginscan_strat() offers an extended API + * that lets the caller control whether a nondefault buffer access strategy + * can be used, and whether syncscan can be chosen (possibly resulting in the + * scan not starting from block zero). Both of these default to true with + * plain table_beginscan. + */ +static inline TableScanDesc +table_beginscan_strat(Relation rel, Snapshot snapshot, + int nkeys, struct ScanKeyData *key, + bool allow_strat, bool allow_sync) +{ + uint32 flags = SO_TYPE_SEQSCAN | SO_ALLOW_PAGEMODE; + + if (allow_strat) + flags |= SO_ALLOW_STRAT; + if (allow_sync) + flags |= SO_ALLOW_SYNC; + + return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); +} + +/* + * table_beginscan_bm is an alternative entry point for setting up a + * TableScanDesc for a bitmap heap scan. Although that scan technology is + * really quite unlike a standard seqscan, there is just enough commonality to + * make it worth using the same data structure. + */ +static inline TableScanDesc +table_beginscan_bm(Relation rel, Snapshot snapshot, + int nkeys, struct ScanKeyData *key) +{ + uint32 flags = SO_TYPE_BITMAPSCAN | SO_ALLOW_PAGEMODE; + + return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); +} + +/* + * table_beginscan_sampling is an alternative entry point for setting up a + * TableScanDesc for a TABLESAMPLE scan. As with bitmap scans, it's worth + * using the same data structure although the behavior is rather different. 
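As the struct's introductory comment notes, a TableAmRoutine is normally a single static const struct returned from an SQL-callable handler. A hedged sketch of that boilerplate follows; every example_* symbol is hypothetical, the forward declarations stand in for implementations not shown, and a real handler must fill in every callback that GetTableAmRoutine() asserts on, not just the few listed.

/* Sketch only: handler boilerplate for a hypothetical "example_am". */
#include "postgres.h"
#include "access/tableam.h"
#include "fmgr.h"

PG_MODULE_MAGIC;

PG_FUNCTION_INFO_V1(example_am_handler);

/* callback implementations not shown here */
static const TupleTableSlotOps *example_slot_callbacks(Relation rel);
static TableScanDesc example_scan_begin(Relation rel, Snapshot snapshot,
										int nkeys, struct ScanKeyData *key,
										ParallelTableScanDesc pscan,
										uint32 flags);

static const TableAmRoutine example_am_methods = {
	.type = T_TableAmRoutine,

	.slot_callbacks = example_slot_callbacks,
	.scan_begin = example_scan_begin,
	/* ... every other required callback must be filled in as well ... */
};

Datum
example_am_handler(PG_FUNCTION_ARGS)
{
	PG_RETURN_POINTER(&example_am_methods);
}

The handler would then be exposed with something along the lines of CREATE FUNCTION example_am_handler(internal) RETURNS table_am_handler and CREATE ACCESS METHOD example_am TYPE TABLE HANDLER example_am_handler (again, illustrative names only).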
+ * In addition to the options offered by table_beginscan_strat, this call + * also allows control of whether page-mode visibility checking is used. + */ +static inline TableScanDesc +table_beginscan_sampling(Relation rel, Snapshot snapshot, + int nkeys, struct ScanKeyData *key, + bool allow_strat, bool allow_sync, + bool allow_pagemode) +{ + uint32 flags = SO_TYPE_SAMPLESCAN; + + if (allow_strat) + flags |= SO_ALLOW_STRAT; + if (allow_sync) + flags |= SO_ALLOW_SYNC; + if (allow_pagemode) + flags |= SO_ALLOW_PAGEMODE; + + return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); +} + +/* + * table_beginscan_tid is an alternative entry point for setting up a + * TableScanDesc for a Tid scan. As with bitmap scans, it's worth using + * the same data structure although the behavior is rather different. + */ +static inline TableScanDesc +table_beginscan_tid(Relation rel, Snapshot snapshot) +{ + uint32 flags = SO_TYPE_TIDSCAN; + + return rel->rd_tableam->scan_begin(rel, snapshot, 0, NULL, NULL, flags); +} + +/* + * table_beginscan_analyze is an alternative entry point for setting up a + * TableScanDesc for an ANALYZE scan. As with bitmap scans, it's worth using + * the same data structure although the behavior is rather different. + */ +static inline TableScanDesc +table_beginscan_analyze(Relation rel) +{ + uint32 flags = SO_TYPE_ANALYZE; + + return rel->rd_tableam->scan_begin(rel, NULL, 0, NULL, NULL, flags); +} + +/* + * End relation scan. + */ +static inline void +table_endscan(TableScanDesc scan) +{ + scan->rs_rd->rd_tableam->scan_end(scan); +} + +/* + * Restart a relation scan. + */ +static inline void +table_rescan(TableScanDesc scan, + struct ScanKeyData *key) +{ + scan->rs_rd->rd_tableam->scan_rescan(scan, key, false, false, false, false); +} + +/* + * Restart a relation scan after changing params. + * + * This call allows changing the buffer strategy, syncscan, and pagemode + * options before starting a fresh scan. Note that although the actual use of + * syncscan might change (effectively, enabling or disabling reporting), the + * previously selected startblock will be kept. + */ +static inline void +table_rescan_set_params(TableScanDesc scan, struct ScanKeyData *key, + bool allow_strat, bool allow_sync, bool allow_pagemode) +{ + scan->rs_rd->rd_tableam->scan_rescan(scan, key, true, + allow_strat, allow_sync, + allow_pagemode); +} + +/* + * Update snapshot used by the scan. + */ +extern void table_scan_update_snapshot(TableScanDesc scan, Snapshot snapshot); + +/* + * Return next tuple from `scan`, store in slot. + */ +static inline bool +table_scan_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot) +{ + slot->tts_tableOid = RelationGetRelid(sscan->rs_rd); + + /* We don't expect actual scans using NoMovementScanDirection */ + Assert(direction == ForwardScanDirection || + direction == BackwardScanDirection); + + /* + * We don't expect direct calls to table_scan_getnextslot with valid + * CheckXidAlive for catalog or regular tables. See detailed comments in + * xact.c where these variables are declared. + */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected table_scan_getnextslot call during logical decoding"); + + return sscan->rs_rd->rd_tableam->scan_getnextslot(sscan, direction, slot); +} + +/* ---------------------------------------------------------------------------- + * TID Range scanning related functions. 
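Putting the sequential-scan wrappers above together, a caller-side loop typically looks like the following hedged sketch; the relation OID and the per-tuple work are placeholders, and error handling is omitted.

/* Sketch only: scan every visible tuple of a relation via the tableam API. */
#include "postgres.h"
#include "access/table.h"
#include "access/tableam.h"
#include "executor/tuptable.h"
#include "utils/snapmgr.h"

static void
example_scan_relation(Oid relid)
{
	Relation	rel = table_open(relid, AccessShareLock);
	TupleTableSlot *slot = table_slot_create(rel, NULL);
	TableScanDesc scan = table_beginscan(rel, GetTransactionSnapshot(),
										 0, NULL);

	while (table_scan_getnextslot(scan, ForwardScanDirection, slot))
	{
		slot_getallattrs(slot);	/* deform all columns into the slot arrays */
		/* ... inspect slot->tts_values[] / slot->tts_isnull[] here ... */
	}

	table_endscan(scan);
	ExecDropSingleTupleTableSlot(slot);
	table_close(rel, AccessShareLock);
}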
+ * ---------------------------------------------------------------------------- + */ + +/* + * table_beginscan_tidrange is the entry point for setting up a TableScanDesc + * for a TID range scan. + */ +static inline TableScanDesc +table_beginscan_tidrange(Relation rel, Snapshot snapshot, + ItemPointer mintid, + ItemPointer maxtid) +{ + TableScanDesc sscan; + uint32 flags = SO_TYPE_TIDRANGESCAN | SO_ALLOW_PAGEMODE; + + sscan = rel->rd_tableam->scan_begin(rel, snapshot, 0, NULL, NULL, flags); + + /* Set the range of TIDs to scan */ + sscan->rs_rd->rd_tableam->scan_set_tidrange(sscan, mintid, maxtid); + + return sscan; +} + +/* + * table_rescan_tidrange resets the scan position and sets the minimum and + * maximum TID range to scan for a TableScanDesc created by + * table_beginscan_tidrange. + */ +static inline void +table_rescan_tidrange(TableScanDesc sscan, ItemPointer mintid, + ItemPointer maxtid) +{ + /* Ensure table_beginscan_tidrange() was used. */ + Assert((sscan->rs_flags & SO_TYPE_TIDRANGESCAN) != 0); + + sscan->rs_rd->rd_tableam->scan_rescan(sscan, NULL, false, false, false, false); + sscan->rs_rd->rd_tableam->scan_set_tidrange(sscan, mintid, maxtid); +} + +/* + * Fetch the next tuple from `sscan` for a TID range scan created by + * table_beginscan_tidrange(). Stores the tuple in `slot` and returns true, + * or returns false if no more tuples exist in the range. + */ +static inline bool +table_scan_getnextslot_tidrange(TableScanDesc sscan, ScanDirection direction, + TupleTableSlot *slot) +{ + /* Ensure table_beginscan_tidrange() was used. */ + Assert((sscan->rs_flags & SO_TYPE_TIDRANGESCAN) != 0); + + /* We don't expect actual scans using NoMovementScanDirection */ + Assert(direction == ForwardScanDirection || + direction == BackwardScanDirection); + + return sscan->rs_rd->rd_tableam->scan_getnextslot_tidrange(sscan, + direction, + slot); +} + + +/* ---------------------------------------------------------------------------- + * Parallel table scan related functions. + * ---------------------------------------------------------------------------- + */ + +/* + * Estimate the size of shared memory needed for a parallel scan of this + * relation. + */ +extern Size table_parallelscan_estimate(Relation rel, Snapshot snapshot); + +/* + * Initialize ParallelTableScanDesc for a parallel scan of this + * relation. `pscan` needs to be sized according to parallelscan_estimate() + * for the same relation. Call this just once in the leader process; then, + * individual workers attach via table_beginscan_parallel. + */ +extern void table_parallelscan_initialize(Relation rel, + ParallelTableScanDesc pscan, + Snapshot snapshot); + +/* + * Begin a parallel scan. `pscan` needs to have been initialized with + * table_parallelscan_initialize(), for the same relation. The initialization + * does not need to have happened in this backend. + * + * Caller must hold a suitable lock on the relation. + */ +extern TableScanDesc table_beginscan_parallel(Relation relation, + ParallelTableScanDesc pscan); + +/* + * Restart a parallel scan. Call this in the leader process. Caller is + * responsible for making sure that all workers have finished the scan + * beforehand. + */ +static inline void +table_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan) +{ + rel->rd_tableam->parallelscan_reinitialize(rel, pscan); +} + + +/* ---------------------------------------------------------------------------- + * Index scan related functions. 
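As a usage illustration for the TID-range wrappers above, here is a hedged sketch that restricts a scan to blocks 0 through 9; the block range and helper name are arbitrary, and the caller is assumed to have created the slot and snapshot already.

/* Sketch only: scan tuples whose TIDs fall on blocks 0..9 of `rel`. */
static void
example_tidrange_scan(Relation rel, Snapshot snapshot, TupleTableSlot *slot)
{
	ItemPointerData mintid;
	ItemPointerData maxtid;
	TableScanDesc scan;

	ItemPointerSet(&mintid, 0, 1);
	ItemPointerSet(&maxtid, 9, MaxOffsetNumber);

	scan = table_beginscan_tidrange(rel, snapshot, &mintid, &maxtid);

	while (table_scan_getnextslot_tidrange(scan, ForwardScanDirection, slot))
	{
		/* ... process the tuple stored in `slot` ... */
	}

	table_endscan(scan);
}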
+ * ---------------------------------------------------------------------------- + */ + +/* + * Prepare to fetch tuples from the relation, as needed when fetching tuples + * for an index scan. + * + * Tuples for an index scan can then be fetched via table_index_fetch_tuple(). + */ +static inline IndexFetchTableData * +table_index_fetch_begin(Relation rel) +{ + return rel->rd_tableam->index_fetch_begin(rel); +} + +/* + * Reset index fetch. Typically this will release cross index fetch resources + * held in IndexFetchTableData. + */ +static inline void +table_index_fetch_reset(struct IndexFetchTableData *scan) +{ + scan->rel->rd_tableam->index_fetch_reset(scan); +} + +/* + * Release resources and deallocate index fetch. + */ +static inline void +table_index_fetch_end(struct IndexFetchTableData *scan) +{ + scan->rel->rd_tableam->index_fetch_end(scan); +} + +/* + * Fetches, as part of an index scan, tuple at `tid` into `slot`, after doing + * a visibility test according to `snapshot`. If a tuple was found and passed + * the visibility test, returns true, false otherwise. Note that *tid may be + * modified when we return true (see later remarks on multiple row versions + * reachable via a single index entry). + * + * *call_again needs to be false on the first call to table_index_fetch_tuple() for + * a tid. If there potentially is another tuple matching the tid, *call_again + * will be set to true, signaling that table_index_fetch_tuple() should be called + * again for the same tid. + * + * *all_dead, if all_dead is not NULL, will be set to true by + * table_index_fetch_tuple() iff it is guaranteed that no backend needs to see + * that tuple. Index AMs can use that to avoid returning that tid in future + * searches. + * + * The difference between this function and table_tuple_fetch_row_version() + * is that this function returns the currently visible version of a row if + * the AM supports storing multiple row versions reachable via a single index + * entry (like heap's HOT). Whereas table_tuple_fetch_row_version() only + * evaluates the tuple exactly at `tid`. Outside of index entry ->table tuple + * lookups, table_tuple_fetch_row_version() is what's usually needed. + */ +static inline bool +table_index_fetch_tuple(struct IndexFetchTableData *scan, + ItemPointer tid, + Snapshot snapshot, + TupleTableSlot *slot, + bool *call_again, bool *all_dead) +{ + /* + * We don't expect direct calls to table_index_fetch_tuple with valid + * CheckXidAlive for catalog or regular tables. See detailed comments in + * xact.c where these variables are declared. + */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected table_index_fetch_tuple call during logical decoding"); + + return scan->rel->rd_tableam->index_fetch_tuple(scan, tid, snapshot, + slot, call_again, + all_dead); +} + +/* + * This is a convenience wrapper around table_index_fetch_tuple() which + * returns whether there are table tuple items corresponding to an index + * entry. This likely is only useful to verify if there's a conflict in a + * unique index. + */ +extern bool table_index_fetch_tuple_check(Relation rel, + ItemPointer tid, + Snapshot snapshot, + bool *all_dead); + + +/* ------------------------------------------------------------------------ + * Functions for non-modifying operations on individual tuples + * ------------------------------------------------------------------------ + */ + + +/* + * Fetch tuple at `tid` into `slot`, after doing a visibility test according to + * `snapshot`. 
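The call_again protocol described above is easiest to see in a loop. The following hedged sketch fetches the visible row version reachable from a single index TID, retrying while the AM reports further candidate versions (heap's HOT chains being the motivating case); the helper name is invented.

/* Sketch only: fetch the visible table tuple behind one index entry. */
static bool
example_fetch_from_index_tid(Relation rel, ItemPointer tid,
							 Snapshot snapshot, TupleTableSlot *slot)
{
	IndexFetchTableData *fetch = table_index_fetch_begin(rel);
	bool		call_again = false;
	bool		all_dead = false;
	bool		found = false;

	do
	{
		found = table_index_fetch_tuple(fetch, tid, snapshot, slot,
										&call_again, &all_dead);
	} while (!found && call_again);		/* try the next version, if any */

	table_index_fetch_end(fetch);

	/* all_dead could be reported back to the index AM for hint-killing */
	return found;
}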
If a tuple was found and passed the visibility test, returns + * true, false otherwise. + * + * See table_index_fetch_tuple's comment about what the difference between + * these functions is. It is correct to use this function outside of index + * entry->table tuple lookups. + */ +static inline bool +table_tuple_fetch_row_version(Relation rel, + ItemPointer tid, + Snapshot snapshot, + TupleTableSlot *slot) +{ + /* + * We don't expect direct calls to table_tuple_fetch_row_version with + * valid CheckXidAlive for catalog or regular tables. See detailed + * comments in xact.c where these variables are declared. + */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected table_tuple_fetch_row_version call during logical decoding"); + + return rel->rd_tableam->tuple_fetch_row_version(rel, tid, snapshot, slot); +} + +/* + * Verify that `tid` is a potentially valid tuple identifier. That doesn't + * mean that the pointed to row needs to exist or be visible, but that + * attempting to fetch the row (e.g. with table_tuple_get_latest_tid() or + * table_tuple_fetch_row_version()) should not error out if called with that + * tid. + * + * `scan` needs to have been started via table_beginscan(). + */ +static inline bool +table_tuple_tid_valid(TableScanDesc scan, ItemPointer tid) +{ + return scan->rs_rd->rd_tableam->tuple_tid_valid(scan, tid); +} + +/* + * Return the latest version of the tuple at `tid`, by updating `tid` to + * point at the newest version. + */ +extern void table_tuple_get_latest_tid(TableScanDesc scan, ItemPointer tid); + +/* + * Return true iff tuple in slot satisfies the snapshot. + * + * This assumes the slot's tuple is valid, and of the appropriate type for the + * AM. + * + * Some AMs might modify the data underlying the tuple as a side-effect. If so + * they ought to mark the relevant buffer dirty. + */ +static inline bool +table_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot, + Snapshot snapshot) +{ + return rel->rd_tableam->tuple_satisfies_snapshot(rel, slot, snapshot); +} + +/* + * Determine which index tuples are safe to delete based on their table TID. + * + * Determines which entries from index AM caller's TM_IndexDeleteOp state + * point to vacuumable table tuples. Entries that are found by tableam to be + * vacuumable are naturally safe for index AM to delete, and so get directly + * marked as deletable. See comments above TM_IndexDelete and comments above + * TM_IndexDeleteOp for full details. + * + * Returns a snapshotConflictHorizon transaction ID that caller places in + * its index deletion WAL record. This might be used during subsequent REDO + * of the WAL record when in Hot Standby mode -- a recovery conflict for the + * index deletion operation might be required on the standby. + */ +static inline TransactionId +table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate) +{ + return rel->rd_tableam->index_delete_tuples(rel, delstate); +} + + +/* ---------------------------------------------------------------------------- + * Functions for manipulations of physical tuples. + * ---------------------------------------------------------------------------- + */ + +/* + * Insert a tuple from a slot into table AM routine. + * + * The options bitmask allows the caller to specify options that may change the + * behaviour of the AM. The AM will ignore options that it does not support. + * + * If the TABLE_INSERT_SKIP_FSM option is specified, AMs are free to not reuse + * free space in the relation. 
This can save some cycles when we know the + * relation is new and doesn't contain useful amounts of free space. + * TABLE_INSERT_SKIP_FSM is commonly passed directly to + * RelationGetBufferForTuple. See that method for more information. + * + * TABLE_INSERT_FROZEN should only be specified for inserts into + * relation storage created during the current subtransaction and when + * there are no prior snapshots or pre-existing portals open. + * This causes rows to be frozen, which is an MVCC violation and + * requires explicit options chosen by user. + * + * TABLE_INSERT_NO_LOGICAL force-disables the emitting of logical decoding + * information for the tuple. This should solely be used during table rewrites + * where RelationIsLogicallyLogged(relation) is not yet accurate for the new + * relation. + * + * Note that most of these options will be applied when inserting into the + * heap's TOAST table, too, if the tuple requires any out-of-line data. + * + * The BulkInsertState object (if any; bistate can be NULL for default + * behavior) is also just passed through to RelationGetBufferForTuple. If + * `bistate` is provided, table_finish_bulk_insert() needs to be called. + * + * On return the slot's tts_tid and tts_tableOid are updated to reflect the + * insertion. But note that any toasting of fields within the slot is NOT + * reflected in the slots contents. + */ +static inline void +table_tuple_insert(Relation rel, TupleTableSlot *slot, CommandId cid, + int options, struct BulkInsertStateData *bistate) +{ + rel->rd_tableam->tuple_insert(rel, slot, cid, options, + bistate); +} + +/* + * Perform a "speculative insertion". These can be backed out afterwards + * without aborting the whole transaction. Other sessions can wait for the + * speculative insertion to be confirmed, turning it into a regular tuple, or + * aborted, as if it never existed. Speculatively inserted tuples behave as + * "value locks" of short duration, used to implement INSERT .. ON CONFLICT. + * + * A transaction having performed a speculative insertion has to either abort, + * or finish the speculative insertion with + * table_tuple_complete_speculative(succeeded = ...). + */ +static inline void +table_tuple_insert_speculative(Relation rel, TupleTableSlot *slot, + CommandId cid, int options, + struct BulkInsertStateData *bistate, + uint32 specToken) +{ + rel->rd_tableam->tuple_insert_speculative(rel, slot, cid, options, + bistate, specToken); +} + +/* + * Complete "speculative insertion" started in the same transaction. If + * succeeded is true, the tuple is fully inserted, if false, it's removed. + */ +static inline void +table_tuple_complete_speculative(Relation rel, TupleTableSlot *slot, + uint32 specToken, bool succeeded) +{ + rel->rd_tableam->tuple_complete_speculative(rel, slot, specToken, + succeeded); +} + +/* + * Insert multiple tuples into a table. + * + * This is like table_tuple_insert(), but inserts multiple tuples in one + * operation. That's often faster than calling table_tuple_insert() in a loop, + * because e.g. the AM can reduce WAL logging and page locking overhead. + * + * Except for taking `nslots` tuples as input, and an array of TupleTableSlots + * in `slots`, the parameters for table_multi_insert() are the same as for + * table_tuple_insert(). + * + * Note: this leaks memory into the current memory context. You can create a + * temporary context before calling this, if that's a problem. 
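A caller-side sketch of table_tuple_insert() for a hypothetical two-column relation follows; it assumes the usual virtual-slot workflow (fill tts_values/tts_isnull, then ExecStoreVirtualTuple), and the column types, values, and helper name are placeholders.

/* Sketch only: build a virtual tuple in a slot and insert it. */
static void
example_insert_row(Relation rel, int32 id, text *payload)
{
	TupleTableSlot *slot = table_slot_create(rel, NULL);

	ExecClearTuple(slot);
	slot->tts_values[0] = Int32GetDatum(id);
	slot->tts_isnull[0] = false;
	slot->tts_values[1] = PointerGetDatum(payload);
	slot->tts_isnull[1] = false;
	ExecStoreVirtualTuple(slot);

	table_tuple_insert(rel, slot, GetCurrentCommandId(true),
					   0 /* options */ , NULL /* bistate */ );

	/* slot->tts_tid now identifies the newly inserted row */
	ExecDropSingleTupleTableSlot(slot);
}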
+ */ +static inline void +table_multi_insert(Relation rel, TupleTableSlot **slots, int nslots, + CommandId cid, int options, struct BulkInsertStateData *bistate) +{ + rel->rd_tableam->multi_insert(rel, slots, nslots, + cid, options, bistate); +} + +/* + * Delete a tuple. + * + * NB: do not call this directly unless prepared to deal with + * concurrent-update conditions. Use simple_table_tuple_delete instead. + * + * Input parameters: + * relation - table to be modified (caller must hold suitable lock) + * tid - TID of tuple to be deleted + * cid - delete command ID (used for visibility test, and stored into + * cmax if successful) + * crosscheck - if not InvalidSnapshot, also check tuple against this + * wait - true if should wait for any conflicting update to commit/abort + * Output parameters: + * tmfd - filled in failure cases (see below) + * changingPart - true iff the tuple is being moved to another partition + * table due to an update of the partition key. Otherwise, false. + * + * Normal, successful return value is TM_Ok, which means we did actually + * delete it. Failure return codes are TM_SelfModified, TM_Updated, and + * TM_BeingModified (the last only possible if wait == false). + * + * In the failure cases, the routine fills *tmfd with the tuple's t_ctid, + * t_xmax, and, if possible, t_cmax. See comments for struct + * TM_FailureData for additional info. + */ +static inline TM_Result +table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid, + Snapshot snapshot, Snapshot crosscheck, bool wait, + TM_FailureData *tmfd, bool changingPart) +{ + return rel->rd_tableam->tuple_delete(rel, tid, cid, + snapshot, crosscheck, + wait, tmfd, changingPart); +} + +/* + * Update a tuple. + * + * NB: do not call this directly unless you are prepared to deal with + * concurrent-update conditions. Use simple_table_tuple_update instead. + * + * Input parameters: + * relation - table to be modified (caller must hold suitable lock) + * otid - TID of old tuple to be replaced + * slot - newly constructed tuple data to store + * cid - update command ID (used for visibility test, and stored into + * cmax/cmin if successful) + * crosscheck - if not InvalidSnapshot, also check old tuple against this + * wait - true if should wait for any conflicting update to commit/abort + * Output parameters: + * tmfd - filled in failure cases (see below) + * lockmode - filled with lock mode acquired on tuple + * update_indexes - in success cases this is set to true if new index entries + * are required for this tuple + * + * Normal, successful return value is TM_Ok, which means we did actually + * update it. Failure return codes are TM_SelfModified, TM_Updated, and + * TM_BeingModified (the last only possible if wait == false). + * + * On success, the slot's tts_tid and tts_tableOid are updated to match the new + * stored tuple; in particular, slot->tts_tid is set to the TID where the + * new tuple was inserted, and its HEAP_ONLY_TUPLE flag is set iff a HOT + * update was done. However, any TOAST changes in the new tuple's + * data are not reflected into *newtup. + * + * In the failure cases, the routine fills *tmfd with the tuple's t_ctid, + * t_xmax, and, if possible, t_cmax. See comments for struct TM_FailureData + * for additional info. 
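Callers that use table_tuple_delete() directly (rather than simple_table_tuple_delete()) have to handle the concurrency outcomes spelled out above. A hedged sketch of that dispatch, with an invented helper name and the error reactions reduced to the simplest plausible choices:

/* Sketch only: delete a tuple and react to concurrent activity. */
static void
example_delete_tuple(Relation rel, ItemPointer tid, Snapshot snapshot)
{
	TM_FailureData tmfd;
	TM_Result	result;

	result = table_tuple_delete(rel, tid,
								GetCurrentCommandId(true),
								snapshot, InvalidSnapshot,
								true /* wait */ ,
								&tmfd, false /* changingPart */ );

	switch (result)
	{
		case TM_Ok:
			break;				/* deleted */
		case TM_SelfModified:
			/* already modified by this command; often a no-op or an error */
			break;
		case TM_Updated:
		case TM_Deleted:
			/* concurrent update/delete; tmfd describes the rival tuple */
			ereport(ERROR,
					(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
					 errmsg("could not serialize access due to concurrent update")));
			break;
		default:
			elog(ERROR, "unexpected table_tuple_delete status: %d",
				 (int) result);
			break;
	}
}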
+ */ +static inline TM_Result +table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot, + CommandId cid, Snapshot snapshot, Snapshot crosscheck, + bool wait, TM_FailureData *tmfd, LockTupleMode *lockmode, + TU_UpdateIndexes *update_indexes) +{ + return rel->rd_tableam->tuple_update(rel, otid, slot, + cid, snapshot, crosscheck, + wait, tmfd, + lockmode, update_indexes); +} + +/* + * Lock a tuple in the specified mode. + * + * Input parameters: + * relation: relation containing tuple (caller must hold suitable lock) + * tid: TID of tuple to lock + * snapshot: snapshot to use for visibility determinations + * cid: current command ID (used for visibility test, and stored into + * tuple's cmax if lock is successful) + * mode: lock mode desired + * wait_policy: what to do if tuple lock is not available + * flags: + * If TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS, follow the update chain to + * also lock descendant tuples if lock modes don't conflict. + * If TUPLE_LOCK_FLAG_FIND_LAST_VERSION, follow the update chain and lock + * latest version. + * + * Output parameters: + * *slot: contains the target tuple + * *tmfd: filled in failure cases (see below) + * + * Function result may be: + * TM_Ok: lock was successfully acquired + * TM_Invisible: lock failed because tuple was never visible to us + * TM_SelfModified: lock failed because tuple updated by self + * TM_Updated: lock failed because tuple updated by other xact + * TM_Deleted: lock failed because tuple deleted by other xact + * TM_WouldBlock: lock couldn't be acquired and wait_policy is skip + * + * In the failure cases other than TM_Invisible and TM_Deleted, the routine + * fills *tmfd with the tuple's t_ctid, t_xmax, and, if possible, t_cmax. See + * comments for struct TM_FailureData for additional info. + */ +static inline TM_Result +table_tuple_lock(Relation rel, ItemPointer tid, Snapshot snapshot, + TupleTableSlot *slot, CommandId cid, LockTupleMode mode, + LockWaitPolicy wait_policy, uint8 flags, + TM_FailureData *tmfd) +{ + return rel->rd_tableam->tuple_lock(rel, tid, snapshot, slot, + cid, mode, wait_policy, + flags, tmfd); +} + +/* + * Perform operations necessary to complete insertions made via + * tuple_insert and multi_insert with a BulkInsertState specified. + */ +static inline void +table_finish_bulk_insert(Relation rel, int options) +{ + /* optional callback */ + if (rel->rd_tableam && rel->rd_tableam->finish_bulk_insert) + rel->rd_tableam->finish_bulk_insert(rel, options); +} + + +/* ------------------------------------------------------------------------ + * DDL related functionality. + * ------------------------------------------------------------------------ + */ + +/* + * Create storage for `rel` in `newrlocator`, with persistence set to + * `persistence`. + * + * This is used both during relation creation and various DDL operations to + * create new rel storage that can be filled from scratch. When creating + * new storage for an existing relfilelocator, this should be called before the + * relcache entry has been updated. + * + * *freezeXid, *minmulti are set to the xid / multixact horizon for the table + * that pg_class.{relfrozenxid, relminmxid} have to be set to. 
+ */ +static inline void +table_relation_set_new_filelocator(Relation rel, + const RelFileLocator *newrlocator, + char persistence, + TransactionId *freezeXid, + MultiXactId *minmulti) +{ + rel->rd_tableam->relation_set_new_filelocator(rel, newrlocator, + persistence, freezeXid, + minmulti); +} + +/* + * Remove all table contents from `rel`, in a non-transactional manner. + * Non-transactional meaning that there's no need to support rollbacks. This + * commonly only is used to perform truncations for relation storage created in + * the current transaction. + */ +static inline void +table_relation_nontransactional_truncate(Relation rel) +{ + rel->rd_tableam->relation_nontransactional_truncate(rel); +} + +/* + * Copy data from `rel` into the new relfilelocator `newrlocator`. The new + * relfilelocator may not have storage associated before this function is + * called. This is only supposed to be used for low level operations like + * changing a relation's tablespace. + */ +static inline void +table_relation_copy_data(Relation rel, const RelFileLocator *newrlocator) +{ + rel->rd_tableam->relation_copy_data(rel, newrlocator); +} + +/* + * Copy data from `OldTable` into `NewTable`, as part of a CLUSTER or VACUUM + * FULL. + * + * Additional Input parameters: + * - use_sort - if true, the table contents are sorted appropriate for + * `OldIndex`; if false and OldIndex is not InvalidOid, the data is copied + * in that index's order; if false and OldIndex is InvalidOid, no sorting is + * performed + * - OldIndex - see use_sort + * - OldestXmin - computed by vacuum_get_cutoffs(), even when + * not needed for the relation's AM + * - *xid_cutoff - ditto + * - *multi_cutoff - ditto + * + * Output parameters: + * - *xid_cutoff - rel's new relfrozenxid value, may be invalid + * - *multi_cutoff - rel's new relminmxid value, may be invalid + * - *tups_vacuumed - stats, for logging, if appropriate for AM + * - *tups_recently_dead - stats, for logging, if appropriate for AM + */ +static inline void +table_relation_copy_for_cluster(Relation OldTable, Relation NewTable, + Relation OldIndex, + bool use_sort, + TransactionId OldestXmin, + TransactionId *xid_cutoff, + MultiXactId *multi_cutoff, + double *num_tuples, + double *tups_vacuumed, + double *tups_recently_dead) +{ + OldTable->rd_tableam->relation_copy_for_cluster(OldTable, NewTable, OldIndex, + use_sort, OldestXmin, + xid_cutoff, multi_cutoff, + num_tuples, tups_vacuumed, + tups_recently_dead); +} + +/* + * Perform VACUUM on the relation. The VACUUM can be triggered by a user or by + * autovacuum. The specific actions performed by the AM will depend heavily on + * the individual AM. + * + * On entry a transaction needs to already been established, and the + * table is locked with a ShareUpdateExclusive lock. + * + * Note that neither VACUUM FULL (and CLUSTER), nor ANALYZE go through this + * routine, even if (for ANALYZE) it is part of the same VACUUM command. + */ +static inline void +table_relation_vacuum(Relation rel, struct VacuumParams *params, + BufferAccessStrategy bstrategy) +{ + rel->rd_tableam->relation_vacuum(rel, params, bstrategy); +} + +/* + * Prepare to analyze block `blockno` of `scan`. The scan needs to have been + * started with table_beginscan_analyze(). Note that this routine might + * acquire resources like locks that are held until + * table_scan_analyze_next_tuple() returns false. + * + * Returns false if block is unsuitable for sampling, true otherwise. 
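table_scan_analyze_next_block() and table_scan_analyze_next_tuple(), both defined just below, are meant to be driven in tandem. The following hedged sketch of an acquire-sample-rows style loop visits every block sequentially; real ANALYZE code samples blocks instead, and the helper name is invented.

/* Sketch only: count live/dead rows by visiting every block of `rel`. */
static void
example_analyze_rel(Relation rel, TransactionId OldestXmin,
					BufferAccessStrategy bstrategy,
					double *liverows, double *deadrows)
{
	TableScanDesc scan = table_beginscan_analyze(rel);
	TupleTableSlot *slot = table_slot_create(rel, NULL);
	BlockNumber nblocks = RelationGetNumberOfBlocks(rel);

	*liverows = *deadrows = 0;

	for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
	{
		if (!table_scan_analyze_next_block(scan, blkno, bstrategy))
			continue;			/* block unsuitable, e.g. a metapage */

		while (table_scan_analyze_next_tuple(scan, OldestXmin,
											 liverows, deadrows, slot))
		{
			/* ... feed the sampled tuple in `slot` to the statistics code ... */
		}
	}

	ExecDropSingleTupleTableSlot(slot);
	table_endscan(scan);
}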
+ */ +static inline bool +table_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno, + BufferAccessStrategy bstrategy) +{ + return scan->rs_rd->rd_tableam->scan_analyze_next_block(scan, blockno, + bstrategy); +} + +/* + * Iterate over tuples in the block selected with + * table_scan_analyze_next_block() (which needs to have returned true, and + * this routine may not have returned false for the same block before). If a + * tuple that's suitable for sampling is found, true is returned and a tuple + * is stored in `slot`. + * + * *liverows and *deadrows are incremented according to the encountered + * tuples. + */ +static inline bool +table_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin, + double *liverows, double *deadrows, + TupleTableSlot *slot) +{ + return scan->rs_rd->rd_tableam->scan_analyze_next_tuple(scan, OldestXmin, + liverows, deadrows, + slot); +} + +/* + * table_index_build_scan - scan the table to find tuples to be indexed + * + * This is called back from an access-method-specific index build procedure + * after the AM has done whatever setup it needs. The parent table relation + * is scanned to find tuples that should be entered into the index. Each + * such tuple is passed to the AM's callback routine, which does the right + * things to add it to the new index. After we return, the AM's index + * build procedure does whatever cleanup it needs. + * + * The total count of live tuples is returned. This is for updating pg_class + * statistics. (It's annoying not to be able to do that here, but we want to + * merge that update with others; see index_update_stats.) Note that the + * index AM itself must keep track of the number of index tuples; we don't do + * so here because the AM might reject some of the tuples for its own reasons, + * such as being unable to store NULLs. + * + * If 'progress', the PROGRESS_SCAN_BLOCKS_TOTAL counter is updated when + * starting the scan, and PROGRESS_SCAN_BLOCKS_DONE is updated as we go along. + * + * A side effect is to set indexInfo->ii_BrokenHotChain to true if we detect + * any potentially broken HOT chains. Currently, we set this if there are any + * RECENTLY_DEAD or DELETE_IN_PROGRESS entries in a HOT chain, without trying + * very hard to detect whether they're really incompatible with the chain tip. + * This only really makes sense for heap AM, it might need to be generalized + * for other AMs later. + */ +static inline double +table_index_build_scan(Relation table_rel, + Relation index_rel, + struct IndexInfo *index_info, + bool allow_sync, + bool progress, + IndexBuildCallback callback, + void *callback_state, + TableScanDesc scan) +{ + return table_rel->rd_tableam->index_build_range_scan(table_rel, + index_rel, + index_info, + allow_sync, + false, + progress, + 0, + InvalidBlockNumber, + callback, + callback_state, + scan); +} + +/* + * As table_index_build_scan(), except that instead of scanning the complete + * table, only the given number of blocks are scanned. Scan to end-of-rel can + * be signaled by passing InvalidBlockNumber as numblocks. Note that + * restricting the range to scan cannot be done when requesting syncscan. + * + * When "anyvisible" mode is requested, all tuples visible to any transaction + * are indexed and counted as live, including those inserted or deleted by + * transactions that are still in progress. 
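The per-tuple callback passed to table_index_build_scan() follows the IndexBuildCallback typedef from earlier in this header. A hedged sketch of a trivial counting callback and its invocation from a hypothetical AM's build routine (all example_* names are invented; a real callback would form and insert an index tuple):

/* Sketch only: an IndexBuildCallback that merely counts candidate tuples. */
typedef struct ExampleBuildState
{
	double		indtuples;
} ExampleBuildState;

static void
example_build_callback(Relation index, ItemPointer tid, Datum *values,
					   bool *isnull, bool tupleIsAlive, void *state)
{
	ExampleBuildState *buildstate = (ExampleBuildState *) state;

	/* a real index AM would form and insert an index tuple here */
	buildstate->indtuples += 1;
}

/* ... and from inside the AM's ambuild function, something like: */
static double
example_run_build_scan(Relation heapRel, Relation indexRel,
					   struct IndexInfo *indexInfo,
					   ExampleBuildState *buildstate)
{
	return table_index_build_scan(heapRel, indexRel, indexInfo,
								  true /* allow_sync */ ,
								  true /* progress */ ,
								  example_build_callback,
								  (void *) buildstate,
								  NULL /* let the AM start the scan */ );
}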
+ */ +static inline double +table_index_build_range_scan(Relation table_rel, + Relation index_rel, + struct IndexInfo *index_info, + bool allow_sync, + bool anyvisible, + bool progress, + BlockNumber start_blockno, + BlockNumber numblocks, + IndexBuildCallback callback, + void *callback_state, + TableScanDesc scan) +{ + return table_rel->rd_tableam->index_build_range_scan(table_rel, + index_rel, + index_info, + allow_sync, + anyvisible, + progress, + start_blockno, + numblocks, + callback, + callback_state, + scan); +} + +/* + * table_index_validate_scan - second table scan for concurrent index build + * + * See validate_index() for an explanation. + */ +static inline void +table_index_validate_scan(Relation table_rel, + Relation index_rel, + struct IndexInfo *index_info, + Snapshot snapshot, + struct ValidateIndexState *state) +{ + table_rel->rd_tableam->index_validate_scan(table_rel, + index_rel, + index_info, + snapshot, + state); +} + + +/* ---------------------------------------------------------------------------- + * Miscellaneous functionality + * ---------------------------------------------------------------------------- + */ + +/* + * Return the current size of `rel` in bytes. If `forkNumber` is + * InvalidForkNumber, return the relation's overall size, otherwise the size + * for the indicated fork. + * + * Note that the overall size might not be the equivalent of the sum of sizes + * for the individual forks for some AMs, e.g. because the AMs storage does + * not neatly map onto the builtin types of forks. + */ +static inline uint64 +table_relation_size(Relation rel, ForkNumber forkNumber) +{ + return rel->rd_tableam->relation_size(rel, forkNumber); +} + +/* + * table_relation_needs_toast_table - does this relation need a toast table? + */ +static inline bool +table_relation_needs_toast_table(Relation rel) +{ + return rel->rd_tableam->relation_needs_toast_table(rel); +} + +/* + * Return the OID of the AM that should be used to implement the TOAST table + * for this relation. + */ +static inline Oid +table_relation_toast_am(Relation rel) +{ + return rel->rd_tableam->relation_toast_am(rel); +} + +/* + * Fetch all or part of a TOAST value from a TOAST table. + * + * If this AM is never used to implement a TOAST table, then this callback + * is not needed. But, if toasted values are ever stored in a table of this + * type, then you will need this callback. + * + * toastrel is the relation in which the toasted value is stored. + * + * valueid identifies which toast value is to be fetched. For the heap, + * this corresponds to the values stored in the chunk_id column. + * + * attrsize is the total size of the toast value to be fetched. + * + * sliceoffset is the offset within the toast value of the first byte that + * should be fetched. + * + * slicelength is the number of bytes from the toast value that should be + * fetched. + * + * result is caller-allocated space into which the fetched bytes should be + * stored. 
+ */ +static inline void +table_relation_fetch_toast_slice(Relation toastrel, Oid valueid, + int32 attrsize, int32 sliceoffset, + int32 slicelength, struct varlena *result) +{ + toastrel->rd_tableam->relation_fetch_toast_slice(toastrel, valueid, + attrsize, + sliceoffset, slicelength, + result); +} + + +/* ---------------------------------------------------------------------------- + * Planner related functionality + * ---------------------------------------------------------------------------- + */ + +/* + * Estimate the current size of the relation, as an AM specific workhorse for + * estimate_rel_size(). Look there for an explanation of the parameters. + */ +static inline void +table_relation_estimate_size(Relation rel, int32 *attr_widths, + BlockNumber *pages, double *tuples, + double *allvisfrac) +{ + rel->rd_tableam->relation_estimate_size(rel, attr_widths, pages, tuples, + allvisfrac); +} + + +/* ---------------------------------------------------------------------------- + * Executor related functionality + * ---------------------------------------------------------------------------- + */ + +/* + * Prepare to fetch / check / return tuples from `tbmres->blockno` as part of + * a bitmap table scan. `scan` needs to have been started via + * table_beginscan_bm(). Returns false if there are no tuples to be found on + * the page, true otherwise. + * + * Note, this is an optionally implemented function, therefore should only be + * used after verifying the presence (at plan time or such). + */ +static inline bool +table_scan_bitmap_next_block(TableScanDesc scan, + struct TBMIterateResult *tbmres) +{ + /* + * We don't expect direct calls to table_scan_bitmap_next_block with valid + * CheckXidAlive for catalog or regular tables. See detailed comments in + * xact.c where these variables are declared. + */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected table_scan_bitmap_next_block call during logical decoding"); + + return scan->rs_rd->rd_tableam->scan_bitmap_next_block(scan, + tbmres); +} + +/* + * Fetch the next tuple of a bitmap table scan into `slot` and return true if + * a visible tuple was found, false otherwise. + * table_scan_bitmap_next_block() needs to previously have selected a + * block (i.e. returned true), and no previous + * table_scan_bitmap_next_tuple() for the same block may have + * returned false. + */ +static inline bool +table_scan_bitmap_next_tuple(TableScanDesc scan, + struct TBMIterateResult *tbmres, + TupleTableSlot *slot) +{ + /* + * We don't expect direct calls to table_scan_bitmap_next_tuple with valid + * CheckXidAlive for catalog or regular tables. See detailed comments in + * xact.c where these variables are declared. + */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected table_scan_bitmap_next_tuple call during logical decoding"); + + return scan->rs_rd->rd_tableam->scan_bitmap_next_tuple(scan, + tbmres, + slot); +} + +/* + * Prepare to fetch tuples from the next block in a sample scan. Returns false + * if the sample scan is finished, true otherwise. `scan` needs to have been + * started via table_beginscan_sampling(). + * + * This will call the TsmRoutine's NextSampleBlock() callback if necessary + * (i.e. NextSampleBlock is not NULL), or perform a sequential scan over the + * underlying relation. 
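The executor drives the two bitmap-scan wrappers above with a TIDBitmap iterator. A much-simplified, hedged sketch of that loop follows; prefetching, recheck handling, and error paths are omitted, `scan` is assumed to have been started via table_beginscan_bm(), and the helper name is invented.

/* Sketch only: consume a TIDBitmap through the tableam bitmap-scan API. */
#include "nodes/tidbitmap.h"

static void
example_drive_bitmap_scan(TableScanDesc scan, TIDBitmap *tbm,
						  TupleTableSlot *slot)
{
	TBMIterator *iterator = tbm_begin_iterate(tbm);
	TBMIterateResult *tbmres;

	while ((tbmres = tbm_iterate(iterator)) != NULL)
	{
		if (!table_scan_bitmap_next_block(scan, tbmres))
			continue;			/* nothing to return from this page */

		while (table_scan_bitmap_next_tuple(scan, tbmres, slot))
		{
			/* ... recheck quals if tbmres->recheck, then emit `slot` ... */
		}
	}

	tbm_end_iterate(iterator);
}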
+ */ +static inline bool +table_scan_sample_next_block(TableScanDesc scan, + struct SampleScanState *scanstate) +{ + /* + * We don't expect direct calls to table_scan_sample_next_block with valid + * CheckXidAlive for catalog or regular tables. See detailed comments in + * xact.c where these variables are declared. + */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected table_scan_sample_next_block call during logical decoding"); + return scan->rs_rd->rd_tableam->scan_sample_next_block(scan, scanstate); +} + +/* + * Fetch the next sample tuple into `slot` and return true if a visible tuple + * was found, false otherwise. table_scan_sample_next_block() needs to + * previously have selected a block (i.e. returned true), and no previous + * table_scan_sample_next_tuple() for the same block may have returned false. + * + * This will call the TsmRoutine's NextSampleTuple() callback. + */ +static inline bool +table_scan_sample_next_tuple(TableScanDesc scan, + struct SampleScanState *scanstate, + TupleTableSlot *slot) +{ + /* + * We don't expect direct calls to table_scan_sample_next_tuple with valid + * CheckXidAlive for catalog or regular tables. See detailed comments in + * xact.c where these variables are declared. + */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected table_scan_sample_next_tuple call during logical decoding"); + return scan->rs_rd->rd_tableam->scan_sample_next_tuple(scan, scanstate, + slot); +} + + +/* ---------------------------------------------------------------------------- + * Functions to make modifications a bit simpler. + * ---------------------------------------------------------------------------- + */ + +extern void simple_table_tuple_insert(Relation rel, TupleTableSlot *slot); +extern void simple_table_tuple_delete(Relation rel, ItemPointer tid, + Snapshot snapshot); +extern void simple_table_tuple_update(Relation rel, ItemPointer otid, + TupleTableSlot *slot, Snapshot snapshot, + TU_UpdateIndexes *update_indexes); + + +/* ---------------------------------------------------------------------------- + * Helper functions to implement parallel scans for block oriented AMs. + * ---------------------------------------------------------------------------- + */ + +extern Size table_block_parallelscan_estimate(Relation rel); +extern Size table_block_parallelscan_initialize(Relation rel, + ParallelTableScanDesc pscan); +extern void table_block_parallelscan_reinitialize(Relation rel, + ParallelTableScanDesc pscan); +extern BlockNumber table_block_parallelscan_nextpage(Relation rel, + ParallelBlockTableScanWorker pbscanwork, + ParallelBlockTableScanDesc pbscan); +extern void table_block_parallelscan_startblock_init(Relation rel, + ParallelBlockTableScanWorker pbscanwork, + ParallelBlockTableScanDesc pbscan); + + +/* ---------------------------------------------------------------------------- + * Helper functions to implement relation sizing for block oriented AMs. 
+ * ---------------------------------------------------------------------------- + */ + +extern uint64 table_block_relation_size(Relation rel, ForkNumber forkNumber); +extern void table_block_relation_estimate_size(Relation rel, + int32 *attr_widths, + BlockNumber *pages, + double *tuples, + double *allvisfrac, + Size overhead_bytes_per_tuple, + Size usable_bytes_per_page); + +/* ---------------------------------------------------------------------------- + * Functions in tableamapi.c + * ---------------------------------------------------------------------------- + */ + +extern const TableAmRoutine *GetTableAmRoutine(Oid amhandler); +extern const TableAmRoutine *GetHeapamTableAmRoutine(void); + +#endif /* TABLEAM_H */ diff --git a/src/include/access/timeline.h b/src/include/access/timeline.h new file mode 100644 index 0000000..72e3622 --- /dev/null +++ b/src/include/access/timeline.h @@ -0,0 +1,44 @@ +/* + * timeline.h + * + * Functions for reading and writing timeline history files. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/timeline.h + */ +#ifndef TIMELINE_H +#define TIMELINE_H + +#include "access/xlogdefs.h" +#include "nodes/pg_list.h" + +/* + * A list of these structs describes the timeline history of the server. Each + * TimeLineHistoryEntry represents a piece of WAL belonging to the history, + * from newest to oldest. All WAL locations between 'begin' and 'end' belong to + * the timeline represented by the entry. Together the 'begin' and 'end' + * pointers of all the entries form a contiguous line from beginning of time + * to infinity. + */ +typedef struct +{ + TimeLineID tli; + XLogRecPtr begin; /* inclusive */ + XLogRecPtr end; /* exclusive, InvalidXLogRecPtr means infinity */ +} TimeLineHistoryEntry; + +extern List *readTimeLineHistory(TimeLineID targetTLI); +extern bool existsTimeLineHistory(TimeLineID probeTLI); +extern TimeLineID findNewestTimeLine(TimeLineID startTLI); +extern void writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, + XLogRecPtr switchpoint, char *reason); +extern void writeTimeLineHistoryFile(TimeLineID tli, char *content, int size); +extern void restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end); +extern bool tliInHistory(TimeLineID tli, List *expectedTLEs); +extern TimeLineID tliOfPointInHistory(XLogRecPtr ptr, List *history); +extern XLogRecPtr tliSwitchPoint(TimeLineID tli, List *history, + TimeLineID *nextTLI); + +#endif /* TIMELINE_H */ diff --git a/src/include/access/toast_compression.h b/src/include/access/toast_compression.h new file mode 100644 index 0000000..7f02820 --- /dev/null +++ b/src/include/access/toast_compression.h @@ -0,0 +1,73 @@ +/*------------------------------------------------------------------------- + * + * toast_compression.h + * Functions for toast compression. + * + * Copyright (c) 2021-2023, PostgreSQL Global Development Group + * + * src/include/access/toast_compression.h + * + *------------------------------------------------------------------------- + */ + +#ifndef TOAST_COMPRESSION_H +#define TOAST_COMPRESSION_H + +/* + * GUC support. + * + * default_toast_compression is an integer for purposes of the GUC machinery, + * but the value is one of the char values defined below, as they appear in + * pg_attribute.attcompression, e.g. TOAST_PGLZ_COMPRESSION. + */ +extern PGDLLIMPORT int default_toast_compression; + +/* + * Built-in compression method ID. 
The toast compression header will store + * this in the first 2 bits of the raw length. These built-in compression + * method IDs are directly mapped to the built-in compression methods. + * + * Don't use these values for anything other than understanding the meaning + * of the raw bits from a varlena; in particular, if the goal is to identify + * a compression method, use the constants TOAST_PGLZ_COMPRESSION, etc. + * below. We might someday support more than 4 compression methods, but + * we can never have more than 4 values in this enum, because there are + * only 2 bits available in the places where this is stored. + */ +typedef enum ToastCompressionId +{ + TOAST_PGLZ_COMPRESSION_ID = 0, + TOAST_LZ4_COMPRESSION_ID = 1, + TOAST_INVALID_COMPRESSION_ID = 2 +} ToastCompressionId; + +/* + * Built-in compression methods. pg_attribute will store these in the + * attcompression column. In attcompression, InvalidCompressionMethod + * denotes the default behavior. + */ +#define TOAST_PGLZ_COMPRESSION 'p' +#define TOAST_LZ4_COMPRESSION 'l' +#define InvalidCompressionMethod '\0' + +#define CompressionMethodIsValid(cm) ((cm) != InvalidCompressionMethod) + + +/* pglz compression/decompression routines */ +extern struct varlena *pglz_compress_datum(const struct varlena *value); +extern struct varlena *pglz_decompress_datum(const struct varlena *value); +extern struct varlena *pglz_decompress_datum_slice(const struct varlena *value, + int32 slicelength); + +/* lz4 compression/decompression routines */ +extern struct varlena *lz4_compress_datum(const struct varlena *value); +extern struct varlena *lz4_decompress_datum(const struct varlena *value); +extern struct varlena *lz4_decompress_datum_slice(const struct varlena *value, + int32 slicelength); + +/* other stuff */ +extern ToastCompressionId toast_get_compression_id(struct varlena *attr); +extern char CompressionNameToMethod(const char *compression); +extern const char *GetCompressionMethodName(char method); + +#endif /* TOAST_COMPRESSION_H */ diff --git a/src/include/access/toast_helper.h b/src/include/access/toast_helper.h new file mode 100644 index 0000000..e971bd2 --- /dev/null +++ b/src/include/access/toast_helper.h @@ -0,0 +1,116 @@ +/*------------------------------------------------------------------------- + * + * toast_helper.h + * Helper functions for table AMs implementing compressed or + * out-of-line storage of varlena attributes. + * + * Copyright (c) 2000-2023, PostgreSQL Global Development Group + * + * src/include/access/toast_helper.h + * + *------------------------------------------------------------------------- + */ + +#ifndef TOAST_HELPER_H +#define TOAST_HELPER_H + +#include "utils/rel.h" + +/* + * Information about one column of a tuple being toasted. + * + * NOTE: toast_action[i] can have these values: + * ' ' default handling + * TYPSTORAGE_PLAIN already processed --- don't touch it + * TYPSTORAGE_EXTENDED incompressible, but OK to move off + * + * NOTE: toast_attr[i].tai_size is only made valid for varlena attributes with + * toast_action[i] different from TYPSTORAGE_PLAIN. + */ +typedef struct +{ + struct varlena *tai_oldexternal; + int32 tai_size; + uint8 tai_colflags; + char tai_compression; +} ToastAttrInfo; + +/* + * Information about one tuple being toasted. + */ +typedef struct +{ + /* + * Before calling toast_tuple_init, the caller must initialize the + * following fields. Each array must have a length equal to + * ttc_rel->rd_att->natts. 
The ttc_oldvalues and ttc_oldisnull fields + * should be NULL in the case of an insert. + */ + Relation ttc_rel; /* the relation that contains the tuple */ + Datum *ttc_values; /* values from the tuple columns */ + bool *ttc_isnull; /* null flags for the tuple columns */ + Datum *ttc_oldvalues; /* values from previous tuple */ + bool *ttc_oldisnull; /* null flags from previous tuple */ + + /* + * Before calling toast_tuple_init, the caller should set ttc_attr to + * point to an array of ToastAttrInfo structures of a length equal to + * ttc_rel->rd_att->natts. The contents of the array need not be + * initialized. ttc_flags also does not need to be initialized. + */ + uint8 ttc_flags; + ToastAttrInfo *ttc_attr; +} ToastTupleContext; + +/* + * Flags indicating the overall state of a TOAST operation. + * + * TOAST_NEEDS_DELETE_OLD indicates that one or more old TOAST datums need + * to be deleted. + * + * TOAST_NEEDS_FREE indicates that one or more TOAST values need to be freed. + * + * TOAST_HAS_NULLS indicates that nulls were found in the tuple being toasted. + * + * TOAST_NEEDS_CHANGE indicates that a new tuple needs to built; in other + * words, the toaster did something. + */ +#define TOAST_NEEDS_DELETE_OLD 0x0001 +#define TOAST_NEEDS_FREE 0x0002 +#define TOAST_HAS_NULLS 0x0004 +#define TOAST_NEEDS_CHANGE 0x0008 + +/* + * Flags indicating the status of a TOAST operation with respect to a + * particular column. + * + * TOASTCOL_NEEDS_DELETE_OLD indicates that the old TOAST datums for this + * column need to be deleted. + * + * TOASTCOL_NEEDS_FREE indicates that the value for this column needs to + * be freed. + * + * TOASTCOL_IGNORE indicates that the toaster should not further process + * this column. + * + * TOASTCOL_INCOMPRESSIBLE indicates that this column has been found to + * be incompressible, but could be moved out-of-line. + */ +#define TOASTCOL_NEEDS_DELETE_OLD TOAST_NEEDS_DELETE_OLD +#define TOASTCOL_NEEDS_FREE TOAST_NEEDS_FREE +#define TOASTCOL_IGNORE 0x0010 +#define TOASTCOL_INCOMPRESSIBLE 0x0020 + +extern void toast_tuple_init(ToastTupleContext *ttc); +extern int toast_tuple_find_biggest_attribute(ToastTupleContext *ttc, + bool for_compression, + bool check_main); +extern void toast_tuple_try_compression(ToastTupleContext *ttc, int attribute); +extern void toast_tuple_externalize(ToastTupleContext *ttc, int attribute, + int options); +extern void toast_tuple_cleanup(ToastTupleContext *ttc); + +extern void toast_delete_external(Relation rel, Datum *values, bool *isnull, + bool is_speculative); + +#endif diff --git a/src/include/access/toast_internals.h b/src/include/access/toast_internals.h new file mode 100644 index 0000000..30dba09 --- /dev/null +++ b/src/include/access/toast_internals.h @@ -0,0 +1,63 @@ +/*------------------------------------------------------------------------- + * + * toast_internals.h + * Internal definitions for the TOAST system. + * + * Copyright (c) 2000-2023, PostgreSQL Global Development Group + * + * src/include/access/toast_internals.h + * + *------------------------------------------------------------------------- + */ +#ifndef TOAST_INTERNALS_H +#define TOAST_INTERNALS_H + +#include "access/toast_compression.h" +#include "storage/lockdefs.h" +#include "utils/relcache.h" +#include "utils/snapshot.h" + +/* + * The information at the start of the compressed toast data. + */ +typedef struct toast_compress_header +{ + int32 vl_len_; /* varlena header (do not touch directly!) 
*/ + uint32 tcinfo; /* 2 bits for compression method and 30 bits + * external size; see va_extinfo */ +} toast_compress_header; + +/* + * Utilities for manipulation of header information for compressed + * toast entries. + */ +#define TOAST_COMPRESS_EXTSIZE(ptr) \ + (((toast_compress_header *) (ptr))->tcinfo & VARLENA_EXTSIZE_MASK) +#define TOAST_COMPRESS_METHOD(ptr) \ + (((toast_compress_header *) (ptr))->tcinfo >> VARLENA_EXTSIZE_BITS) + +#define TOAST_COMPRESS_SET_SIZE_AND_COMPRESS_METHOD(ptr, len, cm_method) \ + do { \ + Assert((len) > 0 && (len) <= VARLENA_EXTSIZE_MASK); \ + Assert((cm_method) == TOAST_PGLZ_COMPRESSION_ID || \ + (cm_method) == TOAST_LZ4_COMPRESSION_ID); \ + ((toast_compress_header *) (ptr))->tcinfo = \ + (len) | ((uint32) (cm_method) << VARLENA_EXTSIZE_BITS); \ + } while (0) + +extern Datum toast_compress_datum(Datum value, char cmethod); +extern Oid toast_get_valid_index(Oid toastoid, LOCKMODE lock); + +extern void toast_delete_datum(Relation rel, Datum value, bool is_speculative); +extern Datum toast_save_datum(Relation rel, Datum value, + struct varlena *oldexternal, int options); + +extern int toast_open_indexes(Relation toastrel, + LOCKMODE lock, + Relation **toastidxs, + int *num_indexes); +extern void toast_close_indexes(Relation *toastidxs, int num_indexes, + LOCKMODE lock); +extern void init_toast_snapshot(Snapshot toast_snapshot); + +#endif /* TOAST_INTERNALS_H */ diff --git a/src/include/access/transam.h b/src/include/access/transam.h new file mode 100644 index 0000000..f5af6d3 --- /dev/null +++ b/src/include/access/transam.h @@ -0,0 +1,375 @@ +/*------------------------------------------------------------------------- + * + * transam.h + * postgres transaction access method support code + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/transam.h + * + *------------------------------------------------------------------------- + */ +#ifndef TRANSAM_H +#define TRANSAM_H + +#include "access/xlogdefs.h" + + +/* ---------------- + * Special transaction ID values + * + * BootstrapTransactionId is the XID for "bootstrap" operations, and + * FrozenTransactionId is used for very old tuples. Both should + * always be considered valid. + * + * FirstNormalTransactionId is the first "normal" transaction id. + * Note: if you need to change it, you must change pg_class.h as well. 
+ * ---------------- + */ +#define InvalidTransactionId ((TransactionId) 0) +#define BootstrapTransactionId ((TransactionId) 1) +#define FrozenTransactionId ((TransactionId) 2) +#define FirstNormalTransactionId ((TransactionId) 3) +#define MaxTransactionId ((TransactionId) 0xFFFFFFFF) + +/* ---------------- + * transaction ID manipulation macros + * ---------------- + */ +#define TransactionIdIsValid(xid) ((xid) != InvalidTransactionId) +#define TransactionIdIsNormal(xid) ((xid) >= FirstNormalTransactionId) +#define TransactionIdEquals(id1, id2) ((id1) == (id2)) +#define TransactionIdStore(xid, dest) (*(dest) = (xid)) +#define StoreInvalidTransactionId(dest) (*(dest) = InvalidTransactionId) + +#define EpochFromFullTransactionId(x) ((uint32) ((x).value >> 32)) +#define XidFromFullTransactionId(x) ((uint32) (x).value) +#define U64FromFullTransactionId(x) ((x).value) +#define FullTransactionIdEquals(a, b) ((a).value == (b).value) +#define FullTransactionIdPrecedes(a, b) ((a).value < (b).value) +#define FullTransactionIdPrecedesOrEquals(a, b) ((a).value <= (b).value) +#define FullTransactionIdFollows(a, b) ((a).value > (b).value) +#define FullTransactionIdFollowsOrEquals(a, b) ((a).value >= (b).value) +#define FullTransactionIdIsValid(x) TransactionIdIsValid(XidFromFullTransactionId(x)) +#define InvalidFullTransactionId FullTransactionIdFromEpochAndXid(0, InvalidTransactionId) +#define FirstNormalFullTransactionId FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId) +#define FullTransactionIdIsNormal(x) FullTransactionIdFollowsOrEquals(x, FirstNormalFullTransactionId) + +/* + * A 64 bit value that contains an epoch and a TransactionId. This is + * wrapped in a struct to prevent implicit conversion to/from TransactionId. + * Not all values represent valid normal XIDs. + */ +typedef struct FullTransactionId +{ + uint64 value; +} FullTransactionId; + +static inline FullTransactionId +FullTransactionIdFromEpochAndXid(uint32 epoch, TransactionId xid) +{ + FullTransactionId result; + + result.value = ((uint64) epoch) << 32 | xid; + + return result; +} + +static inline FullTransactionId +FullTransactionIdFromU64(uint64 value) +{ + FullTransactionId result; + + result.value = value; + + return result; +} + +/* advance a transaction ID variable, handling wraparound correctly */ +#define TransactionIdAdvance(dest) \ + do { \ + (dest)++; \ + if ((dest) < FirstNormalTransactionId) \ + (dest) = FirstNormalTransactionId; \ + } while(0) + +/* + * Retreat a FullTransactionId variable, stepping over xids that would appear + * to be special only when viewed as 32bit XIDs. + */ +static inline void +FullTransactionIdRetreat(FullTransactionId *dest) +{ + dest->value--; + + /* + * In contrast to 32bit XIDs don't step over the "actual" special xids. + * For 64bit xids these can't be reached as part of a wraparound as they + * can in the 32bit case. + */ + if (FullTransactionIdPrecedes(*dest, FirstNormalFullTransactionId)) + return; + + /* + * But we do need to step over XIDs that'd appear special only for 32bit + * XIDs. + */ + while (XidFromFullTransactionId(*dest) < FirstNormalTransactionId) + dest->value--; +} + +/* + * Advance a FullTransactionId variable, stepping over xids that would appear + * to be special only when viewed as 32bit XIDs. 
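+ *
+ * Worked example (illustrative, with a hypothetical variable d): starting
+ * from
+ *
+ *		d = FullTransactionIdFromEpochAndXid(7, 0xFFFFFFFF);
+ *		FullTransactionIdAdvance(&d);
+ *
+ * the result has EpochFromFullTransactionId(d) == 8 and
+ * XidFromFullTransactionId(d) == FirstNormalTransactionId, because values
+ * whose low 32 bits would read as special XIDs (0, 1 and 2) are stepped
+ * over.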
+ */ +static inline void +FullTransactionIdAdvance(FullTransactionId *dest) +{ + dest->value++; + + /* see FullTransactionIdAdvance() */ + if (FullTransactionIdPrecedes(*dest, FirstNormalFullTransactionId)) + return; + + while (XidFromFullTransactionId(*dest) < FirstNormalTransactionId) + dest->value++; +} + +/* back up a transaction ID variable, handling wraparound correctly */ +#define TransactionIdRetreat(dest) \ + do { \ + (dest)--; \ + } while ((dest) < FirstNormalTransactionId) + +/* compare two XIDs already known to be normal; this is a macro for speed */ +#define NormalTransactionIdPrecedes(id1, id2) \ + (AssertMacro(TransactionIdIsNormal(id1) && TransactionIdIsNormal(id2)), \ + (int32) ((id1) - (id2)) < 0) + +/* compare two XIDs already known to be normal; this is a macro for speed */ +#define NormalTransactionIdFollows(id1, id2) \ + (AssertMacro(TransactionIdIsNormal(id1) && TransactionIdIsNormal(id2)), \ + (int32) ((id1) - (id2)) > 0) + +/* ---------- + * Object ID (OID) zero is InvalidOid. + * + * OIDs 1-9999 are reserved for manual assignment (see .dat files in + * src/include/catalog/). Of these, 8000-9999 are reserved for + * development purposes (such as in-progress patches and forks); + * they should not appear in released versions. + * + * OIDs 10000-11999 are reserved for assignment by genbki.pl, for use + * when the .dat files in src/include/catalog/ do not specify an OID + * for a catalog entry that requires one. Note that genbki.pl assigns + * these OIDs independently in each catalog, so they're not guaranteed + * to be globally unique. Furthermore, the bootstrap backend and + * initdb's post-bootstrap processing can also assign OIDs in this range. + * The normal OID-generation logic takes care of any OID conflicts that + * might arise from that. + * + * OIDs 12000-16383 are reserved for unpinned objects created by initdb's + * post-bootstrap processing. initdb forces the OID generator up to + * 12000 as soon as it's made the pinned objects it's responsible for. + * + * OIDs beginning at 16384 are assigned from the OID generator + * during normal multiuser operation. (We force the generator up to + * 16384 as soon as we are in normal operation.) + * + * The choices of 8000, 10000 and 12000 are completely arbitrary, and can be + * moved if we run low on OIDs in any category. Changing the macros below, + * and updating relevant documentation (see bki.sgml and RELEASE_CHANGES), + * should be sufficient to do this. Moving the 16384 boundary between + * initdb-assigned OIDs and user-defined objects would be substantially + * more painful, however, since some user-defined OIDs will appear in + * on-disk data; such a change would probably break pg_upgrade. + * + * NOTE: if the OID generator wraps around, we skip over OIDs 0-16383 + * and resume with 16384. This minimizes the odds of OID conflict, by not + * reassigning OIDs that might have been assigned during initdb. Critically, + * it also ensures that no user-created object will be considered pinned. + * ---------- + */ +#define FirstGenbkiObjectId 10000 +#define FirstUnpinnedObjectId 12000 +#define FirstNormalObjectId 16384 + +/* + * VariableCache is a data structure in shared memory that is used to track + * OID and XID assignment state. For largely historical reasons, there is + * just one struct with different fields that are protected by different + * LWLocks. 
+ * + * Note: xidWrapLimit and oldestXidDB are not "active" values, but are + * used just to generate useful messages when xidWarnLimit or xidStopLimit + * are exceeded. + */ +typedef struct VariableCacheData +{ + /* + * These fields are protected by OidGenLock. + */ + Oid nextOid; /* next OID to assign */ + uint32 oidCount; /* OIDs available before must do XLOG work */ + + /* + * These fields are protected by XidGenLock. + */ + FullTransactionId nextXid; /* next XID to assign */ + + TransactionId oldestXid; /* cluster-wide minimum datfrozenxid */ + TransactionId xidVacLimit; /* start forcing autovacuums here */ + TransactionId xidWarnLimit; /* start complaining here */ + TransactionId xidStopLimit; /* refuse to advance nextXid beyond here */ + TransactionId xidWrapLimit; /* where the world ends */ + Oid oldestXidDB; /* database with minimum datfrozenxid */ + + /* + * These fields are protected by CommitTsLock + */ + TransactionId oldestCommitTsXid; + TransactionId newestCommitTsXid; + + /* + * These fields are protected by ProcArrayLock. + */ + FullTransactionId latestCompletedXid; /* newest full XID that has + * committed or aborted */ + + /* + * Number of top-level transactions with xids (i.e. which may have + * modified the database) that completed in some form since the start of + * the server. This currently is solely used to check whether + * GetSnapshotData() needs to recompute the contents of the snapshot, or + * not. There are likely other users of this. Always above 1. + */ + uint64 xactCompletionCount; + + /* + * These fields are protected by XactTruncationLock + */ + TransactionId oldestClogXid; /* oldest it's safe to look up in clog */ + +} VariableCacheData; + +typedef VariableCacheData *VariableCache; + + +/* ---------------- + * extern declarations + * ---------------- + */ + +/* in transam/xact.c */ +extern bool TransactionStartedDuringRecovery(void); + +/* in transam/varsup.c */ +extern PGDLLIMPORT VariableCache ShmemVariableCache; + +/* + * prototypes for functions in transam/transam.c + */ +extern bool TransactionIdDidCommit(TransactionId transactionId); +extern bool TransactionIdDidAbort(TransactionId transactionId); +extern void TransactionIdCommitTree(TransactionId xid, int nxids, TransactionId *xids); +extern void TransactionIdAsyncCommitTree(TransactionId xid, int nxids, TransactionId *xids, XLogRecPtr lsn); +extern void TransactionIdAbortTree(TransactionId xid, int nxids, TransactionId *xids); +extern bool TransactionIdPrecedes(TransactionId id1, TransactionId id2); +extern bool TransactionIdPrecedesOrEquals(TransactionId id1, TransactionId id2); +extern bool TransactionIdFollows(TransactionId id1, TransactionId id2); +extern bool TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2); +extern TransactionId TransactionIdLatest(TransactionId mainxid, + int nxids, const TransactionId *xids); +extern XLogRecPtr TransactionIdGetCommitLSN(TransactionId xid); + +/* in transam/varsup.c */ +extern FullTransactionId GetNewTransactionId(bool isSubXact); +extern void AdvanceNextFullTransactionIdPastXid(TransactionId xid); +extern FullTransactionId ReadNextFullTransactionId(void); +extern void SetTransactionIdLimit(TransactionId oldest_datfrozenxid, + Oid oldest_datoid); +extern void AdvanceOldestClogXid(TransactionId oldest_datfrozenxid); +extern bool ForceTransactionIdLimitUpdate(void); +extern Oid GetNewObjectId(void); +extern void StopGeneratingPinnedObjectIds(void); + +#ifdef USE_ASSERT_CHECKING +extern void AssertTransactionIdInAllowableRange(TransactionId 
xid); +#else +#define AssertTransactionIdInAllowableRange(xid) ((void)true) +#endif + +/* + * Some frontend programs include this header. For compilers that emit static + * inline functions even when they're unused, that leads to unsatisfied + * external references; hence hide them with #ifndef FRONTEND. + */ +#ifndef FRONTEND + +/* + * For callers that just need the XID part of the next transaction ID. + */ +static inline TransactionId +ReadNextTransactionId(void) +{ + return XidFromFullTransactionId(ReadNextFullTransactionId()); +} + +/* return transaction ID backed up by amount, handling wraparound correctly */ +static inline TransactionId +TransactionIdRetreatedBy(TransactionId xid, uint32 amount) +{ + xid -= amount; + + while (xid < FirstNormalTransactionId) + xid--; + + return xid; +} + +/* return the older of the two IDs */ +static inline TransactionId +TransactionIdOlder(TransactionId a, TransactionId b) +{ + if (!TransactionIdIsValid(a)) + return b; + + if (!TransactionIdIsValid(b)) + return a; + + if (TransactionIdPrecedes(a, b)) + return a; + return b; +} + +/* return the older of the two IDs, assuming they're both normal */ +static inline TransactionId +NormalTransactionIdOlder(TransactionId a, TransactionId b) +{ + Assert(TransactionIdIsNormal(a)); + Assert(TransactionIdIsNormal(b)); + if (NormalTransactionIdPrecedes(a, b)) + return a; + return b; +} + +/* return the newer of the two IDs */ +static inline FullTransactionId +FullTransactionIdNewer(FullTransactionId a, FullTransactionId b) +{ + if (!FullTransactionIdIsValid(a)) + return b; + + if (!FullTransactionIdIsValid(b)) + return a; + + if (FullTransactionIdFollows(a, b)) + return a; + return b; +} + +#endif /* FRONTEND */ + +#endif /* TRANSAM_H */ diff --git a/src/include/access/tsmapi.h b/src/include/access/tsmapi.h new file mode 100644 index 0000000..80784be --- /dev/null +++ b/src/include/access/tsmapi.h @@ -0,0 +1,82 @@ +/*------------------------------------------------------------------------- + * + * tsmapi.h + * API for tablesample methods + * + * Copyright (c) 2015-2023, PostgreSQL Global Development Group + * + * src/include/access/tsmapi.h + * + *------------------------------------------------------------------------- + */ +#ifndef TSMAPI_H +#define TSMAPI_H + +#include "nodes/execnodes.h" +#include "nodes/pathnodes.h" + + +/* + * Callback function signatures --- see tablesample-method.sgml for more info. + */ + +typedef void (*SampleScanGetSampleSize_function) (PlannerInfo *root, + RelOptInfo *baserel, + List *paramexprs, + BlockNumber *pages, + double *tuples); + +typedef void (*InitSampleScan_function) (SampleScanState *node, + int eflags); + +typedef void (*BeginSampleScan_function) (SampleScanState *node, + Datum *params, + int nparams, + uint32 seed); + +typedef BlockNumber (*NextSampleBlock_function) (SampleScanState *node, + BlockNumber nblocks); + +typedef OffsetNumber (*NextSampleTuple_function) (SampleScanState *node, + BlockNumber blockno, + OffsetNumber maxoffset); + +typedef void (*EndSampleScan_function) (SampleScanState *node); + +/* + * TsmRoutine is the struct returned by a tablesample method's handler + * function. It provides pointers to the callback functions needed by the + * planner and executor, as well as additional information about the method. + * + * More function pointers are likely to be added in the future. + * Therefore it's recommended that the handler initialize the struct with + * makeNode(TsmRoutine) so that all fields are set to NULL. 
This will + * ensure that no fields are accidentally left undefined. + */ +typedef struct TsmRoutine +{ + NodeTag type; + + /* List of datatype OIDs for the arguments of the TABLESAMPLE clause */ + List *parameterTypes; + + /* Can method produce repeatable samples across, or even within, queries? */ + bool repeatable_across_queries; + bool repeatable_across_scans; + + /* Functions for planning a SampleScan on a physical table */ + SampleScanGetSampleSize_function SampleScanGetSampleSize; + + /* Functions for executing a SampleScan on a physical table */ + InitSampleScan_function InitSampleScan; /* can be NULL */ + BeginSampleScan_function BeginSampleScan; + NextSampleBlock_function NextSampleBlock; /* can be NULL */ + NextSampleTuple_function NextSampleTuple; + EndSampleScan_function EndSampleScan; /* can be NULL */ +} TsmRoutine; + + +/* Functions in access/tablesample/tablesample.c */ +extern TsmRoutine *GetTsmRoutine(Oid tsmhandler); + +#endif /* TSMAPI_H */ diff --git a/src/include/access/tupconvert.h b/src/include/access/tupconvert.h new file mode 100644 index 0000000..f00918b --- /dev/null +++ b/src/include/access/tupconvert.h @@ -0,0 +1,54 @@ +/*------------------------------------------------------------------------- + * + * tupconvert.h + * Tuple conversion support. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/tupconvert.h + * + *------------------------------------------------------------------------- + */ +#ifndef TUPCONVERT_H +#define TUPCONVERT_H + +#include "access/attmap.h" +#include "access/htup.h" +#include "access/tupdesc.h" +#include "executor/tuptable.h" +#include "nodes/bitmapset.h" + + +typedef struct TupleConversionMap +{ + TupleDesc indesc; /* tupdesc for source rowtype */ + TupleDesc outdesc; /* tupdesc for result rowtype */ + AttrMap *attrMap; /* indexes of input fields, or 0 for null */ + Datum *invalues; /* workspace for deconstructing source */ + bool *inisnull; + Datum *outvalues; /* workspace for constructing result */ + bool *outisnull; +} TupleConversionMap; + + +extern TupleConversionMap *convert_tuples_by_position(TupleDesc indesc, + TupleDesc outdesc, + const char *msg); + +extern TupleConversionMap *convert_tuples_by_name(TupleDesc indesc, + TupleDesc outdesc); +extern TupleConversionMap *convert_tuples_by_name_attrmap(TupleDesc indesc, + TupleDesc outdesc, + AttrMap *attrMap); + +extern HeapTuple execute_attr_map_tuple(HeapTuple tuple, TupleConversionMap *map); +extern TupleTableSlot *execute_attr_map_slot(AttrMap *attrMap, + TupleTableSlot *in_slot, + TupleTableSlot *out_slot); +extern Bitmapset *execute_attr_map_cols(AttrMap *attrMap, Bitmapset *in_cols); + +extern void free_conversion_map(TupleConversionMap *map); + +#endif /* TUPCONVERT_H */ diff --git a/src/include/access/tupdesc.h b/src/include/access/tupdesc.h new file mode 100644 index 0000000..b4286cf --- /dev/null +++ b/src/include/access/tupdesc.h @@ -0,0 +1,154 @@ +/*------------------------------------------------------------------------- + * + * tupdesc.h + * POSTGRES tuple descriptor definitions. 
+ * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/tupdesc.h + * + *------------------------------------------------------------------------- + */ +#ifndef TUPDESC_H +#define TUPDESC_H + +#include "access/attnum.h" +#include "catalog/pg_attribute.h" +#include "nodes/pg_list.h" + + +typedef struct AttrDefault +{ + AttrNumber adnum; + char *adbin; /* nodeToString representation of expr */ +} AttrDefault; + +typedef struct ConstrCheck +{ + char *ccname; + char *ccbin; /* nodeToString representation of expr */ + bool ccvalid; + bool ccnoinherit; /* this is a non-inheritable constraint */ +} ConstrCheck; + +/* This structure contains constraints of a tuple */ +typedef struct TupleConstr +{ + AttrDefault *defval; /* array */ + ConstrCheck *check; /* array */ + struct AttrMissing *missing; /* missing attributes values, NULL if none */ + uint16 num_defval; + uint16 num_check; + bool has_not_null; + bool has_generated_stored; +} TupleConstr; + +/* + * This struct is passed around within the backend to describe the structure + * of tuples. For tuples coming from on-disk relations, the information is + * collected from the pg_attribute, pg_attrdef, and pg_constraint catalogs. + * Transient row types (such as the result of a join query) have anonymous + * TupleDesc structs that generally omit any constraint info; therefore the + * structure is designed to let the constraints be omitted efficiently. + * + * Note that only user attributes, not system attributes, are mentioned in + * TupleDesc. + * + * If the tupdesc is known to correspond to a named rowtype (such as a table's + * rowtype) then tdtypeid identifies that type and tdtypmod is -1. Otherwise + * tdtypeid is RECORDOID, and tdtypmod can be either -1 for a fully anonymous + * row type, or a value >= 0 to allow the rowtype to be looked up in the + * typcache.c type cache. + * + * Note that tdtypeid is never the OID of a domain over composite, even if + * we are dealing with values that are known (at some higher level) to be of + * a domain-over-composite type. This is because tdtypeid/tdtypmod need to + * match up with the type labeling of composite Datums, and those are never + * explicitly marked as being of a domain type, either. + * + * Tuple descriptors that live in caches (relcache or typcache, at present) + * are reference-counted: they can be deleted when their reference count goes + * to zero. Tuple descriptors created by the executor need no reference + * counting, however: they are simply created in the appropriate memory + * context and go away when the context is freed. We set the tdrefcount + * field of such a descriptor to -1, while reference-counted descriptors + * always have tdrefcount >= 0. + */ +typedef struct TupleDescData +{ + int natts; /* number of attributes in the tuple */ + Oid tdtypeid; /* composite type ID for tuple type */ + int32 tdtypmod; /* typmod for tuple type */ + int tdrefcount; /* reference count, or -1 if not counting */ + TupleConstr *constr; /* constraints, or NULL if none */ + /* attrs[N] is the description of Attribute Number N+1 */ + FormData_pg_attribute attrs[FLEXIBLE_ARRAY_MEMBER]; +} TupleDescData; +typedef struct TupleDescData *TupleDesc; + +/* Accessor for the i'th attribute of tupdesc. 
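+ *
+ * For example (illustrative), fetching the name and type OID of the first
+ * column of a tuple descriptor:
+ *
+ *		Form_pg_attribute att = TupleDescAttr(tupdesc, 0);
+ *		const char *colname = NameStr(att->attname);
+ *		Oid			coltype = att->atttypid;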
*/ +#define TupleDescAttr(tupdesc, i) (&(tupdesc)->attrs[(i)]) + +extern TupleDesc CreateTemplateTupleDesc(int natts); + +extern TupleDesc CreateTupleDesc(int natts, Form_pg_attribute *attrs); + +extern TupleDesc CreateTupleDescCopy(TupleDesc tupdesc); + +extern TupleDesc CreateTupleDescCopyConstr(TupleDesc tupdesc); + +#define TupleDescSize(src) \ + (offsetof(struct TupleDescData, attrs) + \ + (src)->natts * sizeof(FormData_pg_attribute)) + +extern void TupleDescCopy(TupleDesc dst, TupleDesc src); + +extern void TupleDescCopyEntry(TupleDesc dst, AttrNumber dstAttno, + TupleDesc src, AttrNumber srcAttno); + +extern void FreeTupleDesc(TupleDesc tupdesc); + +extern void IncrTupleDescRefCount(TupleDesc tupdesc); +extern void DecrTupleDescRefCount(TupleDesc tupdesc); + +#define PinTupleDesc(tupdesc) \ + do { \ + if ((tupdesc)->tdrefcount >= 0) \ + IncrTupleDescRefCount(tupdesc); \ + } while (0) + +#define ReleaseTupleDesc(tupdesc) \ + do { \ + if ((tupdesc)->tdrefcount >= 0) \ + DecrTupleDescRefCount(tupdesc); \ + } while (0) + +extern bool equalTupleDescs(TupleDesc tupdesc1, TupleDesc tupdesc2); + +extern uint32 hashTupleDesc(TupleDesc desc); + +extern void TupleDescInitEntry(TupleDesc desc, + AttrNumber attributeNumber, + const char *attributeName, + Oid oidtypeid, + int32 typmod, + int attdim); + +extern void TupleDescInitBuiltinEntry(TupleDesc desc, + AttrNumber attributeNumber, + const char *attributeName, + Oid oidtypeid, + int32 typmod, + int attdim); + +extern void TupleDescInitEntryCollation(TupleDesc desc, + AttrNumber attributeNumber, + Oid collationid); + +extern TupleDesc BuildDescForRelation(List *schema); + +extern TupleDesc BuildDescFromLists(List *names, List *types, List *typmods, List *collations); + +#endif /* TUPDESC_H */ diff --git a/src/include/access/tupdesc_details.h b/src/include/access/tupdesc_details.h new file mode 100644 index 0000000..8d1b098 --- /dev/null +++ b/src/include/access/tupdesc_details.h @@ -0,0 +1,28 @@ +/*------------------------------------------------------------------------- + * + * tupdesc_details.h + * POSTGRES tuple descriptor definitions we can't include everywhere + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/tupdesc_details.h + * + *------------------------------------------------------------------------- + */ + +#ifndef TUPDESC_DETAILS_H +#define TUPDESC_DETAILS_H + +/* + * Structure used to represent value to be used when the attribute is not + * present at all in a tuple, i.e. when the column was created after the tuple + */ +typedef struct AttrMissing +{ + bool am_present; /* true if non-NULL missing value exists */ + Datum am_value; /* value when attribute is missing */ +} AttrMissing; + +#endif /* TUPDESC_DETAILS_H */ diff --git a/src/include/access/tupmacs.h b/src/include/access/tupmacs.h new file mode 100644 index 0000000..3414446 --- /dev/null +++ b/src/include/access/tupmacs.h @@ -0,0 +1,207 @@ +/*------------------------------------------------------------------------- + * + * tupmacs.h + * Tuple macros used by both index tuples and heap tuples. 
+ * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/tupmacs.h + * + *------------------------------------------------------------------------- + */ +#ifndef TUPMACS_H +#define TUPMACS_H + +#include "catalog/pg_type_d.h" /* for TYPALIGN macros */ + + +/* + * Check a tuple's null bitmap to determine whether the attribute is null. + * Note that a 0 in the null bitmap indicates a null, while 1 indicates + * non-null. + */ +static inline bool +att_isnull(int ATT, const bits8 *BITS) +{ + return !(BITS[ATT >> 3] & (1 << (ATT & 0x07))); +} + +#ifndef FRONTEND +/* + * Given a Form_pg_attribute and a pointer into a tuple's data area, + * return the correct value or pointer. + * + * We return a Datum value in all cases. If the attribute has "byval" false, + * we return the same pointer into the tuple data area that we're passed. + * Otherwise, we return the correct number of bytes fetched from the data + * area and extended to Datum form. + * + * On machines where Datum is 8 bytes, we support fetching 8-byte byval + * attributes; otherwise, only 1, 2, and 4-byte values are supported. + * + * Note that T must already be properly aligned for this to work correctly. + */ +#define fetchatt(A,T) fetch_att(T, (A)->attbyval, (A)->attlen) + +/* + * Same, but work from byval/len parameters rather than Form_pg_attribute. + */ +static inline Datum +fetch_att(const void *T, bool attbyval, int attlen) +{ + if (attbyval) + { + switch (attlen) + { + case sizeof(char): + return CharGetDatum(*((const char *) T)); + case sizeof(int16): + return Int16GetDatum(*((const int16 *) T)); + case sizeof(int32): + return Int32GetDatum(*((const int32 *) T)); +#if SIZEOF_DATUM == 8 + case sizeof(Datum): + return *((const Datum *) T); +#endif + default: + elog(ERROR, "unsupported byval length: %d", attlen); + return 0; + } + } + else + return PointerGetDatum(T); +} +#endif /* FRONTEND */ + +/* + * att_align_datum aligns the given offset as needed for a datum of alignment + * requirement attalign and typlen attlen. attdatum is the Datum variable + * we intend to pack into a tuple (it's only accessed if we are dealing with + * a varlena type). Note that this assumes the Datum will be stored as-is; + * callers that are intending to convert non-short varlena datums to short + * format have to account for that themselves. + */ +#define att_align_datum(cur_offset, attalign, attlen, attdatum) \ +( \ + ((attlen) == -1 && VARATT_IS_SHORT(DatumGetPointer(attdatum))) ? \ + (uintptr_t) (cur_offset) : \ + att_align_nominal(cur_offset, attalign) \ +) + +/* + * att_align_pointer performs the same calculation as att_align_datum, + * but is used when walking a tuple. attptr is the current actual data + * pointer; when accessing a varlena field we have to "peek" to see if we + * are looking at a pad byte or the first byte of a 1-byte-header datum. + * (A zero byte must be either a pad byte, or the first byte of a correctly + * aligned 4-byte length word; in either case we can align safely. A non-zero + * byte must be either a 1-byte length word, or the first byte of a correctly + * aligned 4-byte length word; in either case we need not align.) + * + * Note: some callers pass a "char *" pointer for cur_offset. This is + * a bit of a hack but should work all right as long as uintptr_t is the + * correct width. 
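+ *
+ * For illustration, a simplified version of the usual attribute-walking
+ * loop combines att_align_pointer, fetchatt and att_addlength_pointer
+ * roughly as below (sketch only, assuming tupdesc, natts, a pointer tp to
+ * the tuple data area and a caller-supplied Datum values[] array; the real
+ * deforming code also handles null bitmaps, attcacheoff, etc.):
+ *
+ *		long		off = 0;
+ *		int			i;
+ *
+ *		for (i = 0; i < natts; i++)
+ *		{
+ *			Form_pg_attribute att = TupleDescAttr(tupdesc, i);
+ *
+ *			off = att_align_pointer(off, att->attalign, att->attlen,
+ *									tp + off);
+ *			values[i] = fetchatt(att, tp + off);
+ *			off = att_addlength_pointer(off, att->attlen, tp + off);
+ *		}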
+ */ +#define att_align_pointer(cur_offset, attalign, attlen, attptr) \ +( \ + ((attlen) == -1 && VARATT_NOT_PAD_BYTE(attptr)) ? \ + (uintptr_t) (cur_offset) : \ + att_align_nominal(cur_offset, attalign) \ +) + +/* + * att_align_nominal aligns the given offset as needed for a datum of alignment + * requirement attalign, ignoring any consideration of packed varlena datums. + * There are three main use cases for using this macro directly: + * * we know that the att in question is not varlena (attlen != -1); + * in this case it is cheaper than the above macros and just as good. + * * we need to estimate alignment padding cost abstractly, ie without + * reference to a real tuple. We must assume the worst case that + * all varlenas are aligned. + * * within arrays and multiranges, we unconditionally align varlenas (XXX this + * should be revisited, probably). + * + * The attalign cases are tested in what is hopefully something like their + * frequency of occurrence. + */ +#define att_align_nominal(cur_offset, attalign) \ +( \ + ((attalign) == TYPALIGN_INT) ? INTALIGN(cur_offset) : \ + (((attalign) == TYPALIGN_CHAR) ? (uintptr_t) (cur_offset) : \ + (((attalign) == TYPALIGN_DOUBLE) ? DOUBLEALIGN(cur_offset) : \ + ( \ + AssertMacro((attalign) == TYPALIGN_SHORT), \ + SHORTALIGN(cur_offset) \ + ))) \ +) + +/* + * att_addlength_datum increments the given offset by the space needed for + * the given Datum variable. attdatum is only accessed if we are dealing + * with a variable-length attribute. + */ +#define att_addlength_datum(cur_offset, attlen, attdatum) \ + att_addlength_pointer(cur_offset, attlen, DatumGetPointer(attdatum)) + +/* + * att_addlength_pointer performs the same calculation as att_addlength_datum, + * but is used when walking a tuple --- attptr is the pointer to the field + * within the tuple. + * + * Note: some callers pass a "char *" pointer for cur_offset. This is + * actually perfectly OK, but probably should be cleaned up along with + * the same practice for att_align_pointer. + */ +#define att_addlength_pointer(cur_offset, attlen, attptr) \ +( \ + ((attlen) > 0) ? \ + ( \ + (cur_offset) + (attlen) \ + ) \ + : (((attlen) == -1) ? \ + ( \ + (cur_offset) + VARSIZE_ANY(attptr) \ + ) \ + : \ + ( \ + AssertMacro((attlen) == -2), \ + (cur_offset) + (strlen((char *) (attptr)) + 1) \ + )) \ +) + +#ifndef FRONTEND +/* + * store_att_byval is a partial inverse of fetch_att: store a given Datum + * value into a tuple data area at the specified address. However, it only + * handles the byval case, because in typical usage the caller needs to + * distinguish by-val and by-ref cases anyway, and so a do-it-all function + * wouldn't be convenient. + */ +static inline void +store_att_byval(void *T, Datum newdatum, int attlen) +{ + switch (attlen) + { + case sizeof(char): + *(char *) T = DatumGetChar(newdatum); + break; + case sizeof(int16): + *(int16 *) T = DatumGetInt16(newdatum); + break; + case sizeof(int32): + *(int32 *) T = DatumGetInt32(newdatum); + break; +#if SIZEOF_DATUM == 8 + case sizeof(Datum): + *(Datum *) T = newdatum; + break; +#endif + default: + elog(ERROR, "unsupported byval length: %d", attlen); + } +} +#endif /* FRONTEND */ + +#endif /* TUPMACS_H */ diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h new file mode 100644 index 0000000..21e2af7 --- /dev/null +++ b/src/include/access/twophase.h @@ -0,0 +1,65 @@ +/*------------------------------------------------------------------------- + * + * twophase.h + * Two-phase-commit related declarations. 
+ * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/twophase.h + * + *------------------------------------------------------------------------- + */ +#ifndef TWOPHASE_H +#define TWOPHASE_H + +#include "access/xact.h" +#include "access/xlogdefs.h" +#include "datatype/timestamp.h" +#include "storage/lock.h" + +/* + * GlobalTransactionData is defined in twophase.c; other places have no + * business knowing the internal definition. + */ +typedef struct GlobalTransactionData *GlobalTransaction; + +/* GUC variable */ +extern PGDLLIMPORT int max_prepared_xacts; + +extern Size TwoPhaseShmemSize(void); +extern void TwoPhaseShmemInit(void); + +extern void AtAbort_Twophase(void); +extern void PostPrepare_Twophase(void); + +extern TransactionId TwoPhaseGetXidByVirtualXID(VirtualTransactionId vxid, + bool *have_more); +extern PGPROC *TwoPhaseGetDummyProc(TransactionId xid, bool lock_held); +extern BackendId TwoPhaseGetDummyBackendId(TransactionId xid, bool lock_held); + +extern GlobalTransaction MarkAsPreparing(TransactionId xid, const char *gid, + TimestampTz prepared_at, + Oid owner, Oid databaseid); + +extern void StartPrepare(GlobalTransaction gxact); +extern void EndPrepare(GlobalTransaction gxact); +extern bool StandbyTransactionIdIsPrepared(TransactionId xid); + +extern TransactionId PrescanPreparedTransactions(TransactionId **xids_p, + int *nxids_p); +extern void StandbyRecoverPreparedTransactions(void); +extern void RecoverPreparedTransactions(void); + +extern void CheckPointTwoPhase(XLogRecPtr redo_horizon); + +extern void FinishPreparedTransaction(const char *gid, bool isCommit); + +extern void PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, + XLogRecPtr end_lsn, RepOriginId origin_id); +extern void PrepareRedoRemove(TransactionId xid, bool giveWarning); +extern void restoreTwoPhaseData(void); +extern bool LookupGXact(const char *gid, XLogRecPtr prepare_end_lsn, + TimestampTz origin_prepare_timestamp); +#endif /* TWOPHASE_H */ diff --git a/src/include/access/twophase_rmgr.h b/src/include/access/twophase_rmgr.h new file mode 100644 index 0000000..02f7ba2 --- /dev/null +++ b/src/include/access/twophase_rmgr.h @@ -0,0 +1,40 @@ +/*------------------------------------------------------------------------- + * + * twophase_rmgr.h + * Two-phase-commit resource managers definition + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/twophase_rmgr.h + * + *------------------------------------------------------------------------- + */ +#ifndef TWOPHASE_RMGR_H +#define TWOPHASE_RMGR_H + +typedef void (*TwoPhaseCallback) (TransactionId xid, uint16 info, + void *recdata, uint32 len); +typedef uint8 TwoPhaseRmgrId; + +/* + * Built-in resource managers + */ +#define TWOPHASE_RM_END_ID 0 +#define TWOPHASE_RM_LOCK_ID 1 +#define TWOPHASE_RM_PGSTAT_ID 2 +#define TWOPHASE_RM_MULTIXACT_ID 3 +#define TWOPHASE_RM_PREDICATELOCK_ID 4 +#define TWOPHASE_RM_MAX_ID TWOPHASE_RM_PREDICATELOCK_ID + +extern PGDLLIMPORT const TwoPhaseCallback twophase_recover_callbacks[]; +extern PGDLLIMPORT const TwoPhaseCallback twophase_postcommit_callbacks[]; +extern PGDLLIMPORT const TwoPhaseCallback twophase_postabort_callbacks[]; +extern PGDLLIMPORT const TwoPhaseCallback twophase_standby_recover_callbacks[]; + + +extern void RegisterTwoPhaseRecord(TwoPhaseRmgrId rmid, uint16 info, 
+ const void *data, uint32 len); + +#endif /* TWOPHASE_RMGR_H */ diff --git a/src/include/access/valid.h b/src/include/access/valid.h new file mode 100644 index 0000000..85d476a --- /dev/null +++ b/src/include/access/valid.h @@ -0,0 +1,58 @@ +/*------------------------------------------------------------------------- + * + * valid.h + * POSTGRES tuple qualification validity definitions. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/valid.h + * + *------------------------------------------------------------------------- + */ +#ifndef VALID_H +#define VALID_H + +#include "access/htup.h" +#include "access/htup_details.h" +#include "access/skey.h" +#include "access/tupdesc.h" + +/* + * HeapKeyTest + * + * Test a heap tuple to see if it satisfies a scan key. + */ +static inline bool +HeapKeyTest(HeapTuple tuple, TupleDesc tupdesc, int nkeys, ScanKey keys) +{ + int cur_nkeys = nkeys; + ScanKey cur_key = keys; + + for (; cur_nkeys--; cur_key++) + { + Datum atp; + bool isnull; + Datum test; + + if (cur_key->sk_flags & SK_ISNULL) + return false; + + atp = heap_getattr(tuple, cur_key->sk_attno, tupdesc, &isnull); + + if (isnull) + return false; + + test = FunctionCall2Coll(&cur_key->sk_func, + cur_key->sk_collation, + atp, cur_key->sk_argument); + + if (!DatumGetBool(test)) + return false; + } + + return true; +} + +#endif /* VALID_H */ diff --git a/src/include/access/visibilitymap.h b/src/include/access/visibilitymap.h new file mode 100644 index 0000000..daaa01a --- /dev/null +++ b/src/include/access/visibilitymap.h @@ -0,0 +1,42 @@ +/*------------------------------------------------------------------------- + * + * visibilitymap.h + * visibility map interface + * + * + * Portions Copyright (c) 2007-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/visibilitymap.h + * + *------------------------------------------------------------------------- + */ +#ifndef VISIBILITYMAP_H +#define VISIBILITYMAP_H + +#include "access/visibilitymapdefs.h" +#include "access/xlogdefs.h" +#include "storage/block.h" +#include "storage/buf.h" +#include "utils/relcache.h" + +/* Macros for visibilitymap test */ +#define VM_ALL_VISIBLE(r, b, v) \ + ((visibilitymap_get_status((r), (b), (v)) & VISIBILITYMAP_ALL_VISIBLE) != 0) +#define VM_ALL_FROZEN(r, b, v) \ + ((visibilitymap_get_status((r), (b), (v)) & VISIBILITYMAP_ALL_FROZEN) != 0) + +extern bool visibilitymap_clear(Relation rel, BlockNumber heapBlk, + Buffer vmbuf, uint8 flags); +extern void visibilitymap_pin(Relation rel, BlockNumber heapBlk, + Buffer *vmbuf); +extern bool visibilitymap_pin_ok(BlockNumber heapBlk, Buffer vmbuf); +extern void visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, + XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid, + uint8 flags); +extern uint8 visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf); +extern void visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_frozen); +extern BlockNumber visibilitymap_prepare_truncate(Relation rel, + BlockNumber nheapblocks); + +#endif /* VISIBILITYMAP_H */ diff --git a/src/include/access/visibilitymapdefs.h b/src/include/access/visibilitymapdefs.h new file mode 100644 index 0000000..8dfdbfa --- /dev/null +++ b/src/include/access/visibilitymapdefs.h @@ -0,0 +1,34 @@ 
+/*------------------------------------------------------------------------- + * + * visibilitymapdefs.h + * macros for accessing contents of visibility map pages + * + * + * Copyright (c) 2021-2023, PostgreSQL Global Development Group + * + * src/include/access/visibilitymapdefs.h + * + *------------------------------------------------------------------------- + */ +#ifndef VISIBILITYMAPDEFS_H +#define VISIBILITYMAPDEFS_H + +/* Number of bits for one heap page */ +#define BITS_PER_HEAPBLOCK 2 + +/* Flags for bit map */ +#define VISIBILITYMAP_ALL_VISIBLE 0x01 +#define VISIBILITYMAP_ALL_FROZEN 0x02 +#define VISIBILITYMAP_VALID_BITS 0x03 /* OR of all valid visibilitymap + * flags bits */ +/* + * To detect recovery conflicts during logical decoding on a standby, we need + * to know if a table is a user catalog table. For that we add an additional + * bit into xl_heap_visible.flags, in addition to the above. + * + * NB: VISIBILITYMAP_XLOG_* may not be passed to visibilitymap_set(). + */ +#define VISIBILITYMAP_XLOG_CATALOG_REL 0x04 +#define VISIBILITYMAP_XLOG_VALID_BITS (VISIBILITYMAP_VALID_BITS | VISIBILITYMAP_XLOG_CATALOG_REL) + +#endif /* VISIBILITYMAPDEFS_H */ diff --git a/src/include/access/xact.h b/src/include/access/xact.h new file mode 100644 index 0000000..7d3b944 --- /dev/null +++ b/src/include/access/xact.h @@ -0,0 +1,530 @@ +/*------------------------------------------------------------------------- + * + * xact.h + * postgres transaction system definitions + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/xact.h + * + *------------------------------------------------------------------------- + */ +#ifndef XACT_H +#define XACT_H + +#include "access/transam.h" +#include "access/xlogreader.h" +#include "datatype/timestamp.h" +#include "lib/stringinfo.h" +#include "nodes/pg_list.h" +#include "storage/relfilelocator.h" +#include "storage/sinval.h" + +/* + * Maximum size of Global Transaction ID (including '\0'). + * + * Note that the max value of GIDSIZE must fit in the uint16 gidlen, + * specified in TwoPhaseFileHeader. + */ +#define GIDSIZE 200 + +/* + * Xact isolation levels + */ +#define XACT_READ_UNCOMMITTED 0 +#define XACT_READ_COMMITTED 1 +#define XACT_REPEATABLE_READ 2 +#define XACT_SERIALIZABLE 3 + +extern PGDLLIMPORT int DefaultXactIsoLevel; +extern PGDLLIMPORT int XactIsoLevel; + +/* + * We implement three isolation levels internally. + * The two stronger ones use one snapshot per database transaction; + * the others use one snapshot per statement. + * Serializable uses predicate locks in addition to snapshots. + * These macros should be used to check which isolation level is selected. 
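+ *
+ * For example (illustrative):
+ *
+ *		if (IsolationUsesXactSnapshot())
+ *			... keep using the snapshot taken at transaction start ...
+ *		else
+ *			... take a fresh snapshot for the current statement ...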
+ */ +#define IsolationUsesXactSnapshot() (XactIsoLevel >= XACT_REPEATABLE_READ) +#define IsolationIsSerializable() (XactIsoLevel == XACT_SERIALIZABLE) + +/* Xact read-only state */ +extern PGDLLIMPORT bool DefaultXactReadOnly; +extern PGDLLIMPORT bool XactReadOnly; + +/* flag for logging statements in this transaction */ +extern PGDLLIMPORT bool xact_is_sampled; + +/* + * Xact is deferrable -- only meaningful (currently) for read only + * SERIALIZABLE transactions + */ +extern PGDLLIMPORT bool DefaultXactDeferrable; +extern PGDLLIMPORT bool XactDeferrable; + +typedef enum +{ + SYNCHRONOUS_COMMIT_OFF, /* asynchronous commit */ + SYNCHRONOUS_COMMIT_LOCAL_FLUSH, /* wait for local flush only */ + SYNCHRONOUS_COMMIT_REMOTE_WRITE, /* wait for local flush and remote + * write */ + SYNCHRONOUS_COMMIT_REMOTE_FLUSH, /* wait for local and remote flush */ + SYNCHRONOUS_COMMIT_REMOTE_APPLY /* wait for local and remote flush and + * remote apply */ +} SyncCommitLevel; + +/* Define the default setting for synchronous_commit */ +#define SYNCHRONOUS_COMMIT_ON SYNCHRONOUS_COMMIT_REMOTE_FLUSH + +/* Synchronous commit level */ +extern PGDLLIMPORT int synchronous_commit; + +/* used during logical streaming of a transaction */ +extern PGDLLIMPORT TransactionId CheckXidAlive; +extern PGDLLIMPORT bool bsysscan; + +/* + * Miscellaneous flag bits to record events which occur on the top level + * transaction. These flags are only persisted in MyXactFlags and are intended + * so we remember to do certain things later in the transaction. This is + * globally accessible, so can be set from anywhere in the code which requires + * recording flags. + */ +extern PGDLLIMPORT int MyXactFlags; + +/* + * XACT_FLAGS_ACCESSEDTEMPNAMESPACE - set when a temporary object is accessed. + * We don't allow PREPARE TRANSACTION in that case. + */ +#define XACT_FLAGS_ACCESSEDTEMPNAMESPACE (1U << 0) + +/* + * XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK - records whether the top level xact + * logged any Access Exclusive Locks. + */ +#define XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK (1U << 1) + +/* + * XACT_FLAGS_NEEDIMMEDIATECOMMIT - records whether the top level statement + * is one that requires immediate commit, such as CREATE DATABASE. + */ +#define XACT_FLAGS_NEEDIMMEDIATECOMMIT (1U << 2) + +/* + * XACT_FLAGS_PIPELINING - set when we complete an extended-query-protocol + * Execute message. This is useful for detecting that an implicit transaction + * block has been created via pipelining. 
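+ *
+ * These bits are set and tested in the usual flag-word style, for example
+ * (illustrative only):
+ *
+ *		MyXactFlags |= XACT_FLAGS_NEEDIMMEDIATECOMMIT;
+ *		...
+ *		if ((MyXactFlags & XACT_FLAGS_NEEDIMMEDIATECOMMIT) != 0)
+ *			... the current top-level statement requires immediate commit ...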
+ */ +#define XACT_FLAGS_PIPELINING (1U << 3) + +/* + * start- and end-of-transaction callbacks for dynamically loaded modules + */ +typedef enum +{ + XACT_EVENT_COMMIT, + XACT_EVENT_PARALLEL_COMMIT, + XACT_EVENT_ABORT, + XACT_EVENT_PARALLEL_ABORT, + XACT_EVENT_PREPARE, + XACT_EVENT_PRE_COMMIT, + XACT_EVENT_PARALLEL_PRE_COMMIT, + XACT_EVENT_PRE_PREPARE +} XactEvent; + +typedef void (*XactCallback) (XactEvent event, void *arg); + +typedef enum +{ + SUBXACT_EVENT_START_SUB, + SUBXACT_EVENT_COMMIT_SUB, + SUBXACT_EVENT_ABORT_SUB, + SUBXACT_EVENT_PRE_COMMIT_SUB +} SubXactEvent; + +typedef void (*SubXactCallback) (SubXactEvent event, SubTransactionId mySubid, + SubTransactionId parentSubid, void *arg); + +/* Data structure for Save/RestoreTransactionCharacteristics */ +typedef struct SavedTransactionCharacteristics +{ + int save_XactIsoLevel; + bool save_XactReadOnly; + bool save_XactDeferrable; +} SavedTransactionCharacteristics; + + +/* ---------------- + * transaction-related XLOG entries + * ---------------- + */ + +/* + * XLOG allows to store some information in high 4 bits of log record xl_info + * field. We use 3 for the opcode, and one about an optional flag variable. + */ +#define XLOG_XACT_COMMIT 0x00 +#define XLOG_XACT_PREPARE 0x10 +#define XLOG_XACT_ABORT 0x20 +#define XLOG_XACT_COMMIT_PREPARED 0x30 +#define XLOG_XACT_ABORT_PREPARED 0x40 +#define XLOG_XACT_ASSIGNMENT 0x50 +#define XLOG_XACT_INVALIDATIONS 0x60 +/* free opcode 0x70 */ + +/* mask for filtering opcodes out of xl_info */ +#define XLOG_XACT_OPMASK 0x70 + +/* does this record have a 'xinfo' field or not */ +#define XLOG_XACT_HAS_INFO 0x80 + +/* + * The following flags, stored in xinfo, determine which information is + * contained in commit/abort records. + */ +#define XACT_XINFO_HAS_DBINFO (1U << 0) +#define XACT_XINFO_HAS_SUBXACTS (1U << 1) +#define XACT_XINFO_HAS_RELFILELOCATORS (1U << 2) +#define XACT_XINFO_HAS_INVALS (1U << 3) +#define XACT_XINFO_HAS_TWOPHASE (1U << 4) +#define XACT_XINFO_HAS_ORIGIN (1U << 5) +#define XACT_XINFO_HAS_AE_LOCKS (1U << 6) +#define XACT_XINFO_HAS_GID (1U << 7) +#define XACT_XINFO_HAS_DROPPED_STATS (1U << 8) + +/* + * Also stored in xinfo, these indicating a variety of additional actions that + * need to occur when emulating transaction effects during recovery. + * + * They are named XactCompletion... to differentiate them from + * EOXact... routines which run at the end of the original transaction + * completion. + */ +#define XACT_COMPLETION_APPLY_FEEDBACK (1U << 29) +#define XACT_COMPLETION_UPDATE_RELCACHE_FILE (1U << 30) +#define XACT_COMPLETION_FORCE_SYNC_COMMIT (1U << 31) + +/* Access macros for above flags */ +#define XactCompletionApplyFeedback(xinfo) \ + ((xinfo & XACT_COMPLETION_APPLY_FEEDBACK) != 0) +#define XactCompletionRelcacheInitFileInval(xinfo) \ + ((xinfo & XACT_COMPLETION_UPDATE_RELCACHE_FILE) != 0) +#define XactCompletionForceSyncCommit(xinfo) \ + ((xinfo & XACT_COMPLETION_FORCE_SYNC_COMMIT) != 0) + +typedef struct xl_xact_assignment +{ + TransactionId xtop; /* assigned XID's top-level XID */ + int nsubxacts; /* number of subtransaction XIDs */ + TransactionId xsub[FLEXIBLE_ARRAY_MEMBER]; /* assigned subxids */ +} xl_xact_assignment; + +#define MinSizeOfXactAssignment offsetof(xl_xact_assignment, xsub) + +/* + * Commit and abort records can contain a lot of information. But a large + * portion of the records won't need all possible pieces of information. So we + * only include what's needed. 
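+ * (For illustration: the xinfo mechanism is described below; a hypothetical
+ * reader of such a record tests a bit such as
+ * (xinfo & XACT_XINFO_HAS_SUBXACTS) before consuming the optional
+ * xl_xact_subxacts chunk.)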
+ * + * A minimal commit/abort record only consists of a xl_xact_commit/abort + * struct. The presence of additional information is indicated by bits set in + * 'xl_xact_xinfo->xinfo'. The presence of the xinfo field itself is signaled + * by a set XLOG_XACT_HAS_INFO bit in the xl_info field. + * + * NB: All the individual data chunks should be sized to multiples of + * sizeof(int) and only require int32 alignment. If they require bigger + * alignment, they need to be copied upon reading. + */ + +/* sub-records for commit/abort */ + +typedef struct xl_xact_xinfo +{ + /* + * Even though we right now only require two bytes of space in xinfo we + * use four so following records don't have to care about alignment. + * Commit records can be large, so copying large portions isn't + * attractive. + */ + uint32 xinfo; +} xl_xact_xinfo; + +typedef struct xl_xact_dbinfo +{ + Oid dbId; /* MyDatabaseId */ + Oid tsId; /* MyDatabaseTableSpace */ +} xl_xact_dbinfo; + +typedef struct xl_xact_subxacts +{ + int nsubxacts; /* number of subtransaction XIDs */ + TransactionId subxacts[FLEXIBLE_ARRAY_MEMBER]; +} xl_xact_subxacts; +#define MinSizeOfXactSubxacts offsetof(xl_xact_subxacts, subxacts) + +typedef struct xl_xact_relfilelocators +{ + int nrels; /* number of relations */ + RelFileLocator xlocators[FLEXIBLE_ARRAY_MEMBER]; +} xl_xact_relfilelocators; +#define MinSizeOfXactRelfileLocators offsetof(xl_xact_relfilelocators, xlocators) + +/* + * A transactionally dropped statistics entry. + * + * Declared here rather than pgstat.h because pgstat.h can't be included from + * frontend code, but the WAL format needs to be readable by frontend + * programs. + */ +typedef struct xl_xact_stats_item +{ + int kind; + Oid dboid; + Oid objoid; +} xl_xact_stats_item; + +typedef struct xl_xact_stats_items +{ + int nitems; + xl_xact_stats_item items[FLEXIBLE_ARRAY_MEMBER]; +} xl_xact_stats_items; +#define MinSizeOfXactStatsItems offsetof(xl_xact_stats_items, items) + +typedef struct xl_xact_invals +{ + int nmsgs; /* number of shared inval msgs */ + SharedInvalidationMessage msgs[FLEXIBLE_ARRAY_MEMBER]; +} xl_xact_invals; +#define MinSizeOfXactInvals offsetof(xl_xact_invals, msgs) + +typedef struct xl_xact_twophase +{ + TransactionId xid; +} xl_xact_twophase; + +typedef struct xl_xact_origin +{ + XLogRecPtr origin_lsn; + TimestampTz origin_timestamp; +} xl_xact_origin; + +typedef struct xl_xact_commit +{ + TimestampTz xact_time; /* time of commit */ + + /* xl_xact_xinfo follows if XLOG_XACT_HAS_INFO */ + /* xl_xact_dbinfo follows if XINFO_HAS_DBINFO */ + /* xl_xact_subxacts follows if XINFO_HAS_SUBXACT */ + /* xl_xact_relfilelocators follows if XINFO_HAS_RELFILELOCATORS */ + /* xl_xact_stats_items follows if XINFO_HAS_DROPPED_STATS */ + /* xl_xact_invals follows if XINFO_HAS_INVALS */ + /* xl_xact_twophase follows if XINFO_HAS_TWOPHASE */ + /* twophase_gid follows if XINFO_HAS_GID. As a null-terminated string. */ + /* xl_xact_origin follows if XINFO_HAS_ORIGIN, stored unaligned! */ +} xl_xact_commit; +#define MinSizeOfXactCommit (offsetof(xl_xact_commit, xact_time) + sizeof(TimestampTz)) + +typedef struct xl_xact_abort +{ + TimestampTz xact_time; /* time of abort */ + + /* xl_xact_xinfo follows if XLOG_XACT_HAS_INFO */ + /* xl_xact_dbinfo follows if XINFO_HAS_DBINFO */ + /* xl_xact_subxacts follows if XINFO_HAS_SUBXACT */ + /* xl_xact_relfilelocators follows if XINFO_HAS_RELFILELOCATORS */ + /* xl_xact_stats_items follows if XINFO_HAS_DROPPED_STATS */ + /* No invalidation messages needed. 
*/ + /* xl_xact_twophase follows if XINFO_HAS_TWOPHASE */ + /* twophase_gid follows if XINFO_HAS_GID. As a null-terminated string. */ + /* xl_xact_origin follows if XINFO_HAS_ORIGIN, stored unaligned! */ +} xl_xact_abort; +#define MinSizeOfXactAbort sizeof(xl_xact_abort) + +typedef struct xl_xact_prepare +{ + uint32 magic; /* format identifier */ + uint32 total_len; /* actual file length */ + TransactionId xid; /* original transaction XID */ + Oid database; /* OID of database it was in */ + TimestampTz prepared_at; /* time of preparation */ + Oid owner; /* user running the transaction */ + int32 nsubxacts; /* number of following subxact XIDs */ + int32 ncommitrels; /* number of delete-on-commit rels */ + int32 nabortrels; /* number of delete-on-abort rels */ + int32 ncommitstats; /* number of stats to drop on commit */ + int32 nabortstats; /* number of stats to drop on abort */ + int32 ninvalmsgs; /* number of cache invalidation messages */ + bool initfileinval; /* does relcache init file need invalidation? */ + uint16 gidlen; /* length of the GID - GID follows the header */ + XLogRecPtr origin_lsn; /* lsn of this record at origin node */ + TimestampTz origin_timestamp; /* time of prepare at origin node */ +} xl_xact_prepare; + +/* + * Commit/Abort records in the above form are a bit verbose to parse, so + * there's a deconstructed versions generated by ParseCommit/AbortRecord() for + * easier consumption. + */ +typedef struct xl_xact_parsed_commit +{ + TimestampTz xact_time; + uint32 xinfo; + + Oid dbId; /* MyDatabaseId */ + Oid tsId; /* MyDatabaseTableSpace */ + + int nsubxacts; + TransactionId *subxacts; + + int nrels; + RelFileLocator *xlocators; + + int nstats; + xl_xact_stats_item *stats; + + int nmsgs; + SharedInvalidationMessage *msgs; + + TransactionId twophase_xid; /* only for 2PC */ + char twophase_gid[GIDSIZE]; /* only for 2PC */ + int nabortrels; /* only for 2PC */ + RelFileLocator *abortlocators; /* only for 2PC */ + int nabortstats; /* only for 2PC */ + xl_xact_stats_item *abortstats; /* only for 2PC */ + + XLogRecPtr origin_lsn; + TimestampTz origin_timestamp; +} xl_xact_parsed_commit; + +typedef xl_xact_parsed_commit xl_xact_parsed_prepare; + +typedef struct xl_xact_parsed_abort +{ + TimestampTz xact_time; + uint32 xinfo; + + Oid dbId; /* MyDatabaseId */ + Oid tsId; /* MyDatabaseTableSpace */ + + int nsubxacts; + TransactionId *subxacts; + + int nrels; + RelFileLocator *xlocators; + + int nstats; + xl_xact_stats_item *stats; + + TransactionId twophase_xid; /* only for 2PC */ + char twophase_gid[GIDSIZE]; /* only for 2PC */ + + XLogRecPtr origin_lsn; + TimestampTz origin_timestamp; +} xl_xact_parsed_abort; + + +/* ---------------- + * extern definitions + * ---------------- + */ +extern bool IsTransactionState(void); +extern bool IsAbortedTransactionBlockState(void); +extern TransactionId GetTopTransactionId(void); +extern TransactionId GetTopTransactionIdIfAny(void); +extern TransactionId GetCurrentTransactionId(void); +extern TransactionId GetCurrentTransactionIdIfAny(void); +extern TransactionId GetStableLatestTransactionId(void); +extern SubTransactionId GetCurrentSubTransactionId(void); +extern FullTransactionId GetTopFullTransactionId(void); +extern FullTransactionId GetTopFullTransactionIdIfAny(void); +extern FullTransactionId GetCurrentFullTransactionId(void); +extern FullTransactionId GetCurrentFullTransactionIdIfAny(void); +extern void MarkCurrentTransactionIdLoggedIfAny(void); +extern bool SubTransactionIsActive(SubTransactionId subxid); +extern CommandId 
GetCurrentCommandId(bool used); +extern void SetParallelStartTimestamps(TimestampTz xact_ts, TimestampTz stmt_ts); +extern TimestampTz GetCurrentTransactionStartTimestamp(void); +extern TimestampTz GetCurrentStatementStartTimestamp(void); +extern TimestampTz GetCurrentTransactionStopTimestamp(void); +extern void SetCurrentStatementStartTimestamp(void); +extern int GetCurrentTransactionNestLevel(void); +extern bool TransactionIdIsCurrentTransactionId(TransactionId xid); +extern void CommandCounterIncrement(void); +extern void ForceSyncCommit(void); +extern void StartTransactionCommand(void); +extern void SaveTransactionCharacteristics(SavedTransactionCharacteristics *s); +extern void RestoreTransactionCharacteristics(const SavedTransactionCharacteristics *s); +extern void CommitTransactionCommand(void); +extern void AbortCurrentTransaction(void); +extern void BeginTransactionBlock(void); +extern bool EndTransactionBlock(bool chain); +extern bool PrepareTransactionBlock(const char *gid); +extern void UserAbortTransactionBlock(bool chain); +extern void BeginImplicitTransactionBlock(void); +extern void EndImplicitTransactionBlock(void); +extern void ReleaseSavepoint(const char *name); +extern void DefineSavepoint(const char *name); +extern void RollbackToSavepoint(const char *name); +extern void BeginInternalSubTransaction(const char *name); +extern void ReleaseCurrentSubTransaction(void); +extern void RollbackAndReleaseCurrentSubTransaction(void); +extern bool IsSubTransaction(void); +extern Size EstimateTransactionStateSpace(void); +extern void SerializeTransactionState(Size maxsize, char *start_address); +extern void StartParallelWorkerTransaction(char *tstatespace); +extern void EndParallelWorkerTransaction(void); +extern bool IsTransactionBlock(void); +extern bool IsTransactionOrTransactionBlock(void); +extern char TransactionBlockStatusCode(void); +extern void AbortOutOfAnyTransaction(void); +extern void PreventInTransactionBlock(bool isTopLevel, const char *stmtType); +extern void RequireTransactionBlock(bool isTopLevel, const char *stmtType); +extern void WarnNoTransactionBlock(bool isTopLevel, const char *stmtType); +extern bool IsInTransactionBlock(bool isTopLevel); +extern void RegisterXactCallback(XactCallback callback, void *arg); +extern void UnregisterXactCallback(XactCallback callback, void *arg); +extern void RegisterSubXactCallback(SubXactCallback callback, void *arg); +extern void UnregisterSubXactCallback(SubXactCallback callback, void *arg); + +extern bool IsSubxactTopXidLogPending(void); +extern void MarkSubxactTopXidLogged(void); + +extern int xactGetCommittedChildren(TransactionId **ptr); + +extern XLogRecPtr XactLogCommitRecord(TimestampTz commit_time, + int nsubxacts, TransactionId *subxacts, + int nrels, RelFileLocator *rels, + int ndroppedstats, + xl_xact_stats_item *droppedstats, + int nmsgs, SharedInvalidationMessage *msgs, + bool relcacheInval, + int xactflags, + TransactionId twophase_xid, + const char *twophase_gid); + +extern XLogRecPtr XactLogAbortRecord(TimestampTz abort_time, + int nsubxacts, TransactionId *subxacts, + int nrels, RelFileLocator *rels, + int ndroppedstats, + xl_xact_stats_item *droppedstats, + int xactflags, TransactionId twophase_xid, + const char *twophase_gid); +extern void xact_redo(XLogReaderState *record); + +/* xactdesc.c */ +extern void xact_desc(StringInfo buf, XLogReaderState *record); +extern const char *xact_identify(uint8 info); + +/* also in xactdesc.c, so they can be shared between front/backend code */ +extern void 
ParseCommitRecord(uint8 info, xl_xact_commit *xlrec, xl_xact_parsed_commit *parsed); +extern void ParseAbortRecord(uint8 info, xl_xact_abort *xlrec, xl_xact_parsed_abort *parsed); +extern void ParsePrepareRecord(uint8 info, xl_xact_prepare *xlrec, xl_xact_parsed_prepare *parsed); + +extern void EnterParallelMode(void); +extern void ExitParallelMode(void); +extern bool IsInParallelMode(void); + +#endif /* XACT_H */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h new file mode 100644 index 0000000..48ca852 --- /dev/null +++ b/src/include/access/xlog.h @@ -0,0 +1,302 @@ +/* + * xlog.h + * + * PostgreSQL write-ahead log manager + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/xlog.h + */ +#ifndef XLOG_H +#define XLOG_H + +#include "access/xlogbackup.h" +#include "access/xlogdefs.h" +#include "datatype/timestamp.h" +#include "lib/stringinfo.h" +#include "nodes/pg_list.h" + + +/* Sync methods */ +#define SYNC_METHOD_FSYNC 0 +#define SYNC_METHOD_FDATASYNC 1 +#define SYNC_METHOD_OPEN 2 /* for O_SYNC */ +#define SYNC_METHOD_FSYNC_WRITETHROUGH 3 +#define SYNC_METHOD_OPEN_DSYNC 4 /* for O_DSYNC */ +extern PGDLLIMPORT int sync_method; + +extern PGDLLIMPORT XLogRecPtr ProcLastRecPtr; +extern PGDLLIMPORT XLogRecPtr XactLastRecEnd; +extern PGDLLIMPORT XLogRecPtr XactLastCommitEnd; + +/* these variables are GUC parameters related to XLOG */ +extern PGDLLIMPORT int wal_segment_size; +extern PGDLLIMPORT int min_wal_size_mb; +extern PGDLLIMPORT int max_wal_size_mb; +extern PGDLLIMPORT int wal_keep_size_mb; +extern PGDLLIMPORT int max_slot_wal_keep_size_mb; +extern PGDLLIMPORT int XLOGbuffers; +extern PGDLLIMPORT int XLogArchiveTimeout; +extern PGDLLIMPORT int wal_retrieve_retry_interval; +extern PGDLLIMPORT char *XLogArchiveCommand; +extern PGDLLIMPORT bool EnableHotStandby; +extern PGDLLIMPORT bool fullPageWrites; +extern PGDLLIMPORT bool wal_log_hints; +extern PGDLLIMPORT int wal_compression; +extern PGDLLIMPORT bool wal_init_zero; +extern PGDLLIMPORT bool wal_recycle; +extern PGDLLIMPORT bool *wal_consistency_checking; +extern PGDLLIMPORT char *wal_consistency_checking_string; +extern PGDLLIMPORT bool log_checkpoints; +extern PGDLLIMPORT bool track_wal_io_timing; +extern PGDLLIMPORT int wal_decode_buffer_size; + +extern PGDLLIMPORT int CheckPointSegments; + +/* Archive modes */ +typedef enum ArchiveMode +{ + ARCHIVE_MODE_OFF = 0, /* disabled */ + ARCHIVE_MODE_ON, /* enabled while server is running normally */ + ARCHIVE_MODE_ALWAYS /* enabled always (even during recovery) */ +} ArchiveMode; +extern PGDLLIMPORT int XLogArchiveMode; + +/* WAL levels */ +typedef enum WalLevel +{ + WAL_LEVEL_MINIMAL = 0, + WAL_LEVEL_REPLICA, + WAL_LEVEL_LOGICAL +} WalLevel; + +/* Compression algorithms for WAL */ +typedef enum WalCompression +{ + WAL_COMPRESSION_NONE = 0, + WAL_COMPRESSION_PGLZ, + WAL_COMPRESSION_LZ4, + WAL_COMPRESSION_ZSTD +} WalCompression; + +/* Recovery states */ +typedef enum RecoveryState +{ + RECOVERY_STATE_CRASH = 0, /* crash recovery */ + RECOVERY_STATE_ARCHIVE, /* archive recovery */ + RECOVERY_STATE_DONE /* currently in production */ +} RecoveryState; + +extern PGDLLIMPORT int wal_level; + +/* Is WAL archiving enabled (always or only while server is running normally)? 
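+ *
+ * For illustration, a hypothetical caller deciding whether to mark a finished
+ * segment for archival (this code does not appear in this header; see
+ * xlogarchive.h for XLogArchiveNotifySeg):
+ *
+ *     if (XLogArchivingActive())
+ *         XLogArchiveNotifySeg(segno, tli);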
*/ +#define XLogArchivingActive() \ + (AssertMacro(XLogArchiveMode == ARCHIVE_MODE_OFF || wal_level >= WAL_LEVEL_REPLICA), XLogArchiveMode > ARCHIVE_MODE_OFF) +/* Is WAL archiving enabled always (even during recovery)? */ +#define XLogArchivingAlways() \ + (AssertMacro(XLogArchiveMode == ARCHIVE_MODE_OFF || wal_level >= WAL_LEVEL_REPLICA), XLogArchiveMode == ARCHIVE_MODE_ALWAYS) + +/* + * Is WAL-logging necessary for archival or log-shipping, or can we skip + * WAL-logging if we fsync() the data before committing instead? + */ +#define XLogIsNeeded() (wal_level >= WAL_LEVEL_REPLICA) + +/* + * Is a full-page image needed for hint bit updates? + * + * Normally, we don't WAL-log hint bit updates, but if checksums are enabled, + * we have to protect them against torn page writes. When you only set + * individual bits on a page, it's still consistent no matter what combination + * of the bits make it to disk, but the checksum wouldn't match. Also WAL-log + * them if forced by wal_log_hints=on. + */ +#define XLogHintBitIsNeeded() (DataChecksumsEnabled() || wal_log_hints) + +/* Do we need to WAL-log information required only for Hot Standby and logical replication? */ +#define XLogStandbyInfoActive() (wal_level >= WAL_LEVEL_REPLICA) + +/* Do we need to WAL-log information required only for logical replication? */ +#define XLogLogicalInfoActive() (wal_level >= WAL_LEVEL_LOGICAL) + +#ifdef WAL_DEBUG +extern PGDLLIMPORT bool XLOG_DEBUG; +#endif + +/* + * OR-able request flag bits for checkpoints. The "cause" bits are used only + * for logging purposes. Note: the flags must be defined so that it's + * sensible to OR together request flags arising from different requestors. + */ + +/* These directly affect the behavior of CreateCheckPoint and subsidiaries */ +#define CHECKPOINT_IS_SHUTDOWN 0x0001 /* Checkpoint is for shutdown */ +#define CHECKPOINT_END_OF_RECOVERY 0x0002 /* Like shutdown checkpoint, but + * issued at end of WAL recovery */ +#define CHECKPOINT_IMMEDIATE 0x0004 /* Do it without delays */ +#define CHECKPOINT_FORCE 0x0008 /* Force even if no activity */ +#define CHECKPOINT_FLUSH_ALL 0x0010 /* Flush all pages, including those + * belonging to unlogged tables */ +/* These are important to RequestCheckpoint */ +#define CHECKPOINT_WAIT 0x0020 /* Wait for completion */ +#define CHECKPOINT_REQUESTED 0x0040 /* Checkpoint request has been made */ +/* These indicate the cause of a checkpoint request */ +#define CHECKPOINT_CAUSE_XLOG 0x0080 /* XLOG consumption */ +#define CHECKPOINT_CAUSE_TIME 0x0100 /* Elapsed time */ + +/* + * Flag bits for the record being inserted, set using XLogSetRecordFlags(). 
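+ *
+ * For illustration, a sketch of a typical insertion sequence (the record
+ * variables are illustrative; the XLog* functions themselves are declared in
+ * xloginsert.h):
+ *
+ *     XLogBeginInsert();
+ *     XLogRegisterData((char *) &xlrec, sizeof(xlrec));
+ *     XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
+ *     recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT);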
+ */ +#define XLOG_INCLUDE_ORIGIN 0x01 /* include the replication origin */ +#define XLOG_MARK_UNIMPORTANT 0x02 /* record not important for durability */ + + +/* Checkpoint statistics */ +typedef struct CheckpointStatsData +{ + TimestampTz ckpt_start_t; /* start of checkpoint */ + TimestampTz ckpt_write_t; /* start of flushing buffers */ + TimestampTz ckpt_sync_t; /* start of fsyncs */ + TimestampTz ckpt_sync_end_t; /* end of fsyncs */ + TimestampTz ckpt_end_t; /* end of checkpoint */ + + int ckpt_bufs_written; /* # of buffers written */ + + int ckpt_segs_added; /* # of new xlog segments created */ + int ckpt_segs_removed; /* # of xlog segments deleted */ + int ckpt_segs_recycled; /* # of xlog segments recycled */ + + int ckpt_sync_rels; /* # of relations synced */ + uint64 ckpt_longest_sync; /* Longest sync for one relation */ + uint64 ckpt_agg_sync_time; /* The sum of all the individual sync + * times, which is not necessarily the + * same as the total elapsed time for the + * entire sync phase. */ +} CheckpointStatsData; + +extern PGDLLIMPORT CheckpointStatsData CheckpointStats; + +/* + * GetWALAvailability return codes + */ +typedef enum WALAvailability +{ + WALAVAIL_INVALID_LSN, /* parameter error */ + WALAVAIL_RESERVED, /* WAL segment is within max_wal_size */ + WALAVAIL_EXTENDED, /* WAL segment is reserved by a slot or + * wal_keep_size */ + WALAVAIL_UNRESERVED, /* no longer reserved, but not removed yet */ + WALAVAIL_REMOVED /* WAL segment has been removed */ +} WALAvailability; + +struct XLogRecData; +struct XLogReaderState; + +extern XLogRecPtr XLogInsertRecord(struct XLogRecData *rdata, + XLogRecPtr fpw_lsn, + uint8 flags, + int num_fpi, + bool topxid_included); +extern void XLogFlush(XLogRecPtr record); +extern bool XLogBackgroundFlush(void); +extern bool XLogNeedsFlush(XLogRecPtr record); +extern int XLogFileInit(XLogSegNo logsegno, TimeLineID logtli); +extern int XLogFileOpen(XLogSegNo segno, TimeLineID tli); + +extern void CheckXLogRemoved(XLogSegNo segno, TimeLineID tli); +extern XLogSegNo XLogGetLastRemovedSegno(void); +extern void XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN); +extern void XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn); + +extern void xlog_redo(struct XLogReaderState *record); +extern void xlog_desc(StringInfo buf, struct XLogReaderState *record); +extern const char *xlog_identify(uint8 info); + +extern void issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli); + +extern bool RecoveryInProgress(void); +extern RecoveryState GetRecoveryState(void); +extern bool XLogInsertAllowed(void); +extern XLogRecPtr GetXLogInsertRecPtr(void); +extern XLogRecPtr GetXLogWriteRecPtr(void); + +extern uint64 GetSystemIdentifier(void); +extern char *GetMockAuthenticationNonce(void); +extern bool DataChecksumsEnabled(void); +extern XLogRecPtr GetFakeLSNForUnloggedRel(void); +extern Size XLOGShmemSize(void); +extern void XLOGShmemInit(void); +extern void BootStrapXLOG(void); +extern void InitializeWalConsistencyChecking(void); +extern void LocalProcessControlFile(bool reset); +extern WalLevel GetActiveWalLevelOnStandby(void); +extern void StartupXLOG(void); +extern void ShutdownXLOG(int code, Datum arg); +extern void CreateCheckPoint(int flags); +extern bool CreateRestartPoint(int flags); +extern WALAvailability GetWALAvailability(XLogRecPtr targetLSN); +extern void XLogPutNextOid(Oid nextOid); +extern XLogRecPtr XLogRestorePoint(const char *rpName); +extern void UpdateFullPageWrites(void); +extern void GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool 
*doPageWrites_p); +extern XLogRecPtr GetRedoRecPtr(void); +extern XLogRecPtr GetInsertRecPtr(void); +extern XLogRecPtr GetFlushRecPtr(TimeLineID *insertTLI); +extern TimeLineID GetWALInsertionTimeLine(void); +extern XLogRecPtr GetLastImportantRecPtr(void); + +extern void SetWalWriterSleeping(bool sleeping); + +/* + * Routines used by xlogrecovery.c to call back into xlog.c during recovery. + */ +extern void RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI); +extern bool XLogCheckpointNeeded(XLogSegNo new_segno); +extern void SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI); +extern void ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli); +extern void SetInstallXLogFileSegmentActive(void); +extern bool IsInstallXLogFileSegmentActive(void); +extern void XLogShutdownWalRcv(void); + +/* + * Routines to start, stop, and get status of a base backup. + */ + +/* + * Session-level status of base backups + * + * This is used in parallel with the shared memory status to control parallel + * execution of base backup functions for a given session, be it a backend + * dedicated to replication or a normal backend connected to a database. The + * update of the session-level status happens at the same time as the shared + * memory counters to keep a consistent global and local state of the backups + * running. + */ +typedef enum SessionBackupState +{ + SESSION_BACKUP_NONE, + SESSION_BACKUP_RUNNING, +} SessionBackupState; + +extern void do_pg_backup_start(const char *backupidstr, bool fast, + List **tablespaces, BackupState *state, + StringInfo tblspcmapfile); +extern void do_pg_backup_stop(BackupState *state, bool waitforarchive); +extern void do_pg_abort_backup(int code, Datum arg); +extern void register_persistent_abort_backup_handler(void); +extern SessionBackupState get_backup_status(void); + +/* File path names (all relative to $PGDATA) */ +#define RECOVERY_SIGNAL_FILE "recovery.signal" +#define STANDBY_SIGNAL_FILE "standby.signal" +#define BACKUP_LABEL_FILE "backup_label" +#define BACKUP_LABEL_OLD "backup_label.old" + +#define TABLESPACE_MAP "tablespace_map" +#define TABLESPACE_MAP_OLD "tablespace_map.old" + +/* files to signal promotion to primary */ +#define PROMOTE_SIGNAL_FILE "promote" + +#endif /* XLOG_H */ diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h new file mode 100644 index 0000000..b0fd338 --- /dev/null +++ b/src/include/access/xlog_internal.h @@ -0,0 +1,404 @@ +/* + * xlog_internal.h + * + * PostgreSQL write-ahead log internal declarations + * + * NOTE: this file is intended to contain declarations useful for + * manipulating the XLOG files directly, but it is not supposed to be + * needed by rmgr routines (redo support for individual record types). + * So the XLogRecord typedef and associated stuff appear in xlogrecord.h. + * + * Note: This file must be includable in both frontend and backend contexts, + * to allow stand-alone tools like pg_receivewal to deal with WAL files. 
+ * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/xlog_internal.h + */ +#ifndef XLOG_INTERNAL_H +#define XLOG_INTERNAL_H + +#include "access/xlogdefs.h" +#include "access/xlogreader.h" +#include "datatype/timestamp.h" +#include "lib/stringinfo.h" +#include "pgtime.h" +#include "storage/block.h" +#include "storage/relfilelocator.h" + + +/* + * Each page of XLOG file has a header like this: + */ +#define XLOG_PAGE_MAGIC 0xD113 /* can be used as WAL version indicator */ + +typedef struct XLogPageHeaderData +{ + uint16 xlp_magic; /* magic value for correctness checks */ + uint16 xlp_info; /* flag bits, see below */ + TimeLineID xlp_tli; /* TimeLineID of first record on page */ + XLogRecPtr xlp_pageaddr; /* XLOG address of this page */ + + /* + * When there is not enough space on current page for whole record, we + * continue on the next page. xlp_rem_len is the number of bytes + * remaining from a previous page; it tracks xl_tot_len in the initial + * header. Note that the continuation data isn't necessarily aligned. + */ + uint32 xlp_rem_len; /* total len of remaining data for record */ +} XLogPageHeaderData; + +#define SizeOfXLogShortPHD MAXALIGN(sizeof(XLogPageHeaderData)) + +typedef XLogPageHeaderData *XLogPageHeader; + +/* + * When the XLP_LONG_HEADER flag is set, we store additional fields in the + * page header. (This is ordinarily done just in the first page of an + * XLOG file.) The additional fields serve to identify the file accurately. + */ +typedef struct XLogLongPageHeaderData +{ + XLogPageHeaderData std; /* standard header fields */ + uint64 xlp_sysid; /* system identifier from pg_control */ + uint32 xlp_seg_size; /* just as a cross-check */ + uint32 xlp_xlog_blcksz; /* just as a cross-check */ +} XLogLongPageHeaderData; + +#define SizeOfXLogLongPHD MAXALIGN(sizeof(XLogLongPageHeaderData)) + +typedef XLogLongPageHeaderData *XLogLongPageHeader; + +/* When record crosses page boundary, set this flag in new page's header */ +#define XLP_FIRST_IS_CONTRECORD 0x0001 +/* This flag indicates a "long" page header */ +#define XLP_LONG_HEADER 0x0002 +/* This flag indicates backup blocks starting in this page are optional */ +#define XLP_BKP_REMOVABLE 0x0004 +/* Replaces a missing contrecord; see CreateOverwriteContrecordRecord */ +#define XLP_FIRST_IS_OVERWRITE_CONTRECORD 0x0008 +/* All defined flag bits in xlp_info (used for validity checking of header) */ +#define XLP_ALL_FLAGS 0x000F + +#define XLogPageHeaderSize(hdr) \ + (((hdr)->xlp_info & XLP_LONG_HEADER) ? SizeOfXLogLongPHD : SizeOfXLogShortPHD) + +/* wal_segment_size can range from 1MB to 1GB */ +#define WalSegMinSize 1024 * 1024 +#define WalSegMaxSize 1024 * 1024 * 1024 +/* default number of min and max wal segments */ +#define DEFAULT_MIN_WAL_SEGS 5 +#define DEFAULT_MAX_WAL_SEGS 64 + +/* check that the given size is a valid wal_segment_size */ +#define IsPowerOf2(x) (x > 0 && ((x) & ((x)-1)) == 0) +#define IsValidWalSegSize(size) \ + (IsPowerOf2(size) && \ + ((size) >= WalSegMinSize && (size) <= WalSegMaxSize)) + +#define XLogSegmentsPerXLogId(wal_segsz_bytes) \ + (UINT64CONST(0x100000000) / (wal_segsz_bytes)) + +#define XLogSegNoOffsetToRecPtr(segno, offset, wal_segsz_bytes, dest) \ + (dest) = (segno) * (wal_segsz_bytes) + (offset) + +#define XLogSegmentOffset(xlogptr, wal_segsz_bytes) \ + ((xlogptr) & ((wal_segsz_bytes) - 1)) + +/* + * Compute a segment number from an XLogRecPtr. 
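+ * (For illustration, assuming the default 16MB segment size: an LSN of
+ * 0x2000000 lies in segment 2, since 0x2000000 / 0x1000000 == 2, while the
+ * byte just before it, 0x1FFFFFF, lies in segment 1.)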
+ * + * For XLByteToSeg, do the computation at face value. For XLByteToPrevSeg, + * a boundary byte is taken to be in the previous segment. This is suitable + * for deciding which segment to write given a pointer to a record end, + * for example. + */ +#define XLByteToSeg(xlrp, logSegNo, wal_segsz_bytes) \ + logSegNo = (xlrp) / (wal_segsz_bytes) + +#define XLByteToPrevSeg(xlrp, logSegNo, wal_segsz_bytes) \ + logSegNo = ((xlrp) - 1) / (wal_segsz_bytes) + +/* + * Convert values of GUCs measured in megabytes to equiv. segment count. + * Rounds down. + */ +#define XLogMBVarToSegs(mbvar, wal_segsz_bytes) \ + ((mbvar) / ((wal_segsz_bytes) / (1024 * 1024))) + +/* + * Is an XLogRecPtr within a particular XLOG segment? + * + * For XLByteInSeg, do the computation at face value. For XLByteInPrevSeg, + * a boundary byte is taken to be in the previous segment. + */ +#define XLByteInSeg(xlrp, logSegNo, wal_segsz_bytes) \ + (((xlrp) / (wal_segsz_bytes)) == (logSegNo)) + +#define XLByteInPrevSeg(xlrp, logSegNo, wal_segsz_bytes) \ + ((((xlrp) - 1) / (wal_segsz_bytes)) == (logSegNo)) + +/* Check if an XLogRecPtr value is in a plausible range */ +#define XRecOffIsValid(xlrp) \ + ((xlrp) % XLOG_BLCKSZ >= SizeOfXLogShortPHD) + +/* + * The XLog directory and control file (relative to $PGDATA) + */ +#define XLOGDIR "pg_wal" +#define XLOG_CONTROL_FILE "global/pg_control" + +/* + * These macros encapsulate knowledge about the exact layout of XLog file + * names, timeline history file names, and archive-status file names. + */ +#define MAXFNAMELEN 64 + +/* Length of XLog file name */ +#define XLOG_FNAME_LEN 24 + +/* + * Generate a WAL segment file name. Do not use this function in a helper + * function allocating the result generated. + */ +static inline void +XLogFileName(char *fname, TimeLineID tli, XLogSegNo logSegNo, int wal_segsz_bytes) +{ + snprintf(fname, MAXFNAMELEN, "%08X%08X%08X", tli, + (uint32) (logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes)), + (uint32) (logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes))); +} + +static inline void +XLogFileNameById(char *fname, TimeLineID tli, uint32 log, uint32 seg) +{ + snprintf(fname, MAXFNAMELEN, "%08X%08X%08X", tli, log, seg); +} + +static inline bool +IsXLogFileName(const char *fname) +{ + return (strlen(fname) == XLOG_FNAME_LEN && \ + strspn(fname, "0123456789ABCDEF") == XLOG_FNAME_LEN); +} + +/* + * XLOG segment with .partial suffix. Used by pg_receivewal and at end of + * archive recovery, when we want to archive a WAL segment but it might not + * be complete yet. 
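+ *
+ * For illustration, "000000010000000000000001.partial" is such a name
+ * (timeline 1, segment number 1, assuming 16MB segments):
+ * IsPartialXLogFileName() accepts it, while IsXLogFileName() does not,
+ * because of the extra 8-character suffix.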
+ */ +static inline bool +IsPartialXLogFileName(const char *fname) +{ + return (strlen(fname) == XLOG_FNAME_LEN + strlen(".partial") && + strspn(fname, "0123456789ABCDEF") == XLOG_FNAME_LEN && + strcmp(fname + XLOG_FNAME_LEN, ".partial") == 0); +} + +static inline void +XLogFromFileName(const char *fname, TimeLineID *tli, XLogSegNo *logSegNo, int wal_segsz_bytes) +{ + uint32 log; + uint32 seg; + + sscanf(fname, "%08X%08X%08X", tli, &log, &seg); + *logSegNo = (uint64) log * XLogSegmentsPerXLogId(wal_segsz_bytes) + seg; +} + +static inline void +XLogFilePath(char *path, TimeLineID tli, XLogSegNo logSegNo, int wal_segsz_bytes) +{ + snprintf(path, MAXPGPATH, XLOGDIR "/%08X%08X%08X", tli, + (uint32) (logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes)), + (uint32) (logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes))); +} + +static inline void +TLHistoryFileName(char *fname, TimeLineID tli) +{ + snprintf(fname, MAXFNAMELEN, "%08X.history", tli); +} + +static inline bool +IsTLHistoryFileName(const char *fname) +{ + return (strlen(fname) == 8 + strlen(".history") && + strspn(fname, "0123456789ABCDEF") == 8 && + strcmp(fname + 8, ".history") == 0); +} + +static inline void +TLHistoryFilePath(char *path, TimeLineID tli) +{ + snprintf(path, MAXPGPATH, XLOGDIR "/%08X.history", tli); +} + +static inline void +StatusFilePath(char *path, const char *xlog, const char *suffix) +{ + snprintf(path, MAXPGPATH, XLOGDIR "/archive_status/%s%s", xlog, suffix); +} + +static inline void +BackupHistoryFileName(char *fname, TimeLineID tli, XLogSegNo logSegNo, XLogRecPtr startpoint, int wal_segsz_bytes) +{ + snprintf(fname, MAXFNAMELEN, "%08X%08X%08X.%08X.backup", tli, + (uint32) (logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes)), + (uint32) (logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes)), + (uint32) (XLogSegmentOffset(startpoint, wal_segsz_bytes))); +} + +static inline bool +IsBackupHistoryFileName(const char *fname) +{ + return (strlen(fname) > XLOG_FNAME_LEN && + strspn(fname, "0123456789ABCDEF") == XLOG_FNAME_LEN && + strcmp(fname + strlen(fname) - strlen(".backup"), ".backup") == 0); +} + +static inline void +BackupHistoryFilePath(char *path, TimeLineID tli, XLogSegNo logSegNo, XLogRecPtr startpoint, int wal_segsz_bytes) +{ + snprintf(path, MAXPGPATH, XLOGDIR "/%08X%08X%08X.%08X.backup", tli, + (uint32) (logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes)), + (uint32) (logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes)), + (uint32) (XLogSegmentOffset((startpoint), wal_segsz_bytes))); +} + +/* + * Information logged when we detect a change in one of the parameters + * important for Hot Standby. 
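+ * (For illustration, recovery compares these logged values with the standby's
+ * own settings; a hypothetical form of that check is
+ *
+ *     if (xlrec->max_connections > MaxConnections)
+ *         ereport(FATAL, ...);
+ *
+ * the real checks live in the recovery code, not in this header.)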
+ */ +typedef struct xl_parameter_change +{ + int MaxConnections; + int max_worker_processes; + int max_wal_senders; + int max_prepared_xacts; + int max_locks_per_xact; + int wal_level; + bool wal_log_hints; + bool track_commit_timestamp; +} xl_parameter_change; + +/* logs restore point */ +typedef struct xl_restore_point +{ + TimestampTz rp_time; + char rp_name[MAXFNAMELEN]; +} xl_restore_point; + +/* Overwrite of prior contrecord */ +typedef struct xl_overwrite_contrecord +{ + XLogRecPtr overwritten_lsn; + TimestampTz overwrite_time; +} xl_overwrite_contrecord; + +/* End of recovery mark, when we don't do an END_OF_RECOVERY checkpoint */ +typedef struct xl_end_of_recovery +{ + TimestampTz end_time; + TimeLineID ThisTimeLineID; /* new TLI */ + TimeLineID PrevTimeLineID; /* previous TLI we forked off from */ +} xl_end_of_recovery; + +/* + * The functions in xloginsert.c construct a chain of XLogRecData structs + * to represent the final WAL record. + */ +typedef struct XLogRecData +{ + struct XLogRecData *next; /* next struct in chain, or NULL */ + char *data; /* start of rmgr data to include */ + uint32 len; /* length of rmgr data to include */ +} XLogRecData; + +/* + * Recovery target action. + */ +typedef enum +{ + RECOVERY_TARGET_ACTION_PAUSE, + RECOVERY_TARGET_ACTION_PROMOTE, + RECOVERY_TARGET_ACTION_SHUTDOWN +} RecoveryTargetAction; + +struct LogicalDecodingContext; +struct XLogRecordBuffer; + +/* + * Method table for resource managers. + * + * This struct must be kept in sync with the PG_RMGR definition in + * rmgr.c. + * + * rm_identify must return a name for the record based on xl_info (without + * reference to the rmid). For example, XLOG_BTREE_VACUUM would be named + * "VACUUM". rm_desc can then be called to obtain additional detail for the + * record, if available (e.g. the last block). + * + * rm_mask takes as input a page modified by the resource manager and masks + * out bits that shouldn't be flagged by wal_consistency_checking. + * + * RmgrTable[] is indexed by RmgrId values (see rmgrlist.h). If rm_name is + * NULL, the corresponding RmgrTable entry is considered invalid. + */ +typedef struct RmgrData +{ + const char *rm_name; + void (*rm_redo) (XLogReaderState *record); + void (*rm_desc) (StringInfo buf, XLogReaderState *record); + const char *(*rm_identify) (uint8 info); + void (*rm_startup) (void); + void (*rm_cleanup) (void); + void (*rm_mask) (char *pagedata, BlockNumber blkno); + void (*rm_decode) (struct LogicalDecodingContext *ctx, + struct XLogRecordBuffer *buf); +} RmgrData; + +extern PGDLLIMPORT RmgrData RmgrTable[]; +extern void RmgrStartup(void); +extern void RmgrCleanup(void); +extern void RmgrNotFound(RmgrId rmid); +extern void RegisterCustomRmgr(RmgrId rmid, const RmgrData *rmgr); + +#ifndef FRONTEND +static inline bool +RmgrIdExists(RmgrId rmid) +{ + return RmgrTable[rmid].rm_name != NULL; +} + +static inline RmgrData +GetRmgr(RmgrId rmid) +{ + if (unlikely(!RmgrIdExists(rmid))) + RmgrNotFound(rmid); + return RmgrTable[rmid]; +} +#endif + +/* + * Exported to support xlog switching from checkpointer + */ +extern pg_time_t GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN); +extern XLogRecPtr RequestXLogSwitch(bool mark_unimportant); + +extern void GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli); + +extern void XLogRecGetBlockRefInfo(XLogReaderState *record, bool pretty, + bool detailed_format, StringInfo buf, + uint32 *fpi_len); + +/* + * Exported for the functions in timeline.c and xlogarchive.c. Only valid + * in the startup process. 
+ */ +extern PGDLLIMPORT bool ArchiveRecoveryRequested; +extern PGDLLIMPORT bool InArchiveRecovery; +extern PGDLLIMPORT bool StandbyMode; +extern PGDLLIMPORT char *recoveryRestoreCommand; + +#endif /* XLOG_INTERNAL_H */ diff --git a/src/include/access/xlogarchive.h b/src/include/access/xlogarchive.h new file mode 100644 index 0000000..31ff206 --- /dev/null +++ b/src/include/access/xlogarchive.h @@ -0,0 +1,35 @@ +/*------------------------------------------------------------------------ + * + * xlogarchive.h + * Prototypes for WAL archives in the backend + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/include/access/xlogarchive.h + * + *------------------------------------------------------------------------ + */ + +#ifndef XLOG_ARCHIVE_H +#define XLOG_ARCHIVE_H + +#include "access/xlogdefs.h" + +extern bool RestoreArchivedFile(char *path, const char *xlogfname, + const char *recovername, off_t expectedSize, + bool cleanupEnabled); +extern void ExecuteRecoveryCommand(const char *command, const char *commandName, + bool failOnSignal, uint32 wait_event_info); +extern void KeepFileRestoredFromArchive(const char *path, const char *xlogfname); +extern void XLogArchiveNotify(const char *xlog); +extern void XLogArchiveNotifySeg(XLogSegNo segno, TimeLineID tli); +extern void XLogArchiveForceDone(const char *xlog); +extern bool XLogArchiveCheckDone(const char *xlog); +extern bool XLogArchiveIsBusy(const char *xlog); +extern bool XLogArchiveIsReady(const char *xlog); +extern bool XLogArchiveIsReadyOrDone(const char *xlog); +extern void XLogArchiveCleanup(const char *xlog); + +#endif /* XLOG_ARCHIVE_H */ diff --git a/src/include/access/xlogbackup.h b/src/include/access/xlogbackup.h new file mode 100644 index 0000000..1611358 --- /dev/null +++ b/src/include/access/xlogbackup.h @@ -0,0 +1,41 @@ +/*------------------------------------------------------------------------- + * + * xlogbackup.h + * Definitions for internals of base backups. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/include/access/xlogbackup.h + *------------------------------------------------------------------------- + */ + +#ifndef XLOG_BACKUP_H +#define XLOG_BACKUP_H + +#include "access/xlogdefs.h" +#include "pgtime.h" + +/* Structure to hold backup state. */ +typedef struct BackupState +{ + /* Fields saved at backup start */ + /* Backup label name one extra byte for null-termination */ + char name[MAXPGPATH + 1]; + XLogRecPtr startpoint; /* backup start WAL location */ + TimeLineID starttli; /* backup start TLI */ + XLogRecPtr checkpointloc; /* last checkpoint location */ + pg_time_t starttime; /* backup start time */ + bool started_in_recovery; /* backup started in recovery? 
*/ + + /* Fields saved at the end of backup */ + XLogRecPtr stoppoint; /* backup stop WAL location */ + TimeLineID stoptli; /* backup stop TLI */ + pg_time_t stoptime; /* backup stop time */ +} BackupState; + +extern char *build_backup_content(BackupState *state, + bool ishistoryfile); + +#endif /* XLOG_BACKUP_H */ diff --git a/src/include/access/xlogdefs.h b/src/include/access/xlogdefs.h new file mode 100644 index 0000000..fe794c7 --- /dev/null +++ b/src/include/access/xlogdefs.h @@ -0,0 +1,82 @@ +/* + * xlogdefs.h + * + * Postgres write-ahead log manager record pointer and + * timeline number definitions + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/xlogdefs.h + */ +#ifndef XLOG_DEFS_H +#define XLOG_DEFS_H + +#include <fcntl.h> /* need open() flags */ + +/* + * Pointer to a location in the XLOG. These pointers are 64 bits wide, + * because we don't want them ever to overflow. + */ +typedef uint64 XLogRecPtr; + +/* + * Zero is used indicate an invalid pointer. Bootstrap skips the first possible + * WAL segment, initializing the first WAL page at WAL segment size, so no XLOG + * record can begin at zero. + */ +#define InvalidXLogRecPtr 0 +#define XLogRecPtrIsInvalid(r) ((r) == InvalidXLogRecPtr) + +/* + * First LSN to use for "fake" LSNs. + * + * Values smaller than this can be used for special per-AM purposes. + */ +#define FirstNormalUnloggedLSN ((XLogRecPtr) 1000) + +/* + * Handy macro for printing XLogRecPtr in conventional format, e.g., + * + * printf("%X/%X", LSN_FORMAT_ARGS(lsn)); + */ +#define LSN_FORMAT_ARGS(lsn) (AssertVariableIsOfTypeMacro((lsn), XLogRecPtr), (uint32) ((lsn) >> 32)), ((uint32) (lsn)) + +/* + * XLogSegNo - physical log file sequence number. + */ +typedef uint64 XLogSegNo; + +/* + * TimeLineID (TLI) - identifies different database histories to prevent + * confusion after restoring a prior state of a database installation. + * TLI does not change in a normal stop/restart of the database (including + * crash-and-recover cases); but we must assign a new TLI after doing + * a recovery to a prior state, a/k/a point-in-time recovery. This makes + * the new WAL logfile sequence we generate distinguishable from the + * sequence that was generated in the previous incarnation. + */ +typedef uint32 TimeLineID; + +/* + * Replication origin id - this is located in this file to avoid having to + * include origin.h in a bunch of xlog related places. + */ +typedef uint16 RepOriginId; + +/* + * This chunk of hackery attempts to determine which file sync methods + * are available on the current platform, and to choose an appropriate + * default method. + * + * Note that we define our own O_DSYNC on Windows, but not O_SYNC. 
+ */ +#if defined(PLATFORM_DEFAULT_SYNC_METHOD) +#define DEFAULT_SYNC_METHOD PLATFORM_DEFAULT_SYNC_METHOD +#elif defined(O_DSYNC) && (!defined(O_SYNC) || O_DSYNC != O_SYNC) +#define DEFAULT_SYNC_METHOD SYNC_METHOD_OPEN_DSYNC +#else +#define DEFAULT_SYNC_METHOD SYNC_METHOD_FDATASYNC +#endif + +#endif /* XLOG_DEFS_H */ diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h new file mode 100644 index 0000000..31785dc --- /dev/null +++ b/src/include/access/xloginsert.h @@ -0,0 +1,66 @@ +/* + * xloginsert.h + * + * Functions for generating WAL records + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/xloginsert.h + */ +#ifndef XLOGINSERT_H +#define XLOGINSERT_H + +#include "access/rmgr.h" +#include "access/xlogdefs.h" +#include "storage/block.h" +#include "storage/buf.h" +#include "storage/relfilelocator.h" +#include "utils/relcache.h" + +/* + * The minimum size of the WAL construction working area. If you need to + * register more than XLR_NORMAL_MAX_BLOCK_ID block references or have more + * than XLR_NORMAL_RDATAS data chunks in a single WAL record, you must call + * XLogEnsureRecordSpace() first to allocate more working memory. + */ +#define XLR_NORMAL_MAX_BLOCK_ID 4 +#define XLR_NORMAL_RDATAS 20 + +/* flags for XLogRegisterBuffer */ +#define REGBUF_FORCE_IMAGE 0x01 /* force a full-page image */ +#define REGBUF_NO_IMAGE 0x02 /* don't take a full-page image */ +#define REGBUF_WILL_INIT (0x04 | 0x02) /* page will be re-initialized at + * replay (implies NO_IMAGE) */ +#define REGBUF_STANDARD 0x08 /* page follows "standard" page layout, + * (data between pd_lower and pd_upper + * will be skipped) */ +#define REGBUF_KEEP_DATA 0x10 /* include data even if a full-page image + * is taken */ + +/* prototypes for public functions in xloginsert.c: */ +extern void XLogBeginInsert(void); +extern void XLogSetRecordFlags(uint8 flags); +extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info); +extern void XLogEnsureRecordSpace(int max_block_id, int ndatas); +extern void XLogRegisterData(char *data, uint32 len); +extern void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags); +extern void XLogRegisterBlock(uint8 block_id, RelFileLocator *rlocator, + ForkNumber forknum, BlockNumber blknum, char *page, + uint8 flags); +extern void XLogRegisterBufData(uint8 block_id, char *data, uint32 len); +extern void XLogResetInsertion(void); +extern bool XLogCheckBufferNeedsBackup(Buffer buffer); + +extern XLogRecPtr log_newpage(RelFileLocator *rlocator, ForkNumber forknum, + BlockNumber blkno, char *page, bool page_std); +extern void log_newpages(RelFileLocator *rlocator, ForkNumber forknum, int num_pages, + BlockNumber *blknos, char **pages, bool page_std); +extern XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std); +extern void log_newpage_range(Relation rel, ForkNumber forknum, + BlockNumber startblk, BlockNumber endblk, bool page_std); +extern XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std); + +extern void InitXLogInsert(void); + +#endif /* XLOGINSERT_H */ diff --git a/src/include/access/xlogprefetcher.h b/src/include/access/xlogprefetcher.h new file mode 100644 index 0000000..7dd7f20 --- /dev/null +++ b/src/include/access/xlogprefetcher.h @@ -0,0 +1,55 @@ +/*------------------------------------------------------------------------- + * + * xlogprefetcher.h + * Declarations for the recovery prefetching module. 
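+ *
+ * For illustration, a hypothetical redo loop driving this module (the
+ * variable names and ApplyRecord are illustrative, not declarations from
+ * this file):
+ *
+ *     prefetcher = XLogPrefetcherAllocate(xlogreader);
+ *     XLogPrefetcherBeginRead(prefetcher, startPtr);
+ *     while ((record = XLogPrefetcherReadRecord(prefetcher, &errmsg)) != NULL)
+ *         ApplyRecord(record);
+ *     XLogPrefetcherFree(prefetcher);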
+ * + * Portions Copyright (c) 2022-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/include/access/xlogprefetcher.h + *------------------------------------------------------------------------- + */ +#ifndef XLOGPREFETCHER_H +#define XLOGPREFETCHER_H + +#include "access/xlogdefs.h" +#include "access/xlogreader.h" +#include "access/xlogrecord.h" + +/* GUCs */ +extern PGDLLIMPORT int recovery_prefetch; + +/* Possible values for recovery_prefetch */ +typedef enum +{ + RECOVERY_PREFETCH_OFF, + RECOVERY_PREFETCH_ON, + RECOVERY_PREFETCH_TRY +} RecoveryPrefetchValue; + +struct XLogPrefetcher; +typedef struct XLogPrefetcher XLogPrefetcher; + + +extern void XLogPrefetchReconfigure(void); + +extern size_t XLogPrefetchShmemSize(void); +extern void XLogPrefetchShmemInit(void); + +extern void XLogPrefetchResetStats(void); + +extern XLogPrefetcher *XLogPrefetcherAllocate(XLogReaderState *reader); +extern void XLogPrefetcherFree(XLogPrefetcher *prefetcher); + +extern XLogReaderState *XLogPrefetcherGetReader(XLogPrefetcher *prefetcher); + +extern void XLogPrefetcherBeginRead(XLogPrefetcher *prefetcher, + XLogRecPtr recPtr); + +extern XLogRecord *XLogPrefetcherReadRecord(XLogPrefetcher *prefetcher, + char **errmsg); + +extern void XLogPrefetcherComputeStats(XLogPrefetcher *prefetcher); + +#endif diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h new file mode 100644 index 0000000..da32c7d --- /dev/null +++ b/src/include/access/xlogreader.h @@ -0,0 +1,444 @@ +/*------------------------------------------------------------------------- + * + * xlogreader.h + * Definitions for the generic XLog reading facility + * + * Portions Copyright (c) 2013-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/include/access/xlogreader.h + * + * NOTES + * See the definition of the XLogReaderState struct for instructions on + * how to use the XLogReader infrastructure. + * + * The basic idea is to allocate an XLogReaderState via + * XLogReaderAllocate(), position the reader to the first record with + * XLogBeginRead() or XLogFindNextRecord(), and call XLogReadRecord() + * until it returns NULL. + * + * Callers supply a page_read callback if they want to call + * XLogReadRecord or XLogFindNextRecord; it can be passed in as NULL + * otherwise. The WALRead function can be used as a helper to write + * page_read callbacks, but it is not mandatory; callers that use it, + * must supply segment_open callbacks. The segment_close callback + * must always be supplied. + * + * After reading a record with XLogReadRecord(), it's decomposed into + * the per-block and main data parts, and the parts can be accessed + * with the XLogRec* macros and functions. You can also decode a + * record that's already constructed in memory, without reading from + * disk, by calling the DecodeXLogRecord() function. + *------------------------------------------------------------------------- + */ +#ifndef XLOGREADER_H +#define XLOGREADER_H + +#ifndef FRONTEND +#include "access/transam.h" +#endif + +#include "access/xlogrecord.h" +#include "storage/buf.h" + +/* WALOpenSegment represents a WAL segment being read. 
*/ +typedef struct WALOpenSegment +{ + int ws_file; /* segment file descriptor */ + XLogSegNo ws_segno; /* segment number */ + TimeLineID ws_tli; /* timeline ID of the currently open file */ +} WALOpenSegment; + +/* WALSegmentContext carries context information about WAL segments to read */ +typedef struct WALSegmentContext +{ + char ws_dir[MAXPGPATH]; + int ws_segsize; +} WALSegmentContext; + +typedef struct XLogReaderState XLogReaderState; + +/* Function type definitions for various xlogreader interactions */ +typedef int (*XLogPageReadCB) (XLogReaderState *xlogreader, + XLogRecPtr targetPagePtr, + int reqLen, + XLogRecPtr targetRecPtr, + char *readBuf); +typedef void (*WALSegmentOpenCB) (XLogReaderState *xlogreader, + XLogSegNo nextSegNo, + TimeLineID *tli_p); +typedef void (*WALSegmentCloseCB) (XLogReaderState *xlogreader); + +typedef struct XLogReaderRoutine +{ + /* + * Data input callback + * + * This callback shall read at least reqLen valid bytes of the xlog page + * starting at targetPagePtr, and store them in readBuf. The callback + * shall return the number of bytes read (never more than XLOG_BLCKSZ), or + * -1 on failure. The callback shall sleep, if necessary, to wait for the + * requested bytes to become available. The callback will not be invoked + * again for the same page unless more than the returned number of bytes + * are needed. + * + * targetRecPtr is the position of the WAL record we're reading. Usually + * it is equal to targetPagePtr + reqLen, but sometimes xlogreader needs + * to read and verify the page or segment header, before it reads the + * actual WAL record it's interested in. In that case, targetRecPtr can + * be used to determine which timeline to read the page from. + * + * The callback shall set ->seg.ws_tli to the TLI of the file the page was + * read from. + */ + XLogPageReadCB page_read; + + /* + * Callback to open the specified WAL segment for reading. ->seg.ws_file + * shall be set to the file descriptor of the opened segment. In case of + * failure, an error shall be raised by the callback and it shall not + * return. + * + * "nextSegNo" is the number of the segment to be opened. + * + * "tli_p" is an input/output argument. WALRead() uses it to pass the + * timeline in which the new segment should be found, but the callback can + * use it to return the TLI that it actually opened. + */ + WALSegmentOpenCB segment_open; + + /* + * WAL segment close callback. ->seg.ws_file shall be set to a negative + * number. + */ + WALSegmentCloseCB segment_close; +} XLogReaderRoutine; + +#define XL_ROUTINE(...) &(XLogReaderRoutine){__VA_ARGS__} + +typedef struct +{ + /* Is this block ref in use? */ + bool in_use; + + /* Identify the block this refers to */ + RelFileLocator rlocator; + ForkNumber forknum; + BlockNumber blkno; + + /* Prefetching workspace. */ + Buffer prefetch_buffer; + + /* copy of the fork_flags field from the XLogRecordBlockHeader */ + uint8 flags; + + /* Information on full-page image, if any */ + bool has_image; /* has image, even for consistency checking */ + bool apply_image; /* has image that should be restored */ + char *bkp_image; + uint16 hole_offset; + uint16 hole_length; + uint16 bimg_len; + uint8 bimg_info; + + /* Buffer holding the rmgr-specific data associated with this block */ + bool has_data; + char *data; + uint16 data_len; + uint16 data_bufsz; +} DecodedBkpBlock; + +/* + * The decoded contents of a record. 
This occupies a contiguous region of + * memory, with main_data and blocks[n].data pointing to memory after the + * members declared here. + */ +typedef struct DecodedXLogRecord +{ + /* Private member used for resource management. */ + size_t size; /* total size of decoded record */ + bool oversized; /* outside the regular decode buffer? */ + struct DecodedXLogRecord *next; /* decoded record queue link */ + + /* Public members. */ + XLogRecPtr lsn; /* location */ + XLogRecPtr next_lsn; /* location of next record */ + XLogRecord header; /* header */ + RepOriginId record_origin; + TransactionId toplevel_xid; /* XID of top-level transaction */ + char *main_data; /* record's main data portion */ + uint32 main_data_len; /* main data portion's length */ + int max_block_id; /* highest block_id in use (-1 if none) */ + DecodedBkpBlock blocks[FLEXIBLE_ARRAY_MEMBER]; +} DecodedXLogRecord; + +struct XLogReaderState +{ + /* + * Operational callbacks + */ + XLogReaderRoutine routine; + + /* ---------------------------------------- + * Public parameters + * ---------------------------------------- + */ + + /* + * System identifier of the xlog files we're about to read. Set to zero + * (the default value) if unknown or unimportant. + */ + uint64 system_identifier; + + /* + * Opaque data for callbacks to use. Not used by XLogReader. + */ + void *private_data; + + /* + * Start and end point of last record read. EndRecPtr is also used as the + * position to read next. Calling XLogBeginRead() sets EndRecPtr to the + * starting position and ReadRecPtr to invalid. + * + * Start and end point of last record returned by XLogReadRecord(). These + * are also available as record->lsn and record->next_lsn. + */ + XLogRecPtr ReadRecPtr; /* start of last record read */ + XLogRecPtr EndRecPtr; /* end+1 of last record read */ + + /* + * Set at the end of recovery: the start point of a partial record at the + * end of WAL (InvalidXLogRecPtr if there wasn't one), and the start + * location of its first contrecord that went missing. + */ + XLogRecPtr abortedRecPtr; + XLogRecPtr missingContrecPtr; + /* Set when XLP_FIRST_IS_OVERWRITE_CONTRECORD is found */ + XLogRecPtr overwrittenRecPtr; + + + /* ---------------------------------------- + * Decoded representation of current record + * + * Use XLogRecGet* functions to investigate the record; these fields + * should not be accessed directly. + * ---------------------------------------- + * Start and end point of the last record read and decoded by + * XLogReadRecordInternal(). NextRecPtr is also used as the position to + * decode next. Calling XLogBeginRead() sets NextRecPtr and EndRecPtr to + * the requested starting position. + */ + XLogRecPtr DecodeRecPtr; /* start of last record decoded */ + XLogRecPtr NextRecPtr; /* end+1 of last record decoded */ + XLogRecPtr PrevRecPtr; /* start of previous record decoded */ + + /* Last record returned by XLogReadRecord(). */ + DecodedXLogRecord *record; + + /* ---------------------------------------- + * private/internal state + * ---------------------------------------- + */ + + /* + * Buffer for decoded records. This is a circular buffer, though + * individual records can't be split in the middle, so some space is often + * wasted at the end. Oversized records that don't fit in this space are + * allocated separately. + */ + char *decode_buffer; + size_t decode_buffer_size; + bool free_decode_buffer; /* need to free? 
*/ + char *decode_buffer_head; /* data is read from the head */ + char *decode_buffer_tail; /* new data is written at the tail */ + + /* + * Queue of records that have been decoded. This is a linked list that + * usually consists of consecutive records in decode_buffer, but may also + * contain oversized records allocated with palloc(). + */ + DecodedXLogRecord *decode_queue_head; /* oldest decoded record */ + DecodedXLogRecord *decode_queue_tail; /* newest decoded record */ + + /* + * Buffer for currently read page (XLOG_BLCKSZ bytes, valid up to at least + * readLen bytes) + */ + char *readBuf; + uint32 readLen; + + /* last read XLOG position for data currently in readBuf */ + WALSegmentContext segcxt; + WALOpenSegment seg; + uint32 segoff; + + /* + * beginning of prior page read, and its TLI. Doesn't necessarily + * correspond to what's in readBuf; used for timeline sanity checks. + */ + XLogRecPtr latestPagePtr; + TimeLineID latestPageTLI; + + /* beginning of the WAL record being read. */ + XLogRecPtr currRecPtr; + /* timeline to read it from, 0 if a lookup is required */ + TimeLineID currTLI; + + /* + * Safe point to read to in currTLI if current TLI is historical + * (tliSwitchPoint) or InvalidXLogRecPtr if on current timeline. + * + * Actually set to the start of the segment containing the timeline switch + * that ends currTLI's validity, not the LSN of the switch its self, since + * we can't assume the old segment will be present. + */ + XLogRecPtr currTLIValidUntil; + + /* + * If currTLI is not the most recent known timeline, the next timeline to + * read from when currTLIValidUntil is reached. + */ + TimeLineID nextTLI; + + /* + * Buffer for current ReadRecord result (expandable), used when a record + * crosses a page boundary. + */ + char *readRecordBuf; + uint32 readRecordBufSize; + + /* Buffer to hold error message */ + char *errormsg_buf; + bool errormsg_deferred; + + /* + * Flag to indicate to XLogPageReadCB that it should not block waiting for + * data. + */ + bool nonblocking; +}; + +/* + * Check if XLogNextRecord() has any more queued records or an error to return. + */ +static inline bool +XLogReaderHasQueuedRecordOrError(XLogReaderState *state) +{ + return (state->decode_queue_head != NULL) || state->errormsg_deferred; +} + +/* Get a new XLogReader */ +extern XLogReaderState *XLogReaderAllocate(int wal_segment_size, + const char *waldir, + XLogReaderRoutine *routine, + void *private_data); + +/* Free an XLogReader */ +extern void XLogReaderFree(XLogReaderState *state); + +/* Optionally provide a circular decoding buffer to allow readahead. */ +extern void XLogReaderSetDecodeBuffer(XLogReaderState *state, + void *buffer, + size_t size); + +/* Position the XLogReader to given record */ +extern void XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr); +extern XLogRecPtr XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr); + +/* Return values from XLogPageReadCB. */ +typedef enum XLogPageReadResult +{ + XLREAD_SUCCESS = 0, /* record is successfully read */ + XLREAD_FAIL = -1, /* failed during reading a record */ + XLREAD_WOULDBLOCK = -2 /* nonblocking mode only, no data */ +} XLogPageReadResult; + +/* Read the next XLog record. Returns NULL on end-of-WAL or failure */ +extern struct XLogRecord *XLogReadRecord(XLogReaderState *state, + char **errormsg); + +/* Consume the next record or error. */ +extern DecodedXLogRecord *XLogNextRecord(XLogReaderState *state, + char **errormsg); + +/* Release the previously returned record, if necessary. 
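A usage sketch, outside the header itself: a backend caller (in the style of contrib/pg_walinspect) might combine XL_ROUTINE(), XLogReaderAllocate(), XLogFindNextRecord() and XLogReadRecord() with the local read callbacks declared in xlogutils.h later in this diff. The function name walk_wal_range() and the start_lsn argument are illustrative only, and error handling is abbreviated.

    #include "postgres.h"
    #include "access/xlog.h"          /* wal_segment_size */
    #include "access/xlogreader.h"
    #include "access/xlogutils.h"     /* read_local_xlog_page_no_wait() etc. */

    static void
    walk_wal_range(XLogRecPtr start_lsn)
    {
        ReadLocalXLogPageNoWaitPrivate priv = {false};
        XLogReaderState *reader;
        XLogRecord *record;
        XLogRecPtr  first;
        char       *errormsg = NULL;

        reader = XLogReaderAllocate(wal_segment_size, NULL,
                                    XL_ROUTINE(.page_read = &read_local_xlog_page_no_wait,
                                               .segment_open = &wal_segment_open,
                                               .segment_close = &wal_segment_close),
                                    &priv);
        if (reader == NULL)
            elog(ERROR, "out of memory while allocating a WAL reading processor");

        /* Snap to the first valid record at or after start_lsn, then read. */
        first = XLogFindNextRecord(reader, start_lsn);
        if (XLogRecPtrIsInvalid(first))
            elog(ERROR, "no valid WAL record found after %X/%X",
                 LSN_FORMAT_ARGS(start_lsn));
        XLogBeginRead(reader, first);

        while ((record = XLogReadRecord(reader, &errormsg)) != NULL)
        {
            /* The XLogRecGet* macros inspect the most recently returned record. */
            elog(DEBUG1, "rmid %d, length %u, at %X/%X",
                 XLogRecGetRmid(reader), XLogRecGetTotalLen(reader),
                 LSN_FORMAT_ARGS(reader->ReadRecPtr));
        }

        /* NULL return means end of flushed WAL (nonblocking callback) or an error. */
        if (!priv.end_of_wal && errormsg)
            elog(ERROR, "could not read WAL: %s", errormsg);

        XLogReaderFree(reader);
    }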
*/ +extern XLogRecPtr XLogReleasePreviousRecord(XLogReaderState *state); + +/* Try to read ahead, if there is data and space. */ +extern DecodedXLogRecord *XLogReadAhead(XLogReaderState *state, + bool nonblocking); + +/* Validate a page */ +extern bool XLogReaderValidatePageHeader(XLogReaderState *state, + XLogRecPtr recptr, char *phdr); + +/* Forget error produced by XLogReaderValidatePageHeader(). */ +extern void XLogReaderResetError(XLogReaderState *state); + +/* + * Error information from WALRead that both backend and frontend caller can + * process. Currently only errors from pg_pread can be reported. + */ +typedef struct WALReadError +{ + int wre_errno; /* errno set by the last pg_pread() */ + int wre_off; /* Offset we tried to read from. */ + int wre_req; /* Bytes requested to be read. */ + int wre_read; /* Bytes read by the last read(). */ + WALOpenSegment wre_seg; /* Segment we tried to read from. */ +} WALReadError; + +extern bool WALRead(XLogReaderState *state, + char *buf, XLogRecPtr startptr, Size count, + TimeLineID tli, WALReadError *errinfo); + +/* Functions for decoding an XLogRecord */ + +extern size_t DecodeXLogRecordRequiredSpace(size_t xl_tot_len); +extern bool DecodeXLogRecord(XLogReaderState *state, + DecodedXLogRecord *decoded, + XLogRecord *record, + XLogRecPtr lsn, + char **errormsg); + +/* + * Macros that provide access to parts of the record most recently returned by + * XLogReadRecord() or XLogNextRecord(). + */ +#define XLogRecGetTotalLen(decoder) ((decoder)->record->header.xl_tot_len) +#define XLogRecGetPrev(decoder) ((decoder)->record->header.xl_prev) +#define XLogRecGetInfo(decoder) ((decoder)->record->header.xl_info) +#define XLogRecGetRmid(decoder) ((decoder)->record->header.xl_rmid) +#define XLogRecGetXid(decoder) ((decoder)->record->header.xl_xid) +#define XLogRecGetOrigin(decoder) ((decoder)->record->record_origin) +#define XLogRecGetTopXid(decoder) ((decoder)->record->toplevel_xid) +#define XLogRecGetData(decoder) ((decoder)->record->main_data) +#define XLogRecGetDataLen(decoder) ((decoder)->record->main_data_len) +#define XLogRecHasAnyBlockRefs(decoder) ((decoder)->record->max_block_id >= 0) +#define XLogRecMaxBlockId(decoder) ((decoder)->record->max_block_id) +#define XLogRecGetBlock(decoder, i) (&(decoder)->record->blocks[(i)]) +#define XLogRecHasBlockRef(decoder, block_id) \ + (((decoder)->record->max_block_id >= (block_id)) && \ + ((decoder)->record->blocks[block_id].in_use)) +#define XLogRecHasBlockImage(decoder, block_id) \ + ((decoder)->record->blocks[block_id].has_image) +#define XLogRecBlockImageApply(decoder, block_id) \ + ((decoder)->record->blocks[block_id].apply_image) +#define XLogRecHasBlockData(decoder, block_id) \ + ((decoder)->record->blocks[block_id].has_data) + +#ifndef FRONTEND +extern FullTransactionId XLogRecGetFullXid(XLogReaderState *record); +#endif + +extern bool RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page); +extern char *XLogRecGetBlockData(XLogReaderState *record, uint8 block_id, Size *len); +extern void XLogRecGetBlockTag(XLogReaderState *record, uint8 block_id, + RelFileLocator *rlocator, ForkNumber *forknum, + BlockNumber *blknum); +extern bool XLogRecGetBlockTagExtended(XLogReaderState *record, uint8 block_id, + RelFileLocator *rlocator, ForkNumber *forknum, + BlockNumber *blknum, + Buffer *prefetch_buffer); + +#endif /* XLOGREADER_H */ diff --git a/src/include/access/xlogrecord.h b/src/include/access/xlogrecord.h new file mode 100644 index 0000000..f355e08 --- /dev/null +++ 
b/src/include/access/xlogrecord.h @@ -0,0 +1,248 @@ +/* + * xlogrecord.h + * + * Definitions for the WAL record format. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/xlogrecord.h + */ +#ifndef XLOGRECORD_H +#define XLOGRECORD_H + +#include "access/rmgr.h" +#include "access/xlogdefs.h" +#include "port/pg_crc32c.h" +#include "storage/block.h" +#include "storage/relfilelocator.h" + +/* + * The overall layout of an XLOG record is: + * Fixed-size header (XLogRecord struct) + * XLogRecordBlockHeader struct + * XLogRecordBlockHeader struct + * ... + * XLogRecordDataHeader[Short|Long] struct + * block data + * block data + * ... + * main data + * + * There can be zero or more XLogRecordBlockHeaders, and 0 or more bytes of + * rmgr-specific data not associated with a block. XLogRecord structs + * always start on MAXALIGN boundaries in the WAL files, but the rest of + * the fields are not aligned. + * + * The XLogRecordBlockHeader, XLogRecordDataHeaderShort and + * XLogRecordDataHeaderLong structs all begin with a single 'id' byte. It's + * used to distinguish between block references, and the main data structs. + */ +typedef struct XLogRecord +{ + uint32 xl_tot_len; /* total len of entire record */ + TransactionId xl_xid; /* xact id */ + XLogRecPtr xl_prev; /* ptr to previous record in log */ + uint8 xl_info; /* flag bits, see below */ + RmgrId xl_rmid; /* resource manager for this record */ + /* 2 bytes of padding here, initialize to zero */ + pg_crc32c xl_crc; /* CRC for this record */ + + /* XLogRecordBlockHeaders and XLogRecordDataHeader follow, no padding */ + +} XLogRecord; + +#define SizeOfXLogRecord (offsetof(XLogRecord, xl_crc) + sizeof(pg_crc32c)) + +/* + * The high 4 bits in xl_info may be used freely by rmgr. The + * XLR_SPECIAL_REL_UPDATE and XLR_CHECK_CONSISTENCY bits can be passed by + * XLogInsert caller. The rest are set internally by XLogInsert. + */ +#define XLR_INFO_MASK 0x0F +#define XLR_RMGR_INFO_MASK 0xF0 + +/* + * XLogReader needs to allocate all the data of a WAL record in a single + * chunk. This means that a single XLogRecord cannot exceed MaxAllocSize + * in length if we ignore any allocation overhead of the XLogReader. + * + * To accommodate some overhead, this value allows for 4M of allocation + * overhead, that should be plenty enough for what + * DecodeXLogRecordRequiredSpace() expects as extra. + */ +#define XLogRecordMaxSize (1020 * 1024 * 1024) + +/* + * If a WAL record modifies any relation files, in ways not covered by the + * usual block references, this flag is set. This is not used for anything + * by PostgreSQL itself, but it allows external tools that read WAL and keep + * track of modified blocks to recognize such special record types. + */ +#define XLR_SPECIAL_REL_UPDATE 0x01 + +/* + * Enforces consistency checks of replayed WAL at recovery. If enabled, + * each record will log a full-page write for each block modified by the + * record and will reuse it afterwards for consistency checks. The caller + * of XLogInsert can use this value if necessary, but if + * wal_consistency_checking is enabled for a rmgr this is set unconditionally. + */ +#define XLR_CHECK_CONSISTENCY 0x02 + +/* + * Header info for block data appended to an XLOG record. + * + * 'data_length' is the length of the rmgr-specific payload data associated + * with this block. 
It does not include the possible full page image, nor + * XLogRecordBlockHeader struct itself. + * + * Note that we don't attempt to align the XLogRecordBlockHeader struct! + * So, the struct must be copied to aligned local storage before use. + */ +typedef struct XLogRecordBlockHeader +{ + uint8 id; /* block reference ID */ + uint8 fork_flags; /* fork within the relation, and flags */ + uint16 data_length; /* number of payload bytes (not including page + * image) */ + + /* If BKPBLOCK_HAS_IMAGE, an XLogRecordBlockImageHeader struct follows */ + /* If BKPBLOCK_SAME_REL is not set, a RelFileLocator follows */ + /* BlockNumber follows */ +} XLogRecordBlockHeader; + +#define SizeOfXLogRecordBlockHeader (offsetof(XLogRecordBlockHeader, data_length) + sizeof(uint16)) + +/* + * Additional header information when a full-page image is included + * (i.e. when BKPBLOCK_HAS_IMAGE is set). + * + * The XLOG code is aware that PG data pages usually contain an unused "hole" + * in the middle, which contains only zero bytes. Since we know that the + * "hole" is all zeros, we remove it from the stored data (and it's not counted + * in the XLOG record's CRC, either). Hence, the amount of block data actually + * present is (BLCKSZ - <length of "hole" bytes>). + * + * Additionally, when wal_compression is enabled, we will try to compress full + * page images using one of the supported algorithms, after removing the + * "hole". This can reduce the WAL volume, but at some extra cost of CPU spent + * on the compression during WAL logging. In this case, since the "hole" + * length cannot be calculated by subtracting the number of page image bytes + * from BLCKSZ, basically it needs to be stored as an extra information. + * But when no "hole" exists, we can assume that the "hole" length is zero + * and no such an extra information needs to be stored. Note that + * the original version of page image is stored in WAL instead of the + * compressed one if the number of bytes saved by compression is less than + * the length of extra information. Hence, when a page image is successfully + * compressed, the amount of block data actually present is less than + * BLCKSZ - the length of "hole" bytes - the length of extra information. + */ +typedef struct XLogRecordBlockImageHeader +{ + uint16 length; /* number of page image bytes */ + uint16 hole_offset; /* number of bytes before "hole" */ + uint8 bimg_info; /* flag bits, see below */ + + /* + * If BKPIMAGE_HAS_HOLE and BKPIMAGE_COMPRESSED(), an + * XLogRecordBlockCompressHeader struct follows. + */ +} XLogRecordBlockImageHeader; + +#define SizeOfXLogRecordBlockImageHeader \ + (offsetof(XLogRecordBlockImageHeader, bimg_info) + sizeof(uint8)) + +/* Information stored in bimg_info */ +#define BKPIMAGE_HAS_HOLE 0x01 /* page image has "hole" */ +#define BKPIMAGE_APPLY 0x02 /* page image should be restored + * during replay */ +/* compression methods supported */ +#define BKPIMAGE_COMPRESS_PGLZ 0x04 +#define BKPIMAGE_COMPRESS_LZ4 0x08 +#define BKPIMAGE_COMPRESS_ZSTD 0x10 + +#define BKPIMAGE_COMPRESSED(info) \ + ((info & (BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | \ + BKPIMAGE_COMPRESS_ZSTD)) != 0) + +/* + * Extra header information used when page image has "hole" and + * is compressed. + */ +typedef struct XLogRecordBlockCompressHeader +{ + uint16 hole_length; /* number of bytes in "hole" */ +} XLogRecordBlockCompressHeader; + +#define SizeOfXLogRecordBlockCompressHeader \ + sizeof(XLogRecordBlockCompressHeader) + +/* + * Maximum size of the header for a block reference. 
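Returning briefly to the full-page-image layout described above: a minimal sketch, assuming an uncompressed image, of how the stored bytes map back onto a BLCKSZ page. The field names come from DecodedBkpBlock in xlogreader.h earlier in this diff; the helper name rebuild_page_from_image() is illustrative only, and real callers should use RestoreBlockImage(), which also handles the compressed formats.

    #include "postgres.h"
    #include "access/xlogreader.h"

    /*
     * Sketch only: rebuild a full page from an *uncompressed* image.  In this
     * case bimg_len == BLCKSZ - hole_length, and the "hole" is known to be
     * all zeros, so it is simply re-created with memset().
     */
    static void
    rebuild_page_from_image(const DecodedBkpBlock *bkpb, char *page)
    {
        Assert(bkpb->has_image && !BKPIMAGE_COMPRESSED(bkpb->bimg_info));

        /* bytes before the hole */
        memcpy(page, bkpb->bkp_image, bkpb->hole_offset);
        /* the hole itself, elided from WAL because it is all zeros */
        memset(page + bkpb->hole_offset, 0, bkpb->hole_length);
        /* bytes after the hole */
        memcpy(page + bkpb->hole_offset + bkpb->hole_length,
               bkpb->bkp_image + bkpb->hole_offset,
               BLCKSZ - (bkpb->hole_offset + bkpb->hole_length));
    }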
This is used to size a + * temporary buffer for constructing the header. + */ +#define MaxSizeOfXLogRecordBlockHeader \ + (SizeOfXLogRecordBlockHeader + \ + SizeOfXLogRecordBlockImageHeader + \ + SizeOfXLogRecordBlockCompressHeader + \ + sizeof(RelFileLocator) + \ + sizeof(BlockNumber)) + +/* + * The fork number fits in the lower 4 bits in the fork_flags field. The upper + * bits are used for flags. + */ +#define BKPBLOCK_FORK_MASK 0x0F +#define BKPBLOCK_FLAG_MASK 0xF0 +#define BKPBLOCK_HAS_IMAGE 0x10 /* block data is an XLogRecordBlockImage */ +#define BKPBLOCK_HAS_DATA 0x20 +#define BKPBLOCK_WILL_INIT 0x40 /* redo will re-init the page */ +#define BKPBLOCK_SAME_REL 0x80 /* RelFileLocator omitted, same as + * previous */ + +/* + * XLogRecordDataHeaderShort/Long are used for the "main data" portion of + * the record. If the length of the data is less than 256 bytes, the short + * form is used, with a single byte to hold the length. Otherwise the long + * form is used. + * + * (These structs are currently not used in the code, they are here just for + * documentation purposes). + */ +typedef struct XLogRecordDataHeaderShort +{ + uint8 id; /* XLR_BLOCK_ID_DATA_SHORT */ + uint8 data_length; /* number of payload bytes */ +} XLogRecordDataHeaderShort; + +#define SizeOfXLogRecordDataHeaderShort (sizeof(uint8) * 2) + +typedef struct XLogRecordDataHeaderLong +{ + uint8 id; /* XLR_BLOCK_ID_DATA_LONG */ + /* followed by uint32 data_length, unaligned */ +} XLogRecordDataHeaderLong; + +#define SizeOfXLogRecordDataHeaderLong (sizeof(uint8) + sizeof(uint32)) + +/* + * Block IDs used to distinguish different kinds of record fragments. Block + * references are numbered from 0 to XLR_MAX_BLOCK_ID. A rmgr is free to use + * any ID number in that range (although you should stick to small numbers, + * because the WAL machinery is optimized for that case). A few ID + * numbers are reserved to denote the "main" data portion of the record, + * as well as replication-supporting transaction metadata. + * + * The maximum is currently set at 32, quite arbitrarily. Most records only + * need a handful of block references, but there are a few exceptions that + * need more. + */ +#define XLR_MAX_BLOCK_ID 32 + +#define XLR_BLOCK_ID_DATA_SHORT 255 +#define XLR_BLOCK_ID_DATA_LONG 254 +#define XLR_BLOCK_ID_ORIGIN 253 +#define XLR_BLOCK_ID_TOPLEVEL_XID 252 + +#endif /* XLOGRECORD_H */ diff --git a/src/include/access/xlogrecovery.h b/src/include/access/xlogrecovery.h new file mode 100644 index 0000000..47c2935 --- /dev/null +++ b/src/include/access/xlogrecovery.h @@ -0,0 +1,158 @@ +/* + * xlogrecovery.h + * + * Functions for WAL recovery and standby mode + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/xlogrecovery.h + */ +#ifndef XLOGRECOVERY_H +#define XLOGRECOVERY_H + +#include "access/xlogreader.h" +#include "catalog/pg_control.h" +#include "lib/stringinfo.h" +#include "utils/timestamp.h" + +/* + * Recovery target type. + * Only set during a Point in Time recovery, not when in standby mode. 
+ */ +typedef enum +{ + RECOVERY_TARGET_UNSET, + RECOVERY_TARGET_XID, + RECOVERY_TARGET_TIME, + RECOVERY_TARGET_NAME, + RECOVERY_TARGET_LSN, + RECOVERY_TARGET_IMMEDIATE +} RecoveryTargetType; + +/* + * Recovery target TimeLine goal + */ +typedef enum +{ + RECOVERY_TARGET_TIMELINE_CONTROLFILE, + RECOVERY_TARGET_TIMELINE_LATEST, + RECOVERY_TARGET_TIMELINE_NUMERIC +} RecoveryTargetTimeLineGoal; + +/* Recovery pause states */ +typedef enum RecoveryPauseState +{ + RECOVERY_NOT_PAUSED, /* pause not requested */ + RECOVERY_PAUSE_REQUESTED, /* pause requested, but not yet paused */ + RECOVERY_PAUSED /* recovery is paused */ +} RecoveryPauseState; + +/* User-settable GUC parameters */ +extern PGDLLIMPORT bool recoveryTargetInclusive; +extern PGDLLIMPORT int recoveryTargetAction; +extern PGDLLIMPORT int recovery_min_apply_delay; +extern PGDLLIMPORT char *PrimaryConnInfo; +extern PGDLLIMPORT char *PrimarySlotName; +extern PGDLLIMPORT char *recoveryRestoreCommand; +extern PGDLLIMPORT char *recoveryEndCommand; +extern PGDLLIMPORT char *archiveCleanupCommand; + +/* indirectly set via GUC system */ +extern PGDLLIMPORT TransactionId recoveryTargetXid; +extern PGDLLIMPORT char *recovery_target_time_string; +extern PGDLLIMPORT TimestampTz recoveryTargetTime; +extern PGDLLIMPORT const char *recoveryTargetName; +extern PGDLLIMPORT XLogRecPtr recoveryTargetLSN; +extern PGDLLIMPORT RecoveryTargetType recoveryTarget; +extern PGDLLIMPORT bool wal_receiver_create_temp_slot; +extern PGDLLIMPORT RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal; +extern PGDLLIMPORT TimeLineID recoveryTargetTLIRequested; +extern PGDLLIMPORT TimeLineID recoveryTargetTLI; + +/* Have we already reached a consistent database state? */ +extern PGDLLIMPORT bool reachedConsistency; + +/* Are we currently in standby mode? */ +extern PGDLLIMPORT bool StandbyMode; + +extern Size XLogRecoveryShmemSize(void); +extern void XLogRecoveryShmemInit(void); + +extern void InitWalRecovery(ControlFileData *ControlFile, + bool *wasShutdown_ptr, bool *haveBackupLabel_ptr, + bool *haveTblspcMap_ptr); +extern void PerformWalRecovery(void); + +/* + * FinishWalRecovery() returns this. It contains information about the point + * where recovery ended, and why it ended. + */ +typedef struct +{ + /* + * Information about the last valid or applied record, after which new WAL + * can be appended. 'lastRec' is the position where the last record + * starts, and 'endOfLog' is its end. 'lastPage' is a copy of the last + * partial page that contains endOfLog (or NULL if endOfLog is exactly at + * page boundary). 'lastPageBeginPtr' is the position where the last page + * begins. + * + * endOfLogTLI is the TLI in the filename of the XLOG segment containing + * the last applied record. It could be different from lastRecTLI, if + * there was a timeline switch in that segment, and we were reading the + * old WAL from a segment belonging to a higher timeline. + */ + XLogRecPtr lastRec; /* start of last valid or applied record */ + TimeLineID lastRecTLI; + XLogRecPtr endOfLog; /* end of last valid or applied record */ + TimeLineID endOfLogTLI; + + XLogRecPtr lastPageBeginPtr; /* LSN of page that contains endOfLog */ + char *lastPage; /* copy of the last page, up to endOfLog */ + + /* + * abortedRecPtr is the start pointer of a broken record at end of WAL + * when recovery completes; missingContrecPtr is the location of the first + * contrecord that went missing. See CreateOverwriteContrecordRecord for + * details. 
+ */ + XLogRecPtr abortedRecPtr; + XLogRecPtr missingContrecPtr; + + /* short human-readable string describing why recovery ended */ + char *recoveryStopReason; + + /* + * If standby or recovery signal file was found, these flags are set + * accordingly. + */ + bool standby_signal_file_found; + bool recovery_signal_file_found; +} EndOfWalRecoveryInfo; + +extern EndOfWalRecoveryInfo *FinishWalRecovery(void); +extern void ShutdownWalRecovery(void); +extern void RemovePromoteSignalFiles(void); + +extern bool HotStandbyActive(void); +extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI); +extern RecoveryPauseState GetRecoveryPauseState(void); +extern void SetRecoveryPause(bool recoveryPause); +extern void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream); +extern TimestampTz GetLatestXTime(void); +extern TimestampTz GetCurrentChunkReplayStartTime(void); +extern XLogRecPtr GetCurrentReplayRecPtr(TimeLineID *replayEndTLI); + +extern bool PromoteIsTriggered(void); +extern bool CheckPromoteSignal(void); +extern void WakeupRecovery(void); + +extern void StartupRequestWalReceiverRestart(void); +extern void XLogRequestWalReceiverReply(void); + +extern void RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue); + +extern void xlog_outdesc(StringInfo buf, XLogReaderState *record); + +#endif /* XLOGRECOVERY_H */ diff --git a/src/include/access/xlogstats.h b/src/include/access/xlogstats.h new file mode 100644 index 0000000..89410ce --- /dev/null +++ b/src/include/access/xlogstats.h @@ -0,0 +1,43 @@ +/*------------------------------------------------------------------------- + * + * xlogstats.h + * Definitions for WAL Statitstics + * + * Copyright (c) 2022-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/include/access/xlogstats.h + * + *------------------------------------------------------------------------- + */ +#ifndef XLOGSTATS_H +#define XLOGSTATS_H + +#include "access/rmgr.h" +#include "access/xlogreader.h" + +#define MAX_XLINFO_TYPES 16 + +typedef struct XLogRecStats +{ + uint64 count; + uint64 rec_len; + uint64 fpi_len; +} XLogRecStats; + +typedef struct XLogStats +{ + uint64 count; +#ifdef FRONTEND + XLogRecPtr startptr; + XLogRecPtr endptr; +#endif + XLogRecStats rmgr_stats[RM_MAX_ID + 1]; + XLogRecStats record_stats[RM_MAX_ID + 1][MAX_XLINFO_TYPES]; +} XLogStats; + +extern void XLogRecGetLen(XLogReaderState *record, uint32 *rec_len, + uint32 *fpi_len); +extern void XLogRecStoreStats(XLogStats *stats, XLogReaderState *record); + +#endif /* XLOGSTATS_H */ diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h new file mode 100644 index 0000000..5b77b11 --- /dev/null +++ b/src/include/access/xlogutils.h @@ -0,0 +1,118 @@ +/* + * xlogutils.h + * + * Utilities for replaying WAL records. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/xlogutils.h + */ +#ifndef XLOG_UTILS_H +#define XLOG_UTILS_H + +#include "access/xlogreader.h" +#include "storage/bufmgr.h" + +/* + * Prior to 8.4, all activity during recovery was carried out by the startup + * process. This local variable continues to be used in many parts of the + * code to indicate actions taken by RecoveryManagers. Other processes that + * potentially perform work during recovery should check RecoveryInProgress(). + * See XLogCtl notes in xlog.c. 
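Before continuing with xlogutils.h: a usage sketch for the xlogstats.h routines above, assuming an XLogReaderState that has already been allocated and positioned as shown earlier in this diff. This is the pattern used by tools such as pg_waldump --stats; the function name collect_wal_stats() is illustrative only.

    #include "postgres.h"
    #include "access/xlogreader.h"
    #include "access/xlogstats.h"

    static void
    collect_wal_stats(XLogReaderState *reader, XLogStats *stats)
    {
        XLogRecord *record;
        char       *errormsg = NULL;

        memset(stats, 0, sizeof(XLogStats));

        /* NULL return means end of WAL or an error (errormsg set); see xlogreader.h. */
        while ((record = XLogReadRecord(reader, &errormsg)) != NULL)
            XLogRecStoreStats(stats, reader);

        /* stats->rmgr_stats[rmid] now holds count / rec_len / fpi_len totals. */
    }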
+ */ +extern PGDLLIMPORT bool InRecovery; + +/* + * Like InRecovery, standbyState is only valid in the startup process. + * In all other processes it will have the value STANDBY_DISABLED (so + * InHotStandby will read as false). + * + * In DISABLED state, we're performing crash recovery or hot standby was + * disabled in postgresql.conf. + * + * In INITIALIZED state, we've run InitRecoveryTransactionEnvironment, but + * we haven't yet processed a RUNNING_XACTS or shutdown-checkpoint WAL record + * to initialize our primary-transaction tracking system. + * + * When the transaction tracking is initialized, we enter the SNAPSHOT_PENDING + * state. The tracked information might still be incomplete, so we can't allow + * connections yet, but redo functions must update the in-memory state when + * appropriate. + * + * In SNAPSHOT_READY mode, we have full knowledge of transactions that are + * (or were) running on the primary at the current WAL location. Snapshots + * can be taken, and read-only queries can be run. + */ +typedef enum +{ + STANDBY_DISABLED, + STANDBY_INITIALIZED, + STANDBY_SNAPSHOT_PENDING, + STANDBY_SNAPSHOT_READY +} HotStandbyState; + +extern PGDLLIMPORT HotStandbyState standbyState; + +#define InHotStandby (standbyState >= STANDBY_SNAPSHOT_PENDING) + + +extern bool XLogHaveInvalidPages(void); +extern void XLogCheckInvalidPages(void); + +extern void XLogDropRelation(RelFileLocator rlocator, ForkNumber forknum); +extern void XLogDropDatabase(Oid dbid); +extern void XLogTruncateRelation(RelFileLocator rlocator, ForkNumber forkNum, + BlockNumber nblocks); + +/* Result codes for XLogReadBufferForRedo[Extended] */ +typedef enum +{ + BLK_NEEDS_REDO, /* changes from WAL record need to be applied */ + BLK_DONE, /* block is already up-to-date */ + BLK_RESTORED, /* block was restored from a full-page image */ + BLK_NOTFOUND /* block was not found (and hence does not + * need to be replayed) */ +} XLogRedoAction; + +/* Private data of the read_local_xlog_page_no_wait callback. */ +typedef struct ReadLocalXLogPageNoWaitPrivate +{ + bool end_of_wal; /* true, when end of WAL is reached */ +} ReadLocalXLogPageNoWaitPrivate; + +extern XLogRedoAction XLogReadBufferForRedo(XLogReaderState *record, + uint8 block_id, Buffer *buf); +extern Buffer XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id); +extern XLogRedoAction XLogReadBufferForRedoExtended(XLogReaderState *record, + uint8 block_id, + ReadBufferMode mode, bool get_cleanup_lock, + Buffer *buf); + +extern Buffer XLogReadBufferExtended(RelFileLocator rlocator, ForkNumber forknum, + BlockNumber blkno, ReadBufferMode mode, + Buffer recent_buffer); + +extern Relation CreateFakeRelcacheEntry(RelFileLocator rlocator); +extern void FreeFakeRelcacheEntry(Relation fakerel); + +extern int read_local_xlog_page(XLogReaderState *state, + XLogRecPtr targetPagePtr, int reqLen, + XLogRecPtr targetRecPtr, char *cur_page); +extern int read_local_xlog_page_no_wait(XLogReaderState *state, + XLogRecPtr targetPagePtr, int reqLen, + XLogRecPtr targetRecPtr, + char *cur_page); +extern void wal_segment_open(XLogReaderState *state, + XLogSegNo nextSegNo, + TimeLineID *tli_p); +extern void wal_segment_close(XLogReaderState *state); + +extern void XLogReadDetermineTimeline(XLogReaderState *state, + XLogRecPtr wantPage, + uint32 wantLength, + TimeLineID currTLI); + +extern void WALReadRaiseError(WALReadError *errinfo); + +#endif |
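To close, a condensed sketch of how a resource manager's redo routine typically uses XLogReadBufferForRedo() together with the XLogRecGet* macros from xlogreader.h: the change is applied only when the block was not already restored from a full-page image and is not newer than the record. The record layout (a single block reference plus rmgr-specific main data) and the name example_redo() are assumed for the illustration; real redo routines also switch on XLogRecGetInfo() to select the record type.

    #include "postgres.h"
    #include "access/xlogreader.h"
    #include "access/xlogutils.h"
    #include "storage/bufmgr.h"
    #include "storage/bufpage.h"

    static void
    example_redo(XLogReaderState *record)
    {
        Buffer      buffer;

        if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
        {
            Page        page = BufferGetPage(buffer);
            char       *maindata = XLogRecGetData(record);
            Size        blkdatalen;
            char       *blkdata = XLogRecGetBlockData(record, 0, &blkdatalen);

            /* ... modify "page" here using maindata and blkdata ... */
            (void) maindata;
            (void) blkdata;

            PageSetLSN(page, record->EndRecPtr);
            MarkBufferDirty(buffer);
        }

        /* BLK_DONE, BLK_RESTORED and BLK_NOTFOUND need no further action. */
        if (BufferIsValid(buffer))
            UnlockReleaseBuffer(buffer);
    }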