From 46651ce6fe013220ed397add242004d764fc0153 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Sat, 4 May 2024 14:15:05 +0200
Subject: Adding upstream version 14.5.

Signed-off-by: Daniel Baumann
---
 src/backend/access/Makefile | 14 +
 src/backend/access/brin/Makefile | 27 +
 src/backend/access/brin/README | 189 +
 src/backend/access/brin/brin.c | 1800 ++
 src/backend/access/brin/brin_bloom.c | 809 ++
 src/backend/access/brin/brin_inclusion.c | 657 ++
 src/backend/access/brin/brin_minmax.c | 317 +
 src/backend/access/brin/brin_minmax_multi.c | 3163 ++++++
 src/backend/access/brin/brin_pageops.c | 920 ++
 src/backend/access/brin/brin_revmap.c | 664 ++
 src/backend/access/brin/brin_tuple.c | 708 ++
 src/backend/access/brin/brin_validate.c | 281 +
 src/backend/access/brin/brin_xlog.c | 367 +
 src/backend/access/common/Makefile | 33 +
 src/backend/access/common/attmap.c | 324 +
 src/backend/access/common/bufmask.c | 130 +
 src/backend/access/common/detoast.c | 646 ++
 src/backend/access/common/heaptuple.c | 1501 +++
 src/backend/access/common/indextuple.c | 589 ++
 src/backend/access/common/printsimple.c | 132 +
 src/backend/access/common/printtup.c | 485 +
 src/backend/access/common/relation.c | 217 +
 src/backend/access/common/reloptions.c | 2131 ++++
 src/backend/access/common/scankey.c | 117 +
 src/backend/access/common/session.c | 208 +
 src/backend/access/common/syncscan.c | 322 +
 src/backend/access/common/toast_compression.c | 318 +
 src/backend/access/common/toast_internals.c | 664 ++
 src/backend/access/common/tupconvert.c | 293 +
 src/backend/access/common/tupdesc.c | 912 ++
 src/backend/access/gin/Makefile | 32 +
 src/backend/access/gin/README | 562 +
 src/backend/access/gin/ginarrayproc.c | 305 +
 src/backend/access/gin/ginbtree.c | 795 ++
 src/backend/access/gin/ginbulk.c | 293 +
 src/backend/access/gin/gindatapage.c | 1942 ++++
 src/backend/access/gin/ginentrypage.c | 772 ++
 src/backend/access/gin/ginfast.c | 1068 ++
 src/backend/access/gin/ginget.c | 1970 ++++
 src/backend/access/gin/gininsert.c | 541 +
 src/backend/access/gin/ginlogic.c | 246 +
 src/backend/access/gin/ginpostinglist.c | 434 +
 src/backend/access/gin/ginscan.c | 468 +
 src/backend/access/gin/ginutil.c | 707 ++
 src/backend/access/gin/ginvacuum.c | 822 ++
 src/backend/access/gin/ginvalidate.c | 338 +
 src/backend/access/gin/ginxlog.c | 813 ++
 src/backend/access/gist/Makefile | 28 +
 src/backend/access/gist/README | 467 +
 src/backend/access/gist/gist.c | 1713 +++
 src/backend/access/gist/gistbuild.c | 1566 +++
 src/backend/access/gist/gistbuildbuffers.c | 775 ++
 src/backend/access/gist/gistget.c | 803 ++
 src/backend/access/gist/gistproc.c | 1777 ++++
 src/backend/access/gist/gistscan.c | 358 +
 src/backend/access/gist/gistsplit.c | 779 ++
 src/backend/access/gist/gistutil.c | 1066 ++
 src/backend/access/gist/gistvacuum.c | 668 ++
 src/backend/access/gist/gistvalidate.c | 355 +
 src/backend/access/gist/gistxlog.c | 696 ++
 src/backend/access/hash/Makefile | 27 +
 src/backend/access/hash/README | 651 ++
 src/backend/access/hash/hash.c | 918 ++
 src/backend/access/hash/hash_xlog.c | 1145 ++
 src/backend/access/hash/hashfunc.c | 411 +
 src/backend/access/hash/hashinsert.c | 432 +
 src/backend/access/hash/hashovfl.c | 1083 ++
 src/backend/access/hash/hashpage.c | 1612 +++
 src/backend/access/hash/hashsearch.c | 721 ++
 src/backend/access/hash/hashsort.c | 152 +
 src/backend/access/hash/hashutil.c | 622 ++
 src/backend/access/hash/hashvalidate.c | 439 +
 src/backend/access/heap/Makefile | 26 +
 src/backend/access/heap/README.HOT | 499 +
 src/backend/access/heap/README.tuplock | 155
+ src/backend/access/heap/heapam.c | 9955 ++++++++++++++++++ src/backend/access/heap/heapam_handler.c | 2608 +++++ src/backend/access/heap/heapam_visibility.c | 1794 ++++ src/backend/access/heap/heaptoast.c | 793 ++ src/backend/access/heap/hio.c | 721 ++ src/backend/access/heap/pruneheap.c | 1052 ++ src/backend/access/heap/rewriteheap.c | 1295 +++ src/backend/access/heap/vacuumlazy.c | 4353 ++++++++ src/backend/access/heap/visibilitymap.c | 672 ++ src/backend/access/index/Makefile | 21 + src/backend/access/index/amapi.c | 143 + src/backend/access/index/amvalidate.c | 276 + src/backend/access/index/genam.c | 745 ++ src/backend/access/index/indexam.c | 984 ++ src/backend/access/nbtree/Makefile | 28 + src/backend/access/nbtree/README | 1056 ++ src/backend/access/nbtree/nbtcompare.c | 335 + src/backend/access/nbtree/nbtdedup.c | 1098 ++ src/backend/access/nbtree/nbtinsert.c | 3009 ++++++ src/backend/access/nbtree/nbtpage.c | 3073 ++++++ src/backend/access/nbtree/nbtree.c | 1446 +++ src/backend/access/nbtree/nbtsearch.c | 2501 +++++ src/backend/access/nbtree/nbtsort.c | 2016 ++++ src/backend/access/nbtree/nbtsplitloc.c | 1190 +++ src/backend/access/nbtree/nbtutils.c | 2751 +++++ src/backend/access/nbtree/nbtvalidate.c | 380 + src/backend/access/nbtree/nbtxlog.c | 1126 ++ src/backend/access/rmgrdesc/Makefile | 34 + src/backend/access/rmgrdesc/brindesc.c | 107 + src/backend/access/rmgrdesc/clogdesc.c | 59 + src/backend/access/rmgrdesc/committsdesc.c | 55 + src/backend/access/rmgrdesc/dbasedesc.c | 63 + src/backend/access/rmgrdesc/genericdesc.c | 56 + src/backend/access/rmgrdesc/gindesc.c | 218 + src/backend/access/rmgrdesc/gistdesc.c | 117 + src/backend/access/rmgrdesc/hashdesc.c | 172 + src/backend/access/rmgrdesc/heapdesc.c | 265 + src/backend/access/rmgrdesc/logicalmsgdesc.c | 52 + src/backend/access/rmgrdesc/mxactdesc.c | 105 + src/backend/access/rmgrdesc/nbtdesc.c | 178 + src/backend/access/rmgrdesc/relmapdesc.c | 47 + src/backend/access/rmgrdesc/replorigindesc.c | 62 + src/backend/access/rmgrdesc/seqdesc.c | 46 + src/backend/access/rmgrdesc/smgrdesc.c | 61 + src/backend/access/rmgrdesc/spgdesc.c | 164 + src/backend/access/rmgrdesc/standbydesc.c | 135 + src/backend/access/rmgrdesc/tblspcdesc.c | 56 + src/backend/access/rmgrdesc/xactdesc.c | 438 + src/backend/access/rmgrdesc/xlogdesc.c | 202 + src/backend/access/spgist/Makefile | 28 + src/backend/access/spgist/README | 389 + src/backend/access/spgist/spgdoinsert.c | 2354 +++++ src/backend/access/spgist/spginsert.c | 243 + src/backend/access/spgist/spgkdtreeproc.c | 349 + src/backend/access/spgist/spgproc.c | 88 + src/backend/access/spgist/spgquadtreeproc.c | 471 + src/backend/access/spgist/spgscan.c | 1097 ++ src/backend/access/spgist/spgtextproc.c | 699 ++ src/backend/access/spgist/spgutils.c | 1350 +++ src/backend/access/spgist/spgvacuum.c | 975 ++ src/backend/access/spgist/spgvalidate.c | 392 + src/backend/access/spgist/spgxlog.c | 1013 ++ src/backend/access/table/Makefile | 21 + src/backend/access/table/table.c | 170 + src/backend/access/table/tableam.c | 765 ++ src/backend/access/table/tableamapi.c | 158 + src/backend/access/table/toast_helper.c | 337 + src/backend/access/tablesample/Makefile | 20 + src/backend/access/tablesample/bernoulli.c | 229 + src/backend/access/tablesample/system.c | 257 + src/backend/access/tablesample/tablesample.c | 40 + src/backend/access/transam/Makefile | 40 + src/backend/access/transam/README | 896 ++ src/backend/access/transam/README.parallel | 237 + src/backend/access/transam/clog.c | 1030 ++ 
src/backend/access/transam/commit_ts.c | 1032 ++ src/backend/access/transam/generic_xlog.c | 544 + src/backend/access/transam/multixact.c | 3427 ++++++ src/backend/access/transam/parallel.c | 1585 +++ src/backend/access/transam/rmgr.c | 38 + src/backend/access/transam/slru.c | 1611 +++ src/backend/access/transam/subtrans.c | 374 + src/backend/access/transam/timeline.c | 600 ++ src/backend/access/transam/transam.c | 430 + src/backend/access/transam/twophase.c | 2527 +++++ src/backend/access/transam/twophase_rmgr.c | 58 + src/backend/access/transam/varsup.c | 637 ++ src/backend/access/transam/xact.c | 6169 +++++++++++ src/backend/access/transam/xlog.c | 13209 ++++++++++++++++++++++++ src/backend/access/transam/xlogarchive.c | 732 ++ src/backend/access/transam/xlogfuncs.c | 830 ++ src/backend/access/transam/xloginsert.c | 1229 +++ src/backend/access/transam/xlogreader.c | 1660 +++ src/backend/access/transam/xlogutils.c | 978 ++ 169 files changed, 150421 insertions(+) create mode 100644 src/backend/access/Makefile create mode 100644 src/backend/access/brin/Makefile create mode 100644 src/backend/access/brin/README create mode 100644 src/backend/access/brin/brin.c create mode 100644 src/backend/access/brin/brin_bloom.c create mode 100644 src/backend/access/brin/brin_inclusion.c create mode 100644 src/backend/access/brin/brin_minmax.c create mode 100644 src/backend/access/brin/brin_minmax_multi.c create mode 100644 src/backend/access/brin/brin_pageops.c create mode 100644 src/backend/access/brin/brin_revmap.c create mode 100644 src/backend/access/brin/brin_tuple.c create mode 100644 src/backend/access/brin/brin_validate.c create mode 100644 src/backend/access/brin/brin_xlog.c create mode 100644 src/backend/access/common/Makefile create mode 100644 src/backend/access/common/attmap.c create mode 100644 src/backend/access/common/bufmask.c create mode 100644 src/backend/access/common/detoast.c create mode 100644 src/backend/access/common/heaptuple.c create mode 100644 src/backend/access/common/indextuple.c create mode 100644 src/backend/access/common/printsimple.c create mode 100644 src/backend/access/common/printtup.c create mode 100644 src/backend/access/common/relation.c create mode 100644 src/backend/access/common/reloptions.c create mode 100644 src/backend/access/common/scankey.c create mode 100644 src/backend/access/common/session.c create mode 100644 src/backend/access/common/syncscan.c create mode 100644 src/backend/access/common/toast_compression.c create mode 100644 src/backend/access/common/toast_internals.c create mode 100644 src/backend/access/common/tupconvert.c create mode 100644 src/backend/access/common/tupdesc.c create mode 100644 src/backend/access/gin/Makefile create mode 100644 src/backend/access/gin/README create mode 100644 src/backend/access/gin/ginarrayproc.c create mode 100644 src/backend/access/gin/ginbtree.c create mode 100644 src/backend/access/gin/ginbulk.c create mode 100644 src/backend/access/gin/gindatapage.c create mode 100644 src/backend/access/gin/ginentrypage.c create mode 100644 src/backend/access/gin/ginfast.c create mode 100644 src/backend/access/gin/ginget.c create mode 100644 src/backend/access/gin/gininsert.c create mode 100644 src/backend/access/gin/ginlogic.c create mode 100644 src/backend/access/gin/ginpostinglist.c create mode 100644 src/backend/access/gin/ginscan.c create mode 100644 src/backend/access/gin/ginutil.c create mode 100644 src/backend/access/gin/ginvacuum.c create mode 100644 src/backend/access/gin/ginvalidate.c create mode 100644 
src/backend/access/gin/ginxlog.c create mode 100644 src/backend/access/gist/Makefile create mode 100644 src/backend/access/gist/README create mode 100644 src/backend/access/gist/gist.c create mode 100644 src/backend/access/gist/gistbuild.c create mode 100644 src/backend/access/gist/gistbuildbuffers.c create mode 100644 src/backend/access/gist/gistget.c create mode 100644 src/backend/access/gist/gistproc.c create mode 100644 src/backend/access/gist/gistscan.c create mode 100644 src/backend/access/gist/gistsplit.c create mode 100644 src/backend/access/gist/gistutil.c create mode 100644 src/backend/access/gist/gistvacuum.c create mode 100644 src/backend/access/gist/gistvalidate.c create mode 100644 src/backend/access/gist/gistxlog.c create mode 100644 src/backend/access/hash/Makefile create mode 100644 src/backend/access/hash/README create mode 100644 src/backend/access/hash/hash.c create mode 100644 src/backend/access/hash/hash_xlog.c create mode 100644 src/backend/access/hash/hashfunc.c create mode 100644 src/backend/access/hash/hashinsert.c create mode 100644 src/backend/access/hash/hashovfl.c create mode 100644 src/backend/access/hash/hashpage.c create mode 100644 src/backend/access/hash/hashsearch.c create mode 100644 src/backend/access/hash/hashsort.c create mode 100644 src/backend/access/hash/hashutil.c create mode 100644 src/backend/access/hash/hashvalidate.c create mode 100644 src/backend/access/heap/Makefile create mode 100644 src/backend/access/heap/README.HOT create mode 100644 src/backend/access/heap/README.tuplock create mode 100644 src/backend/access/heap/heapam.c create mode 100644 src/backend/access/heap/heapam_handler.c create mode 100644 src/backend/access/heap/heapam_visibility.c create mode 100644 src/backend/access/heap/heaptoast.c create mode 100644 src/backend/access/heap/hio.c create mode 100644 src/backend/access/heap/pruneheap.c create mode 100644 src/backend/access/heap/rewriteheap.c create mode 100644 src/backend/access/heap/vacuumlazy.c create mode 100644 src/backend/access/heap/visibilitymap.c create mode 100644 src/backend/access/index/Makefile create mode 100644 src/backend/access/index/amapi.c create mode 100644 src/backend/access/index/amvalidate.c create mode 100644 src/backend/access/index/genam.c create mode 100644 src/backend/access/index/indexam.c create mode 100644 src/backend/access/nbtree/Makefile create mode 100644 src/backend/access/nbtree/README create mode 100644 src/backend/access/nbtree/nbtcompare.c create mode 100644 src/backend/access/nbtree/nbtdedup.c create mode 100644 src/backend/access/nbtree/nbtinsert.c create mode 100644 src/backend/access/nbtree/nbtpage.c create mode 100644 src/backend/access/nbtree/nbtree.c create mode 100644 src/backend/access/nbtree/nbtsearch.c create mode 100644 src/backend/access/nbtree/nbtsort.c create mode 100644 src/backend/access/nbtree/nbtsplitloc.c create mode 100644 src/backend/access/nbtree/nbtutils.c create mode 100644 src/backend/access/nbtree/nbtvalidate.c create mode 100644 src/backend/access/nbtree/nbtxlog.c create mode 100644 src/backend/access/rmgrdesc/Makefile create mode 100644 src/backend/access/rmgrdesc/brindesc.c create mode 100644 src/backend/access/rmgrdesc/clogdesc.c create mode 100644 src/backend/access/rmgrdesc/committsdesc.c create mode 100644 src/backend/access/rmgrdesc/dbasedesc.c create mode 100644 src/backend/access/rmgrdesc/genericdesc.c create mode 100644 src/backend/access/rmgrdesc/gindesc.c create mode 100644 src/backend/access/rmgrdesc/gistdesc.c create mode 100644 
src/backend/access/rmgrdesc/hashdesc.c create mode 100644 src/backend/access/rmgrdesc/heapdesc.c create mode 100644 src/backend/access/rmgrdesc/logicalmsgdesc.c create mode 100644 src/backend/access/rmgrdesc/mxactdesc.c create mode 100644 src/backend/access/rmgrdesc/nbtdesc.c create mode 100644 src/backend/access/rmgrdesc/relmapdesc.c create mode 100644 src/backend/access/rmgrdesc/replorigindesc.c create mode 100644 src/backend/access/rmgrdesc/seqdesc.c create mode 100644 src/backend/access/rmgrdesc/smgrdesc.c create mode 100644 src/backend/access/rmgrdesc/spgdesc.c create mode 100644 src/backend/access/rmgrdesc/standbydesc.c create mode 100644 src/backend/access/rmgrdesc/tblspcdesc.c create mode 100644 src/backend/access/rmgrdesc/xactdesc.c create mode 100644 src/backend/access/rmgrdesc/xlogdesc.c create mode 100644 src/backend/access/spgist/Makefile create mode 100644 src/backend/access/spgist/README create mode 100644 src/backend/access/spgist/spgdoinsert.c create mode 100644 src/backend/access/spgist/spginsert.c create mode 100644 src/backend/access/spgist/spgkdtreeproc.c create mode 100644 src/backend/access/spgist/spgproc.c create mode 100644 src/backend/access/spgist/spgquadtreeproc.c create mode 100644 src/backend/access/spgist/spgscan.c create mode 100644 src/backend/access/spgist/spgtextproc.c create mode 100644 src/backend/access/spgist/spgutils.c create mode 100644 src/backend/access/spgist/spgvacuum.c create mode 100644 src/backend/access/spgist/spgvalidate.c create mode 100644 src/backend/access/spgist/spgxlog.c create mode 100644 src/backend/access/table/Makefile create mode 100644 src/backend/access/table/table.c create mode 100644 src/backend/access/table/tableam.c create mode 100644 src/backend/access/table/tableamapi.c create mode 100644 src/backend/access/table/toast_helper.c create mode 100644 src/backend/access/tablesample/Makefile create mode 100644 src/backend/access/tablesample/bernoulli.c create mode 100644 src/backend/access/tablesample/system.c create mode 100644 src/backend/access/tablesample/tablesample.c create mode 100644 src/backend/access/transam/Makefile create mode 100644 src/backend/access/transam/README create mode 100644 src/backend/access/transam/README.parallel create mode 100644 src/backend/access/transam/clog.c create mode 100644 src/backend/access/transam/commit_ts.c create mode 100644 src/backend/access/transam/generic_xlog.c create mode 100644 src/backend/access/transam/multixact.c create mode 100644 src/backend/access/transam/parallel.c create mode 100644 src/backend/access/transam/rmgr.c create mode 100644 src/backend/access/transam/slru.c create mode 100644 src/backend/access/transam/subtrans.c create mode 100644 src/backend/access/transam/timeline.c create mode 100644 src/backend/access/transam/transam.c create mode 100644 src/backend/access/transam/twophase.c create mode 100644 src/backend/access/transam/twophase_rmgr.c create mode 100644 src/backend/access/transam/varsup.c create mode 100644 src/backend/access/transam/xact.c create mode 100644 src/backend/access/transam/xlog.c create mode 100644 src/backend/access/transam/xlogarchive.c create mode 100644 src/backend/access/transam/xlogfuncs.c create mode 100644 src/backend/access/transam/xloginsert.c create mode 100644 src/backend/access/transam/xlogreader.c create mode 100644 src/backend/access/transam/xlogutils.c (limited to 'src/backend/access') diff --git a/src/backend/access/Makefile b/src/backend/access/Makefile new file mode 100644 index 0000000..0880e0a --- /dev/null +++ 
b/src/backend/access/Makefile
@@ -0,0 +1,14 @@
+#
+# Makefile for the access methods module
+#
+# src/backend/access/Makefile
+#
+
+subdir = src/backend/access
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+SUBDIRS = brin common gin gist hash heap index nbtree rmgrdesc spgist \
+    table tablesample transam
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/brin/Makefile b/src/backend/access/brin/Makefile
new file mode 100644
index 0000000..a386cb7
--- /dev/null
+++ b/src/backend/access/brin/Makefile
@@ -0,0 +1,27 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+#    Makefile for access/brin
+#
+# IDENTIFICATION
+#    src/backend/access/brin/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/access/brin
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+    brin.o \
+    brin_bloom.o \
+    brin_inclusion.o \
+    brin_minmax.o \
+    brin_minmax_multi.o \
+    brin_pageops.o \
+    brin_revmap.o \
+    brin_tuple.o \
+    brin_validate.o \
+    brin_xlog.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/brin/README b/src/backend/access/brin/README
new file mode 100644
index 0000000..636d965
--- /dev/null
+++ b/src/backend/access/brin/README
@@ -0,0 +1,189 @@
+Block Range Indexes (BRIN)
+==========================
+
+BRIN indexes intend to enable very fast scanning of extremely large tables.
+
+The essential idea of a BRIN index is to keep track of summarizing values in
+consecutive groups of heap pages (page ranges); for example, the minimum and
+maximum values for datatypes with a btree opclass, or the bounding box for
+geometric types. These values can be used to avoid scanning such pages
+during a table scan, depending on query quals.
+
+The cost of this is having to update the stored summary values of each page
+range as tuples are inserted into them.
+
+
+Access Method Design
+--------------------
+
+Since item pointers are not stored inside indexes of this type, it is not
+possible to support the amgettuple interface. Instead, we only provide
+amgetbitmap support. The amgetbitmap routine returns a lossy TIDBitmap
+comprising all pages in those page ranges that match the query
+qualifications. The recheck step in the BitmapHeapScan node prunes tuples
+that are not visible according to the query qualifications.
+
+An operator class must have the following entries:
+
+- generic support procedures (pg_amproc), identical to all opclasses:
+  * "opcinfo" (BRIN_PROCNUM_OPCINFO) initializes a structure for index
+    creation or scanning
+  * "addValue" (BRIN_PROCNUM_ADDVALUE) takes an index tuple and a heap item,
+    and possibly changes the index tuple so that it includes the heap item
+    values
+  * "consistent" (BRIN_PROCNUM_CONSISTENT) takes an index tuple and query
+    quals, and returns whether the index tuple values match the query quals.
+  * "union" (BRIN_PROCNUM_UNION) takes two index tuples and modifies the first
+    one so that it represents the union of the two.
+Procedure numbers up to 10 are reserved for future expansion.
+
+Additionally, each opclass needs additional support functions:
+- Minmax-style operator classes:
+  * Proc numbers 11-14 are used for the functions implementing inequality
+    operators for the type, in this order: less than, less or equal,
+    greater or equal, greater than.
+
+Opclasses using a different design will require different additional procedure
+numbers.
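[Editor's note] To make the minmax opclass contract above concrete, here is a standalone C
sketch of the idea behind an addValue-style routine: widen a [min, max] summary (and the
null flags) when a newly inserted value falls outside it. This is an illustrative sketch
only, not the code from brin_minmax.c; the MinmaxSummary struct and plain int values are
stand-ins for the real BrinValues/Datum machinery and the proc 11-14 comparison functions.

    /*
     * Illustrative sketch only: a minmax-style "addValue" without the
     * PostgreSQL fmgr/Datum machinery.  Given the current summary for a
     * page range and a newly inserted value, widen the summary if the
     * value falls outside it.  Returns true if the summary changed.
     */
    #include <stdbool.h>

    typedef struct MinmaxSummary
    {
        bool    hasnulls;   /* any NULL seen in the range? */
        bool    allnulls;   /* nothing but NULLs seen so far? */
        int     min;        /* a real opclass stores Datums of the type */
        int     max;
    } MinmaxSummary;

    bool
    minmax_add_value(MinmaxSummary *s, bool isnull, int value)
    {
        bool    changed = false;

        if (isnull)
        {
            if (!s->hasnulls)
            {
                s->hasnulls = true;
                changed = true;
            }
            return changed;
        }

        if (s->allnulls)
        {
            /* first non-null value defines both bounds */
            s->allnulls = false;
            s->min = s->max = value;
            return true;
        }

        if (value < s->min)     /* stands in for proc 11, "less than" */
        {
            s->min = value;
            changed = true;
        }
        if (value > s->max)     /* stands in for proc 14, "greater than" */
        {
            s->max = value;
            changed = true;
        }
        return changed;
    }

The changed/unchanged result mirrors how brininsert() later in this patch decides, via
add_values_to_range(), whether the stored index tuple has to be rewritten at all.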
+
+Operator classes also need to have operator (pg_amop) entries so that the
+optimizer can choose the index to execute queries.
+- Minmax-style operator classes:
+  * The same operators as btree (<=, <, =, >=, >)
+
+Each index tuple stores some NULL bits and some opclass-specified values, which
+are stored in a single null bitmask of length twice the number of columns. The
+generic NULL bits indicate, for each column:
+  * bt_hasnulls: Whether there's any NULL value at all in the page range
+  * bt_allnulls: Whether all values are NULLs in the page range
+
+The opclass-specified values are:
+- Minmax-style operator classes
+  * minimum value across all tuples in the range
+  * maximum value across all tuples in the range
+
+Note that the addValue and Union support procedures must be careful to
+datumCopy() the values they want to store in the in-memory BRIN tuple, and
+must pfree() the old copies when replacing older ones. Since some values
+referenced from the tuple persist and others go away, there is no
+well-defined lifetime for a memory context that would make this automatic.
+
+
+The Range Map
+-------------
+
+To find the index tuple for a particular page range, we have an internal
+structure we call the range map, or "revmap" for short. This stores one TID
+per page range, which is the address of the index tuple summarizing that
+range. Since the map entries are fixed size, it is possible to compute the
+address of the range map entry for any given heap page by simple arithmetic.
+
+When a new heap tuple is inserted in a summarized page range, we compare the
+existing index tuple with the new heap tuple. If the heap tuple is outside
+the summarization data given by the index tuple for any indexed column (or
+if the new heap tuple contains null values but the index tuple indicates
+there are no nulls), the index is updated with the new values. In many
+cases it is possible to update the index tuple in-place, but if the new
+index tuple is larger than the old one and there's not enough space in the
+page, it is necessary to create a new index tuple with the new values. The
+range map can be updated quickly to point to it; the old index tuple is
+removed.
+
+If the range map points to an invalid TID, the corresponding page range is
+considered to be not summarized. When tuples are added to unsummarized
+pages, nothing needs to happen.
+
+To scan a table following a BRIN index, we scan the range map sequentially.
+This yields index tuples in ascending page range order. Query quals are
+matched to each index tuple; if they match, each page within the page range
+is returned as part of the output TID bitmap. If there's no match, they are
+skipped. Range map entries returning invalid index TIDs, that is
+unsummarized page ranges, are also returned in the TID bitmap.
+
+The revmap is stored in the first few blocks of the index main fork,
+immediately following the metapage. Whenever the revmap needs to be
+extended by another page, existing tuples in that page are moved to some
+other page.
+
+Heap tuples can be removed from anywhere without restriction. It might be
+useful to mark the corresponding index tuple somehow, if the heap tuple is
+one of the constraining values of the summary data (i.e. either min or max
+in the case of a btree-opclass-bearing datatype), so that in the future we
+are aware of the need to re-execute summarization on that range, leading to
+a possible tightening of the summary values.
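[Editor's note] The "simple arithmetic" mentioned in the Range Map section above can be
sketched as follows. This is an illustrative sketch, not the code from brin_revmap.c: the
entries-per-page constant is a hypothetical placeholder (the real value is derived from the
block size), and the revmap is assumed to begin immediately after a single metapage block.

    /*
     * Illustrative sketch of the revmap addressing arithmetic: which revmap
     * block, and which slot within it, holds the TID for the page range
     * containing a given heap block.
     */
    #include <stdint.h>

    #define METAPAGE_BLOCKS          1      /* revmap assumed to start at block 1 */
    #define REVMAP_ENTRIES_PER_PAGE  1360   /* hypothetical; depends on block size */

    typedef struct RevmapSlot
    {
        uint32_t revmap_block;  /* index block holding the entry */
        uint32_t slot;          /* entry position within that block */
    } RevmapSlot;

    RevmapSlot
    revmap_slot_for_heap_block(uint32_t heapBlk, uint32_t pagesPerRange)
    {
        uint32_t    rangeno = heapBlk / pagesPerRange;  /* which page range */
        RevmapSlot  res;

        res.revmap_block = METAPAGE_BLOCKS + rangeno / REVMAP_ENTRIES_PER_PAGE;
        res.slot = rangeno % REVMAP_ENTRIES_PER_PAGE;
        return res;
    }

For example, under these assumed constants and with pagesPerRange = 128, heap block 300000
falls in range 2343, which maps to slot 983 on revmap block 2.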
+ +Summarization +------------- + +At index creation time, the whole table is scanned; for each page range the +summarizing values of each indexed column and nulls bitmap are collected and +stored in the index. The partially-filled page range at the end of the +table is also summarized. + +As new tuples get inserted at the end of the table, they may update the +index tuple that summarizes the partial page range at the end. Eventually +that page range is complete and new tuples belong in a new page range that +hasn't yet been summarized. Those insertions do not create a new index +entry; instead, the page range remains unsummarized until later. + +Whenever VACUUM is run on the table, all unsummarized page ranges are +summarized. This action can also be invoked by the user via +brin_summarize_new_values(). Both these procedures scan all the +unsummarized ranges, and create a summary tuple. Again, this includes the +partially-filled page range at the end of the table. + +Vacuuming +--------- + +Since no heap TIDs are stored in a BRIN index, it's not necessary to scan the +index when heap tuples are removed. It might be that some summary values can +be tightened if heap tuples have been deleted; but this would represent an +optimization opportunity only, not a correctness issue. It's simpler to +represent this as the need to re-run summarization on the affected page range +rather than "subtracting" values from the existing one. This is not +currently implemented. + +Note that if there are no indexes on the table other than the BRIN index, +usage of maintenance_work_mem by vacuum can be decreased significantly, because +no detailed index scan needs to take place (and thus it's not necessary for +vacuum to save TIDs to remove). It's unlikely that BRIN would be the only +indexes in a table, though, because primary keys can be btrees only, and so +we don't implement this optimization. + + +Optimizer +--------- + +The optimizer selects the index based on the operator class' pg_amop +entries for the column. + + +Future improvements +------------------- + +* Different-size page ranges? + In the current design, each "index entry" in a BRIN index covers the same + number of pages. There's no hard reason for this; it might make sense to + allow the index to self-tune so that some index entries cover smaller page + ranges, if this allows the summary values to be more compact. This would incur + larger BRIN overhead for the index itself, but might allow better pruning of + page ranges during scan. In the limit of one index tuple per page, the index + itself would occupy too much space, even though we would be able to skip + reading the most heap pages, because the summary values are tight; in the + opposite limit of a single tuple that summarizes the whole table, we wouldn't + be able to prune anything even though the index is very small. This can + probably be made to work by using the range map as an index in itself. + +* More compact representation for TIDBitmap? + TIDBitmap is the structure used to represent bitmap scans. The + representation of lossy page ranges is not optimal for our purposes, because + it uses a Bitmapset to represent pages in the range; since we're going to return + all pages in a large range, it might be more convenient to allow for a + struct that uses start and end page numbers to represent the range, instead. + +* Better vacuuming? + It might be useful to enable passing more useful info to BRIN indexes during + vacuuming about tuples that are deleted, i.e. 
do not require the callback to + pass each tuple's TID. For instance we might need a callback that passes a + block number instead of a TID. That would help determine when to re-run + summarization on blocks that have seen lots of tuple deletions. diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c new file mode 100644 index 0000000..21a2384 --- /dev/null +++ b/src/backend/access/brin/brin.c @@ -0,0 +1,1800 @@ +/* + * brin.c + * Implementation of BRIN indexes for Postgres + * + * See src/backend/access/brin/README for details. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/brin/brin.c + * + * TODO + * * ScalarArrayOpExpr (amsearcharray -> SK_SEARCHARRAY) + */ +#include "postgres.h" + +#include "access/brin.h" +#include "access/brin_page.h" +#include "access/brin_pageops.h" +#include "access/brin_xlog.h" +#include "access/relation.h" +#include "access/reloptions.h" +#include "access/relscan.h" +#include "access/table.h" +#include "access/tableam.h" +#include "access/xloginsert.h" +#include "catalog/index.h" +#include "catalog/pg_am.h" +#include "commands/vacuum.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/autovacuum.h" +#include "storage/bufmgr.h" +#include "storage/freespace.h" +#include "utils/acl.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/index_selfuncs.h" +#include "utils/memutils.h" +#include "utils/rel.h" + + +/* + * We use a BrinBuildState during initial construction of a BRIN index. + * The running state is kept in a BrinMemTuple. + */ +typedef struct BrinBuildState +{ + Relation bs_irel; + int bs_numtuples; + Buffer bs_currentInsertBuf; + BlockNumber bs_pagesPerRange; + BlockNumber bs_currRangeStart; + BrinRevmap *bs_rmAccess; + BrinDesc *bs_bdesc; + BrinMemTuple *bs_dtuple; +} BrinBuildState; + +/* + * Struct used as "opaque" during index scans + */ +typedef struct BrinOpaque +{ + BlockNumber bo_pagesPerRange; + BrinRevmap *bo_rmAccess; + BrinDesc *bo_bdesc; +} BrinOpaque; + +#define BRIN_ALL_BLOCKRANGES InvalidBlockNumber + +static BrinBuildState *initialize_brin_buildstate(Relation idxRel, + BrinRevmap *revmap, BlockNumber pagesPerRange); +static void terminate_brin_buildstate(BrinBuildState *state); +static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange, + bool include_partial, double *numSummarized, double *numExisting); +static void form_and_insert_tuple(BrinBuildState *state); +static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a, + BrinTuple *b); +static void brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy); +static bool add_values_to_range(Relation idxRel, BrinDesc *bdesc, + BrinMemTuple *dtup, Datum *values, bool *nulls); +static bool check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys); + +/* + * BRIN handler function: return IndexAmRoutine with access method parameters + * and callbacks. 
+ */ +Datum +brinhandler(PG_FUNCTION_ARGS) +{ + IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); + + amroutine->amstrategies = 0; + amroutine->amsupport = BRIN_LAST_OPTIONAL_PROCNUM; + amroutine->amoptsprocnum = BRIN_PROCNUM_OPTIONS; + amroutine->amcanorder = false; + amroutine->amcanorderbyop = false; + amroutine->amcanbackward = false; + amroutine->amcanunique = false; + amroutine->amcanmulticol = true; + amroutine->amoptionalkey = true; + amroutine->amsearcharray = false; + amroutine->amsearchnulls = true; + amroutine->amstorage = true; + amroutine->amclusterable = false; + amroutine->ampredlocks = false; + amroutine->amcanparallel = false; + amroutine->amcaninclude = false; + amroutine->amusemaintenanceworkmem = false; + amroutine->amparallelvacuumoptions = + VACUUM_OPTION_PARALLEL_CLEANUP; + amroutine->amkeytype = InvalidOid; + + amroutine->ambuild = brinbuild; + amroutine->ambuildempty = brinbuildempty; + amroutine->aminsert = brininsert; + amroutine->ambulkdelete = brinbulkdelete; + amroutine->amvacuumcleanup = brinvacuumcleanup; + amroutine->amcanreturn = NULL; + amroutine->amcostestimate = brincostestimate; + amroutine->amoptions = brinoptions; + amroutine->amproperty = NULL; + amroutine->ambuildphasename = NULL; + amroutine->amvalidate = brinvalidate; + amroutine->amadjustmembers = NULL; + amroutine->ambeginscan = brinbeginscan; + amroutine->amrescan = brinrescan; + amroutine->amgettuple = NULL; + amroutine->amgetbitmap = bringetbitmap; + amroutine->amendscan = brinendscan; + amroutine->ammarkpos = NULL; + amroutine->amrestrpos = NULL; + amroutine->amestimateparallelscan = NULL; + amroutine->aminitparallelscan = NULL; + amroutine->amparallelrescan = NULL; + + PG_RETURN_POINTER(amroutine); +} + +/* + * A tuple in the heap is being inserted. To keep a brin index up to date, + * we need to obtain the relevant index tuple and compare its stored values + * with those of the new tuple. If the tuple values are not consistent with + * the summary tuple, we need to update the index tuple. + * + * If autosummarization is enabled, check if we need to summarize the previous + * page range. + * + * If the range is not currently summarized (i.e. the revmap returns NULL for + * it), there's nothing to do for this tuple. + */ +bool +brininsert(Relation idxRel, Datum *values, bool *nulls, + ItemPointer heaptid, Relation heapRel, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + IndexInfo *indexInfo) +{ + BlockNumber pagesPerRange; + BlockNumber origHeapBlk; + BlockNumber heapBlk; + BrinDesc *bdesc = (BrinDesc *) indexInfo->ii_AmCache; + BrinRevmap *revmap; + Buffer buf = InvalidBuffer; + MemoryContext tupcxt = NULL; + MemoryContext oldcxt = CurrentMemoryContext; + bool autosummarize = BrinGetAutoSummarize(idxRel); + + revmap = brinRevmapInitialize(idxRel, &pagesPerRange, NULL); + + /* + * origHeapBlk is the block number where the insertion occurred. heapBlk + * is the first block in the corresponding page range. + */ + origHeapBlk = ItemPointerGetBlockNumber(heaptid); + heapBlk = (origHeapBlk / pagesPerRange) * pagesPerRange; + + for (;;) + { + bool need_insert = false; + OffsetNumber off; + BrinTuple *brtup; + BrinMemTuple *dtup; + + CHECK_FOR_INTERRUPTS(); + + /* + * If auto-summarization is enabled and we just inserted the first + * tuple into the first block of a new non-first page range, request a + * summarization run of the previous range. 
+ */ + if (autosummarize && + heapBlk > 0 && + heapBlk == origHeapBlk && + ItemPointerGetOffsetNumber(heaptid) == FirstOffsetNumber) + { + BlockNumber lastPageRange = heapBlk - 1; + BrinTuple *lastPageTuple; + + lastPageTuple = + brinGetTupleForHeapBlock(revmap, lastPageRange, &buf, &off, + NULL, BUFFER_LOCK_SHARE, NULL); + if (!lastPageTuple) + { + bool recorded; + + recorded = AutoVacuumRequestWork(AVW_BRINSummarizeRange, + RelationGetRelid(idxRel), + lastPageRange); + if (!recorded) + ereport(LOG, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("request for BRIN range summarization for index \"%s\" page %u was not recorded", + RelationGetRelationName(idxRel), + lastPageRange))); + } + else + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + } + + brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, + NULL, BUFFER_LOCK_SHARE, NULL); + + /* if range is unsummarized, there's nothing to do */ + if (!brtup) + break; + + /* First time through in this statement? */ + if (bdesc == NULL) + { + MemoryContextSwitchTo(indexInfo->ii_Context); + bdesc = brin_build_desc(idxRel); + indexInfo->ii_AmCache = (void *) bdesc; + MemoryContextSwitchTo(oldcxt); + } + /* First time through in this brininsert call? */ + if (tupcxt == NULL) + { + tupcxt = AllocSetContextCreate(CurrentMemoryContext, + "brininsert cxt", + ALLOCSET_DEFAULT_SIZES); + MemoryContextSwitchTo(tupcxt); + } + + dtup = brin_deform_tuple(bdesc, brtup, NULL); + + need_insert = add_values_to_range(idxRel, bdesc, dtup, values, nulls); + + if (!need_insert) + { + /* + * The tuple is consistent with the new values, so there's nothing + * to do. + */ + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + } + else + { + Page page = BufferGetPage(buf); + ItemId lp = PageGetItemId(page, off); + Size origsz; + BrinTuple *origtup; + Size newsz; + BrinTuple *newtup; + bool samepage; + + /* + * Make a copy of the old tuple, so that we can compare it after + * re-acquiring the lock. + */ + origsz = ItemIdGetLength(lp); + origtup = brin_copy_tuple(brtup, origsz, NULL, NULL); + + /* + * Before releasing the lock, check if we can attempt a same-page + * update. Another process could insert a tuple concurrently in + * the same page though, so downstream we must be prepared to cope + * if this turns out to not be possible after all. + */ + newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz); + samepage = brin_can_do_samepage_update(buf, origsz, newsz); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + /* + * Try to update the tuple. If this doesn't work for whatever + * reason, we need to restart from the top; the revmap might be + * pointing at a different tuple for this block now, so we need to + * recompute to ensure both our new heap tuple and the other + * inserter's are covered by the combined tuple. It might be that + * we don't need to update at all. + */ + if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk, + buf, off, origtup, origsz, newtup, newsz, + samepage)) + { + /* no luck; start over */ + MemoryContextResetAndDeleteChildren(tupcxt); + continue; + } + } + + /* success! */ + break; + } + + brinRevmapTerminate(revmap); + if (BufferIsValid(buf)) + ReleaseBuffer(buf); + MemoryContextSwitchTo(oldcxt); + if (tupcxt != NULL) + MemoryContextDelete(tupcxt); + + return false; +} + +/* + * Initialize state for a BRIN index scan. + * + * We read the metapage here to determine the pages-per-range number that this + * index was built with. 
Note that since this cannot be changed while we're + * holding lock on index, it's not necessary to recompute it during brinrescan. + */ +IndexScanDesc +brinbeginscan(Relation r, int nkeys, int norderbys) +{ + IndexScanDesc scan; + BrinOpaque *opaque; + + scan = RelationGetIndexScan(r, nkeys, norderbys); + + opaque = (BrinOpaque *) palloc(sizeof(BrinOpaque)); + opaque->bo_rmAccess = brinRevmapInitialize(r, &opaque->bo_pagesPerRange, + scan->xs_snapshot); + opaque->bo_bdesc = brin_build_desc(r); + scan->opaque = opaque; + + return scan; +} + +/* + * Execute the index scan. + * + * This works by reading index TIDs from the revmap, and obtaining the index + * tuples pointed to by them; the summary values in the index tuples are + * compared to the scan keys. We return into the TID bitmap all the pages in + * ranges corresponding to index tuples that match the scan keys. + * + * If a TID from the revmap is read as InvalidTID, we know that range is + * unsummarized. Pages in those ranges need to be returned regardless of scan + * keys. + */ +int64 +bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm) +{ + Relation idxRel = scan->indexRelation; + Buffer buf = InvalidBuffer; + BrinDesc *bdesc; + Oid heapOid; + Relation heapRel; + BrinOpaque *opaque; + BlockNumber nblocks; + BlockNumber heapBlk; + int totalpages = 0; + FmgrInfo *consistentFn; + MemoryContext oldcxt; + MemoryContext perRangeCxt; + BrinMemTuple *dtup; + BrinTuple *btup = NULL; + Size btupsz = 0; + ScanKey **keys, + **nullkeys; + int *nkeys, + *nnullkeys; + int keyno; + char *ptr; + Size len; + char *tmp PG_USED_FOR_ASSERTS_ONLY; + + opaque = (BrinOpaque *) scan->opaque; + bdesc = opaque->bo_bdesc; + pgstat_count_index_scan(idxRel); + + /* + * We need to know the size of the table so that we know how long to + * iterate on the revmap. + */ + heapOid = IndexGetRelation(RelationGetRelid(idxRel), false); + heapRel = table_open(heapOid, AccessShareLock); + nblocks = RelationGetNumberOfBlocks(heapRel); + table_close(heapRel, AccessShareLock); + + /* + * Make room for the consistent support procedures of indexed columns. We + * don't look them up here; we do that lazily the first time we see a scan + * key reference each of them. We rely on zeroing fn_oid to InvalidOid. + */ + consistentFn = palloc0(sizeof(FmgrInfo) * bdesc->bd_tupdesc->natts); + + /* + * Make room for per-attribute lists of scan keys that we'll pass to the + * consistent support procedure. We don't know which attributes have scan + * keys, so we allocate space for all attributes. That may use more memory + * but it's probably cheaper than determining which attributes are used. + * + * We keep null and regular keys separate, so that we can pass just the + * regular keys to the consistent function easily. + * + * To reduce the allocation overhead, we allocate one big chunk and then + * carve it into smaller arrays ourselves. All the pieces have exactly the + * same lifetime, so that's OK. + * + * XXX The widest index can have 32 attributes, so the amount of wasted + * memory is negligible. We could invent a more compact approach (with + * just space for used attributes) but that would make the matching more + * complex so it's not a good trade-off. 
+ */ + len = + MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) + /* regular keys */ + MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts + + MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts) + + MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) + /* NULL keys */ + MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts + + MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts); + + ptr = palloc(len); + tmp = ptr; + + keys = (ScanKey **) ptr; + ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts); + + nullkeys = (ScanKey **) ptr; + ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts); + + nkeys = (int *) ptr; + ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts); + + nnullkeys = (int *) ptr; + ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts); + + for (int i = 0; i < bdesc->bd_tupdesc->natts; i++) + { + keys[i] = (ScanKey *) ptr; + ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys); + + nullkeys[i] = (ScanKey *) ptr; + ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys); + } + + Assert(tmp + len == ptr); + + /* zero the number of keys */ + memset(nkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts); + memset(nnullkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts); + + /* Preprocess the scan keys - split them into per-attribute arrays. */ + for (keyno = 0; keyno < scan->numberOfKeys; keyno++) + { + ScanKey key = &scan->keyData[keyno]; + AttrNumber keyattno = key->sk_attno; + + /* + * The collation of the scan key must match the collation used in the + * index column (but only if the search is not IS NULL/ IS NOT NULL). + * Otherwise we shouldn't be using this index ... + */ + Assert((key->sk_flags & SK_ISNULL) || + (key->sk_collation == + TupleDescAttr(bdesc->bd_tupdesc, + keyattno - 1)->attcollation)); + + /* + * First time we see this index attribute, so init as needed. + * + * This is a bit of an overkill - we don't know how many scan keys are + * there for this attribute, so we simply allocate the largest number + * possible (as if all keys were for this attribute). This may waste a + * bit of memory, but we only expect small number of scan keys in + * general, so this should be negligible, and repeated repalloc calls + * are not free either. + */ + if (consistentFn[keyattno - 1].fn_oid == InvalidOid) + { + FmgrInfo *tmp; + + /* First time we see this attribute, so no key/null keys. */ + Assert(nkeys[keyattno - 1] == 0); + Assert(nnullkeys[keyattno - 1] == 0); + + tmp = index_getprocinfo(idxRel, keyattno, + BRIN_PROCNUM_CONSISTENT); + fmgr_info_copy(&consistentFn[keyattno - 1], tmp, + CurrentMemoryContext); + } + + /* Add key to the proper per-attribute array. */ + if (key->sk_flags & SK_ISNULL) + { + nullkeys[keyattno - 1][nnullkeys[keyattno - 1]] = key; + nnullkeys[keyattno - 1]++; + } + else + { + keys[keyattno - 1][nkeys[keyattno - 1]] = key; + nkeys[keyattno - 1]++; + } + } + + /* allocate an initial in-memory tuple, out of the per-range memcxt */ + dtup = brin_new_memtuple(bdesc); + + /* + * Setup and use a per-range memory context, which is reset every time we + * loop below. This avoids having to free the tuples within the loop. + */ + perRangeCxt = AllocSetContextCreate(CurrentMemoryContext, + "bringetbitmap cxt", + ALLOCSET_DEFAULT_SIZES); + oldcxt = MemoryContextSwitchTo(perRangeCxt); + + /* + * Now scan the revmap. We start by querying for heap page 0, + * incrementing by the number of pages per range; this gives us a full + * view of the table. 
+ */ + for (heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange) + { + bool addrange; + bool gottuple = false; + BrinTuple *tup; + OffsetNumber off; + Size size; + + CHECK_FOR_INTERRUPTS(); + + MemoryContextResetAndDeleteChildren(perRangeCxt); + + tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, heapBlk, &buf, + &off, &size, BUFFER_LOCK_SHARE, + scan->xs_snapshot); + if (tup) + { + gottuple = true; + btup = brin_copy_tuple(tup, size, btup, &btupsz); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + } + + /* + * For page ranges with no indexed tuple, we must return the whole + * range; otherwise, compare it to the scan keys. + */ + if (!gottuple) + { + addrange = true; + } + else + { + dtup = brin_deform_tuple(bdesc, btup, dtup); + if (dtup->bt_placeholder) + { + /* + * Placeholder tuples are always returned, regardless of the + * values stored in them. + */ + addrange = true; + } + else + { + int attno; + + /* + * Compare scan keys with summary values stored for the range. + * If scan keys are matched, the page range must be added to + * the bitmap. We initially assume the range needs to be + * added; in particular this serves the case where there are + * no keys. + */ + addrange = true; + for (attno = 1; attno <= bdesc->bd_tupdesc->natts; attno++) + { + BrinValues *bval; + Datum add; + Oid collation; + + /* + * skip attributes without any scan keys (both regular and + * IS [NOT] NULL) + */ + if (nkeys[attno - 1] == 0 && nnullkeys[attno - 1] == 0) + continue; + + bval = &dtup->bt_columns[attno - 1]; + + /* + * First check if there are any IS [NOT] NULL scan keys, + * and if we're violating them. In that case we can + * terminate early, without invoking the support function. + * + * As there may be more keys, we can only determine + * mismatch within this loop. + */ + if (bdesc->bd_info[attno - 1]->oi_regular_nulls && + !check_null_keys(bval, nullkeys[attno - 1], + nnullkeys[attno - 1])) + { + /* + * If any of the IS [NOT] NULL keys failed, the page + * range as a whole can't pass. So terminate the loop. + */ + addrange = false; + break; + } + + /* + * So either there are no IS [NOT] NULL keys, or all + * passed. If there are no regular scan keys, we're done - + * the page range matches. If there are regular keys, but + * the page range is marked as 'all nulls' it can't + * possibly pass (we're assuming the operators are + * strict). + */ + + /* No regular scan keys - page range as a whole passes. */ + if (!nkeys[attno - 1]) + continue; + + Assert((nkeys[attno - 1] > 0) && + (nkeys[attno - 1] <= scan->numberOfKeys)); + + /* If it is all nulls, it cannot possibly be consistent. */ + if (bval->bv_allnulls) + { + addrange = false; + break; + } + + /* + * Collation from the first key (has to be the same for + * all keys for the same attribute). + */ + collation = keys[attno - 1][0]->sk_collation; + + /* + * Check whether the scan key is consistent with the page + * range values; if so, have the pages in the range added + * to the output bitmap. + * + * The opclass may or may not support processing of + * multiple scan keys. We can determine that based on the + * number of arguments - functions with extra parameter + * (number of scan keys) do support this, otherwise we + * have to simply pass the scan keys one by one. 
+ */ + if (consistentFn[attno - 1].fn_nargs >= 4) + { + /* Check all keys at once */ + add = FunctionCall4Coll(&consistentFn[attno - 1], + collation, + PointerGetDatum(bdesc), + PointerGetDatum(bval), + PointerGetDatum(keys[attno - 1]), + Int32GetDatum(nkeys[attno - 1])); + addrange = DatumGetBool(add); + } + else + { + /* + * Check keys one by one + * + * When there are multiple scan keys, failure to meet + * the criteria for a single one of them is enough to + * discard the range as a whole, so break out of the + * loop as soon as a false return value is obtained. + */ + int keyno; + + for (keyno = 0; keyno < nkeys[attno - 1]; keyno++) + { + add = FunctionCall3Coll(&consistentFn[attno - 1], + keys[attno - 1][keyno]->sk_collation, + PointerGetDatum(bdesc), + PointerGetDatum(bval), + PointerGetDatum(keys[attno - 1][keyno])); + addrange = DatumGetBool(add); + if (!addrange) + break; + } + } + } + } + } + + /* add the pages in the range to the output bitmap, if needed */ + if (addrange) + { + BlockNumber pageno; + + for (pageno = heapBlk; + pageno <= Min(nblocks, heapBlk + opaque->bo_pagesPerRange) - 1; + pageno++) + { + MemoryContextSwitchTo(oldcxt); + tbm_add_page(tbm, pageno); + totalpages++; + MemoryContextSwitchTo(perRangeCxt); + } + } + } + + MemoryContextSwitchTo(oldcxt); + MemoryContextDelete(perRangeCxt); + + if (buf != InvalidBuffer) + ReleaseBuffer(buf); + + /* + * XXX We have an approximation of the number of *pages* that our scan + * returns, but we don't have a precise idea of the number of heap tuples + * involved. + */ + return totalpages * 10; +} + +/* + * Re-initialize state for a BRIN index scan + */ +void +brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, + ScanKey orderbys, int norderbys) +{ + /* + * Other index AMs preprocess the scan keys at this point, or sometime + * early during the scan; this lets them optimize by removing redundant + * keys, or doing early returns when they are impossible to satisfy; see + * _bt_preprocess_keys for an example. Something like that could be added + * here someday, too. + */ + + if (scankey && scan->numberOfKeys > 0) + memmove(scan->keyData, scankey, + scan->numberOfKeys * sizeof(ScanKeyData)); +} + +/* + * Close down a BRIN index scan + */ +void +brinendscan(IndexScanDesc scan) +{ + BrinOpaque *opaque = (BrinOpaque *) scan->opaque; + + brinRevmapTerminate(opaque->bo_rmAccess); + brin_free_desc(opaque->bo_bdesc); + pfree(opaque); +} + +/* + * Per-heap-tuple callback for table_index_build_scan. + * + * Note we don't worry about the page range at the end of the table here; it is + * present in the build state struct after we're called the last time, but not + * inserted into the index. Caller must ensure to do so, if appropriate. + */ +static void +brinbuildCallback(Relation index, + ItemPointer tid, + Datum *values, + bool *isnull, + bool tupleIsAlive, + void *brstate) +{ + BrinBuildState *state = (BrinBuildState *) brstate; + BlockNumber thisblock; + + thisblock = ItemPointerGetBlockNumber(tid); + + /* + * If we're in a block that belongs to a future range, summarize what + * we've got and start afresh. Note the scan might have skipped many + * pages, if they were devoid of live tuples; make sure to insert index + * tuples for those too. 
+ */ + while (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1) + { + + BRIN_elog((DEBUG2, + "brinbuildCallback: completed a range: %u--%u", + state->bs_currRangeStart, + state->bs_currRangeStart + state->bs_pagesPerRange)); + + /* create the index tuple and insert it */ + form_and_insert_tuple(state); + + /* set state to correspond to the next range */ + state->bs_currRangeStart += state->bs_pagesPerRange; + + /* re-initialize state for it */ + brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc); + } + + /* Accumulate the current tuple into the running state */ + (void) add_values_to_range(index, state->bs_bdesc, state->bs_dtuple, + values, isnull); +} + +/* + * brinbuild() -- build a new BRIN index. + */ +IndexBuildResult * +brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) +{ + IndexBuildResult *result; + double reltuples; + double idxtuples; + BrinRevmap *revmap; + BrinBuildState *state; + Buffer meta; + BlockNumber pagesPerRange; + + /* + * We expect to be called exactly once for any index relation. + */ + if (RelationGetNumberOfBlocks(index) != 0) + elog(ERROR, "index \"%s\" already contains data", + RelationGetRelationName(index)); + + /* + * Critical section not required, because on error the creation of the + * whole relation will be rolled back. + */ + + meta = ReadBuffer(index, P_NEW); + Assert(BufferGetBlockNumber(meta) == BRIN_METAPAGE_BLKNO); + LockBuffer(meta, BUFFER_LOCK_EXCLUSIVE); + + brin_metapage_init(BufferGetPage(meta), BrinGetPagesPerRange(index), + BRIN_CURRENT_VERSION); + MarkBufferDirty(meta); + + if (RelationNeedsWAL(index)) + { + xl_brin_createidx xlrec; + XLogRecPtr recptr; + Page page; + + xlrec.version = BRIN_CURRENT_VERSION; + xlrec.pagesPerRange = BrinGetPagesPerRange(index); + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBrinCreateIdx); + XLogRegisterBuffer(0, meta, REGBUF_WILL_INIT | REGBUF_STANDARD); + + recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_CREATE_INDEX); + + page = BufferGetPage(meta); + PageSetLSN(page, recptr); + } + + UnlockReleaseBuffer(meta); + + /* + * Initialize our state, including the deformed tuple state. + */ + revmap = brinRevmapInitialize(index, &pagesPerRange, NULL); + state = initialize_brin_buildstate(index, revmap, pagesPerRange); + + /* + * Now scan the relation. No syncscan allowed here because we want the + * heap blocks in physical order. + */ + reltuples = table_index_build_scan(heap, index, indexInfo, false, true, + brinbuildCallback, (void *) state, NULL); + + /* process the final batch */ + form_and_insert_tuple(state); + + /* release resources */ + idxtuples = state->bs_numtuples; + brinRevmapTerminate(state->bs_rmAccess); + terminate_brin_buildstate(state); + + /* + * Return statistics + */ + result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); + + result->heap_tuples = reltuples; + result->index_tuples = idxtuples; + + return result; +} + +void +brinbuildempty(Relation index) +{ + Buffer metabuf; + + /* An empty BRIN index has a metapage only. */ + metabuf = + ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL); + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + + /* Initialize and xlog metabuffer. 
*/ + START_CRIT_SECTION(); + brin_metapage_init(BufferGetPage(metabuf), BrinGetPagesPerRange(index), + BRIN_CURRENT_VERSION); + MarkBufferDirty(metabuf); + log_newpage_buffer(metabuf, true); + END_CRIT_SECTION(); + + UnlockReleaseBuffer(metabuf); +} + +/* + * brinbulkdelete + * Since there are no per-heap-tuple index tuples in BRIN indexes, + * there's not a lot we can do here. + * + * XXX we could mark item tuples as "dirty" (when a minimum or maximum heap + * tuple is deleted), meaning the need to re-run summarization on the affected + * range. Would need to add an extra flag in brintuples for that. + */ +IndexBulkDeleteResult * +brinbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, void *callback_state) +{ + /* allocate stats if first time through, else re-use existing struct */ + if (stats == NULL) + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + + return stats; +} + +/* + * This routine is in charge of "vacuuming" a BRIN index: we just summarize + * ranges that are currently unsummarized. + */ +IndexBulkDeleteResult * +brinvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) +{ + Relation heapRel; + + /* No-op in ANALYZE ONLY mode */ + if (info->analyze_only) + return stats; + + if (!stats) + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + stats->num_pages = RelationGetNumberOfBlocks(info->index); + /* rest of stats is initialized by zeroing */ + + heapRel = table_open(IndexGetRelation(RelationGetRelid(info->index), false), + AccessShareLock); + + brin_vacuum_scan(info->index, info->strategy); + + brinsummarize(info->index, heapRel, BRIN_ALL_BLOCKRANGES, false, + &stats->num_index_tuples, &stats->num_index_tuples); + + table_close(heapRel, AccessShareLock); + + return stats; +} + +/* + * reloptions processor for BRIN indexes + */ +bytea * +brinoptions(Datum reloptions, bool validate) +{ + static const relopt_parse_elt tab[] = { + {"pages_per_range", RELOPT_TYPE_INT, offsetof(BrinOptions, pagesPerRange)}, + {"autosummarize", RELOPT_TYPE_BOOL, offsetof(BrinOptions, autosummarize)} + }; + + return (bytea *) build_reloptions(reloptions, validate, + RELOPT_KIND_BRIN, + sizeof(BrinOptions), + tab, lengthof(tab)); +} + +/* + * SQL-callable function to scan through an index and summarize all ranges + * that are not currently summarized. + */ +Datum +brin_summarize_new_values(PG_FUNCTION_ARGS) +{ + Datum relation = PG_GETARG_DATUM(0); + + return DirectFunctionCall2(brin_summarize_range, + relation, + Int64GetDatum((int64) BRIN_ALL_BLOCKRANGES)); +} + +/* + * SQL-callable function to summarize the indicated page range, if not already + * summarized. If the second argument is BRIN_ALL_BLOCKRANGES, all + * unsummarized ranges are summarized. 
+ */ +Datum +brin_summarize_range(PG_FUNCTION_ARGS) +{ + Oid indexoid = PG_GETARG_OID(0); + int64 heapBlk64 = PG_GETARG_INT64(1); + BlockNumber heapBlk; + Oid heapoid; + Relation indexRel; + Relation heapRel; + Oid save_userid; + int save_sec_context; + int save_nestlevel; + double numSummarized = 0; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("BRIN control functions cannot be executed during recovery."))); + + if (heapBlk64 > BRIN_ALL_BLOCKRANGES || heapBlk64 < 0) + { + char *blk = psprintf(INT64_FORMAT, heapBlk64); + + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("block number out of range: %s", blk))); + } + heapBlk = (BlockNumber) heapBlk64; + + /* + * We must lock table before index to avoid deadlocks. However, if the + * passed indexoid isn't an index then IndexGetRelation() will fail. + * Rather than emitting a not-very-helpful error message, postpone + * complaining, expecting that the is-it-an-index test below will fail. + */ + heapoid = IndexGetRelation(indexoid, true); + if (OidIsValid(heapoid)) + { + heapRel = table_open(heapoid, ShareUpdateExclusiveLock); + + /* + * Autovacuum calls us. For its benefit, switch to the table owner's + * userid, so that any index functions are run as that user. Also + * lock down security-restricted operations and arrange to make GUC + * variable changes local to this command. This is harmless, albeit + * unnecessary, when called from SQL, because we fail shortly if the + * user does not own the index. + */ + GetUserIdAndSecContext(&save_userid, &save_sec_context); + SetUserIdAndSecContext(heapRel->rd_rel->relowner, + save_sec_context | SECURITY_RESTRICTED_OPERATION); + save_nestlevel = NewGUCNestLevel(); + } + else + { + heapRel = NULL; + /* Set these just to suppress "uninitialized variable" warnings */ + save_userid = InvalidOid; + save_sec_context = -1; + save_nestlevel = -1; + } + + indexRel = index_open(indexoid, ShareUpdateExclusiveLock); + + /* Must be a BRIN index */ + if (indexRel->rd_rel->relkind != RELKIND_INDEX || + indexRel->rd_rel->relam != BRIN_AM_OID) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is not a BRIN index", + RelationGetRelationName(indexRel)))); + + /* User must own the index (comparable to privileges needed for VACUUM) */ + if (heapRel != NULL && !pg_class_ownercheck(indexoid, save_userid)) + aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX, + RelationGetRelationName(indexRel)); + + /* + * Since we did the IndexGetRelation call above without any lock, it's + * barely possible that a race against an index drop/recreation could have + * netted us the wrong table. Recheck. 
+ */ + if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_TABLE), + errmsg("could not open parent table of index \"%s\"", + RelationGetRelationName(indexRel)))); + + /* OK, do it */ + brinsummarize(indexRel, heapRel, heapBlk, true, &numSummarized, NULL); + + /* Roll back any GUC changes executed by index functions */ + AtEOXact_GUC(false, save_nestlevel); + + /* Restore userid and security context */ + SetUserIdAndSecContext(save_userid, save_sec_context); + + relation_close(indexRel, ShareUpdateExclusiveLock); + relation_close(heapRel, ShareUpdateExclusiveLock); + + PG_RETURN_INT32((int32) numSummarized); +} + +/* + * SQL-callable interface to mark a range as no longer summarized + */ +Datum +brin_desummarize_range(PG_FUNCTION_ARGS) +{ + Oid indexoid = PG_GETARG_OID(0); + int64 heapBlk64 = PG_GETARG_INT64(1); + BlockNumber heapBlk; + Oid heapoid; + Relation heapRel; + Relation indexRel; + bool done; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("BRIN control functions cannot be executed during recovery."))); + + if (heapBlk64 > MaxBlockNumber || heapBlk64 < 0) + { + char *blk = psprintf(INT64_FORMAT, heapBlk64); + + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("block number out of range: %s", blk))); + } + heapBlk = (BlockNumber) heapBlk64; + + /* + * We must lock table before index to avoid deadlocks. However, if the + * passed indexoid isn't an index then IndexGetRelation() will fail. + * Rather than emitting a not-very-helpful error message, postpone + * complaining, expecting that the is-it-an-index test below will fail. + * + * Unlike brin_summarize_range(), autovacuum never calls this. Hence, we + * don't switch userid. + */ + heapoid = IndexGetRelation(indexoid, true); + if (OidIsValid(heapoid)) + heapRel = table_open(heapoid, ShareUpdateExclusiveLock); + else + heapRel = NULL; + + indexRel = index_open(indexoid, ShareUpdateExclusiveLock); + + /* Must be a BRIN index */ + if (indexRel->rd_rel->relkind != RELKIND_INDEX || + indexRel->rd_rel->relam != BRIN_AM_OID) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is not a BRIN index", + RelationGetRelationName(indexRel)))); + + /* User must own the index (comparable to privileges needed for VACUUM) */ + if (!pg_class_ownercheck(indexoid, GetUserId())) + aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX, + RelationGetRelationName(indexRel)); + + /* + * Since we did the IndexGetRelation call above without any lock, it's + * barely possible that a race against an index drop/recreation could have + * netted us the wrong table. Recheck. 
+ */ + if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_TABLE), + errmsg("could not open parent table of index \"%s\"", + RelationGetRelationName(indexRel)))); + + /* the revmap does the hard work */ + do + { + done = brinRevmapDesummarizeRange(indexRel, heapBlk); + } + while (!done); + + relation_close(indexRel, ShareUpdateExclusiveLock); + relation_close(heapRel, ShareUpdateExclusiveLock); + + PG_RETURN_VOID(); +} + +/* + * Build a BrinDesc used to create or scan a BRIN index + */ +BrinDesc * +brin_build_desc(Relation rel) +{ + BrinOpcInfo **opcinfo; + BrinDesc *bdesc; + TupleDesc tupdesc; + int totalstored = 0; + int keyno; + long totalsize; + MemoryContext cxt; + MemoryContext oldcxt; + + cxt = AllocSetContextCreate(CurrentMemoryContext, + "brin desc cxt", + ALLOCSET_SMALL_SIZES); + oldcxt = MemoryContextSwitchTo(cxt); + tupdesc = RelationGetDescr(rel); + + /* + * Obtain BrinOpcInfo for each indexed column. While at it, accumulate + * the number of columns stored, since the number is opclass-defined. + */ + opcinfo = (BrinOpcInfo **) palloc(sizeof(BrinOpcInfo *) * tupdesc->natts); + for (keyno = 0; keyno < tupdesc->natts; keyno++) + { + FmgrInfo *opcInfoFn; + Form_pg_attribute attr = TupleDescAttr(tupdesc, keyno); + + opcInfoFn = index_getprocinfo(rel, keyno + 1, BRIN_PROCNUM_OPCINFO); + + opcinfo[keyno] = (BrinOpcInfo *) + DatumGetPointer(FunctionCall1(opcInfoFn, attr->atttypid)); + totalstored += opcinfo[keyno]->oi_nstored; + } + + /* Allocate our result struct and fill it in */ + totalsize = offsetof(BrinDesc, bd_info) + + sizeof(BrinOpcInfo *) * tupdesc->natts; + + bdesc = palloc(totalsize); + bdesc->bd_context = cxt; + bdesc->bd_index = rel; + bdesc->bd_tupdesc = tupdesc; + bdesc->bd_disktdesc = NULL; /* generated lazily */ + bdesc->bd_totalstored = totalstored; + + for (keyno = 0; keyno < tupdesc->natts; keyno++) + bdesc->bd_info[keyno] = opcinfo[keyno]; + pfree(opcinfo); + + MemoryContextSwitchTo(oldcxt); + + return bdesc; +} + +void +brin_free_desc(BrinDesc *bdesc) +{ + /* make sure the tupdesc is still valid */ + Assert(bdesc->bd_tupdesc->tdrefcount >= 1); + /* no need for retail pfree */ + MemoryContextDelete(bdesc->bd_context); +} + +/* + * Fetch index's statistical data into *stats + */ +void +brinGetStats(Relation index, BrinStatsData *stats) +{ + Buffer metabuffer; + Page metapage; + BrinMetaPageData *metadata; + + metabuffer = ReadBuffer(index, BRIN_METAPAGE_BLKNO); + LockBuffer(metabuffer, BUFFER_LOCK_SHARE); + metapage = BufferGetPage(metabuffer); + metadata = (BrinMetaPageData *) PageGetContents(metapage); + + stats->pagesPerRange = metadata->pagesPerRange; + stats->revmapNumPages = metadata->lastRevmapPage - 1; + + UnlockReleaseBuffer(metabuffer); +} + +/* + * Initialize a BrinBuildState appropriate to create tuples on the given index. + */ +static BrinBuildState * +initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap, + BlockNumber pagesPerRange) +{ + BrinBuildState *state; + + state = palloc(sizeof(BrinBuildState)); + + state->bs_irel = idxRel; + state->bs_numtuples = 0; + state->bs_currentInsertBuf = InvalidBuffer; + state->bs_pagesPerRange = pagesPerRange; + state->bs_currRangeStart = 0; + state->bs_rmAccess = revmap; + state->bs_bdesc = brin_build_desc(idxRel); + state->bs_dtuple = brin_new_memtuple(state->bs_bdesc); + + brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc); + + return state; +} + +/* + * Release resources associated with a BrinBuildState. 
+ */ +static void +terminate_brin_buildstate(BrinBuildState *state) +{ + /* + * Release the last index buffer used. We might as well ensure that + * whatever free space remains in that page is available in FSM, too. + */ + if (!BufferIsInvalid(state->bs_currentInsertBuf)) + { + Page page; + Size freespace; + BlockNumber blk; + + page = BufferGetPage(state->bs_currentInsertBuf); + freespace = PageGetFreeSpace(page); + blk = BufferGetBlockNumber(state->bs_currentInsertBuf); + ReleaseBuffer(state->bs_currentInsertBuf); + RecordPageWithFreeSpace(state->bs_irel, blk, freespace); + FreeSpaceMapVacuumRange(state->bs_irel, blk, blk + 1); + } + + brin_free_desc(state->bs_bdesc); + pfree(state->bs_dtuple); + pfree(state); +} + +/* + * On the given BRIN index, summarize the heap page range that corresponds + * to the heap block number given. + * + * This routine can run in parallel with insertions into the heap. To avoid + * missing those values from the summary tuple, we first insert a placeholder + * index tuple into the index, then execute the heap scan; transactions + * concurrent with the scan update the placeholder tuple. After the scan, we + * union the placeholder tuple with the one computed by this routine. The + * update of the index value happens in a loop, so that if somebody updates + * the placeholder tuple after we read it, we detect the case and try again. + * This ensures that the concurrently inserted tuples are not lost. + * + * A further corner case is this routine being asked to summarize the partial + * range at the end of the table. heapNumBlocks is the (possibly outdated) + * table size; if we notice that the requested range lies beyond that size, + * we re-compute the table size after inserting the placeholder tuple, to + * avoid missing pages that were appended recently. + */ +static void +summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel, + BlockNumber heapBlk, BlockNumber heapNumBlks) +{ + Buffer phbuf; + BrinTuple *phtup; + Size phsz; + OffsetNumber offset; + BlockNumber scanNumBlks; + + /* + * Insert the placeholder tuple + */ + phbuf = InvalidBuffer; + phtup = brin_form_placeholder_tuple(state->bs_bdesc, heapBlk, &phsz); + offset = brin_doinsert(state->bs_irel, state->bs_pagesPerRange, + state->bs_rmAccess, &phbuf, + heapBlk, phtup, phsz); + + /* + * Compute range end. We hold ShareUpdateExclusive lock on table, so it + * cannot shrink concurrently (but it can grow). + */ + Assert(heapBlk % state->bs_pagesPerRange == 0); + if (heapBlk + state->bs_pagesPerRange > heapNumBlks) + { + /* + * If we're asked to scan what we believe to be the final range on the + * table (i.e. a range that might be partial) we need to recompute our + * idea of what the latest page is after inserting the placeholder + * tuple. Anyone that grows the table later will update the + * placeholder tuple, so it doesn't matter that we won't scan these + * pages ourselves. Careful: the table might have been extended + * beyond the current range, so clamp our result. + * + * Fortunately, this should occur infrequently. + */ + scanNumBlks = Min(RelationGetNumberOfBlocks(heapRel) - heapBlk, + state->bs_pagesPerRange); + } + else + { + /* Easy case: range is known to be complete */ + scanNumBlks = state->bs_pagesPerRange; + } + + /* + * Execute the partial heap scan covering the heap blocks in the specified + * page range, summarizing the heap tuples in it. This scan stops just + * short of brinbuildCallback creating the new index entry. 
+ * + * Note that it is critical we use the "any visible" mode of + * table_index_build_range_scan here: otherwise, we would miss tuples + * inserted by transactions that are still in progress, among other corner + * cases. + */ + state->bs_currRangeStart = heapBlk; + table_index_build_range_scan(heapRel, state->bs_irel, indexInfo, false, true, false, + heapBlk, scanNumBlks, + brinbuildCallback, (void *) state, NULL); + + /* + * Now we update the values obtained by the scan with the placeholder + * tuple. We do this in a loop which only terminates if we're able to + * update the placeholder tuple successfully; if we are not, this means + * somebody else modified the placeholder tuple after we read it. + */ + for (;;) + { + BrinTuple *newtup; + Size newsize; + bool didupdate; + bool samepage; + + CHECK_FOR_INTERRUPTS(); + + /* + * Update the summary tuple and try to update. + */ + newtup = brin_form_tuple(state->bs_bdesc, + heapBlk, state->bs_dtuple, &newsize); + samepage = brin_can_do_samepage_update(phbuf, phsz, newsize); + didupdate = + brin_doupdate(state->bs_irel, state->bs_pagesPerRange, + state->bs_rmAccess, heapBlk, phbuf, offset, + phtup, phsz, newtup, newsize, samepage); + brin_free_tuple(phtup); + brin_free_tuple(newtup); + + /* If the update succeeded, we're done. */ + if (didupdate) + break; + + /* + * If the update didn't work, it might be because somebody updated the + * placeholder tuple concurrently. Extract the new version, union it + * with the values we have from the scan, and start over. (There are + * other reasons for the update to fail, but it's simple to treat them + * the same.) + */ + phtup = brinGetTupleForHeapBlock(state->bs_rmAccess, heapBlk, &phbuf, + &offset, &phsz, BUFFER_LOCK_SHARE, + NULL); + /* the placeholder tuple must exist */ + if (phtup == NULL) + elog(ERROR, "missing placeholder tuple"); + phtup = brin_copy_tuple(phtup, phsz, NULL, NULL); + LockBuffer(phbuf, BUFFER_LOCK_UNLOCK); + + /* merge it into the tuple from the heap scan */ + union_tuples(state->bs_bdesc, state->bs_dtuple, phtup); + } + + ReleaseBuffer(phbuf); +} + +/* + * Summarize page ranges that are not already summarized. If pageRange is + * BRIN_ALL_BLOCKRANGES then the whole table is scanned; otherwise, only the + * page range containing the given heap page number is scanned. + * If include_partial is true, then the partial range at the end of the table + * is summarized, otherwise not. + * + * For each new index tuple inserted, *numSummarized (if not NULL) is + * incremented; for each existing tuple, *numExisting (if not NULL) is + * incremented. + */ +static void +brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange, + bool include_partial, double *numSummarized, double *numExisting) +{ + BrinRevmap *revmap; + BrinBuildState *state = NULL; + IndexInfo *indexInfo = NULL; + BlockNumber heapNumBlocks; + BlockNumber pagesPerRange; + Buffer buf; + BlockNumber startBlk; + + revmap = brinRevmapInitialize(index, &pagesPerRange, NULL); + + /* determine range of pages to process */ + heapNumBlocks = RelationGetNumberOfBlocks(heapRel); + if (pageRange == BRIN_ALL_BLOCKRANGES) + startBlk = 0; + else + { + startBlk = (pageRange / pagesPerRange) * pagesPerRange; + heapNumBlocks = Min(heapNumBlocks, startBlk + pagesPerRange); + } + if (startBlk > heapNumBlocks) + { + /* Nothing to do if start point is beyond end of table */ + brinRevmapTerminate(revmap); + return; + } + + /* + * Scan the revmap to find unsummarized items. 
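+ * For each candidate range start we probe the revmap: a NULL result means
+ * the range has no summary yet (never summarized, or desummarized), so we
+ * build one via summarize_range(); a non-NULL result just bumps the count
+ * of already-summarized ranges. The build state and IndexInfo are created
+ * lazily, on the first range that actually needs summarizing.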
+ */ + buf = InvalidBuffer; + for (; startBlk < heapNumBlocks; startBlk += pagesPerRange) + { + BrinTuple *tup; + OffsetNumber off; + + /* + * Unless requested to summarize even a partial range, go away now if + * we think the next range is partial. Caller would pass true when it + * is typically run once bulk data loading is done + * (brin_summarize_new_values), and false when it is typically the + * result of arbitrarily-scheduled maintenance command (vacuuming). + */ + if (!include_partial && + (startBlk + pagesPerRange > heapNumBlocks)) + break; + + CHECK_FOR_INTERRUPTS(); + + tup = brinGetTupleForHeapBlock(revmap, startBlk, &buf, &off, NULL, + BUFFER_LOCK_SHARE, NULL); + if (tup == NULL) + { + /* no revmap entry for this heap range. Summarize it. */ + if (state == NULL) + { + /* first time through */ + Assert(!indexInfo); + state = initialize_brin_buildstate(index, revmap, + pagesPerRange); + indexInfo = BuildIndexInfo(index); + } + summarize_range(indexInfo, state, heapRel, startBlk, heapNumBlocks); + + /* and re-initialize state for the next range */ + brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc); + + if (numSummarized) + *numSummarized += 1.0; + } + else + { + if (numExisting) + *numExisting += 1.0; + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + } + } + + if (BufferIsValid(buf)) + ReleaseBuffer(buf); + + /* free resources */ + brinRevmapTerminate(revmap); + if (state) + { + terminate_brin_buildstate(state); + pfree(indexInfo); + } +} + +/* + * Given a deformed tuple in the build state, convert it into the on-disk + * format and insert it into the index, making the revmap point to it. + */ +static void +form_and_insert_tuple(BrinBuildState *state) +{ + BrinTuple *tup; + Size size; + + tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart, + state->bs_dtuple, &size); + brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess, + &state->bs_currentInsertBuf, state->bs_currRangeStart, + tup, size); + state->bs_numtuples++; + + pfree(tup); +} + +/* + * Given two deformed tuples, adjust the first one so that it's consistent + * with the summary values in both. + */ +static void +union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b) +{ + int keyno; + BrinMemTuple *db; + MemoryContext cxt; + MemoryContext oldcxt; + + /* Use our own memory context to avoid retail pfree */ + cxt = AllocSetContextCreate(CurrentMemoryContext, + "brin union", + ALLOCSET_DEFAULT_SIZES); + oldcxt = MemoryContextSwitchTo(cxt); + db = brin_deform_tuple(bdesc, b, NULL); + MemoryContextSwitchTo(oldcxt); + + for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) + { + FmgrInfo *unionFn; + BrinValues *col_a = &a->bt_columns[keyno]; + BrinValues *col_b = &db->bt_columns[keyno]; + BrinOpcInfo *opcinfo = bdesc->bd_info[keyno]; + + if (opcinfo->oi_regular_nulls) + { + /* Adjust "hasnulls". */ + if (!col_a->bv_hasnulls && col_b->bv_hasnulls) + col_a->bv_hasnulls = true; + + /* If there are no values in B, there's nothing left to do. */ + if (col_b->bv_allnulls) + continue; + + /* + * Adjust "allnulls". If A doesn't have values, just copy the + * values from B into A, and we're done. We cannot run the + * operators in this case, because values in A might contain + * garbage. Note we already established that B contains values. 
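+ *
+ * (To summarize the per-column merge: bv_hasnulls is simply OR-ed; if B
+ * is all-nulls there is nothing further to merge; if A is all-nulls we
+ * copy B's stored values into A; otherwise the opclass "union" support
+ * procedure folds B's values into A's.)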
+ */ + if (col_a->bv_allnulls) + { + int i; + + col_a->bv_allnulls = false; + + for (i = 0; i < opcinfo->oi_nstored; i++) + col_a->bv_values[i] = + datumCopy(col_b->bv_values[i], + opcinfo->oi_typcache[i]->typbyval, + opcinfo->oi_typcache[i]->typlen); + + continue; + } + } + + unionFn = index_getprocinfo(bdesc->bd_index, keyno + 1, + BRIN_PROCNUM_UNION); + FunctionCall3Coll(unionFn, + bdesc->bd_index->rd_indcollation[keyno], + PointerGetDatum(bdesc), + PointerGetDatum(col_a), + PointerGetDatum(col_b)); + } + + MemoryContextDelete(cxt); +} + +/* + * brin_vacuum_scan + * Do a complete scan of the index during VACUUM. + * + * This routine scans the complete index looking for uncatalogued index pages, + * i.e. those that might have been lost due to a crash after index extension + * and such. + */ +static void +brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy) +{ + BlockNumber nblocks; + BlockNumber blkno; + + /* + * Scan the index in physical order, and clean up any possible mess in + * each page. + */ + nblocks = RelationGetNumberOfBlocks(idxrel); + for (blkno = 0; blkno < nblocks; blkno++) + { + Buffer buf; + + CHECK_FOR_INTERRUPTS(); + + buf = ReadBufferExtended(idxrel, MAIN_FORKNUM, blkno, + RBM_NORMAL, strategy); + + brin_page_cleanup(idxrel, buf); + + ReleaseBuffer(buf); + } + + /* + * Update all upper pages in the index's FSM, as well. This ensures not + * only that we propagate leaf-page FSM updates made by brin_page_cleanup, + * but also that any pre-existing damage or out-of-dateness is repaired. + */ + FreeSpaceMapVacuum(idxrel); +} + +static bool +add_values_to_range(Relation idxRel, BrinDesc *bdesc, BrinMemTuple *dtup, + Datum *values, bool *nulls) +{ + int keyno; + bool modified = false; + + /* + * Compare the key values of the new tuple to the stored index values; our + * deformed tuple will get updated if the new tuple doesn't fit the + * original range (note this means we can't break out of the loop early). + * Make a note of whether this happens, so that we know to insert the + * modified tuple later. + */ + for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) + { + Datum result; + BrinValues *bval; + FmgrInfo *addValue; + + bval = &dtup->bt_columns[keyno]; + + if (bdesc->bd_info[keyno]->oi_regular_nulls && nulls[keyno]) + { + /* + * If the new value is null, we record that we saw it if it's the + * first one; otherwise, there's nothing to do. + */ + if (!bval->bv_hasnulls) + { + bval->bv_hasnulls = true; + modified = true; + } + + continue; + } + + addValue = index_getprocinfo(idxRel, keyno + 1, + BRIN_PROCNUM_ADDVALUE); + result = FunctionCall4Coll(addValue, + idxRel->rd_indcollation[keyno], + PointerGetDatum(bdesc), + PointerGetDatum(bval), + values[keyno], + nulls[keyno]); + /* if that returned true, we need to insert the updated tuple */ + modified |= DatumGetBool(result); + } + + return modified; +} + +static bool +check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys) +{ + int keyno; + + /* + * First check if there are any IS [NOT] NULL scan keys, and if we're + * violating them. 
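+ *
+ * In short: an IS NULL key can only be satisfied if the range recorded
+ * some nulls (bv_hasnulls or bv_allnulls set); an IS NOT NULL key rules
+ * out only all-nulls ranges; and a regular comparison key whose argument
+ * is NULL can never match, since the indexable operators are assumed to
+ * be strict.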
+ */ + for (keyno = 0; keyno < nnullkeys; keyno++) + { + ScanKey key = nullkeys[keyno]; + + Assert(key->sk_attno == bval->bv_attno); + + /* Handle only IS NULL/IS NOT NULL tests */ + if (!(key->sk_flags & SK_ISNULL)) + continue; + + if (key->sk_flags & SK_SEARCHNULL) + { + /* IS NULL scan key, but range has no NULLs */ + if (!bval->bv_allnulls && !bval->bv_hasnulls) + return false; + } + else if (key->sk_flags & SK_SEARCHNOTNULL) + { + /* + * For IS NOT NULL, we can only skip ranges that are known to have + * only nulls. + */ + if (bval->bv_allnulls) + return false; + } + else + { + /* + * Neither IS NULL nor IS NOT NULL was used; assume all indexable + * operators are strict and thus return false with NULL value in + * the scan key. + */ + return false; + } + } + + return true; +} diff --git a/src/backend/access/brin/brin_bloom.c b/src/backend/access/brin/brin_bloom.c new file mode 100644 index 0000000..2c8a20a --- /dev/null +++ b/src/backend/access/brin/brin_bloom.c @@ -0,0 +1,809 @@ +/* + * brin_bloom.c + * Implementation of Bloom opclass for BRIN + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * A BRIN opclass summarizing page range into a bloom filter. + * + * Bloom filters allow efficient testing whether a given page range contains + * a particular value. Therefore, if we summarize each page range into a small + * bloom filter, we can easily (and cheaply) test whether it contains values + * we get later. + * + * The index only supports equality operators, similarly to hash indexes. + * Bloom indexes are however much smaller, and support only bitmap scans. + * + * Note: Don't confuse this with bloom indexes, implemented in a contrib + * module. That extension implements an entirely new AM, building a bloom + * filter on multiple columns in a single row. This opclass works with an + * existing AM (BRIN) and builds bloom filter on a column. + * + * + * values vs. hashes + * ----------------- + * + * The original column values are not used directly, but are first hashed + * using the regular type-specific hash function, producing a uint32 hash. + * And this hash value is then added to the summary - i.e. it's hashed + * again and added to the bloom filter. + * + * This allows the code to treat all data types (byval/byref/...) the same + * way, with only minimal space requirements, because we're working with + * hashes and not the original values. Everything is uint32. + * + * Of course, this assumes the built-in hash function is reasonably good, + * without too many collisions etc. But that does seem to be the case, at + * least based on past experience. After all, the same hash functions are + * used for hash indexes, hash partitioning and so on. + * + * + * hashing scheme + * -------------- + * + * Bloom filters require a number of independent hash functions. There are + * different schemes how to construct them - for example we might use + * hash_uint32_extended with random seeds, but that seems fairly expensive. + * We use a scheme requiring only two functions described in this paper: + * + * Less Hashing, Same Performance:Building a Better Bloom Filter + * Adam Kirsch, Michael Mitzenmacher†, Harvard School of Engineering and + * Applied Sciences, Cambridge, Massachusetts [DOI 10.1002/rsa.20208] + * + * The two hash functions h1 and h2 are calculated using hard-coded seeds, + * and then combined using (h1 + i * h2) to generate the hash functions. 
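+ *
+ * In effect, for a filter with nbits bits and k hash functions, the bit
+ * positions probed for a given 32-bit hash value are (this is just an
+ * illustration of what bloom_add_value/bloom_contains_value do below):
+ *
+ *     h1 = hash_bytes_uint32_extended(hashValue, BLOOM_SEED_1) % nbits;
+ *     h2 = hash_bytes_uint32_extended(hashValue, BLOOM_SEED_2) % nbits;
+ *     for (i = 0; i < k; i++)
+ *         set bit number (h1 + i * h2) % nbits;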
+ * + * + * sizing the bloom filter + * ----------------------- + * + * Size of a bloom filter depends on the number of distinct values we will + * store in it, and the desired false positive rate. The higher the number + * of distinct values and/or the lower the false positive rate, the larger + * the bloom filter. On the other hand, we want to keep the index as small + * as possible - that's one of the basic advantages of BRIN indexes. + * + * Although the number of distinct elements (in a page range) depends on + * the data, we can consider it fixed. This simplifies the trade-off to + * just false positive rate vs. size. + * + * At the page range level, false positive rate is a probability the bloom + * filter matches a random value. For the whole index (with sufficiently + * many page ranges) it represents the fraction of the index ranges (and + * thus fraction of the table to be scanned) matching the random value. + * + * Furthermore, the size of the bloom filter is subject to implementation + * limits - it has to fit onto a single index page (8kB by default). As + * the bitmap is inherently random (when "full" about half the bits is set + * to 1, randomly), compression can't help very much. + * + * To reduce the size of a filter (to fit to a page), we have to either + * accept higher false positive rate (undesirable), or reduce the number + * of distinct items to be stored in the filter. We can't alter the input + * data, of course, but we may make the BRIN page ranges smaller - instead + * of the default 128 pages (1MB) we may build index with 16-page ranges, + * or something like that. This should reduce the number of distinct values + * in the page range, making the filter smaller (with fixed false positive + * rate). Even for random data sets this should help, as the number of rows + * per heap page is limited (to ~290 with very narrow tables, likely ~20 + * in practice). + * + * Of course, good sizing decisions depend on having the necessary data, + * i.e. number of distinct values in a page range (of a given size) and + * table size (to estimate cost change due to change in false positive + * rate due to having larger index vs. scanning larger indexes). We may + * not have that data - for example when building an index on empty table + * it's not really possible. And for some data we only have estimates for + * the whole table and we can only estimate per-range values (ndistinct). + * + * Another challenge is that while the bloom filter is per-column, it's + * the whole index tuple that has to fit into a page. And for multi-column + * indexes that may include pieces we have no control over (not necessarily + * bloom filters, the other columns may use other BRIN opclasses). So it's + * not entirely clear how to distribute the space between those columns. + * + * The current logic, implemented in brin_bloom_get_ndistinct, attempts to + * make some basic sizing decisions, based on the size of BRIN ranges, and + * the maximum number of rows per range. 
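+ *
+ * As a rough worked example: for the minimum of 16 distinct values and
+ * the default 1% false positive rate, the standard sizing formula
+ * m = -n * ln(p) / (ln 2)^2 gives about 16 * 4.6 / 0.48 ~= 153 bits,
+ * i.e. roughly 20 bytes of bitmap, about the same as the filter header
+ * itself; this is why going much below 16 would buy us very little.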
+ * + * + * IDENTIFICATION + * src/backend/access/brin/brin_bloom.c + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/brin.h" +#include "access/brin_internal.h" +#include "access/brin_page.h" +#include "access/brin_tuple.h" +#include "access/hash.h" +#include "access/htup_details.h" +#include "access/reloptions.h" +#include "access/stratnum.h" +#include "catalog/pg_type.h" +#include "catalog/pg_amop.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/lsyscache.h" +#include "utils/rel.h" +#include "utils/syscache.h" + +#include + +#define BloomEqualStrategyNumber 1 + +/* + * Additional SQL level support functions. We only have one, which is + * used to calculate hash of the input value. + * + * Procedure numbers must not use values reserved for BRIN itself; see + * brin_internal.h. + */ +#define BLOOM_MAX_PROCNUMS 1 /* maximum support procs we need */ +#define PROCNUM_HASH 11 /* required */ + +/* + * Subtract this from procnum to obtain index in BloomOpaque arrays + * (Must be equal to minimum of private procnums). + */ +#define PROCNUM_BASE 11 + +/* + * Storage type for BRIN's reloptions. + */ +typedef struct BloomOptions +{ + int32 vl_len_; /* varlena header (do not touch directly!) */ + double nDistinctPerRange; /* number of distinct values per range */ + double falsePositiveRate; /* false positive for bloom filter */ +} BloomOptions; + +/* + * The current min value (16) is somewhat arbitrary, but it's based + * on the fact that the filter header is ~20B alone, which is about + * the same as the filter bitmap for 16 distinct items with 1% false + * positive rate. So by allowing lower values we'd not gain much. In + * any case, the min should not be larger than MaxHeapTuplesPerPage + * (~290), which is the theoretical maximum for single-page ranges. + */ +#define BLOOM_MIN_NDISTINCT_PER_RANGE 16 + +/* + * Used to determine number of distinct items, based on the number of rows + * in a page range. The 10% is somewhat similar to what estimate_num_groups + * does, so we use the same factor here. + */ +#define BLOOM_DEFAULT_NDISTINCT_PER_RANGE -0.1 /* 10% of values */ + +/* + * Allowed range and default value for the false positive range. The exact + * values are somewhat arbitrary, but were chosen considering the various + * parameters (size of filter vs. page size, etc.). + * + * The lower the false-positive rate, the more accurate the filter is, but + * it also gets larger - at some point this eliminates the main advantage + * of BRIN indexes, which is the tiny size. At 0.01% the index is about + * 10% of the table (assuming 290 distinct values per 8kB page). + * + * On the other hand, as the false-positive rate increases, larger part of + * the table has to be scanned due to mismatches - at 25% we're probably + * close to sequential scan being cheaper. + */ +#define BLOOM_MIN_FALSE_POSITIVE_RATE 0.0001 /* 0.01% fp rate */ +#define BLOOM_MAX_FALSE_POSITIVE_RATE 0.25 /* 25% fp rate */ +#define BLOOM_DEFAULT_FALSE_POSITIVE_RATE 0.01 /* 1% fp rate */ + +#define BloomGetNDistinctPerRange(opts) \ + ((opts) && (((BloomOptions *) (opts))->nDistinctPerRange != 0) ? \ + (((BloomOptions *) (opts))->nDistinctPerRange) : \ + BLOOM_DEFAULT_NDISTINCT_PER_RANGE) + +#define BloomGetFalsePositiveRate(opts) \ + ((opts) && (((BloomOptions *) (opts))->falsePositiveRate != 0.0) ? \ + (((BloomOptions *) (opts))->falsePositiveRate) : \ + BLOOM_DEFAULT_FALSE_POSITIVE_RATE) + +/* + * And estimate of the largest bloom we can fit onto a page. 
This is not + * a perfect guarantee, for a couple of reasons. For example, the row may + * be larger because the index has multiple columns. + */ +#define BloomMaxFilterSize \ + MAXALIGN_DOWN(BLCKSZ - \ + (MAXALIGN(SizeOfPageHeaderData + \ + sizeof(ItemIdData)) + \ + MAXALIGN(sizeof(BrinSpecialSpace)) + \ + SizeOfBrinTuple)) + +/* + * Seeds used to calculate two hash functions h1 and h2, which are then used + * to generate k hashes using the (h1 + i * h2) scheme. + */ +#define BLOOM_SEED_1 0x71d924af +#define BLOOM_SEED_2 0xba48b314 + +/* + * Bloom Filter + * + * Represents a bloom filter, built on hashes of the indexed values. That is, + * we compute a uint32 hash of the value, and then store this hash into the + * bloom filter (and compute additional hashes on it). + * + * XXX We could implement "sparse" bloom filters, keeping only the bytes that + * are not entirely 0. But while indexes don't support TOAST, the varlena can + * still be compressed. So this seems unnecessary, because the compression + * should do the same job. + * + * XXX We can also watch the number of bits set in the bloom filter, and then + * stop using it (and not store the bitmap, to save space) when the false + * positive rate gets too high. But even if the false positive rate exceeds the + * desired value, it still can eliminate some page ranges. + */ +typedef struct BloomFilter +{ + /* varlena header (do not touch directly!) */ + int32 vl_len_; + + /* space for various flags (unused for now) */ + uint16 flags; + + /* fields for the HASHED phase */ + uint8 nhashes; /* number of hash functions */ + uint32 nbits; /* number of bits in the bitmap (size) */ + uint32 nbits_set; /* number of bits set to 1 */ + + /* data of the bloom filter */ + char data[FLEXIBLE_ARRAY_MEMBER]; + +} BloomFilter; + + +/* + * bloom_init + * Initialize the Bloom Filter, allocate all the memory. + * + * The filter is initialized with optimal size for ndistinct expected values + * and the requested false positive rate. The filter is stored as varlena. + */ +static BloomFilter * +bloom_init(int ndistinct, double false_positive_rate) +{ + Size len; + BloomFilter *filter; + + int nbits; /* size of filter / number of bits */ + int nbytes; /* size of filter / number of bytes */ + + double k; /* number of hash functions */ + + Assert(ndistinct > 0); + Assert((false_positive_rate >= BLOOM_MIN_FALSE_POSITIVE_RATE) && + (false_positive_rate < BLOOM_MAX_FALSE_POSITIVE_RATE)); + + /* sizing bloom filter: -(n * ln(p)) / (ln(2))^2 */ + nbits = ceil(-(ndistinct * log(false_positive_rate)) / pow(log(2.0), 2)); + + /* round m to whole bytes */ + nbytes = ((nbits + 7) / 8); + nbits = nbytes * 8; + + /* + * Reject filters that are obviously too large to store on a page. + * + * Initially the bloom filter is just zeroes and so very compressible, but + * as we add values it gets more and more random, and so less and less + * compressible. So initially everything fits on the page, but we might + * get surprising failures later - we want to prevent that, so we reject + * bloom filter that are obviously too large. + * + * XXX It's not uncommon to oversize the bloom filter a bit, to defend + * against unexpected data anomalies (parts of table with more distinct + * values per range etc.). But we still need to make sure even the + * oversized filter fits on page, if such need arises. + * + * XXX This check is not perfect, because the index may have multiple + * filters that are small individually, but too large when combined. 
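+ *
+ * With default build settings (8kB pages, 128-page ranges, the default
+ * 10% ndistinct fraction and 1% false positive rate) the filter works
+ * out to roughly 3700 expected distinct values and a bitmap of about
+ * 4.4kB with 7 hash functions, comfortably below BloomMaxFilterSize,
+ * so for a single-column index this error is not expected in practice.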
+ */ + if (nbytes > BloomMaxFilterSize) + elog(ERROR, "the bloom filter is too large (%d > %zu)", nbytes, + BloomMaxFilterSize); + + /* + * round(log(2.0) * m / ndistinct), but assume round() may not be + * available on Windows + */ + k = log(2.0) * nbits / ndistinct; + k = (k - floor(k) >= 0.5) ? ceil(k) : floor(k); + + /* + * We allocate the whole filter. Most of it is going to be 0 bits, so the + * varlena is easy to compress. + */ + len = offsetof(BloomFilter, data) + nbytes; + + filter = (BloomFilter *) palloc0(len); + + filter->flags = 0; + filter->nhashes = (int) k; + filter->nbits = nbits; + + SET_VARSIZE(filter, len); + + return filter; +} + + +/* + * bloom_add_value + * Add value to the bloom filter. + */ +static BloomFilter * +bloom_add_value(BloomFilter *filter, uint32 value, bool *updated) +{ + int i; + uint64 h1, + h2; + + /* compute the hashes, used for the bloom filter */ + h1 = hash_bytes_uint32_extended(value, BLOOM_SEED_1) % filter->nbits; + h2 = hash_bytes_uint32_extended(value, BLOOM_SEED_2) % filter->nbits; + + /* compute the requested number of hashes */ + for (i = 0; i < filter->nhashes; i++) + { + /* h1 + h2 + f(i) */ + uint32 h = (h1 + i * h2) % filter->nbits; + uint32 byte = (h / 8); + uint32 bit = (h % 8); + + /* if the bit is not set, set it and remember we did that */ + if (!(filter->data[byte] & (0x01 << bit))) + { + filter->data[byte] |= (0x01 << bit); + filter->nbits_set++; + if (updated) + *updated = true; + } + } + + return filter; +} + + +/* + * bloom_contains_value + * Check if the bloom filter contains a particular value. + */ +static bool +bloom_contains_value(BloomFilter *filter, uint32 value) +{ + int i; + uint64 h1, + h2; + + /* calculate the two hashes */ + h1 = hash_bytes_uint32_extended(value, BLOOM_SEED_1) % filter->nbits; + h2 = hash_bytes_uint32_extended(value, BLOOM_SEED_2) % filter->nbits; + + /* compute the requested number of hashes */ + for (i = 0; i < filter->nhashes; i++) + { + /* h1 + h2 + f(i) */ + uint32 h = (h1 + i * h2) % filter->nbits; + uint32 byte = (h / 8); + uint32 bit = (h % 8); + + /* if the bit is not set, the value is not there */ + if (!(filter->data[byte] & (0x01 << bit))) + return false; + } + + /* all hashes found in bloom filter */ + return true; +} + +typedef struct BloomOpaque +{ + /* + * XXX At this point we only need a single proc (to compute the hash), but + * let's keep the array just like inclusion and minmax opclasses, for + * consistency. We may need additional procs in the future. + */ + FmgrInfo extra_procinfos[BLOOM_MAX_PROCNUMS]; + bool extra_proc_missing[BLOOM_MAX_PROCNUMS]; +} BloomOpaque; + +static FmgrInfo *bloom_get_procinfo(BrinDesc *bdesc, uint16 attno, + uint16 procnum); + + +Datum +brin_bloom_opcinfo(PG_FUNCTION_ARGS) +{ + BrinOpcInfo *result; + + /* + * opaque->strategy_procinfos is initialized lazily; here it is set to + * all-uninitialized by palloc0 which sets fn_oid to InvalidOid. + * + * bloom indexes only store the filter as a single BYTEA column + */ + + result = palloc0(MAXALIGN(SizeofBrinOpcInfo(1)) + + sizeof(BloomOpaque)); + result->oi_nstored = 1; + result->oi_regular_nulls = true; + result->oi_opaque = (BloomOpaque *) + MAXALIGN((char *) result + SizeofBrinOpcInfo(1)); + result->oi_typcache[0] = lookup_type_cache(PG_BRIN_BLOOM_SUMMARYOID, 0); + + PG_RETURN_POINTER(result); +} + +/* + * brin_bloom_get_ndistinct + * Determine the ndistinct value used to size bloom filter. + * + * Adjust the ndistinct value based on the pagesPerRange value. 
First, + * if it's negative, it's assumed to be relative to maximum number of + * tuples in the range (assuming each page gets MaxHeapTuplesPerPage + * tuples, which is likely a significant over-estimate). We also clamp + * the value, not to over-size the bloom filter unnecessarily. + * + * XXX We can only do this when the pagesPerRange value was supplied. + * If it wasn't, it has to be a read-only access to the index, in which + * case we don't really care. But perhaps we should fall-back to the + * default pagesPerRange value? + * + * XXX We might also fetch info about ndistinct estimate for the column, + * and compute the expected number of distinct values in a range. But + * that may be tricky due to data being sorted in various ways, so it + * seems better to rely on the upper estimate. + * + * XXX We might also calculate a better estimate of rows per BRIN range, + * instead of using MaxHeapTuplesPerPage (which probably produces values + * much higher than reality). + */ +static int +brin_bloom_get_ndistinct(BrinDesc *bdesc, BloomOptions *opts) +{ + double ndistinct; + double maxtuples; + BlockNumber pagesPerRange; + + pagesPerRange = BrinGetPagesPerRange(bdesc->bd_index); + ndistinct = BloomGetNDistinctPerRange(opts); + + Assert(BlockNumberIsValid(pagesPerRange)); + + maxtuples = MaxHeapTuplesPerPage * pagesPerRange; + + /* + * Similarly to n_distinct, negative values are relative - in this case to + * maximum number of tuples in the page range (maxtuples). + */ + if (ndistinct < 0) + ndistinct = (-ndistinct) * maxtuples; + + /* + * Positive values are to be used directly, but we still apply a couple of + * safeties to avoid using unreasonably small bloom filters. + */ + ndistinct = Max(ndistinct, BLOOM_MIN_NDISTINCT_PER_RANGE); + + /* + * And don't use more than the maximum possible number of tuples, in the + * range, which would be entirely wasteful. + */ + ndistinct = Min(ndistinct, maxtuples); + + return (int) ndistinct; +} + +/* + * Examine the given index tuple (which contains partial status of a certain + * page range) by comparing it to the given value that comes from another heap + * tuple. If the new value is outside the bloom filter specified by the + * existing tuple values, update the index tuple and return true. Otherwise, + * return false and do not modify in this case. + */ +Datum +brin_bloom_add_value(PG_FUNCTION_ARGS) +{ + BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); + BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1); + Datum newval = PG_GETARG_DATUM(2); + bool isnull PG_USED_FOR_ASSERTS_ONLY = PG_GETARG_DATUM(3); + BloomOptions *opts = (BloomOptions *) PG_GET_OPCLASS_OPTIONS(); + Oid colloid = PG_GET_COLLATION(); + FmgrInfo *hashFn; + uint32 hashValue; + bool updated = false; + AttrNumber attno; + BloomFilter *filter; + + Assert(!isnull); + + attno = column->bv_attno; + + /* + * If this is the first non-null value, we need to initialize the bloom + * filter. Otherwise just extract the existing bloom filter from + * BrinValues. + */ + if (column->bv_allnulls) + { + filter = bloom_init(brin_bloom_get_ndistinct(bdesc, opts), + BloomGetFalsePositiveRate(opts)); + column->bv_values[0] = PointerGetDatum(filter); + column->bv_allnulls = false; + updated = true; + } + else + filter = (BloomFilter *) PG_DETOAST_DATUM(column->bv_values[0]); + + /* + * Compute the hash of the new value, using the supplied hash function, + * and then add the hash value to the bloom filter. 
+ */ + hashFn = bloom_get_procinfo(bdesc, attno, PROCNUM_HASH); + + hashValue = DatumGetUInt32(FunctionCall1Coll(hashFn, colloid, newval)); + + filter = bloom_add_value(filter, hashValue, &updated); + + column->bv_values[0] = PointerGetDatum(filter); + + PG_RETURN_BOOL(updated); +} + +/* + * Given an index tuple corresponding to a certain page range and a scan key, + * return whether the scan key is consistent with the index tuple's bloom + * filter. Return true if so, false otherwise. + */ +Datum +brin_bloom_consistent(PG_FUNCTION_ARGS) +{ + BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); + BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1); + ScanKey *keys = (ScanKey *) PG_GETARG_POINTER(2); + int nkeys = PG_GETARG_INT32(3); + Oid colloid = PG_GET_COLLATION(); + AttrNumber attno; + Datum value; + Datum matches; + FmgrInfo *finfo; + uint32 hashValue; + BloomFilter *filter; + int keyno; + + filter = (BloomFilter *) PG_DETOAST_DATUM(column->bv_values[0]); + + Assert(filter); + + matches = true; + + for (keyno = 0; keyno < nkeys; keyno++) + { + ScanKey key = keys[keyno]; + + /* NULL keys are handled and filtered-out in bringetbitmap */ + Assert(!(key->sk_flags & SK_ISNULL)); + + attno = key->sk_attno; + value = key->sk_argument; + + switch (key->sk_strategy) + { + case BloomEqualStrategyNumber: + + /* + * In the equality case (WHERE col = someval), we want to + * return the current page range if the minimum value in the + * range <= scan key, and the maximum value >= scan key. + */ + finfo = bloom_get_procinfo(bdesc, attno, PROCNUM_HASH); + + hashValue = DatumGetUInt32(FunctionCall1Coll(finfo, colloid, value)); + matches &= bloom_contains_value(filter, hashValue); + + break; + default: + /* shouldn't happen */ + elog(ERROR, "invalid strategy number %d", key->sk_strategy); + matches = 0; + break; + } + + if (!matches) + break; + } + + PG_RETURN_DATUM(matches); +} + +/* + * Given two BrinValues, update the first of them as a union of the summary + * values contained in both. The second one is untouched. + * + * XXX We assume the bloom filters have the same parameters for now. In the + * future we should have 'can union' function, to decide if we can combine + * two particular bloom filters. + */ +Datum +brin_bloom_union(PG_FUNCTION_ARGS) +{ + int i; + int nbytes; + BrinValues *col_a = (BrinValues *) PG_GETARG_POINTER(1); + BrinValues *col_b = (BrinValues *) PG_GETARG_POINTER(2); + BloomFilter *filter_a; + BloomFilter *filter_b; + + Assert(col_a->bv_attno == col_b->bv_attno); + Assert(!col_a->bv_allnulls && !col_b->bv_allnulls); + + filter_a = (BloomFilter *) PG_DETOAST_DATUM(col_a->bv_values[0]); + filter_b = (BloomFilter *) PG_DETOAST_DATUM(col_b->bv_values[0]); + + /* make sure the filters use the same parameters */ + Assert(filter_a && filter_b); + Assert(filter_a->nbits == filter_b->nbits); + Assert(filter_a->nhashes == filter_b->nhashes); + Assert((filter_a->nbits > 0) && (filter_a->nbits % 8 == 0)); + + nbytes = (filter_a->nbits) / 8; + + /* simply OR the bitmaps */ + for (i = 0; i < nbytes; i++) + filter_a->data[i] |= filter_b->data[i]; + + PG_RETURN_VOID(); +} + +/* + * Cache and return inclusion opclass support procedure + * + * Return the procedure corresponding to the given function support number + * or null if it does not exist. 
+ */ +static FmgrInfo * +bloom_get_procinfo(BrinDesc *bdesc, uint16 attno, uint16 procnum) +{ + BloomOpaque *opaque; + uint16 basenum = procnum - PROCNUM_BASE; + + /* + * We cache these in the opaque struct, to avoid repetitive syscache + * lookups. + */ + opaque = (BloomOpaque *) bdesc->bd_info[attno - 1]->oi_opaque; + + /* + * If we already searched for this proc and didn't find it, don't bother + * searching again. + */ + if (opaque->extra_proc_missing[basenum]) + return NULL; + + if (opaque->extra_procinfos[basenum].fn_oid == InvalidOid) + { + if (RegProcedureIsValid(index_getprocid(bdesc->bd_index, attno, + procnum))) + { + fmgr_info_copy(&opaque->extra_procinfos[basenum], + index_getprocinfo(bdesc->bd_index, attno, procnum), + bdesc->bd_context); + } + else + { + opaque->extra_proc_missing[basenum] = true; + return NULL; + } + } + + return &opaque->extra_procinfos[basenum]; +} + +Datum +brin_bloom_options(PG_FUNCTION_ARGS) +{ + local_relopts *relopts = (local_relopts *) PG_GETARG_POINTER(0); + + init_local_reloptions(relopts, sizeof(BloomOptions)); + + add_local_real_reloption(relopts, "n_distinct_per_range", + "number of distinct items expected in a BRIN page range", + BLOOM_DEFAULT_NDISTINCT_PER_RANGE, + -1.0, INT_MAX, offsetof(BloomOptions, nDistinctPerRange)); + + add_local_real_reloption(relopts, "false_positive_rate", + "desired false-positive rate for the bloom filters", + BLOOM_DEFAULT_FALSE_POSITIVE_RATE, + BLOOM_MIN_FALSE_POSITIVE_RATE, + BLOOM_MAX_FALSE_POSITIVE_RATE, + offsetof(BloomOptions, falsePositiveRate)); + + PG_RETURN_VOID(); +} + +/* + * brin_bloom_summary_in + * - input routine for type brin_bloom_summary. + * + * brin_bloom_summary is only used internally to represent summaries + * in BRIN bloom indexes, so it has no operations of its own, and we + * disallow input too. + */ +Datum +brin_bloom_summary_in(PG_FUNCTION_ARGS) +{ + /* + * brin_bloom_summary stores the data in binary form and parsing text + * input is not needed, so disallow this. + */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "pg_brin_bloom_summary"))); + + PG_RETURN_VOID(); /* keep compiler quiet */ +} + + +/* + * brin_bloom_summary_out + * - output routine for type brin_bloom_summary. + * + * BRIN bloom summaries are serialized into a bytea value, but we want + * to output something nicer humans can understand. + */ +Datum +brin_bloom_summary_out(PG_FUNCTION_ARGS) +{ + BloomFilter *filter; + StringInfoData str; + + /* detoast the data to get value with a full 4B header */ + filter = (BloomFilter *) PG_DETOAST_DATUM(PG_GETARG_BYTEA_PP(0)); + + initStringInfo(&str); + appendStringInfoChar(&str, '{'); + + appendStringInfo(&str, "mode: hashed nhashes: %u nbits: %u nbits_set: %u", + filter->nhashes, filter->nbits, filter->nbits_set); + + appendStringInfoChar(&str, '}'); + + PG_RETURN_CSTRING(str.data); +} + +/* + * brin_bloom_summary_recv + * - binary input routine for type brin_bloom_summary. + */ +Datum +brin_bloom_summary_recv(PG_FUNCTION_ARGS) +{ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "pg_brin_bloom_summary"))); + + PG_RETURN_VOID(); /* keep compiler quiet */ +} + +/* + * brin_bloom_summary_send + * - binary output routine for type brin_bloom_summary. + * + * BRIN bloom summaries are serialized in a bytea value (although the + * type is named differently), so let's just send that. 
+ */ +Datum +brin_bloom_summary_send(PG_FUNCTION_ARGS) +{ + return byteasend(fcinfo); +} diff --git a/src/backend/access/brin/brin_inclusion.c b/src/backend/access/brin/brin_inclusion.c new file mode 100644 index 0000000..0b384c0 --- /dev/null +++ b/src/backend/access/brin/brin_inclusion.c @@ -0,0 +1,657 @@ +/* + * brin_inclusion.c + * Implementation of inclusion opclasses for BRIN + * + * This module provides framework BRIN support functions for the "inclusion" + * operator classes. A few SQL-level support functions are also required for + * each opclass. + * + * The "inclusion" BRIN strategy is useful for types that support R-Tree + * operations. This implementation is a straight mapping of those operations + * to the block-range nature of BRIN, with two exceptions: (a) we explicitly + * support "empty" elements: at least with range types, we need to consider + * emptiness separately from regular R-Tree strategies; and (b) we need to + * consider "unmergeable" elements, that is, a set of elements for whose union + * no representation exists. The only case where that happens as of this + * writing is the INET type, where IPv6 values cannot be merged with IPv4 + * values. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/brin/brin_inclusion.c + */ +#include "postgres.h" + +#include "access/brin_internal.h" +#include "access/brin_tuple.h" +#include "access/genam.h" +#include "access/skey.h" +#include "catalog/pg_amop.h" +#include "catalog/pg_type.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/lsyscache.h" +#include "utils/rel.h" +#include "utils/syscache.h" + + +/* + * Additional SQL level support functions + * + * Procedure numbers must not use values reserved for BRIN itself; see + * brin_internal.h. + */ +#define INCLUSION_MAX_PROCNUMS 4 /* maximum support procs we need */ +#define PROCNUM_MERGE 11 /* required */ +#define PROCNUM_MERGEABLE 12 /* optional */ +#define PROCNUM_CONTAINS 13 /* optional */ +#define PROCNUM_EMPTY 14 /* optional */ + + +/* + * Subtract this from procnum to obtain index in InclusionOpaque arrays + * (Must be equal to minimum of private procnums). + */ +#define PROCNUM_BASE 11 + +/*- + * The values stored in the bv_values arrays correspond to: + * + * INCLUSION_UNION + * the union of the values in the block range + * INCLUSION_UNMERGEABLE + * whether the values in the block range cannot be merged + * (e.g. 
an IPv6 address amidst IPv4 addresses) + * INCLUSION_CONTAINS_EMPTY + * whether an empty value is present in any tuple + * in the block range + */ +#define INCLUSION_UNION 0 +#define INCLUSION_UNMERGEABLE 1 +#define INCLUSION_CONTAINS_EMPTY 2 + + +typedef struct InclusionOpaque +{ + FmgrInfo extra_procinfos[INCLUSION_MAX_PROCNUMS]; + bool extra_proc_missing[INCLUSION_MAX_PROCNUMS]; + Oid cached_subtype; + FmgrInfo strategy_procinfos[RTMaxStrategyNumber]; +} InclusionOpaque; + +static FmgrInfo *inclusion_get_procinfo(BrinDesc *bdesc, uint16 attno, + uint16 procnum); +static FmgrInfo *inclusion_get_strategy_procinfo(BrinDesc *bdesc, uint16 attno, + Oid subtype, uint16 strategynum); + + +/* + * BRIN inclusion OpcInfo function + */ +Datum +brin_inclusion_opcinfo(PG_FUNCTION_ARGS) +{ + Oid typoid = PG_GETARG_OID(0); + BrinOpcInfo *result; + TypeCacheEntry *bool_typcache = lookup_type_cache(BOOLOID, 0); + + /* + * All members of opaque are initialized lazily; both procinfo arrays + * start out as non-initialized by having fn_oid be InvalidOid, and + * "missing" to false, by zeroing here. strategy_procinfos elements can + * be invalidated when cached_subtype changes by zeroing fn_oid. + * extra_procinfo entries are never invalidated, but if a lookup fails + * (which is expected), extra_proc_missing is set to true, indicating not + * to look it up again. + */ + result = palloc0(MAXALIGN(SizeofBrinOpcInfo(3)) + sizeof(InclusionOpaque)); + result->oi_nstored = 3; + result->oi_regular_nulls = true; + result->oi_opaque = (InclusionOpaque *) + MAXALIGN((char *) result + SizeofBrinOpcInfo(3)); + + /* the union */ + result->oi_typcache[INCLUSION_UNION] = + lookup_type_cache(typoid, 0); + + /* includes elements that are not mergeable */ + result->oi_typcache[INCLUSION_UNMERGEABLE] = bool_typcache; + + /* includes the empty element */ + result->oi_typcache[INCLUSION_CONTAINS_EMPTY] = bool_typcache; + + PG_RETURN_POINTER(result); +} + +/* + * BRIN inclusion add value function + * + * Examine the given index tuple (which contains partial status of a certain + * page range) by comparing it to the given value that comes from another heap + * tuple. If the new value is outside the union specified by the existing + * tuple values, update the index tuple and return true. Otherwise, return + * false and do not modify in this case. + */ +Datum +brin_inclusion_add_value(PG_FUNCTION_ARGS) +{ + BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); + BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1); + Datum newval = PG_GETARG_DATUM(2); + bool isnull PG_USED_FOR_ASSERTS_ONLY = PG_GETARG_BOOL(3); + Oid colloid = PG_GET_COLLATION(); + FmgrInfo *finfo; + Datum result; + bool new = false; + AttrNumber attno; + Form_pg_attribute attr; + + Assert(!isnull); + + attno = column->bv_attno; + attr = TupleDescAttr(bdesc->bd_tupdesc, attno - 1); + + /* + * If the recorded value is null, copy the new value (which we know to be + * not null), and we're almost done. + */ + if (column->bv_allnulls) + { + column->bv_values[INCLUSION_UNION] = + datumCopy(newval, attr->attbyval, attr->attlen); + column->bv_values[INCLUSION_UNMERGEABLE] = BoolGetDatum(false); + column->bv_values[INCLUSION_CONTAINS_EMPTY] = BoolGetDatum(false); + column->bv_allnulls = false; + new = true; + } + + /* + * No need for further processing if the block range is marked as + * containing unmergeable values. 
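+ * (For example, an inet column range that has already seen both IPv4 and
+ * IPv6 addresses is marked unmergeable: its summary cannot be refined any
+ * further, so new values are simply ignored here, and the consistent
+ * function will treat the range as matching every query.)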
+ */ + if (DatumGetBool(column->bv_values[INCLUSION_UNMERGEABLE])) + PG_RETURN_BOOL(false); + + /* + * If the opclass supports the concept of empty values, test the passed + * new value for emptiness; if it returns true, we need to set the + * "contains empty" flag in the element (unless already set). + */ + finfo = inclusion_get_procinfo(bdesc, attno, PROCNUM_EMPTY); + if (finfo != NULL && DatumGetBool(FunctionCall1Coll(finfo, colloid, newval))) + { + if (!DatumGetBool(column->bv_values[INCLUSION_CONTAINS_EMPTY])) + { + column->bv_values[INCLUSION_CONTAINS_EMPTY] = BoolGetDatum(true); + PG_RETURN_BOOL(true); + } + + PG_RETURN_BOOL(false); + } + + if (new) + PG_RETURN_BOOL(true); + + /* Check if the new value is already contained. */ + finfo = inclusion_get_procinfo(bdesc, attno, PROCNUM_CONTAINS); + if (finfo != NULL && + DatumGetBool(FunctionCall2Coll(finfo, colloid, + column->bv_values[INCLUSION_UNION], + newval))) + PG_RETURN_BOOL(false); + + /* + * Check if the new value is mergeable to the existing union. If it is + * not, mark the value as containing unmergeable elements and get out. + * + * Note: at this point we could remove the value from the union, since + * it's not going to be used any longer. However, the BRIN framework + * doesn't allow for the value not being present. Improve someday. + */ + finfo = inclusion_get_procinfo(bdesc, attno, PROCNUM_MERGEABLE); + if (finfo != NULL && + !DatumGetBool(FunctionCall2Coll(finfo, colloid, + column->bv_values[INCLUSION_UNION], + newval))) + { + column->bv_values[INCLUSION_UNMERGEABLE] = BoolGetDatum(true); + PG_RETURN_BOOL(true); + } + + /* Finally, merge the new value to the existing union. */ + finfo = inclusion_get_procinfo(bdesc, attno, PROCNUM_MERGE); + Assert(finfo != NULL); + result = FunctionCall2Coll(finfo, colloid, + column->bv_values[INCLUSION_UNION], newval); + if (!attr->attbyval && + DatumGetPointer(result) != DatumGetPointer(column->bv_values[INCLUSION_UNION])) + { + pfree(DatumGetPointer(column->bv_values[INCLUSION_UNION])); + + if (result == newval) + result = datumCopy(result, attr->attbyval, attr->attlen); + } + column->bv_values[INCLUSION_UNION] = result; + + PG_RETURN_BOOL(true); +} + +/* + * BRIN inclusion consistent function + * + * We're no longer dealing with NULL keys in the consistent function, that is + * now handled by the AM code. That means we should not get any all-NULL ranges + * either, because those can't be consistent with regular (not [IS] NULL) keys. + * + * All of the strategies are optional. + */ +Datum +brin_inclusion_consistent(PG_FUNCTION_ARGS) +{ + BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); + BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1); + ScanKey key = (ScanKey) PG_GETARG_POINTER(2); + Oid colloid = PG_GET_COLLATION(), + subtype; + Datum unionval; + AttrNumber attno; + Datum query; + FmgrInfo *finfo; + Datum result; + + /* This opclass uses the old signature with only three arguments. */ + Assert(PG_NARGS() == 3); + + /* Should not be dealing with all-NULL ranges. */ + Assert(!column->bv_allnulls); + + /* It has to be checked, if it contains elements that are not mergeable. 
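+ * If it does, the stored union cannot describe the range's contents, so we
+ * conservatively report a match; the bitmap heap scan will recheck the
+ * qualification against the actual tuples anyway.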
*/ + if (DatumGetBool(column->bv_values[INCLUSION_UNMERGEABLE])) + PG_RETURN_BOOL(true); + + attno = key->sk_attno; + subtype = key->sk_subtype; + query = key->sk_argument; + unionval = column->bv_values[INCLUSION_UNION]; + switch (key->sk_strategy) + { + /* + * Placement strategies + * + * These are implemented by logically negating the result of the + * converse placement operator; for this to work, the converse + * operator must be part of the opclass. An error will be thrown + * by inclusion_get_strategy_procinfo() if the required strategy + * is not part of the opclass. + * + * These all return false if either argument is empty, so there is + * no need to check for empty elements. + */ + + case RTLeftStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTOverRightStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + PG_RETURN_BOOL(!DatumGetBool(result)); + + case RTOverLeftStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTRightStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + PG_RETURN_BOOL(!DatumGetBool(result)); + + case RTOverRightStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTLeftStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + PG_RETURN_BOOL(!DatumGetBool(result)); + + case RTRightStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTOverLeftStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + PG_RETURN_BOOL(!DatumGetBool(result)); + + case RTBelowStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTOverAboveStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + PG_RETURN_BOOL(!DatumGetBool(result)); + + case RTOverBelowStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTAboveStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + PG_RETURN_BOOL(!DatumGetBool(result)); + + case RTOverAboveStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTBelowStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + PG_RETURN_BOOL(!DatumGetBool(result)); + + case RTAboveStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTOverBelowStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + PG_RETURN_BOOL(!DatumGetBool(result)); + + /* + * Overlap and contains strategies + * + * These strategies are simple enough that we can simply call the + * operator and return its result. Empty elements don't change + * the result. + */ + + case RTOverlapStrategyNumber: + case RTContainsStrategyNumber: + case RTContainsElemStrategyNumber: + case RTSubStrategyNumber: + case RTSubEqualStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + key->sk_strategy); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + PG_RETURN_DATUM(result); + + /* + * Contained by strategies + * + * We cannot just call the original operator for the contained by + * strategies because some elements can be contained even though + * the union is not; instead we use the overlap operator. + * + * We check for empty elements separately as they are not merged + * to the union but contained by everything. 
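+ * For example, an empty element in the block range is contained by any
+ * query value even when the stored union does not overlap the query at
+ * all, which is why the contains-empty flag is returned as a fallback.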
+ */ + + case RTContainedByStrategyNumber: + case RTSuperStrategyNumber: + case RTSuperEqualStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTOverlapStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + if (DatumGetBool(result)) + PG_RETURN_BOOL(true); + + PG_RETURN_DATUM(column->bv_values[INCLUSION_CONTAINS_EMPTY]); + + /* + * Adjacent strategy + * + * We test for overlap first but to be safe we need to call the + * actual adjacent operator also. + * + * An empty element cannot be adjacent to any other, so there is + * no need to check for it. + */ + + case RTAdjacentStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTOverlapStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + if (DatumGetBool(result)) + PG_RETURN_BOOL(true); + + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTAdjacentStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + PG_RETURN_DATUM(result); + + /* + * Basic comparison strategies + * + * It is straightforward to support the equality strategies with + * the contains operator. Generally, inequality strategies do not + * make much sense for the types which will be used with the + * inclusion BRIN family of opclasses, but it is possible to + * implement them with logical negation of the left-of and + * right-of operators. + * + * NB: These strategies cannot be used with geometric datatypes + * that use comparison of areas! The only exception is the "same" + * strategy. + * + * Empty elements are considered to be less than the others. We + * cannot use the empty support function to check the query is an + * empty element, because the query can be another data type than + * the empty support function argument. So we will return true, + * if there is a possibility that empty elements will change the + * result. + */ + + case RTLessStrategyNumber: + case RTLessEqualStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTRightStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + if (!DatumGetBool(result)) + PG_RETURN_BOOL(true); + + PG_RETURN_DATUM(column->bv_values[INCLUSION_CONTAINS_EMPTY]); + + case RTSameStrategyNumber: + case RTEqualStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTContainsStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + if (DatumGetBool(result)) + PG_RETURN_BOOL(true); + + PG_RETURN_DATUM(column->bv_values[INCLUSION_CONTAINS_EMPTY]); + + case RTGreaterEqualStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTLeftStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + if (!DatumGetBool(result)) + PG_RETURN_BOOL(true); + + PG_RETURN_DATUM(column->bv_values[INCLUSION_CONTAINS_EMPTY]); + + case RTGreaterStrategyNumber: + /* no need to check for empty elements */ + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTLeftStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + PG_RETURN_BOOL(!DatumGetBool(result)); + + default: + /* shouldn't happen */ + elog(ERROR, "invalid strategy number %d", key->sk_strategy); + PG_RETURN_BOOL(false); + } +} + +/* + * BRIN inclusion union function + * + * Given two BrinValues, update the first of them as a union of the summary + * values contained in both. The second one is untouched. 
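+ * For example, merging a summary B whose union is 10.0.1.0/24 into a
+ * summary A whose union is 10.0.0.0/24 leaves A with a union covering both
+ * networks (roughly 10.0.0.0/23 for the inet opclass), and B's
+ * contains-empty and unmergeable flags are carried over to A as well.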
+ */ +Datum +brin_inclusion_union(PG_FUNCTION_ARGS) +{ + BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); + BrinValues *col_a = (BrinValues *) PG_GETARG_POINTER(1); + BrinValues *col_b = (BrinValues *) PG_GETARG_POINTER(2); + Oid colloid = PG_GET_COLLATION(); + AttrNumber attno; + Form_pg_attribute attr; + FmgrInfo *finfo; + Datum result; + + Assert(col_a->bv_attno == col_b->bv_attno); + Assert(!col_a->bv_allnulls && !col_b->bv_allnulls); + + attno = col_a->bv_attno; + attr = TupleDescAttr(bdesc->bd_tupdesc, attno - 1); + + /* If B includes empty elements, mark A similarly, if needed. */ + if (!DatumGetBool(col_a->bv_values[INCLUSION_CONTAINS_EMPTY]) && + DatumGetBool(col_b->bv_values[INCLUSION_CONTAINS_EMPTY])) + col_a->bv_values[INCLUSION_CONTAINS_EMPTY] = BoolGetDatum(true); + + /* Check if A includes elements that are not mergeable. */ + if (DatumGetBool(col_a->bv_values[INCLUSION_UNMERGEABLE])) + PG_RETURN_VOID(); + + /* If B includes elements that are not mergeable, mark A similarly. */ + if (DatumGetBool(col_b->bv_values[INCLUSION_UNMERGEABLE])) + { + col_a->bv_values[INCLUSION_UNMERGEABLE] = BoolGetDatum(true); + PG_RETURN_VOID(); + } + + /* Check if A and B are mergeable; if not, mark A unmergeable. */ + finfo = inclusion_get_procinfo(bdesc, attno, PROCNUM_MERGEABLE); + if (finfo != NULL && + !DatumGetBool(FunctionCall2Coll(finfo, colloid, + col_a->bv_values[INCLUSION_UNION], + col_b->bv_values[INCLUSION_UNION]))) + { + col_a->bv_values[INCLUSION_UNMERGEABLE] = BoolGetDatum(true); + PG_RETURN_VOID(); + } + + /* Finally, merge B to A. */ + finfo = inclusion_get_procinfo(bdesc, attno, PROCNUM_MERGE); + Assert(finfo != NULL); + result = FunctionCall2Coll(finfo, colloid, + col_a->bv_values[INCLUSION_UNION], + col_b->bv_values[INCLUSION_UNION]); + if (!attr->attbyval && + DatumGetPointer(result) != DatumGetPointer(col_a->bv_values[INCLUSION_UNION])) + { + pfree(DatumGetPointer(col_a->bv_values[INCLUSION_UNION])); + + if (result == col_b->bv_values[INCLUSION_UNION]) + result = datumCopy(result, attr->attbyval, attr->attlen); + } + col_a->bv_values[INCLUSION_UNION] = result; + + PG_RETURN_VOID(); +} + +/* + * Cache and return inclusion opclass support procedure + * + * Return the procedure corresponding to the given function support number + * or null if it is not exists. + */ +static FmgrInfo * +inclusion_get_procinfo(BrinDesc *bdesc, uint16 attno, uint16 procnum) +{ + InclusionOpaque *opaque; + uint16 basenum = procnum - PROCNUM_BASE; + + /* + * We cache these in the opaque struct, to avoid repetitive syscache + * lookups. + */ + opaque = (InclusionOpaque *) bdesc->bd_info[attno - 1]->oi_opaque; + + /* + * If we already searched for this proc and didn't find it, don't bother + * searching again. + */ + if (opaque->extra_proc_missing[basenum]) + return NULL; + + if (opaque->extra_procinfos[basenum].fn_oid == InvalidOid) + { + if (RegProcedureIsValid(index_getprocid(bdesc->bd_index, attno, + procnum))) + { + fmgr_info_copy(&opaque->extra_procinfos[basenum], + index_getprocinfo(bdesc->bd_index, attno, procnum), + bdesc->bd_context); + } + else + { + opaque->extra_proc_missing[basenum] = true; + return NULL; + } + } + + return &opaque->extra_procinfos[basenum]; +} + +/* + * Cache and return the procedure of the given strategy + * + * Return the procedure corresponding to the given sub-type and strategy + * number. The data type of the index will be used as the left hand side of + * the operator and the given sub-type will be used as the right hand side. 
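+ * For example, for a box column and a scan key whose argument is, say, a
+ * point, the lookup uses (box, point) as the operator's left and right
+ * types for the requested strategy.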
+ * Throws an error if the pg_amop row does not exist, but that should not + * happen with a properly configured opclass. + * + * It always throws an error when the data type of the opclass is different + * from the data type of the column or the expression. That happens when the + * column data type has implicit cast to the opclass data type. We don't + * bother casting types, because this situation can easily be avoided by + * setting storage data type to that of the opclass. The same problem does not + * apply to the data type of the right hand side, because the type in the + * ScanKey always matches the opclass' one. + * + * Note: this function mirrors minmax_get_strategy_procinfo; if changes are + * made here, see that function too. + */ +static FmgrInfo * +inclusion_get_strategy_procinfo(BrinDesc *bdesc, uint16 attno, Oid subtype, + uint16 strategynum) +{ + InclusionOpaque *opaque; + + Assert(strategynum >= 1 && + strategynum <= RTMaxStrategyNumber); + + opaque = (InclusionOpaque *) bdesc->bd_info[attno - 1]->oi_opaque; + + /* + * We cache the procedures for the last sub-type in the opaque struct, to + * avoid repetitive syscache lookups. If the sub-type is changed, + * invalidate all the cached entries. + */ + if (opaque->cached_subtype != subtype) + { + uint16 i; + + for (i = 1; i <= RTMaxStrategyNumber; i++) + opaque->strategy_procinfos[i - 1].fn_oid = InvalidOid; + opaque->cached_subtype = subtype; + } + + if (opaque->strategy_procinfos[strategynum - 1].fn_oid == InvalidOid) + { + Form_pg_attribute attr; + HeapTuple tuple; + Oid opfamily, + oprid; + bool isNull; + + opfamily = bdesc->bd_index->rd_opfamily[attno - 1]; + attr = TupleDescAttr(bdesc->bd_tupdesc, attno - 1); + tuple = SearchSysCache4(AMOPSTRATEGY, ObjectIdGetDatum(opfamily), + ObjectIdGetDatum(attr->atttypid), + ObjectIdGetDatum(subtype), + Int16GetDatum(strategynum)); + + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "missing operator %d(%u,%u) in opfamily %u", + strategynum, attr->atttypid, subtype, opfamily); + + oprid = DatumGetObjectId(SysCacheGetAttr(AMOPSTRATEGY, tuple, + Anum_pg_amop_amopopr, &isNull)); + ReleaseSysCache(tuple); + Assert(!isNull && RegProcedureIsValid(oprid)); + + fmgr_info_cxt(get_opcode(oprid), + &opaque->strategy_procinfos[strategynum - 1], + bdesc->bd_context); + } + + return &opaque->strategy_procinfos[strategynum - 1]; +} diff --git a/src/backend/access/brin/brin_minmax.c b/src/backend/access/brin/brin_minmax.c new file mode 100644 index 0000000..798f06c --- /dev/null +++ b/src/backend/access/brin/brin_minmax.c @@ -0,0 +1,317 @@ +/* + * brin_minmax.c + * Implementation of Min/Max opclass for BRIN + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/brin/brin_minmax.c + */ +#include "postgres.h" + +#include "access/brin_internal.h" +#include "access/brin_tuple.h" +#include "access/genam.h" +#include "access/stratnum.h" +#include "catalog/pg_amop.h" +#include "catalog/pg_type.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/lsyscache.h" +#include "utils/rel.h" +#include "utils/syscache.h" + +typedef struct MinmaxOpaque +{ + Oid cached_subtype; + FmgrInfo strategy_procinfos[BTMaxStrategyNumber]; +} MinmaxOpaque; + +static FmgrInfo *minmax_get_strategy_procinfo(BrinDesc *bdesc, uint16 attno, + Oid subtype, uint16 strategynum); + + +Datum +brin_minmax_opcinfo(PG_FUNCTION_ARGS) +{ + Oid typoid = PG_GETARG_OID(0); + BrinOpcInfo 
*result; + + /* + * opaque->strategy_procinfos is initialized lazily; here it is set to + * all-uninitialized by palloc0 which sets fn_oid to InvalidOid. + */ + + result = palloc0(MAXALIGN(SizeofBrinOpcInfo(2)) + + sizeof(MinmaxOpaque)); + result->oi_nstored = 2; + result->oi_regular_nulls = true; + result->oi_opaque = (MinmaxOpaque *) + MAXALIGN((char *) result + SizeofBrinOpcInfo(2)); + result->oi_typcache[0] = result->oi_typcache[1] = + lookup_type_cache(typoid, 0); + + PG_RETURN_POINTER(result); +} + +/* + * Examine the given index tuple (which contains partial status of a certain + * page range) by comparing it to the given value that comes from another heap + * tuple. If the new value is outside the min/max range specified by the + * existing tuple values, update the index tuple and return true. Otherwise, + * return false and do not modify in this case. + */ +Datum +brin_minmax_add_value(PG_FUNCTION_ARGS) +{ + BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); + BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1); + Datum newval = PG_GETARG_DATUM(2); + bool isnull PG_USED_FOR_ASSERTS_ONLY = PG_GETARG_DATUM(3); + Oid colloid = PG_GET_COLLATION(); + FmgrInfo *cmpFn; + Datum compar; + bool updated = false; + Form_pg_attribute attr; + AttrNumber attno; + + Assert(!isnull); + + attno = column->bv_attno; + attr = TupleDescAttr(bdesc->bd_tupdesc, attno - 1); + + /* + * If the recorded value is null, store the new value (which we know to be + * not null) as both minimum and maximum, and we're done. + */ + if (column->bv_allnulls) + { + column->bv_values[0] = datumCopy(newval, attr->attbyval, attr->attlen); + column->bv_values[1] = datumCopy(newval, attr->attbyval, attr->attlen); + column->bv_allnulls = false; + PG_RETURN_BOOL(true); + } + + /* + * Otherwise, need to compare the new value with the existing boundaries + * and update them accordingly. First check if it's less than the + * existing minimum. + */ + cmpFn = minmax_get_strategy_procinfo(bdesc, attno, attr->atttypid, + BTLessStrategyNumber); + compar = FunctionCall2Coll(cmpFn, colloid, newval, column->bv_values[0]); + if (DatumGetBool(compar)) + { + if (!attr->attbyval) + pfree(DatumGetPointer(column->bv_values[0])); + column->bv_values[0] = datumCopy(newval, attr->attbyval, attr->attlen); + updated = true; + } + + /* + * And now compare it to the existing maximum. + */ + cmpFn = minmax_get_strategy_procinfo(bdesc, attno, attr->atttypid, + BTGreaterStrategyNumber); + compar = FunctionCall2Coll(cmpFn, colloid, newval, column->bv_values[1]); + if (DatumGetBool(compar)) + { + if (!attr->attbyval) + pfree(DatumGetPointer(column->bv_values[1])); + column->bv_values[1] = datumCopy(newval, attr->attbyval, attr->attlen); + updated = true; + } + + PG_RETURN_BOOL(updated); +} + +/* + * Given an index tuple corresponding to a certain page range and a scan key, + * return whether the scan key is consistent with the index tuple's min/max + * values. Return true if so, false otherwise. + * + * We're no longer dealing with NULL keys in the consistent function, that is + * now handled by the AM code. That means we should not get any all-NULL ranges + * either, because those can't be consistent with regular (not [IS] NULL) keys. 
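+ * For example, a scan key "col < 10" is consistent with a range summarized
+ * as [3, 42] because the minimum 3 is below 10, but not with [20, 42]; an
+ * equality key "col = 10" requires both min <= 10 and max >= 10.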
+ */ +Datum +brin_minmax_consistent(PG_FUNCTION_ARGS) +{ + BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); + BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1); + ScanKey key = (ScanKey) PG_GETARG_POINTER(2); + Oid colloid = PG_GET_COLLATION(), + subtype; + AttrNumber attno; + Datum value; + Datum matches; + FmgrInfo *finfo; + + /* This opclass uses the old signature with only three arguments. */ + Assert(PG_NARGS() == 3); + + /* Should not be dealing with all-NULL ranges. */ + Assert(!column->bv_allnulls); + + attno = key->sk_attno; + subtype = key->sk_subtype; + value = key->sk_argument; + switch (key->sk_strategy) + { + case BTLessStrategyNumber: + case BTLessEqualStrategyNumber: + finfo = minmax_get_strategy_procinfo(bdesc, attno, subtype, + key->sk_strategy); + matches = FunctionCall2Coll(finfo, colloid, column->bv_values[0], + value); + break; + case BTEqualStrategyNumber: + + /* + * In the equality case (WHERE col = someval), we want to return + * the current page range if the minimum value in the range <= + * scan key, and the maximum value >= scan key. + */ + finfo = minmax_get_strategy_procinfo(bdesc, attno, subtype, + BTLessEqualStrategyNumber); + matches = FunctionCall2Coll(finfo, colloid, column->bv_values[0], + value); + if (!DatumGetBool(matches)) + break; + /* max() >= scankey */ + finfo = minmax_get_strategy_procinfo(bdesc, attno, subtype, + BTGreaterEqualStrategyNumber); + matches = FunctionCall2Coll(finfo, colloid, column->bv_values[1], + value); + break; + case BTGreaterEqualStrategyNumber: + case BTGreaterStrategyNumber: + finfo = minmax_get_strategy_procinfo(bdesc, attno, subtype, + key->sk_strategy); + matches = FunctionCall2Coll(finfo, colloid, column->bv_values[1], + value); + break; + default: + /* shouldn't happen */ + elog(ERROR, "invalid strategy number %d", key->sk_strategy); + matches = 0; + break; + } + + PG_RETURN_DATUM(matches); +} + +/* + * Given two BrinValues, update the first of them as a union of the summary + * values contained in both. The second one is untouched. + */ +Datum +brin_minmax_union(PG_FUNCTION_ARGS) +{ + BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); + BrinValues *col_a = (BrinValues *) PG_GETARG_POINTER(1); + BrinValues *col_b = (BrinValues *) PG_GETARG_POINTER(2); + Oid colloid = PG_GET_COLLATION(); + AttrNumber attno; + Form_pg_attribute attr; + FmgrInfo *finfo; + bool needsadj; + + Assert(col_a->bv_attno == col_b->bv_attno); + Assert(!col_a->bv_allnulls && !col_b->bv_allnulls); + + attno = col_a->bv_attno; + attr = TupleDescAttr(bdesc->bd_tupdesc, attno - 1); + + /* Adjust minimum, if B's min is less than A's min */ + finfo = minmax_get_strategy_procinfo(bdesc, attno, attr->atttypid, + BTLessStrategyNumber); + needsadj = FunctionCall2Coll(finfo, colloid, col_b->bv_values[0], + col_a->bv_values[0]); + if (needsadj) + { + if (!attr->attbyval) + pfree(DatumGetPointer(col_a->bv_values[0])); + col_a->bv_values[0] = datumCopy(col_b->bv_values[0], + attr->attbyval, attr->attlen); + } + + /* Adjust maximum, if B's max is greater than A's max */ + finfo = minmax_get_strategy_procinfo(bdesc, attno, attr->atttypid, + BTGreaterStrategyNumber); + needsadj = FunctionCall2Coll(finfo, colloid, col_b->bv_values[1], + col_a->bv_values[1]); + if (needsadj) + { + if (!attr->attbyval) + pfree(DatumGetPointer(col_a->bv_values[1])); + col_a->bv_values[1] = datumCopy(col_b->bv_values[1], + attr->attbyval, attr->attlen); + } + + PG_RETURN_VOID(); +} + +/* + * Cache and return the procedure for the given strategy. 
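+ * For instance, while scanning with keys like "intcol < 10::bigint" the
+ * (int4, int8) less-than operator is looked up once and then reused; if a
+ * later key has a different right-hand type, the cache is invalidated and
+ * filled again.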
+ * + * Note: this function mirrors inclusion_get_strategy_procinfo; see notes + * there. If changes are made here, see that function too. + */ +static FmgrInfo * +minmax_get_strategy_procinfo(BrinDesc *bdesc, uint16 attno, Oid subtype, + uint16 strategynum) +{ + MinmaxOpaque *opaque; + + Assert(strategynum >= 1 && + strategynum <= BTMaxStrategyNumber); + + opaque = (MinmaxOpaque *) bdesc->bd_info[attno - 1]->oi_opaque; + + /* + * We cache the procedures for the previous subtype in the opaque struct, + * to avoid repetitive syscache lookups. If the subtype changed, + * invalidate all the cached entries. + */ + if (opaque->cached_subtype != subtype) + { + uint16 i; + + for (i = 1; i <= BTMaxStrategyNumber; i++) + opaque->strategy_procinfos[i - 1].fn_oid = InvalidOid; + opaque->cached_subtype = subtype; + } + + if (opaque->strategy_procinfos[strategynum - 1].fn_oid == InvalidOid) + { + Form_pg_attribute attr; + HeapTuple tuple; + Oid opfamily, + oprid; + bool isNull; + + opfamily = bdesc->bd_index->rd_opfamily[attno - 1]; + attr = TupleDescAttr(bdesc->bd_tupdesc, attno - 1); + tuple = SearchSysCache4(AMOPSTRATEGY, ObjectIdGetDatum(opfamily), + ObjectIdGetDatum(attr->atttypid), + ObjectIdGetDatum(subtype), + Int16GetDatum(strategynum)); + + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "missing operator %d(%u,%u) in opfamily %u", + strategynum, attr->atttypid, subtype, opfamily); + + oprid = DatumGetObjectId(SysCacheGetAttr(AMOPSTRATEGY, tuple, + Anum_pg_amop_amopopr, &isNull)); + ReleaseSysCache(tuple); + Assert(!isNull && RegProcedureIsValid(oprid)); + + fmgr_info_cxt(get_opcode(oprid), + &opaque->strategy_procinfos[strategynum - 1], + bdesc->bd_context); + } + + return &opaque->strategy_procinfos[strategynum - 1]; +} diff --git a/src/backend/access/brin/brin_minmax_multi.c b/src/backend/access/brin/brin_minmax_multi.c new file mode 100644 index 0000000..5200916 --- /dev/null +++ b/src/backend/access/brin/brin_minmax_multi.c @@ -0,0 +1,3163 @@ +/* + * brin_minmax_multi.c + * Implementation of Multi Min/Max opclass for BRIN + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * Implements a variant of minmax opclass, where the summary is composed of + * multiple smaller intervals. This allows us to handle outliers, which + * usually make the simple minmax opclass inefficient. + * + * Consider for example page range with simple minmax interval [1000,2000], + * and assume a new row gets inserted into the range with value 1000000. + * Due to that the interval gets [1000,1000000]. I.e. the minmax interval + * got 1000x wider and won't be useful to eliminate scan keys between 2001 + * and 1000000. + * + * With minmax-multi opclass, we may have [1000,2000] interval initially, + * but after adding the new row we start tracking it as two interval: + * + * [1000,2000] and [1000000,1000000] + * + * This allows us to still eliminate the page range when the scan keys hit + * the gap between 2000 and 1000000, making it useful in cases when the + * simple minmax opclass gets inefficient. + * + * The number of intervals tracked per page range is somewhat flexible. + * What is restricted is the number of values per page range, and the limit + * is currently 32 (see values_per_range reloption). Collapsed intervals + * (with equal minimum and maximum value) are stored as a single value, + * while regular intervals require two values. 
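+ * For example, with the default values_per_range = 32 a summary could hold
+ * 10 regular intervals (20 values) plus 12 single points, since
+ * 2*10 + 12 = 32.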
+ *
+ * When the number of values gets too high (by adding new values to the
+ * summary), we merge some of the intervals to free space for more values.
+ * This is done in a greedy way - we simply pick the two closest intervals,
+ * merge them, and repeat this until the number of values to store gets
+ * sufficiently low (below 50% of the maximum number of values; this is a
+ * mostly arbitrary threshold and may be changed easily).
+ *
+ * To pick the closest intervals we use the "distance" support procedure,
+ * which measures space between two ranges (i.e. the length of an interval).
+ * The computed value may be an approximation - in the worst case we will
+ * merge two ranges that are slightly less optimal at that step, but the
+ * index should still produce correct results.
+ *
+ * The compaction (reducing the number of values) is fairly expensive, as
+ * it requires calling the distance functions, sorting etc. So when building
+ * the summary, we use a significantly larger buffer, and only enforce the
+ * exact limit at the very end. This improves performance, and it also helps
+ * with building better ranges (due to the greedy approach).
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/brin/brin_minmax_multi.c
+ */
+#include "postgres.h"
+
+/* needed for PGSQL_AF_INET */
+#include <sys/socket.h>
+
+#include "access/genam.h"
+#include "access/brin.h"
+#include "access/brin_internal.h"
+#include "access/brin_tuple.h"
+#include "access/reloptions.h"
+#include "access/stratnum.h"
+#include "access/htup_details.h"
+#include "catalog/pg_type.h"
+#include "catalog/pg_am.h"
+#include "catalog/pg_amop.h"
+#include "utils/array.h"
+#include "utils/builtins.h"
+#include "utils/date.h"
+#include "utils/datum.h"
+#include "utils/float.h"
+#include "utils/inet.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/numeric.h"
+#include "utils/pg_lsn.h"
+#include "utils/rel.h"
+#include "utils/syscache.h"
+#include "utils/timestamp.h"
+#include "utils/uuid.h"
+
+/*
+ * Additional SQL level support functions
+ *
+ * Procedure numbers must not use values reserved for BRIN itself; see
+ * brin_internal.h.
+ */
+#define MINMAX_MAX_PROCNUMS 1 /* maximum support procs we need */
+#define PROCNUM_DISTANCE 11 /* required, distance between values */
+
+/*
+ * Subtract this from procnum to obtain index in MinmaxMultiOpaque arrays
+ * (Must be equal to the minimum of the private procnums.)
+ */
+#define PROCNUM_BASE 11
+
+/*
+ * Sizing the insert buffer - we use 10x the number of values specified
+ * in the reloption, but we cap it at 8192 so that it does not get too large.
+ * When the buffer gets full, we reduce the number of values by half.
+ */
+#define MINMAX_BUFFER_FACTOR 10
+#define MINMAX_BUFFER_MIN 256
+#define MINMAX_BUFFER_MAX 8192
+#define MINMAX_BUFFER_LOAD_FACTOR 0.5
+
+typedef struct MinmaxMultiOpaque
+{
+ FmgrInfo extra_procinfos[MINMAX_MAX_PROCNUMS];
+ bool extra_proc_missing[MINMAX_MAX_PROCNUMS];
+ Oid cached_subtype;
+ FmgrInfo strategy_procinfos[BTMaxStrategyNumber];
+} MinmaxMultiOpaque;
+
+/*
+ * Storage type for BRIN's minmax-multi reloptions
+ */
+typedef struct MinMaxMultiOptions
+{
+ int32 vl_len_; /* varlena header (do not touch directly!) */
+ int valuesPerRange; /* number of values per range */
+} MinMaxMultiOptions;
+
+#define MINMAX_MULTI_DEFAULT_VALUES_PER_PAGE 32
+
+#define MinMaxMultiGetValuesPerRange(opts) \
+ ((opts) && (((MinMaxMultiOptions *) (opts))->valuesPerRange != 0) ?
\ + ((MinMaxMultiOptions *) (opts))->valuesPerRange : \ + MINMAX_MULTI_DEFAULT_VALUES_PER_PAGE) + +#define SAMESIGN(a,b) (((a) < 0) == ((b) < 0)) + +/* + * The summary of minmax-multi indexes has two representations - Ranges for + * convenient processing, and SerializedRanges for storage in bytea value. + * + * The Ranges struct stores the boundary values in a single array, but we + * treat regular and single-point ranges differently to save space. For + * regular ranges (with different boundary values) we have to store both + * values, while for "single-point ranges" we only need to save one value. + * + * The 'values' array stores boundary values for regular ranges first (there + * are 2*nranges values to store), and then the nvalues boundary values for + * single-point ranges. That is, we have (2*nranges + nvalues) boundary + * values in the array. + * + * +---------------------------------+-------------------------------+ + * | ranges (sorted pairs of values) | sorted values (single points) | + * +---------------------------------+-------------------------------+ + * + * This allows us to quickly add new values, and store outliers without + * making the other ranges very wide. + * + * We never store more than maxvalues values (as set by values_per_range + * reloption). If needed we merge some of the ranges. + * + * To minimize palloc overhead, we always allocate the full array with + * space for maxvalues elements. This should be fine as long as the + * maxvalues is reasonably small (64 seems fine), which is the case + * thanks to values_per_range reloption being limited to 256. + */ +typedef struct Ranges +{ + /* Cache information that we need quite often. */ + Oid typid; + Oid colloid; + AttrNumber attno; + FmgrInfo *cmp; + + /* (2*nranges + nvalues) <= maxvalues */ + int nranges; /* number of ranges in the array (stored) */ + int nsorted; /* number of sorted values (ranges + points) */ + int nvalues; /* number of values in the data array (all) */ + int maxvalues; /* maximum number of values (reloption) */ + + /* + * We simply add the values into a large buffer, without any expensive + * steps (sorting, deduplication, ...). The buffer is a multiple of the + * target number of values, so the compaction happens less often, + * amortizing the costs. We keep the actual target and compact to the + * requested number of values at the very end, before serializing to + * on-disk representation. + */ + /* requested number of values */ + int target_maxvalues; + + /* values stored for this range - either raw values, or ranges */ + Datum values[FLEXIBLE_ARRAY_MEMBER]; +} Ranges; + +/* + * On-disk the summary is stored as a bytea value, with a simple header + * with basic metadata, followed by the boundary values. It has a varlena + * header, so can be treated as varlena directly. + * + * See range_serialize/range_deserialize for serialization details. + */ +typedef struct SerializedRanges +{ + /* varlena header (do not touch directly!) 
*/ + int32 vl_len_; + + /* type of values stored in the data array */ + Oid typid; + + /* (2*nranges + nvalues) <= maxvalues */ + int nranges; /* number of ranges in the array (stored) */ + int nvalues; /* number of values in the data array (all) */ + int maxvalues; /* maximum number of values (reloption) */ + + /* contains the actual data */ + char data[FLEXIBLE_ARRAY_MEMBER]; +} SerializedRanges; + +static SerializedRanges *range_serialize(Ranges *range); + +static Ranges *range_deserialize(int maxvalues, SerializedRanges *range); + + +/* + * Used to represent ranges expanded to make merging and combining easier. + * + * Each expanded range is essentially an interval, represented by min/max + * values, along with a flag whether it's a collapsed range (in which case + * the min and max values are equal). We have the flag to handle by-ref + * data types - we can't simply compare the datums, and this saves some + * calls to the type-specific comparator function. + */ +typedef struct ExpandedRange +{ + Datum minval; /* lower boundary */ + Datum maxval; /* upper boundary */ + bool collapsed; /* true if minval==maxval */ +} ExpandedRange; + +/* + * Represents a distance between two ranges (identified by index into + * an array of extended ranges). + */ +typedef struct DistanceValue +{ + int index; + double value; +} DistanceValue; + + +/* Cache for support and strategy procedures. */ + +static FmgrInfo *minmax_multi_get_procinfo(BrinDesc *bdesc, uint16 attno, + uint16 procnum); + +static FmgrInfo *minmax_multi_get_strategy_procinfo(BrinDesc *bdesc, + uint16 attno, Oid subtype, + uint16 strategynum); + +typedef struct compare_context +{ + FmgrInfo *cmpFn; + Oid colloid; +} compare_context; + +static int compare_values(const void *a, const void *b, void *arg); + + +#ifdef USE_ASSERT_CHECKING +/* + * Check that the order of the array values is correct, using the cmp + * function (which should be BTLessStrategyNumber). + */ +static void +AssertArrayOrder(FmgrInfo *cmp, Oid colloid, Datum *values, int nvalues) +{ + int i; + Datum lt; + + for (i = 0; i < (nvalues - 1); i++) + { + lt = FunctionCall2Coll(cmp, colloid, values[i], values[i + 1]); + Assert(DatumGetBool(lt)); + } +} +#endif + +/* + * Comprehensive check of the Ranges structure. + */ +static void +AssertCheckRanges(Ranges *ranges, FmgrInfo *cmpFn, Oid colloid) +{ +#ifdef USE_ASSERT_CHECKING + int i; + + /* some basic sanity checks */ + Assert(ranges->nranges >= 0); + Assert(ranges->nsorted >= 0); + Assert(ranges->nvalues >= ranges->nsorted); + Assert(ranges->maxvalues >= 2 * ranges->nranges + ranges->nvalues); + Assert(ranges->typid != InvalidOid); + + /* + * First the ranges - there are 2*nranges boundary values, and the values + * have to be strictly ordered (equal values would mean the range is + * collapsed, and should be stored as a point). This also guarantees that + * the ranges do not overlap. 
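+ * For example, nranges = 2 might be stored as the four values 1, 5, 10, 20,
+ * describing the ranges [1,5] and [10,20]; the ordering 5 < 10 is what
+ * rules out any overlap.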
+ */ + AssertArrayOrder(cmpFn, colloid, ranges->values, 2 * ranges->nranges); + + /* then the single-point ranges (with nvalues boundar values ) */ + AssertArrayOrder(cmpFn, colloid, &ranges->values[2 * ranges->nranges], + ranges->nsorted); + + /* + * Check that none of the values are not covered by ranges (both sorted + * and unsorted) + */ + for (i = 0; i < ranges->nvalues; i++) + { + Datum compar; + int start, + end; + Datum minvalue, + maxvalue; + + Datum value = ranges->values[2 * ranges->nranges + i]; + + if (ranges->nranges == 0) + break; + + minvalue = ranges->values[0]; + maxvalue = ranges->values[2 * ranges->nranges - 1]; + + /* + * Is the value smaller than the minval? If yes, we'll recurse to the + * left side of range array. + */ + compar = FunctionCall2Coll(cmpFn, colloid, value, minvalue); + + /* smaller than the smallest value in the first range */ + if (DatumGetBool(compar)) + continue; + + /* + * Is the value greater than the maxval? If yes, we'll recurse to the + * right side of range array. + */ + compar = FunctionCall2Coll(cmpFn, colloid, maxvalue, value); + + /* larger than the largest value in the last range */ + if (DatumGetBool(compar)) + continue; + + start = 0; /* first range */ + end = ranges->nranges - 1; /* last range */ + while (true) + { + int midpoint = (start + end) / 2; + + /* this means we ran out of ranges in the last step */ + if (start > end) + break; + + /* copy the min/max values from the ranges */ + minvalue = ranges->values[2 * midpoint]; + maxvalue = ranges->values[2 * midpoint + 1]; + + /* + * Is the value smaller than the minval? If yes, we'll recurse to + * the left side of range array. + */ + compar = FunctionCall2Coll(cmpFn, colloid, value, minvalue); + + /* smaller than the smallest value in this range */ + if (DatumGetBool(compar)) + { + end = (midpoint - 1); + continue; + } + + /* + * Is the value greater than the minval? If yes, we'll recurse to + * the right side of range array. + */ + compar = FunctionCall2Coll(cmpFn, colloid, maxvalue, value); + + /* larger than the largest value in this range */ + if (DatumGetBool(compar)) + { + start = (midpoint + 1); + continue; + } + + /* hey, we found a matching range */ + Assert(false); + } + } + + /* and values in the unsorted part must not be in sorted part */ + for (i = ranges->nsorted; i < ranges->nvalues; i++) + { + compare_context cxt; + Datum value = ranges->values[2 * ranges->nranges + i]; + + if (ranges->nsorted == 0) + break; + + cxt.colloid = ranges->colloid; + cxt.cmpFn = ranges->cmp; + + Assert(bsearch_arg(&value, &ranges->values[2 * ranges->nranges], + ranges->nsorted, sizeof(Datum), + compare_values, (void *) &cxt) == NULL); + } +#endif +} + +/* + * Check that the expanded ranges (built when reducing the number of ranges + * by combining some of them) are correctly sorted and do not overlap. + */ +static void +AssertCheckExpandedRanges(BrinDesc *bdesc, Oid colloid, AttrNumber attno, + Form_pg_attribute attr, ExpandedRange *ranges, + int nranges) +{ +#ifdef USE_ASSERT_CHECKING + int i; + FmgrInfo *eq; + FmgrInfo *lt; + + eq = minmax_multi_get_strategy_procinfo(bdesc, attno, attr->atttypid, + BTEqualStrategyNumber); + + lt = minmax_multi_get_strategy_procinfo(bdesc, attno, attr->atttypid, + BTLessStrategyNumber); + + /* + * Each range independently should be valid, i.e. that for the boundary + * values (lower <= upper). 
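+ * A collapsed range such as [7,7] must have minval equal to maxval, while a
+ * regular range such as [7,9] must have minval strictly below maxval.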
+ */ + for (i = 0; i < nranges; i++) + { + Datum r; + Datum minval = ranges[i].minval; + Datum maxval = ranges[i].maxval; + + if (ranges[i].collapsed) /* collapsed: minval == maxval */ + r = FunctionCall2Coll(eq, colloid, minval, maxval); + else /* non-collapsed: minval < maxval */ + r = FunctionCall2Coll(lt, colloid, minval, maxval); + + Assert(DatumGetBool(r)); + } + + /* + * And the ranges should be ordered and must not overlap, i.e. upper < + * lower for boundaries of consecutive ranges. + */ + for (i = 0; i < nranges - 1; i++) + { + Datum r; + Datum maxval = ranges[i].maxval; + Datum minval = ranges[i + 1].minval; + + r = FunctionCall2Coll(lt, colloid, maxval, minval); + + Assert(DatumGetBool(r)); + } +#endif +} + + +/* + * minmax_multi_init + * Initialize the deserialized range list, allocate all the memory. + * + * This is only in-memory representation of the ranges, so we allocate + * enough space for the maximum number of values (so as not to have to do + * repallocs as the ranges grow). + */ +static Ranges * +minmax_multi_init(int maxvalues) +{ + Size len; + Ranges *ranges; + + Assert(maxvalues > 0); + + len = offsetof(Ranges, values); /* fixed header */ + len += maxvalues * sizeof(Datum); /* Datum values */ + + ranges = (Ranges *) palloc0(len); + + ranges->maxvalues = maxvalues; + + return ranges; +} + + +/* + * range_deduplicate_values + * Deduplicate the part with values in the simple points. + * + * This is meant to be a cheaper way of reducing the size of the ranges. It + * does not touch the ranges, and only sorts the other values - it does not + * call the distance functions, which may be quite expensive, etc. + * + * We do know the values are not duplicate with the ranges, because we check + * that before adding a new value. Same for the sorted part of values. + */ +static void +range_deduplicate_values(Ranges *range) +{ + int i, + n; + int start; + compare_context cxt; + + /* + * If there are no unsorted values, we're done (this probably can't + * happen, as we're adding values to unsorted part). + */ + if (range->nsorted == range->nvalues) + return; + + /* sort the values */ + cxt.colloid = range->colloid; + cxt.cmpFn = range->cmp; + + /* the values start right after the ranges (which are always sorted) */ + start = 2 * range->nranges; + + /* + * XXX This might do a merge sort, to leverage that the first part of the + * array is already sorted. If the sorted part is large, it might be quite + * a bit faster. + */ + qsort_arg(&range->values[start], + range->nvalues, sizeof(Datum), + compare_values, (void *) &cxt); + + n = 1; + for (i = 1; i < range->nvalues; i++) + { + /* same as preceding value, so store it */ + if (compare_values(&range->values[start + i - 1], + &range->values[start + i], + (void *) &cxt) == 0) + continue; + + range->values[start + n] = range->values[start + i]; + + n++; + } + + /* now all the values are sorted */ + range->nvalues = n; + range->nsorted = n; + + AssertCheckRanges(range, range->cmp, range->colloid); +} + + +/* + * range_serialize + * Serialize the in-memory representation into a compact varlena value. + * + * Simply copy the header and then also the individual values, as stored + * in the in-memory value array. 
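+ * For example, for a fixed-length by-value type such as int4 a summary with
+ * 10 boundary values needs offsetof(SerializedRanges, data) plus 10 * 4
+ * bytes, while for varlena types the sizes of the individual values are
+ * summed up instead.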
+ */ +static SerializedRanges * +range_serialize(Ranges *range) +{ + Size len; + int nvalues; + SerializedRanges *serialized; + Oid typid; + int typlen; + bool typbyval; + + int i; + char *ptr; + + /* simple sanity checks */ + Assert(range->nranges >= 0); + Assert(range->nsorted >= 0); + Assert(range->nvalues >= 0); + Assert(range->maxvalues > 0); + Assert(range->target_maxvalues > 0); + + /* at this point the range should be compacted to the target size */ + Assert(2 * range->nranges + range->nvalues <= range->target_maxvalues); + + Assert(range->target_maxvalues <= range->maxvalues); + + /* range boundaries are always sorted */ + Assert(range->nvalues >= range->nsorted); + + /* deduplicate values, if there's unsorted part */ + range_deduplicate_values(range); + + /* see how many Datum values we actually have */ + nvalues = 2 * range->nranges + range->nvalues; + + typid = range->typid; + typbyval = get_typbyval(typid); + typlen = get_typlen(typid); + + /* header is always needed */ + len = offsetof(SerializedRanges, data); + + /* + * The space needed depends on data type - for fixed-length data types + * (by-value and some by-reference) it's pretty simple, just multiply + * (attlen * nvalues) and we're done. For variable-length by-reference + * types we need to actually walk all the values and sum the lengths. + */ + if (typlen == -1) /* varlena */ + { + int i; + + for (i = 0; i < nvalues; i++) + { + len += VARSIZE_ANY(range->values[i]); + } + } + else if (typlen == -2) /* cstring */ + { + int i; + + for (i = 0; i < nvalues; i++) + { + /* don't forget to include the null terminator ;-) */ + len += strlen(DatumGetCString(range->values[i])) + 1; + } + } + else /* fixed-length types (even by-reference) */ + { + Assert(typlen > 0); + len += nvalues * typlen; + } + + /* + * Allocate the serialized object, copy the basic information. The + * serialized object is a varlena, so update the header. + */ + serialized = (SerializedRanges *) palloc0(len); + SET_VARSIZE(serialized, len); + + serialized->typid = typid; + serialized->nranges = range->nranges; + serialized->nvalues = range->nvalues; + serialized->maxvalues = range->target_maxvalues; + + /* + * And now copy also the boundary values (like the length calculation this + * depends on the particular data type). + */ + ptr = serialized->data; /* start of the serialized data */ + + for (i = 0; i < nvalues; i++) + { + if (typbyval) /* simple by-value data types */ + { + Datum tmp; + + /* + * For byval types, we need to copy just the significant bytes - + * we can't use memcpy directly, as that assumes little-endian + * behavior. store_att_byval does almost what we need, but it + * requires a properly aligned buffer - the output buffer does not + * guarantee that. So we simply use a local Datum variable (which + * guarantees proper alignment), and then copy the value from it. 
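+ * For example, for an int4 value only the first 4 bytes of the local Datum
+ * are copied to the output; range_deserialize later reads them back with
+ * fetch_att() using the same typlen.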
+ */ + store_att_byval(&tmp, range->values[i], typlen); + + memcpy(ptr, &tmp, typlen); + ptr += typlen; + } + else if (typlen > 0) /* fixed-length by-ref types */ + { + memcpy(ptr, DatumGetPointer(range->values[i]), typlen); + ptr += typlen; + } + else if (typlen == -1) /* varlena */ + { + int tmp = VARSIZE_ANY(DatumGetPointer(range->values[i])); + + memcpy(ptr, DatumGetPointer(range->values[i]), tmp); + ptr += tmp; + } + else if (typlen == -2) /* cstring */ + { + int tmp = strlen(DatumGetCString(range->values[i])) + 1; + + memcpy(ptr, DatumGetCString(range->values[i]), tmp); + ptr += tmp; + } + + /* make sure we haven't overflown the buffer end */ + Assert(ptr <= ((char *) serialized + len)); + } + + /* exact size */ + Assert(ptr == ((char *) serialized + len)); + + return serialized; +} + +/* + * range_deserialize + * Serialize the in-memory representation into a compact varlena value. + * + * Simply copy the header and then also the individual values, as stored + * in the in-memory value array. + */ +static Ranges * +range_deserialize(int maxvalues, SerializedRanges *serialized) +{ + int i, + nvalues; + char *ptr, + *dataptr; + bool typbyval; + int typlen; + Size datalen; + + Ranges *range; + + Assert(serialized->nranges >= 0); + Assert(serialized->nvalues >= 0); + Assert(serialized->maxvalues > 0); + + nvalues = 2 * serialized->nranges + serialized->nvalues; + + Assert(nvalues <= serialized->maxvalues); + Assert(serialized->maxvalues <= maxvalues); + + range = minmax_multi_init(maxvalues); + + /* copy the header info */ + range->nranges = serialized->nranges; + range->nvalues = serialized->nvalues; + range->nsorted = serialized->nvalues; + range->maxvalues = maxvalues; + range->target_maxvalues = serialized->maxvalues; + + range->typid = serialized->typid; + + typbyval = get_typbyval(serialized->typid); + typlen = get_typlen(serialized->typid); + + /* + * And now deconstruct the values into Datum array. We have to copy the + * data because the serialized representation ignores alignment, and we + * don't want to rely on it being kept around anyway. + */ + ptr = serialized->data; + + /* + * We don't want to allocate many pieces, so we just allocate everything + * in one chunk. How much space will we need? + * + * XXX We don't need to copy simple by-value data types. + */ + datalen = 0; + dataptr = NULL; + for (i = 0; (i < nvalues) && (!typbyval); i++) + { + if (typlen > 0) /* fixed-length by-ref types */ + datalen += MAXALIGN(typlen); + else if (typlen == -1) /* varlena */ + { + datalen += MAXALIGN(VARSIZE_ANY(DatumGetPointer(ptr))); + ptr += VARSIZE_ANY(DatumGetPointer(ptr)); + } + else if (typlen == -2) /* cstring */ + { + Size slen = strlen(DatumGetCString(ptr)) + 1; + + datalen += MAXALIGN(slen); + ptr += slen; + } + } + + if (datalen > 0) + dataptr = palloc(datalen); + + /* + * Restore the source pointer (might have been modified when calculating + * the space we need to allocate). 
+ */ + ptr = serialized->data; + + for (i = 0; i < nvalues; i++) + { + if (typbyval) /* simple by-value data types */ + { + Datum v = 0; + + memcpy(&v, ptr, typlen); + + range->values[i] = fetch_att(&v, true, typlen); + ptr += typlen; + } + else if (typlen > 0) /* fixed-length by-ref types */ + { + range->values[i] = PointerGetDatum(dataptr); + + memcpy(dataptr, ptr, typlen); + dataptr += MAXALIGN(typlen); + + ptr += typlen; + } + else if (typlen == -1) /* varlena */ + { + range->values[i] = PointerGetDatum(dataptr); + + memcpy(dataptr, ptr, VARSIZE_ANY(ptr)); + dataptr += MAXALIGN(VARSIZE_ANY(ptr)); + ptr += VARSIZE_ANY(ptr); + } + else if (typlen == -2) /* cstring */ + { + Size slen = strlen(ptr) + 1; + + range->values[i] = PointerGetDatum(dataptr); + + memcpy(dataptr, ptr, slen); + dataptr += MAXALIGN(slen); + ptr += slen; + } + + /* make sure we haven't overflown the buffer end */ + Assert(ptr <= ((char *) serialized + VARSIZE_ANY(serialized))); + } + + /* should have consumed the whole input value exactly */ + Assert(ptr == ((char *) serialized + VARSIZE_ANY(serialized))); + + /* return the deserialized value */ + return range; +} + +/* + * compare_expanded_ranges + * Compare the expanded ranges - first by minimum, then by maximum. + * + * We do guarantee that ranges in a single Ranges object do not overlap, so it + * may seem strange that we don't order just by minimum. But when merging two + * Ranges (which happens in the union function), the ranges may in fact + * overlap. So we do compare both. + */ +static int +compare_expanded_ranges(const void *a, const void *b, void *arg) +{ + ExpandedRange *ra = (ExpandedRange *) a; + ExpandedRange *rb = (ExpandedRange *) b; + Datum r; + + compare_context *cxt = (compare_context *) arg; + + /* first compare minvals */ + r = FunctionCall2Coll(cxt->cmpFn, cxt->colloid, ra->minval, rb->minval); + + if (DatumGetBool(r)) + return -1; + + r = FunctionCall2Coll(cxt->cmpFn, cxt->colloid, rb->minval, ra->minval); + + if (DatumGetBool(r)) + return 1; + + /* then compare maxvals */ + r = FunctionCall2Coll(cxt->cmpFn, cxt->colloid, ra->maxval, rb->maxval); + + if (DatumGetBool(r)) + return -1; + + r = FunctionCall2Coll(cxt->cmpFn, cxt->colloid, rb->maxval, ra->maxval); + + if (DatumGetBool(r)) + return 1; + + return 0; +} + +/* + * compare_values + * Compare the values. + */ +static int +compare_values(const void *a, const void *b, void *arg) +{ + Datum *da = (Datum *) a; + Datum *db = (Datum *) b; + Datum r; + + compare_context *cxt = (compare_context *) arg; + + r = FunctionCall2Coll(cxt->cmpFn, cxt->colloid, *da, *db); + + if (DatumGetBool(r)) + return -1; + + r = FunctionCall2Coll(cxt->cmpFn, cxt->colloid, *db, *da); + + if (DatumGetBool(r)) + return 1; + + return 0; +} + +/* + * Check if the new value matches one of the existing ranges. + */ +static bool +has_matching_range(BrinDesc *bdesc, Oid colloid, Ranges *ranges, + Datum newval, AttrNumber attno, Oid typid) +{ + Datum compar; + + Datum minvalue = ranges->values[0]; + Datum maxvalue = ranges->values[2 * ranges->nranges - 1]; + + FmgrInfo *cmpLessFn; + FmgrInfo *cmpGreaterFn; + + /* binary search on ranges */ + int start, + end; + + if (ranges->nranges == 0) + return false; + + /* + * Otherwise, need to compare the new value with boundaries of all the + * ranges. First check if it's less than the absolute minimum, which is + * the first value in the array. 
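+ * For example, with the ranges [1,5], [10,20] and [30,40] and newval = 25,
+ * both global checks pass (1 <= 25 <= 40), but the binary search below ends
+ * without finding a range that contains 25, so false is returned.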
+ */ + cmpLessFn = minmax_multi_get_strategy_procinfo(bdesc, attno, typid, + BTLessStrategyNumber); + compar = FunctionCall2Coll(cmpLessFn, colloid, newval, minvalue); + + /* smaller than the smallest value in the range list */ + if (DatumGetBool(compar)) + return false; + + /* + * And now compare it to the existing maximum (last value in the data + * array). But only if we haven't already ruled out a possible match in + * the minvalue check. + */ + cmpGreaterFn = minmax_multi_get_strategy_procinfo(bdesc, attno, typid, + BTGreaterStrategyNumber); + compar = FunctionCall2Coll(cmpGreaterFn, colloid, newval, maxvalue); + + if (DatumGetBool(compar)) + return false; + + /* + * So we know it's in the general min/max, the question is whether it + * falls in one of the ranges or gaps. We'll do a binary search on + * individual ranges - for each range we check equality (value falls into + * the range), and then check ranges either above or below the current + * range. + */ + start = 0; /* first range */ + end = (ranges->nranges - 1); /* last range */ + while (true) + { + int midpoint = (start + end) / 2; + + /* this means we ran out of ranges in the last step */ + if (start > end) + return false; + + /* copy the min/max values from the ranges */ + minvalue = ranges->values[2 * midpoint]; + maxvalue = ranges->values[2 * midpoint + 1]; + + /* + * Is the value smaller than the minval? If yes, we'll recurse to the + * left side of range array. + */ + compar = FunctionCall2Coll(cmpLessFn, colloid, newval, minvalue); + + /* smaller than the smallest value in this range */ + if (DatumGetBool(compar)) + { + end = (midpoint - 1); + continue; + } + + /* + * Is the value greater than the minval? If yes, we'll recurse to the + * right side of range array. + */ + compar = FunctionCall2Coll(cmpGreaterFn, colloid, newval, maxvalue); + + /* larger than the largest value in this range */ + if (DatumGetBool(compar)) + { + start = (midpoint + 1); + continue; + } + + /* hey, we found a matching range */ + return true; + } + + return false; +} + + +/* + * range_contains_value + * See if the new value is already contained in the range list. + * + * We first inspect the list of intervals. We use a small trick - we check + * the value against min/max of the whole range (min of the first interval, + * max of the last one) first, and only inspect the individual intervals if + * this passes. + * + * If the value matches none of the intervals, we check the exact values. + * We simply loop through them and invoke equality operator on them. + * + * The last parameter (full) determines whether we need to search all the + * values, including the unsorted part. With full=false, the unsorted part + * is not searched, which may produce false negatives and duplicate values + * (in the unsorted part only), but when we're building the range that's + * fine - we'll deduplicate before serialization, and it can only happen + * if there already are unsorted values (so it was already modified). + * + * Serialized ranges don't have any unsorted values, so this can't cause + * false negatives during querying. + */ +static bool +range_contains_value(BrinDesc *bdesc, Oid colloid, + AttrNumber attno, Form_pg_attribute attr, + Ranges *ranges, Datum newval, bool full) +{ + int i; + FmgrInfo *cmpEqualFn; + Oid typid = attr->atttypid; + + /* + * First inspect the ranges, if there are any. We first check the whole + * range, and only when there's still a chance of getting a match we + * inspect the individual ranges. 
+ */ + if (has_matching_range(bdesc, colloid, ranges, newval, attno, typid)) + return true; + + cmpEqualFn = minmax_multi_get_strategy_procinfo(bdesc, attno, typid, + BTEqualStrategyNumber); + + /* + * There is no matching range, so let's inspect the sorted values. + * + * We do a sequential search for small numbers of values, and binary + * search once we have more than 16 values. This threshold is somewhat + * arbitrary, as it depends on how expensive the comparison function is. + * + * XXX If we use the threshold here, maybe we should do the same thing in + * has_matching_range? Or maybe we should do the bin search all the time? + * + * XXX We could use the same optimization as for ranges, to check if the + * value is between min/max, to maybe rule out all sorted values without + * having to inspect all of them. + */ + if (ranges->nsorted >= 16) + { + compare_context cxt; + + cxt.colloid = ranges->colloid; + cxt.cmpFn = ranges->cmp; + + if (bsearch_arg(&newval, &ranges->values[2 * ranges->nranges], + ranges->nsorted, sizeof(Datum), + compare_values, (void *) &cxt) != NULL) + return true; + } + else + { + for (i = 2 * ranges->nranges; i < 2 * ranges->nranges + ranges->nsorted; i++) + { + Datum compar; + + compar = FunctionCall2Coll(cmpEqualFn, colloid, newval, ranges->values[i]); + + /* found an exact match */ + if (DatumGetBool(compar)) + return true; + } + } + + /* If not asked to inspect the unsorted part, we're done. */ + if (!full) + return false; + + /* Inspect the unsorted part. */ + for (i = 2 * ranges->nranges + ranges->nsorted; i < 2 * ranges->nranges + ranges->nvalues; i++) + { + Datum compar; + + compar = FunctionCall2Coll(cmpEqualFn, colloid, newval, ranges->values[i]); + + /* found an exact match */ + if (DatumGetBool(compar)) + return true; + } + + /* the value is not covered by this BRIN tuple */ + return false; +} + +/* + * Expand ranges from Ranges into ExpandedRange array. This expects the + * eranges to be pre-allocated and with the correct size - there needs to be + * (nranges + nvalues) elements. + * + * The order of expanded ranges is arbitrary. We do expand the ranges first, + * and this part is sorted. But then we expand the values, and this part may + * be unsorted. + */ +static void +fill_expanded_ranges(ExpandedRange *eranges, int neranges, Ranges *ranges) +{ + int idx; + int i; + + /* Check that the output array has the right size. */ + Assert(neranges == (ranges->nranges + ranges->nvalues)); + + idx = 0; + for (i = 0; i < ranges->nranges; i++) + { + eranges[idx].minval = ranges->values[2 * i]; + eranges[idx].maxval = ranges->values[2 * i + 1]; + eranges[idx].collapsed = false; + idx++; + + Assert(idx <= neranges); + } + + for (i = 0; i < ranges->nvalues; i++) + { + eranges[idx].minval = ranges->values[2 * ranges->nranges + i]; + eranges[idx].maxval = ranges->values[2 * ranges->nranges + i]; + eranges[idx].collapsed = true; + idx++; + + Assert(idx <= neranges); + } + + /* Did we produce the expected number of elements? */ + Assert(idx == neranges); + + return; +} + +/* + * Sort and deduplicate expanded ranges. + * + * The ranges may be deduplicated - we're simply appending values, without + * checking for duplicates etc. So maybe the deduplication will reduce the + * number of ranges enough, and we won't have to compute the distances etc. + * + * Returns the number of expanded ranges. 
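+ *
+ * For example, the input [1,1], [1,1], [1,5], [1,5], [10,10] sorts and
+ * deduplicates to [1,1], [1,5], [10,10], so five expanded ranges shrink to
+ * three.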
+ */ +static int +sort_expanded_ranges(FmgrInfo *cmp, Oid colloid, + ExpandedRange *eranges, int neranges) +{ + int n; + int i; + compare_context cxt; + + Assert(neranges > 0); + + /* sort the values */ + cxt.colloid = colloid; + cxt.cmpFn = cmp; + + /* + * XXX We do qsort on all the values, but we could also leverage the fact + * that some of the input data is already sorted (all the ranges and maybe + * some of the points) and do merge sort. + */ + qsort_arg(eranges, neranges, sizeof(ExpandedRange), + compare_expanded_ranges, (void *) &cxt); + + /* + * Deduplicate the ranges - simply compare each range to the preceding + * one, and skip the duplicate ones. + */ + n = 1; + for (i = 1; i < neranges; i++) + { + /* if the current range is equal to the preceding one, do nothing */ + if (!compare_expanded_ranges(&eranges[i - 1], &eranges[i], (void *) &cxt)) + continue; + + /* otherwise, copy it to n-th place (if not already there) */ + if (i != n) + memcpy(&eranges[n], &eranges[i], sizeof(ExpandedRange)); + + n++; + } + + Assert((n > 0) && (n <= neranges)); + + return n; +} + +/* + * When combining multiple Range values (in union function), some of the + * ranges may overlap. We simply merge the overlapping ranges to fix that. + * + * XXX This assumes the expanded ranges were previously sorted (by minval + * and then maxval). We leverage this when detecting overlap. + */ +static int +merge_overlapping_ranges(FmgrInfo *cmp, Oid colloid, + ExpandedRange *eranges, int neranges) +{ + int idx; + + /* Merge ranges (idx) and (idx+1) if they overlap. */ + idx = 0; + while (idx < (neranges - 1)) + { + Datum r; + + /* + * comparing [?,maxval] vs. [minval,?] - the ranges overlap if (minval + * < maxval) + */ + r = FunctionCall2Coll(cmp, colloid, + eranges[idx].maxval, + eranges[idx + 1].minval); + + /* + * Nope, maxval < minval, so no overlap. And we know the ranges are + * ordered, so there are no more overlaps, because all the remaining + * ranges have greater or equal minval. + */ + if (DatumGetBool(r)) + { + /* proceed to the next range */ + idx += 1; + continue; + } + + /* + * So ranges 'idx' and 'idx+1' do overlap, but we don't know if + * 'idx+1' is contained in 'idx', or if they overlap only partially. + * So compare the upper bounds and keep the larger one. + */ + r = FunctionCall2Coll(cmp, colloid, + eranges[idx].maxval, + eranges[idx + 1].maxval); + + if (DatumGetBool(r)) + eranges[idx].maxval = eranges[idx + 1].maxval; + + /* + * The range certainly is no longer collapsed (irrespectively of the + * previous state). + */ + eranges[idx].collapsed = false; + + /* + * Now get rid of the (idx+1) range entirely by shifting the remaining + * ranges by 1. There are neranges elements, and we need to move + * elements from (idx+2). That means the number of elements to move is + * [ncranges - (idx+2)]. + */ + memmove(&eranges[idx + 1], &eranges[idx + 2], + (neranges - (idx + 2)) * sizeof(ExpandedRange)); + + /* + * Decrease the number of ranges, and repeat (with the same range, as + * it might overlap with additional ranges thanks to the merge). + */ + neranges--; + } + + return neranges; +} + +/* + * Simple comparator for distance values, comparing the double value. + * This is intentionally sorting the distances in descending order, i.e. + * the longer gaps will be at the front. 
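+ * For example, gap lengths 1.5, 8.0 and 3.0 end up ordered as 8.0, 3.0,
+ * 1.5, so the widest gaps are the first candidates to keep when reducing
+ * the ranges.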
+ */ +static int +compare_distances(const void *a, const void *b) +{ + DistanceValue *da = (DistanceValue *) a; + DistanceValue *db = (DistanceValue *) b; + + if (da->value < db->value) + return 1; + else if (da->value > db->value) + return -1; + + return 0; +} + +/* + * Given an array of expanded ranges, compute size of the gaps between each + * range. For neranges there are (neranges-1) gaps. + * + * We simply call the "distance" function to compute the (max-min) for pairs + * of consecutive ranges. The function may be fairly expensive, so we do that + * just once (and then use it to pick as many ranges to merge as possible). + * + * See reduce_expanded_ranges for details. + */ +static DistanceValue * +build_distances(FmgrInfo *distanceFn, Oid colloid, + ExpandedRange *eranges, int neranges) +{ + int i; + int ndistances; + DistanceValue *distances; + + Assert(neranges >= 2); + + ndistances = (neranges - 1); + distances = (DistanceValue *) palloc0(sizeof(DistanceValue) * ndistances); + + /* + * Walk through the ranges once and compute the distance between the + * ranges so that we can sort them once. + */ + for (i = 0; i < ndistances; i++) + { + Datum a1, + a2, + r; + + a1 = eranges[i].maxval; + a2 = eranges[i + 1].minval; + + /* compute length of the gap (between max/min) */ + r = FunctionCall2Coll(distanceFn, colloid, a1, a2); + + /* remember the index of the gap the distance is for */ + distances[i].index = i; + distances[i].value = DatumGetFloat8(r); + } + + /* + * Sort the distances in descending order, so that the longest gaps are at + * the front. + */ + pg_qsort(distances, ndistances, sizeof(DistanceValue), compare_distances); + + return distances; +} + +/* + * Builds expanded ranges for the existing ranges (and single-point ranges), + * and also the new value (which did not fit into the array). This expanded + * representation makes the processing a bit easier, as it allows handling + * ranges and points the same way. + * + * We sort and deduplicate the expanded ranges - this is necessary, because + * the points may be unsorted. And moreover the two parts (ranges and + * points) are sorted on their own. + */ +static ExpandedRange * +build_expanded_ranges(FmgrInfo *cmp, Oid colloid, Ranges *ranges, + int *nranges) +{ + int neranges; + ExpandedRange *eranges; + + /* both ranges and points are expanded into a separate element */ + neranges = ranges->nranges + ranges->nvalues; + + eranges = (ExpandedRange *) palloc0(neranges * sizeof(ExpandedRange)); + + /* fill the expanded ranges */ + fill_expanded_ranges(eranges, neranges, ranges); + + /* sort and deduplicate the expanded ranges */ + neranges = sort_expanded_ranges(cmp, colloid, eranges, neranges); + + /* remember how many ranges we built */ + *nranges = neranges; + + return eranges; +} + +#ifdef USE_ASSERT_CHECKING +/* + * Counts boundary values needed to store the ranges. Each single-point + * range is stored using a single value, each regular range needs two. + */ +static int +count_values(ExpandedRange *cranges, int ncranges) +{ + int i; + int count; + + count = 0; + for (i = 0; i < ncranges; i++) + { + if (cranges[i].collapsed) + count += 1; + else + count += 2; + } + + return count; +} +#endif + +/* + * reduce_expanded_ranges + * reduce the ranges until the number of values is low enough + * + * Combines ranges until the number of boundary values drops below the + * threshold specified by max_values. This happens by merging enough + * ranges by the distance between them. + * + * Returns the number of result ranges. 
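+ *
+ * For example, with max_values = 32 we keep the global min/max plus the
+ * boundaries of the 15 widest gaps, i.e. 32 boundary values forming 16
+ * ranges (some of which may be collapsed into single points).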
+ * + * We simply use the global min/max and then add boundaries for enough + * largest gaps. Each gap adds 2 values, so we simply use (target/2-1) + * distances. Then we simply sort all the values - each two values are + * a boundary of a range (possibly collapsed). + * + * XXX Some of the ranges may be collapsed (i.e. the min/max values are + * equal), but we ignore that for now. We could repeat the process, + * adding a couple more gaps recursively. + * + * XXX The ranges to merge are selected solely using the distance. But + * that may not be the best strategy, for example when multiple gaps + * are of equal (or very similar) length. + * + * Consider for example points 1, 2, 3, .., 64, which have gaps of the + * same length 1 of course. In that case, we tend to pick the first + * gap of that length, which leads to this: + * + * step 1: [1, 2], 3, 4, 5, .., 64 + * step 2: [1, 3], 4, 5, .., 64 + * step 3: [1, 4], 5, .., 64 + * ... + * + * So in the end we'll have one "large" range and multiple small points. + * That may be fine, but it seems a bit strange and non-optimal. Maybe + * we should consider other things when picking ranges to merge - e.g. + * length of the ranges? Or perhaps randomize the choice of ranges, with + * probability inversely proportional to the distance (the gap lengths + * may be very close, but not exactly the same). + * + * XXX Or maybe we could just handle this by using random value as a + * tie-break, or by adding random noise to the actual distance. + */ +static int +reduce_expanded_ranges(ExpandedRange *eranges, int neranges, + DistanceValue *distances, int max_values, + FmgrInfo *cmp, Oid colloid) +{ + int i; + int nvalues; + Datum *values; + + compare_context cxt; + + /* total number of gaps between ranges */ + int ndistances = (neranges - 1); + + /* number of gaps to keep */ + int keep = (max_values / 2 - 1); + + /* + * Maybe we have a sufficiently low number of ranges already? + * + * XXX This should happen before we actually do the expensive stuff like + * sorting, so maybe this should be just an assert. + */ + if (keep >= ndistances) + return neranges; + + /* sort the values */ + cxt.colloid = colloid; + cxt.cmpFn = cmp; + + /* allocate space for the boundary values */ + nvalues = 0; + values = (Datum *) palloc(sizeof(Datum) * max_values); + + /* add the global min/max values, from the first/last range */ + values[nvalues++] = eranges[0].minval; + values[nvalues++] = eranges[neranges - 1].maxval; + + /* add boundary values for enough gaps */ + for (i = 0; i < keep; i++) + { + /* index of the gap between (index) and (index+1) ranges */ + int index = distances[i].index; + + Assert((index >= 0) && ((index + 1) < neranges)); + + /* add max from the preceding range, minval from the next one */ + values[nvalues++] = eranges[index].maxval; + values[nvalues++] = eranges[index + 1].minval; + + Assert(nvalues <= max_values); + } + + /* We should have an even number of range values. */ + Assert(nvalues % 2 == 0); + + /* + * Sort the values using the comparator function, and form ranges from the + * sorted result. + */ + qsort_arg(values, nvalues, sizeof(Datum), + compare_values, (void *) &cxt); + + /* We have nvalues boundary values, which means nvalues/2 ranges. 
*/ + for (i = 0; i < (nvalues / 2); i++) + { + eranges[i].minval = values[2 * i]; + eranges[i].maxval = values[2 * i + 1]; + + /* if the boundary values are the same, it's a collapsed range */ + eranges[i].collapsed = (compare_values(&values[2 * i], + &values[2 * i + 1], + &cxt) == 0); + } + + return (nvalues / 2); +} + +/* + * Store the boundary values from ExpandedRanges back into 'ranges' (using + * only the minimal number of values needed). + */ +static void +store_expanded_ranges(Ranges *ranges, ExpandedRange *eranges, int neranges) +{ + int i; + int idx = 0; + + /* first copy in the regular ranges */ + ranges->nranges = 0; + for (i = 0; i < neranges; i++) + { + if (!eranges[i].collapsed) + { + ranges->values[idx++] = eranges[i].minval; + ranges->values[idx++] = eranges[i].maxval; + ranges->nranges++; + } + } + + /* now copy in the collapsed ones */ + ranges->nvalues = 0; + for (i = 0; i < neranges; i++) + { + if (eranges[i].collapsed) + { + ranges->values[idx++] = eranges[i].minval; + ranges->nvalues++; + } + } + + /* all the values are sorted */ + ranges->nsorted = ranges->nvalues; + + Assert(count_values(eranges, neranges) == 2 * ranges->nranges + ranges->nvalues); + Assert(2 * ranges->nranges + ranges->nvalues <= ranges->maxvalues); +} + + +/* + * Consider freeing space in the ranges. Checks if there's space for at least + * one new value, and performs compaction if needed. + * + * Returns true if the value was actually modified. + */ +static bool +ensure_free_space_in_buffer(BrinDesc *bdesc, Oid colloid, + AttrNumber attno, Form_pg_attribute attr, + Ranges *range) +{ + MemoryContext ctx; + MemoryContext oldctx; + + FmgrInfo *cmpFn, + *distanceFn; + + /* expanded ranges */ + ExpandedRange *eranges; + int neranges; + DistanceValue *distances; + + /* + * If there is free space in the buffer, we're done without having to + * modify anything. + */ + if (2 * range->nranges + range->nvalues < range->maxvalues) + return false; + + /* we'll certainly need the comparator, so just look it up now */ + cmpFn = minmax_multi_get_strategy_procinfo(bdesc, attno, attr->atttypid, + BTLessStrategyNumber); + + /* deduplicate values, if there's an unsorted part */ + range_deduplicate_values(range); + + /* + * Did we reduce enough free space by just the deduplication? + * + * We don't simply check against range->maxvalues again. The deduplication + * might have freed very little space (e.g. just one value), forcing us to + * do deduplication very often. In that case, it's better to do the + * compaction and reduce more space. + */ + if (2 * range->nranges + range->nvalues <= range->maxvalues * MINMAX_BUFFER_LOAD_FACTOR) + return true; + + /* + * We need to combine some of the existing ranges, to reduce the number of + * values we have to store. + * + * The distanceFn calls (which may internally call e.g. numeric_le) may + * allocate quite a bit of memory, and we must not leak it (we might have + * to do this repeatedly, even for a single BRIN page range). Otherwise + * we'd have problems e.g. when building new indexes. So we use a memory + * context and make sure we free the memory at the end (so if we call the + * distance function many times, it might be an issue, but meh). 
+ */ + ctx = AllocSetContextCreate(CurrentMemoryContext, + "minmax-multi context", + ALLOCSET_DEFAULT_SIZES); + + oldctx = MemoryContextSwitchTo(ctx); + + /* build the expanded ranges */ + eranges = build_expanded_ranges(cmpFn, colloid, range, &neranges); + + /* and we'll also need the 'distance' procedure */ + distanceFn = minmax_multi_get_procinfo(bdesc, attno, PROCNUM_DISTANCE); + + /* build array of gap distances and sort them in ascending order */ + distances = build_distances(distanceFn, colloid, eranges, neranges); + + /* + * Combine ranges until we release at least 50% of the space. This + * threshold is somewhat arbitrary, perhaps needs tuning. We must not use + * too low or high value. + */ + neranges = reduce_expanded_ranges(eranges, neranges, distances, + range->maxvalues * MINMAX_BUFFER_LOAD_FACTOR, + cmpFn, colloid); + + /* Make sure we've sufficiently reduced the number of ranges. */ + Assert(count_values(eranges, neranges) <= range->maxvalues * MINMAX_BUFFER_LOAD_FACTOR); + + /* decompose the expanded ranges into regular ranges and single values */ + store_expanded_ranges(range, eranges, neranges); + + MemoryContextSwitchTo(oldctx); + MemoryContextDelete(ctx); + + /* Did we break the ranges somehow? */ + AssertCheckRanges(range, cmpFn, colloid); + + return true; +} + +/* + * range_add_value + * Add the new value to the minmax-multi range. + */ +static bool +range_add_value(BrinDesc *bdesc, Oid colloid, + AttrNumber attno, Form_pg_attribute attr, + Ranges *ranges, Datum newval) +{ + FmgrInfo *cmpFn; + bool modified = false; + + /* we'll certainly need the comparator, so just look it up now */ + cmpFn = minmax_multi_get_strategy_procinfo(bdesc, attno, attr->atttypid, + BTLessStrategyNumber); + + /* comprehensive checks of the input ranges */ + AssertCheckRanges(ranges, cmpFn, colloid); + + /* + * Make sure there's enough free space in the buffer. We only trigger this + * when the buffer is full, which means it had to be modified as we size + * it to be larger than what is stored on disk. + * + * This needs to happen before we check if the value is contained in the + * range, because the value might be in the unsorted part, and we don't + * check that in range_contains_value. The deduplication would then move + * it to the sorted part, and we'd add the value too, which violates the + * rule that we never have duplicates with the ranges or sorted values. + * + * We might also deduplicate and recheck if the value is contained, but + * that seems like overkill. We'd need to deduplicate anyway, so why not + * do it now. + */ + modified = ensure_free_space_in_buffer(bdesc, colloid, + attno, attr, ranges); + + /* + * Bail out if the value already is covered by the range. + * + * We could also add values until we hit values_per_range, and then do the + * deduplication in a batch, hoping for better efficiency. But that would + * mean we actually modify the range every time, which means having to + * serialize the value, which does palloc, walks the values, copies them, + * etc. Not exactly cheap. + * + * So instead we do the check, which should be fairly cheap - assuming the + * comparator function is not very expensive. + * + * This also implies the values array can't contain duplicate values. + */ + if (range_contains_value(bdesc, colloid, attno, attr, ranges, newval, false)) + return modified; + + /* Make a copy of the value, if needed. */ + newval = datumCopy(newval, attr->attbyval, attr->attlen); + + /* + * If there's space in the values array, copy it in and we're done. 
+ * + * We do want to keep the values sorted (to speed up searches), so we do a + * simple insertion sort. We could do something more elaborate, e.g. by + * sorting the values only now and then, but for small counts (e.g. when + * maxvalues is 64) this should be fine. + */ + ranges->values[2 * ranges->nranges + ranges->nvalues] = newval; + ranges->nvalues++; + + /* If we added the first value, we can consider it as sorted. */ + if (ranges->nvalues == 1) + ranges->nsorted = 1; + + /* + * Check we haven't broken the ordering of boundary values (checks both + * parts, but that doesn't hurt). + */ + AssertCheckRanges(ranges, cmpFn, colloid); + + /* Check the range contains the value we just added. */ + Assert(range_contains_value(bdesc, colloid, attno, attr, ranges, newval, true)); + + /* yep, we've modified the range */ + return true; +} + +/* + * Generate range representation of data collected during "batch mode". + * This is similar to reduce_expanded_ranges, except that we can't assume + * the values are sorted and there may be duplicate values. + */ +static void +compactify_ranges(BrinDesc *bdesc, Ranges *ranges, int max_values) +{ + FmgrInfo *cmpFn, + *distanceFn; + + /* expanded ranges */ + ExpandedRange *eranges; + int neranges; + DistanceValue *distances; + + MemoryContext ctx; + MemoryContext oldctx; + + /* + * Do we need to actually compactify anything? + * + * There are two reasons why compaction may be needed - firstly, there may + * be too many values, or some of the values may be unsorted. + */ + if ((ranges->nranges * 2 + ranges->nvalues <= max_values) && + (ranges->nsorted == ranges->nvalues)) + return; + + /* we'll certainly need the comparator, so just look it up now */ + cmpFn = minmax_multi_get_strategy_procinfo(bdesc, ranges->attno, ranges->typid, + BTLessStrategyNumber); + + /* and we'll also need the 'distance' procedure */ + distanceFn = minmax_multi_get_procinfo(bdesc, ranges->attno, PROCNUM_DISTANCE); + + /* + * The distanceFn calls (which may internally call e.g. numeric_le) may + * allocate quite a bit of memory, and we must not leak it. Otherwise, + * we'd have problems e.g. when building indexes. So we create a local + * memory context and make sure we free the memory before leaving this + * function (not after every call). + */ + ctx = AllocSetContextCreate(CurrentMemoryContext, + "minmax-multi context", + ALLOCSET_DEFAULT_SIZES); + + oldctx = MemoryContextSwitchTo(ctx); + + /* build the expanded ranges */ + eranges = build_expanded_ranges(cmpFn, ranges->colloid, ranges, &neranges); + + /* build array of gap distances and sort them in ascending order */ + distances = build_distances(distanceFn, ranges->colloid, + eranges, neranges); + + /* + * Combine ranges until we get below max_values. We don't use any scale + * factor, because this is used during serialization, and we don't expect + * more tuples to be inserted anytime soon. 
+ */ + neranges = reduce_expanded_ranges(eranges, neranges, distances, + max_values, cmpFn, ranges->colloid); + + Assert(count_values(eranges, neranges) <= max_values); + + /* transform back into regular ranges and single values */ + store_expanded_ranges(ranges, eranges, neranges); + + /* check all the range invariants */ + AssertCheckRanges(ranges, cmpFn, ranges->colloid); + + MemoryContextSwitchTo(oldctx); + MemoryContextDelete(ctx); +} + +Datum +brin_minmax_multi_opcinfo(PG_FUNCTION_ARGS) +{ + BrinOpcInfo *result; + + /* + * opaque->strategy_procinfos is initialized lazily; here it is set to + * all-uninitialized by palloc0 which sets fn_oid to InvalidOid. + */ + + result = palloc0(MAXALIGN(SizeofBrinOpcInfo(1)) + + sizeof(MinmaxMultiOpaque)); + result->oi_nstored = 1; + result->oi_regular_nulls = true; + result->oi_opaque = (MinmaxMultiOpaque *) + MAXALIGN((char *) result + SizeofBrinOpcInfo(1)); + result->oi_typcache[0] = lookup_type_cache(PG_BRIN_MINMAX_MULTI_SUMMARYOID, 0); + + PG_RETURN_POINTER(result); +} + +/* + * Compute the distance between two float4 values (plain subtraction). + */ +Datum +brin_minmax_multi_distance_float4(PG_FUNCTION_ARGS) +{ + float a1 = PG_GETARG_FLOAT4(0); + float a2 = PG_GETARG_FLOAT4(1); + + /* if both values are NaN, then we consider them the same */ + if (isnan(a1) && isnan(a2)) + PG_RETURN_FLOAT8(0.0); + + /* if one value is NaN, use infinite distance */ + if (isnan(a1) || isnan(a2)) + PG_RETURN_FLOAT8(get_float8_infinity()); + + /* + * We know the values are range boundaries, but the range may be collapsed + * (i.e. single points), with equal values. + */ + Assert(a1 <= a2); + + PG_RETURN_FLOAT8((double) a2 - (double) a1); +} + +/* + * Compute the distance between two float8 values (plain subtraction). + */ +Datum +brin_minmax_multi_distance_float8(PG_FUNCTION_ARGS) +{ + double a1 = PG_GETARG_FLOAT8(0); + double a2 = PG_GETARG_FLOAT8(1); + + /* if both values are NaN, then we consider them the same */ + if (isnan(a1) && isnan(a2)) + PG_RETURN_FLOAT8(0.0); + + /* if one value is NaN, use infinite distance */ + if (isnan(a1) || isnan(a2)) + PG_RETURN_FLOAT8(get_float8_infinity()); + + /* + * We know the values are range boundaries, but the range may be collapsed + * (i.e. single points), with equal values. + */ + Assert(a1 <= a2); + + PG_RETURN_FLOAT8(a2 - a1); +} + +/* + * Compute the distance between two int2 values (plain subtraction). + */ +Datum +brin_minmax_multi_distance_int2(PG_FUNCTION_ARGS) +{ + int16 a1 = PG_GETARG_INT16(0); + int16 a2 = PG_GETARG_INT16(1); + + /* + * We know the values are range boundaries, but the range may be collapsed + * (i.e. single points), with equal values. + */ + Assert(a1 <= a2); + + PG_RETURN_FLOAT8((double) a2 - (double) a1); +} + +/* + * Compute the distance between two int4 values (plain subtraction). + */ +Datum +brin_minmax_multi_distance_int4(PG_FUNCTION_ARGS) +{ + int32 a1 = PG_GETARG_INT32(0); + int32 a2 = PG_GETARG_INT32(1); + + /* + * We know the values are range boundaries, but the range may be collapsed + * (i.e. single points), with equal values. + */ + Assert(a1 <= a2); + + PG_RETURN_FLOAT8((double) a2 - (double) a1); +} + +/* + * Compute the distance between two int8 values (plain subtraction). + */ +Datum +brin_minmax_multi_distance_int8(PG_FUNCTION_ARGS) +{ + int64 a1 = PG_GETARG_INT64(0); + int64 a2 = PG_GETARG_INT64(1); + + /* + * We know the values are range boundaries, but the range may be collapsed + * (i.e. single points), with equal values. 
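+ *
+ * The difference is returned as a double, so for extremely distant int64
+ * values a little precision may be lost. That should be acceptable here,
+ * as the distances are only used to rank the gaps.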
+ */ + Assert(a1 <= a2); + + PG_RETURN_FLOAT8((double) a2 - (double) a1); +} + +/* + * Compute the distance between two tid values (by mapping them to float8 and + * then subtracting them). + */ +Datum +brin_minmax_multi_distance_tid(PG_FUNCTION_ARGS) +{ + double da1, + da2; + + ItemPointer pa1 = (ItemPointer) PG_GETARG_DATUM(0); + ItemPointer pa2 = (ItemPointer) PG_GETARG_DATUM(1); + + /* + * We know the values are range boundaries, but the range may be collapsed + * (i.e. single points), with equal values. + */ + Assert(ItemPointerCompare(pa1, pa2) <= 0); + + /* + * We use the no-check variants here, because user-supplied values may + * have (ip_posid == 0). See ItemPointerCompare. + */ + da1 = ItemPointerGetBlockNumberNoCheck(pa1) * MaxHeapTuplesPerPage + + ItemPointerGetOffsetNumberNoCheck(pa1); + + da2 = ItemPointerGetBlockNumberNoCheck(pa2) * MaxHeapTuplesPerPage + + ItemPointerGetOffsetNumberNoCheck(pa2); + + PG_RETURN_FLOAT8(da2 - da1); +} + +/* + * Compute the distance between two numeric values (plain subtraction). + */ +Datum +brin_minmax_multi_distance_numeric(PG_FUNCTION_ARGS) +{ + Datum d; + Datum a1 = PG_GETARG_DATUM(0); + Datum a2 = PG_GETARG_DATUM(1); + + /* + * We know the values are range boundaries, but the range may be collapsed + * (i.e. single points), with equal values. + */ + Assert(DatumGetBool(DirectFunctionCall2(numeric_le, a1, a2))); + + d = DirectFunctionCall2(numeric_sub, a2, a1); /* a2 - a1 */ + + PG_RETURN_FLOAT8(DirectFunctionCall1(numeric_float8, d)); +} + +/* + * Compute the approximate distance between two UUID values. + * + * XXX We do not need a perfectly accurate value, so we approximate the + * deltas (which would have to be 128-bit integers) with a 64-bit float. + * The small inaccuracies do not matter in practice, in the worst case + * we'll decide to merge ranges that are not the closest ones. + */ +Datum +brin_minmax_multi_distance_uuid(PG_FUNCTION_ARGS) +{ + int i; + float8 delta = 0; + + Datum a1 = PG_GETARG_DATUM(0); + Datum a2 = PG_GETARG_DATUM(1); + + pg_uuid_t *u1 = DatumGetUUIDP(a1); + pg_uuid_t *u2 = DatumGetUUIDP(a2); + + /* + * We know the values are range boundaries, but the range may be collapsed + * (i.e. single points), with equal values. + */ + Assert(DatumGetBool(DirectFunctionCall2(uuid_le, a1, a2))); + + /* compute approximate delta as a double precision value */ + for (i = UUID_LEN - 1; i >= 0; i--) + { + delta += (int) u2->data[i] - (int) u1->data[i]; + delta /= 256; + } + + Assert(delta >= 0); + + PG_RETURN_FLOAT8(delta); +} + +/* + * Compute the approximate distance between two dates. + */ +Datum +brin_minmax_multi_distance_date(PG_FUNCTION_ARGS) +{ + DateADT dateVal1 = PG_GETARG_DATEADT(0); + DateADT dateVal2 = PG_GETARG_DATEADT(1); + + if (DATE_NOT_FINITE(dateVal1) || DATE_NOT_FINITE(dateVal2)) + PG_RETURN_FLOAT8(0); + + PG_RETURN_FLOAT8(dateVal1 - dateVal2); +} + +/* + * Compute the approximate distance between two time (without tz) values. + * + * TimeADT is just an int64, so we simply subtract the values directly. + */ +Datum +brin_minmax_multi_distance_time(PG_FUNCTION_ARGS) +{ + float8 delta = 0; + + TimeADT ta = PG_GETARG_TIMEADT(0); + TimeADT tb = PG_GETARG_TIMEADT(1); + + delta = (tb - ta); + + Assert(delta >= 0); + + PG_RETURN_FLOAT8(delta); +} + +/* + * Compute the approximate distance between two timetz values. + * + * Simply subtracts the TimeADT (int64) values embedded in TimeTzADT. 
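+ *
+ * The timezone offsets are also taken into account, scaled from seconds
+ * to microseconds, so the result is consistent with how timetz values
+ * are ordered.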
+ */ +Datum +brin_minmax_multi_distance_timetz(PG_FUNCTION_ARGS) +{ + float8 delta = 0; + + TimeTzADT *ta = PG_GETARG_TIMETZADT_P(0); + TimeTzADT *tb = PG_GETARG_TIMETZADT_P(1); + + delta = (tb->time - ta->time) + (tb->zone - ta->zone) * USECS_PER_SEC; + + Assert(delta >= 0); + + PG_RETURN_FLOAT8(delta); +} + +/* + * Compute the distance between two timestamp values. + */ +Datum +brin_minmax_multi_distance_timestamp(PG_FUNCTION_ARGS) +{ + float8 delta = 0; + + Timestamp dt1 = PG_GETARG_TIMESTAMP(0); + Timestamp dt2 = PG_GETARG_TIMESTAMP(1); + + if (TIMESTAMP_NOT_FINITE(dt1) || TIMESTAMP_NOT_FINITE(dt2)) + PG_RETURN_FLOAT8(0); + + delta = dt2 - dt1; + + Assert(delta >= 0); + + PG_RETURN_FLOAT8(delta); +} + +/* + * Compute the distance between two interval values. + */ +Datum +brin_minmax_multi_distance_interval(PG_FUNCTION_ARGS) +{ + float8 delta = 0; + + Interval *ia = PG_GETARG_INTERVAL_P(0); + Interval *ib = PG_GETARG_INTERVAL_P(1); + Interval *result; + + int64 dayfraction; + int64 days; + + result = (Interval *) palloc(sizeof(Interval)); + + result->month = ib->month - ia->month; + /* overflow check copied from int4mi */ + if (!SAMESIGN(ib->month, ia->month) && + !SAMESIGN(result->month, ib->month)) + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("interval out of range"))); + + result->day = ib->day - ia->day; + if (!SAMESIGN(ib->day, ia->day) && + !SAMESIGN(result->day, ib->day)) + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("interval out of range"))); + + result->time = ib->time - ia->time; + if (!SAMESIGN(ib->time, ia->time) && + !SAMESIGN(result->time, ib->time)) + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("interval out of range"))); + + /* + * Delta is (fractional) number of days between the intervals. Assume + * months have 30 days for consistency with interval_cmp_internal. We + * don't need to be exact, in the worst case we'll build a bit less + * efficient ranges. But we should not contradict interval_cmp. + */ + dayfraction = result->time % USECS_PER_DAY; + days = result->time / USECS_PER_DAY; + days += result->month * INT64CONST(30); + days += result->day; + + /* convert to double precision */ + delta = (double) days + dayfraction / (double) USECS_PER_DAY; + + Assert(delta >= 0); + + PG_RETURN_FLOAT8(delta); +} + +/* + * Compute the distance between two pg_lsn values. + * + * LSN is just an int64 encoding position in the stream, so just subtract + * those int64 values directly. + */ +Datum +brin_minmax_multi_distance_pg_lsn(PG_FUNCTION_ARGS) +{ + float8 delta = 0; + + XLogRecPtr lsna = PG_GETARG_LSN(0); + XLogRecPtr lsnb = PG_GETARG_LSN(1); + + delta = (lsnb - lsna); + + Assert(delta >= 0); + + PG_RETURN_FLOAT8(delta); +} + +/* + * Compute the distance between two macaddr values. + * + * mac addresses are treated as 6 unsigned chars, so do the same thing we + * already do for UUID values. 
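+ *
+ * The result is a byte-wise difference scaled into the [0, 1) range, with
+ * the most significant byte carrying the largest weight (1/256), just
+ * like the uuid case above.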
+ */ +Datum +brin_minmax_multi_distance_macaddr(PG_FUNCTION_ARGS) +{ + float8 delta; + + macaddr *a = PG_GETARG_MACADDR_P(0); + macaddr *b = PG_GETARG_MACADDR_P(1); + + delta = ((float8) b->f - (float8) a->f); + delta /= 256; + + delta += ((float8) b->e - (float8) a->e); + delta /= 256; + + delta += ((float8) b->d - (float8) a->d); + delta /= 256; + + delta += ((float8) b->c - (float8) a->c); + delta /= 256; + + delta += ((float8) b->b - (float8) a->b); + delta /= 256; + + delta += ((float8) b->a - (float8) a->a); + delta /= 256; + + Assert(delta >= 0); + + PG_RETURN_FLOAT8(delta); +} + +/* + * Compute the distance between two macaddr8 values. + * + * macaddr8 addresses are 8 unsigned chars, so do the same thing we + * already do for UUID values. + */ +Datum +brin_minmax_multi_distance_macaddr8(PG_FUNCTION_ARGS) +{ + float8 delta; + + macaddr8 *a = PG_GETARG_MACADDR8_P(0); + macaddr8 *b = PG_GETARG_MACADDR8_P(1); + + delta = ((float8) b->h - (float8) a->h); + delta /= 256; + + delta += ((float8) b->g - (float8) a->g); + delta /= 256; + + delta += ((float8) b->f - (float8) a->f); + delta /= 256; + + delta += ((float8) b->e - (float8) a->e); + delta /= 256; + + delta += ((float8) b->d - (float8) a->d); + delta /= 256; + + delta += ((float8) b->c - (float8) a->c); + delta /= 256; + + delta += ((float8) b->b - (float8) a->b); + delta /= 256; + + delta += ((float8) b->a - (float8) a->a); + delta /= 256; + + Assert(delta >= 0); + + PG_RETURN_FLOAT8(delta); +} + +/* + * Compute the distance between two inet values. + * + * The distance is defined as the difference between 32-bit/128-bit values, + * depending on the IP version. The distance is computed by subtracting + * the bytes and normalizing it to [0,1] range for each IP family. + * Addresses from different families are considered to be in maximum + * distance, which is 1.0. + * + * XXX Does this need to consider the mask (bits)? For now, it's ignored. + */ +Datum +brin_minmax_multi_distance_inet(PG_FUNCTION_ARGS) +{ + float8 delta; + int i; + int len; + unsigned char *addra, + *addrb; + + inet *ipa = PG_GETARG_INET_PP(0); + inet *ipb = PG_GETARG_INET_PP(1); + + int lena, + lenb; + + /* + * If the addresses are from different families, consider them to be in + * maximal possible distance (which is 1.0). + */ + if (ip_family(ipa) != ip_family(ipb)) + PG_RETURN_FLOAT8(1.0); + + addra = (unsigned char *) palloc(ip_addrsize(ipa)); + memcpy(addra, ip_addr(ipa), ip_addrsize(ipa)); + + addrb = (unsigned char *) palloc(ip_addrsize(ipb)); + memcpy(addrb, ip_addr(ipb), ip_addrsize(ipb)); + + /* + * The length is calculated from the mask length, because we sort the + * addresses by first address in the range, so A.B.C.D/24 < A.B.C.1 (the + * first range starts at A.B.C.0, which is before A.B.C.1). We don't want + * to produce a negative delta in this case, so we just cut the extra + * bytes. + * + * XXX Maybe this should be a bit more careful and cut the bits, not just + * whole bytes. + */ + lena = ip_bits(ipa); + lenb = ip_bits(ipb); + + len = ip_addrsize(ipa); + + /* apply the network mask to both addresses */ + for (i = 0; i < len; i++) + { + unsigned char mask; + int nbits; + + nbits = lena - (i * 8); + if (nbits < 8) + { + mask = (0xFF << (8 - nbits)); + addra[i] = (addra[i] & mask); + } + + nbits = lenb - (i * 8); + if (nbits < 8) + { + mask = (0xFF << (8 - nbits)); + addrb[i] = (addrb[i] & mask); + } + } + + /* Calculate the difference between the addresses. 
*/ + delta = 0; + for (i = len - 1; i >= 0; i--) + { + unsigned char a = addra[i]; + unsigned char b = addrb[i]; + + delta += (float8) b - (float8) a; + delta /= 256; + } + + Assert((delta >= 0) && (delta <= 1)); + + pfree(addra); + pfree(addrb); + + PG_RETURN_FLOAT8(delta); +} + +static void +brin_minmax_multi_serialize(BrinDesc *bdesc, Datum src, Datum *dst) +{ + Ranges *ranges = (Ranges *) DatumGetPointer(src); + SerializedRanges *s; + + /* + * In batch mode, we need to compress the accumulated values to the + * actually requested number of values/ranges. + */ + compactify_ranges(bdesc, ranges, ranges->target_maxvalues); + + /* At this point everything has to be fully sorted. */ + Assert(ranges->nsorted == ranges->nvalues); + + s = range_serialize(ranges); + dst[0] = PointerGetDatum(s); +} + +static int +brin_minmax_multi_get_values(BrinDesc *bdesc, MinMaxMultiOptions *opts) +{ + return MinMaxMultiGetValuesPerRange(opts); +} + +/* + * Examine the given index tuple (which contains the partial status of a + * certain page range) by comparing it to the given value that comes from + * another heap tuple. If the new value is outside the min/max range + * specified by the existing tuple values, update the index tuple and return + * true. Otherwise, return false and do not modify in this case. + */ +Datum +brin_minmax_multi_add_value(PG_FUNCTION_ARGS) +{ + BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); + BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1); + Datum newval = PG_GETARG_DATUM(2); + bool isnull PG_USED_FOR_ASSERTS_ONLY = PG_GETARG_DATUM(3); + MinMaxMultiOptions *opts = (MinMaxMultiOptions *) PG_GET_OPCLASS_OPTIONS(); + Oid colloid = PG_GET_COLLATION(); + bool modified = false; + Form_pg_attribute attr; + AttrNumber attno; + Ranges *ranges; + SerializedRanges *serialized = NULL; + + Assert(!isnull); + + attno = column->bv_attno; + attr = TupleDescAttr(bdesc->bd_tupdesc, attno - 1); + + /* use the already deserialized value, if possible */ + ranges = (Ranges *) DatumGetPointer(column->bv_mem_value); + + /* + * If this is the first non-null value, we need to initialize the range + * list. Otherwise, just extract the existing range list from BrinValues. + * + * When starting with an empty range, we assume this is a batch mode and + * we use a larger buffer. The buffer size is derived from the BRIN range + * size, number of rows per page, with some sensible min/max values. A + * small buffer would be bad for performance, but a large buffer might + * require a lot of memory (because of keeping all the values). + */ + if (column->bv_allnulls) + { + MemoryContext oldctx; + + int target_maxvalues; + int maxvalues; + BlockNumber pagesPerRange = BrinGetPagesPerRange(bdesc->bd_index); + + /* what was specified as a reloption? */ + target_maxvalues = brin_minmax_multi_get_values(bdesc, opts); + + /* + * Determine the insert buffer size - we use 10x the target, capped to + * the maximum number of values in the heap range. This is more than + * enough, considering the actual number of rows per page is likely + * much lower, but meh. 
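+ *
+ * For example, with values_per_range = 32 this allows buffering up to
+ * 320 values before compaction, subject to the MIN/MAX caps below.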
+ */ + maxvalues = Min(target_maxvalues * MINMAX_BUFFER_FACTOR, + MaxHeapTuplesPerPage * pagesPerRange); + + /* but always at least the original value */ + maxvalues = Max(maxvalues, target_maxvalues); + + /* always cap by MIN/MAX */ + maxvalues = Max(maxvalues, MINMAX_BUFFER_MIN); + maxvalues = Min(maxvalues, MINMAX_BUFFER_MAX); + + oldctx = MemoryContextSwitchTo(column->bv_context); + ranges = minmax_multi_init(maxvalues); + ranges->attno = attno; + ranges->colloid = colloid; + ranges->typid = attr->atttypid; + ranges->target_maxvalues = target_maxvalues; + + /* we'll certainly need the comparator, so just look it up now */ + ranges->cmp = minmax_multi_get_strategy_procinfo(bdesc, attno, attr->atttypid, + BTLessStrategyNumber); + + MemoryContextSwitchTo(oldctx); + + column->bv_allnulls = false; + modified = true; + + column->bv_mem_value = PointerGetDatum(ranges); + column->bv_serialize = brin_minmax_multi_serialize; + } + else if (!ranges) + { + MemoryContext oldctx; + + int maxvalues; + BlockNumber pagesPerRange = BrinGetPagesPerRange(bdesc->bd_index); + + oldctx = MemoryContextSwitchTo(column->bv_context); + + serialized = (SerializedRanges *) PG_DETOAST_DATUM(column->bv_values[0]); + + /* + * Determine the insert buffer size - we use 10x the target, capped to + * the maximum number of values in the heap range. This is more than + * enough, considering the actual number of rows per page is likely + * much lower, but meh. + */ + maxvalues = Min(serialized->maxvalues * MINMAX_BUFFER_FACTOR, + MaxHeapTuplesPerPage * pagesPerRange); + + /* but always at least the original value */ + maxvalues = Max(maxvalues, serialized->maxvalues); + + /* always cap by MIN/MAX */ + maxvalues = Max(maxvalues, MINMAX_BUFFER_MIN); + maxvalues = Min(maxvalues, MINMAX_BUFFER_MAX); + + ranges = range_deserialize(maxvalues, serialized); + + ranges->attno = attno; + ranges->colloid = colloid; + ranges->typid = attr->atttypid; + + /* we'll certainly need the comparator, so just look it up now */ + ranges->cmp = minmax_multi_get_strategy_procinfo(bdesc, attno, attr->atttypid, + BTLessStrategyNumber); + + column->bv_mem_value = PointerGetDatum(ranges); + column->bv_serialize = brin_minmax_multi_serialize; + + MemoryContextSwitchTo(oldctx); + } + + /* + * Try to add the new value to the range. We need to update the modified + * flag, so that we serialize the updated summary later. + */ + modified |= range_add_value(bdesc, colloid, attno, attr, ranges, newval); + + + PG_RETURN_BOOL(modified); +} + +/* + * Given an index tuple corresponding to a certain page range and a scan key, + * return whether the scan key is consistent with the index tuple's min/max + * values. Return true if so, false otherwise. 
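+ *
+ * As with other BRIN consistent functions, this only has to decide whether
+ * the page range might contain matching rows; returning true spuriously
+ * just means the heap range is scanned unnecessarily.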
+ */ +Datum +brin_minmax_multi_consistent(PG_FUNCTION_ARGS) +{ + BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); + BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1); + ScanKey *keys = (ScanKey *) PG_GETARG_POINTER(2); + int nkeys = PG_GETARG_INT32(3); + + Oid colloid = PG_GET_COLLATION(), + subtype; + AttrNumber attno; + Datum value; + FmgrInfo *finfo; + SerializedRanges *serialized; + Ranges *ranges; + int keyno; + int rangeno; + int i; + + attno = column->bv_attno; + + serialized = (SerializedRanges *) PG_DETOAST_DATUM(column->bv_values[0]); + ranges = range_deserialize(serialized->maxvalues, serialized); + + /* inspect the ranges, and for each one evaluate the scan keys */ + for (rangeno = 0; rangeno < ranges->nranges; rangeno++) + { + Datum minval = ranges->values[2 * rangeno]; + Datum maxval = ranges->values[2 * rangeno + 1]; + + /* assume the range is matching, and we'll try to prove otherwise */ + bool matching = true; + + for (keyno = 0; keyno < nkeys; keyno++) + { + Datum matches; + ScanKey key = keys[keyno]; + + /* NULL keys are handled and filtered-out in bringetbitmap */ + Assert(!(key->sk_flags & SK_ISNULL)); + + attno = key->sk_attno; + subtype = key->sk_subtype; + value = key->sk_argument; + switch (key->sk_strategy) + { + case BTLessStrategyNumber: + case BTLessEqualStrategyNumber: + finfo = minmax_multi_get_strategy_procinfo(bdesc, attno, subtype, + key->sk_strategy); + /* first value from the array */ + matches = FunctionCall2Coll(finfo, colloid, minval, value); + break; + + case BTEqualStrategyNumber: + { + Datum compar; + FmgrInfo *cmpFn; + + /* by default this range does not match */ + matches = false; + + /* + * Otherwise, need to compare the new value with + * boundaries of all the ranges. First check if it's + * less than the absolute minimum, which is the first + * value in the array. + */ + cmpFn = minmax_multi_get_strategy_procinfo(bdesc, attno, subtype, + BTGreaterStrategyNumber); + compar = FunctionCall2Coll(cmpFn, colloid, minval, value); + + /* smaller than the smallest value in this range */ + if (DatumGetBool(compar)) + break; + + cmpFn = minmax_multi_get_strategy_procinfo(bdesc, attno, subtype, + BTLessStrategyNumber); + compar = FunctionCall2Coll(cmpFn, colloid, maxval, value); + + /* larger than the largest value in this range */ + if (DatumGetBool(compar)) + break; + + /* + * We haven't managed to eliminate this range, so + * consider it matching. + */ + matches = true; + + break; + } + case BTGreaterEqualStrategyNumber: + case BTGreaterStrategyNumber: + finfo = minmax_multi_get_strategy_procinfo(bdesc, attno, subtype, + key->sk_strategy); + /* last value from the array */ + matches = FunctionCall2Coll(finfo, colloid, maxval, value); + break; + + default: + /* shouldn't happen */ + elog(ERROR, "invalid strategy number %d", key->sk_strategy); + matches = 0; + break; + } + + /* the range has to match all the scan keys */ + matching &= DatumGetBool(matches); + + /* once we find a non-matching key, we're done */ + if (!matching) + break; + } + + /* + * have we found a range matching all scan keys? if yes, we're done + */ + if (matching) + PG_RETURN_DATUM(BoolGetDatum(true)); + } + + /* + * And now inspect the values. We don't bother with doing a binary search + * here, because we're dealing with serialized / fully compacted ranges, + * so there should be only very few values. 
+ */ + for (i = 0; i < ranges->nvalues; i++) + { + Datum val = ranges->values[2 * ranges->nranges + i]; + + /* assume the range is matching, and we'll try to prove otherwise */ + bool matching = true; + + for (keyno = 0; keyno < nkeys; keyno++) + { + Datum matches; + ScanKey key = keys[keyno]; + + /* we've already dealt with NULL keys at the beginning */ + if (key->sk_flags & SK_ISNULL) + continue; + + attno = key->sk_attno; + subtype = key->sk_subtype; + value = key->sk_argument; + switch (key->sk_strategy) + { + case BTLessStrategyNumber: + case BTLessEqualStrategyNumber: + case BTEqualStrategyNumber: + case BTGreaterEqualStrategyNumber: + case BTGreaterStrategyNumber: + + finfo = minmax_multi_get_strategy_procinfo(bdesc, attno, subtype, + key->sk_strategy); + matches = FunctionCall2Coll(finfo, colloid, val, value); + break; + + default: + /* shouldn't happen */ + elog(ERROR, "invalid strategy number %d", key->sk_strategy); + matches = 0; + break; + } + + /* the range has to match all the scan keys */ + matching &= DatumGetBool(matches); + + /* once we find a non-matching key, we're done */ + if (!matching) + break; + } + + /* have we found a range matching all scan keys? if yes, we're done */ + if (matching) + PG_RETURN_DATUM(BoolGetDatum(true)); + } + + PG_RETURN_DATUM(BoolGetDatum(false)); +} + +/* + * Given two BrinValues, update the first of them as a union of the summary + * values contained in both. The second one is untouched. + */ +Datum +brin_minmax_multi_union(PG_FUNCTION_ARGS) +{ + BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); + BrinValues *col_a = (BrinValues *) PG_GETARG_POINTER(1); + BrinValues *col_b = (BrinValues *) PG_GETARG_POINTER(2); + + Oid colloid = PG_GET_COLLATION(); + SerializedRanges *serialized_a; + SerializedRanges *serialized_b; + Ranges *ranges_a; + Ranges *ranges_b; + AttrNumber attno; + Form_pg_attribute attr; + ExpandedRange *eranges; + int neranges; + FmgrInfo *cmpFn, + *distanceFn; + DistanceValue *distances; + MemoryContext ctx; + MemoryContext oldctx; + + Assert(col_a->bv_attno == col_b->bv_attno); + Assert(!col_a->bv_allnulls && !col_b->bv_allnulls); + + attno = col_a->bv_attno; + attr = TupleDescAttr(bdesc->bd_tupdesc, attno - 1); + + serialized_a = (SerializedRanges *) PG_DETOAST_DATUM(col_a->bv_values[0]); + serialized_b = (SerializedRanges *) PG_DETOAST_DATUM(col_b->bv_values[0]); + + ranges_a = range_deserialize(serialized_a->maxvalues, serialized_a); + ranges_b = range_deserialize(serialized_b->maxvalues, serialized_b); + + /* make sure neither of the ranges is NULL */ + Assert(ranges_a && ranges_b); + + neranges = (ranges_a->nranges + ranges_a->nvalues) + + (ranges_b->nranges + ranges_b->nvalues); + + /* + * The distanceFn calls (which may internally call e.g. numeric_le) may + * allocate quite a bit of memory, and we must not leak it. Otherwise, + * we'd have problems e.g. when building indexes. So we create a local + * memory context and make sure we free the memory before leaving this + * function (not after every call). 
+ */ + ctx = AllocSetContextCreate(CurrentMemoryContext, + "minmax-multi context", + ALLOCSET_DEFAULT_SIZES); + + oldctx = MemoryContextSwitchTo(ctx); + + /* allocate and fill */ + eranges = (ExpandedRange *) palloc0(neranges * sizeof(ExpandedRange)); + + /* fill the expanded ranges with entries for the first range */ + fill_expanded_ranges(eranges, ranges_a->nranges + ranges_a->nvalues, + ranges_a); + + /* and now add combine ranges for the second range */ + fill_expanded_ranges(&eranges[ranges_a->nranges + ranges_a->nvalues], + ranges_b->nranges + ranges_b->nvalues, + ranges_b); + + cmpFn = minmax_multi_get_strategy_procinfo(bdesc, attno, attr->atttypid, + BTLessStrategyNumber); + + /* sort the expanded ranges */ + neranges = sort_expanded_ranges(cmpFn, colloid, eranges, neranges); + + /* + * We've loaded two different lists of expanded ranges, so some of them + * may be overlapping. So walk through them and merge them. + */ + neranges = merge_overlapping_ranges(cmpFn, colloid, eranges, neranges); + + /* check that the combine ranges are correct (no overlaps, ordering) */ + AssertCheckExpandedRanges(bdesc, colloid, attno, attr, eranges, neranges); + + /* + * If needed, reduce some of the ranges. + * + * XXX This may be fairly expensive, so maybe we should do it only when + * it's actually needed (when we have too many ranges). + */ + + /* build array of gap distances and sort them in ascending order */ + distanceFn = minmax_multi_get_procinfo(bdesc, attno, PROCNUM_DISTANCE); + distances = build_distances(distanceFn, colloid, eranges, neranges); + + /* + * See how many values would be needed to store the current ranges, and if + * needed combine as many of them to get below the threshold. The + * collapsed ranges will be stored as a single value. + * + * XXX This does not apply the load factor, as we don't expect to add more + * values to the range, so we prefer to keep as many ranges as possible. + * + * XXX Can the maxvalues be different in the two ranges? Perhaps we should + * use maximum of those? + */ + neranges = reduce_expanded_ranges(eranges, neranges, distances, + ranges_a->maxvalues, + cmpFn, colloid); + + /* update the first range summary */ + store_expanded_ranges(ranges_a, eranges, neranges); + + MemoryContextSwitchTo(oldctx); + MemoryContextDelete(ctx); + + /* cleanup and update the serialized value */ + pfree(serialized_a); + col_a->bv_values[0] = PointerGetDatum(range_serialize(ranges_a)); + + PG_RETURN_VOID(); +} + +/* + * Cache and return minmax multi opclass support procedure + * + * Return the procedure corresponding to the given function support number + * or null if it does not exist. + */ +static FmgrInfo * +minmax_multi_get_procinfo(BrinDesc *bdesc, uint16 attno, uint16 procnum) +{ + MinmaxMultiOpaque *opaque; + uint16 basenum = procnum - PROCNUM_BASE; + + /* + * We cache these in the opaque struct, to avoid repetitive syscache + * lookups. + */ + opaque = (MinmaxMultiOpaque *) bdesc->bd_info[attno - 1]->oi_opaque; + + /* + * If we already searched for this proc and didn't find it, don't bother + * searching again. 
+ */ + if (opaque->extra_proc_missing[basenum]) + return NULL; + + if (opaque->extra_procinfos[basenum].fn_oid == InvalidOid) + { + if (RegProcedureIsValid(index_getprocid(bdesc->bd_index, attno, + procnum))) + { + fmgr_info_copy(&opaque->extra_procinfos[basenum], + index_getprocinfo(bdesc->bd_index, attno, procnum), + bdesc->bd_context); + } + else + { + opaque->extra_proc_missing[basenum] = true; + return NULL; + } + } + + return &opaque->extra_procinfos[basenum]; +} + +/* + * Cache and return the procedure for the given strategy. + * + * Note: this function mirrors minmax_multi_get_strategy_procinfo; see notes + * there. If changes are made here, see that function too. + */ +static FmgrInfo * +minmax_multi_get_strategy_procinfo(BrinDesc *bdesc, uint16 attno, Oid subtype, + uint16 strategynum) +{ + MinmaxMultiOpaque *opaque; + + Assert(strategynum >= 1 && + strategynum <= BTMaxStrategyNumber); + + opaque = (MinmaxMultiOpaque *) bdesc->bd_info[attno - 1]->oi_opaque; + + /* + * We cache the procedures for the previous subtype in the opaque struct, + * to avoid repetitive syscache lookups. If the subtype changed, + * invalidate all the cached entries. + */ + if (opaque->cached_subtype != subtype) + { + uint16 i; + + for (i = 1; i <= BTMaxStrategyNumber; i++) + opaque->strategy_procinfos[i - 1].fn_oid = InvalidOid; + opaque->cached_subtype = subtype; + } + + if (opaque->strategy_procinfos[strategynum - 1].fn_oid == InvalidOid) + { + Form_pg_attribute attr; + HeapTuple tuple; + Oid opfamily, + oprid; + bool isNull; + + opfamily = bdesc->bd_index->rd_opfamily[attno - 1]; + attr = TupleDescAttr(bdesc->bd_tupdesc, attno - 1); + tuple = SearchSysCache4(AMOPSTRATEGY, ObjectIdGetDatum(opfamily), + ObjectIdGetDatum(attr->atttypid), + ObjectIdGetDatum(subtype), + Int16GetDatum(strategynum)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "missing operator %d(%u,%u) in opfamily %u", + strategynum, attr->atttypid, subtype, opfamily); + + oprid = DatumGetObjectId(SysCacheGetAttr(AMOPSTRATEGY, tuple, + Anum_pg_amop_amopopr, &isNull)); + ReleaseSysCache(tuple); + Assert(!isNull && RegProcedureIsValid(oprid)); + + fmgr_info_cxt(get_opcode(oprid), + &opaque->strategy_procinfos[strategynum - 1], + bdesc->bd_context); + } + + return &opaque->strategy_procinfos[strategynum - 1]; +} + +Datum +brin_minmax_multi_options(PG_FUNCTION_ARGS) +{ + local_relopts *relopts = (local_relopts *) PG_GETARG_POINTER(0); + + init_local_reloptions(relopts, sizeof(MinMaxMultiOptions)); + + add_local_int_reloption(relopts, "values_per_range", "desc", + MINMAX_MULTI_DEFAULT_VALUES_PER_PAGE, 8, 256, + offsetof(MinMaxMultiOptions, valuesPerRange)); + + PG_RETURN_VOID(); +} + +/* + * brin_minmax_multi_summary_in + * - input routine for type brin_minmax_multi_summary. + * + * brin_minmax_multi_summary is only used internally to represent summaries + * in BRIN minmax-multi indexes, so it has no operations of its own, and we + * disallow input too. + */ +Datum +brin_minmax_multi_summary_in(PG_FUNCTION_ARGS) +{ + /* + * brin_minmax_multi_summary stores the data in binary form and parsing + * text input is not needed, so disallow this. + */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "brin_minmax_multi_summary"))); + + PG_RETURN_VOID(); /* keep compiler quiet */ +} + + +/* + * brin_minmax_multi_summary_out + * - output routine for type brin_minmax_multi_summary. 
+ * + * BRIN minmax-multi summaries are serialized into a bytea value, but we + * want to output something nicer humans can understand. + */ +Datum +brin_minmax_multi_summary_out(PG_FUNCTION_ARGS) +{ + int i; + int idx; + SerializedRanges *ranges; + Ranges *ranges_deserialized; + StringInfoData str; + bool isvarlena; + Oid outfunc; + FmgrInfo fmgrinfo; + ArrayBuildState *astate_values = NULL; + + initStringInfo(&str); + appendStringInfoChar(&str, '{'); + + /* + * Detoast to get value with full 4B header (can't be stored in a toast + * table, but can use 1B header). + */ + ranges = (SerializedRanges *) PG_DETOAST_DATUM(PG_GETARG_BYTEA_PP(0)); + + /* lookup output func for the type */ + getTypeOutputInfo(ranges->typid, &outfunc, &isvarlena); + fmgr_info(outfunc, &fmgrinfo); + + /* deserialize the range info easy-to-process pieces */ + ranges_deserialized = range_deserialize(ranges->maxvalues, ranges); + + appendStringInfo(&str, "nranges: %u nvalues: %u maxvalues: %u", + ranges_deserialized->nranges, + ranges_deserialized->nvalues, + ranges_deserialized->maxvalues); + + /* serialize ranges */ + idx = 0; + for (i = 0; i < ranges_deserialized->nranges; i++) + { + char *a, + *b; + text *c; + StringInfoData str; + + initStringInfo(&str); + + a = OutputFunctionCall(&fmgrinfo, ranges_deserialized->values[idx++]); + b = OutputFunctionCall(&fmgrinfo, ranges_deserialized->values[idx++]); + + appendStringInfo(&str, "%s ... %s", a, b); + + c = cstring_to_text(str.data); + + astate_values = accumArrayResult(astate_values, + PointerGetDatum(c), + false, + TEXTOID, + CurrentMemoryContext); + } + + if (ranges_deserialized->nranges > 0) + { + Oid typoutput; + bool typIsVarlena; + Datum val; + char *extval; + + getTypeOutputInfo(ANYARRAYOID, &typoutput, &typIsVarlena); + + val = PointerGetDatum(makeArrayResult(astate_values, CurrentMemoryContext)); + + extval = OidOutputFunctionCall(typoutput, val); + + appendStringInfo(&str, " ranges: %s", extval); + } + + /* serialize individual values */ + astate_values = NULL; + + for (i = 0; i < ranges_deserialized->nvalues; i++) + { + Datum a; + text *b; + StringInfoData str; + + initStringInfo(&str); + + a = FunctionCall1(&fmgrinfo, ranges_deserialized->values[idx++]); + + appendStringInfoString(&str, DatumGetCString(a)); + + b = cstring_to_text(str.data); + + astate_values = accumArrayResult(astate_values, + PointerGetDatum(b), + false, + TEXTOID, + CurrentMemoryContext); + } + + if (ranges_deserialized->nvalues > 0) + { + Oid typoutput; + bool typIsVarlena; + Datum val; + char *extval; + + getTypeOutputInfo(ANYARRAYOID, &typoutput, &typIsVarlena); + + val = PointerGetDatum(makeArrayResult(astate_values, CurrentMemoryContext)); + + extval = OidOutputFunctionCall(typoutput, val); + + appendStringInfo(&str, " values: %s", extval); + } + + + appendStringInfoChar(&str, '}'); + + PG_RETURN_CSTRING(str.data); +} + +/* + * brin_minmax_multi_summary_recv + * - binary input routine for type brin_minmax_multi_summary. + */ +Datum +brin_minmax_multi_summary_recv(PG_FUNCTION_ARGS) +{ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "brin_minmax_multi_summary"))); + + PG_RETURN_VOID(); /* keep compiler quiet */ +} + +/* + * brin_minmax_multi_summary_send + * - binary output routine for type brin_minmax_multi_summary. + * + * BRIN minmax-multi summaries are serialized in a bytea value (although + * the type is named differently), so let's just send that. 
+ */ +Datum +brin_minmax_multi_summary_send(PG_FUNCTION_ARGS) +{ + return byteasend(fcinfo); +} diff --git a/src/backend/access/brin/brin_pageops.c b/src/backend/access/brin/brin_pageops.c new file mode 100644 index 0000000..992b33a --- /dev/null +++ b/src/backend/access/brin/brin_pageops.c @@ -0,0 +1,920 @@ +/* + * brin_pageops.c + * Page-handling routines for BRIN indexes + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/brin/brin_pageops.c + */ +#include "postgres.h" + +#include "access/brin_page.h" +#include "access/brin_pageops.h" +#include "access/brin_revmap.h" +#include "access/brin_xlog.h" +#include "access/xloginsert.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "storage/freespace.h" +#include "storage/lmgr.h" +#include "storage/smgr.h" +#include "utils/rel.h" + +/* + * Maximum size of an entry in a BRIN_PAGETYPE_REGULAR page. We can tolerate + * a single item per page, unlike other index AMs. + */ +#define BrinMaxItemSize \ + MAXALIGN_DOWN(BLCKSZ - \ + (MAXALIGN(SizeOfPageHeaderData + \ + sizeof(ItemIdData)) + \ + MAXALIGN(sizeof(BrinSpecialSpace)))) + +static Buffer brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, + bool *extended); +static Size br_page_get_freespace(Page page); +static void brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer); + + +/* + * Update tuple origtup (size origsz), located in offset oldoff of buffer + * oldbuf, to newtup (size newsz) as summary tuple for the page range starting + * at heapBlk. oldbuf must not be locked on entry, and is not locked at exit. + * + * If samepage is true, attempt to put the new tuple in the same page, but if + * there's no room, use some other one. + * + * If the update is successful, return true; the revmap is updated to point to + * the new tuple. If the update is not done for whatever reason, return false. + * Caller may retry the update if this happens. + */ +bool +brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, + BrinRevmap *revmap, BlockNumber heapBlk, + Buffer oldbuf, OffsetNumber oldoff, + const BrinTuple *origtup, Size origsz, + const BrinTuple *newtup, Size newsz, + bool samepage) +{ + Page oldpage; + ItemId oldlp; + BrinTuple *oldtup; + Size oldsz; + Buffer newbuf; + BlockNumber newblk = InvalidBlockNumber; + bool extended; + + Assert(newsz == MAXALIGN(newsz)); + + /* If the item is oversized, don't bother. */ + if (newsz > BrinMaxItemSize) + { + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("index row size %zu exceeds maximum %zu for index \"%s\"", + newsz, BrinMaxItemSize, RelationGetRelationName(idxrel)))); + return false; /* keep compiler quiet */ + } + + /* make sure the revmap is long enough to contain the entry we need */ + brinRevmapExtend(revmap, heapBlk); + + if (!samepage) + { + /* need a page on which to put the item */ + newbuf = brin_getinsertbuffer(idxrel, oldbuf, newsz, &extended); + if (!BufferIsValid(newbuf)) + { + Assert(!extended); + return false; + } + + /* + * Note: it's possible (though unlikely) that the returned newbuf is + * the same as oldbuf, if brin_getinsertbuffer determined that the old + * buffer does in fact have enough space. 
+ */ + if (newbuf == oldbuf) + { + Assert(!extended); + newbuf = InvalidBuffer; + } + else + newblk = BufferGetBlockNumber(newbuf); + } + else + { + LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE); + newbuf = InvalidBuffer; + extended = false; + } + oldpage = BufferGetPage(oldbuf); + oldlp = PageGetItemId(oldpage, oldoff); + + /* + * Check that the old tuple wasn't updated concurrently: it might have + * moved someplace else entirely, and for that matter the whole page + * might've become a revmap page. Note that in the first two cases + * checked here, the "oldlp" we just calculated is garbage; but + * PageGetItemId() is simple enough that it was safe to do that + * calculation anyway. + */ + if (!BRIN_IS_REGULAR_PAGE(oldpage) || + oldoff > PageGetMaxOffsetNumber(oldpage) || + !ItemIdIsNormal(oldlp)) + { + LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); + + /* + * If this happens, and the new buffer was obtained by extending the + * relation, then we need to ensure we don't leave it uninitialized or + * forget about it. + */ + if (BufferIsValid(newbuf)) + { + if (extended) + brin_initialize_empty_new_buffer(idxrel, newbuf); + UnlockReleaseBuffer(newbuf); + if (extended) + FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1); + } + return false; + } + + oldsz = ItemIdGetLength(oldlp); + oldtup = (BrinTuple *) PageGetItem(oldpage, oldlp); + + /* + * ... or it might have been updated in place to different contents. + */ + if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz)) + { + LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); + if (BufferIsValid(newbuf)) + { + /* As above, initialize and record new page if we got one */ + if (extended) + brin_initialize_empty_new_buffer(idxrel, newbuf); + UnlockReleaseBuffer(newbuf); + if (extended) + FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1); + } + return false; + } + + /* + * Great, the old tuple is intact. We can proceed with the update. + * + * If there's enough room in the old page for the new tuple, replace it. + * + * Note that there might now be enough space on the page even though the + * caller told us there isn't, if a concurrent update moved another tuple + * elsewhere or replaced a tuple with a smaller one. + */ + if (((BrinPageFlags(oldpage) & BRIN_EVACUATE_PAGE) == 0) && + brin_can_do_samepage_update(oldbuf, origsz, newsz)) + { + START_CRIT_SECTION(); + if (!PageIndexTupleOverwrite(oldpage, oldoff, (Item) unconstify(BrinTuple *, newtup), newsz)) + elog(ERROR, "failed to replace BRIN tuple"); + MarkBufferDirty(oldbuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(idxrel)) + { + xl_brin_samepage_update xlrec; + XLogRecPtr recptr; + uint8 info = XLOG_BRIN_SAMEPAGE_UPDATE; + + xlrec.offnum = oldoff; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBrinSamepageUpdate); + + XLogRegisterBuffer(0, oldbuf, REGBUF_STANDARD); + XLogRegisterBufData(0, (char *) unconstify(BrinTuple *, newtup), newsz); + + recptr = XLogInsert(RM_BRIN_ID, info); + + PageSetLSN(oldpage, recptr); + } + + END_CRIT_SECTION(); + + LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); + + if (BufferIsValid(newbuf)) + { + /* As above, initialize and record new page if we got one */ + if (extended) + brin_initialize_empty_new_buffer(idxrel, newbuf); + UnlockReleaseBuffer(newbuf); + if (extended) + FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1); + } + + return true; + } + else if (newbuf == InvalidBuffer) + { + /* + * Not enough space, but caller said that there was. Tell them to + * start over. 
+ */ + LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); + return false; + } + else + { + /* + * Not enough free space on the oldpage. Put the new tuple on the new + * page, and update the revmap. + */ + Page newpage = BufferGetPage(newbuf); + Buffer revmapbuf; + ItemPointerData newtid; + OffsetNumber newoff; + Size freespace = 0; + + revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk); + + START_CRIT_SECTION(); + + /* + * We need to initialize the page if it's newly obtained. Note we + * will WAL-log the initialization as part of the update, so we don't + * need to do that here. + */ + if (extended) + brin_page_init(newpage, BRIN_PAGETYPE_REGULAR); + + PageIndexTupleDeleteNoCompact(oldpage, oldoff); + newoff = PageAddItem(newpage, (Item) unconstify(BrinTuple *, newtup), newsz, + InvalidOffsetNumber, false, false); + if (newoff == InvalidOffsetNumber) + elog(ERROR, "failed to add BRIN tuple to new page"); + MarkBufferDirty(oldbuf); + MarkBufferDirty(newbuf); + + /* needed to update FSM below */ + if (extended) + freespace = br_page_get_freespace(newpage); + + ItemPointerSet(&newtid, newblk, newoff); + brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid); + MarkBufferDirty(revmapbuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(idxrel)) + { + xl_brin_update xlrec; + XLogRecPtr recptr; + uint8 info; + + info = XLOG_BRIN_UPDATE | (extended ? XLOG_BRIN_INIT_PAGE : 0); + + xlrec.insert.offnum = newoff; + xlrec.insert.heapBlk = heapBlk; + xlrec.insert.pagesPerRange = pagesPerRange; + xlrec.oldOffnum = oldoff; + + XLogBeginInsert(); + + /* new page */ + XLogRegisterData((char *) &xlrec, SizeOfBrinUpdate); + + XLogRegisterBuffer(0, newbuf, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0)); + XLogRegisterBufData(0, (char *) unconstify(BrinTuple *, newtup), newsz); + + /* revmap page */ + XLogRegisterBuffer(1, revmapbuf, 0); + + /* old page */ + XLogRegisterBuffer(2, oldbuf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_BRIN_ID, info); + + PageSetLSN(oldpage, recptr); + PageSetLSN(newpage, recptr); + PageSetLSN(BufferGetPage(revmapbuf), recptr); + } + + END_CRIT_SECTION(); + + LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK); + LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); + UnlockReleaseBuffer(newbuf); + + if (extended) + { + RecordPageWithFreeSpace(idxrel, newblk, freespace); + FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1); + } + + return true; + } +} + +/* + * Return whether brin_doupdate can do a samepage update. + */ +bool +brin_can_do_samepage_update(Buffer buffer, Size origsz, Size newsz) +{ + return + ((newsz <= origsz) || + PageGetExactFreeSpace(BufferGetPage(buffer)) >= (newsz - origsz)); +} + +/* + * Insert an index tuple into the index relation. The revmap is updated to + * mark the range containing the given page as pointing to the inserted entry. + * A WAL record is written. + * + * The buffer, if valid, is first checked for free space to insert the new + * entry; if there isn't enough, a new buffer is obtained and pinned. No + * buffer lock must be held on entry, no buffer lock is held on exit. + * + * Return value is the offset number where the tuple was inserted. + */ +OffsetNumber +brin_doinsert(Relation idxrel, BlockNumber pagesPerRange, + BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk, + BrinTuple *tup, Size itemsz) +{ + Page page; + BlockNumber blk; + OffsetNumber off; + Size freespace = 0; + Buffer revmapbuf; + ItemPointerData tid; + bool extended; + + Assert(itemsz == MAXALIGN(itemsz)); + + /* If the item is oversized, don't even bother. 
*/ + if (itemsz > BrinMaxItemSize) + { + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("index row size %zu exceeds maximum %zu for index \"%s\"", + itemsz, BrinMaxItemSize, RelationGetRelationName(idxrel)))); + return InvalidOffsetNumber; /* keep compiler quiet */ + } + + /* Make sure the revmap is long enough to contain the entry we need */ + brinRevmapExtend(revmap, heapBlk); + + /* + * Acquire lock on buffer supplied by caller, if any. If it doesn't have + * enough space, unpin it to obtain a new one below. + */ + if (BufferIsValid(*buffer)) + { + /* + * It's possible that another backend (or ourselves!) extended the + * revmap over the page we held a pin on, so we cannot assume that + * it's still a regular page. + */ + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + if (br_page_get_freespace(BufferGetPage(*buffer)) < itemsz) + { + UnlockReleaseBuffer(*buffer); + *buffer = InvalidBuffer; + } + } + + /* + * If we still don't have a usable buffer, have brin_getinsertbuffer + * obtain one for us. + */ + if (!BufferIsValid(*buffer)) + { + do + *buffer = brin_getinsertbuffer(idxrel, InvalidBuffer, itemsz, &extended); + while (!BufferIsValid(*buffer)); + } + else + extended = false; + + /* Now obtain lock on revmap buffer */ + revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk); + + page = BufferGetPage(*buffer); + blk = BufferGetBlockNumber(*buffer); + + /* Execute the actual insertion */ + START_CRIT_SECTION(); + if (extended) + brin_page_init(page, BRIN_PAGETYPE_REGULAR); + off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber, + false, false); + if (off == InvalidOffsetNumber) + elog(ERROR, "failed to add BRIN tuple to new page"); + MarkBufferDirty(*buffer); + + /* needed to update FSM below */ + if (extended) + freespace = br_page_get_freespace(page); + + ItemPointerSet(&tid, blk, off); + brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, tid); + MarkBufferDirty(revmapbuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(idxrel)) + { + xl_brin_insert xlrec; + XLogRecPtr recptr; + uint8 info; + + info = XLOG_BRIN_INSERT | (extended ? XLOG_BRIN_INIT_PAGE : 0); + xlrec.heapBlk = heapBlk; + xlrec.pagesPerRange = pagesPerRange; + xlrec.offnum = off; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBrinInsert); + + XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0)); + XLogRegisterBufData(0, (char *) tup, itemsz); + + XLogRegisterBuffer(1, revmapbuf, 0); + + recptr = XLogInsert(RM_BRIN_ID, info); + + PageSetLSN(page, recptr); + PageSetLSN(BufferGetPage(revmapbuf), recptr); + } + + END_CRIT_SECTION(); + + /* Tuple is firmly on buffer; we can release our locks */ + LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); + LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK); + + BRIN_elog((DEBUG2, "inserted tuple (%u,%u) for range starting at %u", + blk, off, heapBlk)); + + if (extended) + { + RecordPageWithFreeSpace(idxrel, blk, freespace); + FreeSpaceMapVacuumRange(idxrel, blk, blk + 1); + } + + return off; +} + +/* + * Initialize a page with the given type. + * + * Caller is responsible for marking it dirty, as appropriate. + */ +void +brin_page_init(Page page, uint16 type) +{ + PageInit(page, BLCKSZ, sizeof(BrinSpecialSpace)); + + BrinPageType(page) = type; +} + +/* + * Initialize a new BRIN index's metapage. 
+ */ +void +brin_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version) +{ + BrinMetaPageData *metadata; + + brin_page_init(page, BRIN_PAGETYPE_META); + + metadata = (BrinMetaPageData *) PageGetContents(page); + + metadata->brinMagic = BRIN_META_MAGIC; + metadata->brinVersion = version; + metadata->pagesPerRange = pagesPerRange; + + /* + * Note we cheat here a little. 0 is not a valid revmap block number + * (because it's the metapage buffer), but doing this enables the first + * revmap page to be created when the index is. + */ + metadata->lastRevmapPage = 0; + + /* + * Set pd_lower just past the end of the metadata. This is essential, + * because without doing so, metadata will be lost if xlog.c compresses + * the page. + */ + ((PageHeader) page)->pd_lower = + ((char *) metadata + sizeof(BrinMetaPageData)) - (char *) page; +} + +/* + * Initiate page evacuation protocol. + * + * The page must be locked in exclusive mode by the caller. + * + * If the page is not yet initialized or empty, return false without doing + * anything; it can be used for revmap without any further changes. If it + * contains tuples, mark it for evacuation and return true. + */ +bool +brin_start_evacuating_page(Relation idxRel, Buffer buf) +{ + OffsetNumber off; + OffsetNumber maxoff; + Page page; + + page = BufferGetPage(buf); + + if (PageIsNew(page)) + return false; + + maxoff = PageGetMaxOffsetNumber(page); + for (off = FirstOffsetNumber; off <= maxoff; off++) + { + ItemId lp; + + lp = PageGetItemId(page, off); + if (ItemIdIsUsed(lp)) + { + /* + * Prevent other backends from adding more stuff to this page: + * BRIN_EVACUATE_PAGE informs br_page_get_freespace that this page + * can no longer be used to add new tuples. Note that this flag + * is not WAL-logged, except accidentally. + */ + BrinPageFlags(page) |= BRIN_EVACUATE_PAGE; + MarkBufferDirtyHint(buf, true); + + return true; + } + } + return false; +} + +/* + * Move all tuples out of a page. + * + * The caller must hold lock on the page. The lock and pin are released. + */ +void +brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange, + BrinRevmap *revmap, Buffer buf) +{ + OffsetNumber off; + OffsetNumber maxoff; + Page page; + BrinTuple *btup = NULL; + Size btupsz = 0; + + page = BufferGetPage(buf); + + Assert(BrinPageFlags(page) & BRIN_EVACUATE_PAGE); + + maxoff = PageGetMaxOffsetNumber(page); + for (off = FirstOffsetNumber; off <= maxoff; off++) + { + BrinTuple *tup; + Size sz; + ItemId lp; + + CHECK_FOR_INTERRUPTS(); + + lp = PageGetItemId(page, off); + if (ItemIdIsUsed(lp)) + { + sz = ItemIdGetLength(lp); + tup = (BrinTuple *) PageGetItem(page, lp); + tup = brin_copy_tuple(tup, sz, btup, &btupsz); + + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + if (!brin_doupdate(idxRel, pagesPerRange, revmap, tup->bt_blkno, + buf, off, tup, sz, tup, sz, false)) + off--; /* retry */ + + LockBuffer(buf, BUFFER_LOCK_SHARE); + + /* It's possible that someone extended the revmap over this page */ + if (!BRIN_IS_REGULAR_PAGE(page)) + break; + } + } + + UnlockReleaseBuffer(buf); +} + +/* + * Given a BRIN index page, initialize it if necessary, and record its + * current free space in the FSM. + * + * The main use for this is when, during vacuuming, an uninitialized page is + * found, which could be the result of relation extension followed by a crash + * before the page can be used. + * + * Here, we don't bother to update upper FSM pages, instead expecting that our + * caller (brin_vacuum_scan) will fix them at the end of the scan. 
Elsewhere
+ * in this file, it's generally a good idea to propagate additions of free
+ * space into the upper FSM pages immediately.
+ */
+void
+brin_page_cleanup(Relation idxrel, Buffer buf)
+{
+	Page page = BufferGetPage(buf);
+
+	/*
+	 * If a page was left uninitialized, initialize it now; also record it in
+	 * FSM.
+	 *
+	 * Somebody else might be extending the relation concurrently. To avoid
+	 * re-initializing the page before they can grab the buffer lock, we
+	 * acquire the extension lock momentarily. Since they hold the extension
+	 * lock from before getting the page until after it's been initialized,
+	 * we're sure to see their initialization.
+	 */
+	if (PageIsNew(page))
+	{
+		LockRelationForExtension(idxrel, ShareLock);
+		UnlockRelationForExtension(idxrel, ShareLock);
+
+		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+		if (PageIsNew(page))
+		{
+			brin_initialize_empty_new_buffer(idxrel, buf);
+			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+			return;
+		}
+		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+	}
+
+	/* Nothing to be done for non-regular index pages */
+	if (BRIN_IS_META_PAGE(BufferGetPage(buf)) ||
+		BRIN_IS_REVMAP_PAGE(BufferGetPage(buf)))
+		return;
+
+	/* Measure free space and record it */
+	RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buf),
+							br_page_get_freespace(page));
+}
+
+/*
+ * Return a pinned and exclusively locked buffer which can be used to insert an
+ * index item of size itemsz (caller must ensure not to request sizes
+ * impossible to fulfill). If oldbuf is a valid buffer, it is also locked (in
+ * an order determined to avoid deadlocks).
+ *
+ * If we find that the old page is no longer a regular index page (because
+ * of a revmap extension), the old buffer is unlocked and we return
+ * InvalidBuffer.
+ *
+ * If there's no existing page with enough free space to accommodate the new
+ * item, the relation is extended. If this happens, *extended is set to true,
+ * and it is the caller's responsibility to initialize the page (and WAL-log
+ * that fact) prior to use. The caller should also update the FSM with the
+ * page's remaining free space after the insertion.
+ *
+ * Note that the caller is not expected to update FSM unless *extended is set
+ * true. This policy means that we'll update FSM when a page is created, and
+ * when it's found to have too little space for a desired tuple insertion,
+ * but not every single time we add a tuple to the page.
+ *
+ * Note that in some corner cases it is possible for this routine to extend
+ * the relation and then not return the new page. It is this routine's
+ * responsibility to WAL-log the page initialization and to record the page in
+ * FSM if that happens, since the caller certainly can't do it.
+ */
+static Buffer
+brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
+					 bool *extended)
+{
+	BlockNumber oldblk;
+	BlockNumber newblk;
+	Page page;
+	Size freespace;
+
+	/* callers must have checked */
+	Assert(itemsz <= BrinMaxItemSize);
+
+	if (BufferIsValid(oldbuf))
+		oldblk = BufferGetBlockNumber(oldbuf);
+	else
+		oldblk = InvalidBlockNumber;
+
+	/* Choose initial target page, re-using existing target if known */
+	newblk = RelationGetTargetBlock(irel);
+	if (newblk == InvalidBlockNumber)
+		newblk = GetPageWithFreeSpace(irel, itemsz);
+
+	/*
+	 * Loop until we find a page with sufficient free space. By the time we
+	 * return to caller out of this loop, both buffers are valid and locked;
+	 * if we have to restart here, neither page is locked and newblk isn't
+	 * pinned (if it's even valid).
+ */ + for (;;) + { + Buffer buf; + bool extensionLockHeld = false; + + CHECK_FOR_INTERRUPTS(); + + *extended = false; + + if (newblk == InvalidBlockNumber) + { + /* + * There's not enough free space in any existing index page, + * according to the FSM: extend the relation to obtain a shiny new + * page. + */ + if (!RELATION_IS_LOCAL(irel)) + { + LockRelationForExtension(irel, ExclusiveLock); + extensionLockHeld = true; + } + buf = ReadBuffer(irel, P_NEW); + newblk = BufferGetBlockNumber(buf); + *extended = true; + + BRIN_elog((DEBUG2, "brin_getinsertbuffer: extending to page %u", + BufferGetBlockNumber(buf))); + } + else if (newblk == oldblk) + { + /* + * There's an odd corner-case here where the FSM is out-of-date, + * and gave us the old page. + */ + buf = oldbuf; + } + else + { + buf = ReadBuffer(irel, newblk); + } + + /* + * We lock the old buffer first, if it's earlier than the new one; but + * then we need to check that it hasn't been turned into a revmap page + * concurrently. If we detect that that happened, give up and tell + * caller to start over. + */ + if (BufferIsValid(oldbuf) && oldblk < newblk) + { + LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE); + if (!BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf))) + { + LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); + + /* + * It is possible that the new page was obtained from + * extending the relation. In that case, we must be sure to + * record it in the FSM before leaving, because otherwise the + * space would be lost forever. However, we cannot let an + * uninitialized page get in the FSM, so we need to initialize + * it first. + */ + if (*extended) + brin_initialize_empty_new_buffer(irel, buf); + + if (extensionLockHeld) + UnlockRelationForExtension(irel, ExclusiveLock); + + ReleaseBuffer(buf); + + if (*extended) + { + FreeSpaceMapVacuumRange(irel, newblk, newblk + 1); + /* shouldn't matter, but don't confuse caller */ + *extended = false; + } + + return InvalidBuffer; + } + } + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + if (extensionLockHeld) + UnlockRelationForExtension(irel, ExclusiveLock); + + page = BufferGetPage(buf); + + /* + * We have a new buffer to insert into. Check that the new page has + * enough free space, and return it if it does; otherwise start over. + * (br_page_get_freespace also checks that the FSM didn't hand us a + * page that has since been repurposed for the revmap.) + */ + freespace = *extended ? + BrinMaxItemSize : br_page_get_freespace(page); + if (freespace >= itemsz) + { + RelationSetTargetBlock(irel, newblk); + + /* + * Lock the old buffer if not locked already. Note that in this + * case we know for sure it's a regular page: it's later than the + * new page we just got, which is not a revmap page, and revmap + * pages are always consecutive. + */ + if (BufferIsValid(oldbuf) && oldblk > newblk) + { + LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE); + Assert(BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf))); + } + + return buf; + } + + /* This page is no good. */ + + /* + * If an entirely new page does not contain enough free space for the + * new item, then surely that item is oversized. Complain loudly; but + * first make sure we initialize the page and record it as free, for + * next time. 
+	 */
+	if (*extended)
+	{
+		brin_initialize_empty_new_buffer(irel, buf);
+		/* since this should not happen, skip FreeSpaceMapVacuum */
+
+		ereport(ERROR,
+				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+				 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
+						itemsz, freespace, RelationGetRelationName(irel))));
+		return InvalidBuffer;	/* keep compiler quiet */
+	}
+
+	if (newblk != oldblk)
+		UnlockReleaseBuffer(buf);
+	if (BufferIsValid(oldbuf) && oldblk <= newblk)
+		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+
+	/*
+	 * Update the FSM with the new, presumably smaller, freespace value
+	 * for this page, then search for a new target page.
+	 */
+	newblk = RecordAndGetPageWithFreeSpace(irel, newblk, freespace, itemsz);
+	}
+}
+
+/*
+ * Initialize a page as an empty regular BRIN page, WAL-log this, and record
+ * the page in FSM.
+ *
+ * There are several corner situations in which we extend the relation to
+ * obtain a new page and later find that we cannot use it immediately. When
+ * that happens, we don't want to let the page go unrecorded in FSM, because
+ * there is no mechanism to get the space back and the index would bloat.
+ * Also, because we would not WAL-log the action that would initialize the
+ * page, the page would go uninitialized in a standby (or after recovery).
+ *
+ * While we record the page in FSM here, caller is responsible for doing FSM
+ * upper-page update if that seems appropriate.
+ */
+static void
+brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer)
+{
+	Page page;
+
+	BRIN_elog((DEBUG2,
+			   "brin_initialize_empty_new_buffer: initializing blank page %u",
+			   BufferGetBlockNumber(buffer)));
+
+	START_CRIT_SECTION();
+	page = BufferGetPage(buffer);
+	brin_page_init(page, BRIN_PAGETYPE_REGULAR);
+	MarkBufferDirty(buffer);
+	log_newpage_buffer(buffer, true);
+	END_CRIT_SECTION();
+
+	/*
+	 * We update the FSM for this page, but this is not WAL-logged. This is
+	 * acceptable because VACUUM will scan the index and update the FSM with
+	 * pages whose FSM records were forgotten in a crash.
+	 */
+	RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buffer),
+							br_page_get_freespace(page));
+}
+
+
+/*
+ * Return the amount of free space on a regular BRIN index page.
+ *
+ * If the page is not a regular page, or has been marked with the
+ * BRIN_EVACUATE_PAGE flag, returns 0.
+ */
+static Size
+br_page_get_freespace(Page page)
+{
+	if (!BRIN_IS_REGULAR_PAGE(page) ||
+		(BrinPageFlags(page) & BRIN_EVACUATE_PAGE) != 0)
+		return 0;
+	else
+		return PageGetFreeSpace(page);
+}
diff --git a/src/backend/access/brin/brin_revmap.c b/src/backend/access/brin/brin_revmap.c
new file mode 100644
index 0000000..c574c8a
--- /dev/null
+++ b/src/backend/access/brin/brin_revmap.c
@@ -0,0 +1,664 @@
+/*
+ * brin_revmap.c
+ *		Range map for BRIN indexes
+ *
+ * The range map (revmap) is a translation structure for BRIN indexes: for each
+ * page range there is one summary tuple, and its location is tracked by the
+ * revmap. Whenever a new tuple is inserted into a table that violates the
+ * previously recorded summary values, a new tuple is inserted into the index
+ * and the revmap is updated to point to it.
+ *
+ * The revmap is stored in the first pages of the index, immediately following
+ * the metapage. When the revmap needs to be expanded, all tuples on the
+ * regular BRIN page at that block (if any) are moved out of the way.
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/brin/brin_revmap.c + */ +#include "postgres.h" + +#include "access/brin_page.h" +#include "access/brin_pageops.h" +#include "access/brin_revmap.h" +#include "access/brin_tuple.h" +#include "access/brin_xlog.h" +#include "access/rmgr.h" +#include "access/xloginsert.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "storage/lmgr.h" +#include "utils/rel.h" + + +/* + * In revmap pages, each item stores an ItemPointerData. These defines let one + * find the logical revmap page number and index number of the revmap item for + * the given heap block number. + */ +#define HEAPBLK_TO_REVMAP_BLK(pagesPerRange, heapBlk) \ + ((heapBlk / pagesPerRange) / REVMAP_PAGE_MAXITEMS) +#define HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk) \ + ((heapBlk / pagesPerRange) % REVMAP_PAGE_MAXITEMS) + + +struct BrinRevmap +{ + Relation rm_irel; + BlockNumber rm_pagesPerRange; + BlockNumber rm_lastRevmapPage; /* cached from the metapage */ + Buffer rm_metaBuf; + Buffer rm_currBuf; +}; + +/* typedef appears in brin_revmap.h */ + + +static BlockNumber revmap_get_blkno(BrinRevmap *revmap, + BlockNumber heapBlk); +static Buffer revmap_get_buffer(BrinRevmap *revmap, BlockNumber heapBlk); +static BlockNumber revmap_extend_and_get_blkno(BrinRevmap *revmap, + BlockNumber heapBlk); +static void revmap_physical_extend(BrinRevmap *revmap); + +/* + * Initialize an access object for a range map. This must be freed by + * brinRevmapTerminate when caller is done with it. + */ +BrinRevmap * +brinRevmapInitialize(Relation idxrel, BlockNumber *pagesPerRange, + Snapshot snapshot) +{ + BrinRevmap *revmap; + Buffer meta; + BrinMetaPageData *metadata; + Page page; + + meta = ReadBuffer(idxrel, BRIN_METAPAGE_BLKNO); + LockBuffer(meta, BUFFER_LOCK_SHARE); + page = BufferGetPage(meta); + TestForOldSnapshot(snapshot, idxrel, page); + metadata = (BrinMetaPageData *) PageGetContents(page); + + revmap = palloc(sizeof(BrinRevmap)); + revmap->rm_irel = idxrel; + revmap->rm_pagesPerRange = metadata->pagesPerRange; + revmap->rm_lastRevmapPage = metadata->lastRevmapPage; + revmap->rm_metaBuf = meta; + revmap->rm_currBuf = InvalidBuffer; + + *pagesPerRange = metadata->pagesPerRange; + + LockBuffer(meta, BUFFER_LOCK_UNLOCK); + + return revmap; +} + +/* + * Release resources associated with a revmap access object. + */ +void +brinRevmapTerminate(BrinRevmap *revmap) +{ + ReleaseBuffer(revmap->rm_metaBuf); + if (revmap->rm_currBuf != InvalidBuffer) + ReleaseBuffer(revmap->rm_currBuf); + pfree(revmap); +} + +/* + * Extend the revmap to cover the given heap block number. + */ +void +brinRevmapExtend(BrinRevmap *revmap, BlockNumber heapBlk) +{ + BlockNumber mapBlk PG_USED_FOR_ASSERTS_ONLY; + + mapBlk = revmap_extend_and_get_blkno(revmap, heapBlk); + + /* Ensure the buffer we got is in the expected range */ + Assert(mapBlk != InvalidBlockNumber && + mapBlk != BRIN_METAPAGE_BLKNO && + mapBlk <= revmap->rm_lastRevmapPage); +} + +/* + * Prepare to insert an entry into the revmap; the revmap buffer in which the + * entry is to reside is locked and returned. Most callers should call + * brinRevmapExtend beforehand, as this routine does not extend the revmap if + * it's not long enough. + * + * The returned buffer is also recorded in the revmap struct; finishing that + * releases the buffer, therefore the caller needn't do it explicitly. 
+ */ +Buffer +brinLockRevmapPageForUpdate(BrinRevmap *revmap, BlockNumber heapBlk) +{ + Buffer rmBuf; + + rmBuf = revmap_get_buffer(revmap, heapBlk); + LockBuffer(rmBuf, BUFFER_LOCK_EXCLUSIVE); + + return rmBuf; +} + +/* + * In the given revmap buffer (locked appropriately by caller), which is used + * in a BRIN index of pagesPerRange pages per range, set the element + * corresponding to heap block number heapBlk to the given TID. + * + * Once the operation is complete, the caller must update the LSN on the + * returned buffer. + * + * This is used both in regular operation and during WAL replay. + */ +void +brinSetHeapBlockItemptr(Buffer buf, BlockNumber pagesPerRange, + BlockNumber heapBlk, ItemPointerData tid) +{ + RevmapContents *contents; + ItemPointerData *iptr; + Page page; + + /* The correct page should already be pinned and locked */ + page = BufferGetPage(buf); + contents = (RevmapContents *) PageGetContents(page); + iptr = (ItemPointerData *) contents->rm_tids; + iptr += HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk); + + if (ItemPointerIsValid(&tid)) + ItemPointerSet(iptr, + ItemPointerGetBlockNumber(&tid), + ItemPointerGetOffsetNumber(&tid)); + else + ItemPointerSetInvalid(iptr); +} + +/* + * Fetch the BrinTuple for a given heap block. + * + * The buffer containing the tuple is locked, and returned in *buf. The + * returned tuple points to the shared buffer and must not be freed; if caller + * wants to use it after releasing the buffer lock, it must create its own + * palloc'ed copy. As an optimization, the caller can pass a pinned buffer + * *buf on entry, which will avoid a pin-unpin cycle when the next tuple is on + * the same page as a previous one. + * + * If no tuple is found for the given heap range, returns NULL. In that case, + * *buf might still be updated (and pin must be released by caller), but it's + * not locked. + * + * The output tuple offset within the buffer is returned in *off, and its size + * is returned in *size. + */ +BrinTuple * +brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk, + Buffer *buf, OffsetNumber *off, Size *size, int mode, + Snapshot snapshot) +{ + Relation idxRel = revmap->rm_irel; + BlockNumber mapBlk; + RevmapContents *contents; + ItemPointerData *iptr; + BlockNumber blk; + Page page; + ItemId lp; + BrinTuple *tup; + ItemPointerData previptr; + + /* normalize the heap block number to be the first page in the range */ + heapBlk = (heapBlk / revmap->rm_pagesPerRange) * revmap->rm_pagesPerRange; + + /* + * Compute the revmap page number we need. If Invalid is returned (i.e., + * the revmap page hasn't been created yet), the requested page range is + * not summarized. 
+ */ + mapBlk = revmap_get_blkno(revmap, heapBlk); + if (mapBlk == InvalidBlockNumber) + { + *off = InvalidOffsetNumber; + return NULL; + } + + ItemPointerSetInvalid(&previptr); + for (;;) + { + CHECK_FOR_INTERRUPTS(); + + if (revmap->rm_currBuf == InvalidBuffer || + BufferGetBlockNumber(revmap->rm_currBuf) != mapBlk) + { + if (revmap->rm_currBuf != InvalidBuffer) + ReleaseBuffer(revmap->rm_currBuf); + + Assert(mapBlk != InvalidBlockNumber); + revmap->rm_currBuf = ReadBuffer(revmap->rm_irel, mapBlk); + } + + LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_SHARE); + + contents = (RevmapContents *) + PageGetContents(BufferGetPage(revmap->rm_currBuf)); + iptr = contents->rm_tids; + iptr += HEAPBLK_TO_REVMAP_INDEX(revmap->rm_pagesPerRange, heapBlk); + + if (!ItemPointerIsValid(iptr)) + { + LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_UNLOCK); + return NULL; + } + + /* + * Check the TID we got in a previous iteration, if any, and save the + * current TID we got from the revmap; if we loop, we can sanity-check + * that the next one we get is different. Otherwise we might be stuck + * looping forever if the revmap is somehow badly broken. + */ + if (ItemPointerIsValid(&previptr) && ItemPointerEquals(&previptr, iptr)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("corrupted BRIN index: inconsistent range map"))); + previptr = *iptr; + + blk = ItemPointerGetBlockNumber(iptr); + *off = ItemPointerGetOffsetNumber(iptr); + + LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_UNLOCK); + + /* Ok, got a pointer to where the BrinTuple should be. Fetch it. */ + if (!BufferIsValid(*buf) || BufferGetBlockNumber(*buf) != blk) + { + if (BufferIsValid(*buf)) + ReleaseBuffer(*buf); + *buf = ReadBuffer(idxRel, blk); + } + LockBuffer(*buf, mode); + page = BufferGetPage(*buf); + TestForOldSnapshot(snapshot, idxRel, page); + + /* If we land on a revmap page, start over */ + if (BRIN_IS_REGULAR_PAGE(page)) + { + /* + * If the offset number is greater than what's in the page, it's + * possible that the range was desummarized concurrently. Just + * return NULL to handle that case. + */ + if (*off > PageGetMaxOffsetNumber(page)) + { + LockBuffer(*buf, BUFFER_LOCK_UNLOCK); + return NULL; + } + + lp = PageGetItemId(page, *off); + if (ItemIdIsUsed(lp)) + { + tup = (BrinTuple *) PageGetItem(page, lp); + + if (tup->bt_blkno == heapBlk) + { + if (size) + *size = ItemIdGetLength(lp); + /* found it! */ + return tup; + } + } + } + + /* + * No luck. Assume that the revmap was updated concurrently. + */ + LockBuffer(*buf, BUFFER_LOCK_UNLOCK); + } + /* not reached, but keep compiler quiet */ + return NULL; +} + +/* + * Delete an index tuple, marking a page range as unsummarized. + * + * Index must be locked in ShareUpdateExclusiveLock mode. + * + * Return false if caller should retry. 
+ */ +bool +brinRevmapDesummarizeRange(Relation idxrel, BlockNumber heapBlk) +{ + BrinRevmap *revmap; + BlockNumber pagesPerRange; + RevmapContents *contents; + ItemPointerData *iptr; + ItemPointerData invalidIptr; + BlockNumber revmapBlk; + Buffer revmapBuf; + Buffer regBuf; + Page revmapPg; + Page regPg; + OffsetNumber revmapOffset; + OffsetNumber regOffset; + ItemId lp; + + revmap = brinRevmapInitialize(idxrel, &pagesPerRange, NULL); + + revmapBlk = revmap_get_blkno(revmap, heapBlk); + if (!BlockNumberIsValid(revmapBlk)) + { + /* revmap page doesn't exist: range not summarized, we're done */ + brinRevmapTerminate(revmap); + return true; + } + + /* Lock the revmap page, obtain the index tuple pointer from it */ + revmapBuf = brinLockRevmapPageForUpdate(revmap, heapBlk); + revmapPg = BufferGetPage(revmapBuf); + revmapOffset = HEAPBLK_TO_REVMAP_INDEX(revmap->rm_pagesPerRange, heapBlk); + + contents = (RevmapContents *) PageGetContents(revmapPg); + iptr = contents->rm_tids; + iptr += revmapOffset; + + if (!ItemPointerIsValid(iptr)) + { + /* no index tuple: range not summarized, we're done */ + LockBuffer(revmapBuf, BUFFER_LOCK_UNLOCK); + brinRevmapTerminate(revmap); + return true; + } + + regBuf = ReadBuffer(idxrel, ItemPointerGetBlockNumber(iptr)); + LockBuffer(regBuf, BUFFER_LOCK_EXCLUSIVE); + regPg = BufferGetPage(regBuf); + + /* + * We're only removing data, not reading it, so there's no need to + * TestForOldSnapshot here. + */ + + /* if this is no longer a regular page, tell caller to start over */ + if (!BRIN_IS_REGULAR_PAGE(regPg)) + { + LockBuffer(revmapBuf, BUFFER_LOCK_UNLOCK); + LockBuffer(regBuf, BUFFER_LOCK_UNLOCK); + brinRevmapTerminate(revmap); + return false; + } + + regOffset = ItemPointerGetOffsetNumber(iptr); + if (regOffset > PageGetMaxOffsetNumber(regPg)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("corrupted BRIN index: inconsistent range map"))); + + lp = PageGetItemId(regPg, regOffset); + if (!ItemIdIsUsed(lp)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("corrupted BRIN index: inconsistent range map"))); + + /* + * Placeholder tuples only appear during unfinished summarization, and we + * hold ShareUpdateExclusiveLock, so this function cannot run concurrently + * with that. So any placeholder tuples that exist are leftovers from a + * crashed or aborted summarization; remove them silently. + */ + + START_CRIT_SECTION(); + + ItemPointerSetInvalid(&invalidIptr); + brinSetHeapBlockItemptr(revmapBuf, revmap->rm_pagesPerRange, heapBlk, + invalidIptr); + PageIndexTupleDeleteNoCompact(regPg, regOffset); + /* XXX record free space in FSM? */ + + MarkBufferDirty(regBuf); + MarkBufferDirty(revmapBuf); + + if (RelationNeedsWAL(idxrel)) + { + xl_brin_desummarize xlrec; + XLogRecPtr recptr; + + xlrec.pagesPerRange = revmap->rm_pagesPerRange; + xlrec.heapBlk = heapBlk; + xlrec.regOffset = regOffset; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBrinDesummarize); + XLogRegisterBuffer(0, revmapBuf, 0); + XLogRegisterBuffer(1, regBuf, REGBUF_STANDARD); + recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_DESUMMARIZE); + PageSetLSN(revmapPg, recptr); + PageSetLSN(regPg, recptr); + } + + END_CRIT_SECTION(); + + UnlockReleaseBuffer(regBuf); + LockBuffer(revmapBuf, BUFFER_LOCK_UNLOCK); + brinRevmapTerminate(revmap); + + return true; +} + +/* + * Given a heap block number, find the corresponding physical revmap block + * number and return it. If the revmap page hasn't been allocated yet, return + * InvalidBlockNumber. 
+ */ +static BlockNumber +revmap_get_blkno(BrinRevmap *revmap, BlockNumber heapBlk) +{ + BlockNumber targetblk; + + /* obtain revmap block number, skip 1 for metapage block */ + targetblk = HEAPBLK_TO_REVMAP_BLK(revmap->rm_pagesPerRange, heapBlk) + 1; + + /* Normal case: the revmap page is already allocated */ + if (targetblk <= revmap->rm_lastRevmapPage) + return targetblk; + + return InvalidBlockNumber; +} + +/* + * Obtain and return a buffer containing the revmap page for the given heap + * page. The revmap must have been previously extended to cover that page. + * The returned buffer is also recorded in the revmap struct; finishing that + * releases the buffer, therefore the caller needn't do it explicitly. + */ +static Buffer +revmap_get_buffer(BrinRevmap *revmap, BlockNumber heapBlk) +{ + BlockNumber mapBlk; + + /* Translate the heap block number to physical index location. */ + mapBlk = revmap_get_blkno(revmap, heapBlk); + + if (mapBlk == InvalidBlockNumber) + elog(ERROR, "revmap does not cover heap block %u", heapBlk); + + /* Ensure the buffer we got is in the expected range */ + Assert(mapBlk != BRIN_METAPAGE_BLKNO && + mapBlk <= revmap->rm_lastRevmapPage); + + /* + * Obtain the buffer from which we need to read. If we already have the + * correct buffer in our access struct, use that; otherwise, release that, + * (if valid) and read the one we need. + */ + if (revmap->rm_currBuf == InvalidBuffer || + mapBlk != BufferGetBlockNumber(revmap->rm_currBuf)) + { + if (revmap->rm_currBuf != InvalidBuffer) + ReleaseBuffer(revmap->rm_currBuf); + + revmap->rm_currBuf = ReadBuffer(revmap->rm_irel, mapBlk); + } + + return revmap->rm_currBuf; +} + +/* + * Given a heap block number, find the corresponding physical revmap block + * number and return it. If the revmap page hasn't been allocated yet, extend + * the revmap until it is. + */ +static BlockNumber +revmap_extend_and_get_blkno(BrinRevmap *revmap, BlockNumber heapBlk) +{ + BlockNumber targetblk; + + /* obtain revmap block number, skip 1 for metapage block */ + targetblk = HEAPBLK_TO_REVMAP_BLK(revmap->rm_pagesPerRange, heapBlk) + 1; + + /* Extend the revmap, if necessary */ + while (targetblk > revmap->rm_lastRevmapPage) + { + CHECK_FOR_INTERRUPTS(); + revmap_physical_extend(revmap); + } + + return targetblk; +} + +/* + * Try to extend the revmap by one page. This might not happen for a number of + * reasons; caller is expected to retry until the expected outcome is obtained. + */ +static void +revmap_physical_extend(BrinRevmap *revmap) +{ + Buffer buf; + Page page; + Page metapage; + BrinMetaPageData *metadata; + BlockNumber mapBlk; + BlockNumber nblocks; + Relation irel = revmap->rm_irel; + bool needLock = !RELATION_IS_LOCAL(irel); + + /* + * Lock the metapage. This locks out concurrent extensions of the revmap, + * but note that we still need to grab the relation extension lock because + * another backend can extend the index with regular BRIN pages. + */ + LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_EXCLUSIVE); + metapage = BufferGetPage(revmap->rm_metaBuf); + metadata = (BrinMetaPageData *) PageGetContents(metapage); + + /* + * Check that our cached lastRevmapPage value was up-to-date; if it + * wasn't, update the cached copy and have caller start over. 
+ */ + if (metadata->lastRevmapPage != revmap->rm_lastRevmapPage) + { + revmap->rm_lastRevmapPage = metadata->lastRevmapPage; + LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK); + return; + } + mapBlk = metadata->lastRevmapPage + 1; + + nblocks = RelationGetNumberOfBlocks(irel); + if (mapBlk < nblocks) + { + buf = ReadBuffer(irel, mapBlk); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + page = BufferGetPage(buf); + } + else + { + if (needLock) + LockRelationForExtension(irel, ExclusiveLock); + + buf = ReadBuffer(irel, P_NEW); + if (BufferGetBlockNumber(buf) != mapBlk) + { + /* + * Very rare corner case: somebody extended the relation + * concurrently after we read its length. If this happens, give + * up and have caller start over. We will have to evacuate that + * page from under whoever is using it. + */ + if (needLock) + UnlockRelationForExtension(irel, ExclusiveLock); + LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buf); + return; + } + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + page = BufferGetPage(buf); + + if (needLock) + UnlockRelationForExtension(irel, ExclusiveLock); + } + + /* Check that it's a regular block (or an empty page) */ + if (!PageIsNew(page) && !BRIN_IS_REGULAR_PAGE(page)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("unexpected page type 0x%04X in BRIN index \"%s\" block %u", + BrinPageType(page), + RelationGetRelationName(irel), + BufferGetBlockNumber(buf)))); + + /* If the page is in use, evacuate it and restart */ + if (brin_start_evacuating_page(irel, buf)) + { + LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK); + brin_evacuate_page(irel, revmap->rm_pagesPerRange, revmap, buf); + + /* have caller start over */ + return; + } + + /* + * Ok, we have now locked the metapage and the target block. Re-initialize + * the target block as a revmap page, and update the metapage. + */ + START_CRIT_SECTION(); + + /* the rm_tids array is initialized to all invalid by PageInit */ + brin_page_init(page, BRIN_PAGETYPE_REVMAP); + MarkBufferDirty(buf); + + metadata->lastRevmapPage = mapBlk; + + /* + * Set pd_lower just past the end of the metadata. This is essential, + * because without doing so, metadata will be lost if xlog.c compresses + * the page. (We must do this here because pre-v11 versions of PG did not + * set the metapage's pd_lower correctly, so a pg_upgraded index might + * contain the wrong value.) + */ + ((PageHeader) metapage)->pd_lower = + ((char *) metadata + sizeof(BrinMetaPageData)) - (char *) metapage; + + MarkBufferDirty(revmap->rm_metaBuf); + + if (RelationNeedsWAL(revmap->rm_irel)) + { + xl_brin_revmap_extend xlrec; + XLogRecPtr recptr; + + xlrec.targetBlk = mapBlk; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBrinRevmapExtend); + XLogRegisterBuffer(0, revmap->rm_metaBuf, REGBUF_STANDARD); + + XLogRegisterBuffer(1, buf, REGBUF_WILL_INIT); + + recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_REVMAP_EXTEND); + PageSetLSN(metapage, recptr); + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK); + + UnlockReleaseBuffer(buf); +} diff --git a/src/backend/access/brin/brin_tuple.c b/src/backend/access/brin/brin_tuple.c new file mode 100644 index 0000000..09e563b --- /dev/null +++ b/src/backend/access/brin/brin_tuple.c @@ -0,0 +1,708 @@ +/* + * brin_tuple.c + * Method implementations for tuples in BRIN indexes. 
+ *
+ * Intended usage is that code outside this file only deals with
+ * BrinMemTuples, and converts to and from the on-disk representation through
+ * functions in this file.
+ *
+ * NOTES
+ *
+ * A BRIN tuple is similar to a heap tuple, with a few key differences. The
+ * first interesting difference is that the tuple header is much simpler, only
+ * containing its total length and a small area for flags. Also, the stored
+ * data does not match the relation tuple descriptor exactly: for each
+ * attribute in the descriptor, the index tuple carries an arbitrary number
+ * of values, depending on the opclass.
+ *
+ * Also, for each column of the index relation there are two null bits: one
+ * (hasnulls) stores whether any tuple within the page range has that column
+ * set to null; the other one (allnulls) stores whether the column values are
+ * all null. If allnulls is true, then the tuple data area does not contain
+ * values for that column at all; whereas it does if hasnulls is set.
+ * Note the size of the null bitmask may not be the same as that of the
+ * datum array.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *		src/backend/access/brin/brin_tuple.c
+ */
+#include "postgres.h"
+
+#include "access/brin_tuple.h"
+#include "access/detoast.h"
+#include "access/heaptoast.h"
+#include "access/htup_details.h"
+#include "access/toast_internals.h"
+#include "access/tupdesc.h"
+#include "access/tupmacs.h"
+#include "utils/datum.h"
+#include "utils/memutils.h"
+
+
+/*
+ * This enables de-toasting of index entries. Needed until VACUUM is
+ * smart enough to rebuild indexes from scratch.
+ */
+#define TOAST_INDEX_HACK
+
+
+static inline void brin_deconstruct_tuple(BrinDesc *brdesc,
+										  char *tp, bits8 *nullbits, bool nulls,
+										  Datum *values, bool *allnulls, bool *hasnulls);
+
+
+/*
+ * Return a tuple descriptor used for on-disk storage of BRIN tuples.
+ */
+static TupleDesc
+brtuple_disk_tupdesc(BrinDesc *brdesc)
+{
+	/* We cache these in the BrinDesc */
+	if (brdesc->bd_disktdesc == NULL)
+	{
+		int i;
+		int j;
+		AttrNumber attno = 1;
+		TupleDesc tupdesc;
+		MemoryContext oldcxt;
+
+		/* make sure it's in the bdesc's context */
+		oldcxt = MemoryContextSwitchTo(brdesc->bd_context);
+
+		tupdesc = CreateTemplateTupleDesc(brdesc->bd_totalstored);
+
+		for (i = 0; i < brdesc->bd_tupdesc->natts; i++)
+		{
+			for (j = 0; j < brdesc->bd_info[i]->oi_nstored; j++)
+				TupleDescInitEntry(tupdesc, attno++, NULL,
+								   brdesc->bd_info[i]->oi_typcache[j]->type_id,
+								   -1, 0);
+		}
+
+		MemoryContextSwitchTo(oldcxt);
+
+		brdesc->bd_disktdesc = tupdesc;
+	}
+
+	return brdesc->bd_disktdesc;
+}
+
+/*
+ * Generate a new on-disk tuple to be inserted in a BRIN index.
+ *
+ * See brin_form_placeholder_tuple if you touch this.
+ */ +BrinTuple * +brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple, + Size *size) +{ + Datum *values; + bool *nulls; + bool anynulls = false; + BrinTuple *rettuple; + int keyno; + int idxattno; + uint16 phony_infomask = 0; + bits8 *phony_nullbitmap; + Size len, + hoff, + data_len; + int i; + +#ifdef TOAST_INDEX_HACK + Datum *untoasted_values; + int nuntoasted = 0; +#endif + + Assert(brdesc->bd_totalstored > 0); + + values = (Datum *) palloc(sizeof(Datum) * brdesc->bd_totalstored); + nulls = (bool *) palloc0(sizeof(bool) * brdesc->bd_totalstored); + phony_nullbitmap = (bits8 *) + palloc(sizeof(bits8) * BITMAPLEN(brdesc->bd_totalstored)); + +#ifdef TOAST_INDEX_HACK + untoasted_values = (Datum *) palloc(sizeof(Datum) * brdesc->bd_totalstored); +#endif + + /* + * Set up the values/nulls arrays for heap_fill_tuple + */ + idxattno = 0; + for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++) + { + int datumno; + + /* + * "allnulls" is set when there's no nonnull value in any row in the + * column; when this happens, there is no data to store. Thus set the + * nullable bits for all data elements of this column and we're done. + */ + if (tuple->bt_columns[keyno].bv_allnulls) + { + for (datumno = 0; + datumno < brdesc->bd_info[keyno]->oi_nstored; + datumno++) + nulls[idxattno++] = true; + anynulls = true; + continue; + } + + /* + * The "hasnulls" bit is set when there are some null values in the + * data. We still need to store a real value, but the presence of + * this means we need a null bitmap. + */ + if (tuple->bt_columns[keyno].bv_hasnulls) + anynulls = true; + + /* If needed, serialize the values before forming the on-disk tuple. */ + if (tuple->bt_columns[keyno].bv_serialize) + { + tuple->bt_columns[keyno].bv_serialize(brdesc, + tuple->bt_columns[keyno].bv_mem_value, + tuple->bt_columns[keyno].bv_values); + } + + /* + * Now obtain the values of each stored datum. Note that some values + * might be toasted, and we cannot rely on the original heap values + * sticking around forever, so we must detoast them. Also try to + * compress them. + */ + for (datumno = 0; + datumno < brdesc->bd_info[keyno]->oi_nstored; + datumno++) + { + Datum value = tuple->bt_columns[keyno].bv_values[datumno]; + +#ifdef TOAST_INDEX_HACK + + /* We must look at the stored type, not at the index descriptor. */ + TypeCacheEntry *atttype = brdesc->bd_info[keyno]->oi_typcache[datumno]; + + /* Do we need to free the value at the end? */ + bool free_value = false; + + /* For non-varlena types we don't need to do anything special */ + if (atttype->typlen != -1) + { + values[idxattno++] = value; + continue; + } + + /* + * Do nothing if value is not of varlena type. We don't need to + * care about NULL values here, thanks to bv_allnulls above. + * + * If value is stored EXTERNAL, must fetch it so we are not + * depending on outside storage. + * + * XXX Is this actually true? Could it be that the summary is NULL + * even for range with non-NULL data? E.g. degenerate bloom filter + * may be thrown away, etc. + */ + if (VARATT_IS_EXTERNAL(DatumGetPointer(value))) + { + value = PointerGetDatum(detoast_external_attr((struct varlena *) + DatumGetPointer(value))); + free_value = true; + } + + /* + * If value is above size target, and is of a compressible + * datatype, try to compress it in-line. 
+ */ + if (!VARATT_IS_EXTENDED(DatumGetPointer(value)) && + VARSIZE(DatumGetPointer(value)) > TOAST_INDEX_TARGET && + (atttype->typstorage == TYPSTORAGE_EXTENDED || + atttype->typstorage == TYPSTORAGE_MAIN)) + { + Datum cvalue; + char compression; + Form_pg_attribute att = TupleDescAttr(brdesc->bd_tupdesc, + keyno); + + /* + * If the BRIN summary and indexed attribute use the same data + * type and it has a valid compression method, we can use the + * same compression method. Otherwise we have to use the + * default method. + */ + if (att->atttypid == atttype->type_id) + compression = att->attcompression; + else + compression = InvalidCompressionMethod; + + cvalue = toast_compress_datum(value, compression); + + if (DatumGetPointer(cvalue) != NULL) + { + /* successful compression */ + if (free_value) + pfree(DatumGetPointer(value)); + + value = cvalue; + free_value = true; + } + } + + /* + * If we untoasted / compressed the value, we need to free it + * after forming the index tuple. + */ + if (free_value) + untoasted_values[nuntoasted++] = value; + +#endif + + values[idxattno++] = value; + } + } + + /* Assert we did not overrun temp arrays */ + Assert(idxattno <= brdesc->bd_totalstored); + + /* compute total space needed */ + len = SizeOfBrinTuple; + if (anynulls) + { + /* + * We need a double-length bitmap on an on-disk BRIN index tuple; the + * first half stores the "allnulls" bits, the second stores + * "hasnulls". + */ + len += BITMAPLEN(brdesc->bd_tupdesc->natts * 2); + } + + len = hoff = MAXALIGN(len); + + data_len = heap_compute_data_size(brtuple_disk_tupdesc(brdesc), + values, nulls); + len += data_len; + + len = MAXALIGN(len); + + rettuple = palloc0(len); + rettuple->bt_blkno = blkno; + rettuple->bt_info = hoff; + + /* Assert that hoff fits in the space available */ + Assert((rettuple->bt_info & BRIN_OFFSET_MASK) == hoff); + + /* + * The infomask and null bitmap as computed by heap_fill_tuple are useless + * to us. However, that function will not accept a null infomask; and we + * need to pass a valid null bitmap so that it will correctly skip + * outputting null attributes in the data area. + */ + heap_fill_tuple(brtuple_disk_tupdesc(brdesc), + values, + nulls, + (char *) rettuple + hoff, + data_len, + &phony_infomask, + phony_nullbitmap); + + /* done with these */ + pfree(values); + pfree(nulls); + pfree(phony_nullbitmap); + +#ifdef TOAST_INDEX_HACK + for (i = 0; i < nuntoasted; i++) + pfree(DatumGetPointer(untoasted_values[i])); +#endif + + /* + * Now fill in the real null bitmasks. allnulls first. + */ + if (anynulls) + { + bits8 *bitP; + int bitmask; + + rettuple->bt_info |= BRIN_NULLS_MASK; + + /* + * Note that we reverse the sense of null bits in this module: we + * store a 1 for a null attribute rather than a 0. So we must reverse + * the sense of the att_isnull test in brin_deconstruct_tuple as well. 
+ */ + bitP = ((bits8 *) ((char *) rettuple + SizeOfBrinTuple)) - 1; + bitmask = HIGHBIT; + for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++) + { + if (bitmask != HIGHBIT) + bitmask <<= 1; + else + { + bitP += 1; + *bitP = 0x0; + bitmask = 1; + } + + if (!tuple->bt_columns[keyno].bv_allnulls) + continue; + + *bitP |= bitmask; + } + /* hasnulls bits follow */ + for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++) + { + if (bitmask != HIGHBIT) + bitmask <<= 1; + else + { + bitP += 1; + *bitP = 0x0; + bitmask = 1; + } + + if (!tuple->bt_columns[keyno].bv_hasnulls) + continue; + + *bitP |= bitmask; + } + } + + if (tuple->bt_placeholder) + rettuple->bt_info |= BRIN_PLACEHOLDER_MASK; + + *size = len; + return rettuple; +} + +/* + * Generate a new on-disk tuple with no data values, marked as placeholder. + * + * This is a cut-down version of brin_form_tuple. + */ +BrinTuple * +brin_form_placeholder_tuple(BrinDesc *brdesc, BlockNumber blkno, Size *size) +{ + Size len; + Size hoff; + BrinTuple *rettuple; + int keyno; + bits8 *bitP; + int bitmask; + + /* compute total space needed: always add nulls */ + len = SizeOfBrinTuple; + len += BITMAPLEN(brdesc->bd_tupdesc->natts * 2); + len = hoff = MAXALIGN(len); + + rettuple = palloc0(len); + rettuple->bt_blkno = blkno; + rettuple->bt_info = hoff; + rettuple->bt_info |= BRIN_NULLS_MASK | BRIN_PLACEHOLDER_MASK; + + bitP = ((bits8 *) ((char *) rettuple + SizeOfBrinTuple)) - 1; + bitmask = HIGHBIT; + /* set allnulls true for all attributes */ + for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++) + { + if (bitmask != HIGHBIT) + bitmask <<= 1; + else + { + bitP += 1; + *bitP = 0x0; + bitmask = 1; + } + + *bitP |= bitmask; + } + /* no need to set hasnulls */ + + *size = len; + return rettuple; +} + +/* + * Free a tuple created by brin_form_tuple + */ +void +brin_free_tuple(BrinTuple *tuple) +{ + pfree(tuple); +} + +/* + * Given a brin tuple of size len, create a copy of it. If 'dest' is not + * NULL, its size is destsz, and can be used as output buffer; if the tuple + * to be copied does not fit, it is enlarged by repalloc, and the size is + * updated to match. This avoids palloc/free cycles when many brin tuples + * are being processed in loops. + */ +BrinTuple * +brin_copy_tuple(BrinTuple *tuple, Size len, BrinTuple *dest, Size *destsz) +{ + if (!destsz || *destsz == 0) + dest = palloc(len); + else if (len > *destsz) + { + dest = repalloc(dest, len); + *destsz = len; + } + + memcpy(dest, tuple, len); + + return dest; +} + +/* + * Return whether two BrinTuples are bitwise identical. + */ +bool +brin_tuples_equal(const BrinTuple *a, Size alen, const BrinTuple *b, Size blen) +{ + if (alen != blen) + return false; + if (memcmp(a, b, alen) != 0) + return false; + return true; +} + +/* + * Create a new BrinMemTuple from scratch, and initialize it to an empty + * state. + * + * Note: we don't provide any means to free a deformed tuple, so make sure to + * use a temporary memory context. 
+ */ +BrinMemTuple * +brin_new_memtuple(BrinDesc *brdesc) +{ + BrinMemTuple *dtup; + long basesize; + + basesize = MAXALIGN(sizeof(BrinMemTuple) + + sizeof(BrinValues) * brdesc->bd_tupdesc->natts); + dtup = palloc0(basesize + sizeof(Datum) * brdesc->bd_totalstored); + + dtup->bt_values = palloc(sizeof(Datum) * brdesc->bd_totalstored); + dtup->bt_allnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts); + dtup->bt_hasnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts); + + dtup->bt_context = AllocSetContextCreate(CurrentMemoryContext, + "brin dtuple", + ALLOCSET_DEFAULT_SIZES); + + brin_memtuple_initialize(dtup, brdesc); + + return dtup; +} + +/* + * Reset a BrinMemTuple to initial state. We return the same tuple, for + * notational convenience. + */ +BrinMemTuple * +brin_memtuple_initialize(BrinMemTuple *dtuple, BrinDesc *brdesc) +{ + int i; + char *currdatum; + + MemoryContextReset(dtuple->bt_context); + + currdatum = (char *) dtuple + + MAXALIGN(sizeof(BrinMemTuple) + + sizeof(BrinValues) * brdesc->bd_tupdesc->natts); + for (i = 0; i < brdesc->bd_tupdesc->natts; i++) + { + dtuple->bt_columns[i].bv_attno = i + 1; + dtuple->bt_columns[i].bv_allnulls = true; + dtuple->bt_columns[i].bv_hasnulls = false; + dtuple->bt_columns[i].bv_values = (Datum *) currdatum; + + dtuple->bt_columns[i].bv_mem_value = PointerGetDatum(NULL); + dtuple->bt_columns[i].bv_serialize = NULL; + dtuple->bt_columns[i].bv_context = dtuple->bt_context; + + currdatum += sizeof(Datum) * brdesc->bd_info[i]->oi_nstored; + } + + return dtuple; +} + +/* + * Convert a BrinTuple back to a BrinMemTuple. This is the reverse of + * brin_form_tuple. + * + * As an optimization, the caller can pass a previously allocated 'dMemtuple'. + * This avoids having to allocate it here, which can be useful when this + * function is called many times in a loop. It is caller's responsibility + * that the given BrinMemTuple matches what we need here. + * + * Note we don't need the "on disk tupdesc" here; we rely on our own routine to + * deconstruct the tuple from the on-disk format. + */ +BrinMemTuple * +brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple, BrinMemTuple *dMemtuple) +{ + BrinMemTuple *dtup; + Datum *values; + bool *allnulls; + bool *hasnulls; + char *tp; + bits8 *nullbits; + int keyno; + int valueno; + MemoryContext oldcxt; + + dtup = dMemtuple ? brin_memtuple_initialize(dMemtuple, brdesc) : + brin_new_memtuple(brdesc); + + if (BrinTupleIsPlaceholder(tuple)) + dtup->bt_placeholder = true; + dtup->bt_blkno = tuple->bt_blkno; + + values = dtup->bt_values; + allnulls = dtup->bt_allnulls; + hasnulls = dtup->bt_hasnulls; + + tp = (char *) tuple + BrinTupleDataOffset(tuple); + + if (BrinTupleHasNulls(tuple)) + nullbits = (bits8 *) ((char *) tuple + SizeOfBrinTuple); + else + nullbits = NULL; + brin_deconstruct_tuple(brdesc, + tp, nullbits, BrinTupleHasNulls(tuple), + values, allnulls, hasnulls); + + /* + * Iterate to assign each of the values to the corresponding item in the + * values array of each column. The copies occur in the tuple's context. + */ + oldcxt = MemoryContextSwitchTo(dtup->bt_context); + for (valueno = 0, keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++) + { + int i; + + if (allnulls[keyno]) + { + valueno += brdesc->bd_info[keyno]->oi_nstored; + continue; + } + + /* + * We would like to skip datumCopy'ing the values datum in some cases, + * caller permitting ... 
+ */ + for (i = 0; i < brdesc->bd_info[keyno]->oi_nstored; i++) + dtup->bt_columns[keyno].bv_values[i] = + datumCopy(values[valueno++], + brdesc->bd_info[keyno]->oi_typcache[i]->typbyval, + brdesc->bd_info[keyno]->oi_typcache[i]->typlen); + + dtup->bt_columns[keyno].bv_hasnulls = hasnulls[keyno]; + dtup->bt_columns[keyno].bv_allnulls = false; + + dtup->bt_columns[keyno].bv_mem_value = PointerGetDatum(NULL); + dtup->bt_columns[keyno].bv_serialize = NULL; + dtup->bt_columns[keyno].bv_context = dtup->bt_context; + } + + MemoryContextSwitchTo(oldcxt); + + return dtup; +} + +/* + * brin_deconstruct_tuple + * Guts of attribute extraction from an on-disk BRIN tuple. + * + * Its arguments are: + * brdesc BRIN descriptor for the stored tuple + * tp pointer to the tuple data area + * nullbits pointer to the tuple nulls bitmask + * nulls "has nulls" bit in tuple infomask + * values output values, array of size brdesc->bd_totalstored + * allnulls output "allnulls", size brdesc->bd_tupdesc->natts + * hasnulls output "hasnulls", size brdesc->bd_tupdesc->natts + * + * Output arrays must have been allocated by caller. + */ +static inline void +brin_deconstruct_tuple(BrinDesc *brdesc, + char *tp, bits8 *nullbits, bool nulls, + Datum *values, bool *allnulls, bool *hasnulls) +{ + int attnum; + int stored; + TupleDesc diskdsc; + long off; + + /* + * First iterate to natts to obtain both null flags for each attribute. + * Note that we reverse the sense of the att_isnull test, because we store + * 1 for a null value (rather than a 1 for a not null value as is the + * att_isnull convention used elsewhere.) See brin_form_tuple. + */ + for (attnum = 0; attnum < brdesc->bd_tupdesc->natts; attnum++) + { + /* + * the "all nulls" bit means that all values in the page range for + * this column are nulls. Therefore there are no values in the tuple + * data area. + */ + allnulls[attnum] = nulls && !att_isnull(attnum, nullbits); + + /* + * the "has nulls" bit means that some tuples have nulls, but others + * have not-null values. Therefore we know the tuple contains data + * for this column. + * + * The hasnulls bits follow the allnulls bits in the same bitmask. + */ + hasnulls[attnum] = + nulls && !att_isnull(brdesc->bd_tupdesc->natts + attnum, nullbits); + } + + /* + * Iterate to obtain each attribute's stored values. Note that since we + * may reuse attribute entries for more than one column, we cannot cache + * offsets here. + */ + diskdsc = brtuple_disk_tupdesc(brdesc); + stored = 0; + off = 0; + for (attnum = 0; attnum < brdesc->bd_tupdesc->natts; attnum++) + { + int datumno; + + if (allnulls[attnum]) + { + stored += brdesc->bd_info[attnum]->oi_nstored; + continue; + } + + for (datumno = 0; + datumno < brdesc->bd_info[attnum]->oi_nstored; + datumno++) + { + Form_pg_attribute thisatt = TupleDescAttr(diskdsc, stored); + + if (thisatt->attlen == -1) + { + off = att_align_pointer(off, thisatt->attalign, -1, + tp + off); + } + else + { + /* not varlena, so safe to use att_align_nominal */ + off = att_align_nominal(off, thisatt->attalign); + } + + values[stored++] = fetchatt(thisatt, tp + off); + + off = att_addlength_pointer(off, thisatt->attlen, tp + off); + } + } +} diff --git a/src/backend/access/brin/brin_validate.c b/src/backend/access/brin/brin_validate.c new file mode 100644 index 0000000..11835d8 --- /dev/null +++ b/src/backend/access/brin/brin_validate.c @@ -0,0 +1,281 @@ +/*------------------------------------------------------------------------- + * + * brin_validate.c + * Opclass validator for BRIN. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/brin/brin_validate.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/amvalidate.h" +#include "access/brin_internal.h" +#include "access/htup_details.h" +#include "catalog/pg_amop.h" +#include "catalog/pg_amproc.h" +#include "catalog/pg_opclass.h" +#include "catalog/pg_opfamily.h" +#include "catalog/pg_type.h" +#include "utils/builtins.h" +#include "utils/regproc.h" +#include "utils/syscache.h" + +/* + * Validator for a BRIN opclass. + * + * Some of the checks done here cover the whole opfamily, and therefore are + * redundant when checking each opclass in a family. But they don't run long + * enough to be much of a problem, so we accept the duplication rather than + * complicate the amvalidate API. + */ +bool +brinvalidate(Oid opclassoid) +{ + bool result = true; + HeapTuple classtup; + Form_pg_opclass classform; + Oid opfamilyoid; + Oid opcintype; + char *opclassname; + HeapTuple familytup; + Form_pg_opfamily familyform; + char *opfamilyname; + CatCList *proclist, + *oprlist; + uint64 allfuncs = 0; + uint64 allops = 0; + List *grouplist; + OpFamilyOpFuncGroup *opclassgroup; + int i; + ListCell *lc; + + /* Fetch opclass information */ + classtup = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclassoid)); + if (!HeapTupleIsValid(classtup)) + elog(ERROR, "cache lookup failed for operator class %u", opclassoid); + classform = (Form_pg_opclass) GETSTRUCT(classtup); + + opfamilyoid = classform->opcfamily; + opcintype = classform->opcintype; + opclassname = NameStr(classform->opcname); + + /* Fetch opfamily information */ + familytup = SearchSysCache1(OPFAMILYOID, ObjectIdGetDatum(opfamilyoid)); + if (!HeapTupleIsValid(familytup)) + elog(ERROR, "cache lookup failed for operator family %u", opfamilyoid); + familyform = (Form_pg_opfamily) GETSTRUCT(familytup); + + opfamilyname = NameStr(familyform->opfname); + + /* Fetch all operators and support functions of the opfamily */ + oprlist = SearchSysCacheList1(AMOPSTRATEGY, ObjectIdGetDatum(opfamilyoid)); + proclist = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(opfamilyoid)); + + /* Check individual support functions */ + for (i = 0; i < proclist->n_members; i++) + { + HeapTuple proctup = &proclist->members[i]->tuple; + Form_pg_amproc procform = (Form_pg_amproc) GETSTRUCT(proctup); + bool ok; + + /* Check procedure numbers and function signatures */ + switch (procform->amprocnum) + { + case BRIN_PROCNUM_OPCINFO: + ok = check_amproc_signature(procform->amproc, INTERNALOID, true, + 1, 1, INTERNALOID); + break; + case BRIN_PROCNUM_ADDVALUE: + ok = check_amproc_signature(procform->amproc, BOOLOID, true, + 4, 4, INTERNALOID, INTERNALOID, + INTERNALOID, INTERNALOID); + break; + case BRIN_PROCNUM_CONSISTENT: + ok = check_amproc_signature(procform->amproc, BOOLOID, true, + 3, 4, INTERNALOID, INTERNALOID, + INTERNALOID, INT4OID); + break; + case BRIN_PROCNUM_UNION: + ok = check_amproc_signature(procform->amproc, BOOLOID, true, + 3, 3, INTERNALOID, INTERNALOID, + INTERNALOID); + break; + case BRIN_PROCNUM_OPTIONS: + ok = check_amoptsproc_signature(procform->amproc); + break; + default: + /* Complain if it's not a valid optional proc number */ + if (procform->amprocnum < BRIN_FIRST_OPTIONAL_PROCNUM || + procform->amprocnum > BRIN_LAST_OPTIONAL_PROCNUM) + { + ereport(INFO, + 
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains function %s with invalid support number %d", + opfamilyname, "brin", + format_procedure(procform->amproc), + procform->amprocnum))); + result = false; + continue; /* omit bad proc numbers from allfuncs */ + } + /* Can't check signatures of optional procs, so assume OK */ + ok = true; + break; + } + + if (!ok) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains function %s with wrong signature for support number %d", + opfamilyname, "brin", + format_procedure(procform->amproc), + procform->amprocnum))); + result = false; + } + + /* Track all valid procedure numbers seen in opfamily */ + allfuncs |= ((uint64) 1) << procform->amprocnum; + } + + /* Check individual operators */ + for (i = 0; i < oprlist->n_members; i++) + { + HeapTuple oprtup = &oprlist->members[i]->tuple; + Form_pg_amop oprform = (Form_pg_amop) GETSTRUCT(oprtup); + + /* Check that only allowed strategy numbers exist */ + if (oprform->amopstrategy < 1 || oprform->amopstrategy > 63) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains operator %s with invalid strategy number %d", + opfamilyname, "brin", + format_operator(oprform->amopopr), + oprform->amopstrategy))); + result = false; + } + else + { + /* + * The set of operators supplied varies across BRIN opfamilies. + * Our plan is to identify all operator strategy numbers used in + * the opfamily and then complain about datatype combinations that + * are missing any operator(s). However, consider only numbers + * that appear in some non-cross-type case, since cross-type + * operators may have unique strategies. (This is not a great + * heuristic, in particular an erroneous number used in a + * cross-type operator will not get noticed; but the core BRIN + * opfamilies are messy enough to make it necessary.) + */ + if (oprform->amoplefttype == oprform->amoprighttype) + allops |= ((uint64) 1) << oprform->amopstrategy; + } + + /* brin doesn't support ORDER BY operators */ + if (oprform->amoppurpose != AMOP_SEARCH || + OidIsValid(oprform->amopsortfamily)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains invalid ORDER BY specification for operator %s", + opfamilyname, "brin", + format_operator(oprform->amopopr)))); + result = false; + } + + /* Check operator signature --- same for all brin strategies */ + if (!check_amop_signature(oprform->amopopr, BOOLOID, + oprform->amoplefttype, + oprform->amoprighttype)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains operator %s with wrong signature", + opfamilyname, "brin", + format_operator(oprform->amopopr)))); + result = false; + } + } + + /* Now check for inconsistent groups of operators/functions */ + grouplist = identify_opfamily_groups(oprlist, proclist); + opclassgroup = NULL; + foreach(lc, grouplist) + { + OpFamilyOpFuncGroup *thisgroup = (OpFamilyOpFuncGroup *) lfirst(lc); + + /* Remember the group exactly matching the test opclass */ + if (thisgroup->lefttype == opcintype && + thisgroup->righttype == opcintype) + opclassgroup = thisgroup; + + /* + * Some BRIN opfamilies expect cross-type support functions to exist, + * and some don't. 
We don't know exactly which are which, so if we + * find a cross-type operator for which there are no support functions + * at all, let it pass. (Don't expect that all operators exist for + * such cross-type cases, either.) + */ + if (thisgroup->functionset == 0 && + thisgroup->lefttype != thisgroup->righttype) + continue; + + /* + * Else complain if there seems to be an incomplete set of either + * operators or support functions for this datatype pair. + */ + if (thisgroup->operatorset != allops) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s is missing operator(s) for types %s and %s", + opfamilyname, "brin", + format_type_be(thisgroup->lefttype), + format_type_be(thisgroup->righttype)))); + result = false; + } + if (thisgroup->functionset != allfuncs) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s is missing support function(s) for types %s and %s", + opfamilyname, "brin", + format_type_be(thisgroup->lefttype), + format_type_be(thisgroup->righttype)))); + result = false; + } + } + + /* Check that the originally-named opclass is complete */ + if (!opclassgroup || opclassgroup->operatorset != allops) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator class \"%s\" of access method %s is missing operator(s)", + opclassname, "brin"))); + result = false; + } + for (i = 1; i <= BRIN_MANDATORY_NPROCS; i++) + { + if (opclassgroup && + (opclassgroup->functionset & (((int64) 1) << i)) != 0) + continue; /* got it */ + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator class \"%s\" of access method %s is missing support function %d", + opclassname, "brin", i))); + result = false; + } + + ReleaseCatCacheList(proclist); + ReleaseCatCacheList(oprlist); + ReleaseSysCache(familytup); + ReleaseSysCache(classtup); + + return result; +} diff --git a/src/backend/access/brin/brin_xlog.c b/src/backend/access/brin/brin_xlog.c new file mode 100644 index 0000000..3519038 --- /dev/null +++ b/src/backend/access/brin/brin_xlog.c @@ -0,0 +1,367 @@ +/* + * brin_xlog.c + * XLog replay routines for BRIN indexes + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/brin/brin_xlog.c + */ +#include "postgres.h" + +#include "access/brin_page.h" +#include "access/brin_pageops.h" +#include "access/brin_xlog.h" +#include "access/bufmask.h" +#include "access/xlogutils.h" + + +/* + * xlog replay routines + */ +static void +brin_xlog_createidx(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_brin_createidx *xlrec = (xl_brin_createidx *) XLogRecGetData(record); + Buffer buf; + Page page; + + /* create the index' metapage */ + buf = XLogInitBufferForRedo(record, 0); + Assert(BufferIsValid(buf)); + page = (Page) BufferGetPage(buf); + brin_metapage_init(page, xlrec->pagesPerRange, xlrec->version); + PageSetLSN(page, lsn); + MarkBufferDirty(buf); + UnlockReleaseBuffer(buf); +} + +/* + * Common part of an insert or update. Inserts the new tuple and updates the + * revmap. 
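 *
 * [Editorial aside, not part of the upstream patch.  The toy program below
 *  sketches what the revmap update amounts to conceptually: each block
 *  range of the heap (pagesPerRange consecutive heap blocks) is mapped to
 *  the (block, offset) position of its summary tuple, and the range that
 *  covers a given heap block is found by integer division.  The names, the
 *  TidSketch struct and the example numbers are invented; 128 is the
 *  documented default pages_per_range.]
 */

#include <stdint.h>
#include <stdio.h>

typedef struct
{
    uint32_t    blkno;          /* index page holding the summary tuple */
    uint16_t    offnum;         /* line pointer of that tuple */
} TidSketch;

/* which range (and hence which revmap slot) covers this heap block? */
static uint32_t
range_number(uint32_t heapBlk, uint32_t pagesPerRange)
{
    return heapBlk / pagesPerRange;
}

int
main(void)
{
    uint32_t    pagesPerRange = 128;    /* documented default */
    TidSketch   revmap[8] = {{0}};
    uint32_t    r = range_number(200, pagesPerRange);

    /* pretend the summary tuple for that range lives at page 3, item 7 */
    revmap[r] = (TidSketch) {3, 7};

    printf("heap block 200 -> range %u -> tuple (%u,%u)\n",
           (unsigned) r, (unsigned) revmap[r].blkno,
           (unsigned) revmap[r].offnum);
    return 0;
}

/*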
+ */ +static void +brin_xlog_insert_update(XLogReaderState *record, + xl_brin_insert *xlrec) +{ + XLogRecPtr lsn = record->EndRecPtr; + Buffer buffer; + BlockNumber regpgno; + Page page; + XLogRedoAction action; + + /* + * If we inserted the first and only tuple on the page, re-initialize the + * page from scratch. + */ + if (XLogRecGetInfo(record) & XLOG_BRIN_INIT_PAGE) + { + buffer = XLogInitBufferForRedo(record, 0); + page = BufferGetPage(buffer); + brin_page_init(page, BRIN_PAGETYPE_REGULAR); + action = BLK_NEEDS_REDO; + } + else + { + action = XLogReadBufferForRedo(record, 0, &buffer); + } + + /* need this page's blkno to store in revmap */ + regpgno = BufferGetBlockNumber(buffer); + + /* insert the index item into the page */ + if (action == BLK_NEEDS_REDO) + { + OffsetNumber offnum; + BrinTuple *tuple; + Size tuplen; + + tuple = (BrinTuple *) XLogRecGetBlockData(record, 0, &tuplen); + + Assert(tuple->bt_blkno == xlrec->heapBlk); + + page = (Page) BufferGetPage(buffer); + offnum = xlrec->offnum; + if (PageGetMaxOffsetNumber(page) + 1 < offnum) + elog(PANIC, "brin_xlog_insert_update: invalid max offset number"); + + offnum = PageAddItem(page, (Item) tuple, tuplen, offnum, true, false); + if (offnum == InvalidOffsetNumber) + elog(PANIC, "brin_xlog_insert_update: failed to add tuple"); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* update the revmap */ + action = XLogReadBufferForRedo(record, 1, &buffer); + if (action == BLK_NEEDS_REDO) + { + ItemPointerData tid; + + ItemPointerSet(&tid, regpgno, xlrec->offnum); + page = (Page) BufferGetPage(buffer); + + brinSetHeapBlockItemptr(buffer, xlrec->pagesPerRange, xlrec->heapBlk, + tid); + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* XXX no FSM updates here ... */ +} + +/* + * replay a BRIN index insertion + */ +static void +brin_xlog_insert(XLogReaderState *record) +{ + xl_brin_insert *xlrec = (xl_brin_insert *) XLogRecGetData(record); + + brin_xlog_insert_update(record, xlrec); +} + +/* + * replay a BRIN index update + */ +static void +brin_xlog_update(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_brin_update *xlrec = (xl_brin_update *) XLogRecGetData(record); + Buffer buffer; + XLogRedoAction action; + + /* First remove the old tuple */ + action = XLogReadBufferForRedo(record, 2, &buffer); + if (action == BLK_NEEDS_REDO) + { + Page page; + OffsetNumber offnum; + + page = (Page) BufferGetPage(buffer); + + offnum = xlrec->oldOffnum; + + PageIndexTupleDeleteNoCompact(page, offnum); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + + /* Then insert the new tuple and update revmap, like in an insertion. */ + brin_xlog_insert_update(record, &xlrec->insert); + + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +/* + * Update a tuple on a single page. 
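 *
 * [Editorial aside, not part of the upstream patch.  The redo routines in
 *  this file only modify a page when XLogReadBufferForRedo() reports
 *  BLK_NEEDS_REDO, which for a page read from disk essentially means "the
 *  page LSN is older than the record being replayed"; after applying the
 *  change they stamp the record's LSN on the page, which makes replay
 *  idempotent.  The toy program below sketches that rule with invented
 *  names and plain integers standing in for LSNs.]
 */

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
needs_redo(uint64_t page_lsn, uint64_t record_end_lsn)
{
    return page_lsn < record_end_lsn;
}

int
main(void)
{
    uint64_t    page_lsn = 0x1000;
    uint64_t    record_lsn = 0x1200;

    if (needs_redo(page_lsn, record_lsn))
    {
        /* ... apply the logged change to the page here ... */
        page_lsn = record_lsn;      /* the PageSetLSN step */
    }

    /* replaying the same record a second time is now a no-op */
    printf("redo needed on second replay: %s\n",
           needs_redo(page_lsn, record_lsn) ? "yes" : "no");
    return 0;
}

/*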
+ */ +static void +brin_xlog_samepage_update(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_brin_samepage_update *xlrec; + Buffer buffer; + XLogRedoAction action; + + xlrec = (xl_brin_samepage_update *) XLogRecGetData(record); + action = XLogReadBufferForRedo(record, 0, &buffer); + if (action == BLK_NEEDS_REDO) + { + Size tuplen; + BrinTuple *brintuple; + Page page; + OffsetNumber offnum; + + brintuple = (BrinTuple *) XLogRecGetBlockData(record, 0, &tuplen); + + page = (Page) BufferGetPage(buffer); + + offnum = xlrec->offnum; + + if (!PageIndexTupleOverwrite(page, offnum, (Item) brintuple, tuplen)) + elog(PANIC, "brin_xlog_samepage_update: failed to replace tuple"); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* XXX no FSM updates here ... */ +} + +/* + * Replay a revmap page extension + */ +static void +brin_xlog_revmap_extend(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_brin_revmap_extend *xlrec; + Buffer metabuf; + Buffer buf; + Page page; + BlockNumber targetBlk; + XLogRedoAction action; + + xlrec = (xl_brin_revmap_extend *) XLogRecGetData(record); + + XLogRecGetBlockTag(record, 1, NULL, NULL, &targetBlk); + Assert(xlrec->targetBlk == targetBlk); + + /* Update the metapage */ + action = XLogReadBufferForRedo(record, 0, &metabuf); + if (action == BLK_NEEDS_REDO) + { + Page metapg; + BrinMetaPageData *metadata; + + metapg = BufferGetPage(metabuf); + metadata = (BrinMetaPageData *) PageGetContents(metapg); + + Assert(metadata->lastRevmapPage == xlrec->targetBlk - 1); + metadata->lastRevmapPage = xlrec->targetBlk; + + PageSetLSN(metapg, lsn); + + /* + * Set pd_lower just past the end of the metadata. This is essential, + * because without doing so, metadata will be lost if xlog.c + * compresses the page. (We must do this here because pre-v11 + * versions of PG did not set the metapage's pd_lower correctly, so a + * pg_upgraded index might contain the wrong value.) + */ + ((PageHeader) metapg)->pd_lower = + ((char *) metadata + sizeof(BrinMetaPageData)) - (char *) metapg; + + MarkBufferDirty(metabuf); + } + + /* + * Re-init the target block as a revmap page. There's never a full- page + * image here. 
+ */ + + buf = XLogInitBufferForRedo(record, 1); + page = (Page) BufferGetPage(buf); + brin_page_init(page, BRIN_PAGETYPE_REVMAP); + + PageSetLSN(page, lsn); + MarkBufferDirty(buf); + + UnlockReleaseBuffer(buf); + if (BufferIsValid(metabuf)) + UnlockReleaseBuffer(metabuf); +} + +static void +brin_xlog_desummarize_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_brin_desummarize *xlrec; + Buffer buffer; + XLogRedoAction action; + + xlrec = (xl_brin_desummarize *) XLogRecGetData(record); + + /* Update the revmap */ + action = XLogReadBufferForRedo(record, 0, &buffer); + if (action == BLK_NEEDS_REDO) + { + ItemPointerData iptr; + + ItemPointerSetInvalid(&iptr); + brinSetHeapBlockItemptr(buffer, xlrec->pagesPerRange, xlrec->heapBlk, iptr); + + PageSetLSN(BufferGetPage(buffer), lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* remove the leftover entry from the regular page */ + action = XLogReadBufferForRedo(record, 1, &buffer); + if (action == BLK_NEEDS_REDO) + { + Page regPg = BufferGetPage(buffer); + + PageIndexTupleDeleteNoCompact(regPg, xlrec->regOffset); + + PageSetLSN(regPg, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +void +brin_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info & XLOG_BRIN_OPMASK) + { + case XLOG_BRIN_CREATE_INDEX: + brin_xlog_createidx(record); + break; + case XLOG_BRIN_INSERT: + brin_xlog_insert(record); + break; + case XLOG_BRIN_UPDATE: + brin_xlog_update(record); + break; + case XLOG_BRIN_SAMEPAGE_UPDATE: + brin_xlog_samepage_update(record); + break; + case XLOG_BRIN_REVMAP_EXTEND: + brin_xlog_revmap_extend(record); + break; + case XLOG_BRIN_DESUMMARIZE: + brin_xlog_desummarize_page(record); + break; + default: + elog(PANIC, "brin_redo: unknown op code %u", info); + } +} + +/* + * Mask a BRIN page before doing consistency checks. + */ +void +brin_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + PageHeader pagehdr = (PageHeader) page; + + mask_page_lsn_and_checksum(page); + + mask_page_hint_bits(page); + + /* + * Regular brin pages contain unused space which needs to be masked. + * Similarly for meta pages, but mask it only if pd_lower appears to have + * been set correctly. + */ + if (BRIN_IS_REGULAR_PAGE(page) || + (BRIN_IS_META_PAGE(page) && pagehdr->pd_lower > SizeOfPageHeaderData)) + { + mask_unused_space(page); + } + + /* + * BRIN_EVACUATE_PAGE is not WAL-logged, since it's of no use in recovery. + * Mask it. See brin_start_evacuating_page() for details. + */ + BrinPageFlags(page) &= ~BRIN_EVACUATE_PAGE; +} diff --git a/src/backend/access/common/Makefile b/src/backend/access/common/Makefile new file mode 100644 index 0000000..b9aff0c --- /dev/null +++ b/src/backend/access/common/Makefile @@ -0,0 +1,33 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/common +# +# IDENTIFICATION +# src/backend/access/common/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/common +top_builddir = ../../../.. 
+include $(top_builddir)/src/Makefile.global + +OBJS = \ + attmap.o \ + bufmask.o \ + detoast.o \ + heaptuple.o \ + indextuple.o \ + printsimple.o \ + printtup.o \ + relation.o \ + reloptions.o \ + scankey.o \ + session.o \ + syncscan.o \ + toast_compression.o \ + toast_internals.o \ + tupconvert.o \ + tupdesc.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/common/attmap.c b/src/backend/access/common/attmap.c new file mode 100644 index 0000000..32405f8 --- /dev/null +++ b/src/backend/access/common/attmap.c @@ -0,0 +1,324 @@ +/*------------------------------------------------------------------------- + * + * attmap.c + * Attribute mapping support. + * + * This file provides utility routines to build and manage attribute + * mappings by comparing input and output TupleDescs. Such mappings + * are typically used by DDL operating on inheritance and partition trees + * to do a conversion between rowtypes logically equivalent but with + * columns in a different order, taking into account dropped columns. + * They are also used by the tuple conversion routines in tupconvert.c. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/common/attmap.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/attmap.h" +#include "access/htup_details.h" +#include "utils/builtins.h" + + +static bool check_attrmap_match(TupleDesc indesc, + TupleDesc outdesc, + AttrMap *attrMap); + +/* + * make_attrmap + * + * Utility routine to allocate an attribute map in the current memory + * context. + */ +AttrMap * +make_attrmap(int maplen) +{ + AttrMap *res; + + res = (AttrMap *) palloc0(sizeof(AttrMap)); + res->maplen = maplen; + res->attnums = (AttrNumber *) palloc0(sizeof(AttrNumber) * maplen); + return res; +} + +/* + * free_attrmap + * + * Utility routine to release an attribute map. + */ +void +free_attrmap(AttrMap *map) +{ + pfree(map->attnums); + pfree(map); +} + +/* + * build_attrmap_by_position + * + * Return a palloc'd bare attribute map for tuple conversion, matching input + * and output columns by position. Dropped columns are ignored in both input + * and output, marked as 0. This is normally a subroutine for + * convert_tuples_by_position in tupconvert.c, but it can be used standalone. + * + * Note: the errdetail messages speak of indesc as the "returned" rowtype, + * outdesc as the "expected" rowtype. This is okay for current uses but + * might need generalization in future. + */ +AttrMap * +build_attrmap_by_position(TupleDesc indesc, + TupleDesc outdesc, + const char *msg) +{ + AttrMap *attrMap; + int nincols; + int noutcols; + int n; + int i; + int j; + bool same; + + /* + * The length is computed as the number of attributes of the expected + * rowtype as it includes dropped attributes in its count. 
+ */ + n = outdesc->natts; + attrMap = make_attrmap(n); + + j = 0; /* j is next physical input attribute */ + nincols = noutcols = 0; /* these count non-dropped attributes */ + same = true; + for (i = 0; i < n; i++) + { + Form_pg_attribute att = TupleDescAttr(outdesc, i); + Oid atttypid; + int32 atttypmod; + + if (att->attisdropped) + continue; /* attrMap->attnums[i] is already 0 */ + noutcols++; + atttypid = att->atttypid; + atttypmod = att->atttypmod; + for (; j < indesc->natts; j++) + { + att = TupleDescAttr(indesc, j); + if (att->attisdropped) + continue; + nincols++; + + /* Found matching column, now check type */ + if (atttypid != att->atttypid || + (atttypmod != att->atttypmod && atttypmod >= 0)) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg_internal("%s", _(msg)), + errdetail("Returned type %s does not match expected type %s in column %d.", + format_type_with_typemod(att->atttypid, + att->atttypmod), + format_type_with_typemod(atttypid, + atttypmod), + noutcols))); + attrMap->attnums[i] = (AttrNumber) (j + 1); + j++; + break; + } + if (attrMap->attnums[i] == 0) + same = false; /* we'll complain below */ + } + + /* Check for unused input columns */ + for (; j < indesc->natts; j++) + { + if (TupleDescAttr(indesc, j)->attisdropped) + continue; + nincols++; + same = false; /* we'll complain below */ + } + + /* Report column count mismatch using the non-dropped-column counts */ + if (!same) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg_internal("%s", _(msg)), + errdetail("Number of returned columns (%d) does not match " + "expected column count (%d).", + nincols, noutcols))); + + /* Check if the map has a one-to-one match */ + if (check_attrmap_match(indesc, outdesc, attrMap)) + { + /* Runtime conversion is not needed */ + free_attrmap(attrMap); + return NULL; + } + + return attrMap; +} + +/* + * build_attrmap_by_name + * + * Return a palloc'd bare attribute map for tuple conversion, matching input + * and output columns by name. (Dropped columns are ignored in both input and + * output.) This is normally a subroutine for convert_tuples_by_name in + * tupconvert.c, but can be used standalone. + */ +AttrMap * +build_attrmap_by_name(TupleDesc indesc, + TupleDesc outdesc) +{ + AttrMap *attrMap; + int outnatts; + int innatts; + int i; + int nextindesc = -1; + + outnatts = outdesc->natts; + innatts = indesc->natts; + + attrMap = make_attrmap(outnatts); + for (i = 0; i < outnatts; i++) + { + Form_pg_attribute outatt = TupleDescAttr(outdesc, i); + char *attname; + Oid atttypid; + int32 atttypmod; + int j; + + if (outatt->attisdropped) + continue; /* attrMap->attnums[i] is already 0 */ + attname = NameStr(outatt->attname); + atttypid = outatt->atttypid; + atttypmod = outatt->atttypmod; + + /* + * Now search for an attribute with the same name in the indesc. It + * seems likely that a partitioned table will have the attributes in + * the same order as the partition, so the search below is optimized + * for that case. It is possible that columns are dropped in one of + * the relations, but not the other, so we use the 'nextindesc' + * counter to track the starting point of the search. If the inner + * loop encounters dropped columns then it will have to skip over + * them, but it should leave 'nextindesc' at the correct position for + * the next outer loop. 
+ */ + for (j = 0; j < innatts; j++) + { + Form_pg_attribute inatt; + + nextindesc++; + if (nextindesc >= innatts) + nextindesc = 0; + + inatt = TupleDescAttr(indesc, nextindesc); + if (inatt->attisdropped) + continue; + if (strcmp(attname, NameStr(inatt->attname)) == 0) + { + /* Found it, check type */ + if (atttypid != inatt->atttypid || atttypmod != inatt->atttypmod) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("could not convert row type"), + errdetail("Attribute \"%s\" of type %s does not match corresponding attribute of type %s.", + attname, + format_type_be(outdesc->tdtypeid), + format_type_be(indesc->tdtypeid)))); + attrMap->attnums[i] = inatt->attnum; + break; + } + } + if (attrMap->attnums[i] == 0) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("could not convert row type"), + errdetail("Attribute \"%s\" of type %s does not exist in type %s.", + attname, + format_type_be(outdesc->tdtypeid), + format_type_be(indesc->tdtypeid)))); + } + return attrMap; +} + +/* + * build_attrmap_by_name_if_req + * + * Returns mapping created by build_attrmap_by_name, or NULL if no + * conversion is required. This is a convenience routine for + * convert_tuples_by_name() in tupconvert.c and other functions, but it + * can be used standalone. + */ +AttrMap * +build_attrmap_by_name_if_req(TupleDesc indesc, + TupleDesc outdesc) +{ + AttrMap *attrMap; + + /* Verify compatibility and prepare attribute-number map */ + attrMap = build_attrmap_by_name(indesc, outdesc); + + /* Check if the map has a one-to-one match */ + if (check_attrmap_match(indesc, outdesc, attrMap)) + { + /* Runtime conversion is not needed */ + free_attrmap(attrMap); + return NULL; + } + + return attrMap; +} + +/* + * check_attrmap_match + * + * Check to see if the map is a one-to-one match, in which case we need + * not to do a tuple conversion, and the attribute map is not necessary. + */ +static bool +check_attrmap_match(TupleDesc indesc, + TupleDesc outdesc, + AttrMap *attrMap) +{ + int i; + + /* no match if attribute numbers are not the same */ + if (indesc->natts != outdesc->natts) + return false; + + for (i = 0; i < attrMap->maplen; i++) + { + Form_pg_attribute inatt = TupleDescAttr(indesc, i); + Form_pg_attribute outatt = TupleDescAttr(outdesc, i); + + /* + * If the input column has a missing attribute, we need a conversion. + */ + if (inatt->atthasmissing) + return false; + + if (attrMap->attnums[i] == (i + 1)) + continue; + + /* + * If it's a dropped column and the corresponding input column is also + * dropped, we don't need a conversion. However, attlen and attalign + * must agree. + */ + if (attrMap->attnums[i] == 0 && + inatt->attisdropped && + inatt->attlen == outatt->attlen && + inatt->attalign == outatt->attalign) + continue; + + return false; + } + + return true; +} diff --git a/src/backend/access/common/bufmask.c b/src/backend/access/common/bufmask.c new file mode 100644 index 0000000..003a0be --- /dev/null +++ b/src/backend/access/common/bufmask.c @@ -0,0 +1,130 @@ +/*------------------------------------------------------------------------- + * + * bufmask.c + * Routines for buffer masking. Used to mask certain bits + * in a page which can be different when the WAL is generated + * and when the WAL is applied. + * + * Portions Copyright (c) 2016-2021, PostgreSQL Global Development Group + * + * Contains common routines required for masking a page. 
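 *
 * [Editorial aside, not part of the upstream patch.  The standalone sketch
 *  below illustrates the wrap-around, resume-where-we-left-off search that
 *  build_attrmap_by_name() above uses ('nextindesc'): when the two
 *  descriptors already list their columns in the same order, every column
 *  is matched without rescanning from the start.  All names are invented
 *  for the illustration.]
 */

#include <stdio.h>
#include <string.h>

/* map output columns to 1-based input column numbers (0 = no match) */
static void
map_by_name(const char *innames[], int innatts,
            const char *outnames[], int outnatts, int *map)
{
    int         next = -1;

    for (int i = 0; i < outnatts; i++)
    {
        map[i] = 0;
        for (int tries = 0; tries < innatts; tries++)
        {
            next = (next + 1) % innatts;    /* resume just after the last hit */
            if (strcmp(outnames[i], innames[next]) == 0)
            {
                map[i] = next + 1;
                break;
            }
        }
    }
}

int
main(void)
{
    const char *in[] = {"a", "b", "c"};
    const char *out[] = {"b", "c", "a"};
    int         map[3];

    map_by_name(in, 3, out, 3, map);
    for (int i = 0; i < 3; i++)
        printf("output column %d <- input column %d\n", i + 1, map[i]);
    return 0;
}

/*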
+ * + * IDENTIFICATION + * src/backend/access/common/bufmask.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/bufmask.h" + +/* + * mask_page_lsn_and_checksum + * + * In consistency checks, the LSN of the two pages compared will likely be + * different because of concurrent operations when the WAL is generated and + * the state of the page when WAL is applied. Also, mask out checksum as + * masking anything else on page means checksum is not going to match as well. + */ +void +mask_page_lsn_and_checksum(Page page) +{ + PageHeader phdr = (PageHeader) page; + + PageXLogRecPtrSet(phdr->pd_lsn, (uint64) MASK_MARKER); + phdr->pd_checksum = MASK_MARKER; +} + +/* + * mask_page_hint_bits + * + * Mask hint bits in PageHeader. We want to ignore differences in hint bits, + * since they can be set without emitting any WAL. + */ +void +mask_page_hint_bits(Page page) +{ + PageHeader phdr = (PageHeader) page; + + /* Ignore prune_xid (it's like a hint-bit) */ + phdr->pd_prune_xid = MASK_MARKER; + + /* Ignore PD_PAGE_FULL and PD_HAS_FREE_LINES flags, they are just hints. */ + PageClearFull(page); + PageClearHasFreeLinePointers(page); + + /* + * During replay, if the page LSN has advanced past our XLOG record's LSN, + * we don't mark the page all-visible. See heap_xlog_visible() for + * details. + */ + PageClearAllVisible(page); +} + +/* + * mask_unused_space + * + * Mask the unused space of a page between pd_lower and pd_upper. + */ +void +mask_unused_space(Page page) +{ + int pd_lower = ((PageHeader) page)->pd_lower; + int pd_upper = ((PageHeader) page)->pd_upper; + int pd_special = ((PageHeader) page)->pd_special; + + /* Sanity check */ + if (pd_lower > pd_upper || pd_special < pd_upper || + pd_lower < SizeOfPageHeaderData || pd_special > BLCKSZ) + { + elog(ERROR, "invalid page pd_lower %u pd_upper %u pd_special %u\n", + pd_lower, pd_upper, pd_special); + } + + memset(page + pd_lower, MASK_MARKER, pd_upper - pd_lower); +} + +/* + * mask_lp_flags + * + * In some index AMs, line pointer flags can be modified on the primary + * without emitting any WAL record. + */ +void +mask_lp_flags(Page page) +{ + OffsetNumber offnum, + maxoff; + + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemId = PageGetItemId(page, offnum); + + if (ItemIdIsUsed(itemId)) + itemId->lp_flags = LP_UNUSED; + } +} + +/* + * mask_page_content + * + * In some index AMs, the contents of deleted pages need to be almost + * completely ignored. + */ +void +mask_page_content(Page page) +{ + /* Mask Page Content */ + memset(page + SizeOfPageHeaderData, MASK_MARKER, + BLCKSZ - SizeOfPageHeaderData); + + /* Mask pd_lower and pd_upper */ + memset(&((PageHeader) page)->pd_lower, MASK_MARKER, + sizeof(uint16)); + memset(&((PageHeader) page)->pd_upper, MASK_MARKER, + sizeof(uint16)); +} diff --git a/src/backend/access/common/detoast.c b/src/backend/access/common/detoast.c new file mode 100644 index 0000000..545a6b8 --- /dev/null +++ b/src/backend/access/common/detoast.c @@ -0,0 +1,646 @@ +/*------------------------------------------------------------------------- + * + * detoast.c + * Retrieve compressed or external variable size attributes. 
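 *
 * [Editorial aside, not part of the upstream patch.  The toy code below
 *  sketches what mask_unused_space() in bufmask.c above does: the "hole"
 *  between the end of the line pointer array (pd_lower) and the start of
 *  the tuple data (pd_upper) has no defined contents, so consistency
 *  checking overwrites it with a fixed marker byte on both pages before
 *  comparing them.  The page size, offsets and marker value are invented
 *  for the illustration.]
 */

#include <stdio.h>
#include <string.h>

#define SKETCH_BLCKSZ   256
#define SKETCH_MARKER   0x7f

static void
sketch_mask_unused(char *page, int pd_lower, int pd_upper)
{
    memset(page + pd_lower, SKETCH_MARKER, pd_upper - pd_lower);
}

int
main(void)
{
    char        page[SKETCH_BLCKSZ] = "some page contents...";

    sketch_mask_unused(page, 32, 200);  /* bytes 32..199 become the marker */
    printf("byte 100 after masking: 0x%02x\n",
           (unsigned) (unsigned char) page[100]);
    return 0;
}

/*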
+ * + * Copyright (c) 2000-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/access/common/detoast.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/detoast.h" +#include "access/table.h" +#include "access/tableam.h" +#include "access/toast_internals.h" +#include "common/int.h" +#include "common/pg_lzcompress.h" +#include "utils/expandeddatum.h" +#include "utils/rel.h" + +static struct varlena *toast_fetch_datum(struct varlena *attr); +static struct varlena *toast_fetch_datum_slice(struct varlena *attr, + int32 sliceoffset, + int32 slicelength); +static struct varlena *toast_decompress_datum(struct varlena *attr); +static struct varlena *toast_decompress_datum_slice(struct varlena *attr, int32 slicelength); + +/* ---------- + * detoast_external_attr - + * + * Public entry point to get back a toasted value from + * external source (possibly still in compressed format). + * + * This will return a datum that contains all the data internally, ie, not + * relying on external storage or memory, but it can still be compressed or + * have a short header. Note some callers assume that if the input is an + * EXTERNAL datum, the result will be a pfree'able chunk. + * ---------- + */ +struct varlena * +detoast_external_attr(struct varlena *attr) +{ + struct varlena *result; + + if (VARATT_IS_EXTERNAL_ONDISK(attr)) + { + /* + * This is an external stored plain value + */ + result = toast_fetch_datum(attr); + } + else if (VARATT_IS_EXTERNAL_INDIRECT(attr)) + { + /* + * This is an indirect pointer --- dereference it + */ + struct varatt_indirect redirect; + + VARATT_EXTERNAL_GET_POINTER(redirect, attr); + attr = (struct varlena *) redirect.pointer; + + /* nested indirect Datums aren't allowed */ + Assert(!VARATT_IS_EXTERNAL_INDIRECT(attr)); + + /* recurse if value is still external in some other way */ + if (VARATT_IS_EXTERNAL(attr)) + return detoast_external_attr(attr); + + /* + * Copy into the caller's memory context, in case caller tries to + * pfree the result. + */ + result = (struct varlena *) palloc(VARSIZE_ANY(attr)); + memcpy(result, attr, VARSIZE_ANY(attr)); + } + else if (VARATT_IS_EXTERNAL_EXPANDED(attr)) + { + /* + * This is an expanded-object pointer --- get flat format + */ + ExpandedObjectHeader *eoh; + Size resultsize; + + eoh = DatumGetEOHP(PointerGetDatum(attr)); + resultsize = EOH_get_flat_size(eoh); + result = (struct varlena *) palloc(resultsize); + EOH_flatten_into(eoh, (void *) result, resultsize); + } + else + { + /* + * This is a plain value inside of the main tuple - why am I called? + */ + result = attr; + } + + return result; +} + + +/* ---------- + * detoast_attr - + * + * Public entry point to get back a toasted value from compression + * or external storage. The result is always non-extended varlena form. + * + * Note some callers assume that if the input is an EXTERNAL or COMPRESSED + * datum, the result will be a pfree'able chunk. 
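 *
 * [Editorial aside, not part of the upstream patch.  The toy program below
 *  sketches the ownership convention referred to here: when detoasting
 *  actually did work, the result is a freshly allocated copy the caller may
 *  free; when the input was already plain, the input itself may come back,
 *  so a caller that frees must compare pointers first (the same
 *  "if (tmp != attr) pfree(tmp)" pattern appears in detoast_attr_slice()
 *  below).  All names are invented for the illustration.]
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* return the input unchanged, or a fresh copy when "work" was needed */
static char *
sketch_detoast(char *datum, int needs_work)
{
    if (!needs_work)
        return datum;

    char       *copy = malloc(strlen(datum) + 1);

    strcpy(copy, datum);        /* stand-in for fetching/decompressing */
    return copy;
}

int
main(void)
{
    char        plain[] = "already plain";
    char       *result = sketch_detoast(plain, 0);

    puts(result);
    if (result != plain)        /* free only what was actually allocated */
        free(result);
    return 0;
}

/*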
+ * ---------- + */ +struct varlena * +detoast_attr(struct varlena *attr) +{ + if (VARATT_IS_EXTERNAL_ONDISK(attr)) + { + /* + * This is an externally stored datum --- fetch it back from there + */ + attr = toast_fetch_datum(attr); + /* If it's compressed, decompress it */ + if (VARATT_IS_COMPRESSED(attr)) + { + struct varlena *tmp = attr; + + attr = toast_decompress_datum(tmp); + pfree(tmp); + } + } + else if (VARATT_IS_EXTERNAL_INDIRECT(attr)) + { + /* + * This is an indirect pointer --- dereference it + */ + struct varatt_indirect redirect; + + VARATT_EXTERNAL_GET_POINTER(redirect, attr); + attr = (struct varlena *) redirect.pointer; + + /* nested indirect Datums aren't allowed */ + Assert(!VARATT_IS_EXTERNAL_INDIRECT(attr)); + + /* recurse in case value is still extended in some other way */ + attr = detoast_attr(attr); + + /* if it isn't, we'd better copy it */ + if (attr == (struct varlena *) redirect.pointer) + { + struct varlena *result; + + result = (struct varlena *) palloc(VARSIZE_ANY(attr)); + memcpy(result, attr, VARSIZE_ANY(attr)); + attr = result; + } + } + else if (VARATT_IS_EXTERNAL_EXPANDED(attr)) + { + /* + * This is an expanded-object pointer --- get flat format + */ + attr = detoast_external_attr(attr); + /* flatteners are not allowed to produce compressed/short output */ + Assert(!VARATT_IS_EXTENDED(attr)); + } + else if (VARATT_IS_COMPRESSED(attr)) + { + /* + * This is a compressed value inside of the main tuple + */ + attr = toast_decompress_datum(attr); + } + else if (VARATT_IS_SHORT(attr)) + { + /* + * This is a short-header varlena --- convert to 4-byte header format + */ + Size data_size = VARSIZE_SHORT(attr) - VARHDRSZ_SHORT; + Size new_size = data_size + VARHDRSZ; + struct varlena *new_attr; + + new_attr = (struct varlena *) palloc(new_size); + SET_VARSIZE(new_attr, new_size); + memcpy(VARDATA(new_attr), VARDATA_SHORT(attr), data_size); + attr = new_attr; + } + + return attr; +} + + +/* ---------- + * detoast_attr_slice - + * + * Public entry point to get back part of a toasted value + * from compression or external storage. + * + * sliceoffset is where to start (zero or more) + * If slicelength < 0, return everything beyond sliceoffset + * ---------- + */ +struct varlena * +detoast_attr_slice(struct varlena *attr, + int32 sliceoffset, int32 slicelength) +{ + struct varlena *preslice; + struct varlena *result; + char *attrdata; + int32 slicelimit; + int32 attrsize; + + if (sliceoffset < 0) + elog(ERROR, "invalid sliceoffset: %d", sliceoffset); + + /* + * Compute slicelimit = offset + length, or -1 if we must fetch all of the + * value. In case of integer overflow, we must fetch all. + */ + if (slicelength < 0) + slicelimit = -1; + else if (pg_add_s32_overflow(sliceoffset, slicelength, &slicelimit)) + slicelength = slicelimit = -1; + + if (VARATT_IS_EXTERNAL_ONDISK(attr)) + { + struct varatt_external toast_pointer; + + VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); + + /* fast path for non-compressed external datums */ + if (!VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer)) + return toast_fetch_datum_slice(attr, sliceoffset, slicelength); + + /* + * For compressed values, we need to fetch enough slices to decompress + * at least the requested part (when a prefix is requested). + * Otherwise, just fetch all slices. + */ + if (slicelimit >= 0) + { + int32 max_size = VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer); + + /* + * Determine maximum amount of compressed data needed for a prefix + * of a given length (after decompression). 
+ * + * At least for now, if it's LZ4 data, we'll have to fetch the + * whole thing, because there doesn't seem to be an API call to + * determine how much compressed data we need to be sure of being + * able to decompress the required slice. + */ + if (VARATT_EXTERNAL_GET_COMPRESS_METHOD(toast_pointer) == + TOAST_PGLZ_COMPRESSION_ID) + max_size = pglz_maximum_compressed_size(slicelimit, max_size); + + /* + * Fetch enough compressed slices (compressed marker will get set + * automatically). + */ + preslice = toast_fetch_datum_slice(attr, 0, max_size); + } + else + preslice = toast_fetch_datum(attr); + } + else if (VARATT_IS_EXTERNAL_INDIRECT(attr)) + { + struct varatt_indirect redirect; + + VARATT_EXTERNAL_GET_POINTER(redirect, attr); + + /* nested indirect Datums aren't allowed */ + Assert(!VARATT_IS_EXTERNAL_INDIRECT(redirect.pointer)); + + return detoast_attr_slice(redirect.pointer, + sliceoffset, slicelength); + } + else if (VARATT_IS_EXTERNAL_EXPANDED(attr)) + { + /* pass it off to detoast_external_attr to flatten */ + preslice = detoast_external_attr(attr); + } + else + preslice = attr; + + Assert(!VARATT_IS_EXTERNAL(preslice)); + + if (VARATT_IS_COMPRESSED(preslice)) + { + struct varlena *tmp = preslice; + + /* Decompress enough to encompass the slice and the offset */ + if (slicelimit >= 0) + preslice = toast_decompress_datum_slice(tmp, slicelimit); + else + preslice = toast_decompress_datum(tmp); + + if (tmp != attr) + pfree(tmp); + } + + if (VARATT_IS_SHORT(preslice)) + { + attrdata = VARDATA_SHORT(preslice); + attrsize = VARSIZE_SHORT(preslice) - VARHDRSZ_SHORT; + } + else + { + attrdata = VARDATA(preslice); + attrsize = VARSIZE(preslice) - VARHDRSZ; + } + + /* slicing of datum for compressed cases and plain value */ + + if (sliceoffset >= attrsize) + { + sliceoffset = 0; + slicelength = 0; + } + else if (slicelength < 0 || slicelimit > attrsize) + slicelength = attrsize - sliceoffset; + + result = (struct varlena *) palloc(slicelength + VARHDRSZ); + SET_VARSIZE(result, slicelength + VARHDRSZ); + + memcpy(VARDATA(result), attrdata + sliceoffset, slicelength); + + if (preslice != attr) + pfree(preslice); + + return result; +} + +/* ---------- + * toast_fetch_datum - + * + * Reconstruct an in memory Datum from the chunks saved + * in the toast relation + * ---------- + */ +static struct varlena * +toast_fetch_datum(struct varlena *attr) +{ + Relation toastrel; + struct varlena *result; + struct varatt_external toast_pointer; + int32 attrsize; + + if (!VARATT_IS_EXTERNAL_ONDISK(attr)) + elog(ERROR, "toast_fetch_datum shouldn't be called for non-ondisk datums"); + + /* Must copy to access aligned fields */ + VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); + + attrsize = VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer); + + result = (struct varlena *) palloc(attrsize + VARHDRSZ); + + if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer)) + SET_VARSIZE_COMPRESSED(result, attrsize + VARHDRSZ); + else + SET_VARSIZE(result, attrsize + VARHDRSZ); + + if (attrsize == 0) + return result; /* Probably shouldn't happen, but just in + * case. 
*/ + + /* + * Open the toast relation and its indexes + */ + toastrel = table_open(toast_pointer.va_toastrelid, AccessShareLock); + + /* Fetch all chunks */ + table_relation_fetch_toast_slice(toastrel, toast_pointer.va_valueid, + attrsize, 0, attrsize, result); + + /* Close toast table */ + table_close(toastrel, AccessShareLock); + + return result; +} + +/* ---------- + * toast_fetch_datum_slice - + * + * Reconstruct a segment of a Datum from the chunks saved + * in the toast relation + * + * Note that this function supports non-compressed external datums + * and compressed external datums (in which case the requested slice + * has to be a prefix, i.e. sliceoffset has to be 0). + * ---------- + */ +static struct varlena * +toast_fetch_datum_slice(struct varlena *attr, int32 sliceoffset, + int32 slicelength) +{ + Relation toastrel; + struct varlena *result; + struct varatt_external toast_pointer; + int32 attrsize; + + if (!VARATT_IS_EXTERNAL_ONDISK(attr)) + elog(ERROR, "toast_fetch_datum_slice shouldn't be called for non-ondisk datums"); + + /* Must copy to access aligned fields */ + VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); + + /* + * It's nonsense to fetch slices of a compressed datum unless when it's a + * prefix -- this isn't lo_* we can't return a compressed datum which is + * meaningful to toast later. + */ + Assert(!VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer) || 0 == sliceoffset); + + attrsize = VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer); + + if (sliceoffset >= attrsize) + { + sliceoffset = 0; + slicelength = 0; + } + + /* + * When fetching a prefix of a compressed external datum, account for the + * space required by va_tcinfo, which is stored at the beginning as an + * int32 value. + */ + if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer) && slicelength > 0) + slicelength = slicelength + sizeof(int32); + + /* + * Adjust length request if needed. (Note: our sole caller, + * detoast_attr_slice, protects us against sliceoffset + slicelength + * overflowing.) + */ + if (((sliceoffset + slicelength) > attrsize) || slicelength < 0) + slicelength = attrsize - sliceoffset; + + result = (struct varlena *) palloc(slicelength + VARHDRSZ); + + if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer)) + SET_VARSIZE_COMPRESSED(result, slicelength + VARHDRSZ); + else + SET_VARSIZE(result, slicelength + VARHDRSZ); + + if (slicelength == 0) + return result; /* Can save a lot of work at this point! */ + + /* Open the toast relation */ + toastrel = table_open(toast_pointer.va_toastrelid, AccessShareLock); + + /* Fetch all chunks */ + table_relation_fetch_toast_slice(toastrel, toast_pointer.va_valueid, + attrsize, sliceoffset, slicelength, + result); + + /* Close toast table */ + table_close(toastrel, AccessShareLock); + + return result; +} + +/* ---------- + * toast_decompress_datum - + * + * Decompress a compressed version of a varlena datum + */ +static struct varlena * +toast_decompress_datum(struct varlena *attr) +{ + ToastCompressionId cmid; + + Assert(VARATT_IS_COMPRESSED(attr)); + + /* + * Fetch the compression method id stored in the compression header and + * decompress the data using the appropriate decompression routine. 
+ */ + cmid = TOAST_COMPRESS_METHOD(attr); + switch (cmid) + { + case TOAST_PGLZ_COMPRESSION_ID: + return pglz_decompress_datum(attr); + case TOAST_LZ4_COMPRESSION_ID: + return lz4_decompress_datum(attr); + default: + elog(ERROR, "invalid compression method id %d", cmid); + return NULL; /* keep compiler quiet */ + } +} + + +/* ---------- + * toast_decompress_datum_slice - + * + * Decompress the front of a compressed version of a varlena datum. + * offset handling happens in detoast_attr_slice. + * Here we just decompress a slice from the front. + */ +static struct varlena * +toast_decompress_datum_slice(struct varlena *attr, int32 slicelength) +{ + ToastCompressionId cmid; + + Assert(VARATT_IS_COMPRESSED(attr)); + + /* + * Some callers may pass a slicelength that's more than the actual + * decompressed size. If so, just decompress normally. This avoids + * possibly allocating a larger-than-necessary result object, and may be + * faster and/or more robust as well. Notably, some versions of liblz4 + * have been seen to give wrong results if passed an output size that is + * more than the data's true decompressed size. + */ + if ((uint32) slicelength >= TOAST_COMPRESS_EXTSIZE(attr)) + return toast_decompress_datum(attr); + + /* + * Fetch the compression method id stored in the compression header and + * decompress the data slice using the appropriate decompression routine. + */ + cmid = TOAST_COMPRESS_METHOD(attr); + switch (cmid) + { + case TOAST_PGLZ_COMPRESSION_ID: + return pglz_decompress_datum_slice(attr, slicelength); + case TOAST_LZ4_COMPRESSION_ID: + return lz4_decompress_datum_slice(attr, slicelength); + default: + elog(ERROR, "invalid compression method id %d", cmid); + return NULL; /* keep compiler quiet */ + } +} + +/* ---------- + * toast_raw_datum_size - + * + * Return the raw (detoasted) size of a varlena datum + * (including the VARHDRSZ header) + * ---------- + */ +Size +toast_raw_datum_size(Datum value) +{ + struct varlena *attr = (struct varlena *) DatumGetPointer(value); + Size result; + + if (VARATT_IS_EXTERNAL_ONDISK(attr)) + { + /* va_rawsize is the size of the original datum -- including header */ + struct varatt_external toast_pointer; + + VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); + result = toast_pointer.va_rawsize; + } + else if (VARATT_IS_EXTERNAL_INDIRECT(attr)) + { + struct varatt_indirect toast_pointer; + + VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); + + /* nested indirect Datums aren't allowed */ + Assert(!VARATT_IS_EXTERNAL_INDIRECT(toast_pointer.pointer)); + + return toast_raw_datum_size(PointerGetDatum(toast_pointer.pointer)); + } + else if (VARATT_IS_EXTERNAL_EXPANDED(attr)) + { + result = EOH_get_flat_size(DatumGetEOHP(value)); + } + else if (VARATT_IS_COMPRESSED(attr)) + { + /* here, va_rawsize is just the payload size */ + result = VARDATA_COMPRESSED_GET_EXTSIZE(attr) + VARHDRSZ; + } + else if (VARATT_IS_SHORT(attr)) + { + /* + * we have to normalize the header length to VARHDRSZ or else the + * callers of this function will be confused. 
+ */ + result = VARSIZE_SHORT(attr) - VARHDRSZ_SHORT + VARHDRSZ; + } + else + { + /* plain untoasted datum */ + result = VARSIZE(attr); + } + return result; +} + +/* ---------- + * toast_datum_size + * + * Return the physical storage size (possibly compressed) of a varlena datum + * ---------- + */ +Size +toast_datum_size(Datum value) +{ + struct varlena *attr = (struct varlena *) DatumGetPointer(value); + Size result; + + if (VARATT_IS_EXTERNAL_ONDISK(attr)) + { + /* + * Attribute is stored externally - return the extsize whether + * compressed or not. We do not count the size of the toast pointer + * ... should we? + */ + struct varatt_external toast_pointer; + + VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); + result = VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer); + } + else if (VARATT_IS_EXTERNAL_INDIRECT(attr)) + { + struct varatt_indirect toast_pointer; + + VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); + + /* nested indirect Datums aren't allowed */ + Assert(!VARATT_IS_EXTERNAL_INDIRECT(attr)); + + return toast_datum_size(PointerGetDatum(toast_pointer.pointer)); + } + else if (VARATT_IS_EXTERNAL_EXPANDED(attr)) + { + result = EOH_get_flat_size(DatumGetEOHP(value)); + } + else if (VARATT_IS_SHORT(attr)) + { + result = VARSIZE_SHORT(attr); + } + else + { + /* + * Attribute is stored inline either compressed or not, just calculate + * the size of the datum in either case. + */ + result = VARSIZE(attr); + } + return result; +} diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c new file mode 100644 index 0000000..0b56b0f --- /dev/null +++ b/src/backend/access/common/heaptuple.c @@ -0,0 +1,1501 @@ +/*------------------------------------------------------------------------- + * + * heaptuple.c + * This file contains heap tuple accessor and mutator routines, as well + * as various tuple utilities. + * + * Some notes about varlenas and this code: + * + * Before Postgres 8.3 varlenas always had a 4-byte length header, and + * therefore always needed 4-byte alignment (at least). This wasted space + * for short varlenas, for example CHAR(1) took 5 bytes and could need up to + * 3 additional padding bytes for alignment. + * + * Now, a short varlena (up to 126 data bytes) is reduced to a 1-byte header + * and we don't align it. To hide this from datatype-specific functions that + * don't want to deal with it, such a datum is considered "toasted" and will + * be expanded back to the normal 4-byte-header format by pg_detoast_datum. + * (In performance-critical code paths we can use pg_detoast_datum_packed + * and the appropriate access macros to avoid that overhead.) Note that this + * conversion is performed directly in heap_form_tuple, without invoking + * heaptoast.c. + * + * This change will break any code that assumes it needn't detoast values + * that have been put into a tuple but never sent to disk. Hopefully there + * are few such places. + * + * Varlenas still have alignment INT (or DOUBLE) in pg_type/pg_attribute, since + * that's the normal requirement for the untoasted format. But we ignore that + * for the 1-byte-header format. This means that the actual start position + * of a varlena datum may vary depending on which format it has. To determine + * what is stored, we have to require that alignment padding bytes be zero. + * (Postgres actually has always zeroed them, but now it's required!) 
Since + * the first byte of a 1-byte-header varlena can never be zero, we can examine + * the first byte after the previous datum to tell if it's a pad byte or the + * start of a 1-byte-header varlena. + * + * Note that while formerly we could rely on the first varlena column of a + * system catalog to be at the offset suggested by the C struct for the + * catalog, this is now risky: it's only safe if the preceding field is + * word-aligned, so that there will never be any padding. + * + * We don't pack varlenas whose attstorage is PLAIN, since the data type + * isn't expecting to have to detoast values. This is used in particular + * by oidvector and int2vector, which are used in the system catalogs + * and we'd like to still refer to them via C struct offsets. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/common/heaptuple.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/heaptoast.h" +#include "access/sysattr.h" +#include "access/tupdesc_details.h" +#include "executor/tuptable.h" +#include "utils/expandeddatum.h" + + +/* Does att's datatype allow packing into the 1-byte-header varlena format? */ +#define ATT_IS_PACKABLE(att) \ + ((att)->attlen == -1 && (att)->attstorage != TYPSTORAGE_PLAIN) +/* Use this if it's already known varlena */ +#define VARLENA_ATT_IS_PACKABLE(att) \ + ((att)->attstorage != TYPSTORAGE_PLAIN) + + +/* ---------------------------------------------------------------- + * misc support routines + * ---------------------------------------------------------------- + */ + +/* + * Return the missing value of an attribute, or NULL if there isn't one. 
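 *
 * [Editorial aside, not part of the upstream patch.  The toy program below
 *  sketches the idea behind the missing-value machinery: a tuple written
 *  before a column was added (with a non-volatile default) physically
 *  stores fewer attributes than the current descriptor, and a read of one
 *  of the absent columns falls back to the per-column "missing" value, or
 *  to NULL when there is none.  All names and values are invented for the
 *  illustration.]
 */

#include <stdbool.h>
#include <stdio.h>

typedef struct
{
    bool        has_missing;
    int         missing_value;
} ColDefaultSketch;

static int
sketch_getattr(const int *stored, int natts_stored,
               const ColDefaultSketch *defs, int attnum, bool *isnull)
{
    if (attnum <= natts_stored)
    {
        *isnull = false;
        return stored[attnum - 1];      /* physically present in the tuple */
    }
    if (defs[attnum - 1].has_missing)
    {
        *isnull = false;
        return defs[attnum - 1].missing_value;  /* added-column default */
    }
    *isnull = true;
    return 0;
}

int
main(void)
{
    int         stored[] = {10, 20};    /* tuple written with two columns */
    ColDefaultSketch defs[] = {{false, 0}, {false, 0}, {true, 42}};
    bool        isnull;
    int         val = sketch_getattr(stored, 2, defs, 3, &isnull);

    printf("column 3 reads as %d (null = %s)\n", val, isnull ? "yes" : "no");
    return 0;
}

/*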
+ */ +Datum +getmissingattr(TupleDesc tupleDesc, + int attnum, bool *isnull) +{ + Form_pg_attribute att; + + Assert(attnum <= tupleDesc->natts); + Assert(attnum > 0); + + att = TupleDescAttr(tupleDesc, attnum - 1); + + if (att->atthasmissing) + { + AttrMissing *attrmiss; + + Assert(tupleDesc->constr); + Assert(tupleDesc->constr->missing); + + attrmiss = tupleDesc->constr->missing + (attnum - 1); + + if (attrmiss->am_present) + { + *isnull = false; + return attrmiss->am_value; + } + } + + *isnull = true; + return PointerGetDatum(NULL); +} + +/* + * heap_compute_data_size + * Determine size of the data area of a tuple to be constructed + */ +Size +heap_compute_data_size(TupleDesc tupleDesc, + Datum *values, + bool *isnull) +{ + Size data_length = 0; + int i; + int numberOfAttributes = tupleDesc->natts; + + for (i = 0; i < numberOfAttributes; i++) + { + Datum val; + Form_pg_attribute atti; + + if (isnull[i]) + continue; + + val = values[i]; + atti = TupleDescAttr(tupleDesc, i); + + if (ATT_IS_PACKABLE(atti) && + VARATT_CAN_MAKE_SHORT(DatumGetPointer(val))) + { + /* + * we're anticipating converting to a short varlena header, so + * adjust length and don't count any alignment + */ + data_length += VARATT_CONVERTED_SHORT_SIZE(DatumGetPointer(val)); + } + else if (atti->attlen == -1 && + VARATT_IS_EXTERNAL_EXPANDED(DatumGetPointer(val))) + { + /* + * we want to flatten the expanded value so that the constructed + * tuple doesn't depend on it + */ + data_length = att_align_nominal(data_length, atti->attalign); + data_length += EOH_get_flat_size(DatumGetEOHP(val)); + } + else + { + data_length = att_align_datum(data_length, atti->attalign, + atti->attlen, val); + data_length = att_addlength_datum(data_length, atti->attlen, + val); + } + } + + return data_length; +} + +/* + * Per-attribute helper for heap_fill_tuple and other routines building tuples. + * + * Fill in either a data value or a bit in the null bitmask + */ +static inline void +fill_val(Form_pg_attribute att, + bits8 **bit, + int *bitmask, + char **dataP, + uint16 *infomask, + Datum datum, + bool isnull) +{ + Size data_length; + char *data = *dataP; + + /* + * If we're building a null bitmap, set the appropriate bit for the + * current column value here. + */ + if (bit != NULL) + { + if (*bitmask != HIGHBIT) + *bitmask <<= 1; + else + { + *bit += 1; + **bit = 0x0; + *bitmask = 1; + } + + if (isnull) + { + *infomask |= HEAP_HASNULL; + return; + } + + **bit |= *bitmask; + } + + /* + * XXX we use the att_align macros on the pointer value itself, not on an + * offset. This is a bit of a hack. 
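 *
 * [Editorial aside, not part of the upstream patch: the trick works because
 *  the align macros are, in the usual implementation, just "round up to the
 *  next multiple of a power-of-two alignment", which is equally valid for a
 *  byte offset and for a raw pointer value.  The disabled fragment below is
 *  an invented illustration, not live code.]
 */
#if 0
    /* 8-byte alignment of an offset and of the current write pointer */
    uintptr_t   off13 = (13 + 7) & ~(uintptr_t) 7;  /* 13 -> 16 */
    char       *p8 = (char *) (((uintptr_t) data + 7) & ~(uintptr_t) 7);
#endif
    /*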
+ */ + if (att->attbyval) + { + /* pass-by-value */ + data = (char *) att_align_nominal(data, att->attalign); + store_att_byval(data, datum, att->attlen); + data_length = att->attlen; + } + else if (att->attlen == -1) + { + /* varlena */ + Pointer val = DatumGetPointer(datum); + + *infomask |= HEAP_HASVARWIDTH; + if (VARATT_IS_EXTERNAL(val)) + { + if (VARATT_IS_EXTERNAL_EXPANDED(val)) + { + /* + * we want to flatten the expanded value so that the + * constructed tuple doesn't depend on it + */ + ExpandedObjectHeader *eoh = DatumGetEOHP(datum); + + data = (char *) att_align_nominal(data, + att->attalign); + data_length = EOH_get_flat_size(eoh); + EOH_flatten_into(eoh, data, data_length); + } + else + { + *infomask |= HEAP_HASEXTERNAL; + /* no alignment, since it's short by definition */ + data_length = VARSIZE_EXTERNAL(val); + memcpy(data, val, data_length); + } + } + else if (VARATT_IS_SHORT(val)) + { + /* no alignment for short varlenas */ + data_length = VARSIZE_SHORT(val); + memcpy(data, val, data_length); + } + else if (VARLENA_ATT_IS_PACKABLE(att) && + VARATT_CAN_MAKE_SHORT(val)) + { + /* convert to short varlena -- no alignment */ + data_length = VARATT_CONVERTED_SHORT_SIZE(val); + SET_VARSIZE_SHORT(data, data_length); + memcpy(data + 1, VARDATA(val), data_length - 1); + } + else + { + /* full 4-byte header varlena */ + data = (char *) att_align_nominal(data, + att->attalign); + data_length = VARSIZE(val); + memcpy(data, val, data_length); + } + } + else if (att->attlen == -2) + { + /* cstring ... never needs alignment */ + *infomask |= HEAP_HASVARWIDTH; + Assert(att->attalign == TYPALIGN_CHAR); + data_length = strlen(DatumGetCString(datum)) + 1; + memcpy(data, DatumGetPointer(datum), data_length); + } + else + { + /* fixed-length pass-by-reference */ + data = (char *) att_align_nominal(data, att->attalign); + Assert(att->attlen > 0); + data_length = att->attlen; + memcpy(data, DatumGetPointer(datum), data_length); + } + + data += data_length; + *dataP = data; +} + +/* + * heap_fill_tuple + * Load data portion of a tuple from values/isnull arrays + * + * We also fill the null bitmap (if any) and set the infomask bits + * that reflect the tuple's data contents. + * + * NOTE: it is now REQUIRED that the caller have pre-zeroed the data area. + */ +void +heap_fill_tuple(TupleDesc tupleDesc, + Datum *values, bool *isnull, + char *data, Size data_size, + uint16 *infomask, bits8 *bit) +{ + bits8 *bitP; + int bitmask; + int i; + int numberOfAttributes = tupleDesc->natts; + +#ifdef USE_ASSERT_CHECKING + char *start = data; +#endif + + if (bit != NULL) + { + bitP = &bit[-1]; + bitmask = HIGHBIT; + } + else + { + /* just to keep compiler quiet */ + bitP = NULL; + bitmask = 0; + } + + *infomask &= ~(HEAP_HASNULL | HEAP_HASVARWIDTH | HEAP_HASEXTERNAL); + + for (i = 0; i < numberOfAttributes; i++) + { + Form_pg_attribute attr = TupleDescAttr(tupleDesc, i); + + fill_val(attr, + bitP ? &bitP : NULL, + &bitmask, + &data, + infomask, + values ? values[i] : PointerGetDatum(NULL), + isnull ? 
isnull[i] : true); + } + + Assert((data - start) == data_size); +} + + +/* ---------------------------------------------------------------- + * heap tuple interface + * ---------------------------------------------------------------- + */ + +/* ---------------- + * heap_attisnull - returns true iff tuple attribute is not present + * ---------------- + */ +bool +heap_attisnull(HeapTuple tup, int attnum, TupleDesc tupleDesc) +{ + /* + * We allow a NULL tupledesc for relations not expected to have missing + * values, such as catalog relations and indexes. + */ + Assert(!tupleDesc || attnum <= tupleDesc->natts); + if (attnum > (int) HeapTupleHeaderGetNatts(tup->t_data)) + { + if (tupleDesc && TupleDescAttr(tupleDesc, attnum - 1)->atthasmissing) + return false; + else + return true; + } + + if (attnum > 0) + { + if (HeapTupleNoNulls(tup)) + return false; + return att_isnull(attnum - 1, tup->t_data->t_bits); + } + + switch (attnum) + { + case TableOidAttributeNumber: + case SelfItemPointerAttributeNumber: + case MinTransactionIdAttributeNumber: + case MinCommandIdAttributeNumber: + case MaxTransactionIdAttributeNumber: + case MaxCommandIdAttributeNumber: + /* these are never null */ + break; + + default: + elog(ERROR, "invalid attnum: %d", attnum); + } + + return false; +} + +/* ---------------- + * nocachegetattr + * + * This only gets called from fastgetattr() macro, in cases where + * we can't use a cacheoffset and the value is not null. + * + * This caches attribute offsets in the attribute descriptor. + * + * An alternative way to speed things up would be to cache offsets + * with the tuple, but that seems more difficult unless you take + * the storage hit of actually putting those offsets into the + * tuple you send to disk. Yuck. + * + * This scheme will be slightly slower than that, but should + * perform well for queries which hit large #'s of tuples. After + * you cache the offsets once, examining all the other tuples using + * the same attribute descriptor will go much quicker. -cim 5/4/91 + * + * NOTE: if you need to change this code, see also heap_deform_tuple. + * Also see nocache_index_getattr, which is the same code for index + * tuples. + * ---------------- + */ +Datum +nocachegetattr(HeapTuple tuple, + int attnum, + TupleDesc tupleDesc) +{ + HeapTupleHeader tup = tuple->t_data; + char *tp; /* ptr to data part of tuple */ + bits8 *bp = tup->t_bits; /* ptr to null bitmap in tuple */ + bool slow = false; /* do we have to walk attrs? */ + int off; /* current offset within data */ + + /* ---------------- + * Three cases: + * + * 1: No nulls and no variable-width attributes. + * 2: Has a null or a var-width AFTER att. + * 3: Has nulls or var-widths BEFORE att. + * ---------------- + */ + + attnum--; + + if (!HeapTupleNoNulls(tuple)) + { + /* + * there's a null somewhere in the tuple + * + * check to see if any preceding bits are null... + */ + int byte = attnum >> 3; + int finalbit = attnum & 0x07; + + /* check for nulls "before" final bit of last byte */ + if ((~bp[byte]) & ((1 << finalbit) - 1)) + slow = true; + else + { + /* check for nulls in any "earlier" bytes */ + int i; + + for (i = 0; i < byte; i++) + { + if (bp[i] != 0xFF) + { + slow = true; + break; + } + } + } + } + + tp = (char *) tup + tup->t_hoff; + + if (!slow) + { + Form_pg_attribute att; + + /* + * If we get here, there are no nulls up to and including the target + * attribute. If we have a cached offset, we can use it. 
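+		 *
+		 * (A negative attcacheoff means no offset has been cached yet; the
+		 * code below fills it in where that is possible.)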
+ */ + att = TupleDescAttr(tupleDesc, attnum); + if (att->attcacheoff >= 0) + return fetchatt(att, tp + att->attcacheoff); + + /* + * Otherwise, check for non-fixed-length attrs up to and including + * target. If there aren't any, it's safe to cheaply initialize the + * cached offsets for these attrs. + */ + if (HeapTupleHasVarWidth(tuple)) + { + int j; + + for (j = 0; j <= attnum; j++) + { + if (TupleDescAttr(tupleDesc, j)->attlen <= 0) + { + slow = true; + break; + } + } + } + } + + if (!slow) + { + int natts = tupleDesc->natts; + int j = 1; + + /* + * If we get here, we have a tuple with no nulls or var-widths up to + * and including the target attribute, so we can use the cached offset + * ... only we don't have it yet, or we'd not have got here. Since + * it's cheap to compute offsets for fixed-width columns, we take the + * opportunity to initialize the cached offsets for *all* the leading + * fixed-width columns, in hope of avoiding future visits to this + * routine. + */ + TupleDescAttr(tupleDesc, 0)->attcacheoff = 0; + + /* we might have set some offsets in the slow path previously */ + while (j < natts && TupleDescAttr(tupleDesc, j)->attcacheoff > 0) + j++; + + off = TupleDescAttr(tupleDesc, j - 1)->attcacheoff + + TupleDescAttr(tupleDesc, j - 1)->attlen; + + for (; j < natts; j++) + { + Form_pg_attribute att = TupleDescAttr(tupleDesc, j); + + if (att->attlen <= 0) + break; + + off = att_align_nominal(off, att->attalign); + + att->attcacheoff = off; + + off += att->attlen; + } + + Assert(j > attnum); + + off = TupleDescAttr(tupleDesc, attnum)->attcacheoff; + } + else + { + bool usecache = true; + int i; + + /* + * Now we know that we have to walk the tuple CAREFULLY. But we still + * might be able to cache some offsets for next time. + * + * Note - This loop is a little tricky. For each non-null attribute, + * we have to first account for alignment padding before the attr, + * then advance over the attr based on its length. Nulls have no + * storage and no alignment padding either. We can use/set + * attcacheoff until we reach either a null or a var-width attribute. + */ + off = 0; + for (i = 0;; i++) /* loop exit is at "break" */ + { + Form_pg_attribute att = TupleDescAttr(tupleDesc, i); + + if (HeapTupleHasNulls(tuple) && att_isnull(i, bp)) + { + usecache = false; + continue; /* this cannot be the target att */ + } + + /* If we know the next offset, we can skip the rest */ + if (usecache && att->attcacheoff >= 0) + off = att->attcacheoff; + else if (att->attlen == -1) + { + /* + * We can only cache the offset for a varlena attribute if the + * offset is already suitably aligned, so that there would be + * no pad bytes in any case: then the offset will be valid for + * either an aligned or unaligned value. + */ + if (usecache && + off == att_align_nominal(off, att->attalign)) + att->attcacheoff = off; + else + { + off = att_align_pointer(off, att->attalign, -1, + tp + off); + usecache = false; + } + } + else + { + /* not varlena, so safe to use att_align_nominal */ + off = att_align_nominal(off, att->attalign); + + if (usecache) + att->attcacheoff = off; + } + + if (i == attnum) + break; + + off = att_addlength_pointer(off, att->attlen, tp + off); + + if (usecache && att->attlen <= 0) + usecache = false; + } + } + + return fetchatt(TupleDescAttr(tupleDesc, attnum), tp + off); +} + +/* ---------------- + * heap_getsysattr + * + * Fetch the value of a system attribute for a tuple. + * + * This is a support routine for the heap_getattr macro. 
The macro + * has already determined that the attnum refers to a system attribute. + * ---------------- + */ +Datum +heap_getsysattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull) +{ + Datum result; + + Assert(tup); + + /* Currently, no sys attribute ever reads as NULL. */ + *isnull = false; + + switch (attnum) + { + case SelfItemPointerAttributeNumber: + /* pass-by-reference datatype */ + result = PointerGetDatum(&(tup->t_self)); + break; + case MinTransactionIdAttributeNumber: + result = TransactionIdGetDatum(HeapTupleHeaderGetRawXmin(tup->t_data)); + break; + case MaxTransactionIdAttributeNumber: + result = TransactionIdGetDatum(HeapTupleHeaderGetRawXmax(tup->t_data)); + break; + case MinCommandIdAttributeNumber: + case MaxCommandIdAttributeNumber: + + /* + * cmin and cmax are now both aliases for the same field, which + * can in fact also be a combo command id. XXX perhaps we should + * return the "real" cmin or cmax if possible, that is if we are + * inside the originating transaction? + */ + result = CommandIdGetDatum(HeapTupleHeaderGetRawCommandId(tup->t_data)); + break; + case TableOidAttributeNumber: + result = ObjectIdGetDatum(tup->t_tableOid); + break; + default: + elog(ERROR, "invalid attnum: %d", attnum); + result = 0; /* keep compiler quiet */ + break; + } + return result; +} + +/* ---------------- + * heap_copytuple + * + * returns a copy of an entire tuple + * + * The HeapTuple struct, tuple header, and tuple data are all allocated + * as a single palloc() block. + * ---------------- + */ +HeapTuple +heap_copytuple(HeapTuple tuple) +{ + HeapTuple newTuple; + + if (!HeapTupleIsValid(tuple) || tuple->t_data == NULL) + return NULL; + + newTuple = (HeapTuple) palloc(HEAPTUPLESIZE + tuple->t_len); + newTuple->t_len = tuple->t_len; + newTuple->t_self = tuple->t_self; + newTuple->t_tableOid = tuple->t_tableOid; + newTuple->t_data = (HeapTupleHeader) ((char *) newTuple + HEAPTUPLESIZE); + memcpy((char *) newTuple->t_data, (char *) tuple->t_data, tuple->t_len); + return newTuple; +} + +/* ---------------- + * heap_copytuple_with_tuple + * + * copy a tuple into a caller-supplied HeapTuple management struct + * + * Note that after calling this function, the "dest" HeapTuple will not be + * allocated as a single palloc() block (unlike with heap_copytuple()). + * ---------------- + */ +void +heap_copytuple_with_tuple(HeapTuple src, HeapTuple dest) +{ + if (!HeapTupleIsValid(src) || src->t_data == NULL) + { + dest->t_data = NULL; + return; + } + + dest->t_len = src->t_len; + dest->t_self = src->t_self; + dest->t_tableOid = src->t_tableOid; + dest->t_data = (HeapTupleHeader) palloc(src->t_len); + memcpy((char *) dest->t_data, (char *) src->t_data, src->t_len); +} + +/* + * Expand a tuple which has fewer attributes than required. For each attribute + * not present in the sourceTuple, if there is a missing value that will be + * used. Otherwise the attribute will be set to NULL. + * + * The source tuple must have fewer attributes than the required number. + * + * Only one of targetHeapTuple and targetMinimalTuple may be supplied. The + * other argument must be NULL. 
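+ *
+ * Callers normally reach this through the heap_expand_tuple() and
+ * minimal_expand_tuple() wrappers below.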
+ */ +static void +expand_tuple(HeapTuple *targetHeapTuple, + MinimalTuple *targetMinimalTuple, + HeapTuple sourceTuple, + TupleDesc tupleDesc) +{ + AttrMissing *attrmiss = NULL; + int attnum; + int firstmissingnum; + bool hasNulls = HeapTupleHasNulls(sourceTuple); + HeapTupleHeader targetTHeader; + HeapTupleHeader sourceTHeader = sourceTuple->t_data; + int sourceNatts = HeapTupleHeaderGetNatts(sourceTHeader); + int natts = tupleDesc->natts; + int sourceNullLen; + int targetNullLen; + Size sourceDataLen = sourceTuple->t_len - sourceTHeader->t_hoff; + Size targetDataLen; + Size len; + int hoff; + bits8 *nullBits = NULL; + int bitMask = 0; + char *targetData; + uint16 *infoMask; + + Assert((targetHeapTuple && !targetMinimalTuple) + || (!targetHeapTuple && targetMinimalTuple)); + + Assert(sourceNatts < natts); + + sourceNullLen = (hasNulls ? BITMAPLEN(sourceNatts) : 0); + + targetDataLen = sourceDataLen; + + if (tupleDesc->constr && + tupleDesc->constr->missing) + { + /* + * If there are missing values we want to put them into the tuple. + * Before that we have to compute the extra length for the values + * array and the variable length data. + */ + attrmiss = tupleDesc->constr->missing; + + /* + * Find the first item in attrmiss for which we don't have a value in + * the source. We can ignore all the missing entries before that. + */ + for (firstmissingnum = sourceNatts; + firstmissingnum < natts; + firstmissingnum++) + { + if (attrmiss[firstmissingnum].am_present) + break; + else + hasNulls = true; + } + + /* + * Now walk the missing attributes. If there is a missing value make + * space for it. Otherwise, it's going to be NULL. + */ + for (attnum = firstmissingnum; + attnum < natts; + attnum++) + { + if (attrmiss[attnum].am_present) + { + Form_pg_attribute att = TupleDescAttr(tupleDesc, attnum); + + targetDataLen = att_align_datum(targetDataLen, + att->attalign, + att->attlen, + attrmiss[attnum].am_value); + + targetDataLen = att_addlength_pointer(targetDataLen, + att->attlen, + attrmiss[attnum].am_value); + } + else + { + /* no missing value, so it must be null */ + hasNulls = true; + } + } + } /* end if have missing values */ + else + { + /* + * If there are no missing values at all then NULLS must be allowed, + * since some of the attributes are known to be absent. + */ + hasNulls = true; + } + + len = 0; + + if (hasNulls) + { + targetNullLen = BITMAPLEN(natts); + len += targetNullLen; + } + else + targetNullLen = 0; + + /* + * Allocate and zero the space needed. Note that the tuple body and + * HeapTupleData management structure are allocated in one chunk. 
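+	 *
+	 * For the HeapTuple case, the single chunk laid out below holds the
+	 * HeapTupleData struct, then the header (including any null bitmap),
+	 * then the MAXALIGN'd user data area.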
+ */ + if (targetHeapTuple) + { + len += offsetof(HeapTupleHeaderData, t_bits); + hoff = len = MAXALIGN(len); /* align user data safely */ + len += targetDataLen; + + *targetHeapTuple = (HeapTuple) palloc0(HEAPTUPLESIZE + len); + (*targetHeapTuple)->t_data + = targetTHeader + = (HeapTupleHeader) ((char *) *targetHeapTuple + HEAPTUPLESIZE); + (*targetHeapTuple)->t_len = len; + (*targetHeapTuple)->t_tableOid = sourceTuple->t_tableOid; + (*targetHeapTuple)->t_self = sourceTuple->t_self; + + targetTHeader->t_infomask = sourceTHeader->t_infomask; + targetTHeader->t_hoff = hoff; + HeapTupleHeaderSetNatts(targetTHeader, natts); + HeapTupleHeaderSetDatumLength(targetTHeader, len); + HeapTupleHeaderSetTypeId(targetTHeader, tupleDesc->tdtypeid); + HeapTupleHeaderSetTypMod(targetTHeader, tupleDesc->tdtypmod); + /* We also make sure that t_ctid is invalid unless explicitly set */ + ItemPointerSetInvalid(&(targetTHeader->t_ctid)); + if (targetNullLen > 0) + nullBits = (bits8 *) ((char *) (*targetHeapTuple)->t_data + + offsetof(HeapTupleHeaderData, t_bits)); + targetData = (char *) (*targetHeapTuple)->t_data + hoff; + infoMask = &(targetTHeader->t_infomask); + } + else + { + len += SizeofMinimalTupleHeader; + hoff = len = MAXALIGN(len); /* align user data safely */ + len += targetDataLen; + + *targetMinimalTuple = (MinimalTuple) palloc0(len); + (*targetMinimalTuple)->t_len = len; + (*targetMinimalTuple)->t_hoff = hoff + MINIMAL_TUPLE_OFFSET; + (*targetMinimalTuple)->t_infomask = sourceTHeader->t_infomask; + /* Same macro works for MinimalTuples */ + HeapTupleHeaderSetNatts(*targetMinimalTuple, natts); + if (targetNullLen > 0) + nullBits = (bits8 *) ((char *) *targetMinimalTuple + + offsetof(MinimalTupleData, t_bits)); + targetData = (char *) *targetMinimalTuple + hoff; + infoMask = &((*targetMinimalTuple)->t_infomask); + } + + if (targetNullLen > 0) + { + if (sourceNullLen > 0) + { + /* if bitmap pre-existed copy in - all is set */ + memcpy(nullBits, + ((char *) sourceTHeader) + + offsetof(HeapTupleHeaderData, t_bits), + sourceNullLen); + nullBits += sourceNullLen - 1; + } + else + { + sourceNullLen = BITMAPLEN(sourceNatts); + /* Set NOT NULL for all existing attributes */ + memset(nullBits, 0xff, sourceNullLen); + + nullBits += sourceNullLen - 1; + + if (sourceNatts & 0x07) + { + /* build the mask (inverted!) */ + bitMask = 0xff << (sourceNatts & 0x07); + /* Voila */ + *nullBits = ~bitMask; + } + } + + bitMask = (1 << ((sourceNatts - 1) & 0x07)); + } /* End if have null bitmap */ + + memcpy(targetData, + ((char *) sourceTuple->t_data) + sourceTHeader->t_hoff, + sourceDataLen); + + targetData += sourceDataLen; + + /* Now fill in the missing values */ + for (attnum = sourceNatts; attnum < natts; attnum++) + { + + Form_pg_attribute attr = TupleDescAttr(tupleDesc, attnum); + + if (attrmiss && attrmiss[attnum].am_present) + { + fill_val(attr, + nullBits ? 
&nullBits : NULL, + &bitMask, + &targetData, + infoMask, + attrmiss[attnum].am_value, + false); + } + else + { + fill_val(attr, + &nullBits, + &bitMask, + &targetData, + infoMask, + (Datum) 0, + true); + } + } /* end loop over missing attributes */ +} + +/* + * Fill in the missing values for a minimal HeapTuple + */ +MinimalTuple +minimal_expand_tuple(HeapTuple sourceTuple, TupleDesc tupleDesc) +{ + MinimalTuple minimalTuple; + + expand_tuple(NULL, &minimalTuple, sourceTuple, tupleDesc); + return minimalTuple; +} + +/* + * Fill in the missing values for an ordinary HeapTuple + */ +HeapTuple +heap_expand_tuple(HeapTuple sourceTuple, TupleDesc tupleDesc) +{ + HeapTuple heapTuple; + + expand_tuple(&heapTuple, NULL, sourceTuple, tupleDesc); + return heapTuple; +} + +/* ---------------- + * heap_copy_tuple_as_datum + * + * copy a tuple as a composite-type Datum + * ---------------- + */ +Datum +heap_copy_tuple_as_datum(HeapTuple tuple, TupleDesc tupleDesc) +{ + HeapTupleHeader td; + + /* + * If the tuple contains any external TOAST pointers, we have to inline + * those fields to meet the conventions for composite-type Datums. + */ + if (HeapTupleHasExternal(tuple)) + return toast_flatten_tuple_to_datum(tuple->t_data, + tuple->t_len, + tupleDesc); + + /* + * Fast path for easy case: just make a palloc'd copy and insert the + * correct composite-Datum header fields (since those may not be set if + * the given tuple came from disk, rather than from heap_form_tuple). + */ + td = (HeapTupleHeader) palloc(tuple->t_len); + memcpy((char *) td, (char *) tuple->t_data, tuple->t_len); + + HeapTupleHeaderSetDatumLength(td, tuple->t_len); + HeapTupleHeaderSetTypeId(td, tupleDesc->tdtypeid); + HeapTupleHeaderSetTypMod(td, tupleDesc->tdtypmod); + + return PointerGetDatum(td); +} + +/* + * heap_form_tuple + * construct a tuple from the given values[] and isnull[] arrays, + * which are of the length indicated by tupleDescriptor->natts + * + * The result is allocated in the current memory context. + */ +HeapTuple +heap_form_tuple(TupleDesc tupleDescriptor, + Datum *values, + bool *isnull) +{ + HeapTuple tuple; /* return tuple */ + HeapTupleHeader td; /* tuple data */ + Size len, + data_len; + int hoff; + bool hasnull = false; + int numberOfAttributes = tupleDescriptor->natts; + int i; + + if (numberOfAttributes > MaxTupleAttributeNumber) + ereport(ERROR, + (errcode(ERRCODE_TOO_MANY_COLUMNS), + errmsg("number of columns (%d) exceeds limit (%d)", + numberOfAttributes, MaxTupleAttributeNumber))); + + /* + * Check for nulls + */ + for (i = 0; i < numberOfAttributes; i++) + { + if (isnull[i]) + { + hasnull = true; + break; + } + } + + /* + * Determine total space needed + */ + len = offsetof(HeapTupleHeaderData, t_bits); + + if (hasnull) + len += BITMAPLEN(numberOfAttributes); + + hoff = len = MAXALIGN(len); /* align user data safely */ + + data_len = heap_compute_data_size(tupleDescriptor, values, isnull); + + len += data_len; + + /* + * Allocate and zero the space needed. Note that the tuple body and + * HeapTupleData management structure are allocated in one chunk. + */ + tuple = (HeapTuple) palloc0(HEAPTUPLESIZE + len); + tuple->t_data = td = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE); + + /* + * And fill in the information. Note we fill the Datum fields even though + * this tuple may never become a Datum. This lets HeapTupleHeaderGetDatum + * identify the tuple type if needed. 
+ */ + tuple->t_len = len; + ItemPointerSetInvalid(&(tuple->t_self)); + tuple->t_tableOid = InvalidOid; + + HeapTupleHeaderSetDatumLength(td, len); + HeapTupleHeaderSetTypeId(td, tupleDescriptor->tdtypeid); + HeapTupleHeaderSetTypMod(td, tupleDescriptor->tdtypmod); + /* We also make sure that t_ctid is invalid unless explicitly set */ + ItemPointerSetInvalid(&(td->t_ctid)); + + HeapTupleHeaderSetNatts(td, numberOfAttributes); + td->t_hoff = hoff; + + heap_fill_tuple(tupleDescriptor, + values, + isnull, + (char *) td + hoff, + data_len, + &td->t_infomask, + (hasnull ? td->t_bits : NULL)); + + return tuple; +} + +/* + * heap_modify_tuple + * form a new tuple from an old tuple and a set of replacement values. + * + * The replValues, replIsnull, and doReplace arrays must be of the length + * indicated by tupleDesc->natts. The new tuple is constructed using the data + * from replValues/replIsnull at columns where doReplace is true, and using + * the data from the old tuple at columns where doReplace is false. + * + * The result is allocated in the current memory context. + */ +HeapTuple +heap_modify_tuple(HeapTuple tuple, + TupleDesc tupleDesc, + Datum *replValues, + bool *replIsnull, + bool *doReplace) +{ + int numberOfAttributes = tupleDesc->natts; + int attoff; + Datum *values; + bool *isnull; + HeapTuple newTuple; + + /* + * allocate and fill values and isnull arrays from either the tuple or the + * repl information, as appropriate. + * + * NOTE: it's debatable whether to use heap_deform_tuple() here or just + * heap_getattr() only the non-replaced columns. The latter could win if + * there are many replaced columns and few non-replaced ones. However, + * heap_deform_tuple costs only O(N) while the heap_getattr way would cost + * O(N^2) if there are many non-replaced columns, so it seems better to + * err on the side of linear cost. + */ + values = (Datum *) palloc(numberOfAttributes * sizeof(Datum)); + isnull = (bool *) palloc(numberOfAttributes * sizeof(bool)); + + heap_deform_tuple(tuple, tupleDesc, values, isnull); + + for (attoff = 0; attoff < numberOfAttributes; attoff++) + { + if (doReplace[attoff]) + { + values[attoff] = replValues[attoff]; + isnull[attoff] = replIsnull[attoff]; + } + } + + /* + * create a new tuple from the values and isnull arrays + */ + newTuple = heap_form_tuple(tupleDesc, values, isnull); + + pfree(values); + pfree(isnull); + + /* + * copy the identification info of the old tuple: t_ctid, t_self + */ + newTuple->t_data->t_ctid = tuple->t_data->t_ctid; + newTuple->t_self = tuple->t_self; + newTuple->t_tableOid = tuple->t_tableOid; + + return newTuple; +} + +/* + * heap_modify_tuple_by_cols + * form a new tuple from an old tuple and a set of replacement values. + * + * This is like heap_modify_tuple, except that instead of specifying which + * column(s) to replace by a boolean map, an array of target column numbers + * is used. This is often more convenient when a fixed number of columns + * are to be replaced. The replCols, replValues, and replIsnull arrays must + * be of length nCols. Target column numbers are indexed from 1. + * + * The result is allocated in the current memory context. 
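+ *
+ * For example, a (hypothetical) caller setting column 2 of a tuple to NULL
+ * could do:
+ *
+ *		int		cols[] = {2};
+ *		Datum	vals[] = {(Datum) 0};
+ *		bool	nulls[] = {true};
+ *
+ *		newtup = heap_modify_tuple_by_cols(tup, tupdesc, 1,
+ *										   cols, vals, nulls);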
+ */ +HeapTuple +heap_modify_tuple_by_cols(HeapTuple tuple, + TupleDesc tupleDesc, + int nCols, + int *replCols, + Datum *replValues, + bool *replIsnull) +{ + int numberOfAttributes = tupleDesc->natts; + Datum *values; + bool *isnull; + HeapTuple newTuple; + int i; + + /* + * allocate and fill values and isnull arrays from the tuple, then replace + * selected columns from the input arrays. + */ + values = (Datum *) palloc(numberOfAttributes * sizeof(Datum)); + isnull = (bool *) palloc(numberOfAttributes * sizeof(bool)); + + heap_deform_tuple(tuple, tupleDesc, values, isnull); + + for (i = 0; i < nCols; i++) + { + int attnum = replCols[i]; + + if (attnum <= 0 || attnum > numberOfAttributes) + elog(ERROR, "invalid column number %d", attnum); + values[attnum - 1] = replValues[i]; + isnull[attnum - 1] = replIsnull[i]; + } + + /* + * create a new tuple from the values and isnull arrays + */ + newTuple = heap_form_tuple(tupleDesc, values, isnull); + + pfree(values); + pfree(isnull); + + /* + * copy the identification info of the old tuple: t_ctid, t_self + */ + newTuple->t_data->t_ctid = tuple->t_data->t_ctid; + newTuple->t_self = tuple->t_self; + newTuple->t_tableOid = tuple->t_tableOid; + + return newTuple; +} + +/* + * heap_deform_tuple + * Given a tuple, extract data into values/isnull arrays; this is + * the inverse of heap_form_tuple. + * + * Storage for the values/isnull arrays is provided by the caller; + * it should be sized according to tupleDesc->natts not + * HeapTupleHeaderGetNatts(tuple->t_data). + * + * Note that for pass-by-reference datatypes, the pointer placed + * in the Datum will point into the given tuple. + * + * When all or most of a tuple's fields need to be extracted, + * this routine will be significantly quicker than a loop around + * heap_getattr; the loop will become O(N^2) as soon as any + * noncacheable attribute offsets are involved. + */ +void +heap_deform_tuple(HeapTuple tuple, TupleDesc tupleDesc, + Datum *values, bool *isnull) +{ + HeapTupleHeader tup = tuple->t_data; + bool hasnulls = HeapTupleHasNulls(tuple); + int tdesc_natts = tupleDesc->natts; + int natts; /* number of atts to extract */ + int attnum; + char *tp; /* ptr to tuple data */ + uint32 off; /* offset in tuple data */ + bits8 *bp = tup->t_bits; /* ptr to null bitmap in tuple */ + bool slow = false; /* can we use/set attcacheoff? */ + + natts = HeapTupleHeaderGetNatts(tup); + + /* + * In inheritance situations, it is possible that the given tuple actually + * has more fields than the caller is expecting. Don't run off the end of + * the caller's arrays. + */ + natts = Min(natts, tdesc_natts); + + tp = (char *) tup + tup->t_hoff; + + off = 0; + + for (attnum = 0; attnum < natts; attnum++) + { + Form_pg_attribute thisatt = TupleDescAttr(tupleDesc, attnum); + + if (hasnulls && att_isnull(attnum, bp)) + { + values[attnum] = (Datum) 0; + isnull[attnum] = true; + slow = true; /* can't use attcacheoff anymore */ + continue; + } + + isnull[attnum] = false; + + if (!slow && thisatt->attcacheoff >= 0) + off = thisatt->attcacheoff; + else if (thisatt->attlen == -1) + { + /* + * We can only cache the offset for a varlena attribute if the + * offset is already suitably aligned, so that there would be no + * pad bytes in any case: then the offset will be valid for either + * an aligned or unaligned value. 
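+			 *
+			 * ("Unaligned" here means a short-header or TOAST-pointer
+			 * varlena, which is stored with no preceding alignment padding.)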
+ */ + if (!slow && + off == att_align_nominal(off, thisatt->attalign)) + thisatt->attcacheoff = off; + else + { + off = att_align_pointer(off, thisatt->attalign, -1, + tp + off); + slow = true; + } + } + else + { + /* not varlena, so safe to use att_align_nominal */ + off = att_align_nominal(off, thisatt->attalign); + + if (!slow) + thisatt->attcacheoff = off; + } + + values[attnum] = fetchatt(thisatt, tp + off); + + off = att_addlength_pointer(off, thisatt->attlen, tp + off); + + if (thisatt->attlen <= 0) + slow = true; /* can't use attcacheoff anymore */ + } + + /* + * If tuple doesn't have all the atts indicated by tupleDesc, read the + * rest as nulls or missing values as appropriate. + */ + for (; attnum < tdesc_natts; attnum++) + values[attnum] = getmissingattr(tupleDesc, attnum + 1, &isnull[attnum]); +} + +/* + * heap_freetuple + */ +void +heap_freetuple(HeapTuple htup) +{ + pfree(htup); +} + + +/* + * heap_form_minimal_tuple + * construct a MinimalTuple from the given values[] and isnull[] arrays, + * which are of the length indicated by tupleDescriptor->natts + * + * This is exactly like heap_form_tuple() except that the result is a + * "minimal" tuple lacking a HeapTupleData header as well as room for system + * columns. + * + * The result is allocated in the current memory context. + */ +MinimalTuple +heap_form_minimal_tuple(TupleDesc tupleDescriptor, + Datum *values, + bool *isnull) +{ + MinimalTuple tuple; /* return tuple */ + Size len, + data_len; + int hoff; + bool hasnull = false; + int numberOfAttributes = tupleDescriptor->natts; + int i; + + if (numberOfAttributes > MaxTupleAttributeNumber) + ereport(ERROR, + (errcode(ERRCODE_TOO_MANY_COLUMNS), + errmsg("number of columns (%d) exceeds limit (%d)", + numberOfAttributes, MaxTupleAttributeNumber))); + + /* + * Check for nulls + */ + for (i = 0; i < numberOfAttributes; i++) + { + if (isnull[i]) + { + hasnull = true; + break; + } + } + + /* + * Determine total space needed + */ + len = SizeofMinimalTupleHeader; + + if (hasnull) + len += BITMAPLEN(numberOfAttributes); + + hoff = len = MAXALIGN(len); /* align user data safely */ + + data_len = heap_compute_data_size(tupleDescriptor, values, isnull); + + len += data_len; + + /* + * Allocate and zero the space needed. + */ + tuple = (MinimalTuple) palloc0(len); + + /* + * And fill in the information. + */ + tuple->t_len = len; + HeapTupleHeaderSetNatts(tuple, numberOfAttributes); + tuple->t_hoff = hoff + MINIMAL_TUPLE_OFFSET; + + heap_fill_tuple(tupleDescriptor, + values, + isnull, + (char *) tuple + hoff, + data_len, + &tuple->t_infomask, + (hasnull ? tuple->t_bits : NULL)); + + return tuple; +} + +/* + * heap_free_minimal_tuple + */ +void +heap_free_minimal_tuple(MinimalTuple mtup) +{ + pfree(mtup); +} + +/* + * heap_copy_minimal_tuple + * copy a MinimalTuple + * + * The result is allocated in the current memory context. + */ +MinimalTuple +heap_copy_minimal_tuple(MinimalTuple mtup) +{ + MinimalTuple result; + + result = (MinimalTuple) palloc(mtup->t_len); + memcpy(result, mtup, mtup->t_len); + return result; +} + +/* + * heap_tuple_from_minimal_tuple + * create a HeapTuple by copying from a MinimalTuple; + * system columns are filled with zeroes + * + * The result is allocated in the current memory context. + * The HeapTuple struct, tuple header, and tuple data are all allocated + * as a single palloc() block. 
+ */ +HeapTuple +heap_tuple_from_minimal_tuple(MinimalTuple mtup) +{ + HeapTuple result; + uint32 len = mtup->t_len + MINIMAL_TUPLE_OFFSET; + + result = (HeapTuple) palloc(HEAPTUPLESIZE + len); + result->t_len = len; + ItemPointerSetInvalid(&(result->t_self)); + result->t_tableOid = InvalidOid; + result->t_data = (HeapTupleHeader) ((char *) result + HEAPTUPLESIZE); + memcpy((char *) result->t_data + MINIMAL_TUPLE_OFFSET, mtup, mtup->t_len); + memset(result->t_data, 0, offsetof(HeapTupleHeaderData, t_infomask2)); + return result; +} + +/* + * minimal_tuple_from_heap_tuple + * create a MinimalTuple by copying from a HeapTuple + * + * The result is allocated in the current memory context. + */ +MinimalTuple +minimal_tuple_from_heap_tuple(HeapTuple htup) +{ + MinimalTuple result; + uint32 len; + + Assert(htup->t_len > MINIMAL_TUPLE_OFFSET); + len = htup->t_len - MINIMAL_TUPLE_OFFSET; + result = (MinimalTuple) palloc(len); + memcpy(result, (char *) htup->t_data + MINIMAL_TUPLE_OFFSET, len); + result->t_len = len; + return result; +} + +/* + * This mainly exists so JIT can inline the definition, but it's also + * sometimes useful in debugging sessions. + */ +size_t +varsize_any(void *p) +{ + return VARSIZE_ANY(p); +} diff --git a/src/backend/access/common/indextuple.c b/src/backend/access/common/indextuple.c new file mode 100644 index 0000000..8df882d --- /dev/null +++ b/src/backend/access/common/indextuple.c @@ -0,0 +1,589 @@ +/*------------------------------------------------------------------------- + * + * indextuple.c + * This file contains index tuple accessor and mutator routines, + * as well as various tuple utilities. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/common/indextuple.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/detoast.h" +#include "access/heaptoast.h" +#include "access/htup_details.h" +#include "access/itup.h" +#include "access/toast_internals.h" + +/* + * This enables de-toasting of index entries. Needed until VACUUM is + * smart enough to rebuild indexes from scratch. + */ +#define TOAST_INDEX_HACK + +/* ---------------------------------------------------------------- + * index_ tuple interface routines + * ---------------------------------------------------------------- + */ + +/* ---------------- + * index_form_tuple + * + * This shouldn't leak any memory; otherwise, callers such as + * tuplesort_putindextuplevalues() will be very unhappy. + * + * This shouldn't perform external table access provided caller + * does not pass values that are stored EXTERNAL. 
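+ *
+ * Note that with TOAST_INDEX_HACK defined (see above), values stored
+ * EXTERNAL are detoasted here rather than rejected, and large compressible
+ * values may be compressed in-line.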
+ * ---------------- + */ +IndexTuple +index_form_tuple(TupleDesc tupleDescriptor, + Datum *values, + bool *isnull) +{ + char *tp; /* tuple pointer */ + IndexTuple tuple; /* return tuple */ + Size size, + data_size, + hoff; + int i; + unsigned short infomask = 0; + bool hasnull = false; + uint16 tupmask = 0; + int numberOfAttributes = tupleDescriptor->natts; + +#ifdef TOAST_INDEX_HACK + Datum untoasted_values[INDEX_MAX_KEYS]; + bool untoasted_free[INDEX_MAX_KEYS]; +#endif + + if (numberOfAttributes > INDEX_MAX_KEYS) + ereport(ERROR, + (errcode(ERRCODE_TOO_MANY_COLUMNS), + errmsg("number of index columns (%d) exceeds limit (%d)", + numberOfAttributes, INDEX_MAX_KEYS))); + +#ifdef TOAST_INDEX_HACK + for (i = 0; i < numberOfAttributes; i++) + { + Form_pg_attribute att = TupleDescAttr(tupleDescriptor, i); + + untoasted_values[i] = values[i]; + untoasted_free[i] = false; + + /* Do nothing if value is NULL or not of varlena type */ + if (isnull[i] || att->attlen != -1) + continue; + + /* + * If value is stored EXTERNAL, must fetch it so we are not depending + * on outside storage. This should be improved someday. + */ + if (VARATT_IS_EXTERNAL(DatumGetPointer(values[i]))) + { + untoasted_values[i] = + PointerGetDatum(detoast_external_attr((struct varlena *) + DatumGetPointer(values[i]))); + untoasted_free[i] = true; + } + + /* + * If value is above size target, and is of a compressible datatype, + * try to compress it in-line. + */ + if (!VARATT_IS_EXTENDED(DatumGetPointer(untoasted_values[i])) && + VARSIZE(DatumGetPointer(untoasted_values[i])) > TOAST_INDEX_TARGET && + (att->attstorage == TYPSTORAGE_EXTENDED || + att->attstorage == TYPSTORAGE_MAIN)) + { + Datum cvalue; + + cvalue = toast_compress_datum(untoasted_values[i], + att->attcompression); + + if (DatumGetPointer(cvalue) != NULL) + { + /* successful compression */ + if (untoasted_free[i]) + pfree(DatumGetPointer(untoasted_values[i])); + untoasted_values[i] = cvalue; + untoasted_free[i] = true; + } + } + } +#endif + + for (i = 0; i < numberOfAttributes; i++) + { + if (isnull[i]) + { + hasnull = true; + break; + } + } + + if (hasnull) + infomask |= INDEX_NULL_MASK; + + hoff = IndexInfoFindDataOffset(infomask); +#ifdef TOAST_INDEX_HACK + data_size = heap_compute_data_size(tupleDescriptor, + untoasted_values, isnull); +#else + data_size = heap_compute_data_size(tupleDescriptor, + values, isnull); +#endif + size = hoff + data_size; + size = MAXALIGN(size); /* be conservative */ + + tp = (char *) palloc0(size); + tuple = (IndexTuple) tp; + + heap_fill_tuple(tupleDescriptor, +#ifdef TOAST_INDEX_HACK + untoasted_values, +#else + values, +#endif + isnull, + (char *) tp + hoff, + data_size, + &tupmask, + (hasnull ? (bits8 *) tp + sizeof(IndexTupleData) : NULL)); + +#ifdef TOAST_INDEX_HACK + for (i = 0; i < numberOfAttributes; i++) + { + if (untoasted_free[i]) + pfree(DatumGetPointer(untoasted_values[i])); + } +#endif + + /* + * We do this because heap_fill_tuple wants to initialize a "tupmask" + * which is used for HeapTuples, but we want an indextuple infomask. The + * only relevant info is the "has variable attributes" field. We have + * already set the hasnull bit above. + */ + if (tupmask & HEAP_HASVARWIDTH) + infomask |= INDEX_VAR_MASK; + + /* Also assert we got rid of external attributes */ +#ifdef TOAST_INDEX_HACK + Assert((tupmask & HEAP_HASEXTERNAL) == 0); +#endif + + /* + * Here we make sure that the size will fit in the field reserved for it + * in t_info. 
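+	 *
+	 * (The size shares t_info with the null and var-width flag bits set
+	 * above; this check ensures that "infomask |= size" below cannot spill
+	 * into those flag bits.)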
+ */ + if ((size & INDEX_SIZE_MASK) != size) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("index row requires %zu bytes, maximum size is %zu", + size, (Size) INDEX_SIZE_MASK))); + + infomask |= size; + + /* + * initialize metadata + */ + tuple->t_info = infomask; + return tuple; +} + +/* ---------------- + * nocache_index_getattr + * + * This gets called from index_getattr() macro, and only in cases + * where we can't use cacheoffset and the value is not null. + * + * This caches attribute offsets in the attribute descriptor. + * + * An alternative way to speed things up would be to cache offsets + * with the tuple, but that seems more difficult unless you take + * the storage hit of actually putting those offsets into the + * tuple you send to disk. Yuck. + * + * This scheme will be slightly slower than that, but should + * perform well for queries which hit large #'s of tuples. After + * you cache the offsets once, examining all the other tuples using + * the same attribute descriptor will go much quicker. -cim 5/4/91 + * ---------------- + */ +Datum +nocache_index_getattr(IndexTuple tup, + int attnum, + TupleDesc tupleDesc) +{ + char *tp; /* ptr to data part of tuple */ + bits8 *bp = NULL; /* ptr to null bitmap in tuple */ + bool slow = false; /* do we have to walk attrs? */ + int data_off; /* tuple data offset */ + int off; /* current offset within data */ + + /* ---------------- + * Three cases: + * + * 1: No nulls and no variable-width attributes. + * 2: Has a null or a var-width AFTER att. + * 3: Has nulls or var-widths BEFORE att. + * ---------------- + */ + + data_off = IndexInfoFindDataOffset(tup->t_info); + + attnum--; + + if (IndexTupleHasNulls(tup)) + { + /* + * there's a null somewhere in the tuple + * + * check to see if desired att is null + */ + + /* XXX "knows" t_bits are just after fixed tuple header! */ + bp = (bits8 *) ((char *) tup + sizeof(IndexTupleData)); + + /* + * Now check to see if any preceding bits are null... + */ + { + int byte = attnum >> 3; + int finalbit = attnum & 0x07; + + /* check for nulls "before" final bit of last byte */ + if ((~bp[byte]) & ((1 << finalbit) - 1)) + slow = true; + else + { + /* check for nulls in any "earlier" bytes */ + int i; + + for (i = 0; i < byte; i++) + { + if (bp[i] != 0xFF) + { + slow = true; + break; + } + } + } + } + } + + tp = (char *) tup + data_off; + + if (!slow) + { + Form_pg_attribute att; + + /* + * If we get here, there are no nulls up to and including the target + * attribute. If we have a cached offset, we can use it. + */ + att = TupleDescAttr(tupleDesc, attnum); + if (att->attcacheoff >= 0) + return fetchatt(att, tp + att->attcacheoff); + + /* + * Otherwise, check for non-fixed-length attrs up to and including + * target. If there aren't any, it's safe to cheaply initialize the + * cached offsets for these attrs. + */ + if (IndexTupleHasVarwidths(tup)) + { + int j; + + for (j = 0; j <= attnum; j++) + { + if (TupleDescAttr(tupleDesc, j)->attlen <= 0) + { + slow = true; + break; + } + } + } + } + + if (!slow) + { + int natts = tupleDesc->natts; + int j = 1; + + /* + * If we get here, we have a tuple with no nulls or var-widths up to + * and including the target attribute, so we can use the cached offset + * ... only we don't have it yet, or we'd not have got here. 
Since + * it's cheap to compute offsets for fixed-width columns, we take the + * opportunity to initialize the cached offsets for *all* the leading + * fixed-width columns, in hope of avoiding future visits to this + * routine. + */ + TupleDescAttr(tupleDesc, 0)->attcacheoff = 0; + + /* we might have set some offsets in the slow path previously */ + while (j < natts && TupleDescAttr(tupleDesc, j)->attcacheoff > 0) + j++; + + off = TupleDescAttr(tupleDesc, j - 1)->attcacheoff + + TupleDescAttr(tupleDesc, j - 1)->attlen; + + for (; j < natts; j++) + { + Form_pg_attribute att = TupleDescAttr(tupleDesc, j); + + if (att->attlen <= 0) + break; + + off = att_align_nominal(off, att->attalign); + + att->attcacheoff = off; + + off += att->attlen; + } + + Assert(j > attnum); + + off = TupleDescAttr(tupleDesc, attnum)->attcacheoff; + } + else + { + bool usecache = true; + int i; + + /* + * Now we know that we have to walk the tuple CAREFULLY. But we still + * might be able to cache some offsets for next time. + * + * Note - This loop is a little tricky. For each non-null attribute, + * we have to first account for alignment padding before the attr, + * then advance over the attr based on its length. Nulls have no + * storage and no alignment padding either. We can use/set + * attcacheoff until we reach either a null or a var-width attribute. + */ + off = 0; + for (i = 0;; i++) /* loop exit is at "break" */ + { + Form_pg_attribute att = TupleDescAttr(tupleDesc, i); + + if (IndexTupleHasNulls(tup) && att_isnull(i, bp)) + { + usecache = false; + continue; /* this cannot be the target att */ + } + + /* If we know the next offset, we can skip the rest */ + if (usecache && att->attcacheoff >= 0) + off = att->attcacheoff; + else if (att->attlen == -1) + { + /* + * We can only cache the offset for a varlena attribute if the + * offset is already suitably aligned, so that there would be + * no pad bytes in any case: then the offset will be valid for + * either an aligned or unaligned value. + */ + if (usecache && + off == att_align_nominal(off, att->attalign)) + att->attcacheoff = off; + else + { + off = att_align_pointer(off, att->attalign, -1, + tp + off); + usecache = false; + } + } + else + { + /* not varlena, so safe to use att_align_nominal */ + off = att_align_nominal(off, att->attalign); + + if (usecache) + att->attcacheoff = off; + } + + if (i == attnum) + break; + + off = att_addlength_pointer(off, att->attlen, tp + off); + + if (usecache && att->attlen <= 0) + usecache = false; + } + } + + return fetchatt(TupleDescAttr(tupleDesc, attnum), tp + off); +} + +/* + * Convert an index tuple into Datum/isnull arrays. + * + * The caller must allocate sufficient storage for the output arrays. + * (INDEX_MAX_KEYS entries should be enough.) + * + * This is nearly the same as heap_deform_tuple(), but for IndexTuples. + * One difference is that the tuple should never have any missing columns. + */ +void +index_deform_tuple(IndexTuple tup, TupleDesc tupleDescriptor, + Datum *values, bool *isnull) +{ + char *tp; /* ptr to tuple data */ + bits8 *bp; /* ptr to null bitmap in tuple */ + + /* XXX "knows" t_bits are just after fixed tuple header! */ + bp = (bits8 *) ((char *) tup + sizeof(IndexTupleData)); + + tp = (char *) tup + IndexInfoFindDataOffset(tup->t_info); + + index_deform_tuple_internal(tupleDescriptor, values, isnull, + tp, bp, IndexTupleHasNulls(tup)); +} + +/* + * Convert an index tuple into Datum/isnull arrays, + * without assuming any specific layout of the index tuple header. 
+ * + * Caller must supply pointer to data area, pointer to nulls bitmap + * (which can be NULL if !hasnulls), and hasnulls flag. + */ +void +index_deform_tuple_internal(TupleDesc tupleDescriptor, + Datum *values, bool *isnull, + char *tp, bits8 *bp, int hasnulls) +{ + int natts = tupleDescriptor->natts; /* number of atts to extract */ + int attnum; + int off = 0; /* offset in tuple data */ + bool slow = false; /* can we use/set attcacheoff? */ + + /* Assert to protect callers who allocate fixed-size arrays */ + Assert(natts <= INDEX_MAX_KEYS); + + for (attnum = 0; attnum < natts; attnum++) + { + Form_pg_attribute thisatt = TupleDescAttr(tupleDescriptor, attnum); + + if (hasnulls && att_isnull(attnum, bp)) + { + values[attnum] = (Datum) 0; + isnull[attnum] = true; + slow = true; /* can't use attcacheoff anymore */ + continue; + } + + isnull[attnum] = false; + + if (!slow && thisatt->attcacheoff >= 0) + off = thisatt->attcacheoff; + else if (thisatt->attlen == -1) + { + /* + * We can only cache the offset for a varlena attribute if the + * offset is already suitably aligned, so that there would be no + * pad bytes in any case: then the offset will be valid for either + * an aligned or unaligned value. + */ + if (!slow && + off == att_align_nominal(off, thisatt->attalign)) + thisatt->attcacheoff = off; + else + { + off = att_align_pointer(off, thisatt->attalign, -1, + tp + off); + slow = true; + } + } + else + { + /* not varlena, so safe to use att_align_nominal */ + off = att_align_nominal(off, thisatt->attalign); + + if (!slow) + thisatt->attcacheoff = off; + } + + values[attnum] = fetchatt(thisatt, tp + off); + + off = att_addlength_pointer(off, thisatt->attlen, tp + off); + + if (thisatt->attlen <= 0) + slow = true; /* can't use attcacheoff anymore */ + } +} + +/* + * Create a palloc'd copy of an index tuple. + */ +IndexTuple +CopyIndexTuple(IndexTuple source) +{ + IndexTuple result; + Size size; + + size = IndexTupleSize(source); + result = (IndexTuple) palloc(size); + memcpy(result, source, size); + return result; +} + +/* + * Create a palloc'd copy of an index tuple, leaving only the first + * leavenatts attributes remaining. + * + * Truncation is guaranteed to result in an index tuple that is no + * larger than the original. It is safe to use the IndexTuple with + * the original tuple descriptor, but caller must avoid actually + * accessing truncated attributes from returned tuple! In practice + * this means that index_getattr() must be called with special care, + * and that the truncated tuple should only ever be accessed by code + * under caller's direct control. + * + * It's safe to call this function with a buffer lock held, since it + * never performs external table access. If it ever became possible + * for index tuples to contain EXTERNAL TOAST values, then this would + * have to be revisited. 
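+ *
+ * For example, a caller keeping only the first attribute of an index tuple
+ * might write (hypothetical usage):
+ *
+ *		truncated = index_truncate_tuple(RelationGetDescr(irel), itup, 1);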
+ */ +IndexTuple +index_truncate_tuple(TupleDesc sourceDescriptor, IndexTuple source, + int leavenatts) +{ + TupleDesc truncdesc; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + IndexTuple truncated; + + Assert(leavenatts <= sourceDescriptor->natts); + + /* Easy case: no truncation actually required */ + if (leavenatts == sourceDescriptor->natts) + return CopyIndexTuple(source); + + /* Create temporary descriptor to scribble on */ + truncdesc = palloc(TupleDescSize(sourceDescriptor)); + TupleDescCopy(truncdesc, sourceDescriptor); + truncdesc->natts = leavenatts; + + /* Deform, form copy of tuple with fewer attributes */ + index_deform_tuple(source, truncdesc, values, isnull); + truncated = index_form_tuple(truncdesc, values, isnull); + truncated->t_tid = source->t_tid; + Assert(IndexTupleSize(truncated) <= IndexTupleSize(source)); + + /* + * Cannot leak memory here, TupleDescCopy() doesn't allocate any inner + * structure, so, plain pfree() should clean all allocated memory + */ + pfree(truncdesc); + + return truncated; +} diff --git a/src/backend/access/common/printsimple.c b/src/backend/access/common/printsimple.c new file mode 100644 index 0000000..93c3c4f --- /dev/null +++ b/src/backend/access/common/printsimple.c @@ -0,0 +1,132 @@ +/*------------------------------------------------------------------------- + * + * printsimple.c + * Routines to print out tuples containing only a limited range of + * builtin types without catalog access. This is intended for + * backends that don't have catalog access because they are not bound + * to a specific database, such as some walsender processes. It + * doesn't handle standalone backends or protocol versions other than + * 3.0, because we don't need such handling for current applications. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/common/printsimple.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/printsimple.h" +#include "catalog/pg_type.h" +#include "libpq/pqformat.h" +#include "utils/builtins.h" + +/* + * At startup time, send a RowDescription message. + */ +void +printsimple_startup(DestReceiver *self, int operation, TupleDesc tupdesc) +{ + StringInfoData buf; + int i; + + pq_beginmessage(&buf, 'T'); /* RowDescription */ + pq_sendint16(&buf, tupdesc->natts); + + for (i = 0; i < tupdesc->natts; ++i) + { + Form_pg_attribute attr = TupleDescAttr(tupdesc, i); + + pq_sendstring(&buf, NameStr(attr->attname)); + pq_sendint32(&buf, 0); /* table oid */ + pq_sendint16(&buf, 0); /* attnum */ + pq_sendint32(&buf, (int) attr->atttypid); + pq_sendint16(&buf, attr->attlen); + pq_sendint32(&buf, attr->atttypmod); + pq_sendint16(&buf, 0); /* format code */ + } + + pq_endmessage(&buf); +} + +/* + * For each tuple, send a DataRow message. 
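+ *
+ * Only a handful of built-in types (currently text, int4 and int8) are
+ * handled here; anything else raises an error.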
+ */ +bool +printsimple(TupleTableSlot *slot, DestReceiver *self) +{ + TupleDesc tupdesc = slot->tts_tupleDescriptor; + StringInfoData buf; + int i; + + /* Make sure the tuple is fully deconstructed */ + slot_getallattrs(slot); + + /* Prepare and send message */ + pq_beginmessage(&buf, 'D'); + pq_sendint16(&buf, tupdesc->natts); + + for (i = 0; i < tupdesc->natts; ++i) + { + Form_pg_attribute attr = TupleDescAttr(tupdesc, i); + Datum value; + + if (slot->tts_isnull[i]) + { + pq_sendint32(&buf, -1); + continue; + } + + value = slot->tts_values[i]; + + /* + * We can't call the regular type output functions here because we + * might not have catalog access. Instead, we must hard-wire + * knowledge of the required types. + */ + switch (attr->atttypid) + { + case TEXTOID: + { + text *t = DatumGetTextPP(value); + + pq_sendcountedtext(&buf, + VARDATA_ANY(t), + VARSIZE_ANY_EXHDR(t), + false); + } + break; + + case INT4OID: + { + int32 num = DatumGetInt32(value); + char str[12]; /* sign, 10 digits and '\0' */ + int len; + + len = pg_ltoa(num, str); + pq_sendcountedtext(&buf, str, len, false); + } + break; + + case INT8OID: + { + int64 num = DatumGetInt64(value); + char str[MAXINT8LEN + 1]; + int len; + + len = pg_lltoa(num, str); + pq_sendcountedtext(&buf, str, len, false); + } + break; + + default: + elog(ERROR, "unsupported type OID: %u", attr->atttypid); + } + } + + pq_endmessage(&buf); + + return true; +} diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c new file mode 100644 index 0000000..54b539f --- /dev/null +++ b/src/backend/access/common/printtup.c @@ -0,0 +1,485 @@ +/*------------------------------------------------------------------------- + * + * printtup.c + * Routines to print out tuples to the destination (both frontend + * clients and standalone backends are supported here). + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/common/printtup.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/printtup.h" +#include "libpq/libpq.h" +#include "libpq/pqformat.h" +#include "tcop/pquery.h" +#include "utils/lsyscache.h" +#include "utils/memdebug.h" +#include "utils/memutils.h" + + +static void printtup_startup(DestReceiver *self, int operation, + TupleDesc typeinfo); +static bool printtup(TupleTableSlot *slot, DestReceiver *self); +static void printtup_shutdown(DestReceiver *self); +static void printtup_destroy(DestReceiver *self); + +/* ---------------------------------------------------------------- + * printtup / debugtup support + * ---------------------------------------------------------------- + */ + +/* ---------------- + * Private state for a printtup destination object + * + * NOTE: finfo is the lookup info for either typoutput or typsend, whichever + * we are using for this column. + * ---------------- + */ +typedef struct +{ /* Per-attribute information */ + Oid typoutput; /* Oid for the type's text output fn */ + Oid typsend; /* Oid for the type's binary output fn */ + bool typisvarlena; /* is it varlena (ie possibly toastable)? 
*/ + int16 format; /* format code for this column */ + FmgrInfo finfo; /* Precomputed call info for output fn */ +} PrinttupAttrInfo; + +typedef struct +{ + DestReceiver pub; /* publicly-known function pointers */ + Portal portal; /* the Portal we are printing from */ + bool sendDescrip; /* send RowDescription at startup? */ + TupleDesc attrinfo; /* The attr info we are set up for */ + int nattrs; + PrinttupAttrInfo *myinfo; /* Cached info about each attr */ + StringInfoData buf; /* output buffer (*not* in tmpcontext) */ + MemoryContext tmpcontext; /* Memory context for per-row workspace */ +} DR_printtup; + +/* ---------------- + * Initialize: create a DestReceiver for printtup + * ---------------- + */ +DestReceiver * +printtup_create_DR(CommandDest dest) +{ + DR_printtup *self = (DR_printtup *) palloc0(sizeof(DR_printtup)); + + self->pub.receiveSlot = printtup; /* might get changed later */ + self->pub.rStartup = printtup_startup; + self->pub.rShutdown = printtup_shutdown; + self->pub.rDestroy = printtup_destroy; + self->pub.mydest = dest; + + /* + * Send T message automatically if DestRemote, but not if + * DestRemoteExecute + */ + self->sendDescrip = (dest == DestRemote); + + self->attrinfo = NULL; + self->nattrs = 0; + self->myinfo = NULL; + self->buf.data = NULL; + self->tmpcontext = NULL; + + return (DestReceiver *) self; +} + +/* + * Set parameters for a DestRemote (or DestRemoteExecute) receiver + */ +void +SetRemoteDestReceiverParams(DestReceiver *self, Portal portal) +{ + DR_printtup *myState = (DR_printtup *) self; + + Assert(myState->pub.mydest == DestRemote || + myState->pub.mydest == DestRemoteExecute); + + myState->portal = portal; +} + +static void +printtup_startup(DestReceiver *self, int operation, TupleDesc typeinfo) +{ + DR_printtup *myState = (DR_printtup *) self; + Portal portal = myState->portal; + + /* + * Create I/O buffer to be used for all messages. This cannot be inside + * tmpcontext, since we want to re-use it across rows. + */ + initStringInfo(&myState->buf); + + /* + * Create a temporary memory context that we can reset once per row to + * recover palloc'd memory. This avoids any problems with leaks inside + * datatype output routines, and should be faster than retail pfree's + * anyway. + */ + myState->tmpcontext = AllocSetContextCreate(CurrentMemoryContext, + "printtup", + ALLOCSET_DEFAULT_SIZES); + + /* + * If we are supposed to emit row descriptions, then send the tuple + * descriptor of the tuples. + */ + if (myState->sendDescrip) + SendRowDescriptionMessage(&myState->buf, + typeinfo, + FetchPortalTargetList(portal), + portal->formats); + + /* ---------------- + * We could set up the derived attr info at this time, but we postpone it + * until the first call of printtup, for 2 reasons: + * 1. We don't waste time (compared to the old way) if there are no + * tuples at all to output. + * 2. Checking in printtup allows us to handle the case that the tuples + * change type midway through (although this probably can't happen in + * the current executor). + * ---------------- + */ +} + +/* + * SendRowDescriptionMessage --- send a RowDescription message to the frontend + * + * Notes: the TupleDesc has typically been manufactured by ExecTypeFromTL() + * or some similar function; it does not contain a full set of fields. + * The targetlist will be NIL when executing a utility function that does + * not have a plan. If the targetlist isn't NIL then it is a Query node's + * targetlist; it is up to us to ignore resjunk columns in it. 
The formats[] + * array pointer might be NULL (if we are doing Describe on a prepared stmt); + * send zeroes for the format codes in that case. + */ +void +SendRowDescriptionMessage(StringInfo buf, TupleDesc typeinfo, + List *targetlist, int16 *formats) +{ + int natts = typeinfo->natts; + int i; + ListCell *tlist_item = list_head(targetlist); + + /* tuple descriptor message type */ + pq_beginmessage_reuse(buf, 'T'); + /* # of attrs in tuples */ + pq_sendint16(buf, natts); + + /* + * Preallocate memory for the entire message to be sent. That allows to + * use the significantly faster inline pqformat.h functions and to avoid + * reallocations. + * + * Have to overestimate the size of the column-names, to account for + * character set overhead. + */ + enlargeStringInfo(buf, (NAMEDATALEN * MAX_CONVERSION_GROWTH /* attname */ + + sizeof(Oid) /* resorigtbl */ + + sizeof(AttrNumber) /* resorigcol */ + + sizeof(Oid) /* atttypid */ + + sizeof(int16) /* attlen */ + + sizeof(int32) /* attypmod */ + + sizeof(int16) /* format */ + ) * natts); + + for (i = 0; i < natts; ++i) + { + Form_pg_attribute att = TupleDescAttr(typeinfo, i); + Oid atttypid = att->atttypid; + int32 atttypmod = att->atttypmod; + Oid resorigtbl; + AttrNumber resorigcol; + int16 format; + + /* + * If column is a domain, send the base type and typmod instead. + * Lookup before sending any ints, for efficiency. + */ + atttypid = getBaseTypeAndTypmod(atttypid, &atttypmod); + + /* Do we have a non-resjunk tlist item? */ + while (tlist_item && + ((TargetEntry *) lfirst(tlist_item))->resjunk) + tlist_item = lnext(targetlist, tlist_item); + if (tlist_item) + { + TargetEntry *tle = (TargetEntry *) lfirst(tlist_item); + + resorigtbl = tle->resorigtbl; + resorigcol = tle->resorigcol; + tlist_item = lnext(targetlist, tlist_item); + } + else + { + /* No info available, so send zeroes */ + resorigtbl = 0; + resorigcol = 0; + } + + if (formats) + format = formats[i]; + else + format = 0; + + pq_writestring(buf, NameStr(att->attname)); + pq_writeint32(buf, resorigtbl); + pq_writeint16(buf, resorigcol); + pq_writeint32(buf, atttypid); + pq_writeint16(buf, att->attlen); + pq_writeint32(buf, atttypmod); + pq_writeint16(buf, format); + } + + pq_endmessage_reuse(buf); +} + +/* + * Get the lookup info that printtup() needs + */ +static void +printtup_prepare_info(DR_printtup *myState, TupleDesc typeinfo, int numAttrs) +{ + int16 *formats = myState->portal->formats; + int i; + + /* get rid of any old data */ + if (myState->myinfo) + pfree(myState->myinfo); + myState->myinfo = NULL; + + myState->attrinfo = typeinfo; + myState->nattrs = numAttrs; + if (numAttrs <= 0) + return; + + myState->myinfo = (PrinttupAttrInfo *) + palloc0(numAttrs * sizeof(PrinttupAttrInfo)); + + for (i = 0; i < numAttrs; i++) + { + PrinttupAttrInfo *thisState = myState->myinfo + i; + int16 format = (formats ? 
formats[i] : 0); + Form_pg_attribute attr = TupleDescAttr(typeinfo, i); + + thisState->format = format; + if (format == 0) + { + getTypeOutputInfo(attr->atttypid, + &thisState->typoutput, + &thisState->typisvarlena); + fmgr_info(thisState->typoutput, &thisState->finfo); + } + else if (format == 1) + { + getTypeBinaryOutputInfo(attr->atttypid, + &thisState->typsend, + &thisState->typisvarlena); + fmgr_info(thisState->typsend, &thisState->finfo); + } + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unsupported format code: %d", format))); + } +} + +/* ---------------- + * printtup --- send a tuple to the client + * ---------------- + */ +static bool +printtup(TupleTableSlot *slot, DestReceiver *self) +{ + TupleDesc typeinfo = slot->tts_tupleDescriptor; + DR_printtup *myState = (DR_printtup *) self; + MemoryContext oldcontext; + StringInfo buf = &myState->buf; + int natts = typeinfo->natts; + int i; + + /* Set or update my derived attribute info, if needed */ + if (myState->attrinfo != typeinfo || myState->nattrs != natts) + printtup_prepare_info(myState, typeinfo, natts); + + /* Make sure the tuple is fully deconstructed */ + slot_getallattrs(slot); + + /* Switch into per-row context so we can recover memory below */ + oldcontext = MemoryContextSwitchTo(myState->tmpcontext); + + /* + * Prepare a DataRow message (note buffer is in per-row context) + */ + pq_beginmessage_reuse(buf, 'D'); + + pq_sendint16(buf, natts); + + /* + * send the attributes of this tuple + */ + for (i = 0; i < natts; ++i) + { + PrinttupAttrInfo *thisState = myState->myinfo + i; + Datum attr = slot->tts_values[i]; + + if (slot->tts_isnull[i]) + { + pq_sendint32(buf, -1); + continue; + } + + /* + * Here we catch undefined bytes in datums that are returned to the + * client without hitting disk; see comments at the related check in + * PageAddItem(). This test is most useful for uncompressed, + * non-external datums, but we're quite likely to see such here when + * testing new C functions. 
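The text-format branch below boils down to a catalog lookup plus an output-function call. As a rough, self-contained illustration of that machinery (datum_to_cstring is a hypothetical helper, not part of this file):

#include "postgres.h"

#include "fmgr.h"
#include "utils/lsyscache.h"

static char *
datum_to_cstring(Oid typid, Datum value)
{
	Oid			typoutput;
	bool		typisvarlena;

	/* find the type's text output function, then invoke it */
	getTypeOutputInfo(typid, &typoutput, &typisvarlena);
	return OidOutputFunctionCall(typoutput, value);
}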
+ */ + if (thisState->typisvarlena) + VALGRIND_CHECK_MEM_IS_DEFINED(DatumGetPointer(attr), + VARSIZE_ANY(attr)); + + if (thisState->format == 0) + { + /* Text output */ + char *outputstr; + + outputstr = OutputFunctionCall(&thisState->finfo, attr); + pq_sendcountedtext(buf, outputstr, strlen(outputstr), false); + } + else + { + /* Binary output */ + bytea *outputbytes; + + outputbytes = SendFunctionCall(&thisState->finfo, attr); + pq_sendint32(buf, VARSIZE(outputbytes) - VARHDRSZ); + pq_sendbytes(buf, VARDATA(outputbytes), + VARSIZE(outputbytes) - VARHDRSZ); + } + } + + pq_endmessage_reuse(buf); + + /* Return to caller's context, and flush row's temporary memory */ + MemoryContextSwitchTo(oldcontext); + MemoryContextReset(myState->tmpcontext); + + return true; +} + +/* ---------------- + * printtup_shutdown + * ---------------- + */ +static void +printtup_shutdown(DestReceiver *self) +{ + DR_printtup *myState = (DR_printtup *) self; + + if (myState->myinfo) + pfree(myState->myinfo); + myState->myinfo = NULL; + + myState->attrinfo = NULL; + + if (myState->buf.data) + pfree(myState->buf.data); + myState->buf.data = NULL; + + if (myState->tmpcontext) + MemoryContextDelete(myState->tmpcontext); + myState->tmpcontext = NULL; +} + +/* ---------------- + * printtup_destroy + * ---------------- + */ +static void +printtup_destroy(DestReceiver *self) +{ + pfree(self); +} + +/* ---------------- + * printatt + * ---------------- + */ +static void +printatt(unsigned attributeId, + Form_pg_attribute attributeP, + char *value) +{ + printf("\t%2d: %s%s%s%s\t(typeid = %u, len = %d, typmod = %d, byval = %c)\n", + attributeId, + NameStr(attributeP->attname), + value != NULL ? " = \"" : "", + value != NULL ? value : "", + value != NULL ? "\"" : "", + (unsigned int) (attributeP->atttypid), + attributeP->attlen, + attributeP->atttypmod, + attributeP->attbyval ? 't' : 'f'); +} + +/* ---------------- + * debugStartup - prepare to print tuples for an interactive backend + * ---------------- + */ +void +debugStartup(DestReceiver *self, int operation, TupleDesc typeinfo) +{ + int natts = typeinfo->natts; + int i; + + /* + * show the return type of the tuples + */ + for (i = 0; i < natts; ++i) + printatt((unsigned) i + 1, TupleDescAttr(typeinfo, i), NULL); + printf("\t----\n"); +} + +/* ---------------- + * debugtup - print one tuple for an interactive backend + * ---------------- + */ +bool +debugtup(TupleTableSlot *slot, DestReceiver *self) +{ + TupleDesc typeinfo = slot->tts_tupleDescriptor; + int natts = typeinfo->natts; + int i; + Datum attr; + char *value; + bool isnull; + Oid typoutput; + bool typisvarlena; + + for (i = 0; i < natts; ++i) + { + attr = slot_getattr(slot, i + 1, &isnull); + if (isnull) + continue; + getTypeOutputInfo(TupleDescAttr(typeinfo, i)->atttypid, + &typoutput, &typisvarlena); + + value = OidOutputFunctionCall(typoutput, attr); + + printatt((unsigned) i + 1, TupleDescAttr(typeinfo, i), value); + } + printf("\t----\n"); + + return true; +} diff --git a/src/backend/access/common/relation.c b/src/backend/access/common/relation.c new file mode 100644 index 0000000..632d13c --- /dev/null +++ b/src/backend/access/common/relation.c @@ -0,0 +1,217 @@ +/*------------------------------------------------------------------------- + * + * relation.c + * Generic relation related routines. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/common/relation.c + * + * NOTES + * This file contains relation_ routines that implement access to relations + * (tables, indexes, etc). Support that's specific to subtypes of relations + * should go into their respective files, not here. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/relation.h" +#include "access/xact.h" +#include "catalog/namespace.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/lmgr.h" +#include "utils/inval.h" +#include "utils/syscache.h" + + +/* ---------------- + * relation_open - open any relation by relation OID + * + * If lockmode is not "NoLock", the specified kind of lock is + * obtained on the relation. (Generally, NoLock should only be + * used if the caller knows it has some appropriate lock on the + * relation already.) + * + * An error is raised if the relation does not exist. + * + * NB: a "relation" is anything with a pg_class entry. The caller is + * expected to check whether the relkind is something it can handle. + * ---------------- + */ +Relation +relation_open(Oid relationId, LOCKMODE lockmode) +{ + Relation r; + + Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES); + + /* Get the lock before trying to open the relcache entry */ + if (lockmode != NoLock) + LockRelationOid(relationId, lockmode); + + /* The relcache does all the real work... */ + r = RelationIdGetRelation(relationId); + + if (!RelationIsValid(r)) + elog(ERROR, "could not open relation with OID %u", relationId); + + /* + * If we didn't get the lock ourselves, assert that caller holds one, + * except in bootstrap mode where no locks are used. + */ + Assert(lockmode != NoLock || + IsBootstrapProcessingMode() || + CheckRelationLockedByMe(r, AccessShareLock, true)); + + /* Make note that we've accessed a temporary relation */ + if (RelationUsesLocalBuffers(r)) + MyXactFlags |= XACT_FLAGS_ACCESSEDTEMPNAMESPACE; + + pgstat_initstats(r); + + return r; +} + +/* ---------------- + * try_relation_open - open any relation by relation OID + * + * Same as relation_open, except return NULL instead of failing + * if the relation does not exist. + * ---------------- + */ +Relation +try_relation_open(Oid relationId, LOCKMODE lockmode) +{ + Relation r; + + Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES); + + /* Get the lock first */ + if (lockmode != NoLock) + LockRelationOid(relationId, lockmode); + + /* + * Now that we have the lock, probe to see if the relation really exists + * or not. 
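A typical consumer of these routines, sketched under the assumption that relid is a valid pg_class OID (inspect_relation is a hypothetical name):

#include "postgres.h"

#include "access/relation.h"
#include "storage/lockdefs.h"
#include "utils/rel.h"

static void
inspect_relation(Oid relid)
{
	Relation	rel = try_relation_open(relid, AccessShareLock);

	if (rel == NULL)
		return;					/* relation was dropped concurrently */

	/* ... examine rel->rd_rel, rel->rd_att, etc. ... */

	relation_close(rel, AccessShareLock);	/* releases the lock too */
}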
+ */ + if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(relationId))) + { + /* Release useless lock */ + if (lockmode != NoLock) + UnlockRelationOid(relationId, lockmode); + + return NULL; + } + + /* Should be safe to do a relcache load */ + r = RelationIdGetRelation(relationId); + + if (!RelationIsValid(r)) + elog(ERROR, "could not open relation with OID %u", relationId); + + /* If we didn't get the lock ourselves, assert that caller holds one */ + Assert(lockmode != NoLock || + CheckRelationLockedByMe(r, AccessShareLock, true)); + + /* Make note that we've accessed a temporary relation */ + if (RelationUsesLocalBuffers(r)) + MyXactFlags |= XACT_FLAGS_ACCESSEDTEMPNAMESPACE; + + pgstat_initstats(r); + + return r; +} + +/* ---------------- + * relation_openrv - open any relation specified by a RangeVar + * + * Same as relation_open, but the relation is specified by a RangeVar. + * ---------------- + */ +Relation +relation_openrv(const RangeVar *relation, LOCKMODE lockmode) +{ + Oid relOid; + + /* + * Check for shared-cache-inval messages before trying to open the + * relation. This is needed even if we already hold a lock on the + * relation, because GRANT/REVOKE are executed without taking any lock on + * the target relation, and we want to be sure we see current ACL + * information. We can skip this if asked for NoLock, on the assumption + * that such a call is not the first one in the current command, and so we + * should be reasonably up-to-date already. (XXX this all could stand to + * be redesigned, but for the moment we'll keep doing this like it's been + * done historically.) + */ + if (lockmode != NoLock) + AcceptInvalidationMessages(); + + /* Look up and lock the appropriate relation using namespace search */ + relOid = RangeVarGetRelid(relation, lockmode, false); + + /* Let relation_open do the rest */ + return relation_open(relOid, NoLock); +} + +/* ---------------- + * relation_openrv_extended - open any relation specified by a RangeVar + * + * Same as relation_openrv, but with an additional missing_ok argument + * allowing a NULL return rather than an error if the relation is not + * found. (Note that some other causes, such as permissions problems, + * will still result in an ereport.) + * ---------------- + */ +Relation +relation_openrv_extended(const RangeVar *relation, LOCKMODE lockmode, + bool missing_ok) +{ + Oid relOid; + + /* + * Check for shared-cache-inval messages before trying to open the + * relation. See comments in relation_openrv(). + */ + if (lockmode != NoLock) + AcceptInvalidationMessages(); + + /* Look up and lock the appropriate relation using namespace search */ + relOid = RangeVarGetRelid(relation, lockmode, missing_ok); + + /* Return NULL on not-found */ + if (!OidIsValid(relOid)) + return NULL; + + /* Let relation_open do the rest */ + return relation_open(relOid, NoLock); +} + +/* ---------------- + * relation_close - close any relation + * + * If lockmode is not "NoLock", we then release the specified lock. + * + * Note that it is often sensible to hold a lock beyond relation_close; + * in that case, the lock is released automatically at xact end. + * ---------------- + */ +void +relation_close(Relation relation, LOCKMODE lockmode) +{ + LockRelId relid = relation->rd_lockInfo.lockRelId; + + Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES); + + /* The relcache does the real work... 
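Name-based opens go through a RangeVar. A plausible sketch (the function name is illustrative) that keeps the acquired lock until end of transaction by closing with NoLock:

#include "postgres.h"

#include "access/relation.h"
#include "nodes/makefuncs.h"
#include "storage/lockdefs.h"
#include "utils/rel.h"

static void
touch_relation_by_name(char *schemaname, char *relname)
{
	RangeVar   *rv = makeRangeVar(schemaname, relname, -1);
	Relation	rel = relation_openrv(rv, AccessShareLock);

	/* ... work with rel ... */

	relation_close(rel, NoLock);	/* hold the lock until xact end */
}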
*/ + RelationClose(relation); + + if (lockmode != NoLock) + UnlockRelationId(&relid, lockmode); +} diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c new file mode 100644 index 0000000..b5602f5 --- /dev/null +++ b/src/backend/access/common/reloptions.c @@ -0,0 +1,2131 @@ +/*------------------------------------------------------------------------- + * + * reloptions.c + * Core support for relation options (pg_class.reloptions) + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/common/reloptions.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include + +#include "access/gist_private.h" +#include "access/hash.h" +#include "access/heaptoast.h" +#include "access/htup_details.h" +#include "access/nbtree.h" +#include "access/reloptions.h" +#include "access/spgist_private.h" +#include "catalog/pg_type.h" +#include "commands/defrem.h" +#include "commands/tablespace.h" +#include "commands/view.h" +#include "nodes/makefuncs.h" +#include "postmaster/postmaster.h" +#include "utils/array.h" +#include "utils/attoptcache.h" +#include "utils/builtins.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +/* + * Contents of pg_class.reloptions + * + * To add an option: + * + * (i) decide on a type (integer, real, bool, string), name, default value, + * upper and lower bounds (if applicable); for strings, consider a validation + * routine. + * (ii) add a record below (or use add__reloption). + * (iii) add it to the appropriate options struct (perhaps StdRdOptions) + * (iv) add it to the appropriate handling routine (perhaps + * default_reloptions) + * (v) make sure the lock level is set correctly for that operation + * (vi) don't forget to document the option + * + * The default choice for any new option should be AccessExclusiveLock. + * In some cases the lock level can be reduced from there, but the lock + * level chosen should always conflict with itself to ensure that multiple + * changes aren't lost when we attempt concurrent changes. + * The choice of lock level depends completely upon how that parameter + * is used within the server, not upon how and when you'd like to change it. + * Safety first. Existing choices are documented here, and elsewhere in + * backend code where the parameters are used. + * + * In general, anything that affects the results obtained from a SELECT must be + * protected by AccessExclusiveLock. + * + * Autovacuum related parameters can be set at ShareUpdateExclusiveLock + * since they are only used by the AV procs and don't change anything + * currently executing. + * + * Fillfactor can be set because it applies only to subsequent changes made to + * data blocks, as documented in hio.c + * + * n_distinct options can be set at ShareUpdateExclusiveLock because they + * are only used during ANALYZE, which uses a ShareUpdateExclusiveLock, + * so the ANALYZE will not be affected by in-flight changes. Changing those + * values has no effect until the next ANALYZE, so no need for stronger lock. + * + * Planner-related parameters can be set with ShareUpdateExclusiveLock because + * they only affect planning and not the correctness of the execution. Plans + * cannot be changed in mid-flight, so changes here could not easily result in + * new improved plans in any case. 
So we allow existing queries to continue + * and existing plans to survive, a small price to pay for allowing better + * plans to be introduced concurrently without interfering with users. + * + * Setting parallel_workers is safe, since it acts the same as + * max_parallel_workers_per_gather which is a USERSET parameter that doesn't + * affect existing plans or queries. + * + * vacuum_truncate can be set at ShareUpdateExclusiveLock because it + * is only used during VACUUM, which uses a ShareUpdateExclusiveLock, + * so the VACUUM will not be affected by in-flight changes. Changing its + * value has no effect until the next VACUUM, so no need for stronger lock. + */ + +static relopt_bool boolRelOpts[] = +{ + { + { + "autosummarize", + "Enables automatic summarization on this BRIN index", + RELOPT_KIND_BRIN, + AccessExclusiveLock + }, + false + }, + { + { + "autovacuum_enabled", + "Enables autovacuum in this relation", + RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, + ShareUpdateExclusiveLock + }, + true + }, + { + { + "user_catalog_table", + "Declare a table as an additional catalog table, e.g. for the purpose of logical replication", + RELOPT_KIND_HEAP, + AccessExclusiveLock + }, + false + }, + { + { + "fastupdate", + "Enables \"fast update\" feature for this GIN index", + RELOPT_KIND_GIN, + AccessExclusiveLock + }, + true + }, + { + { + "security_barrier", + "View acts as a row security barrier", + RELOPT_KIND_VIEW, + AccessExclusiveLock + }, + false + }, + { + { + "vacuum_truncate", + "Enables vacuum to truncate empty pages at the end of this table", + RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, + ShareUpdateExclusiveLock + }, + true + }, + { + { + "deduplicate_items", + "Enables \"deduplicate items\" feature for this btree index", + RELOPT_KIND_BTREE, + ShareUpdateExclusiveLock /* since it applies only to later + * inserts */ + }, + true + }, + /* list terminator */ + {{NULL}} +}; + +static relopt_int intRelOpts[] = +{ + { + { + "fillfactor", + "Packs table pages only to this percentage", + RELOPT_KIND_HEAP, + ShareUpdateExclusiveLock /* since it applies only to later + * inserts */ + }, + HEAP_DEFAULT_FILLFACTOR, HEAP_MIN_FILLFACTOR, 100 + }, + { + { + "fillfactor", + "Packs btree index pages only to this percentage", + RELOPT_KIND_BTREE, + ShareUpdateExclusiveLock /* since it applies only to later + * inserts */ + }, + BTREE_DEFAULT_FILLFACTOR, BTREE_MIN_FILLFACTOR, 100 + }, + { + { + "fillfactor", + "Packs hash index pages only to this percentage", + RELOPT_KIND_HASH, + ShareUpdateExclusiveLock /* since it applies only to later + * inserts */ + }, + HASH_DEFAULT_FILLFACTOR, HASH_MIN_FILLFACTOR, 100 + }, + { + { + "fillfactor", + "Packs gist index pages only to this percentage", + RELOPT_KIND_GIST, + ShareUpdateExclusiveLock /* since it applies only to later + * inserts */ + }, + GIST_DEFAULT_FILLFACTOR, GIST_MIN_FILLFACTOR, 100 + }, + { + { + "fillfactor", + "Packs spgist index pages only to this percentage", + RELOPT_KIND_SPGIST, + ShareUpdateExclusiveLock /* since it applies only to later + * inserts */ + }, + SPGIST_DEFAULT_FILLFACTOR, SPGIST_MIN_FILLFACTOR, 100 + }, + { + { + "autovacuum_vacuum_threshold", + "Minimum number of tuple updates or deletes prior to vacuum", + RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, + ShareUpdateExclusiveLock + }, + -1, 0, INT_MAX + }, + { + { + "autovacuum_vacuum_insert_threshold", + "Minimum number of tuple inserts prior to vacuum, or -1 to disable insert vacuums", + RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, + ShareUpdateExclusiveLock + }, + -2, -1, INT_MAX + }, + { + { 
+ "autovacuum_analyze_threshold", + "Minimum number of tuple inserts, updates or deletes prior to analyze", + RELOPT_KIND_HEAP, + ShareUpdateExclusiveLock + }, + -1, 0, INT_MAX + }, + { + { + "autovacuum_vacuum_cost_limit", + "Vacuum cost amount available before napping, for autovacuum", + RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, + ShareUpdateExclusiveLock + }, + -1, 1, 10000 + }, + { + { + "autovacuum_freeze_min_age", + "Minimum age at which VACUUM should freeze a table row, for autovacuum", + RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, + ShareUpdateExclusiveLock + }, + -1, 0, 1000000000 + }, + { + { + "autovacuum_multixact_freeze_min_age", + "Minimum multixact age at which VACUUM should freeze a row multixact's, for autovacuum", + RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, + ShareUpdateExclusiveLock + }, + -1, 0, 1000000000 + }, + { + { + "autovacuum_freeze_max_age", + "Age at which to autovacuum a table to prevent transaction ID wraparound", + RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, + ShareUpdateExclusiveLock + }, + -1, 100000, 2000000000 + }, + { + { + "autovacuum_multixact_freeze_max_age", + "Multixact age at which to autovacuum a table to prevent multixact wraparound", + RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, + ShareUpdateExclusiveLock + }, + -1, 10000, 2000000000 + }, + { + { + "autovacuum_freeze_table_age", + "Age at which VACUUM should perform a full table sweep to freeze row versions", + RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, + ShareUpdateExclusiveLock + }, -1, 0, 2000000000 + }, + { + { + "autovacuum_multixact_freeze_table_age", + "Age of multixact at which VACUUM should perform a full table sweep to freeze row versions", + RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, + ShareUpdateExclusiveLock + }, -1, 0, 2000000000 + }, + { + { + "log_autovacuum_min_duration", + "Sets the minimum execution time above which autovacuum actions will be logged", + RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, + ShareUpdateExclusiveLock + }, + -1, -1, INT_MAX + }, + { + { + "toast_tuple_target", + "Sets the target tuple length at which external columns will be toasted", + RELOPT_KIND_HEAP, + ShareUpdateExclusiveLock + }, + TOAST_TUPLE_TARGET, 128, TOAST_TUPLE_TARGET_MAIN + }, + { + { + "pages_per_range", + "Number of pages that each page range covers in a BRIN index", + RELOPT_KIND_BRIN, + AccessExclusiveLock + }, 128, 1, 131072 + }, + { + { + "gin_pending_list_limit", + "Maximum size of the pending list for this GIN index, in kilobytes.", + RELOPT_KIND_GIN, + AccessExclusiveLock + }, + -1, 64, MAX_KILOBYTES + }, + { + { + "effective_io_concurrency", + "Number of simultaneous requests that can be handled efficiently by the disk subsystem.", + RELOPT_KIND_TABLESPACE, + ShareUpdateExclusiveLock + }, +#ifdef USE_PREFETCH + -1, 0, MAX_IO_CONCURRENCY +#else + 0, 0, 0 +#endif + }, + { + { + "maintenance_io_concurrency", + "Number of simultaneous requests that can be handled efficiently by the disk subsystem for maintenance work.", + RELOPT_KIND_TABLESPACE, + ShareUpdateExclusiveLock + }, +#ifdef USE_PREFETCH + -1, 0, MAX_IO_CONCURRENCY +#else + 0, 0, 0 +#endif + }, + { + { + "parallel_workers", + "Number of parallel processes that can be used per executor node for this relation.", + RELOPT_KIND_HEAP, + ShareUpdateExclusiveLock + }, + -1, 0, 1024 + }, + + /* list terminator */ + {{NULL}} +}; + +static relopt_real realRelOpts[] = +{ + { + { + "autovacuum_vacuum_cost_delay", + "Vacuum cost delay in milliseconds, for autovacuum", + RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, + ShareUpdateExclusiveLock + }, + -1, 0.0, 100.0 + }, + { + { + 
"autovacuum_vacuum_scale_factor", + "Number of tuple updates or deletes prior to vacuum as a fraction of reltuples", + RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, + ShareUpdateExclusiveLock + }, + -1, 0.0, 100.0 + }, + { + { + "autovacuum_vacuum_insert_scale_factor", + "Number of tuple inserts prior to vacuum as a fraction of reltuples", + RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, + ShareUpdateExclusiveLock + }, + -1, 0.0, 100.0 + }, + { + { + "autovacuum_analyze_scale_factor", + "Number of tuple inserts, updates or deletes prior to analyze as a fraction of reltuples", + RELOPT_KIND_HEAP, + ShareUpdateExclusiveLock + }, + -1, 0.0, 100.0 + }, + { + { + "seq_page_cost", + "Sets the planner's estimate of the cost of a sequentially fetched disk page.", + RELOPT_KIND_TABLESPACE, + ShareUpdateExclusiveLock + }, + -1, 0.0, DBL_MAX + }, + { + { + "random_page_cost", + "Sets the planner's estimate of the cost of a nonsequentially fetched disk page.", + RELOPT_KIND_TABLESPACE, + ShareUpdateExclusiveLock + }, + -1, 0.0, DBL_MAX + }, + { + { + "n_distinct", + "Sets the planner's estimate of the number of distinct values appearing in a column (excluding child relations).", + RELOPT_KIND_ATTRIBUTE, + ShareUpdateExclusiveLock + }, + 0, -1.0, DBL_MAX + }, + { + { + "n_distinct_inherited", + "Sets the planner's estimate of the number of distinct values appearing in a column (including child relations).", + RELOPT_KIND_ATTRIBUTE, + ShareUpdateExclusiveLock + }, + 0, -1.0, DBL_MAX + }, + { + { + "vacuum_cleanup_index_scale_factor", + "Deprecated B-Tree parameter.", + RELOPT_KIND_BTREE, + ShareUpdateExclusiveLock + }, + -1, 0.0, 1e10 + }, + /* list terminator */ + {{NULL}} +}; + +/* values from StdRdOptIndexCleanup */ +relopt_enum_elt_def StdRdOptIndexCleanupValues[] = +{ + {"auto", STDRD_OPTION_VACUUM_INDEX_CLEANUP_AUTO}, + {"on", STDRD_OPTION_VACUUM_INDEX_CLEANUP_ON}, + {"off", STDRD_OPTION_VACUUM_INDEX_CLEANUP_OFF}, + {"true", STDRD_OPTION_VACUUM_INDEX_CLEANUP_ON}, + {"false", STDRD_OPTION_VACUUM_INDEX_CLEANUP_OFF}, + {"yes", STDRD_OPTION_VACUUM_INDEX_CLEANUP_ON}, + {"no", STDRD_OPTION_VACUUM_INDEX_CLEANUP_OFF}, + {"1", STDRD_OPTION_VACUUM_INDEX_CLEANUP_ON}, + {"0", STDRD_OPTION_VACUUM_INDEX_CLEANUP_OFF}, + {(const char *) NULL} /* list terminator */ +}; + +/* values from GistOptBufferingMode */ +relopt_enum_elt_def gistBufferingOptValues[] = +{ + {"auto", GIST_OPTION_BUFFERING_AUTO}, + {"on", GIST_OPTION_BUFFERING_ON}, + {"off", GIST_OPTION_BUFFERING_OFF}, + {(const char *) NULL} /* list terminator */ +}; + +/* values from ViewOptCheckOption */ +relopt_enum_elt_def viewCheckOptValues[] = +{ + /* no value for NOT_SET */ + {"local", VIEW_OPTION_CHECK_OPTION_LOCAL}, + {"cascaded", VIEW_OPTION_CHECK_OPTION_CASCADED}, + {(const char *) NULL} /* list terminator */ +}; + +static relopt_enum enumRelOpts[] = +{ + { + { + "vacuum_index_cleanup", + "Controls index vacuuming and index cleanup", + RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, + ShareUpdateExclusiveLock + }, + StdRdOptIndexCleanupValues, + STDRD_OPTION_VACUUM_INDEX_CLEANUP_AUTO, + gettext_noop("Valid values are \"on\", \"off\", and \"auto\".") + }, + { + { + "buffering", + "Enables buffering build for this GiST index", + RELOPT_KIND_GIST, + AccessExclusiveLock + }, + gistBufferingOptValues, + GIST_OPTION_BUFFERING_AUTO, + gettext_noop("Valid values are \"on\", \"off\", and \"auto\".") + }, + { + { + "check_option", + "View has WITH CHECK OPTION defined (local or cascaded).", + RELOPT_KIND_VIEW, + AccessExclusiveLock + }, + viewCheckOptValues, + 
VIEW_OPTION_CHECK_OPTION_NOT_SET, + gettext_noop("Valid values are \"local\" and \"cascaded\".") + }, + /* list terminator */ + {{NULL}} +}; + +static relopt_string stringRelOpts[] = +{ + /* list terminator */ + {{NULL}} +}; + +static relopt_gen **relOpts = NULL; +static bits32 last_assigned_kind = RELOPT_KIND_LAST_DEFAULT; + +static int num_custom_options = 0; +static relopt_gen **custom_options = NULL; +static bool need_initialization = true; + +static void initialize_reloptions(void); +static void parse_one_reloption(relopt_value *option, char *text_str, + int text_len, bool validate); + +/* + * Get the length of a string reloption (either default or the user-defined + * value). This is used for allocation purposes when building a set of + * relation options. + */ +#define GET_STRING_RELOPTION_LEN(option) \ + ((option).isset ? strlen((option).values.string_val) : \ + ((relopt_string *) (option).gen)->default_len) + +/* + * initialize_reloptions + * initialization routine, must be called before parsing + * + * Initialize the relOpts array and fill each variable's type and name length. + */ +static void +initialize_reloptions(void) +{ + int i; + int j; + + j = 0; + for (i = 0; boolRelOpts[i].gen.name; i++) + { + Assert(DoLockModesConflict(boolRelOpts[i].gen.lockmode, + boolRelOpts[i].gen.lockmode)); + j++; + } + for (i = 0; intRelOpts[i].gen.name; i++) + { + Assert(DoLockModesConflict(intRelOpts[i].gen.lockmode, + intRelOpts[i].gen.lockmode)); + j++; + } + for (i = 0; realRelOpts[i].gen.name; i++) + { + Assert(DoLockModesConflict(realRelOpts[i].gen.lockmode, + realRelOpts[i].gen.lockmode)); + j++; + } + for (i = 0; enumRelOpts[i].gen.name; i++) + { + Assert(DoLockModesConflict(enumRelOpts[i].gen.lockmode, + enumRelOpts[i].gen.lockmode)); + j++; + } + for (i = 0; stringRelOpts[i].gen.name; i++) + { + Assert(DoLockModesConflict(stringRelOpts[i].gen.lockmode, + stringRelOpts[i].gen.lockmode)); + j++; + } + j += num_custom_options; + + if (relOpts) + pfree(relOpts); + relOpts = MemoryContextAlloc(TopMemoryContext, + (j + 1) * sizeof(relopt_gen *)); + + j = 0; + for (i = 0; boolRelOpts[i].gen.name; i++) + { + relOpts[j] = &boolRelOpts[i].gen; + relOpts[j]->type = RELOPT_TYPE_BOOL; + relOpts[j]->namelen = strlen(relOpts[j]->name); + j++; + } + + for (i = 0; intRelOpts[i].gen.name; i++) + { + relOpts[j] = &intRelOpts[i].gen; + relOpts[j]->type = RELOPT_TYPE_INT; + relOpts[j]->namelen = strlen(relOpts[j]->name); + j++; + } + + for (i = 0; realRelOpts[i].gen.name; i++) + { + relOpts[j] = &realRelOpts[i].gen; + relOpts[j]->type = RELOPT_TYPE_REAL; + relOpts[j]->namelen = strlen(relOpts[j]->name); + j++; + } + + for (i = 0; enumRelOpts[i].gen.name; i++) + { + relOpts[j] = &enumRelOpts[i].gen; + relOpts[j]->type = RELOPT_TYPE_ENUM; + relOpts[j]->namelen = strlen(relOpts[j]->name); + j++; + } + + for (i = 0; stringRelOpts[i].gen.name; i++) + { + relOpts[j] = &stringRelOpts[i].gen; + relOpts[j]->type = RELOPT_TYPE_STRING; + relOpts[j]->namelen = strlen(relOpts[j]->name); + j++; + } + + for (i = 0; i < num_custom_options; i++) + { + relOpts[j] = custom_options[i]; + j++; + } + + /* add a list terminator */ + relOpts[j] = NULL; + + /* flag the work is complete */ + need_initialization = false; +} + +/* + * add_reloption_kind + * Create a new relopt_kind value, to be used in custom reloptions by + * user-defined AMs. 
+ */ +relopt_kind +add_reloption_kind(void) +{ + /* don't hand out the last bit so that the enum's behavior is portable */ + if (last_assigned_kind >= RELOPT_KIND_MAX) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("user-defined relation parameter types limit exceeded"))); + last_assigned_kind <<= 1; + return (relopt_kind) last_assigned_kind; +} + +/* + * add_reloption + * Add an already-created custom reloption to the list, and recompute the + * main parser table. + */ +static void +add_reloption(relopt_gen *newoption) +{ + static int max_custom_options = 0; + + if (num_custom_options >= max_custom_options) + { + MemoryContext oldcxt; + + oldcxt = MemoryContextSwitchTo(TopMemoryContext); + + if (max_custom_options == 0) + { + max_custom_options = 8; + custom_options = palloc(max_custom_options * sizeof(relopt_gen *)); + } + else + { + max_custom_options *= 2; + custom_options = repalloc(custom_options, + max_custom_options * sizeof(relopt_gen *)); + } + MemoryContextSwitchTo(oldcxt); + } + custom_options[num_custom_options++] = newoption; + + need_initialization = true; +} + +/* + * init_local_reloptions + * Initialize local reloptions that will parsed into bytea structure of + * 'relopt_struct_size'. + */ +void +init_local_reloptions(local_relopts *opts, Size relopt_struct_size) +{ + opts->options = NIL; + opts->validators = NIL; + opts->relopt_struct_size = relopt_struct_size; +} + +/* + * register_reloptions_validator + * Register custom validation callback that will be called at the end of + * build_local_reloptions(). + */ +void +register_reloptions_validator(local_relopts *opts, relopts_validator validator) +{ + opts->validators = lappend(opts->validators, validator); +} + +/* + * add_local_reloption + * Add an already-created custom reloption to the local list. 
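An extension that defines its own access method would use this API at library load time, along the general lines of contrib/bloom; the kind variable, option name and limits below are illustrative only:

#include "postgres.h"

#include "access/reloptions.h"
#include "fmgr.h"
#include "storage/lockdefs.h"

PG_MODULE_MAGIC;

void		_PG_init(void);

static relopt_kind my_relopt_kind;

void
_PG_init(void)
{
	my_relopt_kind = add_reloption_kind();

	add_int_reloption(my_relopt_kind, "signature_length",
					  "Signature length for this index, in bits",
					  64, 1, 4096,
					  AccessExclusiveLock);
}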
+ */ +static void +add_local_reloption(local_relopts *relopts, relopt_gen *newoption, int offset) +{ + local_relopt *opt = palloc(sizeof(*opt)); + + Assert(offset < relopts->relopt_struct_size); + + opt->option = newoption; + opt->offset = offset; + + relopts->options = lappend(relopts->options, opt); +} + +/* + * allocate_reloption + * Allocate a new reloption and initialize the type-agnostic fields + * (for types other than string) + */ +static relopt_gen * +allocate_reloption(bits32 kinds, int type, const char *name, const char *desc, + LOCKMODE lockmode) +{ + MemoryContext oldcxt; + size_t size; + relopt_gen *newoption; + + if (kinds != RELOPT_KIND_LOCAL) + oldcxt = MemoryContextSwitchTo(TopMemoryContext); + else + oldcxt = NULL; + + switch (type) + { + case RELOPT_TYPE_BOOL: + size = sizeof(relopt_bool); + break; + case RELOPT_TYPE_INT: + size = sizeof(relopt_int); + break; + case RELOPT_TYPE_REAL: + size = sizeof(relopt_real); + break; + case RELOPT_TYPE_ENUM: + size = sizeof(relopt_enum); + break; + case RELOPT_TYPE_STRING: + size = sizeof(relopt_string); + break; + default: + elog(ERROR, "unsupported reloption type %d", type); + return NULL; /* keep compiler quiet */ + } + + newoption = palloc(size); + + newoption->name = pstrdup(name); + if (desc) + newoption->desc = pstrdup(desc); + else + newoption->desc = NULL; + newoption->kinds = kinds; + newoption->namelen = strlen(name); + newoption->type = type; + newoption->lockmode = lockmode; + + if (oldcxt != NULL) + MemoryContextSwitchTo(oldcxt); + + return newoption; +} + +/* + * init_bool_reloption + * Allocate and initialize a new boolean reloption + */ +static relopt_bool * +init_bool_reloption(bits32 kinds, const char *name, const char *desc, + bool default_val, LOCKMODE lockmode) +{ + relopt_bool *newoption; + + newoption = (relopt_bool *) allocate_reloption(kinds, RELOPT_TYPE_BOOL, + name, desc, lockmode); + newoption->default_val = default_val; + + return newoption; +} + +/* + * add_bool_reloption + * Add a new boolean reloption + */ +void +add_bool_reloption(bits32 kinds, const char *name, const char *desc, + bool default_val, LOCKMODE lockmode) +{ + relopt_bool *newoption = init_bool_reloption(kinds, name, desc, + default_val, lockmode); + + add_reloption((relopt_gen *) newoption); +} + +/* + * add_local_bool_reloption + * Add a new boolean local reloption + * + * 'offset' is offset of bool-typed field. 
+ */ +void +add_local_bool_reloption(local_relopts *relopts, const char *name, + const char *desc, bool default_val, int offset) +{ + relopt_bool *newoption = init_bool_reloption(RELOPT_KIND_LOCAL, + name, desc, + default_val, 0); + + add_local_reloption(relopts, (relopt_gen *) newoption, offset); +} + + +/* + * init_real_reloption + * Allocate and initialize a new integer reloption + */ +static relopt_int * +init_int_reloption(bits32 kinds, const char *name, const char *desc, + int default_val, int min_val, int max_val, + LOCKMODE lockmode) +{ + relopt_int *newoption; + + newoption = (relopt_int *) allocate_reloption(kinds, RELOPT_TYPE_INT, + name, desc, lockmode); + newoption->default_val = default_val; + newoption->min = min_val; + newoption->max = max_val; + + return newoption; +} + +/* + * add_int_reloption + * Add a new integer reloption + */ +void +add_int_reloption(bits32 kinds, const char *name, const char *desc, int default_val, + int min_val, int max_val, LOCKMODE lockmode) +{ + relopt_int *newoption = init_int_reloption(kinds, name, desc, + default_val, min_val, + max_val, lockmode); + + add_reloption((relopt_gen *) newoption); +} + +/* + * add_local_int_reloption + * Add a new local integer reloption + * + * 'offset' is offset of int-typed field. + */ +void +add_local_int_reloption(local_relopts *relopts, const char *name, + const char *desc, int default_val, int min_val, + int max_val, int offset) +{ + relopt_int *newoption = init_int_reloption(RELOPT_KIND_LOCAL, + name, desc, default_val, + min_val, max_val, 0); + + add_local_reloption(relopts, (relopt_gen *) newoption, offset); +} + +/* + * init_real_reloption + * Allocate and initialize a new real reloption + */ +static relopt_real * +init_real_reloption(bits32 kinds, const char *name, const char *desc, + double default_val, double min_val, double max_val, + LOCKMODE lockmode) +{ + relopt_real *newoption; + + newoption = (relopt_real *) allocate_reloption(kinds, RELOPT_TYPE_REAL, + name, desc, lockmode); + newoption->default_val = default_val; + newoption->min = min_val; + newoption->max = max_val; + + return newoption; +} + +/* + * add_real_reloption + * Add a new float reloption + */ +void +add_real_reloption(bits32 kinds, const char *name, const char *desc, + double default_val, double min_val, double max_val, + LOCKMODE lockmode) +{ + relopt_real *newoption = init_real_reloption(kinds, name, desc, + default_val, min_val, + max_val, lockmode); + + add_reloption((relopt_gen *) newoption); +} + +/* + * add_local_real_reloption + * Add a new local float reloption + * + * 'offset' is offset of double-typed field. 
+ */ +void +add_local_real_reloption(local_relopts *relopts, const char *name, + const char *desc, double default_val, + double min_val, double max_val, int offset) +{ + relopt_real *newoption = init_real_reloption(RELOPT_KIND_LOCAL, + name, desc, + default_val, min_val, + max_val, 0); + + add_local_reloption(relopts, (relopt_gen *) newoption, offset); +} + +/* + * init_enum_reloption + * Allocate and initialize a new enum reloption + */ +static relopt_enum * +init_enum_reloption(bits32 kinds, const char *name, const char *desc, + relopt_enum_elt_def *members, int default_val, + const char *detailmsg, LOCKMODE lockmode) +{ + relopt_enum *newoption; + + newoption = (relopt_enum *) allocate_reloption(kinds, RELOPT_TYPE_ENUM, + name, desc, lockmode); + newoption->members = members; + newoption->default_val = default_val; + newoption->detailmsg = detailmsg; + + return newoption; +} + + +/* + * add_enum_reloption + * Add a new enum reloption + * + * The members array must have a terminating NULL entry. + * + * The detailmsg is shown when unsupported values are passed, and has this + * form: "Valid values are \"foo\", \"bar\", and \"bar\"." + * + * The members array and detailmsg are not copied -- caller must ensure that + * they are valid throughout the life of the process. + */ +void +add_enum_reloption(bits32 kinds, const char *name, const char *desc, + relopt_enum_elt_def *members, int default_val, + const char *detailmsg, LOCKMODE lockmode) +{ + relopt_enum *newoption = init_enum_reloption(kinds, name, desc, + members, default_val, + detailmsg, lockmode); + + add_reloption((relopt_gen *) newoption); +} + +/* + * add_local_enum_reloption + * Add a new local enum reloption + * + * 'offset' is offset of int-typed field. + */ +void +add_local_enum_reloption(local_relopts *relopts, const char *name, + const char *desc, relopt_enum_elt_def *members, + int default_val, const char *detailmsg, int offset) +{ + relopt_enum *newoption = init_enum_reloption(RELOPT_KIND_LOCAL, + name, desc, + members, default_val, + detailmsg, 0); + + add_local_reloption(relopts, (relopt_gen *) newoption, offset); +} + +/* + * init_string_reloption + * Allocate and initialize a new string reloption + */ +static relopt_string * +init_string_reloption(bits32 kinds, const char *name, const char *desc, + const char *default_val, + validate_string_relopt validator, + fill_string_relopt filler, + LOCKMODE lockmode) +{ + relopt_string *newoption; + + /* make sure the validator/default combination is sane */ + if (validator) + (validator) (default_val); + + newoption = (relopt_string *) allocate_reloption(kinds, RELOPT_TYPE_STRING, + name, desc, lockmode); + newoption->validate_cb = validator; + newoption->fill_cb = filler; + if (default_val) + { + if (kinds == RELOPT_KIND_LOCAL) + newoption->default_val = strdup(default_val); + else + newoption->default_val = MemoryContextStrdup(TopMemoryContext, default_val); + newoption->default_len = strlen(default_val); + newoption->default_isnull = false; + } + else + { + newoption->default_val = ""; + newoption->default_len = 0; + newoption->default_isnull = true; + } + + return newoption; +} + +/* + * add_string_reloption + * Add a new string reloption + * + * "validator" is an optional function pointer that can be used to test the + * validity of the values. It must elog(ERROR) when the argument string is + * not acceptable for the variable. Note that the default value must pass + * the validation. 
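The local variants are meant for options that are parsed into a caller-supplied struct rather than registered globally, as opclass options support functions do. A rough sketch with hypothetical struct, field and function names (build_local_reloptions(), defined later in this file, performs the actual parsing at use time):

#include "postgres.h"

#include "access/reloptions.h"
#include "fmgr.h"

typedef struct MyOpClassOptions
{
	int32		vl_len_;		/* varlena header */
	int			siglen;			/* hypothetical option field */
} MyOpClassOptions;

PG_FUNCTION_INFO_V1(my_opclass_options);

Datum
my_opclass_options(PG_FUNCTION_ARGS)
{
	local_relopts *relopts = (local_relopts *) PG_GETARG_POINTER(0);

	init_local_reloptions(relopts, sizeof(MyOpClassOptions));
	add_local_int_reloption(relopts, "siglen",
							"signature length in bytes",
							16, 1, 2048,
							offsetof(MyOpClassOptions, siglen));

	PG_RETURN_VOID();
}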
+ */ +void +add_string_reloption(bits32 kinds, const char *name, const char *desc, + const char *default_val, validate_string_relopt validator, + LOCKMODE lockmode) +{ + relopt_string *newoption = init_string_reloption(kinds, name, desc, + default_val, + validator, NULL, + lockmode); + + add_reloption((relopt_gen *) newoption); +} + +/* + * add_local_string_reloption + * Add a new local string reloption + * + * 'offset' is offset of int-typed field that will store offset of string value + * in the resulting bytea structure. + */ +void +add_local_string_reloption(local_relopts *relopts, const char *name, + const char *desc, const char *default_val, + validate_string_relopt validator, + fill_string_relopt filler, int offset) +{ + relopt_string *newoption = init_string_reloption(RELOPT_KIND_LOCAL, + name, desc, + default_val, + validator, filler, + 0); + + add_local_reloption(relopts, (relopt_gen *) newoption, offset); +} + +/* + * Transform a relation options list (list of DefElem) into the text array + * format that is kept in pg_class.reloptions, including only those options + * that are in the passed namespace. The output values do not include the + * namespace. + * + * This is used for three cases: CREATE TABLE/INDEX, ALTER TABLE SET, and + * ALTER TABLE RESET. In the ALTER cases, oldOptions is the existing + * reloptions value (possibly NULL), and we replace or remove entries + * as needed. + * + * If acceptOidsOff is true, then we allow oids = false, but throw error when + * on. This is solely needed for backwards compatibility. + * + * Note that this is not responsible for determining whether the options + * are valid, but it does check that namespaces for all the options given are + * listed in validnsps. The NULL namespace is always valid and need not be + * explicitly listed. Passing a NULL pointer means that only the NULL + * namespace is valid. + * + * Both oldOptions and the result are text arrays (or NULL for "default"), + * but we declare them as Datums to avoid including array.h in reloptions.h. 
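Sketching the CREATE/SET direction with a single illustrative option (in real callers the DefElem list comes from the grammar rather than being built by hand):

#include "postgres.h"

#include "access/reloptions.h"
#include "nodes/makefuncs.h"
#include "nodes/pg_list.h"
#include "nodes/value.h"

static Datum
fillfactor_as_reloptions(void)
{
	/* the "toast" namespace plus the NULL namespace are accepted */
	static char *validnsps[] = HEAP_RELOPT_NAMESPACES;
	List	   *defs = list_make1(makeDefElem("fillfactor",
											  (Node *) makeInteger(70), -1));

	/* no old options, CREATE-style call, reject WITH (oids = true) */
	return transformRelOptions((Datum) 0, defs, NULL, validnsps,
							   true, false);
}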
+ */ +Datum +transformRelOptions(Datum oldOptions, List *defList, const char *namspace, + char *validnsps[], bool acceptOidsOff, bool isReset) +{ + Datum result; + ArrayBuildState *astate; + ListCell *cell; + + /* no change if empty list */ + if (defList == NIL) + return oldOptions; + + /* We build new array using accumArrayResult */ + astate = NULL; + + /* Copy any oldOptions that aren't to be replaced */ + if (PointerIsValid(DatumGetPointer(oldOptions))) + { + ArrayType *array = DatumGetArrayTypeP(oldOptions); + Datum *oldoptions; + int noldoptions; + int i; + + deconstruct_array(array, TEXTOID, -1, false, TYPALIGN_INT, + &oldoptions, NULL, &noldoptions); + + for (i = 0; i < noldoptions; i++) + { + char *text_str = VARDATA(oldoptions[i]); + int text_len = VARSIZE(oldoptions[i]) - VARHDRSZ; + + /* Search for a match in defList */ + foreach(cell, defList) + { + DefElem *def = (DefElem *) lfirst(cell); + int kw_len; + + /* ignore if not in the same namespace */ + if (namspace == NULL) + { + if (def->defnamespace != NULL) + continue; + } + else if (def->defnamespace == NULL) + continue; + else if (strcmp(def->defnamespace, namspace) != 0) + continue; + + kw_len = strlen(def->defname); + if (text_len > kw_len && text_str[kw_len] == '=' && + strncmp(text_str, def->defname, kw_len) == 0) + break; + } + if (!cell) + { + /* No match, so keep old option */ + astate = accumArrayResult(astate, oldoptions[i], + false, TEXTOID, + CurrentMemoryContext); + } + } + } + + /* + * If CREATE/SET, add new options to array; if RESET, just check that the + * user didn't say RESET (option=val). (Must do this because the grammar + * doesn't enforce it.) + */ + foreach(cell, defList) + { + DefElem *def = (DefElem *) lfirst(cell); + + if (isReset) + { + if (def->arg != NULL) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("RESET must not include values for parameters"))); + } + else + { + text *t; + const char *value; + Size len; + + /* + * Error out if the namespace is not valid. A NULL namespace is + * always valid. + */ + if (def->defnamespace != NULL) + { + bool valid = false; + int i; + + if (validnsps) + { + for (i = 0; validnsps[i]; i++) + { + if (strcmp(def->defnamespace, validnsps[i]) == 0) + { + valid = true; + break; + } + } + } + + if (!valid) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unrecognized parameter namespace \"%s\"", + def->defnamespace))); + } + + /* ignore if not in the same namespace */ + if (namspace == NULL) + { + if (def->defnamespace != NULL) + continue; + } + else if (def->defnamespace == NULL) + continue; + else if (strcmp(def->defnamespace, namspace) != 0) + continue; + + /* + * Flatten the DefElem into a text string like "name=arg". If we + * have just "name", assume "name=true" is meant. Note: the + * namespace is not output. + */ + if (def->arg != NULL) + value = defGetString(def); + else + value = "true"; + + /* + * This is not a great place for this test, but there's no other + * convenient place to filter the option out. As WITH (oids = + * false) will be removed someday, this seems like an acceptable + * amount of ugly. 
+ */ + if (acceptOidsOff && def->defnamespace == NULL && + strcmp(def->defname, "oids") == 0) + { + if (defGetBoolean(def)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("tables declared WITH OIDS are not supported"))); + /* skip over option, reloptions machinery doesn't know it */ + continue; + } + + len = VARHDRSZ + strlen(def->defname) + 1 + strlen(value); + /* +1 leaves room for sprintf's trailing null */ + t = (text *) palloc(len + 1); + SET_VARSIZE(t, len); + sprintf(VARDATA(t), "%s=%s", def->defname, value); + + astate = accumArrayResult(astate, PointerGetDatum(t), + false, TEXTOID, + CurrentMemoryContext); + } + } + + if (astate) + result = makeArrayResult(astate, CurrentMemoryContext); + else + result = (Datum) 0; + + return result; +} + + +/* + * Convert the text-array format of reloptions into a List of DefElem. + * This is the inverse of transformRelOptions(). + */ +List * +untransformRelOptions(Datum options) +{ + List *result = NIL; + ArrayType *array; + Datum *optiondatums; + int noptions; + int i; + + /* Nothing to do if no options */ + if (!PointerIsValid(DatumGetPointer(options))) + return result; + + array = DatumGetArrayTypeP(options); + + deconstruct_array(array, TEXTOID, -1, false, TYPALIGN_INT, + &optiondatums, NULL, &noptions); + + for (i = 0; i < noptions; i++) + { + char *s; + char *p; + Node *val = NULL; + + s = TextDatumGetCString(optiondatums[i]); + p = strchr(s, '='); + if (p) + { + *p++ = '\0'; + val = (Node *) makeString(pstrdup(p)); + } + result = lappend(result, makeDefElem(pstrdup(s), val, -1)); + } + + return result; +} + +/* + * Extract and parse reloptions from a pg_class tuple. + * + * This is a low-level routine, expected to be used by relcache code and + * callers that do not have a table's relcache entry (e.g. autovacuum). For + * other uses, consider grabbing the rd_options pointer from the relcache entry + * instead. + * + * tupdesc is pg_class' tuple descriptor. amoptions is a pointer to the index + * AM's options parser function in the case of a tuple corresponding to an + * index, or NULL otherwise. 
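For a caller that is scanning pg_class directly (autovacuum does something along these lines), usage reduces to roughly the following; classRel and tup are assumed inputs and the helper name is hypothetical:

#include "postgres.h"

#include "access/htup.h"
#include "access/reloptions.h"
#include "utils/rel.h"

static bytea *
get_table_options(Relation classRel, HeapTuple tup)
{
	/* passing NULL for amoptions is fine for anything that is not an index */
	return extractRelOptions(tup, RelationGetDescr(classRel), NULL);
}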
+ */ +bytea * +extractRelOptions(HeapTuple tuple, TupleDesc tupdesc, + amoptions_function amoptions) +{ + bytea *options; + bool isnull; + Datum datum; + Form_pg_class classForm; + + datum = fastgetattr(tuple, + Anum_pg_class_reloptions, + tupdesc, + &isnull); + if (isnull) + return NULL; + + classForm = (Form_pg_class) GETSTRUCT(tuple); + + /* Parse into appropriate format; don't error out here */ + switch (classForm->relkind) + { + case RELKIND_RELATION: + case RELKIND_TOASTVALUE: + case RELKIND_MATVIEW: + options = heap_reloptions(classForm->relkind, datum, false); + break; + case RELKIND_PARTITIONED_TABLE: + options = partitioned_table_reloptions(datum, false); + break; + case RELKIND_VIEW: + options = view_reloptions(datum, false); + break; + case RELKIND_INDEX: + case RELKIND_PARTITIONED_INDEX: + options = index_reloptions(amoptions, datum, false); + break; + case RELKIND_FOREIGN_TABLE: + options = NULL; + break; + default: + Assert(false); /* can't get here */ + options = NULL; /* keep compiler quiet */ + break; + } + + return options; +} + +static void +parseRelOptionsInternal(Datum options, bool validate, + relopt_value *reloptions, int numoptions) +{ + ArrayType *array = DatumGetArrayTypeP(options); + Datum *optiondatums; + int noptions; + int i; + + deconstruct_array(array, TEXTOID, -1, false, TYPALIGN_INT, + &optiondatums, NULL, &noptions); + + for (i = 0; i < noptions; i++) + { + char *text_str = VARDATA(optiondatums[i]); + int text_len = VARSIZE(optiondatums[i]) - VARHDRSZ; + int j; + + /* Search for a match in reloptions */ + for (j = 0; j < numoptions; j++) + { + int kw_len = reloptions[j].gen->namelen; + + if (text_len > kw_len && text_str[kw_len] == '=' && + strncmp(text_str, reloptions[j].gen->name, kw_len) == 0) + { + parse_one_reloption(&reloptions[j], text_str, text_len, + validate); + break; + } + } + + if (j >= numoptions && validate) + { + char *s; + char *p; + + s = TextDatumGetCString(optiondatums[i]); + p = strchr(s, '='); + if (p) + *p = '\0'; + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unrecognized parameter \"%s\"", s))); + } + } + + /* It's worth avoiding memory leaks in this function */ + pfree(optiondatums); + + if (((void *) array) != DatumGetPointer(options)) + pfree(array); +} + +/* + * Interpret reloptions that are given in text-array format. + * + * options is a reloption text array as constructed by transformRelOptions. + * kind specifies the family of options to be processed. + * + * The return value is a relopt_value * array on which the options actually + * set in the options array are marked with isset=true. The length of this + * array is returned in *numrelopts. Options not set are also present in the + * array; this is so that the caller can easily locate the default values. + * + * If there are no options of the given kind, numrelopts is set to 0 and NULL + * is returned (unless options are illegally supplied despite none being + * defined, in which case an error occurs). + * + * Note: values of type int, bool and real are allocated as part of the + * returned array. Values of type string are allocated separately and must + * be freed by the caller. 
+ */ +static relopt_value * +parseRelOptions(Datum options, bool validate, relopt_kind kind, + int *numrelopts) +{ + relopt_value *reloptions = NULL; + int numoptions = 0; + int i; + int j; + + if (need_initialization) + initialize_reloptions(); + + /* Build a list of expected options, based on kind */ + + for (i = 0; relOpts[i]; i++) + if (relOpts[i]->kinds & kind) + numoptions++; + + if (numoptions > 0) + { + reloptions = palloc(numoptions * sizeof(relopt_value)); + + for (i = 0, j = 0; relOpts[i]; i++) + { + if (relOpts[i]->kinds & kind) + { + reloptions[j].gen = relOpts[i]; + reloptions[j].isset = false; + j++; + } + } + } + + /* Done if no options */ + if (PointerIsValid(DatumGetPointer(options))) + parseRelOptionsInternal(options, validate, reloptions, numoptions); + + *numrelopts = numoptions; + return reloptions; +} + +/* Parse local unregistered options. */ +static relopt_value * +parseLocalRelOptions(local_relopts *relopts, Datum options, bool validate) +{ + int nopts = list_length(relopts->options); + relopt_value *values = palloc(sizeof(*values) * nopts); + ListCell *lc; + int i = 0; + + foreach(lc, relopts->options) + { + local_relopt *opt = lfirst(lc); + + values[i].gen = opt->option; + values[i].isset = false; + + i++; + } + + if (options != (Datum) 0) + parseRelOptionsInternal(options, validate, values, nopts); + + return values; +} + +/* + * Subroutine for parseRelOptions, to parse and validate a single option's + * value + */ +static void +parse_one_reloption(relopt_value *option, char *text_str, int text_len, + bool validate) +{ + char *value; + int value_len; + bool parsed; + bool nofree = false; + + if (option->isset && validate) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("parameter \"%s\" specified more than once", + option->gen->name))); + + value_len = text_len - option->gen->namelen - 1; + value = (char *) palloc(value_len + 1); + memcpy(value, text_str + option->gen->namelen + 1, value_len); + value[value_len] = '\0'; + + switch (option->gen->type) + { + case RELOPT_TYPE_BOOL: + { + parsed = parse_bool(value, &option->values.bool_val); + if (validate && !parsed) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for boolean option \"%s\": %s", + option->gen->name, value))); + } + break; + case RELOPT_TYPE_INT: + { + relopt_int *optint = (relopt_int *) option->gen; + + parsed = parse_int(value, &option->values.int_val, 0, NULL); + if (validate && !parsed) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for integer option \"%s\": %s", + option->gen->name, value))); + if (validate && (option->values.int_val < optint->min || + option->values.int_val > optint->max)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("value %s out of bounds for option \"%s\"", + value, option->gen->name), + errdetail("Valid values are between \"%d\" and \"%d\".", + optint->min, optint->max))); + } + break; + case RELOPT_TYPE_REAL: + { + relopt_real *optreal = (relopt_real *) option->gen; + + parsed = parse_real(value, &option->values.real_val, 0, NULL); + if (validate && !parsed) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for floating point option \"%s\": %s", + option->gen->name, value))); + if (validate && (option->values.real_val < optreal->min || + option->values.real_val > optreal->max)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("value %s out of bounds for option \"%s\"", + value, 
option->gen->name), + errdetail("Valid values are between \"%f\" and \"%f\".", + optreal->min, optreal->max))); + } + break; + case RELOPT_TYPE_ENUM: + { + relopt_enum *optenum = (relopt_enum *) option->gen; + relopt_enum_elt_def *elt; + + parsed = false; + for (elt = optenum->members; elt->string_val; elt++) + { + if (pg_strcasecmp(value, elt->string_val) == 0) + { + option->values.enum_val = elt->symbol_val; + parsed = true; + break; + } + } + if (validate && !parsed) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for enum option \"%s\": %s", + option->gen->name, value), + optenum->detailmsg ? + errdetail_internal("%s", _(optenum->detailmsg)) : 0)); + + /* + * If value is not among the allowed string values, but we are + * not asked to validate, just use the default numeric value. + */ + if (!parsed) + option->values.enum_val = optenum->default_val; + } + break; + case RELOPT_TYPE_STRING: + { + relopt_string *optstring = (relopt_string *) option->gen; + + option->values.string_val = value; + nofree = true; + if (validate && optstring->validate_cb) + (optstring->validate_cb) (value); + parsed = true; + } + break; + default: + elog(ERROR, "unsupported reloption type %d", option->gen->type); + parsed = true; /* quiet compiler */ + break; + } + + if (parsed) + option->isset = true; + if (!nofree) + pfree(value); +} + +/* + * Given the result from parseRelOptions, allocate a struct that's of the + * specified base size plus any extra space that's needed for string variables. + * + * "base" should be sizeof(struct) of the reloptions struct (StdRdOptions or + * equivalent). + */ +static void * +allocateReloptStruct(Size base, relopt_value *options, int numoptions) +{ + Size size = base; + int i; + + for (i = 0; i < numoptions; i++) + { + relopt_value *optval = &options[i]; + + if (optval->gen->type == RELOPT_TYPE_STRING) + { + relopt_string *optstr = (relopt_string *) optval->gen; + + if (optstr->fill_cb) + { + const char *val = optval->isset ? optval->values.string_val : + optstr->default_isnull ? NULL : optstr->default_val; + + size += optstr->fill_cb(val, NULL); + } + else + size += GET_STRING_RELOPTION_LEN(*optval) + 1; + } + } + + return palloc0(size); +} + +/* + * Given the result of parseRelOptions and a parsing table, fill in the + * struct (previously allocated with allocateReloptStruct) with the parsed + * values. + * + * rdopts is the pointer to the allocated struct to be filled. + * basesize is the sizeof(struct) that was passed to allocateReloptStruct. + * options, of length numoptions, is parseRelOptions' output. + * elems, of length numelems, is the table describing the allowed options. + * When validate is true, it is expected that all options appear in elems. + */ +static void +fillRelOptions(void *rdopts, Size basesize, + relopt_value *options, int numoptions, + bool validate, + const relopt_parse_elt *elems, int numelems) +{ + int i; + int offset = basesize; + + for (i = 0; i < numoptions; i++) + { + int j; + bool found = false; + + for (j = 0; j < numelems; j++) + { + if (strcmp(options[i].gen->name, elems[j].optname) == 0) + { + relopt_string *optstring; + char *itempos = ((char *) rdopts) + elems[j].offset; + char *string_val; + + switch (options[i].gen->type) + { + case RELOPT_TYPE_BOOL: + *(bool *) itempos = options[i].isset ? + options[i].values.bool_val : + ((relopt_bool *) options[i].gen)->default_val; + break; + case RELOPT_TYPE_INT: + *(int *) itempos = options[i].isset ? 
+ options[i].values.int_val : + ((relopt_int *) options[i].gen)->default_val; + break; + case RELOPT_TYPE_REAL: + *(double *) itempos = options[i].isset ? + options[i].values.real_val : + ((relopt_real *) options[i].gen)->default_val; + break; + case RELOPT_TYPE_ENUM: + *(int *) itempos = options[i].isset ? + options[i].values.enum_val : + ((relopt_enum *) options[i].gen)->default_val; + break; + case RELOPT_TYPE_STRING: + optstring = (relopt_string *) options[i].gen; + if (options[i].isset) + string_val = options[i].values.string_val; + else if (!optstring->default_isnull) + string_val = optstring->default_val; + else + string_val = NULL; + + if (optstring->fill_cb) + { + Size size = + optstring->fill_cb(string_val, + (char *) rdopts + offset); + + if (size) + { + *(int *) itempos = offset; + offset += size; + } + else + *(int *) itempos = 0; + } + else if (string_val == NULL) + *(int *) itempos = 0; + else + { + strcpy((char *) rdopts + offset, string_val); + *(int *) itempos = offset; + offset += strlen(string_val) + 1; + } + break; + default: + elog(ERROR, "unsupported reloption type %d", + options[i].gen->type); + break; + } + found = true; + break; + } + } + if (validate && !found) + elog(ERROR, "reloption \"%s\" not found in parse table", + options[i].gen->name); + } + SET_VARSIZE(rdopts, offset); +} + + +/* + * Option parser for anything that uses StdRdOptions. + */ +bytea * +default_reloptions(Datum reloptions, bool validate, relopt_kind kind) +{ + static const relopt_parse_elt tab[] = { + {"fillfactor", RELOPT_TYPE_INT, offsetof(StdRdOptions, fillfactor)}, + {"autovacuum_enabled", RELOPT_TYPE_BOOL, + offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, enabled)}, + {"autovacuum_vacuum_threshold", RELOPT_TYPE_INT, + offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, vacuum_threshold)}, + {"autovacuum_vacuum_insert_threshold", RELOPT_TYPE_INT, + offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, vacuum_ins_threshold)}, + {"autovacuum_analyze_threshold", RELOPT_TYPE_INT, + offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, analyze_threshold)}, + {"autovacuum_vacuum_cost_limit", RELOPT_TYPE_INT, + offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, vacuum_cost_limit)}, + {"autovacuum_freeze_min_age", RELOPT_TYPE_INT, + offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, freeze_min_age)}, + {"autovacuum_freeze_max_age", RELOPT_TYPE_INT, + offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, freeze_max_age)}, + {"autovacuum_freeze_table_age", RELOPT_TYPE_INT, + offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, freeze_table_age)}, + {"autovacuum_multixact_freeze_min_age", RELOPT_TYPE_INT, + offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, multixact_freeze_min_age)}, + {"autovacuum_multixact_freeze_max_age", RELOPT_TYPE_INT, + offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, multixact_freeze_max_age)}, + {"autovacuum_multixact_freeze_table_age", RELOPT_TYPE_INT, + offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, multixact_freeze_table_age)}, + {"log_autovacuum_min_duration", RELOPT_TYPE_INT, + offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, log_min_duration)}, + {"toast_tuple_target", RELOPT_TYPE_INT, + offsetof(StdRdOptions, toast_tuple_target)}, + {"autovacuum_vacuum_cost_delay", RELOPT_TYPE_REAL, + offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, vacuum_cost_delay)}, + {"autovacuum_vacuum_scale_factor", RELOPT_TYPE_REAL, + offsetof(StdRdOptions, autovacuum) + 
offsetof(AutoVacOpts, vacuum_scale_factor)}, + {"autovacuum_vacuum_insert_scale_factor", RELOPT_TYPE_REAL, + offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, vacuum_ins_scale_factor)}, + {"autovacuum_analyze_scale_factor", RELOPT_TYPE_REAL, + offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, analyze_scale_factor)}, + {"user_catalog_table", RELOPT_TYPE_BOOL, + offsetof(StdRdOptions, user_catalog_table)}, + {"parallel_workers", RELOPT_TYPE_INT, + offsetof(StdRdOptions, parallel_workers)}, + {"vacuum_index_cleanup", RELOPT_TYPE_ENUM, + offsetof(StdRdOptions, vacuum_index_cleanup)}, + {"vacuum_truncate", RELOPT_TYPE_BOOL, + offsetof(StdRdOptions, vacuum_truncate)} + }; + + return (bytea *) build_reloptions(reloptions, validate, kind, + sizeof(StdRdOptions), + tab, lengthof(tab)); +} + +/* + * build_reloptions + * + * Parses "reloptions" provided by the caller, returning them in a + * structure containing the parsed options. The parsing is done with + * the help of a parsing table describing the allowed options, defined + * by "relopt_elems" of length "num_relopt_elems". + * + * "validate" must be true if reloptions value is freshly built by + * transformRelOptions(), as opposed to being read from the catalog, in which + * case the values contained in it must already be valid. + * + * NULL is returned if the passed-in options did not match any of the options + * in the parsing table, unless validate is true in which case an error would + * be reported. + */ +void * +build_reloptions(Datum reloptions, bool validate, + relopt_kind kind, + Size relopt_struct_size, + const relopt_parse_elt *relopt_elems, + int num_relopt_elems) +{ + int numoptions; + relopt_value *options; + void *rdopts; + + /* parse options specific to given relation option kind */ + options = parseRelOptions(reloptions, validate, kind, &numoptions); + Assert(numoptions <= num_relopt_elems); + + /* if none set, we're done */ + if (numoptions == 0) + { + Assert(options == NULL); + return NULL; + } + + /* allocate and fill the structure */ + rdopts = allocateReloptStruct(relopt_struct_size, options, numoptions); + fillRelOptions(rdopts, relopt_struct_size, options, numoptions, + validate, relopt_elems, num_relopt_elems); + + pfree(options); + + return rdopts; +} + +/* + * Parse local options, allocate a bytea struct that's of the specified + * 'base_size' plus any extra space that's needed for string variables, + * fill its option's fields located at the given offsets and return it. 
+ */ +void * +build_local_reloptions(local_relopts *relopts, Datum options, bool validate) +{ + int noptions = list_length(relopts->options); + relopt_parse_elt *elems = palloc(sizeof(*elems) * noptions); + relopt_value *vals; + void *opts; + int i = 0; + ListCell *lc; + + foreach(lc, relopts->options) + { + local_relopt *opt = lfirst(lc); + + elems[i].optname = opt->option->name; + elems[i].opttype = opt->option->type; + elems[i].offset = opt->offset; + + i++; + } + + vals = parseLocalRelOptions(relopts, options, validate); + opts = allocateReloptStruct(relopts->relopt_struct_size, vals, noptions); + fillRelOptions(opts, relopts->relopt_struct_size, vals, noptions, validate, + elems, noptions); + + foreach(lc, relopts->validators) + ((relopts_validator) lfirst(lc)) (opts, vals, noptions); + + if (elems) + pfree(elems); + + return opts; +} + +/* + * Option parser for partitioned tables + */ +bytea * +partitioned_table_reloptions(Datum reloptions, bool validate) +{ + /* + * There are no options for partitioned tables yet, but this is able to do + * some validation. + */ + return (bytea *) build_reloptions(reloptions, validate, + RELOPT_KIND_PARTITIONED, + 0, NULL, 0); +} + +/* + * Option parser for views + */ +bytea * +view_reloptions(Datum reloptions, bool validate) +{ + static const relopt_parse_elt tab[] = { + {"security_barrier", RELOPT_TYPE_BOOL, + offsetof(ViewOptions, security_barrier)}, + {"check_option", RELOPT_TYPE_ENUM, + offsetof(ViewOptions, check_option)} + }; + + return (bytea *) build_reloptions(reloptions, validate, + RELOPT_KIND_VIEW, + sizeof(ViewOptions), + tab, lengthof(tab)); +} + +/* + * Parse options for heaps, views and toast tables. + */ +bytea * +heap_reloptions(char relkind, Datum reloptions, bool validate) +{ + StdRdOptions *rdopts; + + switch (relkind) + { + case RELKIND_TOASTVALUE: + rdopts = (StdRdOptions *) + default_reloptions(reloptions, validate, RELOPT_KIND_TOAST); + if (rdopts != NULL) + { + /* adjust default-only parameters for TOAST relations */ + rdopts->fillfactor = 100; + rdopts->autovacuum.analyze_threshold = -1; + rdopts->autovacuum.analyze_scale_factor = -1; + } + return (bytea *) rdopts; + case RELKIND_RELATION: + case RELKIND_MATVIEW: + return default_reloptions(reloptions, validate, RELOPT_KIND_HEAP); + default: + /* other relkinds are not supported */ + return NULL; + } +} + + +/* + * Parse options for indexes. 
+ * + * amoptions index AM's option parser function + * reloptions options as text[] datum + * validate error flag + */ +bytea * +index_reloptions(amoptions_function amoptions, Datum reloptions, bool validate) +{ + Assert(amoptions != NULL); + + /* Assume function is strict */ + if (!PointerIsValid(DatumGetPointer(reloptions))) + return NULL; + + return amoptions(reloptions, validate); +} + +/* + * Option parser for attribute reloptions + */ +bytea * +attribute_reloptions(Datum reloptions, bool validate) +{ + static const relopt_parse_elt tab[] = { + {"n_distinct", RELOPT_TYPE_REAL, offsetof(AttributeOpts, n_distinct)}, + {"n_distinct_inherited", RELOPT_TYPE_REAL, offsetof(AttributeOpts, n_distinct_inherited)} + }; + + return (bytea *) build_reloptions(reloptions, validate, + RELOPT_KIND_ATTRIBUTE, + sizeof(AttributeOpts), + tab, lengthof(tab)); +} + +/* + * Option parser for tablespace reloptions + */ +bytea * +tablespace_reloptions(Datum reloptions, bool validate) +{ + static const relopt_parse_elt tab[] = { + {"random_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, random_page_cost)}, + {"seq_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, seq_page_cost)}, + {"effective_io_concurrency", RELOPT_TYPE_INT, offsetof(TableSpaceOpts, effective_io_concurrency)}, + {"maintenance_io_concurrency", RELOPT_TYPE_INT, offsetof(TableSpaceOpts, maintenance_io_concurrency)} + }; + + return (bytea *) build_reloptions(reloptions, validate, + RELOPT_KIND_TABLESPACE, + sizeof(TableSpaceOpts), + tab, lengthof(tab)); +} + +/* + * Determine the required LOCKMODE from an option list. + * + * Called from AlterTableGetLockLevel(), see that function + * for a longer explanation of how this works. + */ +LOCKMODE +AlterTableGetRelOptionsLockLevel(List *defList) +{ + LOCKMODE lockmode = NoLock; + ListCell *cell; + + if (defList == NIL) + return AccessExclusiveLock; + + if (need_initialization) + initialize_reloptions(); + + foreach(cell, defList) + { + DefElem *def = (DefElem *) lfirst(cell); + int i; + + for (i = 0; relOpts[i]; i++) + { + if (strncmp(relOpts[i]->name, + def->defname, + relOpts[i]->namelen + 1) == 0) + { + if (lockmode < relOpts[i]->lockmode) + lockmode = relOpts[i]->lockmode; + } + } + } + + return lockmode; +} diff --git a/src/backend/access/common/scankey.c b/src/backend/access/common/scankey.c new file mode 100644 index 0000000..bf33c50 --- /dev/null +++ b/src/backend/access/common/scankey.c @@ -0,0 +1,117 @@ +/*------------------------------------------------------------------------- + * + * scankey.c + * scan key support code + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/common/scankey.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/skey.h" +#include "catalog/pg_collation.h" + + +/* + * ScanKeyEntryInitialize + * Initializes a scan key entry given all the field values. + * The target procedure is specified by OID (but can be invalid + * if SK_SEARCHNULL or SK_SEARCHNOTNULL is set). + * + * Note: CurrentMemoryContext at call should be as long-lived as the ScanKey + * itself, because that's what will be used for any subsidiary info attached + * to the ScanKey's FmgrInfo record. 
+ */ +void +ScanKeyEntryInitialize(ScanKey entry, + int flags, + AttrNumber attributeNumber, + StrategyNumber strategy, + Oid subtype, + Oid collation, + RegProcedure procedure, + Datum argument) +{ + entry->sk_flags = flags; + entry->sk_attno = attributeNumber; + entry->sk_strategy = strategy; + entry->sk_subtype = subtype; + entry->sk_collation = collation; + entry->sk_argument = argument; + if (RegProcedureIsValid(procedure)) + { + fmgr_info(procedure, &entry->sk_func); + } + else + { + Assert(flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL)); + MemSet(&entry->sk_func, 0, sizeof(entry->sk_func)); + } +} + +/* + * ScanKeyInit + * Shorthand version of ScanKeyEntryInitialize: flags and subtype + * are assumed to be zero (the usual value), and collation is defaulted. + * + * This is the recommended version for hardwired lookups in system catalogs. + * It cannot handle NULL arguments, unary operators, or nondefault operators, + * but we need none of those features for most hardwired lookups. + * + * We set collation to C_COLLATION_OID always. This is the correct value + * for all collation-aware columns in system catalogs, and it will be ignored + * for other column types, so it's not worth trying to be more finicky. + * + * Note: CurrentMemoryContext at call should be as long-lived as the ScanKey + * itself, because that's what will be used for any subsidiary info attached + * to the ScanKey's FmgrInfo record. + */ +void +ScanKeyInit(ScanKey entry, + AttrNumber attributeNumber, + StrategyNumber strategy, + RegProcedure procedure, + Datum argument) +{ + entry->sk_flags = 0; + entry->sk_attno = attributeNumber; + entry->sk_strategy = strategy; + entry->sk_subtype = InvalidOid; + entry->sk_collation = C_COLLATION_OID; + entry->sk_argument = argument; + fmgr_info(procedure, &entry->sk_func); +} + +/* + * ScanKeyEntryInitializeWithInfo + * Initializes a scan key entry using an already-completed FmgrInfo + * function lookup record. + * + * Note: CurrentMemoryContext at call should be as long-lived as the ScanKey + * itself, because that's what will be used for any subsidiary info attached + * to the ScanKey's FmgrInfo record. + */ +void +ScanKeyEntryInitializeWithInfo(ScanKey entry, + int flags, + AttrNumber attributeNumber, + StrategyNumber strategy, + Oid subtype, + Oid collation, + FmgrInfo *finfo, + Datum argument) +{ + entry->sk_flags = flags; + entry->sk_attno = attributeNumber; + entry->sk_strategy = strategy; + entry->sk_subtype = subtype; + entry->sk_collation = collation; + entry->sk_argument = argument; + fmgr_info_copy(&entry->sk_func, finfo, CurrentMemoryContext); +} diff --git a/src/backend/access/common/session.c b/src/backend/access/common/session.c new file mode 100644 index 0000000..61b3206 --- /dev/null +++ b/src/backend/access/common/session.c @@ -0,0 +1,208 @@ +/*------------------------------------------------------------------------- + * + * session.c + * Encapsulation of user session. + * + * This is intended to contain data that needs to be shared between backends + * performing work for a client session. In particular such a session is + * shared between the leader and worker processes for parallel queries. At + * some later point it might also become useful infrastructure for separating + * backends from client connections, e.g. for the purpose of pooling. + * + * Currently this infrastructure is used to share: + * - typemod registry for ephemeral row-types, i.e. BlessTupleDesc etc. 
+ * + * Portions Copyright (c) 2017-2021, PostgreSQL Global Development Group + * + * src/backend/access/common/session.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/session.h" +#include "storage/lwlock.h" +#include "storage/shm_toc.h" +#include "utils/memutils.h" +#include "utils/typcache.h" + +/* Magic number for per-session DSM TOC. */ +#define SESSION_MAGIC 0xabb0fbc9 + +/* + * We want to create a DSA area to store shared state that has the same + * lifetime as a session. So far, it's only used to hold the shared record + * type registry. We don't want it to have to create any DSM segments just + * yet in common cases, so we'll give it enough space to hold a very small + * SharedRecordTypmodRegistry. + */ +#define SESSION_DSA_SIZE 0x30000 + +/* + * Magic numbers for state sharing in the per-session DSM area. + */ +#define SESSION_KEY_DSA UINT64CONST(0xFFFFFFFFFFFF0001) +#define SESSION_KEY_RECORD_TYPMOD_REGISTRY UINT64CONST(0xFFFFFFFFFFFF0002) + +/* This backend's current session. */ +Session *CurrentSession = NULL; + +/* + * Set up CurrentSession to point to an empty Session object. + */ +void +InitializeSession(void) +{ + CurrentSession = MemoryContextAllocZero(TopMemoryContext, sizeof(Session)); +} + +/* + * Initialize the per-session DSM segment if it isn't already initialized, and + * return its handle so that worker processes can attach to it. + * + * Unlike the per-context DSM segment, this segment and its contents are + * reused for future parallel queries. + * + * Return DSM_HANDLE_INVALID if a segment can't be allocated due to lack of + * resources. + */ +dsm_handle +GetSessionDsmHandle(void) +{ + shm_toc_estimator estimator; + shm_toc *toc; + dsm_segment *seg; + size_t typmod_registry_size; + size_t size; + void *dsa_space; + void *typmod_registry_space; + dsa_area *dsa; + MemoryContext old_context; + + /* + * If we have already created a session-scope DSM segment in this backend, + * return its handle. The same segment will be used for the rest of this + * backend's lifetime. + */ + if (CurrentSession->segment != NULL) + return dsm_segment_handle(CurrentSession->segment); + + /* Otherwise, prepare to set one up. */ + old_context = MemoryContextSwitchTo(TopMemoryContext); + shm_toc_initialize_estimator(&estimator); + + /* Estimate space for the per-session DSA area. */ + shm_toc_estimate_keys(&estimator, 1); + shm_toc_estimate_chunk(&estimator, SESSION_DSA_SIZE); + + /* Estimate space for the per-session record typmod registry. */ + typmod_registry_size = SharedRecordTypmodRegistryEstimate(); + shm_toc_estimate_keys(&estimator, 1); + shm_toc_estimate_chunk(&estimator, typmod_registry_size); + + /* Set up segment and TOC. */ + size = shm_toc_estimate(&estimator); + seg = dsm_create(size, DSM_CREATE_NULL_IF_MAXSEGMENTS); + if (seg == NULL) + { + MemoryContextSwitchTo(old_context); + + return DSM_HANDLE_INVALID; + } + toc = shm_toc_create(SESSION_MAGIC, + dsm_segment_address(seg), + size); + + /* Create per-session DSA area. */ + dsa_space = shm_toc_allocate(toc, SESSION_DSA_SIZE); + dsa = dsa_create_in_place(dsa_space, + SESSION_DSA_SIZE, + LWTRANCHE_PER_SESSION_DSA, + seg); + shm_toc_insert(toc, SESSION_KEY_DSA, dsa_space); + + + /* Create session-scoped shared record typmod registry. 
*/ + typmod_registry_space = shm_toc_allocate(toc, typmod_registry_size); + SharedRecordTypmodRegistryInit((SharedRecordTypmodRegistry *) + typmod_registry_space, seg, dsa); + shm_toc_insert(toc, SESSION_KEY_RECORD_TYPMOD_REGISTRY, + typmod_registry_space); + + /* + * If we got this far, we can pin the shared memory so it stays mapped for + * the rest of this backend's life. If we don't make it this far, cleanup + * callbacks for anything we installed above (ie currently + * SharedRecordTypmodRegistry) will run when the DSM segment is detached + * by CurrentResourceOwner so we aren't left with a broken CurrentSession. + */ + dsm_pin_mapping(seg); + dsa_pin_mapping(dsa); + + /* Make segment and area available via CurrentSession. */ + CurrentSession->segment = seg; + CurrentSession->area = dsa; + + MemoryContextSwitchTo(old_context); + + return dsm_segment_handle(seg); +} + +/* + * Attach to a per-session DSM segment provided by a parallel leader. + */ +void +AttachSession(dsm_handle handle) +{ + dsm_segment *seg; + shm_toc *toc; + void *dsa_space; + void *typmod_registry_space; + dsa_area *dsa; + MemoryContext old_context; + + old_context = MemoryContextSwitchTo(TopMemoryContext); + + /* Attach to the DSM segment. */ + seg = dsm_attach(handle); + if (seg == NULL) + elog(ERROR, "could not attach to per-session DSM segment"); + toc = shm_toc_attach(SESSION_MAGIC, dsm_segment_address(seg)); + + /* Attach to the DSA area. */ + dsa_space = shm_toc_lookup(toc, SESSION_KEY_DSA, false); + dsa = dsa_attach_in_place(dsa_space, seg); + + /* Make them available via the current session. */ + CurrentSession->segment = seg; + CurrentSession->area = dsa; + + /* Attach to the shared record typmod registry. */ + typmod_registry_space = + shm_toc_lookup(toc, SESSION_KEY_RECORD_TYPMOD_REGISTRY, false); + SharedRecordTypmodRegistryAttach((SharedRecordTypmodRegistry *) + typmod_registry_space); + + /* Remain attached until end of backend or DetachSession(). */ + dsm_pin_mapping(seg); + dsa_pin_mapping(dsa); + + MemoryContextSwitchTo(old_context); +} + +/* + * Detach from the current session DSM segment. It's not strictly necessary + * to do this explicitly since we'll detach automatically at backend exit, but + * if we ever reuse parallel workers it will become important for workers to + * detach from one session before attaching to another. Note that this runs + * detach hooks. + */ +void +DetachSession(void) +{ + /* Runs detach hooks. */ + dsm_detach(CurrentSession->segment); + CurrentSession->segment = NULL; + dsa_detach(CurrentSession->area); + CurrentSession->area = NULL; +} diff --git a/src/backend/access/common/syncscan.c b/src/backend/access/common/syncscan.c new file mode 100644 index 0000000..b7a28af --- /dev/null +++ b/src/backend/access/common/syncscan.c @@ -0,0 +1,322 @@ +/*------------------------------------------------------------------------- + * + * syncscan.c + * scan synchronization support + * + * When multiple backends run a sequential scan on the same table, we try + * to keep them synchronized to reduce the overall I/O needed. The goal is + * to read each page into shared buffer cache only once, and let all backends + * that take part in the shared scan process the page before it falls out of + * the cache. + * + * Since the "leader" in a pack of backends doing a seqscan will have to wait + * for I/O, while the "followers" don't, there is a strong self-synchronizing + * effect once we can get the backends examining approximately the same part + * of the table at the same time. 
Hence all that is really needed is to get + * a new backend beginning a seqscan to begin it close to where other backends + * are reading. We can scan the table circularly, from block X up to the + * end and then from block 0 to X-1, to ensure we visit all rows while still + * participating in the common scan. + * + * To accomplish that, we keep track of the scan position of each table, and + * start new scans close to where the previous scan(s) are. We don't try to + * do any extra synchronization to keep the scans together afterwards; some + * scans might progress much more slowly than others, for example if the + * results need to be transferred to the client over a slow network, and we + * don't want such queries to slow down others. + * + * There can realistically only be a few large sequential scans on different + * tables in progress at any time. Therefore we just keep the scan positions + * in a small LRU list which we scan every time we need to look up or update a + * scan position. The whole mechanism is only applied for tables exceeding + * a threshold size (but that is not the concern of this module). + * + * INTERFACE ROUTINES + * ss_get_location - return current scan location of a relation + * ss_report_location - update current scan location + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/common/syncscan.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/syncscan.h" +#include "miscadmin.h" +#include "storage/lwlock.h" +#include "storage/shmem.h" +#include "utils/rel.h" + + +/* GUC variables */ +#ifdef TRACE_SYNCSCAN +bool trace_syncscan = false; +#endif + + +/* + * Size of the LRU list. + * + * Note: the code assumes that SYNC_SCAN_NELEM > 1. + * + * XXX: What's a good value? It should be large enough to hold the + * maximum number of large tables scanned simultaneously. But a larger value + * means more traversing of the LRU list when starting a new scan. + */ +#define SYNC_SCAN_NELEM 20 + +/* + * Interval between reports of the location of the current scan, in pages. + * + * Note: This should be smaller than the ring size (see buffer/freelist.c) + * we use for bulk reads. Otherwise a scan joining other scans might start + * from a page that's no longer in the buffer cache. This is a bit fuzzy; + * there's no guarantee that the new scan will read the page before it leaves + * the buffer cache anyway, and on the other hand the page is most likely + * still in the OS cache. + */ +#define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ) + + +/* + * The scan locations structure is essentially a doubly-linked LRU with head + * and tail pointer, but designed to hold a fixed maximum number of elements in + * fixed-size shared memory. 
+ */ +typedef struct ss_scan_location_t +{ + RelFileNode relfilenode; /* identity of a relation */ + BlockNumber location; /* last-reported location in the relation */ +} ss_scan_location_t; + +typedef struct ss_lru_item_t +{ + struct ss_lru_item_t *prev; + struct ss_lru_item_t *next; + ss_scan_location_t location; +} ss_lru_item_t; + +typedef struct ss_scan_locations_t +{ + ss_lru_item_t *head; + ss_lru_item_t *tail; + ss_lru_item_t items[FLEXIBLE_ARRAY_MEMBER]; /* SYNC_SCAN_NELEM items */ +} ss_scan_locations_t; + +#define SizeOfScanLocations(N) \ + (offsetof(ss_scan_locations_t, items) + (N) * sizeof(ss_lru_item_t)) + +/* Pointer to struct in shared memory */ +static ss_scan_locations_t *scan_locations; + +/* prototypes for internal functions */ +static BlockNumber ss_search(RelFileNode relfilenode, + BlockNumber location, bool set); + + +/* + * SyncScanShmemSize --- report amount of shared memory space needed + */ +Size +SyncScanShmemSize(void) +{ + return SizeOfScanLocations(SYNC_SCAN_NELEM); +} + +/* + * SyncScanShmemInit --- initialize this module's shared memory + */ +void +SyncScanShmemInit(void) +{ + int i; + bool found; + + scan_locations = (ss_scan_locations_t *) + ShmemInitStruct("Sync Scan Locations List", + SizeOfScanLocations(SYNC_SCAN_NELEM), + &found); + + if (!IsUnderPostmaster) + { + /* Initialize shared memory area */ + Assert(!found); + + scan_locations->head = &scan_locations->items[0]; + scan_locations->tail = &scan_locations->items[SYNC_SCAN_NELEM - 1]; + + for (i = 0; i < SYNC_SCAN_NELEM; i++) + { + ss_lru_item_t *item = &scan_locations->items[i]; + + /* + * Initialize all slots with invalid values. As scans are started, + * these invalid entries will fall off the LRU list and get + * replaced with real entries. + */ + item->location.relfilenode.spcNode = InvalidOid; + item->location.relfilenode.dbNode = InvalidOid; + item->location.relfilenode.relNode = InvalidOid; + item->location.location = InvalidBlockNumber; + + item->prev = (i > 0) ? + (&scan_locations->items[i - 1]) : NULL; + item->next = (i < SYNC_SCAN_NELEM - 1) ? + (&scan_locations->items[i + 1]) : NULL; + } + } + else + Assert(found); +} + +/* + * ss_search --- search the scan_locations structure for an entry with the + * given relfilenode. + * + * If "set" is true, the location is updated to the given location. If no + * entry for the given relfilenode is found, it will be created at the head + * of the list with the given location, even if "set" is false. + * + * In any case, the location after possible update is returned. + * + * Caller is responsible for having acquired suitable lock on the shared + * data structure. 
+ */ +static BlockNumber +ss_search(RelFileNode relfilenode, BlockNumber location, bool set) +{ + ss_lru_item_t *item; + + item = scan_locations->head; + for (;;) + { + bool match; + + match = RelFileNodeEquals(item->location.relfilenode, relfilenode); + + if (match || item->next == NULL) + { + /* + * If we reached the end of list and no match was found, take over + * the last entry + */ + if (!match) + { + item->location.relfilenode = relfilenode; + item->location.location = location; + } + else if (set) + item->location.location = location; + + /* Move the entry to the front of the LRU list */ + if (item != scan_locations->head) + { + /* unlink */ + if (item == scan_locations->tail) + scan_locations->tail = item->prev; + item->prev->next = item->next; + if (item->next) + item->next->prev = item->prev; + + /* link */ + item->prev = NULL; + item->next = scan_locations->head; + scan_locations->head->prev = item; + scan_locations->head = item; + } + + return item->location.location; + } + + item = item->next; + } + + /* not reached */ +} + +/* + * ss_get_location --- get the optimal starting location for scan + * + * Returns the last-reported location of a sequential scan on the + * relation, or 0 if no valid location is found. + * + * We expect the caller has just done RelationGetNumberOfBlocks(), and + * so that number is passed in rather than computing it again. The result + * is guaranteed less than relnblocks (assuming that's > 0). + */ +BlockNumber +ss_get_location(Relation rel, BlockNumber relnblocks) +{ + BlockNumber startloc; + + LWLockAcquire(SyncScanLock, LW_EXCLUSIVE); + startloc = ss_search(rel->rd_node, 0, false); + LWLockRelease(SyncScanLock); + + /* + * If the location is not a valid block number for this scan, start at 0. + * + * This can happen if for instance a VACUUM truncated the table since the + * location was saved. + */ + if (startloc >= relnblocks) + startloc = 0; + +#ifdef TRACE_SYNCSCAN + if (trace_syncscan) + elog(LOG, + "SYNC_SCAN: start \"%s\" (size %u) at %u", + RelationGetRelationName(rel), relnblocks, startloc); +#endif + + return startloc; +} + +/* + * ss_report_location --- update the current scan location + * + * Writes an entry into the shared Sync Scan state of the form + * (relfilenode, blocknumber), overwriting any existing entry for the + * same relfilenode. + */ +void +ss_report_location(Relation rel, BlockNumber location) +{ +#ifdef TRACE_SYNCSCAN + if (trace_syncscan) + { + if ((location % 1024) == 0) + elog(LOG, + "SYNC_SCAN: scanning \"%s\" at %u", + RelationGetRelationName(rel), location); + } +#endif + + /* + * To reduce lock contention, only report scan progress every N pages. For + * the same reason, don't block if the lock isn't immediately available. + * Missing a few updates isn't critical, it just means that a new scan + * that wants to join the pack will start a little bit behind the head of + * the scan. Hopefully the pages are still in OS cache and the scan + * catches up quickly. 
*/ + if ((location % SYNC_SCAN_REPORT_INTERVAL) == 0) + { + if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE)) + { + (void) ss_search(rel->rd_node, location, true); + LWLockRelease(SyncScanLock); + } +#ifdef TRACE_SYNCSCAN + else if (trace_syncscan) + elog(LOG, + "SYNC_SCAN: missed update for \"%s\" at %u", + RelationGetRelationName(rel), location); +#endif + } +} diff --git a/src/backend/access/common/toast_compression.c b/src/backend/access/common/toast_compression.c new file mode 100644 index 0000000..8456183 --- /dev/null +++ b/src/backend/access/common/toast_compression.c @@ -0,0 +1,318 @@ +/*------------------------------------------------------------------------- + * + * toast_compression.c + * Functions for toast compression. + * + * Copyright (c) 2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/access/common/toast_compression.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#ifdef USE_LZ4 +#include <lz4.h> +#endif + +#include "access/detoast.h" +#include "access/toast_compression.h" +#include "common/pg_lzcompress.h" +#include "fmgr.h" +#include "utils/builtins.h" + +/* GUC */ +int default_toast_compression = TOAST_PGLZ_COMPRESSION; + +#define NO_LZ4_SUPPORT() \ + ereport(ERROR, \ + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), \ + errmsg("compression method lz4 not supported"), \ + errdetail("This functionality requires the server to be built with lz4 support."), \ + errhint("You need to rebuild PostgreSQL using %s.", "--with-lz4"))) + +/* + * Compress a varlena using PGLZ. + * + * Returns the compressed varlena, or NULL if compression fails. + */ +struct varlena * +pglz_compress_datum(const struct varlena *value) +{ + int32 valsize, + len; + struct varlena *tmp = NULL; + + valsize = VARSIZE_ANY_EXHDR(DatumGetPointer(value)); + + /* + * No point in wasting a palloc cycle if value size is outside the allowed + * range for compression. + */ + if (valsize < PGLZ_strategy_default->min_input_size || + valsize > PGLZ_strategy_default->max_input_size) + return NULL; + + /* + * Figure out the maximum possible size of the pglz output, add the bytes + * that will be needed for varlena overhead, and allocate that amount. + */ + tmp = (struct varlena *) palloc(PGLZ_MAX_OUTPUT(valsize) + + VARHDRSZ_COMPRESSED); + + len = pglz_compress(VARDATA_ANY(value), + valsize, + (char *) tmp + VARHDRSZ_COMPRESSED, + NULL); + if (len < 0) + { + pfree(tmp); + return NULL; + } + + SET_VARSIZE_COMPRESSED(tmp, len + VARHDRSZ_COMPRESSED); + + return tmp; +} + +/* + * Decompress a varlena that was compressed using PGLZ. + */ +struct varlena * +pglz_decompress_datum(const struct varlena *value) +{ + struct varlena *result; + int32 rawsize; + + /* allocate memory for the uncompressed data */ + result = (struct varlena *) palloc(VARDATA_COMPRESSED_GET_EXTSIZE(value) + VARHDRSZ); + + /* decompress the data */ + rawsize = pglz_decompress((char *) value + VARHDRSZ_COMPRESSED, + VARSIZE(value) - VARHDRSZ_COMPRESSED, + VARDATA(result), + VARDATA_COMPRESSED_GET_EXTSIZE(value), true); + if (rawsize < 0) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("compressed pglz data is corrupt"))); + + SET_VARSIZE(result, rawsize + VARHDRSZ); + + return result; +} + +/* + * Decompress part of a varlena that was compressed using PGLZ.
+ */ +struct varlena * +pglz_decompress_datum_slice(const struct varlena *value, + int32 slicelength) +{ + struct varlena *result; + int32 rawsize; + + /* allocate memory for the uncompressed data */ + result = (struct varlena *) palloc(slicelength + VARHDRSZ); + + /* decompress the data */ + rawsize = pglz_decompress((char *) value + VARHDRSZ_COMPRESSED, + VARSIZE(value) - VARHDRSZ_COMPRESSED, + VARDATA(result), + slicelength, false); + if (rawsize < 0) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("compressed pglz data is corrupt"))); + + SET_VARSIZE(result, rawsize + VARHDRSZ); + + return result; +} + +/* + * Compress a varlena using LZ4. + * + * Returns the compressed varlena, or NULL if compression fails. + */ +struct varlena * +lz4_compress_datum(const struct varlena *value) +{ +#ifndef USE_LZ4 + NO_LZ4_SUPPORT(); + return NULL; /* keep compiler quiet */ +#else + int32 valsize; + int32 len; + int32 max_size; + struct varlena *tmp = NULL; + + valsize = VARSIZE_ANY_EXHDR(value); + + /* + * Figure out the maximum possible size of the LZ4 output, add the bytes + * that will be needed for varlena overhead, and allocate that amount. + */ + max_size = LZ4_compressBound(valsize); + tmp = (struct varlena *) palloc(max_size + VARHDRSZ_COMPRESSED); + + len = LZ4_compress_default(VARDATA_ANY(value), + (char *) tmp + VARHDRSZ_COMPRESSED, + valsize, max_size); + if (len <= 0) + elog(ERROR, "lz4 compression failed"); + + /* data is incompressible so just free the memory and return NULL */ + if (len > valsize) + { + pfree(tmp); + return NULL; + } + + SET_VARSIZE_COMPRESSED(tmp, len + VARHDRSZ_COMPRESSED); + + return tmp; +#endif +} + +/* + * Decompress a varlena that was compressed using LZ4. + */ +struct varlena * +lz4_decompress_datum(const struct varlena *value) +{ +#ifndef USE_LZ4 + NO_LZ4_SUPPORT(); + return NULL; /* keep compiler quiet */ +#else + int32 rawsize; + struct varlena *result; + + /* allocate memory for the uncompressed data */ + result = (struct varlena *) palloc(VARDATA_COMPRESSED_GET_EXTSIZE(value) + VARHDRSZ); + + /* decompress the data */ + rawsize = LZ4_decompress_safe((char *) value + VARHDRSZ_COMPRESSED, + VARDATA(result), + VARSIZE(value) - VARHDRSZ_COMPRESSED, + VARDATA_COMPRESSED_GET_EXTSIZE(value)); + if (rawsize < 0) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("compressed lz4 data is corrupt"))); + + + SET_VARSIZE(result, rawsize + VARHDRSZ); + + return result; +#endif +} + +/* + * Decompress part of a varlena that was compressed using LZ4. + */ +struct varlena * +lz4_decompress_datum_slice(const struct varlena *value, int32 slicelength) +{ +#ifndef USE_LZ4 + NO_LZ4_SUPPORT(); + return NULL; /* keep compiler quiet */ +#else + int32 rawsize; + struct varlena *result; + + /* slice decompression not supported prior to 1.8.3 */ + if (LZ4_versionNumber() < 10803) + return lz4_decompress_datum(value); + + /* allocate memory for the uncompressed data */ + result = (struct varlena *) palloc(slicelength + VARHDRSZ); + + /* decompress the data */ + rawsize = LZ4_decompress_safe_partial((char *) value + VARHDRSZ_COMPRESSED, + VARDATA(result), + VARSIZE(value) - VARHDRSZ_COMPRESSED, + slicelength, + slicelength); + if (rawsize < 0) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("compressed lz4 data is corrupt"))); + + SET_VARSIZE(result, rawsize + VARHDRSZ); + + return result; +#endif +} + +/* + * Extract compression ID from a varlena. 
+ * + * Returns TOAST_INVALID_COMPRESSION_ID if the varlena is not compressed. + */ +ToastCompressionId +toast_get_compression_id(struct varlena *attr) +{ + ToastCompressionId cmid = TOAST_INVALID_COMPRESSION_ID; + + /* + * If it is stored externally then fetch the compression method id from + * the external toast pointer. If compressed inline, fetch it from the + * toast compression header. + */ + if (VARATT_IS_EXTERNAL_ONDISK(attr)) + { + struct varatt_external toast_pointer; + + VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); + + if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer)) + cmid = VARATT_EXTERNAL_GET_COMPRESS_METHOD(toast_pointer); + } + else if (VARATT_IS_COMPRESSED(attr)) + cmid = VARDATA_COMPRESSED_GET_COMPRESS_METHOD(attr); + + return cmid; +} + +/* + * CompressionNameToMethod - Get compression method from compression name + * + * Search in the available built-in methods. If the compression not found + * in the built-in methods then return InvalidCompressionMethod. + */ +char +CompressionNameToMethod(const char *compression) +{ + if (strcmp(compression, "pglz") == 0) + return TOAST_PGLZ_COMPRESSION; + else if (strcmp(compression, "lz4") == 0) + { +#ifndef USE_LZ4 + NO_LZ4_SUPPORT(); +#endif + return TOAST_LZ4_COMPRESSION; + } + + return InvalidCompressionMethod; +} + +/* + * GetCompressionMethodName - Get compression method name + */ +const char * +GetCompressionMethodName(char method) +{ + switch (method) + { + case TOAST_PGLZ_COMPRESSION: + return "pglz"; + case TOAST_LZ4_COMPRESSION: + return "lz4"; + default: + elog(ERROR, "invalid compression method %c", method); + return NULL; /* keep compiler quiet */ + } +} diff --git a/src/backend/access/common/toast_internals.c b/src/backend/access/common/toast_internals.c new file mode 100644 index 0000000..2d2fd60 --- /dev/null +++ b/src/backend/access/common/toast_internals.c @@ -0,0 +1,664 @@ +/*------------------------------------------------------------------------- + * + * toast_internals.c + * Functions for internal use by the TOAST system. + * + * Copyright (c) 2000-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/access/common/toast_internals.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/detoast.h" +#include "access/genam.h" +#include "access/heapam.h" +#include "access/heaptoast.h" +#include "access/table.h" +#include "access/toast_internals.h" +#include "access/xact.h" +#include "catalog/catalog.h" +#include "common/pg_lzcompress.h" +#include "miscadmin.h" +#include "utils/fmgroids.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" + +static bool toastrel_valueid_exists(Relation toastrel, Oid valueid); +static bool toastid_valueid_exists(Oid toastrelid, Oid valueid); + +/* ---------- + * toast_compress_datum - + * + * Create a compressed version of a varlena datum + * + * If we fail (ie, compressed result is actually bigger than original) + * then return NULL. We must not use compressed data if it'd expand + * the tuple! + * + * We use VAR{SIZE,DATA}_ANY so we can handle short varlenas here without + * copying them. But we can't handle external or compressed datums. 
+ * ---------- + */ +Datum +toast_compress_datum(Datum value, char cmethod) +{ + struct varlena *tmp = NULL; + int32 valsize; + ToastCompressionId cmid = TOAST_INVALID_COMPRESSION_ID; + + Assert(!VARATT_IS_EXTERNAL(DatumGetPointer(value))); + Assert(!VARATT_IS_COMPRESSED(DatumGetPointer(value))); + + valsize = VARSIZE_ANY_EXHDR(DatumGetPointer(value)); + + /* If the compression method is not valid, use the current default */ + if (!CompressionMethodIsValid(cmethod)) + cmethod = default_toast_compression; + + /* + * Call appropriate compression routine for the compression method. + */ + switch (cmethod) + { + case TOAST_PGLZ_COMPRESSION: + tmp = pglz_compress_datum((const struct varlena *) value); + cmid = TOAST_PGLZ_COMPRESSION_ID; + break; + case TOAST_LZ4_COMPRESSION: + tmp = lz4_compress_datum((const struct varlena *) value); + cmid = TOAST_LZ4_COMPRESSION_ID; + break; + default: + elog(ERROR, "invalid compression method %c", cmethod); + } + + if (tmp == NULL) + return PointerGetDatum(NULL); + + /* + * We recheck the actual size even if compression reports success, because + * it might be satisfied with having saved as little as one byte in the + * compressed data --- which could turn into a net loss once you consider + * header and alignment padding. Worst case, the compressed format might + * require three padding bytes (plus header, which is included in + * VARSIZE(tmp)), whereas the uncompressed format would take only one + * header byte and no padding if the value is short enough. So we insist + * on a savings of more than 2 bytes to ensure we have a gain. + */ + if (VARSIZE(tmp) < valsize - 2) + { + /* successful compression */ + Assert(cmid != TOAST_INVALID_COMPRESSION_ID); + TOAST_COMPRESS_SET_SIZE_AND_COMPRESS_METHOD(tmp, valsize, cmid); + return PointerGetDatum(tmp); + } + else + { + /* incompressible data */ + pfree(tmp); + return PointerGetDatum(NULL); + } +} + +/* ---------- + * toast_save_datum - + * + * Save one single datum into the secondary relation and return + * a Datum reference for it. + * + * rel: the main relation we're working with (not the toast rel!) + * value: datum to be pushed to toast storage + * oldexternal: if not NULL, toast pointer previously representing the datum + * options: options to be passed to heap_insert() for toast rows + * ---------- + */ +Datum +toast_save_datum(Relation rel, Datum value, + struct varlena *oldexternal, int options) +{ + Relation toastrel; + Relation *toastidxs; + HeapTuple toasttup; + TupleDesc toasttupDesc; + Datum t_values[3]; + bool t_isnull[3]; + CommandId mycid = GetCurrentCommandId(true); + struct varlena *result; + struct varatt_external toast_pointer; + union + { + struct varlena hdr; + /* this is to make the union big enough for a chunk: */ + char data[TOAST_MAX_CHUNK_SIZE + VARHDRSZ]; + /* ensure union is aligned well enough: */ + int32 align_it; + } chunk_data; + int32 chunk_size; + int32 chunk_seq = 0; + char *data_p; + int32 data_todo; + Pointer dval = DatumGetPointer(value); + int num_indexes; + int validIndex; + + Assert(!VARATT_IS_EXTERNAL(value)); + + /* + * Open the toast relation and its indexes. We can use the index to check + * uniqueness of the OID we assign to the toasted item, even though it has + * additional columns besides OID. 
+ */ + toastrel = table_open(rel->rd_rel->reltoastrelid, RowExclusiveLock); + toasttupDesc = toastrel->rd_att; + + /* Open all the toast indexes and look for the valid one */ + validIndex = toast_open_indexes(toastrel, + RowExclusiveLock, + &toastidxs, + &num_indexes); + + /* + * Get the data pointer and length, and compute va_rawsize and va_extinfo. + * + * va_rawsize is the size of the equivalent fully uncompressed datum, so + * we have to adjust for short headers. + * + * va_extinfo stored the actual size of the data payload in the toast + * records and the compression method in first 2 bits if data is + * compressed. + */ + if (VARATT_IS_SHORT(dval)) + { + data_p = VARDATA_SHORT(dval); + data_todo = VARSIZE_SHORT(dval) - VARHDRSZ_SHORT; + toast_pointer.va_rawsize = data_todo + VARHDRSZ; /* as if not short */ + toast_pointer.va_extinfo = data_todo; + } + else if (VARATT_IS_COMPRESSED(dval)) + { + data_p = VARDATA(dval); + data_todo = VARSIZE(dval) - VARHDRSZ; + /* rawsize in a compressed datum is just the size of the payload */ + toast_pointer.va_rawsize = VARDATA_COMPRESSED_GET_EXTSIZE(dval) + VARHDRSZ; + + /* set external size and compression method */ + VARATT_EXTERNAL_SET_SIZE_AND_COMPRESS_METHOD(toast_pointer, data_todo, + VARDATA_COMPRESSED_GET_COMPRESS_METHOD(dval)); + /* Assert that the numbers look like it's compressed */ + Assert(VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer)); + } + else + { + data_p = VARDATA(dval); + data_todo = VARSIZE(dval) - VARHDRSZ; + toast_pointer.va_rawsize = VARSIZE(dval); + toast_pointer.va_extinfo = data_todo; + } + + /* + * Insert the correct table OID into the result TOAST pointer. + * + * Normally this is the actual OID of the target toast table, but during + * table-rewriting operations such as CLUSTER, we have to insert the OID + * of the table's real permanent toast table instead. rd_toastoid is set + * if we have to substitute such an OID. + */ + if (OidIsValid(rel->rd_toastoid)) + toast_pointer.va_toastrelid = rel->rd_toastoid; + else + toast_pointer.va_toastrelid = RelationGetRelid(toastrel); + + /* + * Choose an OID to use as the value ID for this toast value. + * + * Normally we just choose an unused OID within the toast table. But + * during table-rewriting operations where we are preserving an existing + * toast table OID, we want to preserve toast value OIDs too. So, if + * rd_toastoid is set and we had a prior external value from that same + * toast table, re-use its value ID. If we didn't have a prior external + * value (which is a corner case, but possible if the table's attstorage + * options have been changed), we have to pick a value ID that doesn't + * conflict with either new or existing toast value OIDs. 
+ */ + if (!OidIsValid(rel->rd_toastoid)) + { + /* normal case: just choose an unused OID */ + toast_pointer.va_valueid = + GetNewOidWithIndex(toastrel, + RelationGetRelid(toastidxs[validIndex]), + (AttrNumber) 1); + } + else + { + /* rewrite case: check to see if value was in old toast table */ + toast_pointer.va_valueid = InvalidOid; + if (oldexternal != NULL) + { + struct varatt_external old_toast_pointer; + + Assert(VARATT_IS_EXTERNAL_ONDISK(oldexternal)); + /* Must copy to access aligned fields */ + VARATT_EXTERNAL_GET_POINTER(old_toast_pointer, oldexternal); + if (old_toast_pointer.va_toastrelid == rel->rd_toastoid) + { + /* This value came from the old toast table; reuse its OID */ + toast_pointer.va_valueid = old_toast_pointer.va_valueid; + + /* + * There is a corner case here: the table rewrite might have + * to copy both live and recently-dead versions of a row, and + * those versions could easily reference the same toast value. + * When we copy the second or later version of such a row, + * reusing the OID will mean we select an OID that's already + * in the new toast table. Check for that, and if so, just + * fall through without writing the data again. + * + * While annoying and ugly-looking, this is a good thing + * because it ensures that we wind up with only one copy of + * the toast value when there is only one copy in the old + * toast table. Before we detected this case, we'd have made + * multiple copies, wasting space; and what's worse, the + * copies belonging to already-deleted heap tuples would not + * be reclaimed by VACUUM. + */ + if (toastrel_valueid_exists(toastrel, + toast_pointer.va_valueid)) + { + /* Match, so short-circuit the data storage loop below */ + data_todo = 0; + } + } + } + if (toast_pointer.va_valueid == InvalidOid) + { + /* + * new value; must choose an OID that doesn't conflict in either + * old or new toast table + */ + do + { + toast_pointer.va_valueid = + GetNewOidWithIndex(toastrel, + RelationGetRelid(toastidxs[validIndex]), + (AttrNumber) 1); + } while (toastid_valueid_exists(rel->rd_toastoid, + toast_pointer.va_valueid)); + } + } + + /* + * Initialize constant parts of the tuple data + */ + t_values[0] = ObjectIdGetDatum(toast_pointer.va_valueid); + t_values[2] = PointerGetDatum(&chunk_data); + t_isnull[0] = false; + t_isnull[1] = false; + t_isnull[2] = false; + + /* + * Split up the item into chunks + */ + while (data_todo > 0) + { + int i; + + CHECK_FOR_INTERRUPTS(); + + /* + * Calculate the size of this chunk + */ + chunk_size = Min(TOAST_MAX_CHUNK_SIZE, data_todo); + + /* + * Build a tuple and store it + */ + t_values[1] = Int32GetDatum(chunk_seq++); + SET_VARSIZE(&chunk_data, chunk_size + VARHDRSZ); + memcpy(VARDATA(&chunk_data), data_p, chunk_size); + toasttup = heap_form_tuple(toasttupDesc, t_values, t_isnull); + + heap_insert(toastrel, toasttup, mycid, options, NULL); + + /* + * Create the index entry. We cheat a little here by not using + * FormIndexDatum: this relies on the knowledge that the index columns + * are the same as the initial columns of the table for all the + * indexes. We also cheat by not providing an IndexInfo: this is okay + * for now because btree doesn't need one, but we might have to be + * more honest someday. + * + * Note also that there had better not be any user-created index on + * the TOAST table, since we don't bother to update anything else. 
+ */ + for (i = 0; i < num_indexes; i++) + { + /* Only index relations marked as ready can be updated */ + if (toastidxs[i]->rd_index->indisready) + index_insert(toastidxs[i], t_values, t_isnull, + &(toasttup->t_self), + toastrel, + toastidxs[i]->rd_index->indisunique ? + UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, + false, NULL); + } + + /* + * Free memory + */ + heap_freetuple(toasttup); + + /* + * Move on to next chunk + */ + data_todo -= chunk_size; + data_p += chunk_size; + } + + /* + * Done - close toast relation and its indexes but keep the lock until + * commit, so as a concurrent reindex done directly on the toast relation + * would be able to wait for this transaction. + */ + toast_close_indexes(toastidxs, num_indexes, NoLock); + table_close(toastrel, NoLock); + + /* + * Create the TOAST pointer value that we'll return + */ + result = (struct varlena *) palloc(TOAST_POINTER_SIZE); + SET_VARTAG_EXTERNAL(result, VARTAG_ONDISK); + memcpy(VARDATA_EXTERNAL(result), &toast_pointer, sizeof(toast_pointer)); + + return PointerGetDatum(result); +} + +/* ---------- + * toast_delete_datum - + * + * Delete a single external stored value. + * ---------- + */ +void +toast_delete_datum(Relation rel, Datum value, bool is_speculative) +{ + struct varlena *attr = (struct varlena *) DatumGetPointer(value); + struct varatt_external toast_pointer; + Relation toastrel; + Relation *toastidxs; + ScanKeyData toastkey; + SysScanDesc toastscan; + HeapTuple toasttup; + int num_indexes; + int validIndex; + SnapshotData SnapshotToast; + + if (!VARATT_IS_EXTERNAL_ONDISK(attr)) + return; + + /* Must copy to access aligned fields */ + VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); + + /* + * Open the toast relation and its indexes + */ + toastrel = table_open(toast_pointer.va_toastrelid, RowExclusiveLock); + + /* Fetch valid relation used for process */ + validIndex = toast_open_indexes(toastrel, + RowExclusiveLock, + &toastidxs, + &num_indexes); + + /* + * Setup a scan key to find chunks with matching va_valueid + */ + ScanKeyInit(&toastkey, + (AttrNumber) 1, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(toast_pointer.va_valueid)); + + /* + * Find all the chunks. (We don't actually care whether we see them in + * sequence or not, but since we've already locked the index we might as + * well use systable_beginscan_ordered.) + */ + init_toast_snapshot(&SnapshotToast); + toastscan = systable_beginscan_ordered(toastrel, toastidxs[validIndex], + &SnapshotToast, 1, &toastkey); + while ((toasttup = systable_getnext_ordered(toastscan, ForwardScanDirection)) != NULL) + { + /* + * Have a chunk, delete it + */ + if (is_speculative) + heap_abort_speculative(toastrel, &toasttup->t_self); + else + simple_heap_delete(toastrel, &toasttup->t_self); + } + + /* + * End scan and close relations but keep the lock until commit, so as a + * concurrent reindex done directly on the toast relation would be able to + * wait for this transaction. + */ + systable_endscan_ordered(toastscan); + toast_close_indexes(toastidxs, num_indexes, NoLock); + table_close(toastrel, NoLock); +} + +/* ---------- + * toastrel_valueid_exists - + * + * Test whether a toast value with the given ID exists in the toast relation. + * For safety, we consider a value to exist if there are either live or dead + * toast rows with that ID; see notes for GetNewOidWithIndex(). 
+ * ---------- + */ +static bool +toastrel_valueid_exists(Relation toastrel, Oid valueid) +{ + bool result = false; + ScanKeyData toastkey; + SysScanDesc toastscan; + int num_indexes; + int validIndex; + Relation *toastidxs; + + /* Fetch a valid index relation */ + validIndex = toast_open_indexes(toastrel, + RowExclusiveLock, + &toastidxs, + &num_indexes); + + /* + * Setup a scan key to find chunks with matching va_valueid + */ + ScanKeyInit(&toastkey, + (AttrNumber) 1, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(valueid)); + + /* + * Is there any such chunk? + */ + toastscan = systable_beginscan(toastrel, + RelationGetRelid(toastidxs[validIndex]), + true, SnapshotAny, 1, &toastkey); + + if (systable_getnext(toastscan) != NULL) + result = true; + + systable_endscan(toastscan); + + /* Clean up */ + toast_close_indexes(toastidxs, num_indexes, RowExclusiveLock); + + return result; +} + +/* ---------- + * toastid_valueid_exists - + * + * As above, but work from toast rel's OID not an open relation + * ---------- + */ +static bool +toastid_valueid_exists(Oid toastrelid, Oid valueid) +{ + bool result; + Relation toastrel; + + toastrel = table_open(toastrelid, AccessShareLock); + + result = toastrel_valueid_exists(toastrel, valueid); + + table_close(toastrel, AccessShareLock); + + return result; +} + +/* ---------- + * toast_get_valid_index + * + * Get OID of valid index associated to given toast relation. A toast + * relation can have only one valid index at the same time. + */ +Oid +toast_get_valid_index(Oid toastoid, LOCKMODE lock) +{ + int num_indexes; + int validIndex; + Oid validIndexOid; + Relation *toastidxs; + Relation toastrel; + + /* Open the toast relation */ + toastrel = table_open(toastoid, lock); + + /* Look for the valid index of the toast relation */ + validIndex = toast_open_indexes(toastrel, + lock, + &toastidxs, + &num_indexes); + validIndexOid = RelationGetRelid(toastidxs[validIndex]); + + /* Close the toast relation and all its indexes */ + toast_close_indexes(toastidxs, num_indexes, NoLock); + table_close(toastrel, NoLock); + + return validIndexOid; +} + +/* ---------- + * toast_open_indexes + * + * Get an array of the indexes associated to the given toast relation + * and return as well the position of the valid index used by the toast + * relation in this array. It is the responsibility of the caller of this + * function to close the indexes as well as free them. + */ +int +toast_open_indexes(Relation toastrel, + LOCKMODE lock, + Relation **toastidxs, + int *num_indexes) +{ + int i = 0; + int res = 0; + bool found = false; + List *indexlist; + ListCell *lc; + + /* Get index list of the toast relation */ + indexlist = RelationGetIndexList(toastrel); + Assert(indexlist != NIL); + + *num_indexes = list_length(indexlist); + + /* Open all the index relations */ + *toastidxs = (Relation *) palloc(*num_indexes * sizeof(Relation)); + foreach(lc, indexlist) + (*toastidxs)[i++] = index_open(lfirst_oid(lc), lock); + + /* Fetch the first valid index in list */ + for (i = 0; i < *num_indexes; i++) + { + Relation toastidx = (*toastidxs)[i]; + + if (toastidx->rd_index->indisvalid) + { + res = i; + found = true; + break; + } + } + + /* + * Free index list, not necessary anymore as relations are opened and a + * valid index has been found. + */ + list_free(indexlist); + + /* + * The toast relation should have one valid index, so something is going + * wrong if there is nothing. 
+ */ + if (!found) + elog(ERROR, "no valid index found for toast relation with Oid %u", + RelationGetRelid(toastrel)); + + return res; +} + +/* ---------- + * toast_close_indexes + * + * Close an array of indexes for a toast relation and free it. This should + * be called for a set of indexes opened previously with toast_open_indexes. + */ +void +toast_close_indexes(Relation *toastidxs, int num_indexes, LOCKMODE lock) +{ + int i; + + /* Close relations and clean up things */ + for (i = 0; i < num_indexes; i++) + index_close(toastidxs[i], lock); + pfree(toastidxs); +} + +/* ---------- + * init_toast_snapshot + * + * Initialize an appropriate TOAST snapshot. We must use an MVCC snapshot + * to initialize the TOAST snapshot; since we don't know which one to use, + * just use the oldest one. This is safe: at worst, we will get a "snapshot + * too old" error that might have been avoided otherwise. + */ +void +init_toast_snapshot(Snapshot toast_snapshot) +{ + Snapshot snapshot = GetOldestSnapshot(); + + /* + * GetOldestSnapshot returns NULL if the session has no active snapshots. + * We can get that if, for example, a procedure fetches a toasted value + * into a local variable, commits, and then tries to detoast the value. + * Such coding is unsafe, because once we commit there is nothing to + * prevent the toast data from being deleted. Detoasting *must* happen in + * the same transaction that originally fetched the toast pointer. Hence, + * rather than trying to band-aid over the problem, throw an error. (This + * is not very much protection, because in many scenarios the procedure + * would have already created a new transaction snapshot, preventing us + * from detecting the problem. But it's better than nothing, and for sure + * we shouldn't expend code on masking the problem more.) + */ + if (snapshot == NULL) + elog(ERROR, "cannot fetch toast data without an active snapshot"); + + InitToastSnapshot(*toast_snapshot, snapshot->lsn, snapshot->whenTaken); +} diff --git a/src/backend/access/common/tupconvert.c b/src/backend/access/common/tupconvert.c new file mode 100644 index 0000000..64f5439 --- /dev/null +++ b/src/backend/access/common/tupconvert.c @@ -0,0 +1,293 @@ +/*------------------------------------------------------------------------- + * + * tupconvert.c + * Tuple conversion support. + * + * These functions provide conversion between rowtypes that are logically + * equivalent but might have columns in a different order or different sets of + * dropped columns. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/common/tupconvert.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/tupconvert.h" +#include "executor/tuptable.h" + + +/* + * The conversion setup routines have the following common API: + * + * The setup routine checks using attmap.c whether the given source and + * destination tuple descriptors are logically compatible. If not, it throws + * an error. If so, it returns NULL if they are physically compatible (ie, no + * conversion is needed), else a TupleConversionMap that can be used by + * execute_attr_map_tuple or execute_attr_map_slot to perform the conversion. + * + * The TupleConversionMap, if needed, is palloc'd in the caller's memory + * context. 
Also, the given tuple descriptors are referenced by the map, + * so they must survive as long as the map is needed. + * + * The caller must supply a suitable primary error message to be used if + * a compatibility error is thrown. Recommended coding practice is to use + * gettext_noop() on this string, so that it is translatable but won't + * actually be translated unless the error gets thrown. + * + * + * Implementation notes: + * + * The key component of a TupleConversionMap is an attrMap[] array with + * one entry per output column. This entry contains the 1-based index of + * the corresponding input column, or zero to force a NULL value (for + * a dropped output column). The TupleConversionMap also contains workspace + * arrays. + */ + + +/* + * Set up for tuple conversion, matching input and output columns by + * position. (Dropped columns are ignored in both input and output.) + */ +TupleConversionMap * +convert_tuples_by_position(TupleDesc indesc, + TupleDesc outdesc, + const char *msg) +{ + TupleConversionMap *map; + int n; + AttrMap *attrMap; + + /* Verify compatibility and prepare attribute-number map */ + attrMap = build_attrmap_by_position(indesc, outdesc, msg); + + if (attrMap == NULL) + { + /* runtime conversion is not needed */ + return NULL; + } + + /* Prepare the map structure */ + map = (TupleConversionMap *) palloc(sizeof(TupleConversionMap)); + map->indesc = indesc; + map->outdesc = outdesc; + map->attrMap = attrMap; + /* preallocate workspace for Datum arrays */ + n = outdesc->natts + 1; /* +1 for NULL */ + map->outvalues = (Datum *) palloc(n * sizeof(Datum)); + map->outisnull = (bool *) palloc(n * sizeof(bool)); + n = indesc->natts + 1; /* +1 for NULL */ + map->invalues = (Datum *) palloc(n * sizeof(Datum)); + map->inisnull = (bool *) palloc(n * sizeof(bool)); + map->invalues[0] = (Datum) 0; /* set up the NULL entry */ + map->inisnull[0] = true; + + return map; +} + +/* + * Set up for tuple conversion, matching input and output columns by name. + * (Dropped columns are ignored in both input and output.) This is intended + * for use when the rowtypes are related by inheritance, so we expect an exact + * match of both type and typmod. The error messages will be a bit unhelpful + * unless both rowtypes are named composite types. + */ +TupleConversionMap * +convert_tuples_by_name(TupleDesc indesc, + TupleDesc outdesc) +{ + TupleConversionMap *map; + AttrMap *attrMap; + int n = outdesc->natts; + + /* Verify compatibility and prepare attribute-number map */ + attrMap = build_attrmap_by_name_if_req(indesc, outdesc); + + if (attrMap == NULL) + { + /* runtime conversion is not needed */ + return NULL; + } + + /* Prepare the map structure */ + map = (TupleConversionMap *) palloc(sizeof(TupleConversionMap)); + map->indesc = indesc; + map->outdesc = outdesc; + map->attrMap = attrMap; + /* preallocate workspace for Datum arrays */ + map->outvalues = (Datum *) palloc(n * sizeof(Datum)); + map->outisnull = (bool *) palloc(n * sizeof(bool)); + n = indesc->natts + 1; /* +1 for NULL */ + map->invalues = (Datum *) palloc(n * sizeof(Datum)); + map->inisnull = (bool *) palloc(n * sizeof(bool)); + map->invalues[0] = (Datum) 0; /* set up the NULL entry */ + map->inisnull[0] = true; + + return map; +} + +/* + * Perform conversion of a tuple according to the map. 
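+ *
+ * A typical call sequence (a sketch; "srcrel" and "dstrel" are hypothetical
+ * open relations, not variables used in this file) is:
+ *
+ *     map = convert_tuples_by_name(RelationGetDescr(srcrel),
+ *                                  RelationGetDescr(dstrel));
+ *     if (map != NULL)
+ *         newtup = execute_attr_map_tuple(oldtup, map);
+ *     else
+ *         newtup = oldtup;    /* rowtypes are physically compatible */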
+ */ +HeapTuple +execute_attr_map_tuple(HeapTuple tuple, TupleConversionMap *map) +{ + AttrMap *attrMap = map->attrMap; + Datum *invalues = map->invalues; + bool *inisnull = map->inisnull; + Datum *outvalues = map->outvalues; + bool *outisnull = map->outisnull; + int i; + + /* + * Extract all the values of the old tuple, offsetting the arrays so that + * invalues[0] is left NULL and invalues[1] is the first source attribute; + * this exactly matches the numbering convention in attrMap. + */ + heap_deform_tuple(tuple, map->indesc, invalues + 1, inisnull + 1); + + /* + * Transpose into proper fields of the new tuple. + */ + Assert(attrMap->maplen == map->outdesc->natts); + for (i = 0; i < attrMap->maplen; i++) + { + int j = attrMap->attnums[i]; + + outvalues[i] = invalues[j]; + outisnull[i] = inisnull[j]; + } + + /* + * Now form the new tuple. + */ + return heap_form_tuple(map->outdesc, outvalues, outisnull); +} + +/* + * Perform conversion of a tuple slot according to the map. + */ +TupleTableSlot * +execute_attr_map_slot(AttrMap *attrMap, + TupleTableSlot *in_slot, + TupleTableSlot *out_slot) +{ + Datum *invalues; + bool *inisnull; + Datum *outvalues; + bool *outisnull; + int outnatts; + int i; + + /* Sanity checks */ + Assert(in_slot->tts_tupleDescriptor != NULL && + out_slot->tts_tupleDescriptor != NULL); + Assert(in_slot->tts_values != NULL && out_slot->tts_values != NULL); + + outnatts = out_slot->tts_tupleDescriptor->natts; + + /* Extract all the values of the in slot. */ + slot_getallattrs(in_slot); + + /* Before doing the mapping, clear any old contents from the out slot */ + ExecClearTuple(out_slot); + + invalues = in_slot->tts_values; + inisnull = in_slot->tts_isnull; + outvalues = out_slot->tts_values; + outisnull = out_slot->tts_isnull; + + /* Transpose into proper fields of the out slot. */ + for (i = 0; i < outnatts; i++) + { + int j = attrMap->attnums[i] - 1; + + /* attrMap->attnums[i] == 0 means it's a NULL datum. */ + if (j == -1) + { + outvalues[i] = (Datum) 0; + outisnull[i] = true; + } + else + { + outvalues[i] = invalues[j]; + outisnull[i] = inisnull[j]; + } + } + + ExecStoreVirtualTuple(out_slot); + + return out_slot; +} + +/* + * Perform conversion of bitmap of columns according to the map. + * + * The input and output bitmaps are offset by + * FirstLowInvalidHeapAttributeNumber to accommodate system cols, like the + * column-bitmaps in RangeTblEntry. + */ +Bitmapset * +execute_attr_map_cols(AttrMap *attrMap, Bitmapset *in_cols) +{ + Bitmapset *out_cols; + int out_attnum; + + /* fast path for the common trivial case */ + if (in_cols == NULL) + return NULL; + + /* + * For each output column, check which input column it corresponds to. + */ + out_cols = NULL; + + for (out_attnum = FirstLowInvalidHeapAttributeNumber; + out_attnum <= attrMap->maplen; + out_attnum++) + { + int in_attnum; + + if (out_attnum < 0) + { + /* System column. No mapping. */ + in_attnum = out_attnum; + } + else if (out_attnum == 0) + continue; + else + { + /* normal user column */ + in_attnum = attrMap->attnums[out_attnum - 1]; + + if (in_attnum == 0) + continue; + } + + if (bms_is_member(in_attnum - FirstLowInvalidHeapAttributeNumber, in_cols)) + out_cols = bms_add_member(out_cols, out_attnum - FirstLowInvalidHeapAttributeNumber); + } + + return out_cols; +} + +/* + * Free a TupleConversionMap structure. 
+ */ +void +free_conversion_map(TupleConversionMap *map) +{ + /* indesc and outdesc are not ours to free */ + free_attrmap(map->attrMap); + pfree(map->invalues); + pfree(map->inisnull); + pfree(map->outvalues); + pfree(map->outisnull); + pfree(map); +} diff --git a/src/backend/access/common/tupdesc.c b/src/backend/access/common/tupdesc.c new file mode 100644 index 0000000..4c63bd4 --- /dev/null +++ b/src/backend/access/common/tupdesc.c @@ -0,0 +1,912 @@ +/*------------------------------------------------------------------------- + * + * tupdesc.c + * POSTGRES tuple descriptor support code + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/common/tupdesc.c + * + * NOTES + * some of the executor utility code such as "ExecTypeFromTL" should be + * moved here. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/htup_details.h" +#include "access/toast_compression.h" +#include "access/tupdesc_details.h" +#include "catalog/pg_collation.h" +#include "catalog/pg_type.h" +#include "common/hashfn.h" +#include "miscadmin.h" +#include "parser/parse_type.h" +#include "utils/acl.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/resowner_private.h" +#include "utils/syscache.h" + + +/* + * CreateTemplateTupleDesc + * This function allocates an empty tuple descriptor structure. + * + * Tuple type ID information is initially set for an anonymous record type; + * caller can overwrite this if needed. + */ +TupleDesc +CreateTemplateTupleDesc(int natts) +{ + TupleDesc desc; + + /* + * sanity checks + */ + AssertArg(natts >= 0); + + /* + * Allocate enough memory for the tuple descriptor, including the + * attribute rows. + * + * Note: the attribute array stride is sizeof(FormData_pg_attribute), + * since we declare the array elements as FormData_pg_attribute for + * notational convenience. However, we only guarantee that the first + * ATTRIBUTE_FIXED_PART_SIZE bytes of each entry are valid; most code that + * copies tupdesc entries around copies just that much. In principle that + * could be less due to trailing padding, although with the current + * definition of pg_attribute there probably isn't any padding. + */ + desc = (TupleDesc) palloc(offsetof(struct TupleDescData, attrs) + + natts * sizeof(FormData_pg_attribute)); + + /* + * Initialize other fields of the tupdesc. + */ + desc->natts = natts; + desc->constr = NULL; + desc->tdtypeid = RECORDOID; + desc->tdtypmod = -1; + desc->tdrefcount = -1; /* assume not reference-counted */ + + return desc; +} + +/* + * CreateTupleDesc + * This function allocates a new TupleDesc by copying a given + * Form_pg_attribute array. + * + * Tuple type ID information is initially set for an anonymous record type; + * caller can overwrite this if needed. + */ +TupleDesc +CreateTupleDesc(int natts, Form_pg_attribute *attrs) +{ + TupleDesc desc; + int i; + + desc = CreateTemplateTupleDesc(natts); + + for (i = 0; i < natts; ++i) + memcpy(TupleDescAttr(desc, i), attrs[i], ATTRIBUTE_FIXED_PART_SIZE); + + return desc; +} + +/* + * CreateTupleDescCopy + * This function creates a new TupleDesc by copying from an existing + * TupleDesc. + * + * !!! Constraints and defaults are not copied !!! 
+ */ +TupleDesc +CreateTupleDescCopy(TupleDesc tupdesc) +{ + TupleDesc desc; + int i; + + desc = CreateTemplateTupleDesc(tupdesc->natts); + + /* Flat-copy the attribute array */ + memcpy(TupleDescAttr(desc, 0), + TupleDescAttr(tupdesc, 0), + desc->natts * sizeof(FormData_pg_attribute)); + + /* + * Since we're not copying constraints and defaults, clear fields + * associated with them. + */ + for (i = 0; i < desc->natts; i++) + { + Form_pg_attribute att = TupleDescAttr(desc, i); + + att->attnotnull = false; + att->atthasdef = false; + att->atthasmissing = false; + att->attidentity = '\0'; + att->attgenerated = '\0'; + } + + /* We can copy the tuple type identification, too */ + desc->tdtypeid = tupdesc->tdtypeid; + desc->tdtypmod = tupdesc->tdtypmod; + + return desc; +} + +/* + * CreateTupleDescCopyConstr + * This function creates a new TupleDesc by copying from an existing + * TupleDesc (including its constraints and defaults). + */ +TupleDesc +CreateTupleDescCopyConstr(TupleDesc tupdesc) +{ + TupleDesc desc; + TupleConstr *constr = tupdesc->constr; + int i; + + desc = CreateTemplateTupleDesc(tupdesc->natts); + + /* Flat-copy the attribute array */ + memcpy(TupleDescAttr(desc, 0), + TupleDescAttr(tupdesc, 0), + desc->natts * sizeof(FormData_pg_attribute)); + + /* Copy the TupleConstr data structure, if any */ + if (constr) + { + TupleConstr *cpy = (TupleConstr *) palloc0(sizeof(TupleConstr)); + + cpy->has_not_null = constr->has_not_null; + cpy->has_generated_stored = constr->has_generated_stored; + + if ((cpy->num_defval = constr->num_defval) > 0) + { + cpy->defval = (AttrDefault *) palloc(cpy->num_defval * sizeof(AttrDefault)); + memcpy(cpy->defval, constr->defval, cpy->num_defval * sizeof(AttrDefault)); + for (i = cpy->num_defval - 1; i >= 0; i--) + cpy->defval[i].adbin = pstrdup(constr->defval[i].adbin); + } + + if (constr->missing) + { + cpy->missing = (AttrMissing *) palloc(tupdesc->natts * sizeof(AttrMissing)); + memcpy(cpy->missing, constr->missing, tupdesc->natts * sizeof(AttrMissing)); + for (i = tupdesc->natts - 1; i >= 0; i--) + { + if (constr->missing[i].am_present) + { + Form_pg_attribute attr = TupleDescAttr(tupdesc, i); + + cpy->missing[i].am_value = datumCopy(constr->missing[i].am_value, + attr->attbyval, + attr->attlen); + } + } + } + + if ((cpy->num_check = constr->num_check) > 0) + { + cpy->check = (ConstrCheck *) palloc(cpy->num_check * sizeof(ConstrCheck)); + memcpy(cpy->check, constr->check, cpy->num_check * sizeof(ConstrCheck)); + for (i = cpy->num_check - 1; i >= 0; i--) + { + cpy->check[i].ccname = pstrdup(constr->check[i].ccname); + cpy->check[i].ccbin = pstrdup(constr->check[i].ccbin); + cpy->check[i].ccvalid = constr->check[i].ccvalid; + cpy->check[i].ccnoinherit = constr->check[i].ccnoinherit; + } + } + + desc->constr = cpy; + } + + /* We can copy the tuple type identification, too */ + desc->tdtypeid = tupdesc->tdtypeid; + desc->tdtypmod = tupdesc->tdtypmod; + + return desc; +} + +/* + * TupleDescCopy + * Copy a tuple descriptor into caller-supplied memory. + * The memory may be shared memory mapped at any address, and must + * be sufficient to hold TupleDescSize(src) bytes. + * + * !!! Constraints and defaults are not copied !!! + */ +void +TupleDescCopy(TupleDesc dst, TupleDesc src) +{ + int i; + + /* Flat-copy the header and attribute array */ + memcpy(dst, src, TupleDescSize(src)); + + /* + * Since we're not copying constraints and defaults, clear fields + * associated with them. 
+ */ + for (i = 0; i < dst->natts; i++) + { + Form_pg_attribute att = TupleDescAttr(dst, i); + + att->attnotnull = false; + att->atthasdef = false; + att->atthasmissing = false; + att->attidentity = '\0'; + att->attgenerated = '\0'; + } + dst->constr = NULL; + + /* + * Also, assume the destination is not to be ref-counted. (Copying the + * source's refcount would be wrong in any case.) + */ + dst->tdrefcount = -1; +} + +/* + * TupleDescCopyEntry + * This function copies a single attribute structure from one tuple + * descriptor to another. + * + * !!! Constraints and defaults are not copied !!! + */ +void +TupleDescCopyEntry(TupleDesc dst, AttrNumber dstAttno, + TupleDesc src, AttrNumber srcAttno) +{ + Form_pg_attribute dstAtt = TupleDescAttr(dst, dstAttno - 1); + Form_pg_attribute srcAtt = TupleDescAttr(src, srcAttno - 1); + + /* + * sanity checks + */ + AssertArg(PointerIsValid(src)); + AssertArg(PointerIsValid(dst)); + AssertArg(srcAttno >= 1); + AssertArg(srcAttno <= src->natts); + AssertArg(dstAttno >= 1); + AssertArg(dstAttno <= dst->natts); + + memcpy(dstAtt, srcAtt, ATTRIBUTE_FIXED_PART_SIZE); + + /* + * Aside from updating the attno, we'd better reset attcacheoff. + * + * XXX Actually, to be entirely safe we'd need to reset the attcacheoff of + * all following columns in dst as well. Current usage scenarios don't + * require that though, because all following columns will get initialized + * by other uses of this function or TupleDescInitEntry. So we cheat a + * bit to avoid a useless O(N^2) penalty. + */ + dstAtt->attnum = dstAttno; + dstAtt->attcacheoff = -1; + + /* since we're not copying constraints or defaults, clear these */ + dstAtt->attnotnull = false; + dstAtt->atthasdef = false; + dstAtt->atthasmissing = false; + dstAtt->attidentity = '\0'; + dstAtt->attgenerated = '\0'; +} + +/* + * Free a TupleDesc including all substructure + */ +void +FreeTupleDesc(TupleDesc tupdesc) +{ + int i; + + /* + * Possibly this should assert tdrefcount == 0, to disallow explicit + * freeing of un-refcounted tupdescs? + */ + Assert(tupdesc->tdrefcount <= 0); + + if (tupdesc->constr) + { + if (tupdesc->constr->num_defval > 0) + { + AttrDefault *attrdef = tupdesc->constr->defval; + + for (i = tupdesc->constr->num_defval - 1; i >= 0; i--) + pfree(attrdef[i].adbin); + pfree(attrdef); + } + if (tupdesc->constr->missing) + { + AttrMissing *attrmiss = tupdesc->constr->missing; + + for (i = tupdesc->natts - 1; i >= 0; i--) + { + if (attrmiss[i].am_present + && !TupleDescAttr(tupdesc, i)->attbyval) + pfree(DatumGetPointer(attrmiss[i].am_value)); + } + pfree(attrmiss); + } + if (tupdesc->constr->num_check > 0) + { + ConstrCheck *check = tupdesc->constr->check; + + for (i = tupdesc->constr->num_check - 1; i >= 0; i--) + { + pfree(check[i].ccname); + pfree(check[i].ccbin); + } + pfree(check); + } + pfree(tupdesc->constr); + } + + pfree(tupdesc); +} + +/* + * Increment the reference count of a tupdesc, and log the reference in + * CurrentResourceOwner. + * + * Do not apply this to tupdescs that are not being refcounted. (Use the + * macro PinTupleDesc for tupdescs of uncertain status.) + */ +void +IncrTupleDescRefCount(TupleDesc tupdesc) +{ + Assert(tupdesc->tdrefcount >= 0); + + ResourceOwnerEnlargeTupleDescs(CurrentResourceOwner); + tupdesc->tdrefcount++; + ResourceOwnerRememberTupleDesc(CurrentResourceOwner, tupdesc); +} + +/* + * Decrement the reference count of a tupdesc, remove the corresponding + * reference from CurrentResourceOwner, and free the tupdesc if no more + * references remain. 
+ * + * Do not apply this to tupdescs that are not being refcounted. (Use the + * macro ReleaseTupleDesc for tupdescs of uncertain status.) + */ +void +DecrTupleDescRefCount(TupleDesc tupdesc) +{ + Assert(tupdesc->tdrefcount > 0); + + ResourceOwnerForgetTupleDesc(CurrentResourceOwner, tupdesc); + if (--tupdesc->tdrefcount == 0) + FreeTupleDesc(tupdesc); +} + +/* + * Compare two TupleDesc structures for logical equality + * + * Note: we deliberately do not check the attrelid and tdtypmod fields. + * This allows typcache.c to use this routine to see if a cached record type + * matches a requested type, and is harmless for relcache.c's uses. + * We don't compare tdrefcount, either. + */ +bool +equalTupleDescs(TupleDesc tupdesc1, TupleDesc tupdesc2) +{ + int i, + n; + + if (tupdesc1->natts != tupdesc2->natts) + return false; + if (tupdesc1->tdtypeid != tupdesc2->tdtypeid) + return false; + + for (i = 0; i < tupdesc1->natts; i++) + { + Form_pg_attribute attr1 = TupleDescAttr(tupdesc1, i); + Form_pg_attribute attr2 = TupleDescAttr(tupdesc2, i); + + /* + * We do not need to check every single field here: we can disregard + * attrelid and attnum (which were used to place the row in the attrs + * array in the first place). It might look like we could dispense + * with checking attlen/attbyval/attalign, since these are derived + * from atttypid; but in the case of dropped columns we must check + * them (since atttypid will be zero for all dropped columns) and in + * general it seems safer to check them always. + * + * attcacheoff must NOT be checked since it's possibly not set in both + * copies. We also intentionally ignore atthasmissing, since that's + * not very relevant in tupdescs, which lack the attmissingval field. + */ + if (strcmp(NameStr(attr1->attname), NameStr(attr2->attname)) != 0) + return false; + if (attr1->atttypid != attr2->atttypid) + return false; + if (attr1->attstattarget != attr2->attstattarget) + return false; + if (attr1->attlen != attr2->attlen) + return false; + if (attr1->attndims != attr2->attndims) + return false; + if (attr1->atttypmod != attr2->atttypmod) + return false; + if (attr1->attbyval != attr2->attbyval) + return false; + if (attr1->attalign != attr2->attalign) + return false; + if (attr1->attstorage != attr2->attstorage) + return false; + if (attr1->attcompression != attr2->attcompression) + return false; + if (attr1->attnotnull != attr2->attnotnull) + return false; + if (attr1->atthasdef != attr2->atthasdef) + return false; + if (attr1->attidentity != attr2->attidentity) + return false; + if (attr1->attgenerated != attr2->attgenerated) + return false; + if (attr1->attisdropped != attr2->attisdropped) + return false; + if (attr1->attislocal != attr2->attislocal) + return false; + if (attr1->attinhcount != attr2->attinhcount) + return false; + if (attr1->attcollation != attr2->attcollation) + return false; + /* variable-length fields are not even present... 
*/ + } + + if (tupdesc1->constr != NULL) + { + TupleConstr *constr1 = tupdesc1->constr; + TupleConstr *constr2 = tupdesc2->constr; + + if (constr2 == NULL) + return false; + if (constr1->has_not_null != constr2->has_not_null) + return false; + if (constr1->has_generated_stored != constr2->has_generated_stored) + return false; + n = constr1->num_defval; + if (n != (int) constr2->num_defval) + return false; + /* We assume here that both AttrDefault arrays are in adnum order */ + for (i = 0; i < n; i++) + { + AttrDefault *defval1 = constr1->defval + i; + AttrDefault *defval2 = constr2->defval + i; + + if (defval1->adnum != defval2->adnum) + return false; + if (strcmp(defval1->adbin, defval2->adbin) != 0) + return false; + } + if (constr1->missing) + { + if (!constr2->missing) + return false; + for (i = 0; i < tupdesc1->natts; i++) + { + AttrMissing *missval1 = constr1->missing + i; + AttrMissing *missval2 = constr2->missing + i; + + if (missval1->am_present != missval2->am_present) + return false; + if (missval1->am_present) + { + Form_pg_attribute missatt1 = TupleDescAttr(tupdesc1, i); + + if (!datumIsEqual(missval1->am_value, missval2->am_value, + missatt1->attbyval, missatt1->attlen)) + return false; + } + } + } + else if (constr2->missing) + return false; + n = constr1->num_check; + if (n != (int) constr2->num_check) + return false; + + /* + * Similarly, we rely here on the ConstrCheck entries being sorted by + * name. If there are duplicate names, the outcome of the comparison + * is uncertain, but that should not happen. + */ + for (i = 0; i < n; i++) + { + ConstrCheck *check1 = constr1->check + i; + ConstrCheck *check2 = constr2->check + i; + + if (!(strcmp(check1->ccname, check2->ccname) == 0 && + strcmp(check1->ccbin, check2->ccbin) == 0 && + check1->ccvalid == check2->ccvalid && + check1->ccnoinherit == check2->ccnoinherit)) + return false; + } + } + else if (tupdesc2->constr != NULL) + return false; + return true; +} + +/* + * hashTupleDesc + * Compute a hash value for a tuple descriptor. + * + * If two tuple descriptors would be considered equal by equalTupleDescs() + * then their hash value will be equal according to this function. + * + * Note that currently contents of constraint are not hashed - it'd be a bit + * painful to do so, and conflicts just due to constraints are unlikely. + */ +uint32 +hashTupleDesc(TupleDesc desc) +{ + uint32 s; + int i; + + s = hash_combine(0, hash_uint32(desc->natts)); + s = hash_combine(s, hash_uint32(desc->tdtypeid)); + for (i = 0; i < desc->natts; ++i) + s = hash_combine(s, hash_uint32(TupleDescAttr(desc, i)->atttypid)); + + return s; +} + +/* + * TupleDescInitEntry + * This function initializes a single attribute structure in + * a previously allocated tuple descriptor. + * + * If attributeName is NULL, the attname field is set to an empty string + * (this is for cases where we don't know or need a name for the field). + * Also, some callers use this function to change the datatype-related fields + * in an existing tupdesc; they pass attributeName = NameStr(att->attname) + * to indicate that the attname field shouldn't be modified. + * + * Note that attcollation is set to the default for the specified datatype. + * If a nondefault collation is needed, insert it afterwards using + * TupleDescInitEntryCollation. 
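+ *
+ * As a usage sketch (not taken from this file), a caller building a
+ * two-column rowtype of (id int4, name text) might do:
+ *
+ *     TupleDesc  desc = CreateTemplateTupleDesc(2);
+ *
+ *     TupleDescInitEntry(desc, (AttrNumber) 1, "id", INT4OID, -1, 0);
+ *     TupleDescInitEntry(desc, (AttrNumber) 2, "name", TEXTOID, -1, 0);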
+ */ +void +TupleDescInitEntry(TupleDesc desc, + AttrNumber attributeNumber, + const char *attributeName, + Oid oidtypeid, + int32 typmod, + int attdim) +{ + HeapTuple tuple; + Form_pg_type typeForm; + Form_pg_attribute att; + + /* + * sanity checks + */ + AssertArg(PointerIsValid(desc)); + AssertArg(attributeNumber >= 1); + AssertArg(attributeNumber <= desc->natts); + + /* + * initialize the attribute fields + */ + att = TupleDescAttr(desc, attributeNumber - 1); + + att->attrelid = 0; /* dummy value */ + + /* + * Note: attributeName can be NULL, because the planner doesn't always + * fill in valid resname values in targetlists, particularly for resjunk + * attributes. Also, do nothing if caller wants to re-use the old attname. + */ + if (attributeName == NULL) + MemSet(NameStr(att->attname), 0, NAMEDATALEN); + else if (attributeName != NameStr(att->attname)) + namestrcpy(&(att->attname), attributeName); + + att->attstattarget = -1; + att->attcacheoff = -1; + att->atttypmod = typmod; + + att->attnum = attributeNumber; + att->attndims = attdim; + + att->attnotnull = false; + att->atthasdef = false; + att->atthasmissing = false; + att->attidentity = '\0'; + att->attgenerated = '\0'; + att->attisdropped = false; + att->attislocal = true; + att->attinhcount = 0; + /* attacl, attoptions and attfdwoptions are not present in tupledescs */ + + tuple = SearchSysCache1(TYPEOID, ObjectIdGetDatum(oidtypeid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for type %u", oidtypeid); + typeForm = (Form_pg_type) GETSTRUCT(tuple); + + att->atttypid = oidtypeid; + att->attlen = typeForm->typlen; + att->attbyval = typeForm->typbyval; + att->attalign = typeForm->typalign; + att->attstorage = typeForm->typstorage; + att->attcompression = InvalidCompressionMethod; + att->attcollation = typeForm->typcollation; + + ReleaseSysCache(tuple); +} + +/* + * TupleDescInitBuiltinEntry + * Initialize a tuple descriptor without catalog access. Only + * a limited range of builtin types are supported. + */ +void +TupleDescInitBuiltinEntry(TupleDesc desc, + AttrNumber attributeNumber, + const char *attributeName, + Oid oidtypeid, + int32 typmod, + int attdim) +{ + Form_pg_attribute att; + + /* sanity checks */ + AssertArg(PointerIsValid(desc)); + AssertArg(attributeNumber >= 1); + AssertArg(attributeNumber <= desc->natts); + + /* initialize the attribute fields */ + att = TupleDescAttr(desc, attributeNumber - 1); + att->attrelid = 0; /* dummy value */ + + /* unlike TupleDescInitEntry, we require an attribute name */ + Assert(attributeName != NULL); + namestrcpy(&(att->attname), attributeName); + + att->attstattarget = -1; + att->attcacheoff = -1; + att->atttypmod = typmod; + + att->attnum = attributeNumber; + att->attndims = attdim; + + att->attnotnull = false; + att->atthasdef = false; + att->atthasmissing = false; + att->attidentity = '\0'; + att->attgenerated = '\0'; + att->attisdropped = false; + att->attislocal = true; + att->attinhcount = 0; + /* attacl, attoptions and attfdwoptions are not present in tupledescs */ + + att->atttypid = oidtypeid; + + /* + * Our goal here is to support just enough types to let basic builtin + * commands work without catalog access - e.g. so that we can do certain + * things even in processes that are not connected to a database. 
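+ *
+ * For example (sketch only; the column names are hypothetical), a simple
+ * status row could be described without any catalog lookups:
+ *
+ *     TupleDescInitBuiltinEntry(desc, 1, "pid", INT4OID, -1, 0);
+ *     TupleDescInitBuiltinEntry(desc, 2, "status", TEXTOID, -1, 0);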
+ */ + switch (oidtypeid) + { + case TEXTOID: + case TEXTARRAYOID: + att->attlen = -1; + att->attbyval = false; + att->attalign = TYPALIGN_INT; + att->attstorage = TYPSTORAGE_EXTENDED; + att->attcompression = InvalidCompressionMethod; + att->attcollation = DEFAULT_COLLATION_OID; + break; + + case BOOLOID: + att->attlen = 1; + att->attbyval = true; + att->attalign = TYPALIGN_CHAR; + att->attstorage = TYPSTORAGE_PLAIN; + att->attcompression = InvalidCompressionMethod; + att->attcollation = InvalidOid; + break; + + case INT4OID: + att->attlen = 4; + att->attbyval = true; + att->attalign = TYPALIGN_INT; + att->attstorage = TYPSTORAGE_PLAIN; + att->attcompression = InvalidCompressionMethod; + att->attcollation = InvalidOid; + break; + + case INT8OID: + att->attlen = 8; + att->attbyval = FLOAT8PASSBYVAL; + att->attalign = TYPALIGN_DOUBLE; + att->attstorage = TYPSTORAGE_PLAIN; + att->attcompression = InvalidCompressionMethod; + att->attcollation = InvalidOid; + break; + + default: + elog(ERROR, "unsupported type %u", oidtypeid); + } +} + +/* + * TupleDescInitEntryCollation + * + * Assign a nondefault collation to a previously initialized tuple descriptor + * entry. + */ +void +TupleDescInitEntryCollation(TupleDesc desc, + AttrNumber attributeNumber, + Oid collationid) +{ + /* + * sanity checks + */ + AssertArg(PointerIsValid(desc)); + AssertArg(attributeNumber >= 1); + AssertArg(attributeNumber <= desc->natts); + + TupleDescAttr(desc, attributeNumber - 1)->attcollation = collationid; +} + + +/* + * BuildDescForRelation + * + * Given a relation schema (list of ColumnDef nodes), build a TupleDesc. + * + * Note: tdtypeid will need to be filled in later on. + */ +TupleDesc +BuildDescForRelation(List *schema) +{ + int natts; + AttrNumber attnum; + ListCell *l; + TupleDesc desc; + bool has_not_null; + char *attname; + Oid atttypid; + int32 atttypmod; + Oid attcollation; + int attdim; + + /* + * allocate a new tuple descriptor + */ + natts = list_length(schema); + desc = CreateTemplateTupleDesc(natts); + has_not_null = false; + + attnum = 0; + + foreach(l, schema) + { + ColumnDef *entry = lfirst(l); + AclResult aclresult; + Form_pg_attribute att; + + /* + * for each entry in the list, get the name and type information from + * the list and have TupleDescInitEntry fill in the attribute + * information we need. 
+ */ + attnum++; + + attname = entry->colname; + typenameTypeIdAndMod(NULL, entry->typeName, &atttypid, &atttypmod); + + aclresult = pg_type_aclcheck(atttypid, GetUserId(), ACL_USAGE); + if (aclresult != ACLCHECK_OK) + aclcheck_error_type(aclresult, atttypid); + + attcollation = GetColumnDefCollation(NULL, entry, atttypid); + attdim = list_length(entry->typeName->arrayBounds); + + if (entry->typeName->setof) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TABLE_DEFINITION), + errmsg("column \"%s\" cannot be declared SETOF", + attname))); + + TupleDescInitEntry(desc, attnum, attname, + atttypid, atttypmod, attdim); + att = TupleDescAttr(desc, attnum - 1); + + /* Override TupleDescInitEntry's settings as requested */ + TupleDescInitEntryCollation(desc, attnum, attcollation); + if (entry->storage) + att->attstorage = entry->storage; + + /* Fill in additional stuff not handled by TupleDescInitEntry */ + att->attnotnull = entry->is_not_null; + has_not_null |= entry->is_not_null; + att->attislocal = entry->is_local; + att->attinhcount = entry->inhcount; + } + + if (has_not_null) + { + TupleConstr *constr = (TupleConstr *) palloc0(sizeof(TupleConstr)); + + constr->has_not_null = true; + constr->has_generated_stored = false; + constr->defval = NULL; + constr->missing = NULL; + constr->num_defval = 0; + constr->check = NULL; + constr->num_check = 0; + desc->constr = constr; + } + else + { + desc->constr = NULL; + } + + return desc; +} + +/* + * BuildDescFromLists + * + * Build a TupleDesc given lists of column names (as String nodes), + * column type OIDs, typmods, and collation OIDs. + * + * No constraints are generated. + * + * This is essentially a cut-down version of BuildDescForRelation for use + * with functions returning RECORD. + */ +TupleDesc +BuildDescFromLists(List *names, List *types, List *typmods, List *collations) +{ + int natts; + AttrNumber attnum; + ListCell *l1; + ListCell *l2; + ListCell *l3; + ListCell *l4; + TupleDesc desc; + + natts = list_length(names); + Assert(natts == list_length(types)); + Assert(natts == list_length(typmods)); + Assert(natts == list_length(collations)); + + /* + * allocate a new tuple descriptor + */ + desc = CreateTemplateTupleDesc(natts); + + attnum = 0; + forfour(l1, names, l2, types, l3, typmods, l4, collations) + { + char *attname = strVal(lfirst(l1)); + Oid atttypid = lfirst_oid(l2); + int32 atttypmod = lfirst_int(l3); + Oid attcollation = lfirst_oid(l4); + + attnum++; + + TupleDescInitEntry(desc, attnum, attname, atttypid, atttypmod, 0); + TupleDescInitEntryCollation(desc, attnum, attcollation); + } + + return desc; +} diff --git a/src/backend/access/gin/Makefile b/src/backend/access/gin/Makefile new file mode 100644 index 0000000..3fceaee --- /dev/null +++ b/src/backend/access/gin/Makefile @@ -0,0 +1,32 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/gin +# +# IDENTIFICATION +# src/backend/access/gin/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/gin +top_builddir = ../../../.. 
+include $(top_builddir)/src/Makefile.global + +OBJS = \ + ginarrayproc.o \ + ginbtree.o \ + ginbulk.o \ + gindatapage.o \ + ginentrypage.o \ + ginfast.o \ + ginget.o \ + gininsert.o \ + ginlogic.o \ + ginpostinglist.o \ + ginscan.o \ + ginutil.o \ + ginvacuum.o \ + ginvalidate.o \ + ginxlog.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/gin/README b/src/backend/access/gin/README new file mode 100644 index 0000000..41d4e1e --- /dev/null +++ b/src/backend/access/gin/README @@ -0,0 +1,562 @@ +src/backend/access/gin/README + +Gin for PostgreSQL +================== + +Gin was sponsored by jfg://networks (http://www.jfg-networks.com/) + +Gin stands for Generalized Inverted Index and should be considered as a genie, +not a drink. + +Generalized means that the index does not know which operation it accelerates. +It instead works with custom strategies, defined for specific data types (read +"Index Method Strategies" in the PostgreSQL documentation). In that sense, Gin +is similar to GiST and differs from btree indices, which have predefined, +comparison-based operations. + +An inverted index is an index structure storing a set of (key, posting list) +pairs, where 'posting list' is a set of heap rows in which the key occurs. +(A text document would usually contain many keys.) The primary goal of +Gin indices is support for highly scalable, full-text search in PostgreSQL. + +A Gin index consists of a B-tree index constructed over key values, +where each key is an element of some indexed items (element of array, lexeme +for tsvector) and where each tuple in a leaf page contains either a pointer to +a B-tree over item pointers (posting tree), or a simple list of item pointers +(posting list) if the list is small enough. + +Note: There is no delete operation in the key (entry) tree. The reason for +this is that in our experience, the set of distinct words in a large corpus +changes very slowly. This greatly simplifies the code and concurrency +algorithms. + +Core PostgreSQL includes built-in Gin support for one-dimensional arrays +(eg. integer[], text[]). The following operations are available: + + * contains: value_array @> query_array + * overlaps: value_array && query_array + * is contained by: value_array <@ query_array + +Synopsis +-------- + +=# create index txt_idx on aa using gin(a); + +Features +-------- + + * Concurrency + * Write-Ahead Logging (WAL). (Recoverability from crashes.) + * User-defined opclasses. (The scheme is similar to GiST.) + * Optimized index creation (Makes use of maintenance_work_mem to accumulate + postings in memory.) + * Text search support via an opclass + * Soft upper limit on the returned results set using a GUC variable: + gin_fuzzy_search_limit + +Gin Fuzzy Limit +--------------- + +There are often situations when a full-text search returns a very large set of +results. Since reading tuples from the disk and sorting them could take a +lot of time, this is unacceptable for production. (Note that the search +itself is very fast.) + +Such queries usually contain very frequent lexemes, so the results are not +very helpful. To facilitate execution of such queries Gin has a configurable +soft upper limit on the size of the returned set, determined by the +'gin_fuzzy_search_limit' GUC variable. This is set to 0 by default (no +limit). + +If a non-zero search limit is set, then the returned set is a subset of the +whole result set, chosen at random. 
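+
+One way to picture the effect of the limit (a minimal sketch, not the actual
+item-dropping logic in ginget.c):
+
+    #include <stdbool.h>
+    #include <stdlib.h>     /* random() */
+
+    /*
+     * Sketch: drop each candidate so that, out of an estimated
+     * predicted_total matches, roughly soft_limit survive.
+     */
+    static bool
+    drop_candidate(long predicted_total, long soft_limit)
+    {
+        if (soft_limit <= 0 || predicted_total <= soft_limit)
+            return false;   /* limiting disabled or not needed */
+        return (random() % predicted_total) >= soft_limit;
+    }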
+ +"Soft" means that the actual number of returned results could differ +from the specified limit, depending on the query and the quality of the +system's random number generator. + +From experience, a value of 'gin_fuzzy_search_limit' in the thousands +(eg. 5000-20000) works well. This means that 'gin_fuzzy_search_limit' will +have no effect for queries returning a result set with less tuples than this +number. + +Index structure +--------------- + +The "items" that a GIN index indexes are composite values that contain +zero or more "keys". For example, an item might be an integer array, and +then the keys would be the individual integer values. The index actually +stores and searches for the key values, not the items per se. In the +pg_opclass entry for a GIN opclass, the opcintype is the data type of the +items, and the opckeytype is the data type of the keys. GIN is optimized +for cases where items contain many keys and the same key values appear +in many different items. + +A GIN index contains a metapage, a btree of key entries, and possibly +"posting tree" pages, which hold the overflow when a key entry acquires +too many heap tuple pointers to fit in a btree page. Additionally, if the +fast-update feature is enabled, there can be "list pages" holding "pending" +key entries that haven't yet been merged into the main btree. The list +pages have to be scanned linearly when doing a search, so the pending +entries should be merged into the main btree before there get to be too +many of them. The advantage of the pending list is that bulk insertion of +a few thousand entries can be much faster than retail insertion. (The win +comes mainly from not having to do multiple searches/insertions when the +same key appears in multiple new heap tuples.) + +Key entries are nominally of the same IndexTuple format as used in other +index types, but since a leaf key entry typically refers to multiple heap +tuples, there are significant differences. (See GinFormTuple, which works +by building a "normal" index tuple and then modifying it.) The points to +know are: + +* In a single-column index, a key tuple just contains the key datum, but +in a multi-column index, a key tuple contains the pair (column number, +key datum) where the column number is stored as an int2. This is needed +to support different key data types in different columns. This much of +the tuple is built by index_form_tuple according to the usual rules. +The column number (if present) can never be null, but the key datum can +be, in which case a null bitmap is present as usual. (As usual for index +tuples, the size of the null bitmap is fixed at INDEX_MAX_KEYS.) + +* If the key datum is null (ie, IndexTupleHasNulls() is true), then +just after the nominal index data (ie, at offset IndexInfoFindDataOffset +or IndexInfoFindDataOffset + sizeof(int2)) there is a byte indicating +the "category" of the null entry. These are the possible categories: + 1 = ordinary null key value extracted from an indexable item + 2 = placeholder for zero-key indexable item + 3 = placeholder for null indexable item +Placeholder null entries are inserted into the index because otherwise +there would be no index entry at all for an empty or null indexable item, +which would mean that full index scans couldn't be done and various corner +cases would give wrong answers. 
The different categories of null entries +are treated as distinct keys by the btree, but heap itempointers for the +same category of null entry are merged into one index entry just as happens +with ordinary key entries. + +* In a key entry at the btree leaf level, at the next SHORTALIGN boundary, +there is a list of item pointers, in compressed format (see Posting List +Compression section), pointing to the heap tuples for which the indexable +items contain this key. This is called the "posting list". + +If the list would be too big for the index tuple to fit on an index page, the +ItemPointers are pushed out to a separate posting page or pages, and none +appear in the key entry itself. The separate pages are called a "posting +tree" (see below); Note that in either case, the ItemPointers associated with +a key can easily be read out in sorted order; this is relied on by the scan +algorithms. + +* The index tuple header fields of a leaf key entry are abused as follows: + +1) Posting list case: + +* ItemPointerGetBlockNumber(&itup->t_tid) contains the offset from index + tuple start to the posting list. + Access macros: GinGetPostingOffset(itup) / GinSetPostingOffset(itup,n) + +* ItemPointerGetOffsetNumber(&itup->t_tid) contains the number of elements + in the posting list (number of heap itempointers). + Access macros: GinGetNPosting(itup) / GinSetNPosting(itup,n) + +* If IndexTupleHasNulls(itup) is true, the null category byte can be + accessed/set with GinGetNullCategory(itup,gs) / GinSetNullCategory(itup,gs,c) + +* The posting list can be accessed with GinGetPosting(itup) + +* If GinItupIsCompressed(itup), the posting list is stored in compressed + format. Otherwise it is just an array of ItemPointers. New tuples are always + stored in compressed format, uncompressed items can be present if the + database was migrated from 9.3 or earlier version. + +2) Posting tree case: + +* ItemPointerGetBlockNumber(&itup->t_tid) contains the index block number + of the root of the posting tree. + Access macros: GinGetPostingTree(itup) / GinSetPostingTree(itup, blkno) + +* ItemPointerGetOffsetNumber(&itup->t_tid) contains the magic number + GIN_TREE_POSTING, which distinguishes this from the posting-list case + (it's large enough that that many heap itempointers couldn't possibly + fit on an index page). This value is inserted automatically by the + GinSetPostingTree macro. + +* If IndexTupleHasNulls(itup) is true, the null category byte can be + accessed/set with GinGetNullCategory(itup,gs) / GinSetNullCategory(itup,gs,c) + +* The posting list is not present and must not be accessed. + +Use the macro GinIsPostingTree(itup) to determine which case applies. + +In both cases, itup->t_info & INDEX_SIZE_MASK contains actual total size of +tuple, and the INDEX_VAR_MASK and INDEX_NULL_MASK bits have their normal +meanings as set by index_form_tuple. + +Index tuples in non-leaf levels of the btree contain the optional column +number, key datum, and null category byte as above. They do not contain +a posting list. ItemPointerGetBlockNumber(&itup->t_tid) is the downlink +to the next lower btree level, and ItemPointerGetOffsetNumber(&itup->t_tid) +is InvalidOffsetNumber. Use the access macros GinGetDownlink/GinSetDownlink +to get/set the downlink. + +Index entries that appear in "pending list" pages work a tad differently as +well. The optional column number, key datum, and null category byte are as +for other GIN index entries. 
+for other GIN index entries.  However, there is always exactly one heap
+itempointer associated with a pending entry, and it is stored in the t_tid
+header field just as in non-GIN indexes.  There is no posting list.
+Furthermore, the code that searches the pending list assumes that all
+entries for a given heap tuple appear consecutively in the pending list and
+are sorted by the column-number-plus-key-datum.  The GIN_LIST_FULLROW page
+flag bit tells whether entries for a given heap tuple are spread across
+multiple pending-list pages.  If GIN_LIST_FULLROW is set, the page contains
+all the entries for one or more heap tuples.  If GIN_LIST_FULLROW is clear,
+the page contains entries for only one heap tuple, *and* they are not all
+the entries for that tuple.  (Thus, a heap tuple whose entries do not all
+fit on one pending-list page must have those pages to itself, even if this
+results in wasting much of the space on the preceding page and the last
+page for the tuple.)
+
+GIN packs downlinks and pivot keys into internal page tuples in a different
+way than nbtree does.  Lehman & Yao define the layout as follows.
+
+P_0, K_1, P_1, K_2, P_2, ... , K_n, P_n, K_{n+1}
+
+Here P_i is a downlink and K_i is a key.  K_i splits the key space between
+P_{i-1} and P_i (0 <= i <= n).  K_{n+1} is the high key.
+
+In an internal page tuple, a key and a downlink are grouped together.  nbtree
+packs keys and downlinks into tuples as follows.
+
+(K_{n+1}, None), (-Inf, P_0), (K_1, P_1), ... , (K_n, P_n)
+
+Here tuples are shown in parentheses.  So, the high key is stored separately.
+P_i is grouped with K_i.  P_0 is grouped with the -Inf key.
+
+GIN packs keys and downlinks into tuples in a different way.
+
+(P_0, K_1), (P_1, K_2), ... , (P_n, K_{n+1})
+
+P_i is grouped with K_{i+1}.  The -Inf key is not needed.
+
+There are a couple of additional notes regarding the K_{n+1} key.
+1) In the rightmost page of the entry tree, the key coupled with P_n doesn't
+really matter: the high key is assumed to be infinity.
+2) In a posting tree, the key coupled with P_n never matters: the high key
+for non-rightmost pages is stored separately and accessed via
+GinDataPageGetRightBound().
+
+Posting tree
+------------
+
+If a posting list is too large to store in-line in a key entry, a posting
+tree is created.  A posting tree is a B-tree structure, where the ItemPointer
+is used as the key.
+
+Internal posting tree pages use the standard PageHeader and the same "opaque"
+struct as other GIN pages, but do not contain regular index tuples.  Instead,
+the content of the page is an array of PostingItem structs.  Each PostingItem
+consists of the block number of the child page, and the right bound of that
+child page, as an ItemPointer.  The right bound of the page is stored right
+after the page header, before the PostingItem array.
+
+Posting tree leaf pages also use the standard PageHeader and opaque struct,
+and the right bound of the page is stored right after the page header, but
+the page content comprises a number of compressed posting lists.  The
+compressed posting lists are stored one after another, between the page
+header and pd_lower.  The space between pd_lower and pd_upper is unused,
+which allows full-page images of posting tree leaf pages to skip the unused
+space in the middle (buffer_std = true in XLogRecData).
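+
+As a rough illustration of the varbyte delta coding described under "Posting
+List Compression" below, here is a minimal, self-contained sketch; it is not
+the actual ginpostinglist.c code, just the general idea:
+
+    #include <stdint.h>
+    #include <stddef.h>
+
+    /* Pack (block, offset) into one integer: offset in the 11 low bits. */
+    static uint64_t
+    pack_item(uint32_t blkno, uint16_t offnum)
+    {
+        return ((uint64_t) blkno << 11) | (offnum & 0x7FF);
+    }
+
+    /*
+     * Emit the delta from the previous (smaller) packed item, 7 bits per
+     * byte; a set high bit means "another byte follows".  Returns the
+     * number of bytes written into out[].
+     */
+    static size_t
+    encode_delta(uint64_t prev, uint64_t cur, unsigned char *out)
+    {
+        uint64_t delta = cur - prev;    /* items are sorted ascending */
+        size_t   n = 0;
+
+        do
+        {
+            unsigned char b = delta & 0x7F;
+
+            delta >>= 7;
+            if (delta != 0)
+                b |= 0x80;
+            out[n++] = b;
+        } while (delta != 0);
+
+        return n;
+    }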
+
+The item pointers are stored in a number of independent compressed posting
+lists (also called segments), instead of one big one, to make random access
+to a given item pointer faster: to find an item in a compressed list, you
+have to read the list from the beginning, but when the items are split into
+multiple lists, you can first skip over to the list containing the item
+you're looking for, and read only that segment.  Also, an update only needs
+to re-encode the affected segment.
+
+Posting List Compression
+------------------------
+
+To fit as many item pointers on a page as possible, posting tree leaf pages
+and posting lists stored inline in entry tree leaf tuples use a lightweight
+form of compression.  We take advantage of the fact that the item pointers
+are stored in sorted order.  Instead of storing the block and offset number
+of each item pointer separately, we store the difference from the previous
+item.  That in itself doesn't do much, but it allows us to use so-called
+varbyte encoding to compress them.
+
+Varbyte encoding is a method to encode integers, allowing smaller numbers to
+take less space at the cost of larger numbers.  Each integer is represented
+by a variable number of bytes.  The high bit of each byte determines whether
+the next byte is still part of the same number.  Therefore, to read a single
+varbyte-encoded number, you have to read bytes until you find a byte with
+the high bit not set.
+
+When encoding, the block and offset number forming the item pointer are
+combined into a single integer.  The offset number is stored in the 11 low
+bits (see MaxHeapTuplesPerPageBits in ginpostinglist.c), and the block number
+is stored in the higher bits.  That requires 43 bits in total, which
+conveniently fits in at most 6 bytes.
+
+A compressed posting list is passed around and stored on disk in a
+GinPostingList struct.  The first item in the list is stored uncompressed
+as a regular ItemPointerData, followed by the length of the list in bytes,
+followed by the packed items.
+
+Concurrency
+-----------
+
+The entry tree and each posting tree are B-trees, with right-links connecting
+sibling pages at the same level.  This is the same structure that is used in
+the regular B-tree indexam (invented by Lehman & Yao), but we don't support
+scanning GIN trees backwards, so we don't need left-links.  The entry tree
+leaves don't have dedicated high keys; instead, the greatest leaf tuple
+serves as the high key.  That works because tuples are never deleted from
+the entry tree.
+
+The algorithms used to operate on entry and posting trees are described
+below.
+
+### Locating the leaf page
+
+When we search for a leaf page in a GIN btree to perform a read, we descend
+from the root page to the leaf following downlinks, holding a pin and shared
+lock on one page at a time.  That is, we release the pin and shared lock on
+the previous page before acquiring them on the next page.
+
+The picture below shows the tree state after finding the leaf page.  Lower
+case letters depict tree pages.  'S' depicts a shared lock on the page.
+
+        a
+    /   |   \
+   b    c    d
+ / | \  | \  | \
+eS f g  h i  j k
+
+### Stepping right
+
+Concurrent page splits move the keyspace to the right, so after following a
+downlink, the page actually containing the key we're looking for might be
+somewhere to the right of the page we landed on.  In that case, we follow
+the right-links until we find the page we're looking for.
+
+During stepping right we take a pin and shared lock on the right sibling
+before releasing them from the current page.
+This mechanism was designed to protect us from stepping onto a deleted page.
+We step to the right sibling while still holding the lock on the page whose
+rightlink points there, so it is guaranteed that nobody updates that
+rightlink concurrently and hence nobody can delete the right sibling under
+us.
+
+The picture below shows two pages locked at once during stepping right.
+
+        a
+    /   |   \
+   b    c    d
+ / | \  | \  | \
+eS fS g h i  j k
+
+### Insert
+
+While finding the appropriate leaf for an insertion we also descend from the
+root to the leaf, shared-locking one page at a time.  But during insertion we
+don't release the pins on the root and internal pages.  That can save us some
+lookups in the buffer hash table when inserting downlinks, assuming the
+parents have not been changed by concurrent splits.  Once we reach the leaf
+we re-lock the page in exclusive mode.
+
+The picture below shows the leaf page locked in exclusive mode and ready for
+insertion.  'P' and 'E' depict a pin and an exclusive lock, respectively.
+
+        aP
+    /   |   \
+   b    cP   d
+ / | \  | \  | \
+ e f g  hE i j k
+
+If the insert causes a page split, the parent is locked in exclusive mode
+before the left child is unlocked.  So, the insertion algorithm can hold
+exclusive locks on both parent and child pages at once, starting from the
+child.
+
+The picture below shows the tree state after a leaf page split.  'q' is the
+new page produced by the split.  Parent 'c' is about to have a downlink
+inserted.
+
+        aP
+    /   |    \
+   b    cE     d
+ / | \  / | \  | \
+ e f g hE q i  j k
+
+### Page deletion
+
+Vacuum never deletes tuples or pages from the entry tree.  It traverses the
+entry tree leaves in logical order by rightlinks and removes deletable TIDs
+from posting lists.  Posting trees are reached via links from the entry tree
+leaves, and are vacuumed in two stages.  In the first stage, deletable TIDs
+are removed from the leaves.  If the first stage detects at least one empty
+page, then in the second stage ginScanToDelete() deletes the empty pages.
+
+ginScanToDelete() traverses the whole tree in depth-first manner.  It starts
+by taking a super-exclusive lock on the tree root.  This lock prevents all
+concurrent insertions into this tree while we're deleting pages.  However,
+there might still be some in-progress readers that traversed the root before
+we locked it.
+
+The picture below shows the tree state after the page deletion algorithm has
+traversed to the leftmost leaf of the tree.
+
+        aE
+    /   |   \
+  bE    c    d
+ / | \  | \  | \
+eE f g  h i  j k
+
+The deletion algorithm keeps exclusive locks on the left siblings of the
+pages comprising the currently investigated path.  Thus, if the current page
+is to be removed, all the pages required to remove both its downlink and its
+rightlink are already locked.  That avoids a potential right-to-left page
+locking order, which could deadlock with concurrent stepping right.
+
+A search concurrent to page deletion might already have read a pointer to
+the page to be deleted, and might be just about to follow it.  A page can be
+reached via the right-link of its left sibling, or via its downlink in the
+parent.
+
+To prevent a backend from reaching a deleted page via a right-link, the
+stepping-right algorithm doesn't release the lock on the current page until
+the lock on the right page has been acquired.
+
+The downlink is more tricky.  A search descending the tree must release the
+lock on the parent page before locking the child, or it could deadlock with
+a concurrent split of the child page; a page split locks the parent while
+already holding a lock on the child page.  So, a deleted page cannot be
+reclaimed immediately.
+Instead, we have to wait for every transaction that might want to reference
+this page to finish.  The corresponding processes must observe that the page
+is marked deleted and recover accordingly.
+
+The picture below shows the tree state after the page deletion algorithm has
+traversed further into the tree.  The currently investigated path is 'a-c-h'.
+The left siblings 'b' and 'g' of 'c' and 'h', respectively, are also
+exclusively locked.
+
+        aE
+    /   |   \
+  bE    cE   d
+ / | \  | \  | \
+ e f gE hE i j k
+
+The next picture shows the tree state after page 'h' has been deleted.  It is
+marked with the 'deleted' flag and the newest xid that might still visit it.
+The downlink from 'c' to 'h' is also deleted.
+
+        aE
+    /   |   \
+  bE    cE   d
+ / | \    \  | \
+ e f gE hD iE j k
+
+However, it is still possible that a concurrent reader saw the downlink from
+'c' to 'h' before we deleted it.  In that case the reader will step right
+from 'h' until it finds a non-deleted page.  The xid marking of page 'h'
+guarantees that the page won't be reused until all such readers are gone.
+The next leaf page under investigation is 'i'.  'g' remains locked, as it
+becomes the left sibling of 'i'.
+
+The next picture shows the tree state after 'i' and 'c' have been deleted.
+Internal page 'c' was deleted because it turned out to have no downlinks.
+The path under investigation is 'a-d-j'.  Pages 'b' and 'g' are locked as
+the left siblings of 'd' and 'j'.
+
+        aE
+    /       \
+  bE    cD   dE
+ / | \       | \
+ e f gE hD iD jE k
+
+During replay of page deletion on a standby, the page's left sibling, the
+target page, and its parent are locked in that order.  This order guarantees
+no deadlock with concurrent reads.
+
+Predicate Locking
+-----------------
+
+GIN supports predicate locking, for serializable snapshot isolation.
+A predicate lock represents that a scan has scanned a range of values.
+Predicate locks are not concerned with physical pages as such, but with the
+logical key values.  A predicate lock on a page covers the key range that
+would belong on that page, whether or not there are any matching tuples
+there currently.  In other words, a predicate lock on an index page covers
+the "gaps" between the index tuples.  To minimize false positives, predicate
+locks are acquired at the finest level possible.
+
+* As in the B-tree index, it is enough to lock only leaf pages, because all
+  insertions happen at the leaf level.
+
+* In an equality search (i.e. not a partial match search), if a key entry has
+  a posting tree, we lock the posting tree root page, to represent a lock on
+  just that key entry.  Otherwise, we lock the entry tree page.  We also lock
+  the entry tree page if no match is found, to lock the "gap" where the entry
+  would've been, had there been one.
+
+* In a partial match search, we lock all the entry leaf pages that we scan,
+  in addition to locks on posting tree roots, to represent the "gaps" between
+  values.
+
+* In addition to the locks on entry leaf pages and posting tree roots, all
+  scans grab a lock on the metapage.  This is to interlock with insertions to
+  the fast update pending list.  An insertion to the pending list can really
+  belong anywhere in the tree, and the lock on the metapage represents that.
+
+The interlock for fastupdate pending lists means that with fastupdate=on,
+we effectively always grab a full-index lock, so you could get a lot of
+false positives.
+
+Compatibility
+-------------
+
+Compression of TIDs was introduced in 9.4.  Some GIN indexes could remain in
+uncompressed format because of pg_upgrade from 9.3 or earlier versions.
+For compatibility, the old uncompressed format is also supported.
Following +rules are used to handle it: + +* GIN_ITUP_COMPRESSED flag marks index tuples that contain a posting list. +This flag is stored in high bit of ItemPointerGetBlockNumber(&itup->t_tid). +Use GinItupIsCompressed(itup) to check the flag. + +* Posting tree pages in the new format are marked with the GIN_COMPRESSED flag. + Macros GinPageIsCompressed(page) and GinPageSetCompressed(page) are used to + check and set this flag. + +* All scan operations check format of posting list add use corresponding code +to read its content. + +* When updating an index tuple containing an uncompressed posting list, it +will be replaced with new index tuple containing a compressed list. + +* When updating an uncompressed posting tree leaf page, it's compressed. + +* If vacuum finds some dead TIDs in uncompressed posting lists, they are +converted into compressed posting lists. This assumes that the compressed +posting list fits in the space occupied by the uncompressed list. IOW, we +assume that the compressed version of the page, with the dead items removed, +takes less space than the old uncompressed version. + +Limitations +----------- + + * Gin doesn't use scan->kill_prior_tuple & scan->ignore_killed_tuples + * Gin searches entries only by equality matching, or simple range + matching using the "partial match" feature. + +TODO +---- + +Nearest future: + + * Opclasses for more types (no programming, just many catalog changes) + +Distant future: + + * Replace B-tree of entries to something like GiST + +Authors +------- + +Original work was done by Teodor Sigaev (teodor@sigaev.ru) and Oleg Bartunov +(oleg@sai.msu.su). diff --git a/src/backend/access/gin/ginarrayproc.c b/src/backend/access/gin/ginarrayproc.c new file mode 100644 index 0000000..bf73e32 --- /dev/null +++ b/src/backend/access/gin/ginarrayproc.c @@ -0,0 +1,305 @@ +/*------------------------------------------------------------------------- + * + * ginarrayproc.c + * support functions for GIN's indexing of any array + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/ginarrayproc.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/gin.h" +#include "access/stratnum.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" + + +#define GinOverlapStrategy 1 +#define GinContainsStrategy 2 +#define GinContainedStrategy 3 +#define GinEqualStrategy 4 + + +/* + * extractValue support function + */ +Datum +ginarrayextract(PG_FUNCTION_ARGS) +{ + /* Make copy of array input to ensure it doesn't disappear while in use */ + ArrayType *array = PG_GETARG_ARRAYTYPE_P_COPY(0); + int32 *nkeys = (int32 *) PG_GETARG_POINTER(1); + bool **nullFlags = (bool **) PG_GETARG_POINTER(2); + int16 elmlen; + bool elmbyval; + char elmalign; + Datum *elems; + bool *nulls; + int nelems; + + get_typlenbyvalalign(ARR_ELEMTYPE(array), + &elmlen, &elmbyval, &elmalign); + + deconstruct_array(array, + ARR_ELEMTYPE(array), + elmlen, elmbyval, elmalign, + &elems, &nulls, &nelems); + + *nkeys = nelems; + *nullFlags = nulls; + + /* we should not free array, elems[i] points into it */ + PG_RETURN_POINTER(elems); +} + +/* + * Formerly, ginarrayextract had only two arguments. Now it has three, + * but we still need a pg_proc entry with two args to support reloading + * pre-9.1 contrib/intarray opclass declarations. 
This compatibility + * function should go away eventually. + */ +Datum +ginarrayextract_2args(PG_FUNCTION_ARGS) +{ + if (PG_NARGS() < 3) /* should not happen */ + elog(ERROR, "ginarrayextract requires three arguments"); + return ginarrayextract(fcinfo); +} + +/* + * extractQuery support function + */ +Datum +ginqueryarrayextract(PG_FUNCTION_ARGS) +{ + /* Make copy of array input to ensure it doesn't disappear while in use */ + ArrayType *array = PG_GETARG_ARRAYTYPE_P_COPY(0); + int32 *nkeys = (int32 *) PG_GETARG_POINTER(1); + StrategyNumber strategy = PG_GETARG_UINT16(2); + + /* bool **pmatch = (bool **) PG_GETARG_POINTER(3); */ + /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */ + bool **nullFlags = (bool **) PG_GETARG_POINTER(5); + int32 *searchMode = (int32 *) PG_GETARG_POINTER(6); + int16 elmlen; + bool elmbyval; + char elmalign; + Datum *elems; + bool *nulls; + int nelems; + + get_typlenbyvalalign(ARR_ELEMTYPE(array), + &elmlen, &elmbyval, &elmalign); + + deconstruct_array(array, + ARR_ELEMTYPE(array), + elmlen, elmbyval, elmalign, + &elems, &nulls, &nelems); + + *nkeys = nelems; + *nullFlags = nulls; + + switch (strategy) + { + case GinOverlapStrategy: + *searchMode = GIN_SEARCH_MODE_DEFAULT; + break; + case GinContainsStrategy: + if (nelems > 0) + *searchMode = GIN_SEARCH_MODE_DEFAULT; + else /* everything contains the empty set */ + *searchMode = GIN_SEARCH_MODE_ALL; + break; + case GinContainedStrategy: + /* empty set is contained in everything */ + *searchMode = GIN_SEARCH_MODE_INCLUDE_EMPTY; + break; + case GinEqualStrategy: + if (nelems > 0) + *searchMode = GIN_SEARCH_MODE_DEFAULT; + else + *searchMode = GIN_SEARCH_MODE_INCLUDE_EMPTY; + break; + default: + elog(ERROR, "ginqueryarrayextract: unknown strategy number: %d", + strategy); + } + + /* we should not free array, elems[i] points into it */ + PG_RETURN_POINTER(elems); +} + +/* + * consistent support function + */ +Datum +ginarrayconsistent(PG_FUNCTION_ARGS) +{ + bool *check = (bool *) PG_GETARG_POINTER(0); + StrategyNumber strategy = PG_GETARG_UINT16(1); + + /* ArrayType *query = PG_GETARG_ARRAYTYPE_P(2); */ + int32 nkeys = PG_GETARG_INT32(3); + + /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */ + bool *recheck = (bool *) PG_GETARG_POINTER(5); + + /* Datum *queryKeys = (Datum *) PG_GETARG_POINTER(6); */ + bool *nullFlags = (bool *) PG_GETARG_POINTER(7); + bool res; + int32 i; + + switch (strategy) + { + case GinOverlapStrategy: + /* result is not lossy */ + *recheck = false; + /* must have a match for at least one non-null element */ + res = false; + for (i = 0; i < nkeys; i++) + { + if (check[i] && !nullFlags[i]) + { + res = true; + break; + } + } + break; + case GinContainsStrategy: + /* result is not lossy */ + *recheck = false; + /* must have all elements in check[] true, and no nulls */ + res = true; + for (i = 0; i < nkeys; i++) + { + if (!check[i] || nullFlags[i]) + { + res = false; + break; + } + } + break; + case GinContainedStrategy: + /* we will need recheck */ + *recheck = true; + /* can't do anything else useful here */ + res = true; + break; + case GinEqualStrategy: + /* we will need recheck */ + *recheck = true; + + /* + * Must have all elements in check[] true; no discrimination + * against nulls here. This is because array_contain_compare and + * array_eq handle nulls differently ... 
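+			 * Since *recheck was set above, the executor will re-evaluate
+			 * the original equality operator against the heap tuple, so a
+			 * lossy answer here is safe.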
+ */ + res = true; + for (i = 0; i < nkeys; i++) + { + if (!check[i]) + { + res = false; + break; + } + } + break; + default: + elog(ERROR, "ginarrayconsistent: unknown strategy number: %d", + strategy); + res = false; + } + + PG_RETURN_BOOL(res); +} + +/* + * triconsistent support function + */ +Datum +ginarraytriconsistent(PG_FUNCTION_ARGS) +{ + GinTernaryValue *check = (GinTernaryValue *) PG_GETARG_POINTER(0); + StrategyNumber strategy = PG_GETARG_UINT16(1); + + /* ArrayType *query = PG_GETARG_ARRAYTYPE_P(2); */ + int32 nkeys = PG_GETARG_INT32(3); + + /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */ + /* Datum *queryKeys = (Datum *) PG_GETARG_POINTER(5); */ + bool *nullFlags = (bool *) PG_GETARG_POINTER(6); + GinTernaryValue res; + int32 i; + + switch (strategy) + { + case GinOverlapStrategy: + /* must have a match for at least one non-null element */ + res = GIN_FALSE; + for (i = 0; i < nkeys; i++) + { + if (!nullFlags[i]) + { + if (check[i] == GIN_TRUE) + { + res = GIN_TRUE; + break; + } + else if (check[i] == GIN_MAYBE && res == GIN_FALSE) + { + res = GIN_MAYBE; + } + } + } + break; + case GinContainsStrategy: + /* must have all elements in check[] true, and no nulls */ + res = GIN_TRUE; + for (i = 0; i < nkeys; i++) + { + if (check[i] == GIN_FALSE || nullFlags[i]) + { + res = GIN_FALSE; + break; + } + if (check[i] == GIN_MAYBE) + { + res = GIN_MAYBE; + } + } + break; + case GinContainedStrategy: + /* can't do anything else useful here */ + res = GIN_MAYBE; + break; + case GinEqualStrategy: + + /* + * Must have all elements in check[] true; no discrimination + * against nulls here. This is because array_contain_compare and + * array_eq handle nulls differently ... + */ + res = GIN_MAYBE; + for (i = 0; i < nkeys; i++) + { + if (check[i] == GIN_FALSE) + { + res = GIN_FALSE; + break; + } + } + break; + default: + elog(ERROR, "ginarrayconsistent: unknown strategy number: %d", + strategy); + res = false; + } + + PG_RETURN_GIN_TERNARY_VALUE(res); +} diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c new file mode 100644 index 0000000..482cf10 --- /dev/null +++ b/src/backend/access/gin/ginbtree.c @@ -0,0 +1,795 @@ +/*------------------------------------------------------------------------- + * + * ginbtree.c + * page utilities routines for the postgres inverted index access method. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/ginbtree.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/gin_private.h" +#include "access/ginxlog.h" +#include "access/xloginsert.h" +#include "miscadmin.h" +#include "storage/predicate.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +static void ginFindParents(GinBtree btree, GinBtreeStack *stack); +static bool ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + Buffer childbuf, GinStatsData *buildStats); +static void ginFinishSplit(GinBtree btree, GinBtreeStack *stack, + bool freestack, GinStatsData *buildStats); + +/* + * Lock buffer by needed method for search. 
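+ *
+ * Returns the lock mode actually acquired: GIN_SHARE in search mode and for
+ * internal pages, GIN_EXCLUSIVE for a leaf page that is about to be
+ * modified.  If the root stops being a leaf while the lock is being upgraded
+ * (a rare race), the share lock is restored and GIN_SHARE is returned.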
+ */ +int +ginTraverseLock(Buffer buffer, bool searchMode) +{ + Page page; + int access = GIN_SHARE; + + LockBuffer(buffer, GIN_SHARE); + page = BufferGetPage(buffer); + if (GinPageIsLeaf(page)) + { + if (searchMode == false) + { + /* we should relock our page */ + LockBuffer(buffer, GIN_UNLOCK); + LockBuffer(buffer, GIN_EXCLUSIVE); + + /* But root can become non-leaf during relock */ + if (!GinPageIsLeaf(page)) + { + /* restore old lock type (very rare) */ + LockBuffer(buffer, GIN_UNLOCK); + LockBuffer(buffer, GIN_SHARE); + } + else + access = GIN_EXCLUSIVE; + } + } + + return access; +} + +/* + * Descend the tree to the leaf page that contains or would contain the key + * we're searching for. The key should already be filled in 'btree', in + * tree-type specific manner. If btree->fullScan is true, descends to the + * leftmost leaf page. + * + * If 'searchmode' is false, on return stack->buffer is exclusively locked, + * and the stack represents the full path to the root. Otherwise stack->buffer + * is share-locked, and stack->parent is NULL. + * + * If 'rootConflictCheck' is true, tree root is checked for serialization + * conflict. + */ +GinBtreeStack * +ginFindLeafPage(GinBtree btree, bool searchMode, + bool rootConflictCheck, Snapshot snapshot) +{ + GinBtreeStack *stack; + + stack = (GinBtreeStack *) palloc(sizeof(GinBtreeStack)); + stack->blkno = btree->rootBlkno; + stack->buffer = ReadBuffer(btree->index, btree->rootBlkno); + stack->parent = NULL; + stack->predictNumber = 1; + + if (rootConflictCheck) + CheckForSerializableConflictIn(btree->index, NULL, btree->rootBlkno); + + for (;;) + { + Page page; + BlockNumber child; + int access; + + stack->off = InvalidOffsetNumber; + + page = BufferGetPage(stack->buffer); + TestForOldSnapshot(snapshot, btree->index, page); + + access = ginTraverseLock(stack->buffer, searchMode); + + /* + * If we're going to modify the tree, finish any incomplete splits we + * encounter on the way. + */ + if (!searchMode && GinPageIsIncompleteSplit(page)) + ginFinishSplit(btree, stack, false, NULL); + + /* + * ok, page is correctly locked, we should check to move right .., + * root never has a right link, so small optimization + */ + while (btree->fullScan == false && stack->blkno != btree->rootBlkno && + btree->isMoveRight(btree, page)) + { + BlockNumber rightlink = GinPageGetOpaque(page)->rightlink; + + if (rightlink == InvalidBlockNumber) + /* rightmost page */ + break; + + stack->buffer = ginStepRight(stack->buffer, btree->index, access); + stack->blkno = rightlink; + page = BufferGetPage(stack->buffer); + TestForOldSnapshot(snapshot, btree->index, page); + + if (!searchMode && GinPageIsIncompleteSplit(page)) + ginFinishSplit(btree, stack, false, NULL); + } + + if (GinPageIsLeaf(page)) /* we found, return locked page */ + return stack; + + /* now we have correct buffer, try to find child */ + child = btree->findChildPage(btree, stack); + + LockBuffer(stack->buffer, GIN_UNLOCK); + Assert(child != InvalidBlockNumber); + Assert(stack->blkno != child); + + if (searchMode) + { + /* in search mode we may forget path to leaf */ + stack->blkno = child; + stack->buffer = ReleaseAndReadBuffer(stack->buffer, btree->index, stack->blkno); + } + else + { + GinBtreeStack *ptr = (GinBtreeStack *) palloc(sizeof(GinBtreeStack)); + + ptr->parent = stack; + stack = ptr; + stack->blkno = child; + stack->buffer = ReadBuffer(btree->index, stack->blkno); + stack->predictNumber = 1; + } + } +} + +/* + * Step right from current page. 
+ * + * The next page is locked first, before releasing the current page. This is + * crucial to protect from concurrent page deletion (see comment in + * ginDeletePage). + */ +Buffer +ginStepRight(Buffer buffer, Relation index, int lockmode) +{ + Buffer nextbuffer; + Page page = BufferGetPage(buffer); + bool isLeaf = GinPageIsLeaf(page); + bool isData = GinPageIsData(page); + BlockNumber blkno = GinPageGetOpaque(page)->rightlink; + + nextbuffer = ReadBuffer(index, blkno); + LockBuffer(nextbuffer, lockmode); + UnlockReleaseBuffer(buffer); + + /* Sanity check that the page we stepped to is of similar kind. */ + page = BufferGetPage(nextbuffer); + if (isLeaf != GinPageIsLeaf(page) || isData != GinPageIsData(page)) + elog(ERROR, "right sibling of GIN page is of different type"); + + return nextbuffer; +} + +void +freeGinBtreeStack(GinBtreeStack *stack) +{ + while (stack) + { + GinBtreeStack *tmp = stack->parent; + + if (stack->buffer != InvalidBuffer) + ReleaseBuffer(stack->buffer); + + pfree(stack); + stack = tmp; + } +} + +/* + * Try to find parent for current stack position. Returns correct parent and + * child's offset in stack->parent. The root page is never released, to + * prevent conflict with vacuum process. + */ +static void +ginFindParents(GinBtree btree, GinBtreeStack *stack) +{ + Page page; + Buffer buffer; + BlockNumber blkno, + leftmostBlkno; + OffsetNumber offset; + GinBtreeStack *root; + GinBtreeStack *ptr; + + /* + * Unwind the stack all the way up to the root, leaving only the root + * item. + * + * Be careful not to release the pin on the root page! The pin on root + * page is required to lock out concurrent vacuums on the tree. + */ + root = stack->parent; + while (root->parent) + { + ReleaseBuffer(root->buffer); + root = root->parent; + } + + Assert(root->blkno == btree->rootBlkno); + Assert(BufferGetBlockNumber(root->buffer) == btree->rootBlkno); + root->off = InvalidOffsetNumber; + + blkno = root->blkno; + buffer = root->buffer; + + ptr = (GinBtreeStack *) palloc(sizeof(GinBtreeStack)); + + for (;;) + { + LockBuffer(buffer, GIN_EXCLUSIVE); + page = BufferGetPage(buffer); + if (GinPageIsLeaf(page)) + elog(ERROR, "Lost path"); + + if (GinPageIsIncompleteSplit(page)) + { + Assert(blkno != btree->rootBlkno); + ptr->blkno = blkno; + ptr->buffer = buffer; + + /* + * parent may be wrong, but if so, the ginFinishSplit call will + * recurse to call ginFindParents again to fix it. 
+ */ + ptr->parent = root; + ptr->off = InvalidOffsetNumber; + + ginFinishSplit(btree, ptr, false, NULL); + } + + leftmostBlkno = btree->getLeftMostChild(btree, page); + + while ((offset = btree->findChildPtr(btree, page, stack->blkno, InvalidOffsetNumber)) == InvalidOffsetNumber) + { + blkno = GinPageGetOpaque(page)->rightlink; + if (blkno == InvalidBlockNumber) + { + UnlockReleaseBuffer(buffer); + break; + } + buffer = ginStepRight(buffer, btree->index, GIN_EXCLUSIVE); + page = BufferGetPage(buffer); + + /* finish any incomplete splits, as above */ + if (GinPageIsIncompleteSplit(page)) + { + Assert(blkno != btree->rootBlkno); + ptr->blkno = blkno; + ptr->buffer = buffer; + ptr->parent = root; + ptr->off = InvalidOffsetNumber; + + ginFinishSplit(btree, ptr, false, NULL); + } + } + + if (blkno != InvalidBlockNumber) + { + ptr->blkno = blkno; + ptr->buffer = buffer; + ptr->parent = root; /* it may be wrong, but in next call we will + * correct */ + ptr->off = offset; + stack->parent = ptr; + return; + } + + /* Descend down to next level */ + blkno = leftmostBlkno; + buffer = ReadBuffer(btree->index, blkno); + } +} + +/* + * Insert a new item to a page. + * + * Returns true if the insertion was finished. On false, the page was split and + * the parent needs to be updated. (A root split returns true as it doesn't + * need any further action by the caller to complete.) + * + * When inserting a downlink to an internal page, 'childbuf' contains the + * child page that was split. Its GIN_INCOMPLETE_SPLIT flag will be cleared + * atomically with the insert. Also, the existing item at offset stack->off + * in the target page is updated to point to updateblkno. + * + * stack->buffer is locked on entry, and is kept locked. + * Likewise for childbuf, if given. + */ +static bool +ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + Buffer childbuf, GinStatsData *buildStats) +{ + Page page = BufferGetPage(stack->buffer); + bool result; + GinPlaceToPageRC rc; + uint16 xlflags = 0; + Page childpage = NULL; + Page newlpage = NULL, + newrpage = NULL; + void *ptp_workspace = NULL; + MemoryContext tmpCxt; + MemoryContext oldCxt; + + /* + * We do all the work of this function and its subfunctions in a temporary + * memory context. This avoids leakages and simplifies APIs, since some + * subfunctions allocate storage that has to survive until we've finished + * the WAL insertion. + */ + tmpCxt = AllocSetContextCreate(CurrentMemoryContext, + "ginPlaceToPage temporary context", + ALLOCSET_DEFAULT_SIZES); + oldCxt = MemoryContextSwitchTo(tmpCxt); + + if (GinPageIsData(page)) + xlflags |= GIN_INSERT_ISDATA; + if (GinPageIsLeaf(page)) + { + xlflags |= GIN_INSERT_ISLEAF; + Assert(!BufferIsValid(childbuf)); + Assert(updateblkno == InvalidBlockNumber); + } + else + { + Assert(BufferIsValid(childbuf)); + Assert(updateblkno != InvalidBlockNumber); + childpage = BufferGetPage(childbuf); + } + + /* + * See if the incoming tuple will fit on the page. beginPlaceToPage will + * decide if the page needs to be split, and will compute the split + * contents if so. See comments for beginPlaceToPage and execPlaceToPage + * functions for more details of the API here. 
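+	 *
+	 * The result codes handled below are: GPTP_NO_WORK (nothing to do at
+	 * all), GPTP_INSERT (the change fits on the page and is applied inside
+	 * a critical section, with a WAL record if needed), and GPTP_SPLIT
+	 * (newlpage and newrpage contain the split result, which is copied over
+	 * the real buffers and WAL-logged as full-page images).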
+ */ + rc = btree->beginPlaceToPage(btree, stack->buffer, stack, + insertdata, updateblkno, + &ptp_workspace, + &newlpage, &newrpage); + + if (rc == GPTP_NO_WORK) + { + /* Nothing to do */ + result = true; + } + else if (rc == GPTP_INSERT) + { + /* It will fit, perform the insertion */ + START_CRIT_SECTION(); + + if (RelationNeedsWAL(btree->index) && !btree->isBuild) + { + XLogBeginInsert(); + XLogRegisterBuffer(0, stack->buffer, REGBUF_STANDARD); + if (BufferIsValid(childbuf)) + XLogRegisterBuffer(1, childbuf, REGBUF_STANDARD); + } + + /* Perform the page update, and register any extra WAL data */ + btree->execPlaceToPage(btree, stack->buffer, stack, + insertdata, updateblkno, ptp_workspace); + + MarkBufferDirty(stack->buffer); + + /* An insert to an internal page finishes the split of the child. */ + if (BufferIsValid(childbuf)) + { + GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT; + MarkBufferDirty(childbuf); + } + + if (RelationNeedsWAL(btree->index) && !btree->isBuild) + { + XLogRecPtr recptr; + ginxlogInsert xlrec; + BlockIdData childblknos[2]; + + xlrec.flags = xlflags; + + XLogRegisterData((char *) &xlrec, sizeof(ginxlogInsert)); + + /* + * Log information about child if this was an insertion of a + * downlink. + */ + if (BufferIsValid(childbuf)) + { + BlockIdSet(&childblknos[0], BufferGetBlockNumber(childbuf)); + BlockIdSet(&childblknos[1], GinPageGetOpaque(childpage)->rightlink); + XLogRegisterData((char *) childblknos, + sizeof(BlockIdData) * 2); + } + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT); + PageSetLSN(page, recptr); + if (BufferIsValid(childbuf)) + PageSetLSN(childpage, recptr); + } + + END_CRIT_SECTION(); + + /* Insertion is complete. */ + result = true; + } + else if (rc == GPTP_SPLIT) + { + /* + * Didn't fit, need to split. The split has been computed in newlpage + * and newrpage, which are pointers to palloc'd pages, not associated + * with buffers. stack->buffer is not touched yet. + */ + Buffer rbuffer; + BlockNumber savedRightLink; + ginxlogSplit data; + Buffer lbuffer = InvalidBuffer; + Page newrootpg = NULL; + + /* Get a new index page to become the right page */ + rbuffer = GinNewBuffer(btree->index); + + /* During index build, count the new page */ + if (buildStats) + { + if (btree->isData) + buildStats->nDataPages++; + else + buildStats->nEntryPages++; + } + + savedRightLink = GinPageGetOpaque(page)->rightlink; + + /* Begin setting up WAL record */ + data.node = btree->index->rd_node; + data.flags = xlflags; + if (BufferIsValid(childbuf)) + { + data.leftChildBlkno = BufferGetBlockNumber(childbuf); + data.rightChildBlkno = GinPageGetOpaque(childpage)->rightlink; + } + else + data.leftChildBlkno = data.rightChildBlkno = InvalidBlockNumber; + + if (stack->parent == NULL) + { + /* + * splitting the root, so we need to allocate new left page and + * place pointers to left and right page on root page. + */ + lbuffer = GinNewBuffer(btree->index); + + /* During index build, count the new left page */ + if (buildStats) + { + if (btree->isData) + buildStats->nDataPages++; + else + buildStats->nEntryPages++; + } + + data.rrlink = InvalidBlockNumber; + data.flags |= GIN_SPLIT_ROOT; + + GinPageGetOpaque(newrpage)->rightlink = InvalidBlockNumber; + GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer); + + /* + * Construct a new root page containing downlinks to the new left + * and right pages. (Do this in a temporary copy rather than + * overwriting the original page directly, since we're not in the + * critical section yet.) 
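+			 * The temporary root image is memcpy'd over the real root page
+			 * inside the critical section below.  Note that the new root
+			 * keeps the left child page's flags except GIN_LEAF and
+			 * GIN_COMPRESSED, since it is now an internal page.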
+ */ + newrootpg = PageGetTempPage(newrpage); + GinInitPage(newrootpg, GinPageGetOpaque(newlpage)->flags & ~(GIN_LEAF | GIN_COMPRESSED), BLCKSZ); + + btree->fillRoot(btree, newrootpg, + BufferGetBlockNumber(lbuffer), newlpage, + BufferGetBlockNumber(rbuffer), newrpage); + + if (GinPageIsLeaf(BufferGetPage(stack->buffer))) + { + + PredicateLockPageSplit(btree->index, + BufferGetBlockNumber(stack->buffer), + BufferGetBlockNumber(lbuffer)); + + PredicateLockPageSplit(btree->index, + BufferGetBlockNumber(stack->buffer), + BufferGetBlockNumber(rbuffer)); + } + + } + else + { + /* splitting a non-root page */ + data.rrlink = savedRightLink; + + GinPageGetOpaque(newrpage)->rightlink = savedRightLink; + GinPageGetOpaque(newlpage)->flags |= GIN_INCOMPLETE_SPLIT; + GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer); + + if (GinPageIsLeaf(BufferGetPage(stack->buffer))) + { + + PredicateLockPageSplit(btree->index, + BufferGetBlockNumber(stack->buffer), + BufferGetBlockNumber(rbuffer)); + } + } + + /* + * OK, we have the new contents of the left page in a temporary copy + * now (newlpage), and likewise for the new contents of the + * newly-allocated right block. The original page is still unchanged. + * + * If this is a root split, we also have a temporary page containing + * the new contents of the root. + */ + + START_CRIT_SECTION(); + + MarkBufferDirty(rbuffer); + MarkBufferDirty(stack->buffer); + + /* + * Restore the temporary copies over the real buffers. + */ + if (stack->parent == NULL) + { + /* Splitting the root, three pages to update */ + MarkBufferDirty(lbuffer); + memcpy(page, newrootpg, BLCKSZ); + memcpy(BufferGetPage(lbuffer), newlpage, BLCKSZ); + memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ); + } + else + { + /* Normal split, only two pages to update */ + memcpy(page, newlpage, BLCKSZ); + memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ); + } + + /* We also clear childbuf's INCOMPLETE_SPLIT flag, if passed */ + if (BufferIsValid(childbuf)) + { + GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT; + MarkBufferDirty(childbuf); + } + + /* write WAL record */ + if (RelationNeedsWAL(btree->index) && !btree->isBuild) + { + XLogRecPtr recptr; + + XLogBeginInsert(); + + /* + * We just take full page images of all the split pages. Splits + * are uncommon enough that it's not worth complicating the code + * to be more efficient. + */ + if (stack->parent == NULL) + { + XLogRegisterBuffer(0, lbuffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + XLogRegisterBuffer(1, rbuffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + XLogRegisterBuffer(2, stack->buffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + } + else + { + XLogRegisterBuffer(0, stack->buffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + XLogRegisterBuffer(1, rbuffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + } + if (BufferIsValid(childbuf)) + XLogRegisterBuffer(3, childbuf, REGBUF_STANDARD); + + XLogRegisterData((char *) &data, sizeof(ginxlogSplit)); + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT); + + PageSetLSN(page, recptr); + PageSetLSN(BufferGetPage(rbuffer), recptr); + if (stack->parent == NULL) + PageSetLSN(BufferGetPage(lbuffer), recptr); + if (BufferIsValid(childbuf)) + PageSetLSN(childpage, recptr); + } + END_CRIT_SECTION(); + + /* + * We can release the locks/pins on the new pages now, but keep + * stack->buffer locked. childbuf doesn't get unlocked either. + */ + UnlockReleaseBuffer(rbuffer); + if (stack->parent == NULL) + UnlockReleaseBuffer(lbuffer); + + /* + * If we split the root, we're done. 
Otherwise the split is not + * complete until the downlink for the new page has been inserted to + * the parent. + */ + result = (stack->parent == NULL); + } + else + { + elog(ERROR, "invalid return code from GIN beginPlaceToPage method: %d", rc); + result = false; /* keep compiler quiet */ + } + + /* Clean up temp context */ + MemoryContextSwitchTo(oldCxt); + MemoryContextDelete(tmpCxt); + + return result; +} + +/* + * Finish a split by inserting the downlink for the new page to parent. + * + * On entry, stack->buffer is exclusively locked. + * + * If freestack is true, all the buffers are released and unlocked as we + * crawl up the tree, and 'stack' is freed. Otherwise stack->buffer is kept + * locked, and stack is unmodified, except for possibly moving right to find + * the correct parent of page. + */ +static void +ginFinishSplit(GinBtree btree, GinBtreeStack *stack, bool freestack, + GinStatsData *buildStats) +{ + Page page; + bool done; + bool first = true; + + /* + * freestack == false when we encounter an incompletely split page during + * a scan, while freestack == true is used in the normal scenario that a + * split is finished right after the initial insert. + */ + if (!freestack) + elog(DEBUG1, "finishing incomplete split of block %u in gin index \"%s\"", + stack->blkno, RelationGetRelationName(btree->index)); + + /* this loop crawls up the stack until the insertion is complete */ + do + { + GinBtreeStack *parent = stack->parent; + void *insertdata; + BlockNumber updateblkno; + + /* search parent to lock */ + LockBuffer(parent->buffer, GIN_EXCLUSIVE); + + /* + * If the parent page was incompletely split, finish that split first, + * then continue with the current one. + * + * Note: we have to finish *all* incomplete splits we encounter, even + * if we have to move right. Otherwise we might choose as the target a + * page that has no downlink in the parent, and splitting it further + * would fail. + */ + if (GinPageIsIncompleteSplit(BufferGetPage(parent->buffer))) + ginFinishSplit(btree, parent, false, buildStats); + + /* move right if it's needed */ + page = BufferGetPage(parent->buffer); + while ((parent->off = btree->findChildPtr(btree, page, stack->blkno, parent->off)) == InvalidOffsetNumber) + { + if (GinPageRightMost(page)) + { + /* + * rightmost page, but we don't find parent, we should use + * plain search... + */ + LockBuffer(parent->buffer, GIN_UNLOCK); + ginFindParents(btree, stack); + parent = stack->parent; + Assert(parent != NULL); + break; + } + + parent->buffer = ginStepRight(parent->buffer, btree->index, GIN_EXCLUSIVE); + parent->blkno = BufferGetBlockNumber(parent->buffer); + page = BufferGetPage(parent->buffer); + + if (GinPageIsIncompleteSplit(BufferGetPage(parent->buffer))) + ginFinishSplit(btree, parent, false, buildStats); + } + + /* insert the downlink */ + insertdata = btree->prepareDownlink(btree, stack->buffer); + updateblkno = GinPageGetOpaque(BufferGetPage(stack->buffer))->rightlink; + done = ginPlaceToPage(btree, parent, + insertdata, updateblkno, + stack->buffer, buildStats); + pfree(insertdata); + + /* + * If the caller requested to free the stack, unlock and release the + * child buffer now. Otherwise keep it pinned and locked, but if we + * have to recurse up the tree, we can unlock the upper pages, only + * keeping the page at the bottom of the stack locked. 
+ */ + if (!first || freestack) + LockBuffer(stack->buffer, GIN_UNLOCK); + if (freestack) + { + ReleaseBuffer(stack->buffer); + pfree(stack); + } + stack = parent; + + first = false; + } while (!done); + + /* unlock the parent */ + LockBuffer(stack->buffer, GIN_UNLOCK); + + if (freestack) + freeGinBtreeStack(stack); +} + +/* + * Insert a value to tree described by stack. + * + * The value to be inserted is given in 'insertdata'. Its format depends + * on whether this is an entry or data tree, ginInsertValue just passes it + * through to the tree-specific callback function. + * + * During an index build, buildStats is non-null and the counters it contains + * are incremented as needed. + * + * NB: the passed-in stack is freed, as though by freeGinBtreeStack. + */ +void +ginInsertValue(GinBtree btree, GinBtreeStack *stack, void *insertdata, + GinStatsData *buildStats) +{ + bool done; + + /* If the leaf page was incompletely split, finish the split first */ + if (GinPageIsIncompleteSplit(BufferGetPage(stack->buffer))) + ginFinishSplit(btree, stack, false, buildStats); + + done = ginPlaceToPage(btree, stack, + insertdata, InvalidBlockNumber, + InvalidBuffer, buildStats); + if (done) + { + LockBuffer(stack->buffer, GIN_UNLOCK); + freeGinBtreeStack(stack); + } + else + ginFinishSplit(btree, stack, true, buildStats); +} diff --git a/src/backend/access/gin/ginbulk.c b/src/backend/access/gin/ginbulk.c new file mode 100644 index 0000000..4c5067c --- /dev/null +++ b/src/backend/access/gin/ginbulk.c @@ -0,0 +1,293 @@ +/*------------------------------------------------------------------------- + * + * ginbulk.c + * routines for fast build of inverted index + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/ginbulk.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include + +#include "access/gin_private.h" +#include "utils/datum.h" +#include "utils/memutils.h" + + +#define DEF_NENTRY 2048 /* GinEntryAccumulator allocation quantum */ +#define DEF_NPTR 5 /* ItemPointer initial allocation quantum */ + + +/* Combiner function for rbtree.c */ +static void +ginCombineData(RBTNode *existing, const RBTNode *newdata, void *arg) +{ + GinEntryAccumulator *eo = (GinEntryAccumulator *) existing; + const GinEntryAccumulator *en = (const GinEntryAccumulator *) newdata; + BuildAccumulator *accum = (BuildAccumulator *) arg; + + /* + * Note this code assumes that newdata contains only one itempointer. 
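+	 * If the existing entry's TID array is full, it is grown by doubling
+	 * maxcount (the change is tracked in accum->allocatedMemory), and the
+	 * single new TID is appended at the end.  If that TID compares lower
+	 * than the last one already stored, the entry is flagged for sorting
+	 * later (see ginGetBAEntry).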
+ */ + if (eo->count >= eo->maxcount) + { + if (eo->maxcount > INT_MAX) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("posting list is too long"), + errhint("Reduce maintenance_work_mem."))); + + accum->allocatedMemory -= GetMemoryChunkSpace(eo->list); + eo->maxcount *= 2; + eo->list = (ItemPointerData *) + repalloc_huge(eo->list, sizeof(ItemPointerData) * eo->maxcount); + accum->allocatedMemory += GetMemoryChunkSpace(eo->list); + } + + /* If item pointers are not ordered, they will need to be sorted later */ + if (eo->shouldSort == false) + { + int res; + + res = ginCompareItemPointers(eo->list + eo->count - 1, en->list); + Assert(res != 0); + + if (res > 0) + eo->shouldSort = true; + } + + eo->list[eo->count] = en->list[0]; + eo->count++; +} + +/* Comparator function for rbtree.c */ +static int +cmpEntryAccumulator(const RBTNode *a, const RBTNode *b, void *arg) +{ + const GinEntryAccumulator *ea = (const GinEntryAccumulator *) a; + const GinEntryAccumulator *eb = (const GinEntryAccumulator *) b; + BuildAccumulator *accum = (BuildAccumulator *) arg; + + return ginCompareAttEntries(accum->ginstate, + ea->attnum, ea->key, ea->category, + eb->attnum, eb->key, eb->category); +} + +/* Allocator function for rbtree.c */ +static RBTNode * +ginAllocEntryAccumulator(void *arg) +{ + BuildAccumulator *accum = (BuildAccumulator *) arg; + GinEntryAccumulator *ea; + + /* + * Allocate memory by rather big chunks to decrease overhead. We have no + * need to reclaim RBTNodes individually, so this costs nothing. + */ + if (accum->entryallocator == NULL || accum->eas_used >= DEF_NENTRY) + { + accum->entryallocator = palloc(sizeof(GinEntryAccumulator) * DEF_NENTRY); + accum->allocatedMemory += GetMemoryChunkSpace(accum->entryallocator); + accum->eas_used = 0; + } + + /* Allocate new RBTNode from current chunk */ + ea = accum->entryallocator + accum->eas_used; + accum->eas_used++; + + return (RBTNode *) ea; +} + +void +ginInitBA(BuildAccumulator *accum) +{ + /* accum->ginstate is intentionally not set here */ + accum->allocatedMemory = 0; + accum->entryallocator = NULL; + accum->eas_used = 0; + accum->tree = rbt_create(sizeof(GinEntryAccumulator), + cmpEntryAccumulator, + ginCombineData, + ginAllocEntryAccumulator, + NULL, /* no freefunc needed */ + (void *) accum); +} + +/* + * This is basically the same as datumCopy(), but extended to count + * palloc'd space in accum->allocatedMemory. + */ +static Datum +getDatumCopy(BuildAccumulator *accum, OffsetNumber attnum, Datum value) +{ + Form_pg_attribute att; + Datum res; + + att = TupleDescAttr(accum->ginstate->origTupdesc, attnum - 1); + if (att->attbyval) + res = value; + else + { + res = datumCopy(value, false, att->attlen); + accum->allocatedMemory += GetMemoryChunkSpace(DatumGetPointer(res)); + } + return res; +} + +/* + * Find/store one entry from indexed value. + */ +static void +ginInsertBAEntry(BuildAccumulator *accum, + ItemPointer heapptr, OffsetNumber attnum, + Datum key, GinNullCategory category) +{ + GinEntryAccumulator eatmp; + GinEntryAccumulator *ea; + bool isNew; + + /* + * For the moment, fill only the fields of eatmp that will be looked at by + * cmpEntryAccumulator or ginCombineData. 
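+	 * In particular, eatmp.list temporarily points at the caller's single
+	 * heap TID; if the key already exists in the tree, ginCombineData picks
+	 * the TID up from there and appends it to the existing entry's list.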
+ */ + eatmp.attnum = attnum; + eatmp.key = key; + eatmp.category = category; + /* temporarily set up single-entry itempointer list */ + eatmp.list = heapptr; + + ea = (GinEntryAccumulator *) rbt_insert(accum->tree, (RBTNode *) &eatmp, + &isNew); + + if (isNew) + { + /* + * Finish initializing new tree entry, including making permanent + * copies of the datum (if it's not null) and itempointer. + */ + if (category == GIN_CAT_NORM_KEY) + ea->key = getDatumCopy(accum, attnum, key); + ea->maxcount = DEF_NPTR; + ea->count = 1; + ea->shouldSort = false; + ea->list = + (ItemPointerData *) palloc(sizeof(ItemPointerData) * DEF_NPTR); + ea->list[0] = *heapptr; + accum->allocatedMemory += GetMemoryChunkSpace(ea->list); + } + else + { + /* + * ginCombineData did everything needed. + */ + } +} + +/* + * Insert the entries for one heap pointer. + * + * Since the entries are being inserted into a balanced binary tree, you + * might think that the order of insertion wouldn't be critical, but it turns + * out that inserting the entries in sorted order results in a lot of + * rebalancing operations and is slow. To prevent this, we attempt to insert + * the nodes in an order that will produce a nearly-balanced tree if the input + * is in fact sorted. + * + * We do this as follows. First, we imagine that we have an array whose size + * is the smallest power of two greater than or equal to the actual array + * size. Second, we insert the middle entry of our virtual array into the + * tree; then, we insert the middles of each half of our virtual array, then + * middles of quarters, etc. + */ +void +ginInsertBAEntries(BuildAccumulator *accum, + ItemPointer heapptr, OffsetNumber attnum, + Datum *entries, GinNullCategory *categories, + int32 nentries) +{ + uint32 step = nentries; + + if (nentries <= 0) + return; + + Assert(ItemPointerIsValid(heapptr) && attnum >= FirstOffsetNumber); + + /* + * step will contain largest power of 2 and <= nentries + */ + step |= (step >> 1); + step |= (step >> 2); + step |= (step >> 4); + step |= (step >> 8); + step |= (step >> 16); + step >>= 1; + step++; + + while (step > 0) + { + int i; + + for (i = step - 1; i < nentries && i >= 0; i += step << 1 /* *2 */ ) + ginInsertBAEntry(accum, heapptr, attnum, + entries[i], categories[i]); + + step >>= 1; /* /2 */ + } +} + +static int +qsortCompareItemPointers(const void *a, const void *b) +{ + int res = ginCompareItemPointers((ItemPointer) a, (ItemPointer) b); + + /* Assert that there are no equal item pointers being sorted */ + Assert(res != 0); + return res; +} + +/* Prepare to read out the rbtree contents using ginGetBAEntry */ +void +ginBeginBAScan(BuildAccumulator *accum) +{ + rbt_begin_iterate(accum->tree, LeftRightWalk, &accum->tree_walk); +} + +/* + * Get the next entry in sequence from the BuildAccumulator's rbtree. + * This consists of a single key datum and a list (array) of one or more + * heap TIDs in which that key is found. The list is guaranteed sorted. 
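+ *
+ * Returns NULL when the tree has been exhausted.  The attribute number,
+ * key, null category and list length are passed back via the output
+ * parameters.  If out-of-order insertions were detected for this entry
+ * (shouldSort), the list is sorted here before it is returned.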
+ */ +ItemPointerData * +ginGetBAEntry(BuildAccumulator *accum, + OffsetNumber *attnum, Datum *key, GinNullCategory *category, + uint32 *n) +{ + GinEntryAccumulator *entry; + ItemPointerData *list; + + entry = (GinEntryAccumulator *) rbt_iterate(&accum->tree_walk); + + if (entry == NULL) + return NULL; /* no more entries */ + + *attnum = entry->attnum; + *key = entry->key; + *category = entry->category; + list = entry->list; + *n = entry->count; + + Assert(list != NULL && entry->count > 0); + + if (entry->shouldSort && entry->count > 1) + qsort(list, entry->count, sizeof(ItemPointerData), + qsortCompareItemPointers); + + return list; +} diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c new file mode 100644 index 0000000..06c0586 --- /dev/null +++ b/src/backend/access/gin/gindatapage.c @@ -0,0 +1,1942 @@ +/*------------------------------------------------------------------------- + * + * gindatapage.c + * routines for handling GIN posting tree pages. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/gindatapage.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/gin_private.h" +#include "access/ginxlog.h" +#include "access/xloginsert.h" +#include "lib/ilist.h" +#include "miscadmin.h" +#include "storage/predicate.h" +#include "utils/rel.h" + +/* + * Min, Max and Target size of posting lists stored on leaf pages, in bytes. + * + * The code can deal with any size, but random access is more efficient when + * a number of smaller lists are stored, rather than one big list. If a + * posting list would become larger than Max size as a result of insertions, + * it is split into two. If a posting list would be smaller than minimum + * size, it is merged with the next posting list. + */ +#define GinPostingListSegmentMaxSize 384 +#define GinPostingListSegmentTargetSize 256 +#define GinPostingListSegmentMinSize 128 + +/* + * At least this many items fit in a GinPostingListSegmentMaxSize-bytes + * long segment. This is used when estimating how much space is required + * for N items, at minimum. + */ +#define MinTuplesPerSegment ((GinPostingListSegmentMaxSize - 2) / 6) + +/* + * A working struct for manipulating a posting tree leaf page. + */ +typedef struct +{ + dlist_head segments; /* a list of leafSegmentInfos */ + + /* + * The following fields represent how the segments are split across pages, + * if a page split is required. Filled in by leafRepackItems. + */ + dlist_node *lastleft; /* last segment on left page */ + int lsize; /* total size on left page */ + int rsize; /* total size on right page */ + + bool oldformat; /* page is in pre-9.4 format on disk */ + + /* + * If we need WAL data representing the reconstructed leaf page, it's + * stored here by computeLeafRecompressWALData. + */ + char *walinfo; /* buffer start */ + int walinfolen; /* and length */ +} disassembledLeaf; + +typedef struct +{ + dlist_node node; /* linked list pointers */ + + /*------------- + * 'action' indicates the status of this in-memory segment, compared to + * what's on disk. It is one of the GIN_SEGMENT_* action codes: + * + * UNMODIFIED no changes + * DELETE the segment is to be removed. 
'seg' and 'items' are + * ignored + * INSERT this is a completely new segment + * REPLACE this replaces an existing segment with new content + * ADDITEMS like REPLACE, but no items have been removed, and we track + * in detail what items have been added to this segment, in + * 'modifieditems' + *------------- + */ + char action; + + ItemPointerData *modifieditems; + uint16 nmodifieditems; + + /* + * The following fields represent the items in this segment. If 'items' is + * not NULL, it contains a palloc'd array of the items in this segment. If + * 'seg' is not NULL, it contains the items in an already-compressed + * format. It can point to an on-disk page (!modified), or a palloc'd + * segment in memory. If both are set, they must represent the same items. + */ + GinPostingList *seg; + ItemPointer items; + int nitems; /* # of items in 'items', if items != NULL */ +} leafSegmentInfo; + +static ItemPointer dataLeafPageGetUncompressed(Page page, int *nitems); +static void dataSplitPageInternal(GinBtree btree, Buffer origbuf, + GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + Page *newlpage, Page *newrpage); + +static disassembledLeaf *disassembleLeaf(Page page); +static bool leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining); +static bool addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, + int nNewItems); + +static void computeLeafRecompressWALData(disassembledLeaf *leaf); +static void dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf); +static void dataPlaceToPageLeafSplit(disassembledLeaf *leaf, + ItemPointerData lbound, ItemPointerData rbound, + Page lpage, Page rpage); + +/* + * Read TIDs from leaf data page to single uncompressed array. The TIDs are + * returned in ascending order. + * + * advancePast is a hint, indicating that the caller is only interested in + * TIDs > advancePast. To return all items, use ItemPointerSetMin. + * + * Note: This function can still return items smaller than advancePast that + * are in the same posting list as the items of interest, so the caller must + * still check all the returned items. But passing it allows this function to + * skip whole posting lists. + */ +ItemPointer +GinDataLeafPageGetItems(Page page, int *nitems, ItemPointerData advancePast) +{ + ItemPointer result; + + if (GinPageIsCompressed(page)) + { + GinPostingList *seg = GinDataLeafPageGetPostingList(page); + Size len = GinDataLeafPageGetPostingListSize(page); + Pointer endptr = ((Pointer) seg) + len; + GinPostingList *next; + + /* Skip to the segment containing advancePast+1 */ + if (ItemPointerIsValid(&advancePast)) + { + next = GinNextPostingListSegment(seg); + while ((Pointer) next < endptr && + ginCompareItemPointers(&next->first, &advancePast) <= 0) + { + seg = next; + next = GinNextPostingListSegment(seg); + } + len = endptr - (Pointer) seg; + } + + if (len > 0) + result = ginPostingListDecodeAllSegments(seg, len, nitems); + else + { + result = NULL; + *nitems = 0; + } + } + else + { + ItemPointer tmp = dataLeafPageGetUncompressed(page, nitems); + + result = palloc((*nitems) * sizeof(ItemPointerData)); + memcpy(result, tmp, (*nitems) * sizeof(ItemPointerData)); + } + + return result; +} + +/* + * Places all TIDs from leaf data page to bitmap. 
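+ *
+ * Handles both the compressed (9.4+) page format and the old uncompressed
+ * format, and returns the number of TIDs added to the bitmap.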
+ */ +int +GinDataLeafPageGetItemsToTbm(Page page, TIDBitmap *tbm) +{ + ItemPointer uncompressed; + int nitems; + + if (GinPageIsCompressed(page)) + { + GinPostingList *segment = GinDataLeafPageGetPostingList(page); + Size len = GinDataLeafPageGetPostingListSize(page); + + nitems = ginPostingListDecodeAllSegmentsToTbm(segment, len, tbm); + } + else + { + uncompressed = dataLeafPageGetUncompressed(page, &nitems); + + if (nitems > 0) + tbm_add_tuples(tbm, uncompressed, nitems, false); + } + + return nitems; +} + +/* + * Get pointer to the uncompressed array of items on a pre-9.4 format + * uncompressed leaf page. The number of items in the array is returned in + * *nitems. + */ +static ItemPointer +dataLeafPageGetUncompressed(Page page, int *nitems) +{ + ItemPointer items; + + Assert(!GinPageIsCompressed(page)); + + /* + * In the old pre-9.4 page format, the whole page content is used for + * uncompressed items, and the number of items is stored in 'maxoff' + */ + items = (ItemPointer) GinDataPageGetData(page); + *nitems = GinPageGetOpaque(page)->maxoff; + + return items; +} + +/* + * Check if we should follow the right link to find the item we're searching + * for. + * + * Compares inserting item pointer with the right bound of the current page. + */ +static bool +dataIsMoveRight(GinBtree btree, Page page) +{ + ItemPointer iptr = GinDataPageGetRightBound(page); + + if (GinPageRightMost(page)) + return false; + + if (GinPageIsDeleted(page)) + return true; + + return (ginCompareItemPointers(&btree->itemptr, iptr) > 0) ? true : false; +} + +/* + * Find correct PostingItem in non-leaf page. It is assumed that this is + * the correct page, and the searched value SHOULD be on the page. + */ +static BlockNumber +dataLocateItem(GinBtree btree, GinBtreeStack *stack) +{ + OffsetNumber low, + high, + maxoff; + PostingItem *pitem = NULL; + int result; + Page page = BufferGetPage(stack->buffer); + + Assert(!GinPageIsLeaf(page)); + Assert(GinPageIsData(page)); + + if (btree->fullScan) + { + stack->off = FirstOffsetNumber; + stack->predictNumber *= GinPageGetOpaque(page)->maxoff; + return btree->getLeftMostChild(btree, page); + } + + low = FirstOffsetNumber; + maxoff = high = GinPageGetOpaque(page)->maxoff; + Assert(high >= low); + + high++; + + while (high > low) + { + OffsetNumber mid = low + ((high - low) / 2); + + pitem = GinDataPageGetPostingItem(page, mid); + + if (mid == maxoff) + { + /* + * Right infinity, page already correctly chosen with a help of + * dataIsMoveRight + */ + result = -1; + } + else + { + pitem = GinDataPageGetPostingItem(page, mid); + result = ginCompareItemPointers(&btree->itemptr, &(pitem->key)); + } + + if (result == 0) + { + stack->off = mid; + return PostingItemGetBlockNumber(pitem); + } + else if (result > 0) + low = mid + 1; + else + high = mid; + } + + Assert(high >= FirstOffsetNumber && high <= maxoff); + + stack->off = high; + pitem = GinDataPageGetPostingItem(page, high); + return PostingItemGetBlockNumber(pitem); +} + +/* + * Find link to blkno on non-leaf page, returns offset of PostingItem + */ +static OffsetNumber +dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber storedOff) +{ + OffsetNumber i, + maxoff = GinPageGetOpaque(page)->maxoff; + PostingItem *pitem; + + Assert(!GinPageIsLeaf(page)); + Assert(GinPageIsData(page)); + + /* if page isn't changed, we return storedOff */ + if (storedOff >= FirstOffsetNumber && storedOff <= maxoff) + { + pitem = GinDataPageGetPostingItem(page, storedOff); + if (PostingItemGetBlockNumber(pitem) == 
blkno) + return storedOff; + + /* + * we hope, that needed pointer goes to right. It's true if there + * wasn't a deletion + */ + for (i = storedOff + 1; i <= maxoff; i++) + { + pitem = GinDataPageGetPostingItem(page, i); + if (PostingItemGetBlockNumber(pitem) == blkno) + return i; + } + + maxoff = storedOff - 1; + } + + /* last chance */ + for (i = FirstOffsetNumber; i <= maxoff; i++) + { + pitem = GinDataPageGetPostingItem(page, i); + if (PostingItemGetBlockNumber(pitem) == blkno) + return i; + } + + return InvalidOffsetNumber; +} + +/* + * Return blkno of leftmost child + */ +static BlockNumber +dataGetLeftMostPage(GinBtree btree, Page page) +{ + PostingItem *pitem; + + Assert(!GinPageIsLeaf(page)); + Assert(GinPageIsData(page)); + Assert(GinPageGetOpaque(page)->maxoff >= FirstOffsetNumber); + + pitem = GinDataPageGetPostingItem(page, FirstOffsetNumber); + return PostingItemGetBlockNumber(pitem); +} + +/* + * Add PostingItem to a non-leaf page. + */ +void +GinDataPageAddPostingItem(Page page, PostingItem *data, OffsetNumber offset) +{ + OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff; + char *ptr; + + Assert(PostingItemGetBlockNumber(data) != InvalidBlockNumber); + Assert(!GinPageIsLeaf(page)); + + if (offset == InvalidOffsetNumber) + { + ptr = (char *) GinDataPageGetPostingItem(page, maxoff + 1); + } + else + { + ptr = (char *) GinDataPageGetPostingItem(page, offset); + if (offset != maxoff + 1) + memmove(ptr + sizeof(PostingItem), + ptr, + (maxoff - offset + 1) * sizeof(PostingItem)); + } + memcpy(ptr, data, sizeof(PostingItem)); + + maxoff++; + GinPageGetOpaque(page)->maxoff = maxoff; + + /* + * Also set pd_lower to the end of the posting items, to follow the + * "standard" page layout, so that we can squeeze out the unused space + * from full-page images. + */ + GinDataPageSetDataSize(page, maxoff * sizeof(PostingItem)); +} + +/* + * Delete posting item from non-leaf page + */ +void +GinPageDeletePostingItem(Page page, OffsetNumber offset) +{ + OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff; + + Assert(!GinPageIsLeaf(page)); + Assert(offset >= FirstOffsetNumber && offset <= maxoff); + + if (offset != maxoff) + memmove(GinDataPageGetPostingItem(page, offset), + GinDataPageGetPostingItem(page, offset + 1), + sizeof(PostingItem) * (maxoff - offset)); + + maxoff--; + GinPageGetOpaque(page)->maxoff = maxoff; + + GinDataPageSetDataSize(page, maxoff * sizeof(PostingItem)); +} + +/* + * Prepare to insert data on a leaf data page. + * + * If it will fit, return GPTP_INSERT after doing whatever setup is needed + * before we enter the insertion critical section. *ptp_workspace can be + * set to pass information along to the execPlaceToPage function. + * + * If it won't fit, perform a page split and return two temporary page + * images into *newlpage and *newrpage, with result GPTP_SPLIT. + * + * In neither case should the given page buffer be modified here. 
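+ *
+ * 'insertdata' is a GinBtreeDataLeafInsertData struct; its 'curitem' field
+ * is advanced past the items consumed by this call, so the caller can tell
+ * how many items remain for later pages.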
+ */ +static GinPlaceToPageRC +dataBeginPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, + void **ptp_workspace, + Page *newlpage, Page *newrpage) +{ + GinBtreeDataLeafInsertData *items = insertdata; + ItemPointer newItems = &items->items[items->curitem]; + int maxitems = items->nitem - items->curitem; + Page page = BufferGetPage(buf); + int i; + ItemPointerData rbound; + ItemPointerData lbound; + bool needsplit; + bool append; + int segsize; + Size freespace; + disassembledLeaf *leaf; + leafSegmentInfo *lastleftinfo; + ItemPointerData maxOldItem; + ItemPointerData remaining; + + rbound = *GinDataPageGetRightBound(page); + + /* + * Count how many of the new items belong to this page. + */ + if (!GinPageRightMost(page)) + { + for (i = 0; i < maxitems; i++) + { + if (ginCompareItemPointers(&newItems[i], &rbound) > 0) + { + /* + * This needs to go to some other location in the tree. (The + * caller should've chosen the insert location so that at + * least the first item goes here.) + */ + Assert(i > 0); + break; + } + } + maxitems = i; + } + + /* Disassemble the data on the page */ + leaf = disassembleLeaf(page); + + /* + * Are we appending to the end of the page? IOW, are all the new items + * larger than any of the existing items. + */ + if (!dlist_is_empty(&leaf->segments)) + { + lastleftinfo = dlist_container(leafSegmentInfo, node, + dlist_tail_node(&leaf->segments)); + if (!lastleftinfo->items) + lastleftinfo->items = ginPostingListDecode(lastleftinfo->seg, + &lastleftinfo->nitems); + maxOldItem = lastleftinfo->items[lastleftinfo->nitems - 1]; + if (ginCompareItemPointers(&newItems[0], &maxOldItem) >= 0) + append = true; + else + append = false; + } + else + { + ItemPointerSetMin(&maxOldItem); + append = true; + } + + /* + * If we're appending to the end of the page, we will append as many items + * as we can fit (after splitting), and stop when the pages becomes full. + * Otherwise we have to limit the number of new items to insert, because + * once we start packing we can't just stop when we run out of space, + * because we must make sure that all the old items still fit. + */ + if (GinPageIsCompressed(page)) + freespace = GinDataLeafPageGetFreeSpace(page); + else + freespace = 0; + if (append) + { + /* + * Even when appending, trying to append more items than will fit is + * not completely free, because we will merge the new items and old + * items into an array below. In the best case, every new item fits in + * a single byte, and we can use all the free space on the old page as + * well as the new page. For simplicity, ignore segment overhead etc. + */ + maxitems = Min(maxitems, freespace + GinDataPageMaxDataSize); + } + else + { + /* + * Calculate a conservative estimate of how many new items we can fit + * on the two pages after splitting. + * + * We can use any remaining free space on the old page to store full + * segments, as well as the new page. Each full-sized segment can hold + * at least MinTuplesPerSegment items + */ + int nnewsegments; + + nnewsegments = freespace / GinPostingListSegmentMaxSize; + nnewsegments += GinDataPageMaxDataSize / GinPostingListSegmentMaxSize; + maxitems = Min(maxitems, nnewsegments * MinTuplesPerSegment); + } + + /* Add the new items to the segment list */ + if (!addItemsToLeaf(leaf, newItems, maxitems)) + { + /* all items were duplicates, we have nothing to do */ + items->curitem += maxitems; + + return GPTP_NO_WORK; + } + + /* + * Pack the items back to compressed segments, ready for writing to disk. 
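+	 *
+	 * leafRepackItems returns true if the result no longer fits on a single
+	 * page.  Items that could not be fitted at all are reported back via
+	 * 'remaining', which is checked below.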
+ */ + needsplit = leafRepackItems(leaf, &remaining); + + /* + * Did all the new items fit? + * + * If we're appending, it's OK if they didn't. But as a sanity check, + * verify that all the old items fit. + */ + if (ItemPointerIsValid(&remaining)) + { + if (!append || ItemPointerCompare(&maxOldItem, &remaining) >= 0) + elog(ERROR, "could not split GIN page; all old items didn't fit"); + + /* Count how many of the new items did fit. */ + for (i = 0; i < maxitems; i++) + { + if (ginCompareItemPointers(&newItems[i], &remaining) >= 0) + break; + } + if (i == 0) + elog(ERROR, "could not split GIN page; no new items fit"); + maxitems = i; + } + + if (!needsplit) + { + /* + * Great, all the items fit on a single page. If needed, prepare data + * for a WAL record describing the changes we'll make. + */ + if (RelationNeedsWAL(btree->index) && !btree->isBuild) + computeLeafRecompressWALData(leaf); + + /* + * We're ready to enter the critical section, but + * dataExecPlaceToPageLeaf will need access to the "leaf" data. + */ + *ptp_workspace = leaf; + + if (append) + elog(DEBUG2, "appended %d new items to block %u; %d bytes (%d to go)", + maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, + items->nitem - items->curitem - maxitems); + else + elog(DEBUG2, "inserted %d new items to block %u; %d bytes (%d to go)", + maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, + items->nitem - items->curitem - maxitems); + } + else + { + /* + * Have to split. + * + * leafRepackItems already divided the segments between the left and + * the right page. It filled the left page as full as possible, and + * put the rest to the right page. When building a new index, that's + * good, because the table is scanned from beginning to end and there + * won't be any more insertions to the left page during the build. + * This packs the index as tight as possible. But otherwise, split + * 50/50, by moving segments from the left page to the right page + * until they're balanced. + * + * As a further heuristic, when appending items to the end of the + * page, try to make the left page 75% full, on the assumption that + * subsequent insertions will probably also go to the end. This packs + * the index somewhat tighter when appending to a table, which is very + * common. + */ + if (!btree->isBuild) + { + while (dlist_has_prev(&leaf->segments, leaf->lastleft)) + { + lastleftinfo = dlist_container(leafSegmentInfo, node, leaf->lastleft); + + /* ignore deleted segments */ + if (lastleftinfo->action != GIN_SEGMENT_DELETE) + { + segsize = SizeOfGinPostingList(lastleftinfo->seg); + + /* + * Note that we check that the right page doesn't become + * more full than the left page even when appending. It's + * possible that we added enough items to make both pages + * more than 75% full. + */ + if ((leaf->lsize - segsize) - (leaf->rsize + segsize) < 0) + break; + if (append) + { + if ((leaf->lsize - segsize) < (BLCKSZ * 3) / 4) + break; + } + + leaf->lsize -= segsize; + leaf->rsize += segsize; + } + leaf->lastleft = dlist_prev_node(&leaf->segments, leaf->lastleft); + } + } + Assert(leaf->lsize <= GinDataPageMaxDataSize); + Assert(leaf->rsize <= GinDataPageMaxDataSize); + + /* + * Fetch the max item in the left page's last segment; it becomes the + * right bound of the page. 
+ */ + lastleftinfo = dlist_container(leafSegmentInfo, node, leaf->lastleft); + if (!lastleftinfo->items) + lastleftinfo->items = ginPostingListDecode(lastleftinfo->seg, + &lastleftinfo->nitems); + lbound = lastleftinfo->items[lastleftinfo->nitems - 1]; + + /* + * Now allocate a couple of temporary page images, and fill them. + */ + *newlpage = palloc(BLCKSZ); + *newrpage = palloc(BLCKSZ); + + dataPlaceToPageLeafSplit(leaf, lbound, rbound, + *newlpage, *newrpage); + + Assert(GinPageRightMost(page) || + ginCompareItemPointers(GinDataPageGetRightBound(*newlpage), + GinDataPageGetRightBound(*newrpage)) < 0); + + if (append) + elog(DEBUG2, "appended %d items to block %u; split %d/%d (%d to go)", + maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, (int) leaf->rsize, + items->nitem - items->curitem - maxitems); + else + elog(DEBUG2, "inserted %d items to block %u; split %d/%d (%d to go)", + maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, (int) leaf->rsize, + items->nitem - items->curitem - maxitems); + } + + items->curitem += maxitems; + + return needsplit ? GPTP_SPLIT : GPTP_INSERT; +} + +/* + * Perform data insertion after beginPlaceToPage has decided it will fit. + * + * This is invoked within a critical section, and XLOG record creation (if + * needed) is already started. The target buffer is registered in slot 0. + */ +static void +dataExecPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, void *ptp_workspace) +{ + disassembledLeaf *leaf = (disassembledLeaf *) ptp_workspace; + + /* Apply changes to page */ + dataPlaceToPageLeafRecompress(buf, leaf); + + /* If needed, register WAL data built by computeLeafRecompressWALData */ + if (RelationNeedsWAL(btree->index) && !btree->isBuild) + { + XLogRegisterBufData(0, leaf->walinfo, leaf->walinfolen); + } +} + +/* + * Vacuum a posting tree leaf page. + */ +void +ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs) +{ + Page page = BufferGetPage(buffer); + disassembledLeaf *leaf; + bool removedsomething = false; + dlist_iter iter; + + leaf = disassembleLeaf(page); + + /* Vacuum each segment. */ + dlist_foreach(iter, &leaf->segments) + { + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur); + int oldsegsize; + ItemPointer cleaned; + int ncleaned; + + if (!seginfo->items) + seginfo->items = ginPostingListDecode(seginfo->seg, + &seginfo->nitems); + if (seginfo->seg) + oldsegsize = SizeOfGinPostingList(seginfo->seg); + else + oldsegsize = GinDataPageMaxDataSize; + + cleaned = ginVacuumItemPointers(gvs, + seginfo->items, + seginfo->nitems, + &ncleaned); + pfree(seginfo->items); + seginfo->items = NULL; + seginfo->nitems = 0; + if (cleaned) + { + if (ncleaned > 0) + { + int npacked; + + seginfo->seg = ginCompressPostingList(cleaned, + ncleaned, + oldsegsize, + &npacked); + /* Removing an item never increases the size of the segment */ + if (npacked != ncleaned) + elog(ERROR, "could not fit vacuumed posting list"); + seginfo->action = GIN_SEGMENT_REPLACE; + } + else + { + seginfo->seg = NULL; + seginfo->items = NULL; + seginfo->action = GIN_SEGMENT_DELETE; + } + seginfo->nitems = ncleaned; + + removedsomething = true; + } + } + + /* + * If we removed any items, reconstruct the page from the pieces. + * + * We don't try to re-encode the segments here, even though some of them + * might be really small now that we've removed some items from them. 
It + * seems like a waste of effort, as there isn't really any benefit from + * larger segments per se; larger segments only help to pack more items in + * the same space. We might as well delay doing that until the next + * insertion, which will need to re-encode at least part of the page + * anyway. + * + * Also note if the page was in uncompressed, pre-9.4 format before, it is + * now represented as one huge segment that contains all the items. It + * might make sense to split that, to speed up random access, but we don't + * bother. You'll have to REINDEX anyway if you want the full gain of the + * new tighter index format. + */ + if (removedsomething) + { + bool modified; + + /* + * Make sure we have a palloc'd copy of all segments, after the first + * segment that is modified. (dataPlaceToPageLeafRecompress requires + * this). + */ + modified = false; + dlist_foreach(iter, &leaf->segments) + { + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, + iter.cur); + + if (seginfo->action != GIN_SEGMENT_UNMODIFIED) + modified = true; + if (modified && seginfo->action != GIN_SEGMENT_DELETE) + { + int segsize = SizeOfGinPostingList(seginfo->seg); + GinPostingList *tmp = (GinPostingList *) palloc(segsize); + + memcpy(tmp, seginfo->seg, segsize); + seginfo->seg = tmp; + } + } + + if (RelationNeedsWAL(indexrel)) + computeLeafRecompressWALData(leaf); + + /* Apply changes to page */ + START_CRIT_SECTION(); + + dataPlaceToPageLeafRecompress(buffer, leaf); + + MarkBufferDirty(buffer); + + if (RelationNeedsWAL(indexrel)) + { + XLogRecPtr recptr; + + XLogBeginInsert(); + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + XLogRegisterBufData(0, leaf->walinfo, leaf->walinfolen); + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_DATA_LEAF_PAGE); + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + } +} + +/* + * Construct a ginxlogRecompressDataLeaf record representing the changes + * in *leaf. (Because this requires a palloc, we have to do it before + * we enter the critical section that actually updates the page.) + */ +static void +computeLeafRecompressWALData(disassembledLeaf *leaf) +{ + int nmodified = 0; + char *walbufbegin; + char *walbufend; + dlist_iter iter; + int segno; + ginxlogRecompressDataLeaf *recompress_xlog; + + /* Count the modified segments */ + dlist_foreach(iter, &leaf->segments) + { + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, + iter.cur); + + if (seginfo->action != GIN_SEGMENT_UNMODIFIED) + nmodified++; + } + + walbufbegin = + palloc(sizeof(ginxlogRecompressDataLeaf) + + BLCKSZ + /* max size needed to hold the segment data */ + nmodified * 2 /* (segno + action) per action */ + ); + walbufend = walbufbegin; + + recompress_xlog = (ginxlogRecompressDataLeaf *) walbufend; + walbufend += sizeof(ginxlogRecompressDataLeaf); + + recompress_xlog->nactions = nmodified; + + segno = 0; + dlist_foreach(iter, &leaf->segments) + { + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, + iter.cur); + int segsize = 0; + int datalen; + uint8 action = seginfo->action; + + if (action == GIN_SEGMENT_UNMODIFIED) + { + segno++; + continue; + } + + if (action != GIN_SEGMENT_DELETE) + segsize = SizeOfGinPostingList(seginfo->seg); + + /* + * If storing the uncompressed list of added item pointers would take + * more space than storing the compressed segment as is, do that + * instead. 
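+	 *
+	 * (An ItemPointerData is 6 bytes, while a whole compressed segment is
+	 * capped at GinPostingListSegmentMaxSize, so ADDITEMS is normally the
+	 * cheaper choice unless a large fraction of the segment's items are
+	 * newly added.)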
+ */ + if (action == GIN_SEGMENT_ADDITEMS && + seginfo->nmodifieditems * sizeof(ItemPointerData) > segsize) + { + action = GIN_SEGMENT_REPLACE; + } + + *((uint8 *) (walbufend++)) = segno; + *(walbufend++) = action; + + switch (action) + { + case GIN_SEGMENT_DELETE: + datalen = 0; + break; + + case GIN_SEGMENT_ADDITEMS: + datalen = seginfo->nmodifieditems * sizeof(ItemPointerData); + memcpy(walbufend, &seginfo->nmodifieditems, sizeof(uint16)); + memcpy(walbufend + sizeof(uint16), seginfo->modifieditems, datalen); + datalen += sizeof(uint16); + break; + + case GIN_SEGMENT_INSERT: + case GIN_SEGMENT_REPLACE: + datalen = SHORTALIGN(segsize); + memcpy(walbufend, seginfo->seg, segsize); + break; + + default: + elog(ERROR, "unexpected GIN leaf action %d", action); + } + walbufend += datalen; + + if (action != GIN_SEGMENT_INSERT) + segno++; + } + + /* Pass back the constructed info via *leaf */ + leaf->walinfo = walbufbegin; + leaf->walinfolen = walbufend - walbufbegin; +} + +/* + * Assemble a disassembled posting tree leaf page back to a buffer. + * + * This just updates the target buffer; WAL stuff is caller's responsibility. + * + * NOTE: The segment pointers must not point directly to the same buffer, + * except for segments that have not been modified and whose preceding + * segments have not been modified either. + */ +static void +dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf) +{ + Page page = BufferGetPage(buf); + char *ptr; + int newsize; + bool modified = false; + dlist_iter iter; + int segsize; + + /* + * If the page was in pre-9.4 format before, convert the header, and force + * all segments to be copied to the page whether they were modified or + * not. + */ + if (!GinPageIsCompressed(page)) + { + Assert(leaf->oldformat); + GinPageSetCompressed(page); + GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber; + modified = true; + } + + ptr = (char *) GinDataLeafPageGetPostingList(page); + newsize = 0; + dlist_foreach(iter, &leaf->segments) + { + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur); + + if (seginfo->action != GIN_SEGMENT_UNMODIFIED) + modified = true; + + if (seginfo->action != GIN_SEGMENT_DELETE) + { + segsize = SizeOfGinPostingList(seginfo->seg); + + if (modified) + memcpy(ptr, seginfo->seg, segsize); + + ptr += segsize; + newsize += segsize; + } + } + + Assert(newsize <= GinDataPageMaxDataSize); + GinDataPageSetDataSize(page, newsize); +} + +/* + * Like dataPlaceToPageLeafRecompress, but writes the disassembled leaf + * segments to two pages instead of one. + * + * This is different from the non-split cases in that this does not modify + * the original page directly, but writes to temporary in-memory copies of + * the new left and right pages. + */ +static void +dataPlaceToPageLeafSplit(disassembledLeaf *leaf, + ItemPointerData lbound, ItemPointerData rbound, + Page lpage, Page rpage) +{ + char *ptr; + int segsize; + int lsize; + int rsize; + dlist_node *node; + dlist_node *firstright; + leafSegmentInfo *seginfo; + + /* Initialize temporary pages to hold the new left and right pages */ + GinInitPage(lpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ); + GinInitPage(rpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ); + + /* + * Copy the segments that go to the left page. + * + * XXX: We should skip copying the unmodified part of the left page, like + * we do when recompressing. 
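+	 * That is only a potential optimization, though; copying everything is
+	 * correct, just a little more work than strictly necessary.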
+ */ + lsize = 0; + ptr = (char *) GinDataLeafPageGetPostingList(lpage); + firstright = dlist_next_node(&leaf->segments, leaf->lastleft); + for (node = dlist_head_node(&leaf->segments); + node != firstright; + node = dlist_next_node(&leaf->segments, node)) + { + seginfo = dlist_container(leafSegmentInfo, node, node); + + if (seginfo->action != GIN_SEGMENT_DELETE) + { + segsize = SizeOfGinPostingList(seginfo->seg); + memcpy(ptr, seginfo->seg, segsize); + ptr += segsize; + lsize += segsize; + } + } + Assert(lsize == leaf->lsize); + GinDataPageSetDataSize(lpage, lsize); + *GinDataPageGetRightBound(lpage) = lbound; + + /* Copy the segments that go to the right page */ + ptr = (char *) GinDataLeafPageGetPostingList(rpage); + rsize = 0; + for (node = firstright; + ; + node = dlist_next_node(&leaf->segments, node)) + { + seginfo = dlist_container(leafSegmentInfo, node, node); + + if (seginfo->action != GIN_SEGMENT_DELETE) + { + segsize = SizeOfGinPostingList(seginfo->seg); + memcpy(ptr, seginfo->seg, segsize); + ptr += segsize; + rsize += segsize; + } + + if (!dlist_has_next(&leaf->segments, node)) + break; + } + Assert(rsize == leaf->rsize); + GinDataPageSetDataSize(rpage, rsize); + *GinDataPageGetRightBound(rpage) = rbound; +} + +/* + * Prepare to insert data on an internal data page. + * + * If it will fit, return GPTP_INSERT after doing whatever setup is needed + * before we enter the insertion critical section. *ptp_workspace can be + * set to pass information along to the execPlaceToPage function. + * + * If it won't fit, perform a page split and return two temporary page + * images into *newlpage and *newrpage, with result GPTP_SPLIT. + * + * In neither case should the given page buffer be modified here. + * + * Note: on insertion to an internal node, in addition to inserting the given + * item, the downlink of the existing item at stack->off will be updated to + * point to updateblkno. + */ +static GinPlaceToPageRC +dataBeginPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + void **ptp_workspace, + Page *newlpage, Page *newrpage) +{ + Page page = BufferGetPage(buf); + + /* If it doesn't fit, deal with split case */ + if (GinNonLeafDataPageGetFreeSpace(page) < sizeof(PostingItem)) + { + dataSplitPageInternal(btree, buf, stack, insertdata, updateblkno, + newlpage, newrpage); + return GPTP_SPLIT; + } + + /* Else, we're ready to proceed with insertion */ + return GPTP_INSERT; +} + +/* + * Perform data insertion after beginPlaceToPage has decided it will fit. + * + * This is invoked within a critical section, and XLOG record creation (if + * needed) is already started. The target buffer is registered in slot 0. + */ +static void +dataExecPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + void *ptp_workspace) +{ + Page page = BufferGetPage(buf); + OffsetNumber off = stack->off; + PostingItem *pitem; + + /* Update existing downlink to point to next page (on internal page) */ + pitem = GinDataPageGetPostingItem(page, off); + PostingItemSetBlockNumber(pitem, updateblkno); + + /* Add new item */ + pitem = (PostingItem *) insertdata; + GinDataPageAddPostingItem(page, pitem, off); + + if (RelationNeedsWAL(btree->index) && !btree->isBuild) + { + /* + * This must be static, because it has to survive until XLogInsert, + * and we can't palloc here. Ugly, but the XLogInsert infrastructure + * isn't reentrant anyway. 
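+		 *
+		 * (XLogRegisterBufData only remembers a pointer to the data; the
+		 * record is assembled by the caller's XLogInsert after this
+		 * function has returned, so a local variable would already be out
+		 * of scope, and palloc is disallowed in a critical section.)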
+ */ + static ginxlogInsertDataInternal data; + + data.offset = off; + data.newitem = *pitem; + + XLogRegisterBufData(0, (char *) &data, + sizeof(ginxlogInsertDataInternal)); + } +} + +/* + * Prepare to insert data on a posting-tree data page. + * + * If it will fit, return GPTP_INSERT after doing whatever setup is needed + * before we enter the insertion critical section. *ptp_workspace can be + * set to pass information along to the execPlaceToPage function. + * + * If it won't fit, perform a page split and return two temporary page + * images into *newlpage and *newrpage, with result GPTP_SPLIT. + * + * In neither case should the given page buffer be modified here. + * + * Note: on insertion to an internal node, in addition to inserting the given + * item, the downlink of the existing item at stack->off will be updated to + * point to updateblkno. + * + * Calls relevant function for internal or leaf page because they are handled + * very differently. + */ +static GinPlaceToPageRC +dataBeginPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + void **ptp_workspace, + Page *newlpage, Page *newrpage) +{ + Page page = BufferGetPage(buf); + + Assert(GinPageIsData(page)); + + if (GinPageIsLeaf(page)) + return dataBeginPlaceToPageLeaf(btree, buf, stack, insertdata, + ptp_workspace, + newlpage, newrpage); + else + return dataBeginPlaceToPageInternal(btree, buf, stack, + insertdata, updateblkno, + ptp_workspace, + newlpage, newrpage); +} + +/* + * Perform data insertion after beginPlaceToPage has decided it will fit. + * + * This is invoked within a critical section, and XLOG record creation (if + * needed) is already started. The target buffer is registered in slot 0. + * + * Calls relevant function for internal or leaf page because they are handled + * very differently. + */ +static void +dataExecPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + void *ptp_workspace) +{ + Page page = BufferGetPage(buf); + + if (GinPageIsLeaf(page)) + dataExecPlaceToPageLeaf(btree, buf, stack, insertdata, + ptp_workspace); + else + dataExecPlaceToPageInternal(btree, buf, stack, insertdata, + updateblkno, ptp_workspace); +} + +/* + * Split internal page and insert new data. + * + * Returns new temp pages to *newlpage and *newrpage. + * The original buffer is left untouched. + */ +static void +dataSplitPageInternal(GinBtree btree, Buffer origbuf, + GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + Page *newlpage, Page *newrpage) +{ + Page oldpage = BufferGetPage(origbuf); + OffsetNumber off = stack->off; + int nitems = GinPageGetOpaque(oldpage)->maxoff; + int nleftitems; + int nrightitems; + Size pageSize = PageGetPageSize(oldpage); + ItemPointerData oldbound = *GinDataPageGetRightBound(oldpage); + ItemPointer bound; + Page lpage; + Page rpage; + OffsetNumber separator; + PostingItem allitems[(BLCKSZ / sizeof(PostingItem)) + 1]; + + lpage = PageGetTempPage(oldpage); + rpage = PageGetTempPage(oldpage); + GinInitPage(lpage, GinPageGetOpaque(oldpage)->flags, pageSize); + GinInitPage(rpage, GinPageGetOpaque(oldpage)->flags, pageSize); + + /* + * First construct a new list of PostingItems, which includes all the old + * items, and the new item. 
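+	 * (allitems[] is sized for every PostingItem the old page could hold,
+	 * plus the one being inserted, which is the worst case here.)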
+ */ + memcpy(allitems, GinDataPageGetPostingItem(oldpage, FirstOffsetNumber), + (off - 1) * sizeof(PostingItem)); + + allitems[off - 1] = *((PostingItem *) insertdata); + memcpy(&allitems[off], GinDataPageGetPostingItem(oldpage, off), + (nitems - (off - 1)) * sizeof(PostingItem)); + nitems++; + + /* Update existing downlink to point to next page */ + PostingItemSetBlockNumber(&allitems[off], updateblkno); + + /* + * When creating a new index, fit as many tuples as possible on the left + * page, on the assumption that the table is scanned from beginning to + * end. This packs the index as tight as possible. + */ + if (btree->isBuild && GinPageRightMost(oldpage)) + separator = GinNonLeafDataPageGetFreeSpace(rpage) / sizeof(PostingItem); + else + separator = nitems / 2; + nleftitems = separator; + nrightitems = nitems - separator; + + memcpy(GinDataPageGetPostingItem(lpage, FirstOffsetNumber), + allitems, + nleftitems * sizeof(PostingItem)); + GinPageGetOpaque(lpage)->maxoff = nleftitems; + memcpy(GinDataPageGetPostingItem(rpage, FirstOffsetNumber), + &allitems[separator], + nrightitems * sizeof(PostingItem)); + GinPageGetOpaque(rpage)->maxoff = nrightitems; + + /* + * Also set pd_lower for both pages, like GinDataPageAddPostingItem does. + */ + GinDataPageSetDataSize(lpage, nleftitems * sizeof(PostingItem)); + GinDataPageSetDataSize(rpage, nrightitems * sizeof(PostingItem)); + + /* set up right bound for left page */ + bound = GinDataPageGetRightBound(lpage); + *bound = GinDataPageGetPostingItem(lpage, nleftitems)->key; + + /* set up right bound for right page */ + *GinDataPageGetRightBound(rpage) = oldbound; + + /* return temp pages to caller */ + *newlpage = lpage; + *newrpage = rpage; +} + +/* + * Construct insertion payload for inserting the downlink for given buffer. + */ +static void * +dataPrepareDownlink(GinBtree btree, Buffer lbuf) +{ + PostingItem *pitem = palloc(sizeof(PostingItem)); + Page lpage = BufferGetPage(lbuf); + + PostingItemSetBlockNumber(pitem, BufferGetBlockNumber(lbuf)); + pitem->key = *GinDataPageGetRightBound(lpage); + + return pitem; +} + +/* + * Fills new root by right bound values from child. + * Also called from ginxlog, should not use btree + */ +void +ginDataFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage) +{ + PostingItem li, + ri; + + li.key = *GinDataPageGetRightBound(lpage); + PostingItemSetBlockNumber(&li, lblkno); + GinDataPageAddPostingItem(root, &li, InvalidOffsetNumber); + + ri.key = *GinDataPageGetRightBound(rpage); + PostingItemSetBlockNumber(&ri, rblkno); + GinDataPageAddPostingItem(root, &ri, InvalidOffsetNumber); +} + + +/*** Functions to work with disassembled leaf pages ***/ + +/* + * Disassemble page into a disassembledLeaf struct. + */ +static disassembledLeaf * +disassembleLeaf(Page page) +{ + disassembledLeaf *leaf; + GinPostingList *seg; + Pointer segbegin; + Pointer segend; + + leaf = palloc0(sizeof(disassembledLeaf)); + dlist_init(&leaf->segments); + + if (GinPageIsCompressed(page)) + { + /* + * Create a leafSegmentInfo entry for each segment. 
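+		 *
+		 * The seg pointers initially point straight into the page; they are
+		 * only replaced with palloc'd copies later, if the segment (or one
+		 * before it) ends up being modified.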
+ */ + seg = GinDataLeafPageGetPostingList(page); + segbegin = (Pointer) seg; + segend = segbegin + GinDataLeafPageGetPostingListSize(page); + while ((Pointer) seg < segend) + { + leafSegmentInfo *seginfo = palloc(sizeof(leafSegmentInfo)); + + seginfo->action = GIN_SEGMENT_UNMODIFIED; + seginfo->seg = seg; + seginfo->items = NULL; + seginfo->nitems = 0; + dlist_push_tail(&leaf->segments, &seginfo->node); + + seg = GinNextPostingListSegment(seg); + } + leaf->oldformat = false; + } + else + { + /* + * A pre-9.4 format uncompressed page is represented by a single + * segment, with an array of items. The corner case is uncompressed + * page containing no items, which is represented as no segments. + */ + ItemPointer uncompressed; + int nuncompressed; + leafSegmentInfo *seginfo; + + uncompressed = dataLeafPageGetUncompressed(page, &nuncompressed); + + if (nuncompressed > 0) + { + seginfo = palloc(sizeof(leafSegmentInfo)); + + seginfo->action = GIN_SEGMENT_REPLACE; + seginfo->seg = NULL; + seginfo->items = palloc(nuncompressed * sizeof(ItemPointerData)); + memcpy(seginfo->items, uncompressed, nuncompressed * sizeof(ItemPointerData)); + seginfo->nitems = nuncompressed; + + dlist_push_tail(&leaf->segments, &seginfo->node); + } + + leaf->oldformat = true; + } + + return leaf; +} + +/* + * Distribute newItems to the segments. + * + * Any segments that acquire new items are decoded, and the new items are + * merged with the old items. + * + * Returns true if any new items were added. False means they were all + * duplicates of existing items on the page. + */ +static bool +addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems) +{ + dlist_iter iter; + ItemPointer nextnew = newItems; + int newleft = nNewItems; + bool modified = false; + leafSegmentInfo *newseg; + + /* + * If the page is completely empty, just construct one new segment to hold + * all the new items. + */ + if (dlist_is_empty(&leaf->segments)) + { + newseg = palloc(sizeof(leafSegmentInfo)); + newseg->seg = NULL; + newseg->items = newItems; + newseg->nitems = nNewItems; + newseg->action = GIN_SEGMENT_INSERT; + dlist_push_tail(&leaf->segments, &newseg->node); + return true; + } + + dlist_foreach(iter, &leaf->segments) + { + leafSegmentInfo *cur = (leafSegmentInfo *) dlist_container(leafSegmentInfo, node, iter.cur); + int nthis; + ItemPointer tmpitems; + int ntmpitems; + + /* + * How many of the new items fall into this segment? + */ + if (!dlist_has_next(&leaf->segments, iter.cur)) + nthis = newleft; + else + { + leafSegmentInfo *next; + ItemPointerData next_first; + + next = (leafSegmentInfo *) dlist_container(leafSegmentInfo, node, + dlist_next_node(&leaf->segments, iter.cur)); + if (next->items) + next_first = next->items[0]; + else + { + Assert(next->seg != NULL); + next_first = next->seg->first; + } + + nthis = 0; + while (nthis < newleft && ginCompareItemPointers(&nextnew[nthis], &next_first) < 0) + nthis++; + } + if (nthis == 0) + continue; + + /* Merge the new items with the existing items. */ + if (!cur->items) + cur->items = ginPostingListDecode(cur->seg, &cur->nitems); + + /* + * Fast path for the important special case that we're appending to + * the end of the page: don't let the last segment on the page grow + * larger than the target, create a new segment before that happens. 
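+		 *
+		 * (Segment size is a tradeoff: larger segments pack more items per
+		 * page, while smaller ones keep lookups within the page cheaper,
+		 * because a whole segment has to be decoded to find an item in it.)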
+ */ + if (!dlist_has_next(&leaf->segments, iter.cur) && + ginCompareItemPointers(&cur->items[cur->nitems - 1], &nextnew[0]) < 0 && + cur->seg != NULL && + SizeOfGinPostingList(cur->seg) >= GinPostingListSegmentTargetSize) + { + newseg = palloc(sizeof(leafSegmentInfo)); + newseg->seg = NULL; + newseg->items = nextnew; + newseg->nitems = nthis; + newseg->action = GIN_SEGMENT_INSERT; + dlist_push_tail(&leaf->segments, &newseg->node); + modified = true; + break; + } + + tmpitems = ginMergeItemPointers(cur->items, cur->nitems, + nextnew, nthis, + &ntmpitems); + if (ntmpitems != cur->nitems) + { + /* + * If there are no duplicates, track the added items so that we + * can emit a compact ADDITEMS WAL record later on. (it doesn't + * seem worth re-checking which items were duplicates, if there + * were any) + */ + if (ntmpitems == nthis + cur->nitems && + cur->action == GIN_SEGMENT_UNMODIFIED) + { + cur->action = GIN_SEGMENT_ADDITEMS; + cur->modifieditems = nextnew; + cur->nmodifieditems = nthis; + } + else + cur->action = GIN_SEGMENT_REPLACE; + + cur->items = tmpitems; + cur->nitems = ntmpitems; + cur->seg = NULL; + modified = true; + } + + nextnew += nthis; + newleft -= nthis; + if (newleft == 0) + break; + } + + return modified; +} + +/* + * Recompresses all segments that have been modified. + * + * If not all the items fit on two pages (ie. after split), we store as + * many items as fit, and set *remaining to the first item that didn't fit. + * If all items fit, *remaining is set to invalid. + * + * Returns true if the page has to be split. + */ +static bool +leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining) +{ + int pgused = 0; + bool needsplit = false; + dlist_iter iter; + int segsize; + leafSegmentInfo *nextseg; + int npacked; + bool modified; + dlist_node *cur_node; + dlist_node *next_node; + + ItemPointerSetInvalid(remaining); + + /* + * cannot use dlist_foreach_modify here because we insert adjacent items + * while iterating. + */ + for (cur_node = dlist_head_node(&leaf->segments); + cur_node != NULL; + cur_node = next_node) + { + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, + cur_node); + + if (dlist_has_next(&leaf->segments, cur_node)) + next_node = dlist_next_node(&leaf->segments, cur_node); + else + next_node = NULL; + + /* Compress the posting list, if necessary */ + if (seginfo->action != GIN_SEGMENT_DELETE) + { + if (seginfo->seg == NULL) + { + if (seginfo->nitems > GinPostingListSegmentMaxSize) + npacked = 0; /* no chance that it would fit. */ + else + { + seginfo->seg = ginCompressPostingList(seginfo->items, + seginfo->nitems, + GinPostingListSegmentMaxSize, + &npacked); + } + if (npacked != seginfo->nitems) + { + /* + * Too large. Compress again to the target size, and + * create a new segment to represent the remaining items. + * The new segment is inserted after this one, so it will + * be processed in the next iteration of this loop. + */ + if (seginfo->seg) + pfree(seginfo->seg); + seginfo->seg = ginCompressPostingList(seginfo->items, + seginfo->nitems, + GinPostingListSegmentTargetSize, + &npacked); + if (seginfo->action != GIN_SEGMENT_INSERT) + seginfo->action = GIN_SEGMENT_REPLACE; + + nextseg = palloc(sizeof(leafSegmentInfo)); + nextseg->action = GIN_SEGMENT_INSERT; + nextseg->seg = NULL; + nextseg->items = &seginfo->items[npacked]; + nextseg->nitems = seginfo->nitems - npacked; + next_node = &nextseg->node; + dlist_insert_after(cur_node, next_node); + } + } + + /* + * If the segment is very small, merge it with the next segment. 
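+			 * Merging avoids accumulating lots of tiny segments after
+			 * repeated vacuums; the combined segment is size-checked again
+			 * (and re-split if necessary) when the loop reaches it.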
+ */ + if (SizeOfGinPostingList(seginfo->seg) < GinPostingListSegmentMinSize && next_node) + { + int nmerged; + + nextseg = dlist_container(leafSegmentInfo, node, next_node); + + if (seginfo->items == NULL) + seginfo->items = ginPostingListDecode(seginfo->seg, + &seginfo->nitems); + if (nextseg->items == NULL) + nextseg->items = ginPostingListDecode(nextseg->seg, + &nextseg->nitems); + nextseg->items = + ginMergeItemPointers(seginfo->items, seginfo->nitems, + nextseg->items, nextseg->nitems, + &nmerged); + Assert(nmerged == seginfo->nitems + nextseg->nitems); + nextseg->nitems = nmerged; + nextseg->seg = NULL; + + nextseg->action = GIN_SEGMENT_REPLACE; + nextseg->modifieditems = NULL; + nextseg->nmodifieditems = 0; + + if (seginfo->action == GIN_SEGMENT_INSERT) + { + dlist_delete(cur_node); + continue; + } + else + { + seginfo->action = GIN_SEGMENT_DELETE; + seginfo->seg = NULL; + } + } + + seginfo->items = NULL; + seginfo->nitems = 0; + } + + if (seginfo->action == GIN_SEGMENT_DELETE) + continue; + + /* + * OK, we now have a compressed version of this segment ready for + * copying to the page. Did we exceed the size that fits on one page? + */ + segsize = SizeOfGinPostingList(seginfo->seg); + if (pgused + segsize > GinDataPageMaxDataSize) + { + if (!needsplit) + { + /* switch to right page */ + Assert(pgused > 0); + leaf->lastleft = dlist_prev_node(&leaf->segments, cur_node); + needsplit = true; + leaf->lsize = pgused; + pgused = 0; + } + else + { + /* + * Filled both pages. The last segment we constructed did not + * fit. + */ + *remaining = seginfo->seg->first; + + /* + * remove all segments that did not fit from the list. + */ + while (dlist_has_next(&leaf->segments, cur_node)) + dlist_delete(dlist_next_node(&leaf->segments, cur_node)); + dlist_delete(cur_node); + break; + } + } + + pgused += segsize; + } + + if (!needsplit) + { + leaf->lsize = pgused; + leaf->rsize = 0; + } + else + leaf->rsize = pgused; + + Assert(leaf->lsize <= GinDataPageMaxDataSize); + Assert(leaf->rsize <= GinDataPageMaxDataSize); + + /* + * Make a palloc'd copy of every segment after the first modified one, + * because as we start copying items to the original page, we might + * overwrite an existing segment. + */ + modified = false; + dlist_foreach(iter, &leaf->segments) + { + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, + iter.cur); + + if (!modified && seginfo->action != GIN_SEGMENT_UNMODIFIED) + { + modified = true; + } + else if (modified && seginfo->action == GIN_SEGMENT_UNMODIFIED) + { + GinPostingList *tmp; + + segsize = SizeOfGinPostingList(seginfo->seg); + tmp = palloc(segsize); + memcpy(tmp, seginfo->seg, segsize); + seginfo->seg = tmp; + } + } + + return needsplit; +} + + +/*** Functions that are exported to the rest of the GIN code ***/ + +/* + * Creates new posting tree containing the given TIDs. Returns the page + * number of the root of the new posting tree. + * + * items[] must be in sorted order with no duplicates. + */ +BlockNumber +createPostingTree(Relation index, ItemPointerData *items, uint32 nitems, + GinStatsData *buildStats, Buffer entrybuffer) +{ + BlockNumber blkno; + Buffer buffer; + Page tmppage; + Page page; + Pointer ptr; + int nrootitems; + int rootsize; + bool is_build = (buildStats != NULL); + + /* Construct the new root page in memory first. 
*/ + tmppage = (Page) palloc(BLCKSZ); + GinInitPage(tmppage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ); + GinPageGetOpaque(tmppage)->rightlink = InvalidBlockNumber; + + /* + * Write as many of the items to the root page as fit. In segments of max + * GinPostingListSegmentMaxSize bytes each. + */ + nrootitems = 0; + rootsize = 0; + ptr = (Pointer) GinDataLeafPageGetPostingList(tmppage); + while (nrootitems < nitems) + { + GinPostingList *segment; + int npacked; + int segsize; + + segment = ginCompressPostingList(&items[nrootitems], + nitems - nrootitems, + GinPostingListSegmentMaxSize, + &npacked); + segsize = SizeOfGinPostingList(segment); + if (rootsize + segsize > GinDataPageMaxDataSize) + break; + + memcpy(ptr, segment, segsize); + ptr += segsize; + rootsize += segsize; + nrootitems += npacked; + pfree(segment); + } + GinDataPageSetDataSize(tmppage, rootsize); + + /* + * All set. Get a new physical page, and copy the in-memory page to it. + */ + buffer = GinNewBuffer(index); + page = BufferGetPage(buffer); + blkno = BufferGetBlockNumber(buffer); + + /* + * Copy any predicate locks from the entry tree leaf (containing posting + * list) to the posting tree. + */ + PredicateLockPageSplit(index, BufferGetBlockNumber(entrybuffer), blkno); + + START_CRIT_SECTION(); + + PageRestoreTempPage(tmppage, page); + MarkBufferDirty(buffer); + + if (RelationNeedsWAL(index) && !is_build) + { + XLogRecPtr recptr; + ginxlogCreatePostingTree data; + + data.size = rootsize; + + XLogBeginInsert(); + XLogRegisterData((char *) &data, sizeof(ginxlogCreatePostingTree)); + + XLogRegisterData((char *) GinDataLeafPageGetPostingList(page), + rootsize); + XLogRegisterBuffer(0, buffer, REGBUF_WILL_INIT); + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_PTREE); + PageSetLSN(page, recptr); + } + + UnlockReleaseBuffer(buffer); + + END_CRIT_SECTION(); + + /* During index build, count the newly-added data page */ + if (buildStats) + buildStats->nDataPages++; + + elog(DEBUG2, "created GIN posting tree with %d items", nrootitems); + + /* + * Add any remaining TIDs to the newly-created posting tree. 
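+	 * These go through the regular insertion path; the root stays at the
+	 * same block even if it has to be split, so blkno remains valid.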
+ */ + if (nitems > nrootitems) + { + ginInsertItemPointers(index, blkno, + items + nrootitems, + nitems - nrootitems, + buildStats); + } + + return blkno; +} + +static void +ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno) +{ + memset(btree, 0, sizeof(GinBtreeData)); + + btree->index = index; + btree->rootBlkno = rootBlkno; + + btree->findChildPage = dataLocateItem; + btree->getLeftMostChild = dataGetLeftMostPage; + btree->isMoveRight = dataIsMoveRight; + btree->findItem = NULL; + btree->findChildPtr = dataFindChildPtr; + btree->beginPlaceToPage = dataBeginPlaceToPage; + btree->execPlaceToPage = dataExecPlaceToPage; + btree->fillRoot = ginDataFillRoot; + btree->prepareDownlink = dataPrepareDownlink; + + btree->isData = true; + btree->fullScan = false; + btree->isBuild = false; +} + +/* + * Inserts array of item pointers, may execute several tree scan (very rare) + */ +void +ginInsertItemPointers(Relation index, BlockNumber rootBlkno, + ItemPointerData *items, uint32 nitem, + GinStatsData *buildStats) +{ + GinBtreeData btree; + GinBtreeDataLeafInsertData insertdata; + GinBtreeStack *stack; + + ginPrepareDataScan(&btree, index, rootBlkno); + btree.isBuild = (buildStats != NULL); + insertdata.items = items; + insertdata.nitem = nitem; + insertdata.curitem = 0; + + while (insertdata.curitem < insertdata.nitem) + { + /* search for the leaf page where the first item should go to */ + btree.itemptr = insertdata.items[insertdata.curitem]; + stack = ginFindLeafPage(&btree, false, true, NULL); + + ginInsertValue(&btree, stack, &insertdata, buildStats); + } +} + +/* + * Starts a new scan on a posting tree. + */ +GinBtreeStack * +ginScanBeginPostingTree(GinBtree btree, Relation index, BlockNumber rootBlkno, + Snapshot snapshot) +{ + GinBtreeStack *stack; + + ginPrepareDataScan(btree, index, rootBlkno); + + btree->fullScan = true; + + stack = ginFindLeafPage(btree, true, false, snapshot); + + return stack; +} diff --git a/src/backend/access/gin/ginentrypage.c b/src/backend/access/gin/ginentrypage.c new file mode 100644 index 0000000..29c36bc --- /dev/null +++ b/src/backend/access/gin/ginentrypage.c @@ -0,0 +1,772 @@ +/*------------------------------------------------------------------------- + * + * ginentrypage.c + * routines for handling GIN entry tree pages. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/ginentrypage.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/gin_private.h" +#include "access/ginxlog.h" +#include "access/xloginsert.h" +#include "miscadmin.h" +#include "utils/rel.h" + +static void entrySplitPage(GinBtree btree, Buffer origbuf, + GinBtreeStack *stack, + GinBtreeEntryInsertData *insertData, + BlockNumber updateblkno, + Page *newlpage, Page *newrpage); + +/* + * Form a tuple for entry tree. + * + * If the tuple would be too big to be stored, function throws a suitable + * error if errorTooBig is true, or returns NULL if errorTooBig is false. + * + * See src/backend/access/gin/README for a description of the index tuple + * format that is being built here. We build on the assumption that we + * are making a leaf-level key entry containing a posting list of nipd items. 
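+ * (The posting list, if any, is stored in compressed form immediately after
+ * the key data, at the offset recorded with GinSetPostingOffset.)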
+ * If the caller is actually trying to make a posting-tree entry, non-leaf + * entry, or pending-list entry, it should pass dataSize = 0 and then overwrite + * the t_tid fields as necessary. In any case, 'data' can be NULL to skip + * filling in the posting list; the caller is responsible for filling it + * afterwards if data = NULL and nipd > 0. + */ +IndexTuple +GinFormTuple(GinState *ginstate, + OffsetNumber attnum, Datum key, GinNullCategory category, + Pointer data, Size dataSize, int nipd, + bool errorTooBig) +{ + Datum datums[2]; + bool isnull[2]; + IndexTuple itup; + uint32 newsize; + + /* Build the basic tuple: optional column number, plus key datum */ + if (ginstate->oneCol) + { + datums[0] = key; + isnull[0] = (category != GIN_CAT_NORM_KEY); + } + else + { + datums[0] = UInt16GetDatum(attnum); + isnull[0] = false; + datums[1] = key; + isnull[1] = (category != GIN_CAT_NORM_KEY); + } + + itup = index_form_tuple(ginstate->tupdesc[attnum - 1], datums, isnull); + + /* + * Determine and store offset to the posting list, making sure there is + * room for the category byte if needed. + * + * Note: because index_form_tuple MAXALIGNs the tuple size, there may well + * be some wasted pad space. Is it worth recomputing the data length to + * prevent that? That would also allow us to Assert that the real data + * doesn't overlap the GinNullCategory byte, which this code currently + * takes on faith. + */ + newsize = IndexTupleSize(itup); + + if (IndexTupleHasNulls(itup)) + { + uint32 minsize; + + Assert(category != GIN_CAT_NORM_KEY); + minsize = GinCategoryOffset(itup, ginstate) + sizeof(GinNullCategory); + newsize = Max(newsize, minsize); + } + + newsize = SHORTALIGN(newsize); + + GinSetPostingOffset(itup, newsize); + GinSetNPosting(itup, nipd); + + /* + * Add space needed for posting list, if any. Then check that the tuple + * won't be too big to store. + */ + newsize += dataSize; + + newsize = MAXALIGN(newsize); + + if (newsize > GinMaxItemSize) + { + if (errorTooBig) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("index row size %zu exceeds maximum %zu for index \"%s\"", + (Size) newsize, (Size) GinMaxItemSize, + RelationGetRelationName(ginstate->index)))); + pfree(itup); + return NULL; + } + + /* + * Resize tuple if needed + */ + if (newsize != IndexTupleSize(itup)) + { + itup = repalloc(itup, newsize); + + /* + * PostgreSQL 9.3 and earlier did not clear this new space, so we + * might find uninitialized padding when reading tuples from disk. + */ + memset((char *) itup + IndexTupleSize(itup), + 0, newsize - IndexTupleSize(itup)); + /* set new size in tuple header */ + itup->t_info &= ~INDEX_SIZE_MASK; + itup->t_info |= newsize; + } + + /* + * Copy in the posting list, if provided + */ + if (data) + { + char *ptr = GinGetPosting(itup); + + memcpy(ptr, data, dataSize); + } + + /* + * Insert category byte, if needed + */ + if (category != GIN_CAT_NORM_KEY) + { + Assert(IndexTupleHasNulls(itup)); + GinSetNullCategory(itup, ginstate, category); + } + return itup; +} + +/* + * Read item pointers from leaf entry tuple. + * + * Returns a palloc'd array of ItemPointers. The number of items is returned + * in *nitems. 
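+ *
+ * Even for an empty posting list the result is a valid (zero-length)
+ * palloc'd chunk, so callers can pfree it unconditionally.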
+ */ +ItemPointer +ginReadTuple(GinState *ginstate, OffsetNumber attnum, IndexTuple itup, + int *nitems) +{ + Pointer ptr = GinGetPosting(itup); + int nipd = GinGetNPosting(itup); + ItemPointer ipd; + int ndecoded; + + if (GinItupIsCompressed(itup)) + { + if (nipd > 0) + { + ipd = ginPostingListDecode((GinPostingList *) ptr, &ndecoded); + if (nipd != ndecoded) + elog(ERROR, "number of items mismatch in GIN entry tuple, %d in tuple header, %d decoded", + nipd, ndecoded); + } + else + { + ipd = palloc(0); + } + } + else + { + ipd = (ItemPointer) palloc(sizeof(ItemPointerData) * nipd); + memcpy(ipd, ptr, sizeof(ItemPointerData) * nipd); + } + *nitems = nipd; + return ipd; +} + +/* + * Form a non-leaf entry tuple by copying the key data from the given tuple, + * which can be either a leaf or non-leaf entry tuple. + * + * Any posting list in the source tuple is not copied. The specified child + * block number is inserted into t_tid. + */ +static IndexTuple +GinFormInteriorTuple(IndexTuple itup, Page page, BlockNumber childblk) +{ + IndexTuple nitup; + + if (GinPageIsLeaf(page) && !GinIsPostingTree(itup)) + { + /* Tuple contains a posting list, just copy stuff before that */ + uint32 origsize = GinGetPostingOffset(itup); + + origsize = MAXALIGN(origsize); + nitup = (IndexTuple) palloc(origsize); + memcpy(nitup, itup, origsize); + /* ... be sure to fix the size header field ... */ + nitup->t_info &= ~INDEX_SIZE_MASK; + nitup->t_info |= origsize; + } + else + { + /* Copy the tuple as-is */ + nitup = (IndexTuple) palloc(IndexTupleSize(itup)); + memcpy(nitup, itup, IndexTupleSize(itup)); + } + + /* Now insert the correct downlink */ + GinSetDownlink(nitup, childblk); + + return nitup; +} + +/* + * Entry tree is a "static", ie tuple never deletes from it, + * so we don't use right bound, we use rightmost key instead. + */ +static IndexTuple +getRightMostTuple(Page page) +{ + OffsetNumber maxoff = PageGetMaxOffsetNumber(page); + + return (IndexTuple) PageGetItem(page, PageGetItemId(page, maxoff)); +} + +static bool +entryIsMoveRight(GinBtree btree, Page page) +{ + IndexTuple itup; + OffsetNumber attnum; + Datum key; + GinNullCategory category; + + if (GinPageRightMost(page)) + return false; + + itup = getRightMostTuple(page); + attnum = gintuple_get_attrnum(btree->ginstate, itup); + key = gintuple_get_key(btree->ginstate, itup, &category); + + if (ginCompareAttEntries(btree->ginstate, + btree->entryAttnum, btree->entryKey, btree->entryCategory, + attnum, key, category) > 0) + return true; + + return false; +} + +/* + * Find correct tuple in non-leaf page. 
It supposed that + * page correctly chosen and searching value SHOULD be on page + */ +static BlockNumber +entryLocateEntry(GinBtree btree, GinBtreeStack *stack) +{ + OffsetNumber low, + high, + maxoff; + IndexTuple itup = NULL; + int result; + Page page = BufferGetPage(stack->buffer); + + Assert(!GinPageIsLeaf(page)); + Assert(!GinPageIsData(page)); + + if (btree->fullScan) + { + stack->off = FirstOffsetNumber; + stack->predictNumber *= PageGetMaxOffsetNumber(page); + return btree->getLeftMostChild(btree, page); + } + + low = FirstOffsetNumber; + maxoff = high = PageGetMaxOffsetNumber(page); + Assert(high >= low); + + high++; + + while (high > low) + { + OffsetNumber mid = low + ((high - low) / 2); + + if (mid == maxoff && GinPageRightMost(page)) + { + /* Right infinity */ + result = -1; + } + else + { + OffsetNumber attnum; + Datum key; + GinNullCategory category; + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, mid)); + attnum = gintuple_get_attrnum(btree->ginstate, itup); + key = gintuple_get_key(btree->ginstate, itup, &category); + result = ginCompareAttEntries(btree->ginstate, + btree->entryAttnum, + btree->entryKey, + btree->entryCategory, + attnum, key, category); + } + + if (result == 0) + { + stack->off = mid; + Assert(GinGetDownlink(itup) != GIN_ROOT_BLKNO); + return GinGetDownlink(itup); + } + else if (result > 0) + low = mid + 1; + else + high = mid; + } + + Assert(high >= FirstOffsetNumber && high <= maxoff); + + stack->off = high; + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, high)); + Assert(GinGetDownlink(itup) != GIN_ROOT_BLKNO); + return GinGetDownlink(itup); +} + +/* + * Searches correct position for value on leaf page. + * Page should be correctly chosen. + * Returns true if value found on page. + */ +static bool +entryLocateLeafEntry(GinBtree btree, GinBtreeStack *stack) +{ + Page page = BufferGetPage(stack->buffer); + OffsetNumber low, + high; + + Assert(GinPageIsLeaf(page)); + Assert(!GinPageIsData(page)); + + if (btree->fullScan) + { + stack->off = FirstOffsetNumber; + return true; + } + + low = FirstOffsetNumber; + high = PageGetMaxOffsetNumber(page); + + if (high < low) + { + stack->off = FirstOffsetNumber; + return false; + } + + high++; + + while (high > low) + { + OffsetNumber mid = low + ((high - low) / 2); + IndexTuple itup; + OffsetNumber attnum; + Datum key; + GinNullCategory category; + int result; + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, mid)); + attnum = gintuple_get_attrnum(btree->ginstate, itup); + key = gintuple_get_key(btree->ginstate, itup, &category); + result = ginCompareAttEntries(btree->ginstate, + btree->entryAttnum, + btree->entryKey, + btree->entryCategory, + attnum, key, category); + if (result == 0) + { + stack->off = mid; + return true; + } + else if (result > 0) + low = mid + 1; + else + high = mid; + } + + stack->off = high; + return false; +} + +static OffsetNumber +entryFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber storedOff) +{ + OffsetNumber i, + maxoff = PageGetMaxOffsetNumber(page); + IndexTuple itup; + + Assert(!GinPageIsLeaf(page)); + Assert(!GinPageIsData(page)); + + /* if page isn't changed, we returns storedOff */ + if (storedOff >= FirstOffsetNumber && storedOff <= maxoff) + { + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, storedOff)); + if (GinGetDownlink(itup) == blkno) + return storedOff; + + /* + * we hope, that needed pointer goes to right. 
It's true if there + * wasn't a deletion + */ + for (i = storedOff + 1; i <= maxoff; i++) + { + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); + if (GinGetDownlink(itup) == blkno) + return i; + } + maxoff = storedOff - 1; + } + + /* last chance */ + for (i = FirstOffsetNumber; i <= maxoff; i++) + { + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); + if (GinGetDownlink(itup) == blkno) + return i; + } + + return InvalidOffsetNumber; +} + +static BlockNumber +entryGetLeftMostPage(GinBtree btree, Page page) +{ + IndexTuple itup; + + Assert(!GinPageIsLeaf(page)); + Assert(!GinPageIsData(page)); + Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber); + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, FirstOffsetNumber)); + return GinGetDownlink(itup); +} + +static bool +entryIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off, + GinBtreeEntryInsertData *insertData) +{ + Size releasedsz = 0; + Size addedsz; + Page page = BufferGetPage(buf); + + Assert(insertData->entry); + Assert(!GinPageIsData(page)); + + if (insertData->isDelete) + { + IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off)); + + releasedsz = MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData); + } + + addedsz = MAXALIGN(IndexTupleSize(insertData->entry)) + sizeof(ItemIdData); + + if (PageGetFreeSpace(page) + releasedsz >= addedsz) + return true; + + return false; +} + +/* + * Delete tuple on leaf page if tuples existed and we + * should update it, update old child blkno to new right page + * if child split occurred + */ +static void +entryPreparePage(GinBtree btree, Page page, OffsetNumber off, + GinBtreeEntryInsertData *insertData, BlockNumber updateblkno) +{ + Assert(insertData->entry); + Assert(!GinPageIsData(page)); + + if (insertData->isDelete) + { + Assert(GinPageIsLeaf(page)); + PageIndexTupleDelete(page, off); + } + + if (!GinPageIsLeaf(page) && updateblkno != InvalidBlockNumber) + { + IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off)); + + GinSetDownlink(itup, updateblkno); + } +} + +/* + * Prepare to insert data on an entry page. + * + * If it will fit, return GPTP_INSERT after doing whatever setup is needed + * before we enter the insertion critical section. *ptp_workspace can be + * set to pass information along to the execPlaceToPage function. + * + * If it won't fit, perform a page split and return two temporary page + * images into *newlpage and *newrpage, with result GPTP_SPLIT. + * + * In neither case should the given page buffer be modified here. + * + * Note: on insertion to an internal node, in addition to inserting the given + * item, the downlink of the existing item at stack->off will be updated to + * point to updateblkno. + */ +static GinPlaceToPageRC +entryBeginPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertPayload, BlockNumber updateblkno, + void **ptp_workspace, + Page *newlpage, Page *newrpage) +{ + GinBtreeEntryInsertData *insertData = insertPayload; + OffsetNumber off = stack->off; + + /* If it doesn't fit, deal with split case */ + if (!entryIsEnoughSpace(btree, buf, off, insertData)) + { + entrySplitPage(btree, buf, stack, insertData, updateblkno, + newlpage, newrpage); + return GPTP_SPLIT; + } + + /* Else, we're ready to proceed with insertion */ + return GPTP_INSERT; +} + +/* + * Perform data insertion after beginPlaceToPage has decided it will fit. + * + * This is invoked within a critical section, and XLOG record creation (if + * needed) is already started. 
The target buffer is registered in slot 0. + */ +static void +entryExecPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertPayload, BlockNumber updateblkno, + void *ptp_workspace) +{ + GinBtreeEntryInsertData *insertData = insertPayload; + Page page = BufferGetPage(buf); + OffsetNumber off = stack->off; + OffsetNumber placed; + + entryPreparePage(btree, page, off, insertData, updateblkno); + + placed = PageAddItem(page, + (Item) insertData->entry, + IndexTupleSize(insertData->entry), + off, false, false); + if (placed != off) + elog(ERROR, "failed to add item to index page in \"%s\"", + RelationGetRelationName(btree->index)); + + if (RelationNeedsWAL(btree->index) && !btree->isBuild) + { + /* + * This must be static, because it has to survive until XLogInsert, + * and we can't palloc here. Ugly, but the XLogInsert infrastructure + * isn't reentrant anyway. + */ + static ginxlogInsertEntry data; + + data.isDelete = insertData->isDelete; + data.offset = off; + + XLogRegisterBufData(0, (char *) &data, + offsetof(ginxlogInsertEntry, tuple)); + XLogRegisterBufData(0, (char *) insertData->entry, + IndexTupleSize(insertData->entry)); + } +} + +/* + * Split entry page and insert new data. + * + * Returns new temp pages to *newlpage and *newrpage. + * The original buffer is left untouched. + */ +static void +entrySplitPage(GinBtree btree, Buffer origbuf, + GinBtreeStack *stack, + GinBtreeEntryInsertData *insertData, + BlockNumber updateblkno, + Page *newlpage, Page *newrpage) +{ + OffsetNumber off = stack->off; + OffsetNumber i, + maxoff, + separator = InvalidOffsetNumber; + Size totalsize = 0; + Size lsize = 0, + size; + char *ptr; + IndexTuple itup; + Page page; + Page lpage = PageGetTempPageCopy(BufferGetPage(origbuf)); + Page rpage = PageGetTempPageCopy(BufferGetPage(origbuf)); + Size pageSize = PageGetPageSize(lpage); + PGAlignedBlock tupstore[2]; /* could need 2 pages' worth of tuples */ + + entryPreparePage(btree, lpage, off, insertData, updateblkno); + + /* + * First, append all the existing tuples and the new tuple we're inserting + * one after another in a temporary workspace. + */ + maxoff = PageGetMaxOffsetNumber(lpage); + ptr = tupstore[0].data; + for (i = FirstOffsetNumber; i <= maxoff; i++) + { + if (i == off) + { + size = MAXALIGN(IndexTupleSize(insertData->entry)); + memcpy(ptr, insertData->entry, size); + ptr += size; + totalsize += size + sizeof(ItemIdData); + } + + itup = (IndexTuple) PageGetItem(lpage, PageGetItemId(lpage, i)); + size = MAXALIGN(IndexTupleSize(itup)); + memcpy(ptr, itup, size); + ptr += size; + totalsize += size + sizeof(ItemIdData); + } + + if (off == maxoff + 1) + { + size = MAXALIGN(IndexTupleSize(insertData->entry)); + memcpy(ptr, insertData->entry, size); + ptr += size; + totalsize += size + sizeof(ItemIdData); + } + + /* + * Initialize the left and right pages, and copy all the tuples back to + * them. + */ + GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize); + GinInitPage(lpage, GinPageGetOpaque(rpage)->flags, pageSize); + + ptr = tupstore[0].data; + maxoff++; + lsize = 0; + + page = lpage; + for (i = FirstOffsetNumber; i <= maxoff; i++) + { + itup = (IndexTuple) ptr; + + /* + * Decide where to split. We try to equalize the pages' total data + * size, not number of tuples. 
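+		 *
+		 * (Each tuple is charged its MAXALIGNed size plus one ItemIdData,
+		 * the same space it will consume once PageAddItem places it, so
+		 * the two halves should come out roughly balanced on disk.)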
+ */ + if (lsize > totalsize / 2) + { + if (separator == InvalidOffsetNumber) + separator = i - 1; + page = rpage; + } + else + { + lsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData); + } + + if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add item to index page in \"%s\"", + RelationGetRelationName(btree->index)); + ptr += MAXALIGN(IndexTupleSize(itup)); + } + + /* return temp pages to caller */ + *newlpage = lpage; + *newrpage = rpage; +} + +/* + * Construct insertion payload for inserting the downlink for given buffer. + */ +static void * +entryPrepareDownlink(GinBtree btree, Buffer lbuf) +{ + GinBtreeEntryInsertData *insertData; + Page lpage = BufferGetPage(lbuf); + BlockNumber lblkno = BufferGetBlockNumber(lbuf); + IndexTuple itup; + + itup = getRightMostTuple(lpage); + + insertData = palloc(sizeof(GinBtreeEntryInsertData)); + insertData->entry = GinFormInteriorTuple(itup, lpage, lblkno); + insertData->isDelete = false; + + return insertData; +} + +/* + * Fills new root by rightest values from child. + * Also called from ginxlog, should not use btree + */ +void +ginEntryFillRoot(GinBtree btree, Page root, + BlockNumber lblkno, Page lpage, + BlockNumber rblkno, Page rpage) +{ + IndexTuple itup; + + itup = GinFormInteriorTuple(getRightMostTuple(lpage), lpage, lblkno); + if (PageAddItem(root, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add item to index root page"); + pfree(itup); + + itup = GinFormInteriorTuple(getRightMostTuple(rpage), rpage, rblkno); + if (PageAddItem(root, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add item to index root page"); + pfree(itup); +} + +/* + * Set up GinBtree for entry page access + * + * Note: during WAL recovery, there may be no valid data in ginstate + * other than a faked-up Relation pointer; the key datum is bogus too. + */ +void +ginPrepareEntryScan(GinBtree btree, OffsetNumber attnum, + Datum key, GinNullCategory category, + GinState *ginstate) +{ + memset(btree, 0, sizeof(GinBtreeData)); + + btree->index = ginstate->index; + btree->rootBlkno = GIN_ROOT_BLKNO; + btree->ginstate = ginstate; + + btree->findChildPage = entryLocateEntry; + btree->getLeftMostChild = entryGetLeftMostPage; + btree->isMoveRight = entryIsMoveRight; + btree->findItem = entryLocateLeafEntry; + btree->findChildPtr = entryFindChildPtr; + btree->beginPlaceToPage = entryBeginPlaceToPage; + btree->execPlaceToPage = entryExecPlaceToPage; + btree->fillRoot = ginEntryFillRoot; + btree->prepareDownlink = entryPrepareDownlink; + + btree->isData = false; + btree->fullScan = false; + btree->isBuild = false; + + btree->entryAttnum = attnum; + btree->entryKey = key; + btree->entryCategory = category; +} diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c new file mode 100644 index 0000000..e0d9940 --- /dev/null +++ b/src/backend/access/gin/ginfast.c @@ -0,0 +1,1068 @@ +/*------------------------------------------------------------------------- + * + * ginfast.c + * Fast insert routines for the Postgres inverted index access method. + * Pending entries are stored in linear list of pages. Later on + * (typically during VACUUM), ginInsertCleanup() will be invoked to + * transfer pending entries into the regular index structure. This + * wins because bulk insertion is much more efficient than retail. 
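+ *	  The downside is that a search must also scan the pending list, so
+ *	  the list is kept bounded by gin_pending_list_limit and is merged
+ *	  into the main structure by ginInsertCleanup().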
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/ginfast.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/gin_private.h" +#include "access/ginxlog.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "catalog/pg_am.h" +#include "commands/vacuum.h" +#include "miscadmin.h" +#include "port/pg_bitutils.h" +#include "postmaster/autovacuum.h" +#include "storage/indexfsm.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "utils/acl.h" +#include "utils/builtins.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +/* GUC parameter */ +int gin_pending_list_limit = 0; + +#define GIN_PAGE_FREESIZE \ + ( BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - MAXALIGN(sizeof(GinPageOpaqueData)) ) + +typedef struct KeyArray +{ + Datum *keys; /* expansible array */ + GinNullCategory *categories; /* another expansible array */ + int32 nvalues; /* current number of valid entries */ + int32 maxvalues; /* allocated size of arrays */ +} KeyArray; + + +/* + * Build a pending-list page from the given array of tuples, and write it out. + * + * Returns amount of free space left on the page. + */ +static int32 +writeListPage(Relation index, Buffer buffer, + IndexTuple *tuples, int32 ntuples, BlockNumber rightlink) +{ + Page page = BufferGetPage(buffer); + int32 i, + freesize, + size = 0; + OffsetNumber l, + off; + PGAlignedBlock workspace; + char *ptr; + + START_CRIT_SECTION(); + + GinInitBuffer(buffer, GIN_LIST); + + off = FirstOffsetNumber; + ptr = workspace.data; + + for (i = 0; i < ntuples; i++) + { + int this_size = IndexTupleSize(tuples[i]); + + memcpy(ptr, tuples[i], this_size); + ptr += this_size; + size += this_size; + + l = PageAddItem(page, (Item) tuples[i], this_size, off, false, false); + + if (l == InvalidOffsetNumber) + elog(ERROR, "failed to add item to index page in \"%s\"", + RelationGetRelationName(index)); + + off++; + } + + Assert(size <= BLCKSZ); /* else we overran workspace */ + + GinPageGetOpaque(page)->rightlink = rightlink; + + /* + * tail page may contain only whole row(s) or final part of row placed on + * previous pages (a "row" here meaning all the index tuples generated for + * one heap tuple) + */ + if (rightlink == InvalidBlockNumber) + { + GinPageSetFullRow(page); + GinPageGetOpaque(page)->maxoff = 1; + } + else + { + GinPageGetOpaque(page)->maxoff = 0; + } + + MarkBufferDirty(buffer); + + if (RelationNeedsWAL(index)) + { + ginxlogInsertListPage data; + XLogRecPtr recptr; + + data.rightlink = rightlink; + data.ntuples = ntuples; + + XLogBeginInsert(); + XLogRegisterData((char *) &data, sizeof(ginxlogInsertListPage)); + + XLogRegisterBuffer(0, buffer, REGBUF_WILL_INIT); + XLogRegisterBufData(0, workspace.data, size); + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT_LISTPAGE); + PageSetLSN(page, recptr); + } + + /* get free space before releasing buffer */ + freesize = PageGetExactFreeSpace(page); + + UnlockReleaseBuffer(buffer); + + END_CRIT_SECTION(); + + return freesize; +} + +static void +makeSublist(Relation index, IndexTuple *tuples, int32 ntuples, + GinMetaPageData *res) +{ + Buffer curBuffer = InvalidBuffer; + Buffer prevBuffer = InvalidBuffer; + int i, + size = 0, + tupsize; + int startTuple = 0; + + Assert(ntuples > 0); + + /* + * Split tuples into pages + */ + for (i = 0; i < ntuples; i++) + { + if 
(curBuffer == InvalidBuffer) + { + curBuffer = GinNewBuffer(index); + + if (prevBuffer != InvalidBuffer) + { + res->nPendingPages++; + writeListPage(index, prevBuffer, + tuples + startTuple, + i - startTuple, + BufferGetBlockNumber(curBuffer)); + } + else + { + res->head = BufferGetBlockNumber(curBuffer); + } + + prevBuffer = curBuffer; + startTuple = i; + size = 0; + } + + tupsize = MAXALIGN(IndexTupleSize(tuples[i])) + sizeof(ItemIdData); + + if (size + tupsize > GinListPageSize) + { + /* won't fit, force a new page and reprocess */ + i--; + curBuffer = InvalidBuffer; + } + else + { + size += tupsize; + } + } + + /* + * Write last page + */ + res->tail = BufferGetBlockNumber(curBuffer); + res->tailFreeSize = writeListPage(index, curBuffer, + tuples + startTuple, + ntuples - startTuple, + InvalidBlockNumber); + res->nPendingPages++; + /* that was only one heap tuple */ + res->nPendingHeapTuples = 1; +} + +/* + * Write the index tuples contained in *collector into the index's + * pending list. + * + * Function guarantees that all these tuples will be inserted consecutively, + * preserving order + */ +void +ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) +{ + Relation index = ginstate->index; + Buffer metabuffer; + Page metapage; + GinMetaPageData *metadata = NULL; + Buffer buffer = InvalidBuffer; + Page page = NULL; + ginxlogUpdateMeta data; + bool separateList = false; + bool needCleanup = false; + int cleanupSize; + bool needWal; + + if (collector->ntuples == 0) + return; + + needWal = RelationNeedsWAL(index); + + data.node = index->rd_node; + data.ntuples = 0; + data.newRightlink = data.prevTail = InvalidBlockNumber; + + metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO); + metapage = BufferGetPage(metabuffer); + + /* + * An insertion to the pending list could logically belong anywhere in the + * tree, so it conflicts with all serializable scans. All scans acquire a + * predicate lock on the metabuffer to represent that. 
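+	 * That is why the check below is made against GIN_METAPAGE_BLKNO rather
+	 * than against the page that physically receives the tuples.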
+ */ + CheckForSerializableConflictIn(index, NULL, GIN_METAPAGE_BLKNO); + + if (collector->sumsize + collector->ntuples * sizeof(ItemIdData) > GinListPageSize) + { + /* + * Total size is greater than one page => make sublist + */ + separateList = true; + } + else + { + LockBuffer(metabuffer, GIN_EXCLUSIVE); + metadata = GinPageGetMeta(metapage); + + if (metadata->head == InvalidBlockNumber || + collector->sumsize + collector->ntuples * sizeof(ItemIdData) > metadata->tailFreeSize) + { + /* + * Pending list is empty or total size is greater than freespace + * on tail page => make sublist + * + * We unlock metabuffer to keep high concurrency + */ + separateList = true; + LockBuffer(metabuffer, GIN_UNLOCK); + } + } + + if (separateList) + { + /* + * We should make sublist separately and append it to the tail + */ + GinMetaPageData sublist; + + memset(&sublist, 0, sizeof(GinMetaPageData)); + makeSublist(index, collector->tuples, collector->ntuples, &sublist); + + if (needWal) + XLogBeginInsert(); + + /* + * metapage was unlocked, see above + */ + LockBuffer(metabuffer, GIN_EXCLUSIVE); + metadata = GinPageGetMeta(metapage); + + if (metadata->head == InvalidBlockNumber) + { + /* + * Main list is empty, so just insert sublist as main list + */ + START_CRIT_SECTION(); + + metadata->head = sublist.head; + metadata->tail = sublist.tail; + metadata->tailFreeSize = sublist.tailFreeSize; + + metadata->nPendingPages = sublist.nPendingPages; + metadata->nPendingHeapTuples = sublist.nPendingHeapTuples; + } + else + { + /* + * Merge lists + */ + data.prevTail = metadata->tail; + data.newRightlink = sublist.head; + + buffer = ReadBuffer(index, metadata->tail); + LockBuffer(buffer, GIN_EXCLUSIVE); + page = BufferGetPage(buffer); + + Assert(GinPageGetOpaque(page)->rightlink == InvalidBlockNumber); + + START_CRIT_SECTION(); + + GinPageGetOpaque(page)->rightlink = sublist.head; + + MarkBufferDirty(buffer); + + metadata->tail = sublist.tail; + metadata->tailFreeSize = sublist.tailFreeSize; + + metadata->nPendingPages += sublist.nPendingPages; + metadata->nPendingHeapTuples += sublist.nPendingHeapTuples; + + if (needWal) + XLogRegisterBuffer(1, buffer, REGBUF_STANDARD); + } + } + else + { + /* + * Insert into tail page. Metapage is already locked + */ + OffsetNumber l, + off; + int i, + tupsize; + char *ptr; + char *collectordata; + + buffer = ReadBuffer(index, metadata->tail); + LockBuffer(buffer, GIN_EXCLUSIVE); + page = BufferGetPage(buffer); + + off = (PageIsEmpty(page)) ? 
FirstOffsetNumber : + OffsetNumberNext(PageGetMaxOffsetNumber(page)); + + collectordata = ptr = (char *) palloc(collector->sumsize); + + data.ntuples = collector->ntuples; + + if (needWal) + XLogBeginInsert(); + + START_CRIT_SECTION(); + + /* + * Increase counter of heap tuples + */ + Assert(GinPageGetOpaque(page)->maxoff <= metadata->nPendingHeapTuples); + GinPageGetOpaque(page)->maxoff++; + metadata->nPendingHeapTuples++; + + for (i = 0; i < collector->ntuples; i++) + { + tupsize = IndexTupleSize(collector->tuples[i]); + l = PageAddItem(page, (Item) collector->tuples[i], tupsize, off, false, false); + + if (l == InvalidOffsetNumber) + elog(ERROR, "failed to add item to index page in \"%s\"", + RelationGetRelationName(index)); + + memcpy(ptr, collector->tuples[i], tupsize); + ptr += tupsize; + + off++; + } + + Assert((ptr - collectordata) <= collector->sumsize); + if (needWal) + { + XLogRegisterBuffer(1, buffer, REGBUF_STANDARD); + XLogRegisterBufData(1, collectordata, collector->sumsize); + } + + metadata->tailFreeSize = PageGetExactFreeSpace(page); + + MarkBufferDirty(buffer); + } + + /* + * Set pd_lower just past the end of the metadata. This is essential, + * because without doing so, metadata will be lost if xlog.c compresses + * the page. (We must do this here because pre-v11 versions of PG did not + * set the metapage's pd_lower correctly, so a pg_upgraded index might + * contain the wrong value.) + */ + ((PageHeader) metapage)->pd_lower = + ((char *) metadata + sizeof(GinMetaPageData)) - (char *) metapage; + + /* + * Write metabuffer, make xlog entry + */ + MarkBufferDirty(metabuffer); + + if (needWal) + { + XLogRecPtr recptr; + + memcpy(&data.metadata, metadata, sizeof(GinMetaPageData)); + + XLogRegisterBuffer(0, metabuffer, REGBUF_WILL_INIT | REGBUF_STANDARD); + XLogRegisterData((char *) &data, sizeof(ginxlogUpdateMeta)); + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_UPDATE_META_PAGE); + PageSetLSN(metapage, recptr); + + if (buffer != InvalidBuffer) + { + PageSetLSN(page, recptr); + } + } + + if (buffer != InvalidBuffer) + UnlockReleaseBuffer(buffer); + + /* + * Force pending list cleanup when it becomes too long. And, + * ginInsertCleanup could take significant amount of time, so we prefer to + * call it when it can do all the work in a single collection cycle. In + * non-vacuum mode, it shouldn't require maintenance_work_mem, so fire it + * while pending list is still small enough to fit into + * gin_pending_list_limit. + * + * ginInsertCleanup() should not be called inside our CRIT_SECTION. + */ + cleanupSize = GinGetPendingListCleanupSize(index); + if (metadata->nPendingPages * GIN_PAGE_FREESIZE > cleanupSize * 1024L) + needCleanup = true; + + UnlockReleaseBuffer(metabuffer); + + END_CRIT_SECTION(); + + /* + * Since it could contend with concurrent cleanup process we cleanup + * pending list not forcibly. + */ + if (needCleanup) + ginInsertCleanup(ginstate, false, true, false, NULL); +} + +/* + * Create temporary index tuples for a single indexable item (one index column + * for the heap tuple specified by ht_ctid), and append them to the array + * in *collector. They will subsequently be written out using + * ginHeapTupleFastInsert. Note that to guarantee consistent state, all + * temp tuples for a given heap tuple must be written in one call to + * ginHeapTupleFastInsert. 
+ */ +void +ginHeapTupleFastCollect(GinState *ginstate, + GinTupleCollector *collector, + OffsetNumber attnum, Datum value, bool isNull, + ItemPointer ht_ctid) +{ + Datum *entries; + GinNullCategory *categories; + int32 i, + nentries; + + /* + * Extract the key values that need to be inserted in the index + */ + entries = ginExtractEntries(ginstate, attnum, value, isNull, + &nentries, &categories); + + /* + * Protect against integer overflow in allocation calculations + */ + if (nentries < 0 || + collector->ntuples + nentries > MaxAllocSize / sizeof(IndexTuple)) + elog(ERROR, "too many entries for GIN index"); + + /* + * Allocate/reallocate memory for storing collected tuples + */ + if (collector->tuples == NULL) + { + /* + * Determine the number of elements to allocate in the tuples array + * initially. Make it a power of 2 to avoid wasting memory when + * resizing (since palloc likes powers of 2). + */ + collector->lentuples = pg_nextpower2_32(Max(16, nentries)); + collector->tuples = (IndexTuple *) palloc(sizeof(IndexTuple) * collector->lentuples); + } + else if (collector->lentuples < collector->ntuples + nentries) + { + /* + * Advance lentuples to the next suitable power of 2. This won't + * overflow, though we could get to a value that exceeds + * MaxAllocSize/sizeof(IndexTuple), causing an error in repalloc. + */ + collector->lentuples = pg_nextpower2_32(collector->ntuples + nentries); + collector->tuples = (IndexTuple *) repalloc(collector->tuples, + sizeof(IndexTuple) * collector->lentuples); + } + + /* + * Build an index tuple for each key value, and add to array. In pending + * tuples we just stick the heap TID into t_tid. + */ + for (i = 0; i < nentries; i++) + { + IndexTuple itup; + + itup = GinFormTuple(ginstate, attnum, entries[i], categories[i], + NULL, 0, 0, true); + itup->t_tid = *ht_ctid; + collector->tuples[collector->ntuples++] = itup; + collector->sumsize += IndexTupleSize(itup); + } +} + +/* + * Deletes pending list pages up to (not including) newHead page. + * If newHead == InvalidBlockNumber then function drops the whole list. + * + * metapage is pinned and exclusive-locked throughout this function. + */ +static void +shiftList(Relation index, Buffer metabuffer, BlockNumber newHead, + bool fill_fsm, IndexBulkDeleteResult *stats) +{ + Page metapage; + GinMetaPageData *metadata; + BlockNumber blknoToDelete; + + metapage = BufferGetPage(metabuffer); + metadata = GinPageGetMeta(metapage); + blknoToDelete = metadata->head; + + do + { + Page page; + int i; + int64 nDeletedHeapTuples = 0; + ginxlogDeleteListPages data; + Buffer buffers[GIN_NDELETE_AT_ONCE]; + BlockNumber freespace[GIN_NDELETE_AT_ONCE]; + + data.ndeleted = 0; + while (data.ndeleted < GIN_NDELETE_AT_ONCE && blknoToDelete != newHead) + { + freespace[data.ndeleted] = blknoToDelete; + buffers[data.ndeleted] = ReadBuffer(index, blknoToDelete); + LockBuffer(buffers[data.ndeleted], GIN_EXCLUSIVE); + page = BufferGetPage(buffers[data.ndeleted]); + + data.ndeleted++; + + Assert(!GinPageIsDeleted(page)); + + nDeletedHeapTuples += GinPageGetOpaque(page)->maxoff; + blknoToDelete = GinPageGetOpaque(page)->rightlink; + } + + if (stats) + stats->pages_deleted += data.ndeleted; + + /* + * This operation touches an unusually large number of pages, so + * prepare the XLogInsert machinery for that before entering the + * critical section. 
+ */ + if (RelationNeedsWAL(index)) + XLogEnsureRecordSpace(data.ndeleted, 0); + + START_CRIT_SECTION(); + + metadata->head = blknoToDelete; + + Assert(metadata->nPendingPages >= data.ndeleted); + metadata->nPendingPages -= data.ndeleted; + Assert(metadata->nPendingHeapTuples >= nDeletedHeapTuples); + metadata->nPendingHeapTuples -= nDeletedHeapTuples; + + if (blknoToDelete == InvalidBlockNumber) + { + metadata->tail = InvalidBlockNumber; + metadata->tailFreeSize = 0; + metadata->nPendingPages = 0; + metadata->nPendingHeapTuples = 0; + } + + /* + * Set pd_lower just past the end of the metadata. This is essential, + * because without doing so, metadata will be lost if xlog.c + * compresses the page. (We must do this here because pre-v11 + * versions of PG did not set the metapage's pd_lower correctly, so a + * pg_upgraded index might contain the wrong value.) + */ + ((PageHeader) metapage)->pd_lower = + ((char *) metadata + sizeof(GinMetaPageData)) - (char *) metapage; + + MarkBufferDirty(metabuffer); + + for (i = 0; i < data.ndeleted; i++) + { + page = BufferGetPage(buffers[i]); + GinPageGetOpaque(page)->flags = GIN_DELETED; + MarkBufferDirty(buffers[i]); + } + + if (RelationNeedsWAL(index)) + { + XLogRecPtr recptr; + + XLogBeginInsert(); + XLogRegisterBuffer(0, metabuffer, + REGBUF_WILL_INIT | REGBUF_STANDARD); + for (i = 0; i < data.ndeleted; i++) + XLogRegisterBuffer(i + 1, buffers[i], REGBUF_WILL_INIT); + + memcpy(&data.metadata, metadata, sizeof(GinMetaPageData)); + + XLogRegisterData((char *) &data, + sizeof(ginxlogDeleteListPages)); + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_LISTPAGE); + PageSetLSN(metapage, recptr); + + for (i = 0; i < data.ndeleted; i++) + { + page = BufferGetPage(buffers[i]); + PageSetLSN(page, recptr); + } + } + + for (i = 0; i < data.ndeleted; i++) + UnlockReleaseBuffer(buffers[i]); + + END_CRIT_SECTION(); + + for (i = 0; fill_fsm && i < data.ndeleted; i++) + RecordFreeIndexPage(index, freespace[i]); + + } while (blknoToDelete != newHead); +} + +/* Initialize empty KeyArray */ +static void +initKeyArray(KeyArray *keys, int32 maxvalues) +{ + keys->keys = (Datum *) palloc(sizeof(Datum) * maxvalues); + keys->categories = (GinNullCategory *) + palloc(sizeof(GinNullCategory) * maxvalues); + keys->nvalues = 0; + keys->maxvalues = maxvalues; +} + +/* Add datum to KeyArray, resizing if needed */ +static void +addDatum(KeyArray *keys, Datum datum, GinNullCategory category) +{ + if (keys->nvalues >= keys->maxvalues) + { + keys->maxvalues *= 2; + keys->keys = (Datum *) + repalloc(keys->keys, sizeof(Datum) * keys->maxvalues); + keys->categories = (GinNullCategory *) + repalloc(keys->categories, sizeof(GinNullCategory) * keys->maxvalues); + } + + keys->keys[keys->nvalues] = datum; + keys->categories[keys->nvalues] = category; + keys->nvalues++; +} + +/* + * Collect data from a pending-list page in preparation for insertion into + * the main index. + * + * Go through all tuples >= startoff on page and collect values in accum + * + * Note that ka is just workspace --- it does not carry any state across + * calls. 
+ */ +static void +processPendingPage(BuildAccumulator *accum, KeyArray *ka, + Page page, OffsetNumber startoff) +{ + ItemPointerData heapptr; + OffsetNumber i, + maxoff; + OffsetNumber attrnum; + + /* reset *ka to empty */ + ka->nvalues = 0; + + maxoff = PageGetMaxOffsetNumber(page); + Assert(maxoff >= FirstOffsetNumber); + ItemPointerSetInvalid(&heapptr); + attrnum = 0; + + for (i = startoff; i <= maxoff; i = OffsetNumberNext(i)) + { + IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); + OffsetNumber curattnum; + Datum curkey; + GinNullCategory curcategory; + + /* Check for change of heap TID or attnum */ + curattnum = gintuple_get_attrnum(accum->ginstate, itup); + + if (!ItemPointerIsValid(&heapptr)) + { + heapptr = itup->t_tid; + attrnum = curattnum; + } + else if (!(ItemPointerEquals(&heapptr, &itup->t_tid) && + curattnum == attrnum)) + { + /* + * ginInsertBAEntries can insert several datums per call, but only + * for one heap tuple and one column. So call it at a boundary, + * and reset ka. + */ + ginInsertBAEntries(accum, &heapptr, attrnum, + ka->keys, ka->categories, ka->nvalues); + ka->nvalues = 0; + heapptr = itup->t_tid; + attrnum = curattnum; + } + + /* Add key to KeyArray */ + curkey = gintuple_get_key(accum->ginstate, itup, &curcategory); + addDatum(ka, curkey, curcategory); + } + + /* Dump out all remaining keys */ + ginInsertBAEntries(accum, &heapptr, attrnum, + ka->keys, ka->categories, ka->nvalues); +} + +/* + * Move tuples from pending pages into regular GIN structure. + * + * On first glance it looks completely not crash-safe. But if we crash + * after posting entries to the main index and before removing them from the + * pending list, it's okay because when we redo the posting later on, nothing + * bad will happen. + * + * fill_fsm indicates that ginInsertCleanup should add deleted pages + * to FSM otherwise caller is responsible to put deleted pages into + * FSM. + * + * If stats isn't null, we count deleted pending pages into the counts. + */ +void +ginInsertCleanup(GinState *ginstate, bool full_clean, + bool fill_fsm, bool forceCleanup, + IndexBulkDeleteResult *stats) +{ + Relation index = ginstate->index; + Buffer metabuffer, + buffer; + Page metapage, + page; + GinMetaPageData *metadata; + MemoryContext opCtx, + oldCtx; + BuildAccumulator accum; + KeyArray datums; + BlockNumber blkno, + blknoFinish; + bool cleanupFinish = false; + bool fsm_vac = false; + Size workMemory; + + /* + * We would like to prevent concurrent cleanup process. For that we will + * lock metapage in exclusive mode using LockPage() call. Nobody other + * will use that lock for metapage, so we keep possibility of concurrent + * insertion into pending list + */ + + if (forceCleanup) + { + /* + * We are called from [auto]vacuum/analyze or gin_clean_pending_list() + * and we would like to wait concurrent cleanup to finish. + */ + LockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock); + workMemory = + (IsAutoVacuumWorkerProcess() && autovacuum_work_mem != -1) ? + autovacuum_work_mem : maintenance_work_mem; + } + else + { + /* + * We are called from regular insert and if we see concurrent cleanup + * just exit in hope that concurrent process will clean up pending + * list. 
+ */
+ if (!ConditionalLockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock))
+ return;
+ workMemory = work_mem;
+ }
+
+ metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO);
+ LockBuffer(metabuffer, GIN_SHARE);
+ metapage = BufferGetPage(metabuffer);
+ metadata = GinPageGetMeta(metapage);
+
+ if (metadata->head == InvalidBlockNumber)
+ {
+ /* Nothing to do */
+ UnlockReleaseBuffer(metabuffer);
+ UnlockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock);
+ return;
+ }
+
+ /*
+ * Remember the tail page, to prevent infinite cleanup if other backends
+ * add new tuples faster than we can clean them up.
+ */
+ blknoFinish = metadata->tail;
+
+ /*
+ * Read and lock head of pending list
+ */
+ blkno = metadata->head;
+ buffer = ReadBuffer(index, blkno);
+ LockBuffer(buffer, GIN_SHARE);
+ page = BufferGetPage(buffer);
+
+ LockBuffer(metabuffer, GIN_UNLOCK);
+
+ /*
+ * Initialize. All temporary space will be in opCtx
+ */
+ opCtx = AllocSetContextCreate(CurrentMemoryContext,
+ "GIN insert cleanup temporary context",
+ ALLOCSET_DEFAULT_SIZES);
+
+ oldCtx = MemoryContextSwitchTo(opCtx);
+
+ initKeyArray(&datums, 128);
+ ginInitBA(&accum);
+ accum.ginstate = ginstate;
+
+ /*
+ * At the top of this loop, we have pin and lock on the current page of
+ * the pending list. However, we'll release that before exiting the loop.
+ * Note we also have pin but not lock on the metapage.
+ */
+ for (;;)
+ {
+ Assert(!GinPageIsDeleted(page));
+
+ /*
+ * Have we reached the page that was the tail of the pending list when
+ * we started our cleanup? If the caller asked us to clean up the
+ * whole pending list, ignore that old tail; we will keep working
+ * until the list becomes empty.
+ */
+ if (blkno == blknoFinish && full_clean == false)
+ cleanupFinish = true;
+
+ /*
+ * Read the page's datums into accum
+ */
+ processPendingPage(&accum, &datums, page, FirstOffsetNumber);
+
+ vacuum_delay_point();
+
+ /*
+ * Is it time to flush memory to disk? Flush if we are at the end of
+ * the pending list, or if we have a full row and memory is getting
+ * full.
+ */
+ if (GinPageGetOpaque(page)->rightlink == InvalidBlockNumber ||
+ (GinPageHasFullRow(page) &&
+ (accum.allocatedMemory >= workMemory * 1024L)))
+ {
+ ItemPointerData *list;
+ uint32 nlist;
+ Datum key;
+ GinNullCategory category;
+ OffsetNumber maxoff,
+ attnum;
+
+ /*
+ * Unlock the current page to increase performance. Any changes to
+ * the page will be detected later by comparing maxoff after the
+ * memory flush completes.
+ */
+ maxoff = PageGetMaxOffsetNumber(page);
+ LockBuffer(buffer, GIN_UNLOCK);
+
+ /*
+ * Moving the collected data into the regular structure can take a
+ * significant amount of time, so do it without holding a lock on
+ * the pending list.
+ */
+ ginBeginBAScan(&accum);
+ while ((list = ginGetBAEntry(&accum,
+ &attnum, &key, &category, &nlist)) != NULL)
+ {
+ ginEntryInsert(ginstate, attnum, key, category,
+ list, nlist, NULL);
+ vacuum_delay_point();
+ }
+
+ /*
+ * Lock the whole list to remove pages
+ */
+ LockBuffer(metabuffer, GIN_EXCLUSIVE);
+ LockBuffer(buffer, GIN_SHARE);
+
+ Assert(!GinPageIsDeleted(page));
+
+ /*
+ * While we left the page unlocked, more stuff might have gotten
+ * added to it. If so, process those entries immediately. There
+ * shouldn't be very many, so we don't worry about the fact that
+ * we're doing this with exclusive lock. The insertion algorithm
+ * guarantees that inserted row(s) will not continue onto the next
+ * page. NOTE: intentionally no vacuum_delay_point in this loop.
+ */ + if (PageGetMaxOffsetNumber(page) != maxoff) + { + ginInitBA(&accum); + processPendingPage(&accum, &datums, page, maxoff + 1); + + ginBeginBAScan(&accum); + while ((list = ginGetBAEntry(&accum, + &attnum, &key, &category, &nlist)) != NULL) + ginEntryInsert(ginstate, attnum, key, category, + list, nlist, NULL); + } + + /* + * Remember next page - it will become the new list head + */ + blkno = GinPageGetOpaque(page)->rightlink; + UnlockReleaseBuffer(buffer); /* shiftList will do exclusive + * locking */ + + /* + * remove read pages from pending list, at this point all content + * of read pages is in regular structure + */ + shiftList(index, metabuffer, blkno, fill_fsm, stats); + + /* At this point, some pending pages have been freed up */ + fsm_vac = true; + + Assert(blkno == metadata->head); + LockBuffer(metabuffer, GIN_UNLOCK); + + /* + * if we removed the whole pending list or we cleanup tail (which + * we remembered on start our cleanup process) then just exit + */ + if (blkno == InvalidBlockNumber || cleanupFinish) + break; + + /* + * release memory used so far and reinit state + */ + MemoryContextReset(opCtx); + initKeyArray(&datums, datums.maxvalues); + ginInitBA(&accum); + } + else + { + blkno = GinPageGetOpaque(page)->rightlink; + UnlockReleaseBuffer(buffer); + } + + /* + * Read next page in pending list + */ + vacuum_delay_point(); + buffer = ReadBuffer(index, blkno); + LockBuffer(buffer, GIN_SHARE); + page = BufferGetPage(buffer); + } + + UnlockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock); + ReleaseBuffer(metabuffer); + + /* + * As pending list pages can have a high churn rate, it is desirable to + * recycle them immediately to the FreeSpaceMap when ordinary backends + * clean the list. + */ + if (fsm_vac && fill_fsm) + IndexFreeSpaceMapVacuum(index); + + /* Clean up temporary space */ + MemoryContextSwitchTo(oldCtx); + MemoryContextDelete(opCtx); +} + +/* + * SQL-callable function to clean the insert pending list + */ +Datum +gin_clean_pending_list(PG_FUNCTION_ARGS) +{ + Oid indexoid = PG_GETARG_OID(0); + Relation indexRel = index_open(indexoid, RowExclusiveLock); + IndexBulkDeleteResult stats; + GinState ginstate; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("GIN pending list cannot be cleaned up during recovery."))); + + /* Must be a GIN index */ + if (indexRel->rd_rel->relkind != RELKIND_INDEX || + indexRel->rd_rel->relam != GIN_AM_OID) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is not a GIN index", + RelationGetRelationName(indexRel)))); + + /* + * Reject attempts to read non-local temporary relations; we would be + * likely to get wrong data since we have no visibility into the owning + * session's local buffers. 
+ */ + if (RELATION_IS_OTHER_TEMP(indexRel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary indexes of other sessions"))); + + /* User must own the index (comparable to privileges needed for VACUUM) */ + if (!pg_class_ownercheck(indexoid, GetUserId())) + aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX, + RelationGetRelationName(indexRel)); + + memset(&stats, 0, sizeof(stats)); + initGinState(&ginstate, indexRel); + ginInsertCleanup(&ginstate, true, true, true, &stats); + + index_close(indexRel, RowExclusiveLock); + + PG_RETURN_INT64((int64) stats.pages_deleted); +} diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c new file mode 100644 index 0000000..03191e0 --- /dev/null +++ b/src/backend/access/gin/ginget.c @@ -0,0 +1,1970 @@ +/*------------------------------------------------------------------------- + * + * ginget.c + * fetch tuples from a GIN scan. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/ginget.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/gin_private.h" +#include "access/relscan.h" +#include "miscadmin.h" +#include "storage/predicate.h" +#include "utils/datum.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +/* GUC parameter */ +int GinFuzzySearchLimit = 0; + +typedef struct pendingPosition +{ + Buffer pendingBuffer; + OffsetNumber firstOffset; + OffsetNumber lastOffset; + ItemPointerData item; + bool *hasMatchKey; +} pendingPosition; + + +/* + * Goes to the next page if current offset is outside of bounds + */ +static bool +moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack, Snapshot snapshot) +{ + Page page = BufferGetPage(stack->buffer); + + if (stack->off > PageGetMaxOffsetNumber(page)) + { + /* + * We scanned the whole page, so we should take right page + */ + if (GinPageRightMost(page)) + return false; /* no more pages */ + + stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE); + stack->blkno = BufferGetBlockNumber(stack->buffer); + stack->off = FirstOffsetNumber; + PredicateLockPage(btree->index, stack->blkno, snapshot); + } + + return true; +} + +/* + * Scan all pages of a posting tree and save all its heap ItemPointers + * in scanEntry->matchBitmap + */ +static void +scanPostingTree(Relation index, GinScanEntry scanEntry, + BlockNumber rootPostingTree, Snapshot snapshot) +{ + GinBtreeData btree; + GinBtreeStack *stack; + Buffer buffer; + Page page; + + /* Descend to the leftmost leaf page */ + stack = ginScanBeginPostingTree(&btree, index, rootPostingTree, snapshot); + buffer = stack->buffer; + + IncrBufferRefCount(buffer); /* prevent unpin in freeGinBtreeStack */ + + freeGinBtreeStack(stack); + + /* + * Loop iterates through all leaf pages of posting tree + */ + for (;;) + { + page = BufferGetPage(buffer); + if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0) + { + int n = GinDataLeafPageGetItemsToTbm(page, scanEntry->matchBitmap); + + scanEntry->predictNumberResult += n; + } + + if (GinPageRightMost(page)) + break; /* no more pages */ + + buffer = ginStepRight(buffer, index, GIN_SHARE); + } + + UnlockReleaseBuffer(buffer); +} + +/* + * Collects TIDs into scanEntry->matchBitmap for all heap tuples that + * match the search entry. This supports three different match modes: + * + * 1. 
Partial-match support: scan from current point until the + * comparePartialFn says we're done. + * 2. SEARCH_MODE_ALL: scan from current point (which should be first + * key for the current attnum) until we hit null items or end of attnum + * 3. SEARCH_MODE_EVERYTHING: scan from current point (which should be first + * key for the current attnum) until we hit end of attnum + * + * Returns true if done, false if it's necessary to restart scan from scratch + */ +static bool +collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack, + GinScanEntry scanEntry, Snapshot snapshot) +{ + OffsetNumber attnum; + Form_pg_attribute attr; + + /* Initialize empty bitmap result */ + scanEntry->matchBitmap = tbm_create(work_mem * 1024L, NULL); + + /* Null query cannot partial-match anything */ + if (scanEntry->isPartialMatch && + scanEntry->queryCategory != GIN_CAT_NORM_KEY) + return true; + + /* Locate tupdesc entry for key column (for attbyval/attlen data) */ + attnum = scanEntry->attnum; + attr = TupleDescAttr(btree->ginstate->origTupdesc, attnum - 1); + + /* + * Predicate lock entry leaf page, following pages will be locked by + * moveRightIfItNeeded() + */ + PredicateLockPage(btree->index, stack->buffer, snapshot); + + for (;;) + { + Page page; + IndexTuple itup; + Datum idatum; + GinNullCategory icategory; + + /* + * stack->off points to the interested entry, buffer is already locked + */ + if (moveRightIfItNeeded(btree, stack, snapshot) == false) + return true; + + page = BufferGetPage(stack->buffer); + TestForOldSnapshot(snapshot, btree->index, page); + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off)); + + /* + * If tuple stores another attribute then stop scan + */ + if (gintuple_get_attrnum(btree->ginstate, itup) != attnum) + return true; + + /* Safe to fetch attribute value */ + idatum = gintuple_get_key(btree->ginstate, itup, &icategory); + + /* + * Check for appropriate scan stop conditions + */ + if (scanEntry->isPartialMatch) + { + int32 cmp; + + /* + * In partial match, stop scan at any null (including + * placeholders); partial matches never match nulls + */ + if (icategory != GIN_CAT_NORM_KEY) + return true; + + /*---------- + * Check of partial match. + * case cmp == 0 => match + * case cmp > 0 => not match and finish scan + * case cmp < 0 => not match and continue scan + *---------- + */ + cmp = DatumGetInt32(FunctionCall4Coll(&btree->ginstate->comparePartialFn[attnum - 1], + btree->ginstate->supportCollation[attnum - 1], + scanEntry->queryKey, + idatum, + UInt16GetDatum(scanEntry->strategy), + PointerGetDatum(scanEntry->extra_data))); + + if (cmp > 0) + return true; + else if (cmp < 0) + { + stack->off++; + continue; + } + } + else if (scanEntry->searchMode == GIN_SEARCH_MODE_ALL) + { + /* + * In ALL mode, we are not interested in null items, so we can + * stop if we get to a null-item placeholder (which will be the + * last entry for a given attnum). We do want to include NULL_KEY + * and EMPTY_ITEM entries, though. + */ + if (icategory == GIN_CAT_NULL_ITEM) + return true; + } + + /* + * OK, we want to return the TIDs listed in this entry. + */ + if (GinIsPostingTree(itup)) + { + BlockNumber rootPostingTree = GinGetPostingTree(itup); + + /* + * We should unlock current page (but not unpin) during tree scan + * to prevent deadlock with vacuum processes. 
+ *
+ * We save the current entry value (idatum) so that we can re-find
+ * our tuple after re-locking.
+ */
+ if (icategory == GIN_CAT_NORM_KEY)
+ idatum = datumCopy(idatum, attr->attbyval, attr->attlen);
+
+ LockBuffer(stack->buffer, GIN_UNLOCK);
+
+ /*
+ * Acquire predicate lock on the posting tree. We already hold a
+ * lock on the entry page, but insertions to the posting tree
+ * don't check for conflicts on that level.
+ */
+ PredicateLockPage(btree->index, rootPostingTree, snapshot);
+
+ /* Collect all the TIDs in this entry's posting tree */
+ scanPostingTree(btree->index, scanEntry, rootPostingTree,
+ snapshot);
+
+ /*
+ * We lock the entry page again. Insertions might have occurred
+ * while it was unlocked, so we need to re-find our position.
+ */
+ LockBuffer(stack->buffer, GIN_SHARE);
+ page = BufferGetPage(stack->buffer);
+ if (!GinPageIsLeaf(page))
+ {
+ /*
+ * The root page became non-leaf while it was unlocked. We will
+ * start over; this situation doesn't occur often, since the
+ * root can become a non-leaf only once in the life of the
+ * index.
+ */
+ return false;
+ }
+
+ /* Search forward to re-find idatum */
+ for (;;)
+ {
+ if (moveRightIfItNeeded(btree, stack, snapshot) == false)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to re-find tuple within index \"%s\"",
+ RelationGetRelationName(btree->index))));
+
+ page = BufferGetPage(stack->buffer);
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
+
+ if (gintuple_get_attrnum(btree->ginstate, itup) == attnum)
+ {
+ Datum newDatum;
+ GinNullCategory newCategory;
+
+ newDatum = gintuple_get_key(btree->ginstate, itup,
+ &newCategory);
+
+ if (ginCompareEntries(btree->ginstate, attnum,
+ newDatum, newCategory,
+ idatum, icategory) == 0)
+ break; /* Found! */
+ }
+
+ stack->off++;
+ }
+
+ if (icategory == GIN_CAT_NORM_KEY && !attr->attbyval)
+ pfree(DatumGetPointer(idatum));
+ }
+ else
+ {
+ ItemPointer ipd;
+ int nipd;
+
+ ipd = ginReadTuple(btree->ginstate, scanEntry->attnum, itup, &nipd);
+ tbm_add_tuples(scanEntry->matchBitmap, ipd, nipd, false);
+ scanEntry->predictNumberResult += GinGetNPosting(itup);
+ pfree(ipd);
+ }
+
+ /*
+ * Done with this entry, go to the next
+ */
+ stack->off++;
+ }
+}
+
+/*
+ * Start* functions set up the beginning state of a search: they find the
+ * correct buffer and pin it.
+ */
+static void
+startScanEntry(GinState *ginstate, GinScanEntry entry, Snapshot snapshot)
+{
+ GinBtreeData btreeEntry;
+ GinBtreeStack *stackEntry;
+ Page page;
+ bool needUnlock;
+
+restartScanEntry:
+ entry->buffer = InvalidBuffer;
+ ItemPointerSetMin(&entry->curItem);
+ entry->offset = InvalidOffsetNumber;
+ if (entry->list)
+ pfree(entry->list);
+ entry->list = NULL;
+ entry->nlist = 0;
+ entry->matchBitmap = NULL;
+ entry->matchResult = NULL;
+ entry->reduceResult = false;
+ entry->predictNumberResult = 0;
+
+ /*
+ * We should find the entry, and then either begin a scan of its posting
+ * tree or just store its posting list in memory.
+ */
+ ginPrepareEntryScan(&btreeEntry, entry->attnum,
+ entry->queryKey, entry->queryCategory,
+ ginstate);
+ stackEntry = ginFindLeafPage(&btreeEntry, true, false, snapshot);
+ page = BufferGetPage(stackEntry->buffer);
+
+ /* ginFindLeafPage() will have already checked snapshot age. */
+ needUnlock = true;
+
+ entry->isFinished = true;
+
+ if (entry->isPartialMatch ||
+ entry->queryCategory == GIN_CAT_EMPTY_QUERY)
+ {
+ /*
+ * btreeEntry.findItem locates the first item >= given search key.
+ * (For GIN_CAT_EMPTY_QUERY, it will find the leftmost index item + * because of the way the GIN_CAT_EMPTY_QUERY category code is + * assigned.) We scan forward from there and collect all TIDs needed + * for the entry type. + */ + btreeEntry.findItem(&btreeEntry, stackEntry); + if (collectMatchBitmap(&btreeEntry, stackEntry, entry, snapshot) + == false) + { + /* + * GIN tree was seriously restructured, so we will cleanup all + * found data and rescan. See comments near 'return false' in + * collectMatchBitmap() + */ + if (entry->matchBitmap) + { + if (entry->matchIterator) + tbm_end_iterate(entry->matchIterator); + entry->matchIterator = NULL; + tbm_free(entry->matchBitmap); + entry->matchBitmap = NULL; + } + LockBuffer(stackEntry->buffer, GIN_UNLOCK); + freeGinBtreeStack(stackEntry); + goto restartScanEntry; + } + + if (entry->matchBitmap && !tbm_is_empty(entry->matchBitmap)) + { + entry->matchIterator = tbm_begin_iterate(entry->matchBitmap); + entry->isFinished = false; + } + } + else if (btreeEntry.findItem(&btreeEntry, stackEntry)) + { + IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off)); + + if (GinIsPostingTree(itup)) + { + BlockNumber rootPostingTree = GinGetPostingTree(itup); + GinBtreeStack *stack; + Page page; + ItemPointerData minItem; + + /* + * This is an equality scan, so lock the root of the posting tree. + * It represents a lock on the exact key value, and covers all the + * items in the posting tree. + */ + PredicateLockPage(ginstate->index, rootPostingTree, snapshot); + + /* + * We should unlock entry page before touching posting tree to + * prevent deadlocks with vacuum processes. Because entry is never + * deleted from page and posting tree is never reduced to the + * posting list, we can unlock page after getting BlockNumber of + * root of posting tree. + */ + LockBuffer(stackEntry->buffer, GIN_UNLOCK); + needUnlock = false; + + stack = ginScanBeginPostingTree(&entry->btree, ginstate->index, + rootPostingTree, snapshot); + entry->buffer = stack->buffer; + + /* + * We keep buffer pinned because we need to prevent deletion of + * page during scan. See GIN's vacuum implementation. RefCount is + * increased to keep buffer pinned after freeGinBtreeStack() call. + */ + IncrBufferRefCount(entry->buffer); + + page = BufferGetPage(entry->buffer); + + /* + * Load the first page into memory. + */ + ItemPointerSetMin(&minItem); + entry->list = GinDataLeafPageGetItems(page, &entry->nlist, minItem); + + entry->predictNumberResult = stack->predictNumber * entry->nlist; + + LockBuffer(entry->buffer, GIN_UNLOCK); + freeGinBtreeStack(stack); + entry->isFinished = false; + } + else + { + /* + * Lock the entry leaf page. This is more coarse-grained than + * necessary, because it will conflict with any insertions that + * land on the same leaf page, not only the exact key we searched + * for. But locking an individual tuple would require updating + * that lock whenever it moves because of insertions or vacuums, + * which seems too complicated. + */ + PredicateLockPage(ginstate->index, + BufferGetBlockNumber(stackEntry->buffer), + snapshot); + if (GinGetNPosting(itup) > 0) + { + entry->list = ginReadTuple(ginstate, entry->attnum, itup, + &entry->nlist); + entry->predictNumberResult = entry->nlist; + + entry->isFinished = false; + } + } + } + else + { + /* + * No entry found. Predicate lock the leaf page, to lock the place + * where the entry would've been, had there been one. 
+ */ + PredicateLockPage(ginstate->index, + BufferGetBlockNumber(stackEntry->buffer), snapshot); + } + + if (needUnlock) + LockBuffer(stackEntry->buffer, GIN_UNLOCK); + freeGinBtreeStack(stackEntry); +} + +/* + * Comparison function for scan entry indexes. Sorts by predictNumberResult, + * least frequent items first. + */ +static int +entryIndexByFrequencyCmp(const void *a1, const void *a2, void *arg) +{ + const GinScanKey key = (const GinScanKey) arg; + int i1 = *(const int *) a1; + int i2 = *(const int *) a2; + uint32 n1 = key->scanEntry[i1]->predictNumberResult; + uint32 n2 = key->scanEntry[i2]->predictNumberResult; + + if (n1 < n2) + return -1; + else if (n1 == n2) + return 0; + else + return 1; +} + +static void +startScanKey(GinState *ginstate, GinScanOpaque so, GinScanKey key) +{ + MemoryContext oldCtx = CurrentMemoryContext; + int i; + int j; + int *entryIndexes; + + ItemPointerSetMin(&key->curItem); + key->curItemMatches = false; + key->recheckCurItem = false; + key->isFinished = false; + + /* + * Divide the entries into two distinct sets: required and additional. + * Additional entries are not enough for a match alone, without any items + * from the required set, but are needed by the consistent function to + * decide if an item matches. When scanning, we can skip over items from + * additional entries that have no corresponding matches in any of the + * required entries. That speeds up queries like "frequent & rare" + * considerably, if the frequent term can be put in the additional set. + * + * There can be many legal ways to divide them entries into these two + * sets. A conservative division is to just put everything in the required + * set, but the more you can put in the additional set, the more you can + * skip during the scan. To maximize skipping, we try to put as many + * frequent items as possible into additional, and less frequent ones into + * required. To do that, sort the entries by frequency + * (predictNumberResult), and put entries into the required set in that + * order, until the consistent function says that none of the remaining + * entries can form a match, without any items from the required set. The + * rest go to the additional set. + * + * Exclude-only scan keys are known to have no required entries. + */ + if (key->excludeOnly) + { + MemoryContextSwitchTo(so->keyCtx); + + key->nrequired = 0; + key->nadditional = key->nentries; + key->additionalEntries = palloc(key->nadditional * sizeof(GinScanEntry)); + for (i = 0; i < key->nadditional; i++) + key->additionalEntries[i] = key->scanEntry[i]; + } + else if (key->nentries > 1) + { + MemoryContextSwitchTo(so->tempCtx); + + entryIndexes = (int *) palloc(sizeof(int) * key->nentries); + for (i = 0; i < key->nentries; i++) + entryIndexes[i] = i; + qsort_arg(entryIndexes, key->nentries, sizeof(int), + entryIndexByFrequencyCmp, key); + + for (i = 0; i < key->nentries - 1; i++) + { + /* Pass all entries <= i as FALSE, and the rest as MAYBE */ + for (j = 0; j <= i; j++) + key->entryRes[entryIndexes[j]] = GIN_FALSE; + for (j = i + 1; j < key->nentries; j++) + key->entryRes[entryIndexes[j]] = GIN_MAYBE; + + if (key->triConsistentFn(key) == GIN_FALSE) + break; + } + /* i is now the last required entry. 
*/
+
+ MemoryContextSwitchTo(so->keyCtx);
+
+ key->nrequired = i + 1;
+ key->nadditional = key->nentries - key->nrequired;
+ key->requiredEntries = palloc(key->nrequired * sizeof(GinScanEntry));
+ key->additionalEntries = palloc(key->nadditional * sizeof(GinScanEntry));
+
+ j = 0;
+ for (i = 0; i < key->nrequired; i++)
+ key->requiredEntries[i] = key->scanEntry[entryIndexes[j++]];
+ for (i = 0; i < key->nadditional; i++)
+ key->additionalEntries[i] = key->scanEntry[entryIndexes[j++]];
+
+ /* clean up after consistentFn calls (also frees entryIndexes) */
+ MemoryContextReset(so->tempCtx);
+ }
+ else
+ {
+ MemoryContextSwitchTo(so->keyCtx);
+
+ key->nrequired = 1;
+ key->nadditional = 0;
+ key->requiredEntries = palloc(1 * sizeof(GinScanEntry));
+ key->requiredEntries[0] = key->scanEntry[0];
+ }
+ MemoryContextSwitchTo(oldCtx);
+}
+
+static void
+startScan(IndexScanDesc scan)
+{
+ GinScanOpaque so = (GinScanOpaque) scan->opaque;
+ GinState *ginstate = &so->ginstate;
+ uint32 i;
+
+ for (i = 0; i < so->totalentries; i++)
+ startScanEntry(ginstate, so->entries[i], scan->xs_snapshot);
+
+ if (GinFuzzySearchLimit > 0)
+ {
+ /*
+ * If every entry's predicted number of results exceeds the threshold,
+ * try to reduce the result set. We hope (and it is only a hope; for
+ * the intersection operation on arrays this supposition need not
+ * hold) that the total result will not exceed the minimal
+ * predictNumberResult.
+ */
+ bool reduce = true;
+
+ for (i = 0; i < so->totalentries; i++)
+ {
+ if (so->entries[i]->predictNumberResult <= so->totalentries * GinFuzzySearchLimit)
+ {
+ reduce = false;
+ break;
+ }
+ }
+ if (reduce)
+ {
+ for (i = 0; i < so->totalentries; i++)
+ {
+ so->entries[i]->predictNumberResult /= so->totalentries;
+ so->entries[i]->reduceResult = true;
+ }
+ }
+ }
+
+ /*
+ * Now that we have the estimates for the entry frequencies, finish
+ * initializing the scan keys.
+ */
+ for (i = 0; i < so->nkeys; i++)
+ startScanKey(ginstate, so, so->keys + i);
+}
+
+/*
+ * Load the next batch of item pointers from a posting tree.
+ *
+ * Note that we copy the page into GinScanEntry->list array and unlock it, but
+ * keep it pinned to prevent interference with vacuum.
+ */
+static void
+entryLoadMoreItems(GinState *ginstate, GinScanEntry entry,
+ ItemPointerData advancePast, Snapshot snapshot)
+{
+ Page page;
+ int i;
+ bool stepright;
+
+ if (!BufferIsValid(entry->buffer))
+ {
+ entry->isFinished = true;
+ return;
+ }
+
+ /*
+ * We have two strategies for finding the correct page: step right from
+ * the current page, or descend the tree again from the root. If
+ * advancePast equals the current item, the next matching item should be
+ * on the next page, so we step right. Otherwise, descend from root.
+ */
+ if (ginCompareItemPointers(&entry->curItem, &advancePast) == 0)
+ {
+ stepright = true;
+ LockBuffer(entry->buffer, GIN_SHARE);
+ }
+ else
+ {
+ GinBtreeStack *stack;
+
+ ReleaseBuffer(entry->buffer);
+
+ /*
+ * Set the search key, and find the correct leaf page.
+ */
+ if (ItemPointerIsLossyPage(&advancePast))
+ {
+ ItemPointerSet(&entry->btree.itemptr,
+ GinItemPointerGetBlockNumber(&advancePast) + 1,
+ FirstOffsetNumber);
+ }
+ else
+ {
+ ItemPointerSet(&entry->btree.itemptr,
+ GinItemPointerGetBlockNumber(&advancePast),
+ OffsetNumberNext(GinItemPointerGetOffsetNumber(&advancePast)));
+ }
+ entry->btree.fullScan = false;
+ stack = ginFindLeafPage(&entry->btree, true, false, snapshot);
+
+ /* we don't need the stack, just the buffer.
*/ + entry->buffer = stack->buffer; + IncrBufferRefCount(entry->buffer); + freeGinBtreeStack(stack); + stepright = false; + } + + elog(DEBUG2, "entryLoadMoreItems, %u/%u, skip: %d", + GinItemPointerGetBlockNumber(&advancePast), + GinItemPointerGetOffsetNumber(&advancePast), + !stepright); + + page = BufferGetPage(entry->buffer); + for (;;) + { + entry->offset = InvalidOffsetNumber; + if (entry->list) + { + pfree(entry->list); + entry->list = NULL; + entry->nlist = 0; + } + + if (stepright) + { + /* + * We've processed all the entries on this page. If it was the + * last page in the tree, we're done. + */ + if (GinPageRightMost(page)) + { + UnlockReleaseBuffer(entry->buffer); + entry->buffer = InvalidBuffer; + entry->isFinished = true; + return; + } + + /* + * Step to next page, following the right link. then find the + * first ItemPointer greater than advancePast. + */ + entry->buffer = ginStepRight(entry->buffer, + ginstate->index, + GIN_SHARE); + page = BufferGetPage(entry->buffer); + } + stepright = true; + + if (GinPageGetOpaque(page)->flags & GIN_DELETED) + continue; /* page was deleted by concurrent vacuum */ + + /* + * The first item > advancePast might not be on this page, but + * somewhere to the right, if the page was split, or a non-match from + * another key in the query allowed us to skip some items from this + * entry. Keep following the right-links until we re-find the correct + * page. + */ + if (!GinPageRightMost(page) && + ginCompareItemPointers(&advancePast, GinDataPageGetRightBound(page)) >= 0) + { + /* + * the item we're looking is > the right bound of the page, so it + * can't be on this page. + */ + continue; + } + + entry->list = GinDataLeafPageGetItems(page, &entry->nlist, advancePast); + + for (i = 0; i < entry->nlist; i++) + { + if (ginCompareItemPointers(&advancePast, &entry->list[i]) < 0) + { + entry->offset = i; + + if (GinPageRightMost(page)) + { + /* after processing the copied items, we're done. */ + UnlockReleaseBuffer(entry->buffer); + entry->buffer = InvalidBuffer; + } + else + LockBuffer(entry->buffer, GIN_UNLOCK); + return; + } + } + } +} + +#define gin_rand() (((double) random()) / ((double) MAX_RANDOM_VALUE)) +#define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) ) + +/* + * Sets entry->curItem to next heap item pointer > advancePast, for one entry + * of one scan key, or sets entry->isFinished to true if there are no more. + * + * Item pointers are returned in ascending order. + * + * Note: this can return a "lossy page" item pointer, indicating that the + * entry potentially matches all items on that heap page. However, it is + * not allowed to return both a lossy page pointer and exact (regular) + * item pointers for the same page. (Doing so would break the key-combination + * logic in keyGetItem and scanGetItem; see comment in scanGetItem.) In the + * current implementation this is guaranteed by the behavior of tidbitmaps. + */ +static void +entryGetItem(GinState *ginstate, GinScanEntry entry, + ItemPointerData advancePast, Snapshot snapshot) +{ + Assert(!entry->isFinished); + + Assert(!ItemPointerIsValid(&entry->curItem) || + ginCompareItemPointers(&entry->curItem, &advancePast) <= 0); + + if (entry->matchBitmap) + { + /* A bitmap result */ + BlockNumber advancePastBlk = GinItemPointerGetBlockNumber(&advancePast); + OffsetNumber advancePastOff = GinItemPointerGetOffsetNumber(&advancePast); + + for (;;) + { + /* + * If we've exhausted all items on this block, move to next block + * in the bitmap. 
+ */ + while (entry->matchResult == NULL || + (entry->matchResult->ntuples >= 0 && + entry->offset >= entry->matchResult->ntuples) || + entry->matchResult->blockno < advancePastBlk || + (ItemPointerIsLossyPage(&advancePast) && + entry->matchResult->blockno == advancePastBlk)) + { + entry->matchResult = tbm_iterate(entry->matchIterator); + + if (entry->matchResult == NULL) + { + ItemPointerSetInvalid(&entry->curItem); + tbm_end_iterate(entry->matchIterator); + entry->matchIterator = NULL; + entry->isFinished = true; + break; + } + + /* + * Reset counter to the beginning of entry->matchResult. Note: + * entry->offset is still greater than matchResult->ntuples if + * matchResult is lossy. So, on next call we will get next + * result from TIDBitmap. + */ + entry->offset = 0; + } + if (entry->isFinished) + break; + + /* + * We're now on the first page after advancePast which has any + * items on it. If it's a lossy result, return that. + */ + if (entry->matchResult->ntuples < 0) + { + ItemPointerSetLossyPage(&entry->curItem, + entry->matchResult->blockno); + + /* + * We might as well fall out of the loop; we could not + * estimate number of results on this page to support correct + * reducing of result even if it's enabled. + */ + break; + } + + /* + * Not a lossy page. Skip over any offsets <= advancePast, and + * return that. + */ + if (entry->matchResult->blockno == advancePastBlk) + { + /* + * First, do a quick check against the last offset on the + * page. If that's > advancePast, so are all the other + * offsets, so just go back to the top to get the next page. + */ + if (entry->matchResult->offsets[entry->matchResult->ntuples - 1] <= advancePastOff) + { + entry->offset = entry->matchResult->ntuples; + continue; + } + + /* Otherwise scan to find the first item > advancePast */ + while (entry->matchResult->offsets[entry->offset] <= advancePastOff) + entry->offset++; + } + + ItemPointerSet(&entry->curItem, + entry->matchResult->blockno, + entry->matchResult->offsets[entry->offset]); + entry->offset++; + + /* Done unless we need to reduce the result */ + if (!entry->reduceResult || !dropItem(entry)) + break; + } + } + else if (!BufferIsValid(entry->buffer)) + { + /* + * A posting list from an entry tuple, or the last page of a posting + * tree. 
+ */ + for (;;) + { + if (entry->offset >= entry->nlist) + { + ItemPointerSetInvalid(&entry->curItem); + entry->isFinished = true; + break; + } + + entry->curItem = entry->list[entry->offset++]; + + /* If we're not past advancePast, keep scanning */ + if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0) + continue; + + /* Done unless we need to reduce the result */ + if (!entry->reduceResult || !dropItem(entry)) + break; + } + } + else + { + /* A posting tree */ + for (;;) + { + /* If we've processed the current batch, load more items */ + while (entry->offset >= entry->nlist) + { + entryLoadMoreItems(ginstate, entry, advancePast, snapshot); + + if (entry->isFinished) + { + ItemPointerSetInvalid(&entry->curItem); + return; + } + } + + entry->curItem = entry->list[entry->offset++]; + + /* If we're not past advancePast, keep scanning */ + if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0) + continue; + + /* Done unless we need to reduce the result */ + if (!entry->reduceResult || !dropItem(entry)) + break; + + /* + * Advance advancePast (so that entryLoadMoreItems will load the + * right data), and keep scanning + */ + advancePast = entry->curItem; + } + } +} + +/* + * Identify the "current" item among the input entry streams for this scan key + * that is greater than advancePast, and test whether it passes the scan key + * qual condition. + * + * The current item is the smallest curItem among the inputs. key->curItem + * is set to that value. key->curItemMatches is set to indicate whether that + * TID passes the consistentFn test. If so, key->recheckCurItem is set true + * iff recheck is needed for this item pointer (including the case where the + * item pointer is a lossy page pointer). + * + * If all entry streams are exhausted, sets key->isFinished to true. + * + * Item pointers must be returned in ascending order. + * + * Note: this can return a "lossy page" item pointer, indicating that the + * key potentially matches all items on that heap page. However, it is + * not allowed to return both a lossy page pointer and exact (regular) + * item pointers for the same page. (Doing so would break the key-combination + * logic in scanGetItem.) + */ +static void +keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key, + ItemPointerData advancePast, Snapshot snapshot) +{ + ItemPointerData minItem; + ItemPointerData curPageLossy; + uint32 i; + bool haveLossyEntry; + GinScanEntry entry; + GinTernaryValue res; + MemoryContext oldCtx; + bool allFinished; + + Assert(!key->isFinished); + + /* + * We might have already tested this item; if so, no need to repeat work. + * (Note: the ">" case can happen, if advancePast is exact but we + * previously had to set curItem to a lossy-page pointer.) + */ + if (ginCompareItemPointers(&key->curItem, &advancePast) > 0) + return; + + /* + * Find the minimum item > advancePast among the active entry streams. + * + * Note: a lossy-page entry is encoded by a ItemPointer with max value for + * offset (0xffff), so that it will sort after any exact entries for the + * same page. So we'll prefer to return exact pointers not lossy + * pointers, which is good. + */ + ItemPointerSetMax(&minItem); + allFinished = true; + for (i = 0; i < key->nrequired; i++) + { + entry = key->requiredEntries[i]; + + if (entry->isFinished) + continue; + + /* + * Advance this stream if necessary. + * + * In particular, since entry->curItem was initialized with + * ItemPointerSetMin, this ensures we fetch the first item for each + * entry on the first call. 
+ */ + if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0) + { + entryGetItem(ginstate, entry, advancePast, snapshot); + if (entry->isFinished) + continue; + } + + allFinished = false; + if (ginCompareItemPointers(&entry->curItem, &minItem) < 0) + minItem = entry->curItem; + } + + if (allFinished && !key->excludeOnly) + { + /* all entries are finished */ + key->isFinished = true; + return; + } + + if (!key->excludeOnly) + { + /* + * For a normal scan key, we now know there are no matches < minItem. + * + * If minItem is lossy, it means that there were no exact items on the + * page among requiredEntries, because lossy pointers sort after exact + * items. However, there might be exact items for the same page among + * additionalEntries, so we mustn't advance past them. + */ + if (ItemPointerIsLossyPage(&minItem)) + { + if (GinItemPointerGetBlockNumber(&advancePast) < + GinItemPointerGetBlockNumber(&minItem)) + { + ItemPointerSet(&advancePast, + GinItemPointerGetBlockNumber(&minItem), + InvalidOffsetNumber); + } + } + else + { + Assert(GinItemPointerGetOffsetNumber(&minItem) > 0); + ItemPointerSet(&advancePast, + GinItemPointerGetBlockNumber(&minItem), + OffsetNumberPrev(GinItemPointerGetOffsetNumber(&minItem))); + } + } + else + { + /* + * excludeOnly scan keys don't have any entries that are necessarily + * present in matching items. So, we consider the item just after + * advancePast. + */ + Assert(key->nrequired == 0); + ItemPointerSet(&minItem, + GinItemPointerGetBlockNumber(&advancePast), + OffsetNumberNext(GinItemPointerGetOffsetNumber(&advancePast))); + } + + /* + * We might not have loaded all the entry streams for this TID yet. We + * could call the consistent function, passing MAYBE for those entries, to + * see if it can decide if this TID matches based on the information we + * have. But if the consistent-function is expensive, and cannot in fact + * decide with partial information, that could be a big loss. So, load all + * the additional entries, before calling the consistent function. + */ + for (i = 0; i < key->nadditional; i++) + { + entry = key->additionalEntries[i]; + + if (entry->isFinished) + continue; + + if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0) + { + entryGetItem(ginstate, entry, advancePast, snapshot); + if (entry->isFinished) + continue; + } + + /* + * Normally, none of the items in additionalEntries can have a curItem + * larger than minItem. But if minItem is a lossy page, then there + * might be exact items on the same page among additionalEntries. + */ + if (ginCompareItemPointers(&entry->curItem, &minItem) < 0) + { + Assert(ItemPointerIsLossyPage(&minItem)); + minItem = entry->curItem; + } + } + + /* + * Ok, we've advanced all the entries up to minItem now. Set key->curItem, + * and perform consistentFn test. + * + * Lossy-page entries pose a problem, since we don't know the correct + * entryRes state to pass to the consistentFn, and we also don't know what + * its combining logic will be (could be AND, OR, or even NOT). If the + * logic is OR then the consistentFn might succeed for all items in the + * lossy page even when none of the other entries match. + * + * Our strategy is to call the tri-state consistent function, with the + * lossy-page entries set to MAYBE, and all the other entries FALSE. If it + * returns FALSE, none of the lossy items alone are enough for a match, so + * we don't need to return a lossy-page pointer. 
Otherwise, return a + * lossy-page pointer to indicate that the whole heap page must be + * checked. (On subsequent calls, we'll do nothing until minItem is past + * the page altogether, thus ensuring that we never return both regular + * and lossy pointers for the same page.) + * + * An exception is that it doesn't matter what we pass for lossy pointers + * in "hidden" entries, because the consistentFn's result can't depend on + * them. We could pass them as MAYBE as well, but if we're using the + * "shim" implementation of a tri-state consistent function (see + * ginlogic.c), it's better to pass as few MAYBEs as possible. So pass + * them as true. + * + * Note that only lossy-page entries pointing to the current item's page + * should trigger this processing; we might have future lossy pages in the + * entry array, but they aren't relevant yet. + */ + key->curItem = minItem; + ItemPointerSetLossyPage(&curPageLossy, + GinItemPointerGetBlockNumber(&key->curItem)); + haveLossyEntry = false; + for (i = 0; i < key->nentries; i++) + { + entry = key->scanEntry[i]; + if (entry->isFinished == false && + ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0) + { + if (i < key->nuserentries) + key->entryRes[i] = GIN_MAYBE; + else + key->entryRes[i] = GIN_TRUE; + haveLossyEntry = true; + } + else + key->entryRes[i] = GIN_FALSE; + } + + /* prepare for calling consistentFn in temp context */ + oldCtx = MemoryContextSwitchTo(tempCtx); + + if (haveLossyEntry) + { + /* Have lossy-page entries, so see if whole page matches */ + res = key->triConsistentFn(key); + + if (res == GIN_TRUE || res == GIN_MAYBE) + { + /* Yes, so clean up ... */ + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(tempCtx); + + /* and return lossy pointer for whole page */ + key->curItem = curPageLossy; + key->curItemMatches = true; + key->recheckCurItem = true; + return; + } + } + + /* + * At this point we know that we don't need to return a lossy whole-page + * pointer, but we might have matches for individual exact item pointers, + * possibly in combination with a lossy pointer. Pass lossy pointers as + * MAYBE to the ternary consistent function, to let it decide if this + * tuple satisfies the overall key, even though we don't know if the lossy + * entries match. + * + * Prepare entryRes array to be passed to consistentFn. + */ + for (i = 0; i < key->nentries; i++) + { + entry = key->scanEntry[i]; + if (entry->isFinished) + key->entryRes[i] = GIN_FALSE; +#if 0 + + /* + * This case can't currently happen, because we loaded all the entries + * for this item earlier. + */ + else if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0) + key->entryRes[i] = GIN_MAYBE; +#endif + else if (ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0) + key->entryRes[i] = GIN_MAYBE; + else if (ginCompareItemPointers(&entry->curItem, &minItem) == 0) + key->entryRes[i] = GIN_TRUE; + else + key->entryRes[i] = GIN_FALSE; + } + + res = key->triConsistentFn(key); + + switch (res) + { + case GIN_TRUE: + key->curItemMatches = true; + /* triConsistentFn set recheckCurItem */ + break; + + case GIN_FALSE: + key->curItemMatches = false; + break; + + case GIN_MAYBE: + key->curItemMatches = true; + key->recheckCurItem = true; + break; + + default: + + /* + * the 'default' case shouldn't happen, but if the consistent + * function returns something bogus, this is the safe result + */ + key->curItemMatches = true; + key->recheckCurItem = true; + break; + } + + /* + * We have a tuple, and we know if it matches or not. 
If it's a non-match,
+ * we could continue to find the next matching tuple, but let's break out
+ * and give scanGetItem a chance to advance the other keys. They might be
+ * able to skip past to a much higher TID, allowing us to save work.
+ */
+
+ /* clean up after consistentFn calls */
+ MemoryContextSwitchTo(oldCtx);
+ MemoryContextReset(tempCtx);
+}
+
+/*
+ * Get next heap item pointer (after advancePast) from scan.
+ * Returns true if anything found.
+ * On success, *item and *recheck are set.
+ *
+ * Note: this is very nearly the same logic as in keyGetItem(), except
+ * that we know the keys are to be combined with AND logic, whereas in
+ * keyGetItem() the combination logic is known only to the consistentFn.
+ */
+static bool
+scanGetItem(IndexScanDesc scan, ItemPointerData advancePast,
+ ItemPointerData *item, bool *recheck)
+{
+ GinScanOpaque so = (GinScanOpaque) scan->opaque;
+ uint32 i;
+ bool match;
+
+ /*----------
+ * Advance the scan keys in lock-step, until we find an item that matches
+ * all the keys. If any key reports isFinished, meaning its subset of the
+ * entries is exhausted, we can stop. Otherwise, set *item to the next
+ * matching item.
+ *
+ * This logic works only if a keyGetItem stream can never contain both
+ * exact and lossy pointers for the same page. Else we could have a
+ * case like
+ *
+ * stream 1 stream 2
+ * ... ...
+ * 42/6 42/7
+ * 50/1 42/0xffff
+ * ... ...
+ *
+ * We would conclude that 42/6 is not a match and advance stream 1,
+ * thus never detecting the match to the lossy pointer in stream 2.
+ * (keyGetItem has a similar problem versus entryGetItem.)
+ *----------
+ */
+ do
+ {
+ ItemPointerSetMin(item);
+ match = true;
+ for (i = 0; i < so->nkeys && match; i++)
+ {
+ GinScanKey key = so->keys + i;
+
+ /*
+ * If we're considering a lossy page, skip excludeOnly keys. They
+ * can't exclude the whole page anyway.
+ */
+ if (ItemPointerIsLossyPage(item) && key->excludeOnly)
+ {
+ /*
+ * ginNewScanKey() should never mark the first key as
+ * excludeOnly.
+ */
+ Assert(i > 0);
+ continue;
+ }
+
+ /* Fetch the next item for this key that is > advancePast. */
+ keyGetItem(&so->ginstate, so->tempCtx, key, advancePast,
+ scan->xs_snapshot);
+
+ if (key->isFinished)
+ return false;
+
+ /*
+ * If it's not a match, we can immediately conclude that nothing
+ * <= this item matches, without checking the rest of the keys.
+ */
+ if (!key->curItemMatches)
+ {
+ advancePast = key->curItem;
+ match = false;
+ break;
+ }
+
+ /*
+ * It's a match. We can conclude that nothing < matches, so the
+ * other key streams can skip to this item.
+ *
+ * Beware of lossy pointers, though; from a lossy pointer, we can
+ * only conclude that nothing smaller than this *block* matches.
+ */
+ if (ItemPointerIsLossyPage(&key->curItem))
+ {
+ if (GinItemPointerGetBlockNumber(&advancePast) <
+ GinItemPointerGetBlockNumber(&key->curItem))
+ {
+ ItemPointerSet(&advancePast,
+ GinItemPointerGetBlockNumber(&key->curItem),
+ InvalidOffsetNumber);
+ }
+ }
+ else
+ {
+ Assert(GinItemPointerGetOffsetNumber(&key->curItem) > 0);
+ ItemPointerSet(&advancePast,
+ GinItemPointerGetBlockNumber(&key->curItem),
+ OffsetNumberPrev(GinItemPointerGetOffsetNumber(&key->curItem)));
+ }
+
+ /*
+ * If this is the first key, remember this location as a potential
+ * match, and proceed to check the rest of the keys.
+ *
+ * Otherwise, check if this is the same item that we checked the
+ * previous keys for (or a lossy pointer for the same page).
If + * not, loop back to check the previous keys for this item (we + * will check this key again too, but keyGetItem returns quickly + * for that) + */ + if (i == 0) + { + *item = key->curItem; + } + else + { + if (ItemPointerIsLossyPage(&key->curItem) || + ItemPointerIsLossyPage(item)) + { + Assert(GinItemPointerGetBlockNumber(&key->curItem) >= GinItemPointerGetBlockNumber(item)); + match = (GinItemPointerGetBlockNumber(&key->curItem) == + GinItemPointerGetBlockNumber(item)); + } + else + { + Assert(ginCompareItemPointers(&key->curItem, item) >= 0); + match = (ginCompareItemPointers(&key->curItem, item) == 0); + } + } + } + } while (!match); + + Assert(!ItemPointerIsMin(item)); + + /* + * Now *item contains the first ItemPointer after previous result that + * satisfied all the keys for that exact TID, or a lossy reference to the + * same page. + * + * We must return recheck = true if any of the keys are marked recheck. + */ + *recheck = false; + for (i = 0; i < so->nkeys; i++) + { + GinScanKey key = so->keys + i; + + if (key->recheckCurItem) + { + *recheck = true; + break; + } + } + + return true; +} + + +/* + * Functions for scanning the pending list + */ + + +/* + * Get ItemPointer of next heap row to be checked from pending list. + * Returns false if there are no more. On pages with several heap rows + * it returns each row separately, on page with part of heap row returns + * per page data. pos->firstOffset and pos->lastOffset are set to identify + * the range of pending-list tuples belonging to this heap row. + * + * The pendingBuffer is presumed pinned and share-locked on entry, and is + * pinned and share-locked on success exit. On failure exit it's released. + */ +static bool +scanGetCandidate(IndexScanDesc scan, pendingPosition *pos) +{ + OffsetNumber maxoff; + Page page; + IndexTuple itup; + + ItemPointerSetInvalid(&pos->item); + for (;;) + { + page = BufferGetPage(pos->pendingBuffer); + TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page); + + maxoff = PageGetMaxOffsetNumber(page); + if (pos->firstOffset > maxoff) + { + BlockNumber blkno = GinPageGetOpaque(page)->rightlink; + + if (blkno == InvalidBlockNumber) + { + UnlockReleaseBuffer(pos->pendingBuffer); + pos->pendingBuffer = InvalidBuffer; + + return false; + } + else + { + /* + * Here we must prevent deletion of next page by insertcleanup + * process, which may be trying to obtain exclusive lock on + * current page. 
So, we lock next page before releasing the + * current one + */ + Buffer tmpbuf = ReadBuffer(scan->indexRelation, blkno); + + LockBuffer(tmpbuf, GIN_SHARE); + UnlockReleaseBuffer(pos->pendingBuffer); + + pos->pendingBuffer = tmpbuf; + pos->firstOffset = FirstOffsetNumber; + } + } + else + { + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->firstOffset)); + pos->item = itup->t_tid; + if (GinPageHasFullRow(page)) + { + /* + * find itempointer to the next row + */ + for (pos->lastOffset = pos->firstOffset + 1; pos->lastOffset <= maxoff; pos->lastOffset++) + { + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->lastOffset)); + if (!ItemPointerEquals(&pos->item, &itup->t_tid)) + break; + } + } + else + { + /* + * All itempointers are the same on this page + */ + pos->lastOffset = maxoff + 1; + } + + /* + * Now pos->firstOffset points to the first tuple of current heap + * row, pos->lastOffset points to the first tuple of next heap row + * (or to the end of page) + */ + break; + } + } + + return true; +} + +/* + * Scan pending-list page from current tuple (off) up till the first of: + * - match is found (then returns true) + * - no later match is possible + * - tuple's attribute number is not equal to entry's attrnum + * - reach end of page + * + * datum[]/category[]/datumExtracted[] arrays are used to cache the results + * of gintuple_get_key() on the current page. + */ +static bool +matchPartialInPendingList(GinState *ginstate, Page page, + OffsetNumber off, OffsetNumber maxoff, + GinScanEntry entry, + Datum *datum, GinNullCategory *category, + bool *datumExtracted) +{ + IndexTuple itup; + int32 cmp; + + /* Partial match to a null is not possible */ + if (entry->queryCategory != GIN_CAT_NORM_KEY) + return false; + + while (off < maxoff) + { + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off)); + + if (gintuple_get_attrnum(ginstate, itup) != entry->attnum) + return false; + + if (datumExtracted[off - 1] == false) + { + datum[off - 1] = gintuple_get_key(ginstate, itup, + &category[off - 1]); + datumExtracted[off - 1] = true; + } + + /* Once we hit nulls, no further match is possible */ + if (category[off - 1] != GIN_CAT_NORM_KEY) + return false; + + /*---------- + * Check partial match. + * case cmp == 0 => match + * case cmp > 0 => not match and end scan (no later match possible) + * case cmp < 0 => not match and continue scan + *---------- + */ + cmp = DatumGetInt32(FunctionCall4Coll(&ginstate->comparePartialFn[entry->attnum - 1], + ginstate->supportCollation[entry->attnum - 1], + entry->queryKey, + datum[off - 1], + UInt16GetDatum(entry->strategy), + PointerGetDatum(entry->extra_data))); + if (cmp == 0) + return true; + else if (cmp > 0) + return false; + + off++; + } + + return false; +} + +/* + * Set up the entryRes array for each key by looking at + * every entry for current heap row in pending list. + * + * Returns true if each scan key has at least one entryRes match. + * This corresponds to the situations where the normal index search will + * try to apply the key's consistentFn. (A tuple not meeting that requirement + * cannot be returned by the normal search since no entry stream will + * source its TID.) + * + * The pendingBuffer is presumed pinned and share-locked on entry. 
+ */ +static bool +collectMatchesForHeapRow(IndexScanDesc scan, pendingPosition *pos) +{ + GinScanOpaque so = (GinScanOpaque) scan->opaque; + OffsetNumber attrnum; + Page page; + IndexTuple itup; + int i, + j; + + /* + * Reset all entryRes and hasMatchKey flags + */ + for (i = 0; i < so->nkeys; i++) + { + GinScanKey key = so->keys + i; + + memset(key->entryRes, GIN_FALSE, key->nentries); + } + memset(pos->hasMatchKey, false, so->nkeys); + + /* + * Outer loop iterates over multiple pending-list pages when a single heap + * row has entries spanning those pages. + */ + for (;;) + { + Datum datum[BLCKSZ / sizeof(IndexTupleData)]; + GinNullCategory category[BLCKSZ / sizeof(IndexTupleData)]; + bool datumExtracted[BLCKSZ / sizeof(IndexTupleData)]; + + Assert(pos->lastOffset > pos->firstOffset); + memset(datumExtracted + pos->firstOffset - 1, 0, + sizeof(bool) * (pos->lastOffset - pos->firstOffset)); + + page = BufferGetPage(pos->pendingBuffer); + TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page); + + for (i = 0; i < so->nkeys; i++) + { + GinScanKey key = so->keys + i; + + for (j = 0; j < key->nentries; j++) + { + GinScanEntry entry = key->scanEntry[j]; + OffsetNumber StopLow = pos->firstOffset, + StopHigh = pos->lastOffset, + StopMiddle; + + /* If already matched on earlier page, do no extra work */ + if (key->entryRes[j]) + continue; + + /* + * Interesting tuples are from pos->firstOffset to + * pos->lastOffset and they are ordered by (attnum, Datum) as + * it's done in entry tree. So we can use binary search to + * avoid linear scanning. + */ + while (StopLow < StopHigh) + { + int res; + + StopMiddle = StopLow + ((StopHigh - StopLow) >> 1); + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, StopMiddle)); + + attrnum = gintuple_get_attrnum(&so->ginstate, itup); + + if (key->attnum < attrnum) + { + StopHigh = StopMiddle; + continue; + } + if (key->attnum > attrnum) + { + StopLow = StopMiddle + 1; + continue; + } + + if (datumExtracted[StopMiddle - 1] == false) + { + datum[StopMiddle - 1] = + gintuple_get_key(&so->ginstate, itup, + &category[StopMiddle - 1]); + datumExtracted[StopMiddle - 1] = true; + } + + if (entry->queryCategory == GIN_CAT_EMPTY_QUERY) + { + /* special behavior depending on searchMode */ + if (entry->searchMode == GIN_SEARCH_MODE_ALL) + { + /* match anything except NULL_ITEM */ + if (category[StopMiddle - 1] == GIN_CAT_NULL_ITEM) + res = -1; + else + res = 0; + } + else + { + /* match everything */ + res = 0; + } + } + else + { + res = ginCompareEntries(&so->ginstate, + entry->attnum, + entry->queryKey, + entry->queryCategory, + datum[StopMiddle - 1], + category[StopMiddle - 1]); + } + + if (res == 0) + { + /* + * Found exact match (there can be only one, except in + * EMPTY_QUERY mode). + * + * If doing partial match, scan forward from here to + * end of page to check for matches. + * + * See comment above about tuple's ordering. + */ + if (entry->isPartialMatch) + key->entryRes[j] = + matchPartialInPendingList(&so->ginstate, + page, + StopMiddle, + pos->lastOffset, + entry, + datum, + category, + datumExtracted); + else + key->entryRes[j] = true; + + /* done with binary search */ + break; + } + else if (res < 0) + StopHigh = StopMiddle; + else + StopLow = StopMiddle + 1; + } + + if (StopLow >= StopHigh && entry->isPartialMatch) + { + /* + * No exact match on this page. If doing partial match, + * scan from the first tuple greater than target value to + * end of page. 
Note that since we don't remember whether + * the comparePartialFn told us to stop early on a + * previous page, we will uselessly apply comparePartialFn + * to the first tuple on each subsequent page. + */ + key->entryRes[j] = + matchPartialInPendingList(&so->ginstate, + page, + StopHigh, + pos->lastOffset, + entry, + datum, + category, + datumExtracted); + } + + pos->hasMatchKey[i] |= key->entryRes[j]; + } + } + + /* Advance firstOffset over the scanned tuples */ + pos->firstOffset = pos->lastOffset; + + if (GinPageHasFullRow(page)) + { + /* + * We have examined all pending entries for the current heap row. + * Break out of loop over pages. + */ + break; + } + else + { + /* + * Advance to next page of pending entries for the current heap + * row. Complain if there isn't one. + */ + ItemPointerData item = pos->item; + + if (scanGetCandidate(scan, pos) == false || + !ItemPointerEquals(&pos->item, &item)) + elog(ERROR, "could not find additional pending pages for same heap tuple"); + } + } + + /* + * All scan keys except excludeOnly require at least one entry to match. + * excludeOnly keys are an exception, because their implied + * GIN_CAT_EMPTY_QUERY scanEntry always matches. So return "true" if all + * non-excludeOnly scan keys have at least one match. + */ + for (i = 0; i < so->nkeys; i++) + { + if (pos->hasMatchKey[i] == false && !so->keys[i].excludeOnly) + return false; + } + + return true; +} + +/* + * Collect all matched rows from pending list into bitmap. + */ +static void +scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids) +{ + GinScanOpaque so = (GinScanOpaque) scan->opaque; + MemoryContext oldCtx; + bool recheck, + match; + int i; + pendingPosition pos; + Buffer metabuffer = ReadBuffer(scan->indexRelation, GIN_METAPAGE_BLKNO); + Page page; + BlockNumber blkno; + + *ntids = 0; + + /* + * Acquire predicate lock on the metapage, to conflict with any fastupdate + * insertions. + */ + PredicateLockPage(scan->indexRelation, GIN_METAPAGE_BLKNO, scan->xs_snapshot); + + LockBuffer(metabuffer, GIN_SHARE); + page = BufferGetPage(metabuffer); + TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page); + blkno = GinPageGetMeta(page)->head; + + /* + * fetch head of list before unlocking metapage. head page must be pinned + * to prevent deletion by vacuum process + */ + if (blkno == InvalidBlockNumber) + { + /* No pending list, so proceed with normal scan */ + UnlockReleaseBuffer(metabuffer); + return; + } + + pos.pendingBuffer = ReadBuffer(scan->indexRelation, blkno); + LockBuffer(pos.pendingBuffer, GIN_SHARE); + pos.firstOffset = FirstOffsetNumber; + UnlockReleaseBuffer(metabuffer); + pos.hasMatchKey = palloc(sizeof(bool) * so->nkeys); + + /* + * loop for each heap row. scanGetCandidate returns full row or row's + * tuples from first page. + */ + while (scanGetCandidate(scan, &pos)) + { + /* + * Check entries in tuple and set up entryRes array. + * + * If pending tuples belonging to the current heap row are spread + * across several pages, collectMatchesForHeapRow will read all of + * those pages. + */ + if (!collectMatchesForHeapRow(scan, &pos)) + continue; + + /* + * Matching of entries of one row is finished, so check row using + * consistent functions. 
+ */ + oldCtx = MemoryContextSwitchTo(so->tempCtx); + recheck = false; + match = true; + + for (i = 0; i < so->nkeys; i++) + { + GinScanKey key = so->keys + i; + + if (!key->boolConsistentFn(key)) + { + match = false; + break; + } + recheck |= key->recheckCurItem; + } + + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(so->tempCtx); + + if (match) + { + tbm_add_tuples(tbm, &pos.item, 1, recheck); + (*ntids)++; + } + } + + pfree(pos.hasMatchKey); +} + + +#define GinIsVoidRes(s) ( ((GinScanOpaque) scan->opaque)->isVoidRes ) + +int64 +gingetbitmap(IndexScanDesc scan, TIDBitmap *tbm) +{ + GinScanOpaque so = (GinScanOpaque) scan->opaque; + int64 ntids; + ItemPointerData iptr; + bool recheck; + + /* + * Set up the scan keys, and check for unsatisfiable query. + */ + ginFreeScanKeys(so); /* there should be no keys yet, but just to be + * sure */ + ginNewScanKey(scan); + + if (GinIsVoidRes(scan)) + return 0; + + ntids = 0; + + /* + * First, scan the pending list and collect any matching entries into the + * bitmap. After we scan a pending item, some other backend could post it + * into the main index, and so we might visit it a second time during the + * main scan. This is okay because we'll just re-set the same bit in the + * bitmap. (The possibility of duplicate visits is a major reason why GIN + * can't support the amgettuple API, however.) Note that it would not do + * to scan the main index before the pending list, since concurrent + * cleanup could then make us miss entries entirely. + */ + scanPendingInsert(scan, tbm, &ntids); + + /* + * Now scan the main index. + */ + startScan(scan); + + ItemPointerSetMin(&iptr); + + for (;;) + { + CHECK_FOR_INTERRUPTS(); + + if (!scanGetItem(scan, iptr, &iptr, &recheck)) + break; + + if (ItemPointerIsLossyPage(&iptr)) + tbm_add_page(tbm, ItemPointerGetBlockNumber(&iptr)); + else + tbm_add_tuples(tbm, &iptr, 1, recheck); + ntids++; + } + + return ntids; +} diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c new file mode 100644 index 0000000..0e8672c --- /dev/null +++ b/src/backend/access/gin/gininsert.c @@ -0,0 +1,541 @@ +/*------------------------------------------------------------------------- + * + * gininsert.c + * insert routines for the postgres inverted index access method. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/gininsert.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/gin_private.h" +#include "access/ginxlog.h" +#include "access/tableam.h" +#include "access/xloginsert.h" +#include "catalog/index.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "storage/indexfsm.h" +#include "storage/predicate.h" +#include "storage/smgr.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +typedef struct +{ + GinState ginstate; + double indtuples; + GinStatsData buildStats; + MemoryContext tmpCtx; + MemoryContext funcCtx; + BuildAccumulator accum; +} GinBuildState; + + +/* + * Adds array of item pointers to tuple's posting list, or + * creates posting tree and tuple pointing to tree in case + * of not enough space. Max size of tuple is defined in + * GinFormTuple(). Returns a new, modified index tuple. + * items[] must be in sorted order with no duplicates. 
+ */
+static IndexTuple
+addItemPointersToLeafTuple(GinState *ginstate,
+ IndexTuple old,
+ ItemPointerData *items, uint32 nitem,
+ GinStatsData *buildStats, Buffer buffer)
+{
+ OffsetNumber attnum;
+ Datum key;
+ GinNullCategory category;
+ IndexTuple res;
+ ItemPointerData *newItems,
+ *oldItems;
+ int oldNPosting,
+ newNPosting;
+ GinPostingList *compressedList;
+
+ Assert(!GinIsPostingTree(old));
+
+ attnum = gintuple_get_attrnum(ginstate, old);
+ key = gintuple_get_key(ginstate, old, &category);
+
+ /* merge the old and new posting lists */
+ oldItems = ginReadTuple(ginstate, attnum, old, &oldNPosting);
+
+ newItems = ginMergeItemPointers(items, nitem,
+ oldItems, oldNPosting,
+ &newNPosting);
+
+ /* Compress the posting list, and try to build a tuple with room for it */
+ res = NULL;
+ compressedList = ginCompressPostingList(newItems, newNPosting, GinMaxItemSize,
+ NULL);
+ pfree(newItems);
+ if (compressedList)
+ {
+ res = GinFormTuple(ginstate, attnum, key, category,
+ (char *) compressedList,
+ SizeOfGinPostingList(compressedList),
+ newNPosting,
+ false);
+ pfree(compressedList);
+ }
+ if (!res)
+ {
+ /* posting list would be too big, convert to posting tree */
+ BlockNumber postingRoot;
+
+ /*
+ * Initialize posting tree with the old tuple's posting list. It's
+ * surely small enough to fit on one posting-tree page, and should
+ * already be in order with no duplicates.
+ */
+ postingRoot = createPostingTree(ginstate->index,
+ oldItems,
+ oldNPosting,
+ buildStats,
+ buffer);
+
+ /* Now insert the TIDs-to-be-added into the posting tree */
+ ginInsertItemPointers(ginstate->index, postingRoot,
+ items, nitem,
+ buildStats);
+
+ /* And build a new posting-tree-only result tuple */
+ res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, 0, true);
+ GinSetPostingTree(res, postingRoot);
+ }
+ pfree(oldItems);
+
+ return res;
+}
+
+/*
+ * Build a fresh leaf tuple, either posting-list or posting-tree format
+ * depending on whether the given items list will fit.
+ * items[] must be in sorted order with no duplicates.
+ *
+ * This is basically the same logic as in addItemPointersToLeafTuple,
+ * but working from slightly different input.
+ */
+static IndexTuple
+buildFreshLeafTuple(GinState *ginstate,
+ OffsetNumber attnum, Datum key, GinNullCategory category,
+ ItemPointerData *items, uint32 nitem,
+ GinStatsData *buildStats, Buffer buffer)
+{
+ IndexTuple res = NULL;
+ GinPostingList *compressedList;
+
+ /* try to build a posting list tuple with all the items */
+ compressedList = ginCompressPostingList(items, nitem, GinMaxItemSize, NULL);
+ if (compressedList)
+ {
+ res = GinFormTuple(ginstate, attnum, key, category,
+ (char *) compressedList,
+ SizeOfGinPostingList(compressedList),
+ nitem, false);
+ pfree(compressedList);
+ }
+ if (!res)
+ {
+ /* posting list would be too big, build posting tree */
+ BlockNumber postingRoot;
+
+ /*
+ * Build posting-tree-only result tuple. We do this first so as to
+ * fail quickly if the key is too big.
+ */
+ res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, 0, true);
+
+ /*
+ * Initialize a new posting tree with the TIDs.
+ */
+ postingRoot = createPostingTree(ginstate->index, items, nitem,
+ buildStats, buffer);
+
+ /* And save the root link in the result tuple */
+ GinSetPostingTree(res, postingRoot);
+ }
+
+ return res;
+}
+
+/*
+ * Insert one or more heap TIDs associated with the given key value.
+ * This will either add a single key entry, or enlarge a pre-existing entry.
+ * + * During an index build, buildStats is non-null and the counters + * it contains should be incremented as needed. + */ +void +ginEntryInsert(GinState *ginstate, + OffsetNumber attnum, Datum key, GinNullCategory category, + ItemPointerData *items, uint32 nitem, + GinStatsData *buildStats) +{ + GinBtreeData btree; + GinBtreeEntryInsertData insertdata; + GinBtreeStack *stack; + IndexTuple itup; + Page page; + + insertdata.isDelete = false; + + ginPrepareEntryScan(&btree, attnum, key, category, ginstate); + btree.isBuild = (buildStats != NULL); + + stack = ginFindLeafPage(&btree, false, false, NULL); + page = BufferGetPage(stack->buffer); + + if (btree.findItem(&btree, stack)) + { + /* found pre-existing entry */ + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off)); + + if (GinIsPostingTree(itup)) + { + /* add entries to existing posting tree */ + BlockNumber rootPostingTree = GinGetPostingTree(itup); + + /* release all stack */ + LockBuffer(stack->buffer, GIN_UNLOCK); + freeGinBtreeStack(stack); + + /* insert into posting tree */ + ginInsertItemPointers(ginstate->index, rootPostingTree, + items, nitem, + buildStats); + return; + } + + CheckForSerializableConflictIn(ginstate->index, NULL, + BufferGetBlockNumber(stack->buffer)); + /* modify an existing leaf entry */ + itup = addItemPointersToLeafTuple(ginstate, itup, + items, nitem, buildStats, stack->buffer); + + insertdata.isDelete = true; + } + else + { + CheckForSerializableConflictIn(ginstate->index, NULL, + BufferGetBlockNumber(stack->buffer)); + /* no match, so construct a new leaf entry */ + itup = buildFreshLeafTuple(ginstate, attnum, key, category, + items, nitem, buildStats, stack->buffer); + + /* + * nEntries counts leaf tuples, so increment it only when we make a + * new one. + */ + if (buildStats) + buildStats->nEntries++; + } + + /* Insert the new or modified leaf tuple */ + insertdata.entry = itup; + ginInsertValue(&btree, stack, &insertdata, buildStats); + pfree(itup); +} + +/* + * Extract index entries for a single indexable item, and add them to the + * BuildAccumulator's state. + * + * This function is used only during initial index creation. 
+ */ +static void +ginHeapTupleBulkInsert(GinBuildState *buildstate, OffsetNumber attnum, + Datum value, bool isNull, + ItemPointer heapptr) +{ + Datum *entries; + GinNullCategory *categories; + int32 nentries; + MemoryContext oldCtx; + + oldCtx = MemoryContextSwitchTo(buildstate->funcCtx); + entries = ginExtractEntries(buildstate->accum.ginstate, attnum, + value, isNull, + &nentries, &categories); + MemoryContextSwitchTo(oldCtx); + + ginInsertBAEntries(&buildstate->accum, heapptr, attnum, + entries, categories, nentries); + + buildstate->indtuples += nentries; + + MemoryContextReset(buildstate->funcCtx); +} + +static void +ginBuildCallback(Relation index, ItemPointer tid, Datum *values, + bool *isnull, bool tupleIsAlive, void *state) +{ + GinBuildState *buildstate = (GinBuildState *) state; + MemoryContext oldCtx; + int i; + + oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx); + + for (i = 0; i < buildstate->ginstate.origTupdesc->natts; i++) + ginHeapTupleBulkInsert(buildstate, (OffsetNumber) (i + 1), + values[i], isnull[i], tid); + + /* If we've maxed out our available memory, dump everything to the index */ + if (buildstate->accum.allocatedMemory >= (Size) maintenance_work_mem * 1024L) + { + ItemPointerData *list; + Datum key; + GinNullCategory category; + uint32 nlist; + OffsetNumber attnum; + + ginBeginBAScan(&buildstate->accum); + while ((list = ginGetBAEntry(&buildstate->accum, + &attnum, &key, &category, &nlist)) != NULL) + { + /* there could be many entries, so be willing to abort here */ + CHECK_FOR_INTERRUPTS(); + ginEntryInsert(&buildstate->ginstate, attnum, key, category, + list, nlist, &buildstate->buildStats); + } + + MemoryContextReset(buildstate->tmpCtx); + ginInitBA(&buildstate->accum); + } + + MemoryContextSwitchTo(oldCtx); +} + +IndexBuildResult * +ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) +{ + IndexBuildResult *result; + double reltuples; + GinBuildState buildstate; + Buffer RootBuffer, + MetaBuffer; + ItemPointerData *list; + Datum key; + GinNullCategory category; + uint32 nlist; + MemoryContext oldCtx; + OffsetNumber attnum; + + if (RelationGetNumberOfBlocks(index) != 0) + elog(ERROR, "index \"%s\" already contains data", + RelationGetRelationName(index)); + + initGinState(&buildstate.ginstate, index); + buildstate.indtuples = 0; + memset(&buildstate.buildStats, 0, sizeof(GinStatsData)); + + /* initialize the meta page */ + MetaBuffer = GinNewBuffer(index); + + /* initialize the root page */ + RootBuffer = GinNewBuffer(index); + + START_CRIT_SECTION(); + GinInitMetabuffer(MetaBuffer); + MarkBufferDirty(MetaBuffer); + GinInitBuffer(RootBuffer, GIN_LEAF); + MarkBufferDirty(RootBuffer); + + + UnlockReleaseBuffer(MetaBuffer); + UnlockReleaseBuffer(RootBuffer); + END_CRIT_SECTION(); + + /* count the root as first entry page */ + buildstate.buildStats.nEntryPages++; + + /* + * create a temporary memory context that is used to hold data not yet + * dumped out to the index + */ + buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext, + "Gin build temporary context", + ALLOCSET_DEFAULT_SIZES); + + /* + * create a temporary memory context that is used for calling + * ginExtractEntries(), and can be reset after each tuple + */ + buildstate.funcCtx = AllocSetContextCreate(CurrentMemoryContext, + "Gin build temporary context for user-defined function", + ALLOCSET_DEFAULT_SIZES); + + buildstate.accum.ginstate = &buildstate.ginstate; + ginInitBA(&buildstate.accum); + + /* + * Do the heap scan. 
We disallow sync scan here because dataPlaceToPage + * prefers to receive tuples in TID order. + */ + reltuples = table_index_build_scan(heap, index, indexInfo, false, true, + ginBuildCallback, (void *) &buildstate, + NULL); + + /* dump remaining entries to the index */ + oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx); + ginBeginBAScan(&buildstate.accum); + while ((list = ginGetBAEntry(&buildstate.accum, + &attnum, &key, &category, &nlist)) != NULL) + { + /* there could be many entries, so be willing to abort here */ + CHECK_FOR_INTERRUPTS(); + ginEntryInsert(&buildstate.ginstate, attnum, key, category, + list, nlist, &buildstate.buildStats); + } + MemoryContextSwitchTo(oldCtx); + + MemoryContextDelete(buildstate.funcCtx); + MemoryContextDelete(buildstate.tmpCtx); + + /* + * Update metapage stats + */ + buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index); + ginUpdateStats(index, &buildstate.buildStats, true); + + /* + * We didn't write WAL records as we built the index, so if WAL-logging is + * required, write all pages to the WAL now. + */ + if (RelationNeedsWAL(index)) + { + log_newpage_range(index, MAIN_FORKNUM, + 0, RelationGetNumberOfBlocks(index), + true); + } + + /* + * Return statistics + */ + result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); + + result->heap_tuples = reltuples; + result->index_tuples = buildstate.indtuples; + + return result; +} + +/* + * ginbuildempty() -- build an empty gin index in the initialization fork + */ +void +ginbuildempty(Relation index) +{ + Buffer RootBuffer, + MetaBuffer; + + /* An empty GIN index has two pages. */ + MetaBuffer = + ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL); + LockBuffer(MetaBuffer, BUFFER_LOCK_EXCLUSIVE); + RootBuffer = + ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL); + LockBuffer(RootBuffer, BUFFER_LOCK_EXCLUSIVE); + + /* Initialize and xlog metabuffer and root buffer. */ + START_CRIT_SECTION(); + GinInitMetabuffer(MetaBuffer); + MarkBufferDirty(MetaBuffer); + log_newpage_buffer(MetaBuffer, true); + GinInitBuffer(RootBuffer, GIN_LEAF); + MarkBufferDirty(RootBuffer); + log_newpage_buffer(RootBuffer, false); + END_CRIT_SECTION(); + + /* Unlock and release the buffers. 
*/ + UnlockReleaseBuffer(MetaBuffer); + UnlockReleaseBuffer(RootBuffer); +} + +/* + * Insert index entries for a single indexable item during "normal" + * (non-fast-update) insertion + */ +static void +ginHeapTupleInsert(GinState *ginstate, OffsetNumber attnum, + Datum value, bool isNull, + ItemPointer item) +{ + Datum *entries; + GinNullCategory *categories; + int32 i, + nentries; + + entries = ginExtractEntries(ginstate, attnum, value, isNull, + &nentries, &categories); + + for (i = 0; i < nentries; i++) + ginEntryInsert(ginstate, attnum, entries[i], categories[i], + item, 1, NULL); +} + +bool +gininsert(Relation index, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + IndexInfo *indexInfo) +{ + GinState *ginstate = (GinState *) indexInfo->ii_AmCache; + MemoryContext oldCtx; + MemoryContext insertCtx; + int i; + + /* Initialize GinState cache if first call in this statement */ + if (ginstate == NULL) + { + oldCtx = MemoryContextSwitchTo(indexInfo->ii_Context); + ginstate = (GinState *) palloc(sizeof(GinState)); + initGinState(ginstate, index); + indexInfo->ii_AmCache = (void *) ginstate; + MemoryContextSwitchTo(oldCtx); + } + + insertCtx = AllocSetContextCreate(CurrentMemoryContext, + "Gin insert temporary context", + ALLOCSET_DEFAULT_SIZES); + + oldCtx = MemoryContextSwitchTo(insertCtx); + + if (GinGetUseFastUpdate(index)) + { + GinTupleCollector collector; + + memset(&collector, 0, sizeof(GinTupleCollector)); + + for (i = 0; i < ginstate->origTupdesc->natts; i++) + ginHeapTupleFastCollect(ginstate, &collector, + (OffsetNumber) (i + 1), + values[i], isnull[i], + ht_ctid); + + ginHeapTupleFastInsert(ginstate, &collector); + } + else + { + for (i = 0; i < ginstate->origTupdesc->natts; i++) + ginHeapTupleInsert(ginstate, (OffsetNumber) (i + 1), + values[i], isnull[i], + ht_ctid); + } + + MemoryContextSwitchTo(oldCtx); + MemoryContextDelete(insertCtx); + + return false; +} diff --git a/src/backend/access/gin/ginlogic.c b/src/backend/access/gin/ginlogic.c new file mode 100644 index 0000000..6bf3288 --- /dev/null +++ b/src/backend/access/gin/ginlogic.c @@ -0,0 +1,246 @@ +/*------------------------------------------------------------------------- + * + * ginlogic.c + * routines for performing binary- and ternary-logic consistent checks. + * + * A GIN operator class can provide a boolean or ternary consistent + * function, or both. This file provides both boolean and ternary + * interfaces to the rest of the GIN code, even if only one of them is + * implemented by the opclass. + * + * Providing a boolean interface when the opclass implements only the + * ternary function is straightforward - just call the ternary function + * with the check-array as is, and map the GIN_TRUE, GIN_FALSE, GIN_MAYBE + * return codes to TRUE, FALSE and TRUE+recheck, respectively. Providing + * a ternary interface when the opclass only implements a boolean function + * is implemented by calling the boolean function many times, with all the + * MAYBE arguments set to all combinations of TRUE and FALSE (up to a + * certain number of MAYBE arguments). + * + * (A boolean function is enough to determine if an item matches, but a + * GIN scan can apply various optimizations if it can determine that an + * item matches or doesn't match, even if it doesn't know if some of the + * keys are present or not. That's what the ternary consistent function + * is used for.) 
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gin/ginlogic.c
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/gin_private.h"
+#include "access/reloptions.h"
+#include "catalog/pg_collation.h"
+#include "catalog/pg_type.h"
+#include "miscadmin.h"
+#include "storage/indexfsm.h"
+#include "storage/lmgr.h"
+
+
+/*
+ * Maximum number of MAYBE inputs that shimTriConsistentFn will try to
+ * resolve by calling all combinations.
+ */
+#define MAX_MAYBE_ENTRIES 4
+
+/*
+ * Dummy consistent functions for an EVERYTHING key. Just claim it matches.
+ */
+static bool
+trueConsistentFn(GinScanKey key)
+{
+ key->recheckCurItem = false;
+ return true;
+}
+static GinTernaryValue
+trueTriConsistentFn(GinScanKey key)
+{
+ return GIN_TRUE;
+}
+
+/*
+ * A helper function for calling a regular, binary logic, consistent function.
+ */
+static bool
+directBoolConsistentFn(GinScanKey key)
+{
+ /*
+ * Initialize recheckCurItem in case the consistentFn doesn't know it
+ * should set it. The safe assumption in that case is to force recheck.
+ */
+ key->recheckCurItem = true;
+
+ return DatumGetBool(FunctionCall8Coll(key->consistentFmgrInfo,
+ key->collation,
+ PointerGetDatum(key->entryRes),
+ UInt16GetDatum(key->strategy),
+ key->query,
+ UInt32GetDatum(key->nuserentries),
+ PointerGetDatum(key->extra_data),
+ PointerGetDatum(&key->recheckCurItem),
+ PointerGetDatum(key->queryValues),
+ PointerGetDatum(key->queryCategories)));
+}
+
+/*
+ * A helper function for calling a native ternary logic consistent function.
+ */
+static GinTernaryValue
+directTriConsistentFn(GinScanKey key)
+{
+ return DatumGetGinTernaryValue(FunctionCall7Coll(key->triConsistentFmgrInfo,
+ key->collation,
+ PointerGetDatum(key->entryRes),
+ UInt16GetDatum(key->strategy),
+ key->query,
+ UInt32GetDatum(key->nuserentries),
+ PointerGetDatum(key->extra_data),
+ PointerGetDatum(key->queryValues),
+ PointerGetDatum(key->queryCategories)));
+}
+
+/*
+ * This function implements a binary logic consistency check, using a ternary
+ * logic consistent function provided by the opclass. GIN_MAYBE return value
+ * is interpreted as true with recheck flag.
+ */
+static bool
+shimBoolConsistentFn(GinScanKey key)
+{
+ GinTernaryValue result;
+
+ result = DatumGetGinTernaryValue(FunctionCall7Coll(key->triConsistentFmgrInfo,
+ key->collation,
+ PointerGetDatum(key->entryRes),
+ UInt16GetDatum(key->strategy),
+ key->query,
+ UInt32GetDatum(key->nuserentries),
+ PointerGetDatum(key->extra_data),
+ PointerGetDatum(key->queryValues),
+ PointerGetDatum(key->queryCategories)));
+ if (result == GIN_MAYBE)
+ {
+ key->recheckCurItem = true;
+ return true;
+ }
+ else
+ {
+ key->recheckCurItem = false;
+ return result;
+ }
+}
+
+/*
+ * This function implements a tri-state consistency check, using a boolean
+ * consistent function provided by the opclass.
+ *
+ * Our strategy is to call consistentFn with MAYBE inputs replaced with every
+ * combination of TRUE/FALSE. If consistentFn returns the same value for every
+ * combination, that's the overall result. Otherwise, return MAYBE. Testing
+ * every combination of n MAYBE inputs takes O(2^n) calls, so this is only
+ * feasible for a small number of MAYBE inputs.
+ *
+ * NB: This function modifies the key->entryRes array!
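+ *
+ * With MAX_MAYBE_ENTRIES = 4, at most 2^4 = 16 combinations are tested,
+ * i.e. at most 16 calls of the boolean consistent function per item.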
+ */
+static GinTernaryValue
+shimTriConsistentFn(GinScanKey key)
+{
+ int nmaybe;
+ int maybeEntries[MAX_MAYBE_ENTRIES];
+ int i;
+ bool boolResult;
+ bool recheck = false;
+ GinTernaryValue curResult;
+
+ /*
+ * Count how many MAYBE inputs there are, and store their indexes in
+ * maybeEntries. If there are too many MAYBE inputs, it's not feasible to
+ * test all combinations, so give up and return MAYBE.
+ */
+ nmaybe = 0;
+ for (i = 0; i < key->nentries; i++)
+ {
+ if (key->entryRes[i] == GIN_MAYBE)
+ {
+ if (nmaybe >= MAX_MAYBE_ENTRIES)
+ return GIN_MAYBE;
+ maybeEntries[nmaybe++] = i;
+ }
+ }
+
+ /*
+ * If none of the inputs were MAYBE, we can just call the consistent
+ * function as is.
+ */
+ if (nmaybe == 0)
+ return directBoolConsistentFn(key);
+
+ /* First call consistent function with all the maybe-inputs set FALSE */
+ for (i = 0; i < nmaybe; i++)
+ key->entryRes[maybeEntries[i]] = GIN_FALSE;
+ curResult = directBoolConsistentFn(key);
+
+ for (;;)
+ {
+ /* Twiddle the entries for next combination. */
+ for (i = 0; i < nmaybe; i++)
+ {
+ if (key->entryRes[maybeEntries[i]] == GIN_FALSE)
+ {
+ key->entryRes[maybeEntries[i]] = GIN_TRUE;
+ break;
+ }
+ else
+ key->entryRes[maybeEntries[i]] = GIN_FALSE;
+ }
+ if (i == nmaybe)
+ break;
+
+ boolResult = directBoolConsistentFn(key);
+ recheck |= key->recheckCurItem;
+
+ if (curResult != boolResult)
+ return GIN_MAYBE;
+ }
+
+ /* TRUE with recheck is taken to mean MAYBE */
+ if (curResult == GIN_TRUE && recheck)
+ curResult = GIN_MAYBE;
+
+ return curResult;
+}
+
+/*
+ * Set up the implementation of the consistent functions for a scan key.
+ */
+void
+ginInitConsistentFunction(GinState *ginstate, GinScanKey key)
+{
+ if (key->searchMode == GIN_SEARCH_MODE_EVERYTHING)
+ {
+ key->boolConsistentFn = trueConsistentFn;
+ key->triConsistentFn = trueTriConsistentFn;
+ }
+ else
+ {
+ key->consistentFmgrInfo = &ginstate->consistentFn[key->attnum - 1];
+ key->triConsistentFmgrInfo = &ginstate->triConsistentFn[key->attnum - 1];
+ key->collation = ginstate->supportCollation[key->attnum - 1];
+
+ if (OidIsValid(ginstate->consistentFn[key->attnum - 1].fn_oid))
+ key->boolConsistentFn = directBoolConsistentFn;
+ else
+ key->boolConsistentFn = shimBoolConsistentFn;
+
+ if (OidIsValid(ginstate->triConsistentFn[key->attnum - 1].fn_oid))
+ key->triConsistentFn = directTriConsistentFn;
+ else
+ key->triConsistentFn = shimTriConsistentFn;
+ }
+}
diff --git a/src/backend/access/gin/ginpostinglist.c b/src/backend/access/gin/ginpostinglist.c
new file mode 100644
index 0000000..216b2b9
--- /dev/null
+++ b/src/backend/access/gin/ginpostinglist.c
@@ -0,0 +1,434 @@
+/*-------------------------------------------------------------------------
+ *
+ * ginpostinglist.c
+ * routines for dealing with posting lists.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gin/ginpostinglist.c
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/gin_private.h"
+
+#ifdef USE_ASSERT_CHECKING
+#define CHECK_ENCODING_ROUNDTRIP
+#endif
+
+/*
+ * For encoding purposes, item pointers are represented as 64-bit unsigned
+ * integers. The lowest 11 bits represent the offset number, and the next
+ * lowest 32 bits are the block number. That leaves 21 bits unused, i.e.
+ * only 43 low bits are used.
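+ * For example, the item pointer (block 4, offset 5) is represented as
+ * (4 << 11) | 5 = 8197.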
+ *
+ * 11 bits is enough for the offset number, because MaxHeapTuplesPerPage <
+ * 2^11 on all supported block sizes. We are frugal with the bits, because
+ * smaller integers use fewer bytes in the varbyte encoding, saving disk
+ * space. (If we get a new table AM in the future that wants to use the full
+ * range of possible offset numbers, we'll need to change this.)
+ *
+ * These 43-bit integers are encoded using varbyte encoding. In each byte,
+ * the 7 low bits contain data, while the highest bit is a continuation bit.
+ * When the continuation bit is set, the next byte is part of the same
+ * integer, otherwise this is the last byte of this integer. 43 bits need
+ * at most 7 bytes in this encoding:
+ *
+ * 0XXXXXXX
+ * 1XXXXXXX 0XXXXYYY
+ * 1XXXXXXX 1XXXXYYY 0YYYYYYY
+ * 1XXXXXXX 1XXXXYYY 1YYYYYYY 0YYYYYYY
+ * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
+ * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
+ * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0uuuuuuY
+ *
+ * X = bits used for offset number
+ * Y = bits used for block number
+ * u = unused bit
+ *
+ * The bytes are stored in little-endian order.
+ *
+ * An important property of this encoding is that removing an item from the
+ * list never increases the size of the resulting compressed posting list.
+ * Proof:
+ *
+ * Removing a number is actually a replacement of two numbers with their sum.
+ * We have to prove that varbyte encoding of a sum can't be longer than varbyte
+ * encoding of its summands. Sum of two numbers is at most one bit wider than
+ * the larger of the summands. Widening a number by one bit enlarges its length
+ * in varbyte encoding by at most one byte. Therefore, varbyte encoding of sum
+ * is at most one byte longer than varbyte encoding of larger summand. The
+ * lesser summand is at least one byte, so the sum cannot take more space than
+ * the summands, Q.E.D.
+ *
+ * This property greatly simplifies VACUUM, which can assume that posting
+ * lists always fit on the same page after vacuuming. Note that even though
+ * that holds for removing items from a posting list, you must also be
+ * careful to not cause expansion e.g. when merging uncompressed items on the
+ * page into the compressed lists, when vacuuming.
+ */
+
+/*
+ * How many bits do you need to encode an offset number? OffsetNumber is a
+ * 16-bit integer, but you can't fit that many items on a page. 11 ought to be
+ * more than enough. It's tempting to derive this from MaxHeapTuplesPerPage,
+ * and use the minimum number of bits, but that would require changing the
+ * on-disk format if MaxHeapTuplesPerPage changes. Better to leave some slack.
+ */
+#define MaxHeapTuplesPerPageBits 11
+
+/* Max. number of bytes needed to encode the largest supported integer. */
+#define MaxBytesPerInteger 7
+
+static inline uint64
+itemptr_to_uint64(const ItemPointer iptr)
+{
+ uint64 val;
+
+ Assert(ItemPointerIsValid(iptr));
+ Assert(GinItemPointerGetOffsetNumber(iptr) < (1 << MaxHeapTuplesPerPageBits));
+
+ val = GinItemPointerGetBlockNumber(iptr);
+ val <<= MaxHeapTuplesPerPageBits;
+ val |= GinItemPointerGetOffsetNumber(iptr);
+
+ return val;
+}
+
+static inline void
+uint64_to_itemptr(uint64 val, ItemPointer iptr)
+{
+ GinItemPointerSetOffsetNumber(iptr, val & ((1 << MaxHeapTuplesPerPageBits) - 1));
+ val = val >> MaxHeapTuplesPerPageBits;
+ GinItemPointerSetBlockNumber(iptr, val);
+
+ Assert(ItemPointerIsValid(iptr));
+}
+
+/*
+ * Varbyte-encode 'val' into *ptr. *ptr is incremented to next integer.
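+ *
+ * For example, the value 8197 (binary 10000000000101) is emitted as the
+ * two bytes 0x85 0x40: seven low-order bits first, with the continuation
+ * bit set on every byte except the last.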
+ */ +static void +encode_varbyte(uint64 val, unsigned char **ptr) +{ + unsigned char *p = *ptr; + + while (val > 0x7F) + { + *(p++) = 0x80 | (val & 0x7F); + val >>= 7; + } + *(p++) = (unsigned char) val; + + *ptr = p; +} + +/* + * Decode varbyte-encoded integer at *ptr. *ptr is incremented to next integer. + */ +static uint64 +decode_varbyte(unsigned char **ptr) +{ + uint64 val; + unsigned char *p = *ptr; + uint64 c; + + /* 1st byte */ + c = *(p++); + val = c & 0x7F; + if (c & 0x80) + { + /* 2nd byte */ + c = *(p++); + val |= (c & 0x7F) << 7; + if (c & 0x80) + { + /* 3rd byte */ + c = *(p++); + val |= (c & 0x7F) << 14; + if (c & 0x80) + { + /* 4th byte */ + c = *(p++); + val |= (c & 0x7F) << 21; + if (c & 0x80) + { + /* 5th byte */ + c = *(p++); + val |= (c & 0x7F) << 28; + if (c & 0x80) + { + /* 6th byte */ + c = *(p++); + val |= (c & 0x7F) << 35; + if (c & 0x80) + { + /* 7th byte, should not have continuation bit */ + c = *(p++); + val |= c << 42; + Assert((c & 0x80) == 0); + } + } + } + } + } + } + + *ptr = p; + + return val; +} + +/* + * Encode a posting list. + * + * The encoded list is returned in a palloc'd struct, which will be at most + * 'maxsize' bytes in size. The number items in the returned segment is + * returned in *nwritten. If it's not equal to nipd, not all the items fit + * in 'maxsize', and only the first *nwritten were encoded. + * + * The allocated size of the returned struct is short-aligned, and the padding + * byte at the end, if any, is zero. + */ +GinPostingList * +ginCompressPostingList(const ItemPointer ipd, int nipd, int maxsize, + int *nwritten) +{ + uint64 prev; + int totalpacked = 0; + int maxbytes; + GinPostingList *result; + unsigned char *ptr; + unsigned char *endptr; + + maxsize = SHORTALIGN_DOWN(maxsize); + + result = palloc(maxsize); + + maxbytes = maxsize - offsetof(GinPostingList, bytes); + Assert(maxbytes > 0); + + /* Store the first special item */ + result->first = ipd[0]; + + prev = itemptr_to_uint64(&result->first); + + ptr = result->bytes; + endptr = result->bytes + maxbytes; + for (totalpacked = 1; totalpacked < nipd; totalpacked++) + { + uint64 val = itemptr_to_uint64(&ipd[totalpacked]); + uint64 delta = val - prev; + + Assert(val > prev); + + if (endptr - ptr >= MaxBytesPerInteger) + encode_varbyte(delta, &ptr); + else + { + /* + * There are less than 7 bytes left. Have to check if the next + * item fits in that space before writing it out. + */ + unsigned char buf[MaxBytesPerInteger]; + unsigned char *p = buf; + + encode_varbyte(delta, &p); + if (p - buf > (endptr - ptr)) + break; /* output is full */ + + memcpy(ptr, buf, p - buf); + ptr += (p - buf); + } + prev = val; + } + result->nbytes = ptr - result->bytes; + + /* + * If we wrote an odd number of bytes, zero out the padding byte at the + * end. + */ + if (result->nbytes != SHORTALIGN(result->nbytes)) + result->bytes[result->nbytes] = 0; + + if (nwritten) + *nwritten = totalpacked; + + Assert(SizeOfGinPostingList(result) <= maxsize); + + /* + * Check that the encoded segment decodes back to the original items. + */ +#if defined (CHECK_ENCODING_ROUNDTRIP) + { + int ndecoded; + ItemPointer tmp = ginPostingListDecode(result, &ndecoded); + + Assert(ndecoded == totalpacked); + Assert(memcmp(tmp, ipd, ndecoded * sizeof(ItemPointerData)) == 0); + pfree(tmp); + } +#endif + + return result; +} + +/* + * Decode a compressed posting list into an array of item pointers. + * The number of items is returned in *ndecoded. 
+ */ +ItemPointer +ginPostingListDecode(GinPostingList *plist, int *ndecoded) +{ + return ginPostingListDecodeAllSegments(plist, + SizeOfGinPostingList(plist), + ndecoded); +} + +/* + * Decode multiple posting list segments into an array of item pointers. + * The number of items is returned in *ndecoded_out. The segments are stored + * one after each other, with total size 'len' bytes. + */ +ItemPointer +ginPostingListDecodeAllSegments(GinPostingList *segment, int len, int *ndecoded_out) +{ + ItemPointer result; + int nallocated; + uint64 val; + char *endseg = ((char *) segment) + len; + int ndecoded; + unsigned char *ptr; + unsigned char *endptr; + + /* + * Guess an initial size of the array. + */ + nallocated = segment->nbytes * 2 + 1; + result = palloc(nallocated * sizeof(ItemPointerData)); + + ndecoded = 0; + while ((char *) segment < endseg) + { + /* enlarge output array if needed */ + if (ndecoded >= nallocated) + { + nallocated *= 2; + result = repalloc(result, nallocated * sizeof(ItemPointerData)); + } + + /* copy the first item */ + Assert(OffsetNumberIsValid(ItemPointerGetOffsetNumber(&segment->first))); + Assert(ndecoded == 0 || ginCompareItemPointers(&segment->first, &result[ndecoded - 1]) > 0); + result[ndecoded] = segment->first; + ndecoded++; + + val = itemptr_to_uint64(&segment->first); + ptr = segment->bytes; + endptr = segment->bytes + segment->nbytes; + while (ptr < endptr) + { + /* enlarge output array if needed */ + if (ndecoded >= nallocated) + { + nallocated *= 2; + result = repalloc(result, nallocated * sizeof(ItemPointerData)); + } + + val += decode_varbyte(&ptr); + + uint64_to_itemptr(val, &result[ndecoded]); + ndecoded++; + } + segment = GinNextPostingListSegment(segment); + } + + if (ndecoded_out) + *ndecoded_out = ndecoded; + return result; +} + +/* + * Add all item pointers from a bunch of posting lists to a TIDBitmap. + */ +int +ginPostingListDecodeAllSegmentsToTbm(GinPostingList *ptr, int len, + TIDBitmap *tbm) +{ + int ndecoded; + ItemPointer items; + + items = ginPostingListDecodeAllSegments(ptr, len, &ndecoded); + tbm_add_tuples(tbm, items, ndecoded, false); + pfree(items); + + return ndecoded; +} + +/* + * Merge two ordered arrays of itempointers, eliminating any duplicates. + * + * Returns a palloc'd array, and *nmerged is set to the number of items in + * the result, after eliminating duplicates. + */ +ItemPointer +ginMergeItemPointers(ItemPointerData *a, uint32 na, + ItemPointerData *b, uint32 nb, + int *nmerged) +{ + ItemPointerData *dst; + + dst = (ItemPointer) palloc((na + nb) * sizeof(ItemPointerData)); + + /* + * If the argument arrays don't overlap, we can just append them to each + * other. 
+ */ + if (na == 0 || nb == 0 || ginCompareItemPointers(&a[na - 1], &b[0]) < 0) + { + memcpy(dst, a, na * sizeof(ItemPointerData)); + memcpy(&dst[na], b, nb * sizeof(ItemPointerData)); + *nmerged = na + nb; + } + else if (ginCompareItemPointers(&b[nb - 1], &a[0]) < 0) + { + memcpy(dst, b, nb * sizeof(ItemPointerData)); + memcpy(&dst[nb], a, na * sizeof(ItemPointerData)); + *nmerged = na + nb; + } + else + { + ItemPointerData *dptr = dst; + ItemPointerData *aptr = a; + ItemPointerData *bptr = b; + + while (aptr - a < na && bptr - b < nb) + { + int cmp = ginCompareItemPointers(aptr, bptr); + + if (cmp > 0) + *dptr++ = *bptr++; + else if (cmp == 0) + { + /* only keep one copy of the identical items */ + *dptr++ = *bptr++; + aptr++; + } + else + *dptr++ = *aptr++; + } + + while (aptr - a < na) + *dptr++ = *aptr++; + + while (bptr - b < nb) + *dptr++ = *bptr++; + + *nmerged = dptr - dst; + } + + return dst; +} diff --git a/src/backend/access/gin/ginscan.c b/src/backend/access/gin/ginscan.c new file mode 100644 index 0000000..55e2d49 --- /dev/null +++ b/src/backend/access/gin/ginscan.c @@ -0,0 +1,468 @@ +/*------------------------------------------------------------------------- + * + * ginscan.c + * routines to manage scans of inverted index relations + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/ginscan.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/gin_private.h" +#include "access/relscan.h" +#include "pgstat.h" +#include "utils/memutils.h" +#include "utils/rel.h" + + +IndexScanDesc +ginbeginscan(Relation rel, int nkeys, int norderbys) +{ + IndexScanDesc scan; + GinScanOpaque so; + + /* no order by operators allowed */ + Assert(norderbys == 0); + + scan = RelationGetIndexScan(rel, nkeys, norderbys); + + /* allocate private workspace */ + so = (GinScanOpaque) palloc(sizeof(GinScanOpaqueData)); + so->keys = NULL; + so->nkeys = 0; + so->tempCtx = AllocSetContextCreate(CurrentMemoryContext, + "Gin scan temporary context", + ALLOCSET_DEFAULT_SIZES); + so->keyCtx = AllocSetContextCreate(CurrentMemoryContext, + "Gin scan key context", + ALLOCSET_DEFAULT_SIZES); + initGinState(&so->ginstate, scan->indexRelation); + + scan->opaque = so; + + return scan; +} + +/* + * Create a new GinScanEntry, unless an equivalent one already exists, + * in which case just return it + */ +static GinScanEntry +ginFillScanEntry(GinScanOpaque so, OffsetNumber attnum, + StrategyNumber strategy, int32 searchMode, + Datum queryKey, GinNullCategory queryCategory, + bool isPartialMatch, Pointer extra_data) +{ + GinState *ginstate = &so->ginstate; + GinScanEntry scanEntry; + uint32 i; + + /* + * Look for an existing equivalent entry. + * + * Entries with non-null extra_data are never considered identical, since + * we can't know exactly what the opclass might be doing with that. 
+ */ + if (extra_data == NULL) + { + for (i = 0; i < so->totalentries; i++) + { + GinScanEntry prevEntry = so->entries[i]; + + if (prevEntry->extra_data == NULL && + prevEntry->isPartialMatch == isPartialMatch && + prevEntry->strategy == strategy && + prevEntry->searchMode == searchMode && + prevEntry->attnum == attnum && + ginCompareEntries(ginstate, attnum, + prevEntry->queryKey, + prevEntry->queryCategory, + queryKey, + queryCategory) == 0) + { + /* Successful match */ + return prevEntry; + } + } + } + + /* Nope, create a new entry */ + scanEntry = (GinScanEntry) palloc(sizeof(GinScanEntryData)); + scanEntry->queryKey = queryKey; + scanEntry->queryCategory = queryCategory; + scanEntry->isPartialMatch = isPartialMatch; + scanEntry->extra_data = extra_data; + scanEntry->strategy = strategy; + scanEntry->searchMode = searchMode; + scanEntry->attnum = attnum; + + scanEntry->buffer = InvalidBuffer; + ItemPointerSetMin(&scanEntry->curItem); + scanEntry->matchBitmap = NULL; + scanEntry->matchIterator = NULL; + scanEntry->matchResult = NULL; + scanEntry->list = NULL; + scanEntry->nlist = 0; + scanEntry->offset = InvalidOffsetNumber; + scanEntry->isFinished = false; + scanEntry->reduceResult = false; + + /* Add it to so's array */ + if (so->totalentries >= so->allocentries) + { + so->allocentries *= 2; + so->entries = (GinScanEntry *) + repalloc(so->entries, so->allocentries * sizeof(GinScanEntry)); + } + so->entries[so->totalentries++] = scanEntry; + + return scanEntry; +} + +/* + * Append hidden scan entry of given category to the scan key. + * + * NB: this had better be called at most once per scan key, since + * ginFillScanKey leaves room for only one hidden entry. Currently, + * it seems sufficiently clear that this is true that we don't bother + * with any cross-check logic. + */ +static void +ginScanKeyAddHiddenEntry(GinScanOpaque so, GinScanKey key, + GinNullCategory queryCategory) +{ + int i = key->nentries++; + + /* strategy is of no interest because this is not a partial-match item */ + key->scanEntry[i] = ginFillScanEntry(so, key->attnum, + InvalidStrategy, key->searchMode, + (Datum) 0, queryCategory, + false, NULL); +} + +/* + * Initialize the next GinScanKey using the output from the extractQueryFn + */ +static void +ginFillScanKey(GinScanOpaque so, OffsetNumber attnum, + StrategyNumber strategy, int32 searchMode, + Datum query, uint32 nQueryValues, + Datum *queryValues, GinNullCategory *queryCategories, + bool *partial_matches, Pointer *extra_data) +{ + GinScanKey key = &(so->keys[so->nkeys++]); + GinState *ginstate = &so->ginstate; + uint32 i; + + key->nentries = nQueryValues; + key->nuserentries = nQueryValues; + + /* Allocate one extra array slot for possible "hidden" entry */ + key->scanEntry = (GinScanEntry *) palloc(sizeof(GinScanEntry) * + (nQueryValues + 1)); + key->entryRes = (GinTernaryValue *) palloc0(sizeof(GinTernaryValue) * + (nQueryValues + 1)); + + key->query = query; + key->queryValues = queryValues; + key->queryCategories = queryCategories; + key->extra_data = extra_data; + key->strategy = strategy; + key->searchMode = searchMode; + key->attnum = attnum; + + /* + * Initially, scan keys of GIN_SEARCH_MODE_ALL mode are marked + * excludeOnly. This might get changed later. 
+ */ + key->excludeOnly = (searchMode == GIN_SEARCH_MODE_ALL); + + ItemPointerSetMin(&key->curItem); + key->curItemMatches = false; + key->recheckCurItem = false; + key->isFinished = false; + key->nrequired = 0; + key->nadditional = 0; + key->requiredEntries = NULL; + key->additionalEntries = NULL; + + ginInitConsistentFunction(ginstate, key); + + /* Set up normal scan entries using extractQueryFn's outputs */ + for (i = 0; i < nQueryValues; i++) + { + Datum queryKey; + GinNullCategory queryCategory; + bool isPartialMatch; + Pointer this_extra; + + queryKey = queryValues[i]; + queryCategory = queryCategories[i]; + isPartialMatch = + (ginstate->canPartialMatch[attnum - 1] && partial_matches) + ? partial_matches[i] : false; + this_extra = (extra_data) ? extra_data[i] : NULL; + + key->scanEntry[i] = ginFillScanEntry(so, attnum, + strategy, searchMode, + queryKey, queryCategory, + isPartialMatch, this_extra); + } + + /* + * For GIN_SEARCH_MODE_INCLUDE_EMPTY and GIN_SEARCH_MODE_EVERYTHING search + * modes, we add the "hidden" entry immediately. GIN_SEARCH_MODE_ALL is + * handled later, since we might be able to omit the hidden entry for it. + */ + if (searchMode == GIN_SEARCH_MODE_INCLUDE_EMPTY) + ginScanKeyAddHiddenEntry(so, key, GIN_CAT_EMPTY_ITEM); + else if (searchMode == GIN_SEARCH_MODE_EVERYTHING) + ginScanKeyAddHiddenEntry(so, key, GIN_CAT_EMPTY_QUERY); +} + +/* + * Release current scan keys, if any. + */ +void +ginFreeScanKeys(GinScanOpaque so) +{ + uint32 i; + + if (so->keys == NULL) + return; + + for (i = 0; i < so->totalentries; i++) + { + GinScanEntry entry = so->entries[i]; + + if (entry->buffer != InvalidBuffer) + ReleaseBuffer(entry->buffer); + if (entry->list) + pfree(entry->list); + if (entry->matchIterator) + tbm_end_iterate(entry->matchIterator); + if (entry->matchBitmap) + tbm_free(entry->matchBitmap); + } + + MemoryContextResetAndDeleteChildren(so->keyCtx); + + so->keys = NULL; + so->nkeys = 0; + so->entries = NULL; + so->totalentries = 0; +} + +void +ginNewScanKey(IndexScanDesc scan) +{ + ScanKey scankey = scan->keyData; + GinScanOpaque so = (GinScanOpaque) scan->opaque; + int i; + bool hasNullQuery = false; + bool attrHasNormalScan[INDEX_MAX_KEYS] = {false}; + MemoryContext oldCtx; + + /* + * Allocate all the scan key information in the key context. (If + * extractQuery leaks anything there, it won't be reset until the end of + * scan or rescan, but that's OK.) + */ + oldCtx = MemoryContextSwitchTo(so->keyCtx); + + /* if no scan keys provided, allocate extra EVERYTHING GinScanKey */ + so->keys = (GinScanKey) + palloc(Max(scan->numberOfKeys, 1) * sizeof(GinScanKeyData)); + so->nkeys = 0; + + /* initialize expansible array of GinScanEntry pointers */ + so->totalentries = 0; + so->allocentries = 32; + so->entries = (GinScanEntry *) + palloc(so->allocentries * sizeof(GinScanEntry)); + + so->isVoidRes = false; + + for (i = 0; i < scan->numberOfKeys; i++) + { + ScanKey skey = &scankey[i]; + Datum *queryValues; + int32 nQueryValues = 0; + bool *partial_matches = NULL; + Pointer *extra_data = NULL; + bool *nullFlags = NULL; + GinNullCategory *categories; + int32 searchMode = GIN_SEARCH_MODE_DEFAULT; + + /* + * We assume that GIN-indexable operators are strict, so a null query + * argument means an unsatisfiable query. 
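+ * (Setting isVoidRes makes the scan return no rows at all.)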
+ */ + if (skey->sk_flags & SK_ISNULL) + { + so->isVoidRes = true; + break; + } + + /* OK to call the extractQueryFn */ + queryValues = (Datum *) + DatumGetPointer(FunctionCall7Coll(&so->ginstate.extractQueryFn[skey->sk_attno - 1], + so->ginstate.supportCollation[skey->sk_attno - 1], + skey->sk_argument, + PointerGetDatum(&nQueryValues), + UInt16GetDatum(skey->sk_strategy), + PointerGetDatum(&partial_matches), + PointerGetDatum(&extra_data), + PointerGetDatum(&nullFlags), + PointerGetDatum(&searchMode))); + + /* + * If bogus searchMode is returned, treat as GIN_SEARCH_MODE_ALL; note + * in particular we don't allow extractQueryFn to select + * GIN_SEARCH_MODE_EVERYTHING. + */ + if (searchMode < GIN_SEARCH_MODE_DEFAULT || + searchMode > GIN_SEARCH_MODE_ALL) + searchMode = GIN_SEARCH_MODE_ALL; + + /* Non-default modes require the index to have placeholders */ + if (searchMode != GIN_SEARCH_MODE_DEFAULT) + hasNullQuery = true; + + /* + * In default mode, no keys means an unsatisfiable query. + */ + if (queryValues == NULL || nQueryValues <= 0) + { + if (searchMode == GIN_SEARCH_MODE_DEFAULT) + { + so->isVoidRes = true; + break; + } + nQueryValues = 0; /* ensure sane value */ + } + + /* + * Create GinNullCategory representation. If the extractQueryFn + * didn't create a nullFlags array, we assume everything is non-null. + * While at it, detect whether any null keys are present. + */ + categories = (GinNullCategory *) palloc0(nQueryValues * sizeof(GinNullCategory)); + if (nullFlags) + { + int32 j; + + for (j = 0; j < nQueryValues; j++) + { + if (nullFlags[j]) + { + categories[j] = GIN_CAT_NULL_KEY; + hasNullQuery = true; + } + } + } + + ginFillScanKey(so, skey->sk_attno, + skey->sk_strategy, searchMode, + skey->sk_argument, nQueryValues, + queryValues, categories, + partial_matches, extra_data); + + /* Remember if we had any non-excludeOnly keys */ + if (searchMode != GIN_SEARCH_MODE_ALL) + attrHasNormalScan[skey->sk_attno - 1] = true; + } + + /* + * Processing GIN_SEARCH_MODE_ALL scan keys requires us to make a second + * pass over the scan keys. Above we marked each such scan key as + * excludeOnly. If the involved column has any normal (not excludeOnly) + * scan key as well, then we can leave it like that. Otherwise, one + * excludeOnly scan key must receive a GIN_CAT_EMPTY_QUERY hidden entry + * and be set to normal (excludeOnly = false). + */ + for (i = 0; i < so->nkeys; i++) + { + GinScanKey key = &so->keys[i]; + + if (key->searchMode != GIN_SEARCH_MODE_ALL) + continue; + + if (!attrHasNormalScan[key->attnum - 1]) + { + key->excludeOnly = false; + ginScanKeyAddHiddenEntry(so, key, GIN_CAT_EMPTY_QUERY); + attrHasNormalScan[key->attnum - 1] = true; + } + } + + /* + * If there are no regular scan keys, generate an EVERYTHING scankey to + * drive a full-index scan. + */ + if (so->nkeys == 0 && !so->isVoidRes) + { + hasNullQuery = true; + ginFillScanKey(so, FirstOffsetNumber, + InvalidStrategy, GIN_SEARCH_MODE_EVERYTHING, + (Datum) 0, 0, + NULL, NULL, NULL, NULL); + } + + /* + * If the index is version 0, it may be missing null and placeholder + * entries, which would render searches for nulls and full-index scans + * unreliable. Throw an error if so. 
+ */ + if (hasNullQuery && !so->isVoidRes) + { + GinStatsData ginStats; + + ginGetStats(scan->indexRelation, &ginStats); + if (ginStats.ginVersion < 1) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("old GIN indexes do not support whole-index scans nor searches for nulls"), + errhint("To fix this, do REINDEX INDEX \"%s\".", + RelationGetRelationName(scan->indexRelation)))); + } + + MemoryContextSwitchTo(oldCtx); + + pgstat_count_index_scan(scan->indexRelation); +} + +void +ginrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, + ScanKey orderbys, int norderbys) +{ + GinScanOpaque so = (GinScanOpaque) scan->opaque; + + ginFreeScanKeys(so); + + if (scankey && scan->numberOfKeys > 0) + { + memmove(scan->keyData, scankey, + scan->numberOfKeys * sizeof(ScanKeyData)); + } +} + + +void +ginendscan(IndexScanDesc scan) +{ + GinScanOpaque so = (GinScanOpaque) scan->opaque; + + ginFreeScanKeys(so); + + MemoryContextDelete(so->tempCtx); + MemoryContextDelete(so->keyCtx); + + pfree(so); +} diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c new file mode 100644 index 0000000..cdd626f --- /dev/null +++ b/src/backend/access/gin/ginutil.c @@ -0,0 +1,707 @@ +/*------------------------------------------------------------------------- + * + * ginutil.c + * Utility routines for the Postgres inverted index access method. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/ginutil.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/gin_private.h" +#include "access/ginxlog.h" +#include "access/reloptions.h" +#include "access/xloginsert.h" +#include "catalog/pg_collation.h" +#include "catalog/pg_type.h" +#include "commands/vacuum.h" +#include "miscadmin.h" +#include "storage/indexfsm.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "utils/builtins.h" +#include "utils/index_selfuncs.h" +#include "utils/typcache.h" + + +/* + * GIN handler function: return IndexAmRoutine with access method parameters + * and callbacks. 
+ */ +Datum +ginhandler(PG_FUNCTION_ARGS) +{ + IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); + + amroutine->amstrategies = 0; + amroutine->amsupport = GINNProcs; + amroutine->amoptsprocnum = GIN_OPTIONS_PROC; + amroutine->amcanorder = false; + amroutine->amcanorderbyop = false; + amroutine->amcanbackward = false; + amroutine->amcanunique = false; + amroutine->amcanmulticol = true; + amroutine->amoptionalkey = true; + amroutine->amsearcharray = false; + amroutine->amsearchnulls = false; + amroutine->amstorage = true; + amroutine->amclusterable = false; + amroutine->ampredlocks = true; + amroutine->amcanparallel = false; + amroutine->amcaninclude = false; + amroutine->amusemaintenanceworkmem = true; + amroutine->amparallelvacuumoptions = + VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_CLEANUP; + amroutine->amkeytype = InvalidOid; + + amroutine->ambuild = ginbuild; + amroutine->ambuildempty = ginbuildempty; + amroutine->aminsert = gininsert; + amroutine->ambulkdelete = ginbulkdelete; + amroutine->amvacuumcleanup = ginvacuumcleanup; + amroutine->amcanreturn = NULL; + amroutine->amcostestimate = gincostestimate; + amroutine->amoptions = ginoptions; + amroutine->amproperty = NULL; + amroutine->ambuildphasename = NULL; + amroutine->amvalidate = ginvalidate; + amroutine->amadjustmembers = ginadjustmembers; + amroutine->ambeginscan = ginbeginscan; + amroutine->amrescan = ginrescan; + amroutine->amgettuple = NULL; + amroutine->amgetbitmap = gingetbitmap; + amroutine->amendscan = ginendscan; + amroutine->ammarkpos = NULL; + amroutine->amrestrpos = NULL; + amroutine->amestimateparallelscan = NULL; + amroutine->aminitparallelscan = NULL; + amroutine->amparallelrescan = NULL; + + PG_RETURN_POINTER(amroutine); +} + +/* + * initGinState: fill in an empty GinState struct to describe the index + * + * Note: assorted subsidiary data is allocated in the CurrentMemoryContext. + */ +void +initGinState(GinState *state, Relation index) +{ + TupleDesc origTupdesc = RelationGetDescr(index); + int i; + + MemSet(state, 0, sizeof(GinState)); + + state->index = index; + state->oneCol = (origTupdesc->natts == 1) ? true : false; + state->origTupdesc = origTupdesc; + + for (i = 0; i < origTupdesc->natts; i++) + { + Form_pg_attribute attr = TupleDescAttr(origTupdesc, i); + + if (state->oneCol) + state->tupdesc[i] = state->origTupdesc; + else + { + state->tupdesc[i] = CreateTemplateTupleDesc(2); + + TupleDescInitEntry(state->tupdesc[i], (AttrNumber) 1, NULL, + INT2OID, -1, 0); + TupleDescInitEntry(state->tupdesc[i], (AttrNumber) 2, NULL, + attr->atttypid, + attr->atttypmod, + attr->attndims); + TupleDescInitEntryCollation(state->tupdesc[i], (AttrNumber) 2, + attr->attcollation); + } + + /* + * If the compare proc isn't specified in the opclass definition, look + * up the index key type's default btree comparator. 
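+ * (If the type has no default btree comparison function either, an error is reported below.)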
+ */ + if (index_getprocid(index, i + 1, GIN_COMPARE_PROC) != InvalidOid) + { + fmgr_info_copy(&(state->compareFn[i]), + index_getprocinfo(index, i + 1, GIN_COMPARE_PROC), + CurrentMemoryContext); + } + else + { + TypeCacheEntry *typentry; + + typentry = lookup_type_cache(attr->atttypid, + TYPECACHE_CMP_PROC_FINFO); + if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_FUNCTION), + errmsg("could not identify a comparison function for type %s", + format_type_be(attr->atttypid)))); + fmgr_info_copy(&(state->compareFn[i]), + &(typentry->cmp_proc_finfo), + CurrentMemoryContext); + } + + /* Opclass must always provide extract procs */ + fmgr_info_copy(&(state->extractValueFn[i]), + index_getprocinfo(index, i + 1, GIN_EXTRACTVALUE_PROC), + CurrentMemoryContext); + fmgr_info_copy(&(state->extractQueryFn[i]), + index_getprocinfo(index, i + 1, GIN_EXTRACTQUERY_PROC), + CurrentMemoryContext); + + /* + * Check opclass capability to do tri-state or binary logic consistent + * check. + */ + if (index_getprocid(index, i + 1, GIN_TRICONSISTENT_PROC) != InvalidOid) + { + fmgr_info_copy(&(state->triConsistentFn[i]), + index_getprocinfo(index, i + 1, GIN_TRICONSISTENT_PROC), + CurrentMemoryContext); + } + + if (index_getprocid(index, i + 1, GIN_CONSISTENT_PROC) != InvalidOid) + { + fmgr_info_copy(&(state->consistentFn[i]), + index_getprocinfo(index, i + 1, GIN_CONSISTENT_PROC), + CurrentMemoryContext); + } + + if (state->consistentFn[i].fn_oid == InvalidOid && + state->triConsistentFn[i].fn_oid == InvalidOid) + { + elog(ERROR, "missing GIN support function (%d or %d) for attribute %d of index \"%s\"", + GIN_CONSISTENT_PROC, GIN_TRICONSISTENT_PROC, + i + 1, RelationGetRelationName(index)); + } + + /* + * Check opclass capability to do partial match. + */ + if (index_getprocid(index, i + 1, GIN_COMPARE_PARTIAL_PROC) != InvalidOid) + { + fmgr_info_copy(&(state->comparePartialFn[i]), + index_getprocinfo(index, i + 1, GIN_COMPARE_PARTIAL_PROC), + CurrentMemoryContext); + state->canPartialMatch[i] = true; + } + else + { + state->canPartialMatch[i] = false; + } + + /* + * If the index column has a specified collation, we should honor that + * while doing comparisons. However, we may have a collatable storage + * type for a noncollatable indexed data type (for instance, hstore + * uses text index entries). If there's no index collation then + * specify default collation in case the support functions need + * collation. This is harmless if the support functions don't care + * about collation, so we just do it unconditionally. (We could + * alternatively call get_typcollation, but that seems like expensive + * overkill --- there aren't going to be any cases where a GIN storage + * type has a nondefault collation.) 
+ */ + if (OidIsValid(index->rd_indcollation[i])) + state->supportCollation[i] = index->rd_indcollation[i]; + else + state->supportCollation[i] = DEFAULT_COLLATION_OID; + } +} + +/* + * Extract attribute (column) number of stored entry from GIN tuple + */ +OffsetNumber +gintuple_get_attrnum(GinState *ginstate, IndexTuple tuple) +{ + OffsetNumber colN; + + if (ginstate->oneCol) + { + /* column number is not stored explicitly */ + colN = FirstOffsetNumber; + } + else + { + Datum res; + bool isnull; + + /* + * First attribute is always int16, so we can safely use any tuple + * descriptor to obtain first attribute of tuple + */ + res = index_getattr(tuple, FirstOffsetNumber, ginstate->tupdesc[0], + &isnull); + Assert(!isnull); + + colN = DatumGetUInt16(res); + Assert(colN >= FirstOffsetNumber && colN <= ginstate->origTupdesc->natts); + } + + return colN; +} + +/* + * Extract stored datum (and possible null category) from GIN tuple + */ +Datum +gintuple_get_key(GinState *ginstate, IndexTuple tuple, + GinNullCategory *category) +{ + Datum res; + bool isnull; + + if (ginstate->oneCol) + { + /* + * Single column index doesn't store attribute numbers in tuples + */ + res = index_getattr(tuple, FirstOffsetNumber, ginstate->origTupdesc, + &isnull); + } + else + { + /* + * Since the datum type depends on which index column it's from, we + * must be careful to use the right tuple descriptor here. + */ + OffsetNumber colN = gintuple_get_attrnum(ginstate, tuple); + + res = index_getattr(tuple, OffsetNumberNext(FirstOffsetNumber), + ginstate->tupdesc[colN - 1], + &isnull); + } + + if (isnull) + *category = GinGetNullCategory(tuple, ginstate); + else + *category = GIN_CAT_NORM_KEY; + + return res; +} + +/* + * Allocate a new page (either by recycling, or by extending the index file) + * The returned buffer is already pinned and exclusive-locked + * Caller is responsible for initializing the page by calling GinInitBuffer + */ +Buffer +GinNewBuffer(Relation index) +{ + Buffer buffer; + bool needLock; + + /* First, try to get a page from FSM */ + for (;;) + { + BlockNumber blkno = GetFreeIndexPage(index); + + if (blkno == InvalidBlockNumber) + break; + + buffer = ReadBuffer(index, blkno); + + /* + * We have to guard against the possibility that someone else already + * recycled this page; the buffer may be locked if so. 
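+ * Hence we take the lock conditionally and simply move on to the next FSM page rather than wait.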
+ */ + if (ConditionalLockBuffer(buffer)) + { + if (GinPageIsRecyclable(BufferGetPage(buffer))) + return buffer; /* OK to use */ + + LockBuffer(buffer, GIN_UNLOCK); + } + + /* Can't use it, so release buffer and try again */ + ReleaseBuffer(buffer); + } + + /* Must extend the file */ + needLock = !RELATION_IS_LOCAL(index); + if (needLock) + LockRelationForExtension(index, ExclusiveLock); + + buffer = ReadBuffer(index, P_NEW); + LockBuffer(buffer, GIN_EXCLUSIVE); + + if (needLock) + UnlockRelationForExtension(index, ExclusiveLock); + + return buffer; +} + +void +GinInitPage(Page page, uint32 f, Size pageSize) +{ + GinPageOpaque opaque; + + PageInit(page, pageSize, sizeof(GinPageOpaqueData)); + + opaque = GinPageGetOpaque(page); + opaque->flags = f; + opaque->rightlink = InvalidBlockNumber; +} + +void +GinInitBuffer(Buffer b, uint32 f) +{ + GinInitPage(BufferGetPage(b), f, BufferGetPageSize(b)); +} + +void +GinInitMetabuffer(Buffer b) +{ + GinMetaPageData *metadata; + Page page = BufferGetPage(b); + + GinInitPage(page, GIN_META, BufferGetPageSize(b)); + + metadata = GinPageGetMeta(page); + + metadata->head = metadata->tail = InvalidBlockNumber; + metadata->tailFreeSize = 0; + metadata->nPendingPages = 0; + metadata->nPendingHeapTuples = 0; + metadata->nTotalPages = 0; + metadata->nEntryPages = 0; + metadata->nDataPages = 0; + metadata->nEntries = 0; + metadata->ginVersion = GIN_CURRENT_VERSION; + + /* + * Set pd_lower just past the end of the metadata. This is essential, + * because without doing so, metadata will be lost if xlog.c compresses + * the page. + */ + ((PageHeader) page)->pd_lower = + ((char *) metadata + sizeof(GinMetaPageData)) - (char *) page; +} + +/* + * Compare two keys of the same index column + */ +int +ginCompareEntries(GinState *ginstate, OffsetNumber attnum, + Datum a, GinNullCategory categorya, + Datum b, GinNullCategory categoryb) +{ + /* if not of same null category, sort by that first */ + if (categorya != categoryb) + return (categorya < categoryb) ? -1 : 1; + + /* all null items in same category are equal */ + if (categorya != GIN_CAT_NORM_KEY) + return 0; + + /* both not null, so safe to call the compareFn */ + return DatumGetInt32(FunctionCall2Coll(&ginstate->compareFn[attnum - 1], + ginstate->supportCollation[attnum - 1], + a, b)); +} + +/* + * Compare two keys of possibly different index columns + */ +int +ginCompareAttEntries(GinState *ginstate, + OffsetNumber attnuma, Datum a, GinNullCategory categorya, + OffsetNumber attnumb, Datum b, GinNullCategory categoryb) +{ + /* attribute number is the first sort key */ + if (attnuma != attnumb) + return (attnuma < attnumb) ? -1 : 1; + + return ginCompareEntries(ginstate, attnuma, a, categorya, b, categoryb); +} + + +/* + * Support for sorting key datums in ginExtractEntries + * + * Note: we only have to worry about null and not-null keys here; + * ginExtractEntries never generates more than one placeholder null, + * so it doesn't have to sort those. 
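+ * (cmpEntries sorts null keys after all non-null keys, matching the category ordering used by ginCompareEntries.)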
+ */ +typedef struct +{ + Datum datum; + bool isnull; +} keyEntryData; + +typedef struct +{ + FmgrInfo *cmpDatumFunc; + Oid collation; + bool haveDups; +} cmpEntriesArg; + +static int +cmpEntries(const void *a, const void *b, void *arg) +{ + const keyEntryData *aa = (const keyEntryData *) a; + const keyEntryData *bb = (const keyEntryData *) b; + cmpEntriesArg *data = (cmpEntriesArg *) arg; + int res; + + if (aa->isnull) + { + if (bb->isnull) + res = 0; /* NULL "=" NULL */ + else + res = 1; /* NULL ">" not-NULL */ + } + else if (bb->isnull) + res = -1; /* not-NULL "<" NULL */ + else + res = DatumGetInt32(FunctionCall2Coll(data->cmpDatumFunc, + data->collation, + aa->datum, bb->datum)); + + /* + * Detect if we have any duplicates. If there are equal keys, qsort must + * compare them at some point, else it wouldn't know whether one should go + * before or after the other. + */ + if (res == 0) + data->haveDups = true; + + return res; +} + + +/* + * Extract the index key values from an indexable item + * + * The resulting key values are sorted, and any duplicates are removed. + * This avoids generating redundant index entries. + */ +Datum * +ginExtractEntries(GinState *ginstate, OffsetNumber attnum, + Datum value, bool isNull, + int32 *nentries, GinNullCategory **categories) +{ + Datum *entries; + bool *nullFlags; + int32 i; + + /* + * We don't call the extractValueFn on a null item. Instead generate a + * placeholder. + */ + if (isNull) + { + *nentries = 1; + entries = (Datum *) palloc(sizeof(Datum)); + entries[0] = (Datum) 0; + *categories = (GinNullCategory *) palloc(sizeof(GinNullCategory)); + (*categories)[0] = GIN_CAT_NULL_ITEM; + return entries; + } + + /* OK, call the opclass's extractValueFn */ + nullFlags = NULL; /* in case extractValue doesn't set it */ + entries = (Datum *) + DatumGetPointer(FunctionCall3Coll(&ginstate->extractValueFn[attnum - 1], + ginstate->supportCollation[attnum - 1], + value, + PointerGetDatum(nentries), + PointerGetDatum(&nullFlags))); + + /* + * Generate a placeholder if the item contained no keys. + */ + if (entries == NULL || *nentries <= 0) + { + *nentries = 1; + entries = (Datum *) palloc(sizeof(Datum)); + entries[0] = (Datum) 0; + *categories = (GinNullCategory *) palloc(sizeof(GinNullCategory)); + (*categories)[0] = GIN_CAT_EMPTY_ITEM; + return entries; + } + + /* + * If the extractValueFn didn't create a nullFlags array, create one, + * assuming that everything's non-null. + */ + if (nullFlags == NULL) + nullFlags = (bool *) palloc0(*nentries * sizeof(bool)); + + /* + * If there's more than one key, sort and unique-ify. + * + * XXX Using qsort here is notationally painful, and the overhead is + * pretty bad too. For small numbers of keys it'd likely be better to use + * a simple insertion sort. 
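+ * (cmpEntries also records whether any duplicate keys were seen, so the de-duplication pass below is skipped when the keys are already unique.)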
+ */ + if (*nentries > 1) + { + keyEntryData *keydata; + cmpEntriesArg arg; + + keydata = (keyEntryData *) palloc(*nentries * sizeof(keyEntryData)); + for (i = 0; i < *nentries; i++) + { + keydata[i].datum = entries[i]; + keydata[i].isnull = nullFlags[i]; + } + + arg.cmpDatumFunc = &ginstate->compareFn[attnum - 1]; + arg.collation = ginstate->supportCollation[attnum - 1]; + arg.haveDups = false; + qsort_arg(keydata, *nentries, sizeof(keyEntryData), + cmpEntries, (void *) &arg); + + if (arg.haveDups) + { + /* there are duplicates, must get rid of 'em */ + int32 j; + + entries[0] = keydata[0].datum; + nullFlags[0] = keydata[0].isnull; + j = 1; + for (i = 1; i < *nentries; i++) + { + if (cmpEntries(&keydata[i - 1], &keydata[i], &arg) != 0) + { + entries[j] = keydata[i].datum; + nullFlags[j] = keydata[i].isnull; + j++; + } + } + *nentries = j; + } + else + { + /* easy, no duplicates */ + for (i = 0; i < *nentries; i++) + { + entries[i] = keydata[i].datum; + nullFlags[i] = keydata[i].isnull; + } + } + + pfree(keydata); + } + + /* + * Create GinNullCategory representation from nullFlags. + */ + *categories = (GinNullCategory *) palloc0(*nentries * sizeof(GinNullCategory)); + for (i = 0; i < *nentries; i++) + (*categories)[i] = (nullFlags[i] ? GIN_CAT_NULL_KEY : GIN_CAT_NORM_KEY); + + return entries; +} + +bytea * +ginoptions(Datum reloptions, bool validate) +{ + static const relopt_parse_elt tab[] = { + {"fastupdate", RELOPT_TYPE_BOOL, offsetof(GinOptions, useFastUpdate)}, + {"gin_pending_list_limit", RELOPT_TYPE_INT, offsetof(GinOptions, + pendingListCleanupSize)} + }; + + return (bytea *) build_reloptions(reloptions, validate, + RELOPT_KIND_GIN, + sizeof(GinOptions), + tab, lengthof(tab)); +} + +/* + * Fetch index's statistical data into *stats + * + * Note: in the result, nPendingPages can be trusted to be up-to-date, + * as can ginVersion; but the other fields are as of the last VACUUM. + */ +void +ginGetStats(Relation index, GinStatsData *stats) +{ + Buffer metabuffer; + Page metapage; + GinMetaPageData *metadata; + + metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO); + LockBuffer(metabuffer, GIN_SHARE); + metapage = BufferGetPage(metabuffer); + metadata = GinPageGetMeta(metapage); + + stats->nPendingPages = metadata->nPendingPages; + stats->nTotalPages = metadata->nTotalPages; + stats->nEntryPages = metadata->nEntryPages; + stats->nDataPages = metadata->nDataPages; + stats->nEntries = metadata->nEntries; + stats->ginVersion = metadata->ginVersion; + + UnlockReleaseBuffer(metabuffer); +} + +/* + * Write the given statistics to the index's metapage + * + * Note: nPendingPages and ginVersion are *not* copied over + */ +void +ginUpdateStats(Relation index, const GinStatsData *stats, bool is_build) +{ + Buffer metabuffer; + Page metapage; + GinMetaPageData *metadata; + + metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO); + LockBuffer(metabuffer, GIN_EXCLUSIVE); + metapage = BufferGetPage(metabuffer); + metadata = GinPageGetMeta(metapage); + + START_CRIT_SECTION(); + + metadata->nTotalPages = stats->nTotalPages; + metadata->nEntryPages = stats->nEntryPages; + metadata->nDataPages = stats->nDataPages; + metadata->nEntries = stats->nEntries; + + /* + * Set pd_lower just past the end of the metadata. This is essential, + * because without doing so, metadata will be lost if xlog.c compresses + * the page. (We must do this here because pre-v11 versions of PG did not + * set the metapage's pd_lower correctly, so a pg_upgraded index might + * contain the wrong value.) 
+ */ + ((PageHeader) metapage)->pd_lower = + ((char *) metadata + sizeof(GinMetaPageData)) - (char *) metapage; + + MarkBufferDirty(metabuffer); + + if (RelationNeedsWAL(index) && !is_build) + { + XLogRecPtr recptr; + ginxlogUpdateMeta data; + + data.node = index->rd_node; + data.ntuples = 0; + data.newRightlink = data.prevTail = InvalidBlockNumber; + memcpy(&data.metadata, metadata, sizeof(GinMetaPageData)); + + XLogBeginInsert(); + XLogRegisterData((char *) &data, sizeof(ginxlogUpdateMeta)); + XLogRegisterBuffer(0, metabuffer, REGBUF_WILL_INIT | REGBUF_STANDARD); + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_UPDATE_META_PAGE); + PageSetLSN(metapage, recptr); + } + + UnlockReleaseBuffer(metabuffer); + + END_CRIT_SECTION(); +} diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c new file mode 100644 index 0000000..a276eb0 --- /dev/null +++ b/src/backend/access/gin/ginvacuum.c @@ -0,0 +1,822 @@ +/*------------------------------------------------------------------------- + * + * ginvacuum.c + * delete & vacuum routines for the postgres GIN + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/ginvacuum.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/gin_private.h" +#include "access/ginxlog.h" +#include "access/xloginsert.h" +#include "commands/vacuum.h" +#include "miscadmin.h" +#include "postmaster/autovacuum.h" +#include "storage/indexfsm.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "utils/memutils.h" + +struct GinVacuumState +{ + Relation index; + IndexBulkDeleteResult *result; + IndexBulkDeleteCallback callback; + void *callback_state; + GinState ginstate; + BufferAccessStrategy strategy; + MemoryContext tmpCxt; +}; + +/* + * Vacuums an uncompressed posting list. The size of the must can be specified + * in number of items (nitems). + * + * If none of the items need to be removed, returns NULL. Otherwise returns + * a new palloc'd array with the remaining items. The number of remaining + * items is returned in *nremaining. + */ +ItemPointer +ginVacuumItemPointers(GinVacuumState *gvs, ItemPointerData *items, + int nitem, int *nremaining) +{ + int i, + remaining = 0; + ItemPointer tmpitems = NULL; + + /* + * Iterate over TIDs array + */ + for (i = 0; i < nitem; i++) + { + if (gvs->callback(items + i, gvs->callback_state)) + { + gvs->result->tuples_removed += 1; + if (!tmpitems) + { + /* + * First TID to be deleted: allocate memory to hold the + * remaining items. + */ + tmpitems = palloc(sizeof(ItemPointerData) * nitem); + memcpy(tmpitems, items, sizeof(ItemPointerData) * i); + } + } + else + { + gvs->result->num_index_tuples += 1; + if (tmpitems) + tmpitems[remaining] = items[i]; + remaining++; + } + } + + *nremaining = remaining; + return tmpitems; +} + +/* + * Create a WAL record for vacuuming entry tree leaf page. + */ +static void +xlogVacuumPage(Relation index, Buffer buffer) +{ + Page page = BufferGetPage(buffer); + XLogRecPtr recptr; + + /* This is only used for entry tree leaf pages. */ + Assert(!GinPageIsData(page)); + Assert(GinPageIsLeaf(page)); + + if (!RelationNeedsWAL(index)) + return; + + /* + * Always create a full image, we don't track the changes on the page at + * any more fine-grained level. This could obviously be improved... 
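+ * (REGBUF_FORCE_IMAGE below requests a full-page image regardless of whether one would otherwise be required.)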
+ */ + XLogBeginInsert(); + XLogRegisterBuffer(0, buffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_PAGE); + PageSetLSN(page, recptr); +} + + +typedef struct DataPageDeleteStack +{ + struct DataPageDeleteStack *child; + struct DataPageDeleteStack *parent; + + BlockNumber blkno; /* current block number */ + Buffer leftBuffer; /* pinned and locked rightest non-deleted page + * on left */ + bool isRoot; +} DataPageDeleteStack; + + +/* + * Delete a posting tree page. + */ +static void +ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkno, + BlockNumber parentBlkno, OffsetNumber myoff, bool isParentRoot) +{ + Buffer dBuffer; + Buffer lBuffer; + Buffer pBuffer; + Page page, + parentPage; + BlockNumber rightlink; + + /* + * This function MUST be called only if someone of parent pages hold + * exclusive cleanup lock. This guarantees that no insertions currently + * happen in this subtree. Caller also acquires Exclusive locks on + * deletable, parent and left pages. + */ + lBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, leftBlkno, + RBM_NORMAL, gvs->strategy); + dBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, deleteBlkno, + RBM_NORMAL, gvs->strategy); + pBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, parentBlkno, + RBM_NORMAL, gvs->strategy); + + page = BufferGetPage(dBuffer); + rightlink = GinPageGetOpaque(page)->rightlink; + + /* + * Any insert which would have gone on the leaf block will now go to its + * right sibling. + */ + PredicateLockPageCombine(gvs->index, deleteBlkno, rightlink); + + START_CRIT_SECTION(); + + /* Unlink the page by changing left sibling's rightlink */ + page = BufferGetPage(lBuffer); + GinPageGetOpaque(page)->rightlink = rightlink; + + /* Delete downlink from parent */ + parentPage = BufferGetPage(pBuffer); +#ifdef USE_ASSERT_CHECKING + do + { + PostingItem *tod = GinDataPageGetPostingItem(parentPage, myoff); + + Assert(PostingItemGetBlockNumber(tod) == deleteBlkno); + } while (0); +#endif + GinPageDeletePostingItem(parentPage, myoff); + + page = BufferGetPage(dBuffer); + + /* + * we shouldn't change rightlink field to save workability of running + * search scan + */ + + /* + * Mark page as deleted, and remember last xid which could know its + * address. + */ + GinPageSetDeleted(page); + GinPageSetDeleteXid(page, ReadNextTransactionId()); + + MarkBufferDirty(pBuffer); + MarkBufferDirty(lBuffer); + MarkBufferDirty(dBuffer); + + if (RelationNeedsWAL(gvs->index)) + { + XLogRecPtr recptr; + ginxlogDeletePage data; + + /* + * We can't pass REGBUF_STANDARD for the deleted page, because we + * didn't set pd_lower on pre-9.4 versions. The page might've been + * binary-upgraded from an older version, and hence not have pd_lower + * set correctly. Ditto for the left page, but removing the item from + * the parent updated its pd_lower, so we know that's OK at this + * point. 
+ */ + XLogBeginInsert(); + XLogRegisterBuffer(0, dBuffer, 0); + XLogRegisterBuffer(1, pBuffer, REGBUF_STANDARD); + XLogRegisterBuffer(2, lBuffer, 0); + + data.parentOffset = myoff; + data.rightLink = GinPageGetOpaque(page)->rightlink; + data.deleteXid = GinPageGetDeleteXid(page); + + XLogRegisterData((char *) &data, sizeof(ginxlogDeletePage)); + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_PAGE); + PageSetLSN(page, recptr); + PageSetLSN(parentPage, recptr); + PageSetLSN(BufferGetPage(lBuffer), recptr); + } + + ReleaseBuffer(pBuffer); + ReleaseBuffer(lBuffer); + ReleaseBuffer(dBuffer); + + END_CRIT_SECTION(); + + gvs->result->pages_newly_deleted++; + gvs->result->pages_deleted++; +} + + +/* + * Scans posting tree and deletes empty pages. Caller must lock root page for + * cleanup. During scan path from root to current page is kept exclusively + * locked. Also keep left page exclusively locked, because ginDeletePage() + * needs it. If we try to relock left page later, it could deadlock with + * ginStepRight(). + */ +static bool +ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, + DataPageDeleteStack *parent, OffsetNumber myoff) +{ + DataPageDeleteStack *me; + Buffer buffer; + Page page; + bool meDelete = false; + bool isempty; + + if (isRoot) + { + me = parent; + } + else + { + if (!parent->child) + { + me = (DataPageDeleteStack *) palloc0(sizeof(DataPageDeleteStack)); + me->parent = parent; + parent->child = me; + me->leftBuffer = InvalidBuffer; + } + else + me = parent->child; + } + + buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno, + RBM_NORMAL, gvs->strategy); + + if (!isRoot) + LockBuffer(buffer, GIN_EXCLUSIVE); + + page = BufferGetPage(buffer); + + Assert(GinPageIsData(page)); + + if (!GinPageIsLeaf(page)) + { + OffsetNumber i; + + me->blkno = blkno; + for (i = FirstOffsetNumber; i <= GinPageGetOpaque(page)->maxoff; i++) + { + PostingItem *pitem = GinDataPageGetPostingItem(page, i); + + if (ginScanToDelete(gvs, PostingItemGetBlockNumber(pitem), false, me, i)) + i--; + } + + if (GinPageRightMost(page) && BufferIsValid(me->child->leftBuffer)) + { + UnlockReleaseBuffer(me->child->leftBuffer); + me->child->leftBuffer = InvalidBuffer; + } + } + + if (GinPageIsLeaf(page)) + isempty = GinDataLeafPageIsEmpty(page); + else + isempty = GinPageGetOpaque(page)->maxoff < FirstOffsetNumber; + + if (isempty) + { + /* we never delete the left- or rightmost branch */ + if (BufferIsValid(me->leftBuffer) && !GinPageRightMost(page)) + { + Assert(!isRoot); + ginDeletePage(gvs, blkno, BufferGetBlockNumber(me->leftBuffer), + me->parent->blkno, myoff, me->parent->isRoot); + meDelete = true; + } + } + + if (!meDelete) + { + if (BufferIsValid(me->leftBuffer)) + UnlockReleaseBuffer(me->leftBuffer); + me->leftBuffer = buffer; + } + else + { + if (!isRoot) + LockBuffer(buffer, GIN_UNLOCK); + + ReleaseBuffer(buffer); + } + + if (isRoot) + ReleaseBuffer(buffer); + + return meDelete; +} + + +/* + * Scan through posting tree leafs, delete empty tuples. Returns true if there + * is at least one empty page. 
+ */ +static bool +ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno) +{ + Buffer buffer; + Page page; + bool hasVoidPage = false; + MemoryContext oldCxt; + + /* Find leftmost leaf page of posting tree and lock it in exclusive mode */ + while (true) + { + PostingItem *pitem; + + buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno, + RBM_NORMAL, gvs->strategy); + LockBuffer(buffer, GIN_SHARE); + page = BufferGetPage(buffer); + + Assert(GinPageIsData(page)); + + if (GinPageIsLeaf(page)) + { + LockBuffer(buffer, GIN_UNLOCK); + LockBuffer(buffer, GIN_EXCLUSIVE); + break; + } + + Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber); + + pitem = GinDataPageGetPostingItem(page, FirstOffsetNumber); + blkno = PostingItemGetBlockNumber(pitem); + Assert(blkno != InvalidBlockNumber); + + UnlockReleaseBuffer(buffer); + } + + /* Iterate all posting tree leaves using rightlinks and vacuum them */ + while (true) + { + oldCxt = MemoryContextSwitchTo(gvs->tmpCxt); + ginVacuumPostingTreeLeaf(gvs->index, buffer, gvs); + MemoryContextSwitchTo(oldCxt); + MemoryContextReset(gvs->tmpCxt); + + if (GinDataLeafPageIsEmpty(page)) + hasVoidPage = true; + + blkno = GinPageGetOpaque(page)->rightlink; + + UnlockReleaseBuffer(buffer); + + if (blkno == InvalidBlockNumber) + break; + + buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno, + RBM_NORMAL, gvs->strategy); + LockBuffer(buffer, GIN_EXCLUSIVE); + page = BufferGetPage(buffer); + } + + return hasVoidPage; +} + +static void +ginVacuumPostingTree(GinVacuumState *gvs, BlockNumber rootBlkno) +{ + if (ginVacuumPostingTreeLeaves(gvs, rootBlkno)) + { + /* + * There is at least one empty page. So we have to rescan the tree + * deleting empty pages. + */ + Buffer buffer; + DataPageDeleteStack root, + *ptr, + *tmp; + + buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, rootBlkno, + RBM_NORMAL, gvs->strategy); + + /* + * Lock posting tree root for cleanup to ensure there are no + * concurrent inserts. + */ + LockBufferForCleanup(buffer); + + memset(&root, 0, sizeof(DataPageDeleteStack)); + root.leftBuffer = InvalidBuffer; + root.isRoot = true; + + ginScanToDelete(gvs, rootBlkno, true, &root, InvalidOffsetNumber); + + ptr = root.child; + + while (ptr) + { + tmp = ptr->child; + pfree(ptr); + ptr = tmp; + } + + UnlockReleaseBuffer(buffer); + } +} + +/* + * returns modified page or NULL if page isn't modified. + * Function works with original page until first change is occurred, + * then page is copied into temporary one. + */ +static Page +ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint32 *nroot) +{ + Page origpage = BufferGetPage(buffer), + tmppage; + OffsetNumber i, + maxoff = PageGetMaxOffsetNumber(origpage); + + tmppage = origpage; + + *nroot = 0; + + for (i = FirstOffsetNumber; i <= maxoff; i++) + { + IndexTuple itup = (IndexTuple) PageGetItem(tmppage, PageGetItemId(tmppage, i)); + + if (GinIsPostingTree(itup)) + { + /* + * store posting tree's roots for further processing, we can't + * vacuum it just now due to risk of deadlocks with scans/inserts + */ + roots[*nroot] = GinGetDownlink(itup); + (*nroot)++; + } + else if (GinGetNPosting(itup) > 0) + { + int nitems; + ItemPointer items_orig; + bool free_items_orig; + ItemPointer items; + + /* Get list of item pointers from the tuple. 
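+ * A compressed posting list is decoded into a plain array first; an uncompressed (pre-9.4 format) list can be used in place.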
*/ + if (GinItupIsCompressed(itup)) + { + items_orig = ginPostingListDecode((GinPostingList *) GinGetPosting(itup), &nitems); + free_items_orig = true; + } + else + { + items_orig = (ItemPointer) GinGetPosting(itup); + nitems = GinGetNPosting(itup); + free_items_orig = false; + } + + /* Remove any items from the list that need to be vacuumed. */ + items = ginVacuumItemPointers(gvs, items_orig, nitems, &nitems); + + if (free_items_orig) + pfree(items_orig); + + /* If any item pointers were removed, recreate the tuple. */ + if (items) + { + OffsetNumber attnum; + Datum key; + GinNullCategory category; + GinPostingList *plist; + int plistsize; + + if (nitems > 0) + { + plist = ginCompressPostingList(items, nitems, GinMaxItemSize, NULL); + plistsize = SizeOfGinPostingList(plist); + } + else + { + plist = NULL; + plistsize = 0; + } + + /* + * if we already created a temporary page, make changes in + * place + */ + if (tmppage == origpage) + { + /* + * On first difference, create a temporary copy of the + * page and copy the tuple's posting list to it. + */ + tmppage = PageGetTempPageCopy(origpage); + + /* set itup pointer to new page */ + itup = (IndexTuple) PageGetItem(tmppage, PageGetItemId(tmppage, i)); + } + + attnum = gintuple_get_attrnum(&gvs->ginstate, itup); + key = gintuple_get_key(&gvs->ginstate, itup, &category); + itup = GinFormTuple(&gvs->ginstate, attnum, key, category, + (char *) plist, plistsize, + nitems, true); + if (plist) + pfree(plist); + PageIndexTupleDelete(tmppage, i); + + if (PageAddItem(tmppage, (Item) itup, IndexTupleSize(itup), i, false, false) != i) + elog(ERROR, "failed to add item to index page in \"%s\"", + RelationGetRelationName(gvs->index)); + + pfree(itup); + pfree(items); + } + } + } + + return (tmppage == origpage) ? NULL : tmppage; +} + +IndexBulkDeleteResult * +ginbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, void *callback_state) +{ + Relation index = info->index; + BlockNumber blkno = GIN_ROOT_BLKNO; + GinVacuumState gvs; + Buffer buffer; + BlockNumber rootOfPostingTree[BLCKSZ / (sizeof(IndexTupleData) + sizeof(ItemId))]; + uint32 nRoot; + + gvs.tmpCxt = AllocSetContextCreate(CurrentMemoryContext, + "Gin vacuum temporary context", + ALLOCSET_DEFAULT_SIZES); + gvs.index = index; + gvs.callback = callback; + gvs.callback_state = callback_state; + gvs.strategy = info->strategy; + initGinState(&gvs.ginstate, index); + + /* first time through? 
*/ + if (stats == NULL) + { + /* Yes, so initialize stats to zeroes */ + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + + /* + * and cleanup any pending inserts + */ + ginInsertCleanup(&gvs.ginstate, !IsAutoVacuumWorkerProcess(), + false, true, stats); + } + + /* we'll re-count the tuples each time */ + stats->num_index_tuples = 0; + gvs.result = stats; + + buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, + RBM_NORMAL, info->strategy); + + /* find leaf page */ + for (;;) + { + Page page = BufferGetPage(buffer); + IndexTuple itup; + + LockBuffer(buffer, GIN_SHARE); + + Assert(!GinPageIsData(page)); + + if (GinPageIsLeaf(page)) + { + LockBuffer(buffer, GIN_UNLOCK); + LockBuffer(buffer, GIN_EXCLUSIVE); + + if (blkno == GIN_ROOT_BLKNO && !GinPageIsLeaf(page)) + { + LockBuffer(buffer, GIN_UNLOCK); + continue; /* check it one more */ + } + break; + } + + Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber); + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, FirstOffsetNumber)); + blkno = GinGetDownlink(itup); + Assert(blkno != InvalidBlockNumber); + + UnlockReleaseBuffer(buffer); + buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, + RBM_NORMAL, info->strategy); + } + + /* right now we found leftmost page in entry's BTree */ + + for (;;) + { + Page page = BufferGetPage(buffer); + Page resPage; + uint32 i; + + Assert(!GinPageIsData(page)); + + resPage = ginVacuumEntryPage(&gvs, buffer, rootOfPostingTree, &nRoot); + + blkno = GinPageGetOpaque(page)->rightlink; + + if (resPage) + { + START_CRIT_SECTION(); + PageRestoreTempPage(resPage, page); + MarkBufferDirty(buffer); + xlogVacuumPage(gvs.index, buffer); + UnlockReleaseBuffer(buffer); + END_CRIT_SECTION(); + } + else + { + UnlockReleaseBuffer(buffer); + } + + vacuum_delay_point(); + + for (i = 0; i < nRoot; i++) + { + ginVacuumPostingTree(&gvs, rootOfPostingTree[i]); + vacuum_delay_point(); + } + + if (blkno == InvalidBlockNumber) /* rightmost page */ + break; + + buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, + RBM_NORMAL, info->strategy); + LockBuffer(buffer, GIN_EXCLUSIVE); + } + + MemoryContextDelete(gvs.tmpCxt); + + return gvs.result; +} + +IndexBulkDeleteResult * +ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) +{ + Relation index = info->index; + bool needLock; + BlockNumber npages, + blkno; + BlockNumber totFreePages; + GinState ginstate; + GinStatsData idxStat; + + /* + * In an autovacuum analyze, we want to clean up pending insertions. + * Otherwise, an ANALYZE-only call is a no-op. + */ + if (info->analyze_only) + { + if (IsAutoVacuumWorkerProcess()) + { + initGinState(&ginstate, index); + ginInsertCleanup(&ginstate, false, true, true, stats); + } + return stats; + } + + /* + * Set up all-zero stats and cleanup pending inserts if ginbulkdelete + * wasn't called + */ + if (stats == NULL) + { + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + initGinState(&ginstate, index); + ginInsertCleanup(&ginstate, !IsAutoVacuumWorkerProcess(), + false, true, stats); + } + + memset(&idxStat, 0, sizeof(idxStat)); + + /* + * XXX we always report the heap tuple count as the number of index + * entries. This is bogus if the index is partial, but it's real hard to + * tell how many distinct heap entries are referenced by a GIN index. + */ + stats->num_index_tuples = Max(info->num_heap_tuples, 0); + stats->estimated_count = info->estimated_count; + + /* + * Need lock unless it's local to this backend. 
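+ * The extension lock keeps other backends from extending the relation while we read its current length.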
+ */ + needLock = !RELATION_IS_LOCAL(index); + + if (needLock) + LockRelationForExtension(index, ExclusiveLock); + npages = RelationGetNumberOfBlocks(index); + if (needLock) + UnlockRelationForExtension(index, ExclusiveLock); + + totFreePages = 0; + + for (blkno = GIN_ROOT_BLKNO; blkno < npages; blkno++) + { + Buffer buffer; + Page page; + + vacuum_delay_point(); + + buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, + RBM_NORMAL, info->strategy); + LockBuffer(buffer, GIN_SHARE); + page = (Page) BufferGetPage(buffer); + + if (GinPageIsRecyclable(page)) + { + Assert(blkno != GIN_ROOT_BLKNO); + RecordFreeIndexPage(index, blkno); + totFreePages++; + } + else if (GinPageIsData(page)) + { + idxStat.nDataPages++; + } + else if (!GinPageIsList(page)) + { + idxStat.nEntryPages++; + + if (GinPageIsLeaf(page)) + idxStat.nEntries += PageGetMaxOffsetNumber(page); + } + + UnlockReleaseBuffer(buffer); + } + + /* Update the metapage with accurate page and entry counts */ + idxStat.nTotalPages = npages; + ginUpdateStats(info->index, &idxStat, false); + + /* Finally, vacuum the FSM */ + IndexFreeSpaceMapVacuum(info->index); + + stats->pages_free = totFreePages; + + if (needLock) + LockRelationForExtension(index, ExclusiveLock); + stats->num_pages = RelationGetNumberOfBlocks(index); + if (needLock) + UnlockRelationForExtension(index, ExclusiveLock); + + return stats; +} + +/* + * Return whether Page can safely be recycled. + */ +bool +GinPageIsRecyclable(Page page) +{ + TransactionId delete_xid; + + if (PageIsNew(page)) + return true; + + if (!GinPageIsDeleted(page)) + return false; + + delete_xid = GinPageGetDeleteXid(page); + + if (!TransactionIdIsValid(delete_xid)) + return true; + + /* + * If no backend still could view delete_xid as in running, all scans + * concurrent with ginDeletePage() must have finished. + */ + return GlobalVisCheckRemovableXid(NULL, delete_xid); +} diff --git a/src/backend/access/gin/ginvalidate.c b/src/backend/access/gin/ginvalidate.c new file mode 100644 index 0000000..d2510da --- /dev/null +++ b/src/backend/access/gin/ginvalidate.c @@ -0,0 +1,338 @@ +/*------------------------------------------------------------------------- + * + * ginvalidate.c + * Opclass validator for GIN. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/ginvalidate.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/amvalidate.h" +#include "access/gin_private.h" +#include "access/htup_details.h" +#include "catalog/pg_amop.h" +#include "catalog/pg_amproc.h" +#include "catalog/pg_opclass.h" +#include "catalog/pg_opfamily.h" +#include "catalog/pg_type.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/regproc.h" +#include "utils/syscache.h" + +/* + * Validator for a GIN opclass. 
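+ * Problems are reported as INFO messages and make the result false, so that all of them can be listed in one pass.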
+ */ +bool +ginvalidate(Oid opclassoid) +{ + bool result = true; + HeapTuple classtup; + Form_pg_opclass classform; + Oid opfamilyoid; + Oid opcintype; + Oid opckeytype; + char *opclassname; + HeapTuple familytup; + Form_pg_opfamily familyform; + char *opfamilyname; + CatCList *proclist, + *oprlist; + List *grouplist; + OpFamilyOpFuncGroup *opclassgroup; + int i; + ListCell *lc; + + /* Fetch opclass information */ + classtup = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclassoid)); + if (!HeapTupleIsValid(classtup)) + elog(ERROR, "cache lookup failed for operator class %u", opclassoid); + classform = (Form_pg_opclass) GETSTRUCT(classtup); + + opfamilyoid = classform->opcfamily; + opcintype = classform->opcintype; + opckeytype = classform->opckeytype; + if (!OidIsValid(opckeytype)) + opckeytype = opcintype; + opclassname = NameStr(classform->opcname); + + /* Fetch opfamily information */ + familytup = SearchSysCache1(OPFAMILYOID, ObjectIdGetDatum(opfamilyoid)); + if (!HeapTupleIsValid(familytup)) + elog(ERROR, "cache lookup failed for operator family %u", opfamilyoid); + familyform = (Form_pg_opfamily) GETSTRUCT(familytup); + + opfamilyname = NameStr(familyform->opfname); + + /* Fetch all operators and support functions of the opfamily */ + oprlist = SearchSysCacheList1(AMOPSTRATEGY, ObjectIdGetDatum(opfamilyoid)); + proclist = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(opfamilyoid)); + + /* Check individual support functions */ + for (i = 0; i < proclist->n_members; i++) + { + HeapTuple proctup = &proclist->members[i]->tuple; + Form_pg_amproc procform = (Form_pg_amproc) GETSTRUCT(proctup); + bool ok; + + /* + * All GIN support functions should be registered with matching + * left/right types + */ + if (procform->amproclefttype != procform->amprocrighttype) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains support function %s with different left and right input types", + opfamilyname, "gin", + format_procedure(procform->amproc)))); + result = false; + } + + /* + * We can't check signatures except within the specific opclass, since + * we need to know the associated opckeytype in many cases. 
+ */ + if (procform->amproclefttype != opcintype) + continue; + + /* Check procedure numbers and function signatures */ + switch (procform->amprocnum) + { + case GIN_COMPARE_PROC: + ok = check_amproc_signature(procform->amproc, INT4OID, false, + 2, 2, opckeytype, opckeytype); + break; + case GIN_EXTRACTVALUE_PROC: + /* Some opclasses omit nullFlags */ + ok = check_amproc_signature(procform->amproc, INTERNALOID, false, + 2, 3, opcintype, INTERNALOID, + INTERNALOID); + break; + case GIN_EXTRACTQUERY_PROC: + /* Some opclasses omit nullFlags and searchMode */ + ok = check_amproc_signature(procform->amproc, INTERNALOID, false, + 5, 7, opcintype, INTERNALOID, + INT2OID, INTERNALOID, INTERNALOID, + INTERNALOID, INTERNALOID); + break; + case GIN_CONSISTENT_PROC: + /* Some opclasses omit queryKeys and nullFlags */ + ok = check_amproc_signature(procform->amproc, BOOLOID, false, + 6, 8, INTERNALOID, INT2OID, + opcintype, INT4OID, + INTERNALOID, INTERNALOID, + INTERNALOID, INTERNALOID); + break; + case GIN_COMPARE_PARTIAL_PROC: + ok = check_amproc_signature(procform->amproc, INT4OID, false, + 4, 4, opckeytype, opckeytype, + INT2OID, INTERNALOID); + break; + case GIN_TRICONSISTENT_PROC: + ok = check_amproc_signature(procform->amproc, CHAROID, false, + 7, 7, INTERNALOID, INT2OID, + opcintype, INT4OID, + INTERNALOID, INTERNALOID, + INTERNALOID); + break; + case GIN_OPTIONS_PROC: + ok = check_amoptsproc_signature(procform->amproc); + break; + default: + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains function %s with invalid support number %d", + opfamilyname, "gin", + format_procedure(procform->amproc), + procform->amprocnum))); + result = false; + continue; /* don't want additional message */ + } + + if (!ok) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains function %s with wrong signature for support number %d", + opfamilyname, "gin", + format_procedure(procform->amproc), + procform->amprocnum))); + result = false; + } + } + + /* Check individual operators */ + for (i = 0; i < oprlist->n_members; i++) + { + HeapTuple oprtup = &oprlist->members[i]->tuple; + Form_pg_amop oprform = (Form_pg_amop) GETSTRUCT(oprtup); + + /* TODO: Check that only allowed strategy numbers exist */ + if (oprform->amopstrategy < 1 || oprform->amopstrategy > 63) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains operator %s with invalid strategy number %d", + opfamilyname, "gin", + format_operator(oprform->amopopr), + oprform->amopstrategy))); + result = false; + } + + /* gin doesn't support ORDER BY operators */ + if (oprform->amoppurpose != AMOP_SEARCH || + OidIsValid(oprform->amopsortfamily)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains invalid ORDER BY specification for operator %s", + opfamilyname, "gin", + format_operator(oprform->amopopr)))); + result = false; + } + + /* Check operator signature --- same for all gin strategies */ + if (!check_amop_signature(oprform->amopopr, BOOLOID, + oprform->amoplefttype, + oprform->amoprighttype)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains operator %s with wrong signature", + opfamilyname, "gin", + format_operator(oprform->amopopr)))); + result = false; + } + } + + /* Now check for 
inconsistent groups of operators/functions */ + grouplist = identify_opfamily_groups(oprlist, proclist); + opclassgroup = NULL; + foreach(lc, grouplist) + { + OpFamilyOpFuncGroup *thisgroup = (OpFamilyOpFuncGroup *) lfirst(lc); + + /* Remember the group exactly matching the test opclass */ + if (thisgroup->lefttype == opcintype && + thisgroup->righttype == opcintype) + opclassgroup = thisgroup; + + /* + * There is not a lot we can do to check the operator sets, since each + * GIN opclass is more or less a law unto itself, and some contain + * only operators that are binary-compatible with the opclass datatype + * (meaning that empty operator sets can be OK). That case also means + * that we shouldn't insist on nonempty function sets except for the + * opclass's own group. + */ + } + + /* Check that the originally-named opclass is complete */ + for (i = 1; i <= GINNProcs; i++) + { + if (opclassgroup && + (opclassgroup->functionset & (((uint64) 1) << i)) != 0) + continue; /* got it */ + if (i == GIN_COMPARE_PROC || i == GIN_COMPARE_PARTIAL_PROC || + i == GIN_OPTIONS_PROC) + continue; /* optional method */ + if (i == GIN_CONSISTENT_PROC || i == GIN_TRICONSISTENT_PROC) + continue; /* don't need both, see check below loop */ + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator class \"%s\" of access method %s is missing support function %d", + opclassname, "gin", i))); + result = false; + } + if (!opclassgroup || + ((opclassgroup->functionset & (1 << GIN_CONSISTENT_PROC)) == 0 && + (opclassgroup->functionset & (1 << GIN_TRICONSISTENT_PROC)) == 0)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator class \"%s\" of access method %s is missing support function %d or %d", + opclassname, "gin", + GIN_CONSISTENT_PROC, GIN_TRICONSISTENT_PROC))); + result = false; + } + + + ReleaseCatCacheList(proclist); + ReleaseCatCacheList(oprlist); + ReleaseSysCache(familytup); + ReleaseSysCache(classtup); + + return result; +} + +/* + * Prechecking function for adding operators/functions to a GIN opfamily. + */ +void +ginadjustmembers(Oid opfamilyoid, + Oid opclassoid, + List *operators, + List *functions) +{ + ListCell *lc; + + /* + * Operator members of a GIN opfamily should never have hard dependencies, + * since their connection to the opfamily depends only on what the support + * functions think, and that can be altered. For consistency, we make all + * soft dependencies point to the opfamily, though a soft dependency on + * the opclass would work as well in the CREATE OPERATOR CLASS case. + */ + foreach(lc, operators) + { + OpFamilyMember *op = (OpFamilyMember *) lfirst(lc); + + op->ref_is_hard = false; + op->ref_is_family = true; + op->refobjid = opfamilyoid; + } + + /* + * Required support functions should have hard dependencies. Preferably + * those are just dependencies on the opclass, but if we're in ALTER + * OPERATOR FAMILY, we leave the dependency pointing at the whole + * opfamily. (Given that GIN opclasses generally don't share opfamilies, + * it seems unlikely to be worth working harder.) 
+ */ + foreach(lc, functions) + { + OpFamilyMember *op = (OpFamilyMember *) lfirst(lc); + + switch (op->number) + { + case GIN_EXTRACTVALUE_PROC: + case GIN_EXTRACTQUERY_PROC: + /* Required support function */ + op->ref_is_hard = true; + break; + case GIN_COMPARE_PROC: + case GIN_CONSISTENT_PROC: + case GIN_COMPARE_PARTIAL_PROC: + case GIN_TRICONSISTENT_PROC: + case GIN_OPTIONS_PROC: + /* Optional, so force it to be a soft family dependency */ + op->ref_is_hard = false; + op->ref_is_family = true; + op->refobjid = opfamilyoid; + break; + default: + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("support function number %d is invalid for access method %s", + op->number, "gin"))); + break; + } + } +} diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c new file mode 100644 index 0000000..09ce4d6 --- /dev/null +++ b/src/backend/access/gin/ginxlog.c @@ -0,0 +1,813 @@ +/*------------------------------------------------------------------------- + * + * ginxlog.c + * WAL replay logic for inverted index. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/ginxlog.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/bufmask.h" +#include "access/gin_private.h" +#include "access/ginxlog.h" +#include "access/xlogutils.h" +#include "utils/memutils.h" + +static MemoryContext opCtx; /* working memory for operations */ + +static void +ginRedoClearIncompleteSplit(XLogReaderState *record, uint8 block_id) +{ + XLogRecPtr lsn = record->EndRecPtr; + Buffer buffer; + Page page; + + if (XLogReadBufferForRedo(record, block_id, &buffer) == BLK_NEEDS_REDO) + { + page = (Page) BufferGetPage(buffer); + GinPageGetOpaque(page)->flags &= ~GIN_INCOMPLETE_SPLIT; + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +static void +ginRedoCreatePTree(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + ginxlogCreatePostingTree *data = (ginxlogCreatePostingTree *) XLogRecGetData(record); + char *ptr; + Buffer buffer; + Page page; + + buffer = XLogInitBufferForRedo(record, 0); + page = (Page) BufferGetPage(buffer); + + GinInitBuffer(buffer, GIN_DATA | GIN_LEAF | GIN_COMPRESSED); + + ptr = XLogRecGetData(record) + sizeof(ginxlogCreatePostingTree); + + /* Place page data */ + memcpy(GinDataLeafPageGetPostingList(page), ptr, data->size); + + GinDataPageSetDataSize(page, data->size); + + PageSetLSN(page, lsn); + + MarkBufferDirty(buffer); + UnlockReleaseBuffer(buffer); +} + +static void +ginRedoInsertEntry(Buffer buffer, bool isLeaf, BlockNumber rightblkno, void *rdata) +{ + Page page = BufferGetPage(buffer); + ginxlogInsertEntry *data = (ginxlogInsertEntry *) rdata; + OffsetNumber offset = data->offset; + IndexTuple itup; + + if (rightblkno != InvalidBlockNumber) + { + /* update link to right page after split */ + Assert(!GinPageIsLeaf(page)); + Assert(offset >= FirstOffsetNumber && offset <= PageGetMaxOffsetNumber(page)); + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offset)); + GinSetDownlink(itup, rightblkno); + } + + if (data->isDelete) + { + Assert(GinPageIsLeaf(page)); + Assert(offset >= FirstOffsetNumber && offset <= PageGetMaxOffsetNumber(page)); + PageIndexTupleDelete(page, offset); + } + + itup = &data->tuple; + + if (PageAddItem(page, (Item) itup, 
IndexTupleSize(itup), offset, false, false) == InvalidOffsetNumber) + { + RelFileNode node; + ForkNumber forknum; + BlockNumber blknum; + + BufferGetTag(buffer, &node, &forknum, &blknum); + elog(ERROR, "failed to add item to index page in %u/%u/%u", + node.spcNode, node.dbNode, node.relNode); + } +} + +/* + * Redo recompression of posting list. Doing all the changes in-place is not + * always possible, because it might require more space than we've on the page. + * Instead, once modification is required we copy unprocessed tail of the page + * into separately allocated chunk of memory for further reading original + * versions of segments. Thanks to that we don't bother about moving page data + * in-place. + */ +static void +ginRedoRecompress(Page page, ginxlogRecompressDataLeaf *data) +{ + int actionno; + int segno; + GinPostingList *oldseg; + Pointer segmentend; + char *walbuf; + int totalsize; + Pointer tailCopy = NULL; + Pointer writePtr; + Pointer segptr; + + /* + * If the page is in pre-9.4 format, convert to new format first. + */ + if (!GinPageIsCompressed(page)) + { + ItemPointer uncompressed = (ItemPointer) GinDataPageGetData(page); + int nuncompressed = GinPageGetOpaque(page)->maxoff; + int npacked; + + /* + * Empty leaf pages are deleted as part of vacuum, but leftmost and + * rightmost pages are never deleted. So, pg_upgrade'd from pre-9.4 + * instances might contain empty leaf pages, and we need to handle + * them correctly. + */ + if (nuncompressed > 0) + { + GinPostingList *plist; + + plist = ginCompressPostingList(uncompressed, nuncompressed, + BLCKSZ, &npacked); + totalsize = SizeOfGinPostingList(plist); + + Assert(npacked == nuncompressed); + + memcpy(GinDataLeafPageGetPostingList(page), plist, totalsize); + } + else + { + totalsize = 0; + } + + GinDataPageSetDataSize(page, totalsize); + GinPageSetCompressed(page); + GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber; + } + + oldseg = GinDataLeafPageGetPostingList(page); + writePtr = (Pointer) oldseg; + segmentend = (Pointer) oldseg + GinDataLeafPageGetPostingListSize(page); + segno = 0; + + walbuf = ((char *) data) + sizeof(ginxlogRecompressDataLeaf); + for (actionno = 0; actionno < data->nactions; actionno++) + { + uint8 a_segno = *((uint8 *) (walbuf++)); + uint8 a_action = *((uint8 *) (walbuf++)); + GinPostingList *newseg = NULL; + int newsegsize = 0; + ItemPointerData *items = NULL; + uint16 nitems = 0; + ItemPointerData *olditems; + int nolditems; + ItemPointerData *newitems; + int nnewitems; + int segsize; + + /* Extract all the information we need from the WAL record */ + if (a_action == GIN_SEGMENT_INSERT || + a_action == GIN_SEGMENT_REPLACE) + { + newseg = (GinPostingList *) walbuf; + newsegsize = SizeOfGinPostingList(newseg); + walbuf += SHORTALIGN(newsegsize); + } + + if (a_action == GIN_SEGMENT_ADDITEMS) + { + memcpy(&nitems, walbuf, sizeof(uint16)); + walbuf += sizeof(uint16); + items = (ItemPointerData *) walbuf; + walbuf += nitems * sizeof(ItemPointerData); + } + + /* Skip to the segment that this action concerns */ + Assert(segno <= a_segno); + while (segno < a_segno) + { + /* + * Once modification is started and page tail is copied, we've to + * copy unmodified segments. 
+ */ + segsize = SizeOfGinPostingList(oldseg); + if (tailCopy) + { + Assert(writePtr + segsize < PageGetSpecialPointer(page)); + memcpy(writePtr, (Pointer) oldseg, segsize); + } + writePtr += segsize; + oldseg = GinNextPostingListSegment(oldseg); + segno++; + } + + /* + * ADDITEMS action is handled like REPLACE, but the new segment to + * replace the old one is reconstructed using the old segment from + * disk and the new items from the WAL record. + */ + if (a_action == GIN_SEGMENT_ADDITEMS) + { + int npacked; + + olditems = ginPostingListDecode(oldseg, &nolditems); + + newitems = ginMergeItemPointers(items, nitems, + olditems, nolditems, + &nnewitems); + Assert(nnewitems == nolditems + nitems); + + newseg = ginCompressPostingList(newitems, nnewitems, + BLCKSZ, &npacked); + Assert(npacked == nnewitems); + + newsegsize = SizeOfGinPostingList(newseg); + a_action = GIN_SEGMENT_REPLACE; + } + + segptr = (Pointer) oldseg; + if (segptr != segmentend) + segsize = SizeOfGinPostingList(oldseg); + else + { + /* + * Positioned after the last existing segment. Only INSERTs + * expected here. + */ + Assert(a_action == GIN_SEGMENT_INSERT); + segsize = 0; + } + + /* + * We're about to start modification of the page. So, copy tail of + * the page if it's not done already. + */ + if (!tailCopy && segptr != segmentend) + { + int tailSize = segmentend - segptr; + + tailCopy = (Pointer) palloc(tailSize); + memcpy(tailCopy, segptr, tailSize); + segptr = tailCopy; + oldseg = (GinPostingList *) segptr; + segmentend = segptr + tailSize; + } + + switch (a_action) + { + case GIN_SEGMENT_DELETE: + segptr += segsize; + segno++; + break; + + case GIN_SEGMENT_INSERT: + /* copy the new segment in place */ + Assert(writePtr + newsegsize <= PageGetSpecialPointer(page)); + memcpy(writePtr, newseg, newsegsize); + writePtr += newsegsize; + break; + + case GIN_SEGMENT_REPLACE: + /* copy the new version of segment in place */ + Assert(writePtr + newsegsize <= PageGetSpecialPointer(page)); + memcpy(writePtr, newseg, newsegsize); + writePtr += newsegsize; + segptr += segsize; + segno++; + break; + + default: + elog(ERROR, "unexpected GIN leaf action: %u", a_action); + } + oldseg = (GinPostingList *) segptr; + } + + /* Copy the rest of unmodified segments if any. 
*/ + segptr = (Pointer) oldseg; + if (segptr != segmentend && tailCopy) + { + int restSize = segmentend - segptr; + + Assert(writePtr + restSize <= PageGetSpecialPointer(page)); + memcpy(writePtr, segptr, restSize); + writePtr += restSize; + } + + totalsize = writePtr - (Pointer) GinDataLeafPageGetPostingList(page); + GinDataPageSetDataSize(page, totalsize); +} + +static void +ginRedoInsertData(Buffer buffer, bool isLeaf, BlockNumber rightblkno, void *rdata) +{ + Page page = BufferGetPage(buffer); + + if (isLeaf) + { + ginxlogRecompressDataLeaf *data = (ginxlogRecompressDataLeaf *) rdata; + + Assert(GinPageIsLeaf(page)); + + ginRedoRecompress(page, data); + } + else + { + ginxlogInsertDataInternal *data = (ginxlogInsertDataInternal *) rdata; + PostingItem *oldpitem; + + Assert(!GinPageIsLeaf(page)); + + /* update link to right page after split */ + oldpitem = GinDataPageGetPostingItem(page, data->offset); + PostingItemSetBlockNumber(oldpitem, rightblkno); + + GinDataPageAddPostingItem(page, &data->newitem, data->offset); + } +} + +static void +ginRedoInsert(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + ginxlogInsert *data = (ginxlogInsert *) XLogRecGetData(record); + Buffer buffer; +#ifdef NOT_USED + BlockNumber leftChildBlkno = InvalidBlockNumber; +#endif + BlockNumber rightChildBlkno = InvalidBlockNumber; + bool isLeaf = (data->flags & GIN_INSERT_ISLEAF) != 0; + + /* + * First clear incomplete-split flag on child page if this finishes a + * split. + */ + if (!isLeaf) + { + char *payload = XLogRecGetData(record) + sizeof(ginxlogInsert); + +#ifdef NOT_USED + leftChildBlkno = BlockIdGetBlockNumber((BlockId) payload); +#endif + payload += sizeof(BlockIdData); + rightChildBlkno = BlockIdGetBlockNumber((BlockId) payload); + payload += sizeof(BlockIdData); + + ginRedoClearIncompleteSplit(record, 1); + } + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + Page page = BufferGetPage(buffer); + Size len; + char *payload = XLogRecGetBlockData(record, 0, &len); + + /* How to insert the payload is tree-type specific */ + if (data->flags & GIN_INSERT_ISDATA) + { + Assert(GinPageIsData(page)); + ginRedoInsertData(buffer, isLeaf, rightChildBlkno, payload); + } + else + { + Assert(!GinPageIsData(page)); + ginRedoInsertEntry(buffer, isLeaf, rightChildBlkno, payload); + } + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +static void +ginRedoSplit(XLogReaderState *record) +{ + ginxlogSplit *data = (ginxlogSplit *) XLogRecGetData(record); + Buffer lbuffer, + rbuffer, + rootbuf; + bool isLeaf = (data->flags & GIN_INSERT_ISLEAF) != 0; + bool isRoot = (data->flags & GIN_SPLIT_ROOT) != 0; + + /* + * First clear incomplete-split flag on child page if this finishes a + * split + */ + if (!isLeaf) + ginRedoClearIncompleteSplit(record, 3); + + if (XLogReadBufferForRedo(record, 0, &lbuffer) != BLK_RESTORED) + elog(ERROR, "GIN split record did not contain a full-page image of left page"); + + if (XLogReadBufferForRedo(record, 1, &rbuffer) != BLK_RESTORED) + elog(ERROR, "GIN split record did not contain a full-page image of right page"); + + if (isRoot) + { + if (XLogReadBufferForRedo(record, 2, &rootbuf) != BLK_RESTORED) + elog(ERROR, "GIN split record did not contain a full-page image of root page"); + UnlockReleaseBuffer(rootbuf); + } + + UnlockReleaseBuffer(rbuffer); + UnlockReleaseBuffer(lbuffer); +} + +/* + * VACUUM_PAGE record contains simply a full image of the page, similar to + * an XLOG_FPI 
record. + */ +static void +ginRedoVacuumPage(XLogReaderState *record) +{ + Buffer buffer; + + if (XLogReadBufferForRedo(record, 0, &buffer) != BLK_RESTORED) + { + elog(ERROR, "replay of gin entry tree page vacuum did not restore the page"); + } + UnlockReleaseBuffer(buffer); +} + +static void +ginRedoVacuumDataLeafPage(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + Buffer buffer; + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + Page page = BufferGetPage(buffer); + Size len; + ginxlogVacuumDataLeafPage *xlrec; + + xlrec = (ginxlogVacuumDataLeafPage *) XLogRecGetBlockData(record, 0, &len); + + Assert(GinPageIsLeaf(page)); + Assert(GinPageIsData(page)); + + ginRedoRecompress(page, &xlrec->data); + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +static void +ginRedoDeletePage(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + ginxlogDeletePage *data = (ginxlogDeletePage *) XLogRecGetData(record); + Buffer dbuffer; + Buffer pbuffer; + Buffer lbuffer; + Page page; + + /* + * Lock left page first in order to prevent possible deadlock with + * ginStepRight(). + */ + if (XLogReadBufferForRedo(record, 2, &lbuffer) == BLK_NEEDS_REDO) + { + page = BufferGetPage(lbuffer); + Assert(GinPageIsData(page)); + GinPageGetOpaque(page)->rightlink = data->rightLink; + PageSetLSN(page, lsn); + MarkBufferDirty(lbuffer); + } + + if (XLogReadBufferForRedo(record, 0, &dbuffer) == BLK_NEEDS_REDO) + { + page = BufferGetPage(dbuffer); + Assert(GinPageIsData(page)); + GinPageSetDeleted(page); + GinPageSetDeleteXid(page, data->deleteXid); + PageSetLSN(page, lsn); + MarkBufferDirty(dbuffer); + } + + if (XLogReadBufferForRedo(record, 1, &pbuffer) == BLK_NEEDS_REDO) + { + page = BufferGetPage(pbuffer); + Assert(GinPageIsData(page)); + Assert(!GinPageIsLeaf(page)); + GinPageDeletePostingItem(page, data->parentOffset); + PageSetLSN(page, lsn); + MarkBufferDirty(pbuffer); + } + + if (BufferIsValid(lbuffer)) + UnlockReleaseBuffer(lbuffer); + if (BufferIsValid(pbuffer)) + UnlockReleaseBuffer(pbuffer); + if (BufferIsValid(dbuffer)) + UnlockReleaseBuffer(dbuffer); +} + +static void +ginRedoUpdateMetapage(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + ginxlogUpdateMeta *data = (ginxlogUpdateMeta *) XLogRecGetData(record); + Buffer metabuffer; + Page metapage; + Buffer buffer; + + /* + * Restore the metapage. This is essentially the same as a full-page + * image, so restore the metapage unconditionally without looking at the + * LSN, to avoid torn page hazards. 
+ */ + metabuffer = XLogInitBufferForRedo(record, 0); + Assert(BufferGetBlockNumber(metabuffer) == GIN_METAPAGE_BLKNO); + metapage = BufferGetPage(metabuffer); + + GinInitMetabuffer(metabuffer); + memcpy(GinPageGetMeta(metapage), &data->metadata, sizeof(GinMetaPageData)); + PageSetLSN(metapage, lsn); + MarkBufferDirty(metabuffer); + + if (data->ntuples > 0) + { + /* + * insert into tail page + */ + if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) + { + Page page = BufferGetPage(buffer); + OffsetNumber off; + int i; + Size tupsize; + char *payload; + IndexTuple tuples; + Size totaltupsize; + + payload = XLogRecGetBlockData(record, 1, &totaltupsize); + tuples = (IndexTuple) payload; + + if (PageIsEmpty(page)) + off = FirstOffsetNumber; + else + off = OffsetNumberNext(PageGetMaxOffsetNumber(page)); + + for (i = 0; i < data->ntuples; i++) + { + tupsize = IndexTupleSize(tuples); + + if (PageAddItem(page, (Item) tuples, tupsize, off, + false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add item to index page"); + + tuples = (IndexTuple) (((char *) tuples) + tupsize); + + off++; + } + Assert(payload + totaltupsize == (char *) tuples); + + /* + * Increase counter of heap tuples + */ + GinPageGetOpaque(page)->maxoff++; + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + } + else if (data->prevTail != InvalidBlockNumber) + { + /* + * New tail + */ + if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) + { + Page page = BufferGetPage(buffer); + + GinPageGetOpaque(page)->rightlink = data->newRightlink; + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + } + + UnlockReleaseBuffer(metabuffer); +} + +static void +ginRedoInsertListPage(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + ginxlogInsertListPage *data = (ginxlogInsertListPage *) XLogRecGetData(record); + Buffer buffer; + Page page; + OffsetNumber l, + off = FirstOffsetNumber; + int i, + tupsize; + char *payload; + IndexTuple tuples; + Size totaltupsize; + + /* We always re-initialize the page. 
*/ + buffer = XLogInitBufferForRedo(record, 0); + page = BufferGetPage(buffer); + + GinInitBuffer(buffer, GIN_LIST); + GinPageGetOpaque(page)->rightlink = data->rightlink; + if (data->rightlink == InvalidBlockNumber) + { + /* tail of sublist */ + GinPageSetFullRow(page); + GinPageGetOpaque(page)->maxoff = 1; + } + else + { + GinPageGetOpaque(page)->maxoff = 0; + } + + payload = XLogRecGetBlockData(record, 0, &totaltupsize); + + tuples = (IndexTuple) payload; + for (i = 0; i < data->ntuples; i++) + { + tupsize = IndexTupleSize(tuples); + + l = PageAddItem(page, (Item) tuples, tupsize, off, false, false); + + if (l == InvalidOffsetNumber) + elog(ERROR, "failed to add item to index page"); + + tuples = (IndexTuple) (((char *) tuples) + tupsize); + off++; + } + Assert((char *) tuples == payload + totaltupsize); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + + UnlockReleaseBuffer(buffer); +} + +static void +ginRedoDeleteListPages(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + ginxlogDeleteListPages *data = (ginxlogDeleteListPages *) XLogRecGetData(record); + Buffer metabuffer; + Page metapage; + int i; + + metabuffer = XLogInitBufferForRedo(record, 0); + Assert(BufferGetBlockNumber(metabuffer) == GIN_METAPAGE_BLKNO); + metapage = BufferGetPage(metabuffer); + + GinInitMetabuffer(metabuffer); + + memcpy(GinPageGetMeta(metapage), &data->metadata, sizeof(GinMetaPageData)); + PageSetLSN(metapage, lsn); + MarkBufferDirty(metabuffer); + + /* + * In normal operation, shiftList() takes exclusive lock on all the + * pages-to-be-deleted simultaneously. During replay, however, it should + * be all right to lock them one at a time. This is dependent on the fact + * that we are deleting pages from the head of the list, and that readers + * share-lock the next page before releasing the one they are on. So we + * cannot get past a reader that is on, or due to visit, any page we are + * going to delete. New incoming readers will block behind our metapage + * lock and then see a fully updated page list. + * + * No full-page images are taken of the deleted pages. Instead, they are + * re-initialized as empty, deleted pages. Their right-links don't need to + * be preserved, because no new readers can see the pages, as explained + * above. + */ + for (i = 0; i < data->ndeleted; i++) + { + Buffer buffer; + Page page; + + buffer = XLogInitBufferForRedo(record, i + 1); + page = BufferGetPage(buffer); + GinInitBuffer(buffer, GIN_DELETED); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + + UnlockReleaseBuffer(buffer); + } + UnlockReleaseBuffer(metabuffer); +} + +void +gin_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + MemoryContext oldCtx; + + /* + * GIN indexes do not require any conflict processing. NB: If we ever + * implement a similar optimization as we have in b-tree, and remove + * killed tuples outside VACUUM, we'll need to handle that here. 
+ */ + + oldCtx = MemoryContextSwitchTo(opCtx); + switch (info) + { + case XLOG_GIN_CREATE_PTREE: + ginRedoCreatePTree(record); + break; + case XLOG_GIN_INSERT: + ginRedoInsert(record); + break; + case XLOG_GIN_SPLIT: + ginRedoSplit(record); + break; + case XLOG_GIN_VACUUM_PAGE: + ginRedoVacuumPage(record); + break; + case XLOG_GIN_VACUUM_DATA_LEAF_PAGE: + ginRedoVacuumDataLeafPage(record); + break; + case XLOG_GIN_DELETE_PAGE: + ginRedoDeletePage(record); + break; + case XLOG_GIN_UPDATE_META_PAGE: + ginRedoUpdateMetapage(record); + break; + case XLOG_GIN_INSERT_LISTPAGE: + ginRedoInsertListPage(record); + break; + case XLOG_GIN_DELETE_LISTPAGE: + ginRedoDeleteListPages(record); + break; + default: + elog(PANIC, "gin_redo: unknown op code %u", info); + } + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(opCtx); +} + +void +gin_xlog_startup(void) +{ + opCtx = AllocSetContextCreate(CurrentMemoryContext, + "GIN recovery temporary context", + ALLOCSET_DEFAULT_SIZES); +} + +void +gin_xlog_cleanup(void) +{ + MemoryContextDelete(opCtx); + opCtx = NULL; +} + +/* + * Mask a GIN page before running consistency checks on it. + */ +void +gin_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + PageHeader pagehdr = (PageHeader) page; + GinPageOpaque opaque; + + mask_page_lsn_and_checksum(page); + opaque = GinPageGetOpaque(page); + + mask_page_hint_bits(page); + + /* + * For a GIN_DELETED page, the page is initialized to empty. Hence, mask + * the whole page content. For other pages, mask the hole if pd_lower + * appears to have been set correctly. + */ + if (opaque->flags & GIN_DELETED) + mask_page_content(page); + else if (pagehdr->pd_lower > SizeOfPageHeaderData) + mask_unused_space(page); +} diff --git a/src/backend/access/gist/Makefile b/src/backend/access/gist/Makefile new file mode 100644 index 0000000..1aca8bc --- /dev/null +++ b/src/backend/access/gist/Makefile @@ -0,0 +1,28 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/gist +# +# IDENTIFICATION +# src/backend/access/gist/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/gist +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + gist.o \ + gistbuild.o \ + gistbuildbuffers.o \ + gistget.o \ + gistproc.o \ + gistscan.o \ + gistsplit.o \ + gistutil.o \ + gistvacuum.o \ + gistvalidate.o \ + gistxlog.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README new file mode 100644 index 0000000..25cab00 --- /dev/null +++ b/src/backend/access/gist/README @@ -0,0 +1,467 @@ +src/backend/access/gist/README + +GiST Indexing +============= + +This directory contains an implementation of GiST indexing for Postgres. + +GiST stands for Generalized Search Tree. It was introduced in the seminal paper +"Generalized Search Trees for Database Systems", 1995, Joseph M. Hellerstein, +Jeffrey F. Naughton, Avi Pfeffer: + + http://www.sai.msu.su/~megera/postgres/gist/papers/gist.ps + https://dsf.berkeley.edu/papers/sigmod97-gist.pdf + +and implemented by J. Hellerstein and P. Aoki in an early version of +PostgreSQL (more details are available from The GiST Indexing Project +at Berkeley at http://gist.cs.berkeley.edu/). As a "university" +project it had a limited number of features and was in rare use. 
+ +The current implementation of GiST supports: + + * Variable length keys + * Composite keys (multi-key) + * Ordered search (nearest-neighbor search) + * provides NULL-safe interface to GiST core + * Concurrency + * Recovery support via WAL logging + * Buffering build algorithm + +The support for concurrency implemented in PostgreSQL was developed based on +the paper "Access Methods for Next-Generation Database Systems" by +Marcel Kornacker: + + http://www.sai.msu.su/~megera/postgres/gist/papers/concurrency/access-methods-for-next-generation.pdf.gz + +Buffering build algorithm for GiST was developed based on the paper "Efficient +Bulk Operations on Dynamic R-trees" by Lars Arge, Klaus Hinrichs, Jan Vahrenhold +and Jeffrey Scott Vitter. + + http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.135.9894&rep=rep1&type=pdf + +The original algorithms were modified in several ways: + +* They had to be adapted to PostgreSQL conventions. For example, the SEARCH + algorithm was considerably changed, because in PostgreSQL the search function + should return one tuple (next), not all tuples at once. Also, it should + release page locks between calls. +* Since we added support for variable length keys, it's not possible to + guarantee enough free space for all keys on pages after splitting. User + defined function picksplit doesn't have information about size of tuples + (each tuple may contain several keys as in multicolumn index while picksplit + could work with only one key) and pages. +* We modified original INSERT algorithm for performance reasons. In particular, + it is now a single-pass algorithm. +* Since the papers were theoretical, some details were omitted and we + had to find out ourself how to solve some specific problems. + +Because of the above reasons, we have revised the interaction of GiST +core and PostgreSQL WAL system. Moreover, we encountered (and solved) +a problem of uncompleted insertions when recovering after crash, which +was not touched in the paper. + +Search Algorithm +---------------- + +The search code maintains a queue of unvisited items, where an "item" is +either a heap tuple known to satisfy the search conditions, or an index +page that is consistent with the search conditions according to inspection +of its parent page's downlink item. Initially the root page is searched +to find unvisited items in it. Then we pull items from the queue. A +heap tuple pointer is just returned immediately; an index page entry +causes that page to be searched, generating more queue entries. + +The queue is kept ordered with heap tuple items at the front, then +index page entries, with any newly-added index page entry inserted +before existing index page entries. This ensures depth-first traversal +of the index, and in particular causes the first few heap tuples to be +returned as soon as possible. That is helpful in case there is a LIMIT +that requires only a few tuples to be produced. + +To implement nearest-neighbor search, the queue entries are augmented +with distance data: heap tuple entries are labeled with exact distance +from the search argument, while index-page entries must be labeled with +the minimum distance that any of their children could have. Then, +queue entries are retrieved in smallest-distance-first order, with +entries having identical distances managed as stated in the previous +paragraph. + +The search algorithm keeps an index page locked only long enough to scan +its entries and queue those that satisfy the search conditions. 
Since +insertions can occur concurrently with searches, it is possible for an +index child page to be split between the time we make a queue entry for it +(while visiting its parent page) and the time we actually reach and scan +the child page. To avoid missing the entries that were moved to the right +sibling, we detect whether a split has occurred by comparing the child +page's NSN (node sequence number, a special-purpose LSN) to the LSN that +the parent had when visited. If it did, the sibling page is immediately +added to the front of the queue, ensuring that its items will be scanned +in the same order as if they were still on the original child page. + +As is usual in Postgres, the search algorithm only guarantees to find index +entries that existed before the scan started; index entries added during +the scan might or might not be visited. This is okay as long as all +searches use MVCC snapshot rules to reject heap tuples newer than the time +of scan start. In particular, this means that we need not worry about +cases where a parent page's downlink key is "enlarged" after we look at it. +Any such enlargement would be to add child items that we aren't interested +in returning anyway. + + +Insert Algorithm +---------------- + +INSERT guarantees that the GiST tree remains balanced. User defined key method +Penalty is used for choosing a subtree to insert; method PickSplit is used for +the node splitting algorithm; method Union is used for propagating changes +upward to maintain the tree properties. + +To insert a tuple, we first have to find a suitable leaf page to insert to. +The algorithm walks down the tree, starting from the root, along the path +of smallest Penalty. At each step: + +1. Has this page been split since we looked at the parent? If so, it's +possible that we should be inserting to the other half instead, so retreat +back to the parent. +2. If this is a leaf node, we've found our target node. +3. Otherwise use Penalty to pick a new target subtree. +4. Check the key representing the target subtree. If it doesn't already cover +the key we're inserting, replace it with the Union of the old downlink key +and the key being inserted. (Actually, we always call Union, and just skip +the replacement if the Unioned key is the same as the existing key) +5. Replacing the key in step 4 might cause the page to be split. In that case, +propagate the change upwards and restart the algorithm from the first parent +that didn't need to be split. +6. Walk down to the target subtree, and goto 1. + +This differs from the insertion algorithm in the original paper. In the +original paper, you first walk down the tree until you reach a leaf page, and +then you adjust the downlink in the parent, and propagate the adjustment up, +all the way up to the root in the worst case. But we adjust the downlinks to +cover the new key already when we walk down, so that when we reach the leaf +page, we don't need to update the parents anymore, except to insert the +downlinks if we have to split the page. This makes crash recovery simpler: +after inserting a key to the page, the tree is immediately self-consistent +without having to update the parents. Even if we split a page and crash before +inserting the downlink to the parent, the tree is self-consistent because the +right half of the split is accessible via the rightlink of the left page +(which replaced the original page). 
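+
+As a concrete (and deliberately simplified) illustration of steps 3 and 4
+above, the toy C program below models the descent rule with one-dimensional
+integer ranges as keys: "penalty" is how much a downlink's range would have
+to grow to cover the new key, and "union" is the enlarged range written back
+into the downlink before descending.  This is only a sketch of the idea; the
+names and the range representation are invented for the example and are not
+PostgreSQL code (the real methods are the opclass's Penalty and Union support
+functions, which operate on datums).
+
+#include <limits.h>
+#include <stdio.h>
+
+typedef struct { int lo, hi; } Range;   /* downlink key: a 1-D interval */
+
+/* growth needed for r to cover key (the Penalty analogue) */
+static int penalty(Range r, Range key)
+{
+    int grow_lo = key.lo < r.lo ? r.lo - key.lo : 0;
+    int grow_hi = key.hi > r.hi ? key.hi - r.hi : 0;
+
+    return grow_lo + grow_hi;
+}
+
+/* smallest range covering both r and key (the Union analogue) */
+static Range range_union(Range r, Range key)
+{
+    if (key.lo < r.lo) r.lo = key.lo;
+    if (key.hi > r.hi) r.hi = key.hi;
+    return r;
+}
+
+int main(void)
+{
+    Range downlinks[] = {{0, 10}, {20, 40}, {50, 90}};
+    Range key = {42, 45};
+    int best = -1, bestpen = INT_MAX;
+
+    /* step 3: choose the subtree with the minimum insertion penalty */
+    for (int i = 0; i < 3; i++)
+    {
+        int p = penalty(downlinks[i], key);
+
+        if (p < bestpen) { bestpen = p; best = i; }
+    }
+
+    /* step 4: replace the downlink with the union, then descend into it */
+    downlinks[best] = range_union(downlinks[best], key);
+    printf("descend into child %d, downlink now [%d, %d]\n",
+           best, downlinks[best].lo, downlinks[best].hi);
+    return 0;
+}
+
+Running this chooses child 1 (its range only has to grow by 5) and enlarges
+its downlink to [20, 45], which is exactly the "adjust the downlink on the
+way down" behavior described above.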
+ +Note that the algorithm can walk up and down the tree before reaching a leaf +page, if internal pages need to split while adjusting the downlinks for the +new key. Eventually, you should reach the bottom, and proceed with the +insertion of the new tuple. + +Once we've found the target page to insert to, we check if there's room +for the new tuple. If there is, the tuple is inserted, and we're done. +If it doesn't fit, however, the page needs to be split. Note that it is +possible that a page needs to be split into more than two pages, if keys have +different lengths or more than one key is being inserted at a time (which can +happen when inserting downlinks for a page split that resulted in more than +two pages at the lower level). After splitting a page, the parent page needs +to be updated. The downlink for the new page needs to be inserted, and the +downlink for the old page, which became the left half of the split, needs to +be updated to only cover those tuples that stayed on the left page. Inserting +the downlink in the parent can again lead to a page split, recursing up to the +root page in the worst case. + +gistplacetopage is the workhorse function that performs one step of the +insertion. If the tuple fits, it inserts it to the given page, otherwise +it splits the page, and constructs the new downlink tuples for the split +pages. The caller must then call gistplacetopage() on the parent page to +insert the downlink tuples. The parent page that holds the downlink to +the child might have migrated as a result of concurrent splits of the +parent, gistFindCorrectParent() is used to find the parent page. + +Splitting the root page works slightly differently. At root split, +gistplacetopage() allocates the new child pages and replaces the old root +page with the new root containing downlinks to the new children, all in one +operation. + + +findPath is a subroutine of findParent, used when the correct parent page +can't be found by following the rightlinks at the parent level: + +findPath( stack item ) + push stack, [root, 0, 0] // page, LSN, parent + while( stack ) + ptr = top of stack + latch( ptr->page, S-mode ) + if ( ptr->parent->page->lsn < ptr->page->nsn ) + push stack, [ ptr->page->rightlink, 0, ptr->parent ] + end + for( each tuple on page ) + if ( tuple->pagepointer == item->page ) + return stack + else + add to stack at the end [tuple->pagepointer,0, ptr] + end + end + unlatch( ptr->page ) + pop stack + end + + +gistFindCorrectParent is used to re-find the parent of a page during +insertion. It might have migrated to the right since we traversed down the +tree because of page splits. 
+ +findParent( stack item ) + parent = item->parent + if ( parent->page->lsn != parent->lsn ) + while(true) + search parent tuple on parent->page, if found the return + rightlink = parent->page->rightlink + unlatch( parent->page ) + if ( rightlink is incorrect ) + break loop + end + parent->page = rightlink + latch( parent->page, X-mode ) + end + newstack = findPath( item->parent ) + replace part of stack to new one + latch( parent->page, X-mode ) + return findParent( item ) + end + +pageSplit function decides how to distribute keys to the new pages after +page split: + +pageSplit(page, allkeys) + (lkeys, rkeys) = pickSplit( allkeys ) + if ( page is root ) + lpage = new page + else + lpage = page + rpage = new page + if ( no space left on rpage ) + newkeys = pageSplit( rpage, rkeys ) + else + push newkeys, union(rkeys) + end + if ( no space left on lpage ) + push newkeys, pageSplit( lpage, lkeys ) + else + push newkeys, union(lkeys) + end + return newkeys + + + +Concurrency control +------------------- +As a rule of thumb, if you need to hold a lock on multiple pages at the +same time, the locks should be acquired in the following order: child page +before parent, and left-to-right at the same level. Always acquiring the +locks in the same order avoids deadlocks. + +The search algorithm only looks at and locks one page at a time. Consequently +there's a race condition between a search and a page split. A page split +happens in two phases: 1. The page is split 2. The downlink is inserted to the +parent. If a search looks at the parent page between those steps, before the +downlink is inserted, it will still find the new right half by following the +rightlink on the left half. But it must not follow the rightlink if it saw the +downlink in the parent, or the page will be visited twice! + +A split initially marks the left page with the F_FOLLOW_RIGHT flag. If a scan +sees that flag set, it knows that the right page is missing the downlink, and +should be visited too. When split inserts the downlink to the parent, it +clears the F_FOLLOW_RIGHT flag in the child, and sets the NSN field in the +child page header to match the LSN of the insertion on the parent. If the +F_FOLLOW_RIGHT flag is not set, a scan compares the NSN on the child and the +LSN it saw in the parent. If NSN < LSN, the scan looked at the parent page +before the downlink was inserted, so it should follow the rightlink. Otherwise +the scan saw the downlink in the parent page, and will/did follow that as +usual. + +A scan can't normally see a page with the F_FOLLOW_RIGHT flag set, because +a page split keeps the child pages locked until the downlink has been inserted +to the parent and the flag cleared again. But if a crash happens in the middle +of a page split, before the downlinks are inserted into the parent, that will +leave a page with F_FOLLOW_RIGHT in the tree. Scans handle that just fine, +but we'll eventually want to fix that for performance reasons. And more +importantly, dealing with pages with missing downlink pointers in the parent +would complicate the insertion algorithm. So when an insertion sees a page +with F_FOLLOW_RIGHT set, it immediately tries to bring the split that +crashed in the middle to completion by adding the downlink in the parent. + +Buffering build algorithm +------------------------- + +In the buffering index build algorithm, some or all internal nodes have a +buffer attached to them. 
When a tuple is inserted at the top, the descend down +the tree is stopped as soon as a buffer is reached, and the tuple is pushed to +the buffer. When a buffer gets too full, all the tuples in it are flushed to +the lower level, where they again hit lower level buffers or leaf pages. This +makes the insertions happen in more of a breadth-first than depth-first order, +which greatly reduces the amount of random I/O required. + +In the algorithm, levels are numbered so that leaf pages have level zero, +and internal node levels count up from 1. This numbering ensures that a page's +level number never changes, even when the root page is split. + +Level Tree + +3 * + / \ +2 * * + / | \ / | \ +1 * * * * * * + / \ / \ / \ / \ / \ / \ +0 o o o o o o o o o o o o + +* - internal page +o - leaf page + +Internal pages that belong to certain levels have buffers associated with +them. Leaf pages never have buffers. Which levels have buffers is controlled +by "level step" parameter: level numbers that are multiples of level_step +have buffers, while others do not. For example, if level_step = 2, then +pages on levels 2, 4, 6, ... have buffers. If level_step = 1 then every +internal page has a buffer. + +Level Tree (level_step = 1) Tree (level_step = 2) + +3 * * + / \ / \ +2 *(b) *(b) *(b) *(b) + / | \ / | \ / | \ / | \ +1 *(b) *(b) *(b) *(b) *(b) *(b) * * * * * * + / \ / \ / \ / \ / \ / \ / \ / \ / \ / \ / \ / \ +0 o o o o o o o o o o o o o o o o o o o o o o o o + +(b) - buffer + +Logically, a buffer is just bunch of tuples. Physically, it is divided in +pages, backed by a temporary file. Each buffer can be in one of two states: +a) Last page of the buffer is kept in main memory. A node buffer is +automatically switched to this state when a new index tuple is added to it, +or a tuple is removed from it. +b) All pages of the buffer are swapped out to disk. When a buffer becomes too +full, and we start to flush it, all other buffers are switched to this state. + +When an index tuple is inserted, its initial processing can end in one of the +following points: +1) Leaf page, if the depth of the index <= level_step, meaning that + none of the internal pages have buffers associated with them. +2) Buffer of topmost level page that has buffers. + +New index tuples are processed until one of the buffers in the topmost +buffered level becomes half-full. When a buffer becomes half-full, it's added +to the emptying queue, and will be emptied before a new tuple is processed. + +Buffer emptying process means that index tuples from the buffer are moved +into buffers at a lower level, or leaf pages. First, all the other buffers are +swapped to disk to free up the memory. Then tuples are popped from the buffer +one by one, and cascaded down the tree to the next buffer or leaf page below +the buffered node. + +Emptying a buffer has the interesting dynamic property that any intermediate +pages between the buffer being emptied, and the next buffered or leaf level +below it, become cached. If there are no more buffers below the node, the leaf +pages where the tuples finally land on get cached too. If there are, the last +buffer page of each buffer below is kept in memory. 
This is illustrated in +the figures below: + + Buffer being emptied to + lower-level buffers Buffer being emptied to leaf pages + + +(fb) +(fb) + / \ / \ + + + + + + / \ / \ / \ / \ + *(ab) *(ab) *(ab) *(ab) x x x x + ++ - cached internal page +x - cached leaf page +* - non-cached internal page +(fb) - buffer being emptied +(ab) - buffers being appended to, with last page in memory + +In the beginning of the index build, the level-step is chosen so that all those +pages involved in emptying one buffer fit in cache, so after each of those +pages have been accessed once and cached, emptying a buffer doesn't involve +any more I/O. This locality is where the speedup of the buffering algorithm +comes from. + +Emptying one buffer can fill up one or more of the lower-level buffers, +triggering emptying of them as well. Whenever a buffer becomes too full, it's +added to the emptying queue, and will be emptied after the current buffer has +been processed. + +To keep the size of each buffer limited even in the worst case, buffer emptying +is scheduled as soon as a buffer becomes half-full, and emptying it continues +until 1/2 of the nominal buffer size worth of tuples has been emptied. This +guarantees that when buffer emptying begins, all the lower-level buffers +are at most half-full. In the worst case that all the tuples are cascaded down +to the same lower-level buffer, that buffer therefore has enough space to +accommodate all the tuples emptied from the upper-level buffer. There is no +hard size limit in any of the data structures used, though, so this only needs +to be approximate; small overfilling of some buffers doesn't matter. + +If an internal page that has a buffer associated with it is split, the buffer +needs to be split too. All tuples in the buffer are scanned through and +relocated to the correct sibling buffers, using the penalty function to decide +which buffer each tuple should go to. + +After all tuples from the heap have been processed, there are still some index +tuples in the buffers. At this point, final buffer emptying starts. All buffers +are emptied in top-down order. This is slightly complicated by the fact that +new buffers can be allocated during the emptying, due to page splits. However, +the new buffers will always be siblings of buffers that haven't been fully +emptied yet; tuples never move upwards in the tree. The final emptying loops +through buffers at a given level until all buffers at that level have been +emptied, and then moves down to the next level. + +Bulk delete algorithm (VACUUM) +------------------------------ + +VACUUM works in two stages: + +In the first stage, we scan the whole index in physical order. To make sure +that we don't miss any dead tuples because a concurrent page split moved them, +we check the F_FOLLOW_RIGHT flags and NSN on each page, to detect if the +page has been concurrently split. If a concurrent page split is detected, and +one half of the page was moved to a position that we already scanned, we +"jump backwards" to scan the page again. This is the same mechanism that +B-tree VACUUM uses, but because we already have NSNs on pages, to detect page +splits during searches, we don't need a "vacuum cycle ID" concept for that +like B-tree does. + +While we scan all the pages, we also make note of any completely empty leaf +pages. We will try to unlink them from the tree after the scan. We also record +the block numbers of all internal pages; they are needed to locate parents of +the empty pages while unlinking them. 
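+
+The toy C sketch below (not PostgreSQL code; the page layout and names are
+invented for the example) illustrates the bookkeeping this implies: the first
+stage records which leaf pages are empty and which blocks hold internal
+pages, and the second stage, described in the next paragraph, revisits only
+those internal pages looking for downlinks to the recorded empty leaves.
+Locking, the re-check that a page is still empty, and concurrent splits are
+all omitted here.
+
+#include <stdbool.h>
+#include <stdio.h>
+
+#define NBLOCKS 4
+#define MAXDOWNLINKS 4
+
+typedef struct
+{
+    bool isleaf;
+    bool empty;                     /* a leaf that contains no tuples */
+    int  downlinks[MAXDOWNLINKS];   /* child block numbers, -1 terminated */
+} ToyPage;
+
+int main(void)
+{
+    ToyPage pages[NBLOCKS] = {
+        {false, false, {1, 2, 3, -1}},  /* block 0: internal page  */
+        {true,  false, {-1}},           /* block 1: non-empty leaf */
+        {true,  true,  {-1}},           /* block 2: empty leaf     */
+        {true,  false, {-1}},           /* block 3: non-empty leaf */
+    };
+    bool empty_leaf[NBLOCKS] = {false};
+    int  internal[NBLOCKS];
+    int  ninternal = 0;
+
+    /* stage 1: a single pass over the index in physical block order */
+    for (int blk = 0; blk < NBLOCKS; blk++)
+    {
+        if (pages[blk].isleaf && pages[blk].empty)
+            empty_leaf[blk] = true;
+        else if (!pages[blk].isleaf)
+            internal[ninternal++] = blk;
+    }
+
+    /* stage 2: revisit only the memorized internal pages */
+    for (int i = 0; i < ninternal; i++)
+    {
+        ToyPage *p = &pages[internal[i]];
+
+        for (int j = 0; j < MAXDOWNLINKS && p->downlinks[j] != -1; j++)
+            if (empty_leaf[p->downlinks[j]])
+                printf("unlink empty leaf %d from internal page %d\n",
+                       p->downlinks[j], internal[i]);
+    }
+    return 0;
+}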
+
+We try to unlink any empty leaf pages from the tree, so that their space can
+be reused. In order to delete an empty page, its downlink must be removed from
+the parent. We scan all the internal pages, whose block numbers we memorized
+in the first stage, and look for downlinks to pages that we have memorized as
+being empty. Whenever we find one, we acquire locks on the parent and child
+pages and re-check that the child page is still empty. If it is, we remove the
+downlink, mark the child as deleted, and release the locks.
+
+The insertion algorithm would get confused if an internal page were completely
+empty. So we never delete the last child of an internal page, even if it's
+empty. Currently, we only support deleting leaf pages.
+
+This page deletion algorithm works on a best-effort basis. It might fail to
+find a downlink if a concurrent page split moved it after the first stage.
+In that case, we won't be able to remove all empty pages. That's OK: it's
+not expected to happen very often, and the next VACUUM will hopefully clean
+it up.
+
+When we have deleted a page, it's possible that an in-progress search will
+still descend onto the page if it saw the downlink before we removed it. The
+search will see that it is deleted and ignore it, but as long as that can
+happen, we cannot reuse the page. To "wait out" any in-progress searches, when
+a page is deleted, it's labeled with the current next-transaction counter
+value. The page is not recycled until that XID is no longer visible to
+anyone. That's much more conservative than necessary, but let's keep it
+simple.
+
+
+Authors:
+	Teodor Sigaev
+	Oleg Bartunov
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
new file mode 100644
index 0000000..0683f42
--- /dev/null
+++ b/src/backend/access/gist/gist.c
@@ -0,0 +1,1713 @@
+/*-------------------------------------------------------------------------
+ *
+ * gist.c
+ * interface routines for the postgres GiST index access method.
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gist/gist.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/gist_private.h" +#include "access/gistscan.h" +#include "catalog/pg_collation.h" +#include "commands/vacuum.h" +#include "miscadmin.h" +#include "nodes/execnodes.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "utils/builtins.h" +#include "utils/index_selfuncs.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +/* non-export function prototypes */ +static void gistfixsplit(GISTInsertState *state, GISTSTATE *giststate); +static bool gistinserttuple(GISTInsertState *state, GISTInsertStack *stack, + GISTSTATE *giststate, IndexTuple tuple, OffsetNumber oldoffnum); +static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack, + GISTSTATE *giststate, + IndexTuple *tuples, int ntup, OffsetNumber oldoffnum, + Buffer leftchild, Buffer rightchild, + bool unlockbuf, bool unlockleftchild); +static void gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack, + GISTSTATE *giststate, List *splitinfo, bool unlockbuf); +static void gistprunepage(Relation rel, Page page, Buffer buffer, + Relation heapRel); + + +#define ROTATEDIST(d) do { \ + SplitedPageLayout *tmp=(SplitedPageLayout*)palloc0(sizeof(SplitedPageLayout)); \ + tmp->block.blkno = InvalidBlockNumber; \ + tmp->buffer = InvalidBuffer; \ + tmp->next = (d); \ + (d)=tmp; \ +} while(0) + + +/* + * GiST handler function: return IndexAmRoutine with access method parameters + * and callbacks. + */ +Datum +gisthandler(PG_FUNCTION_ARGS) +{ + IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); + + amroutine->amstrategies = 0; + amroutine->amsupport = GISTNProcs; + amroutine->amoptsprocnum = GIST_OPTIONS_PROC; + amroutine->amcanorder = false; + amroutine->amcanorderbyop = true; + amroutine->amcanbackward = false; + amroutine->amcanunique = false; + amroutine->amcanmulticol = true; + amroutine->amoptionalkey = true; + amroutine->amsearcharray = false; + amroutine->amsearchnulls = true; + amroutine->amstorage = true; + amroutine->amclusterable = true; + amroutine->ampredlocks = true; + amroutine->amcanparallel = false; + amroutine->amcaninclude = true; + amroutine->amusemaintenanceworkmem = false; + amroutine->amparallelvacuumoptions = + VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP; + amroutine->amkeytype = InvalidOid; + + amroutine->ambuild = gistbuild; + amroutine->ambuildempty = gistbuildempty; + amroutine->aminsert = gistinsert; + amroutine->ambulkdelete = gistbulkdelete; + amroutine->amvacuumcleanup = gistvacuumcleanup; + amroutine->amcanreturn = gistcanreturn; + amroutine->amcostestimate = gistcostestimate; + amroutine->amoptions = gistoptions; + amroutine->amproperty = gistproperty; + amroutine->ambuildphasename = NULL; + amroutine->amvalidate = gistvalidate; + amroutine->amadjustmembers = gistadjustmembers; + amroutine->ambeginscan = gistbeginscan; + amroutine->amrescan = gistrescan; + amroutine->amgettuple = gistgettuple; + amroutine->amgetbitmap = gistgetbitmap; + amroutine->amendscan = gistendscan; + amroutine->ammarkpos = NULL; + amroutine->amrestrpos = NULL; + amroutine->amestimateparallelscan = NULL; + amroutine->aminitparallelscan = NULL; + amroutine->amparallelrescan = NULL; + + PG_RETURN_POINTER(amroutine); +} + +/* + * Create 
and return a temporary memory context for use by GiST. We + * _always_ invoke user-provided methods in a temporary memory + * context, so that memory leaks in those functions cannot cause + * problems. Also, we use some additional temporary contexts in the + * GiST code itself, to avoid the need to do some awkward manual + * memory management. + */ +MemoryContext +createTempGistContext(void) +{ + return AllocSetContextCreate(CurrentMemoryContext, + "GiST temporary context", + ALLOCSET_DEFAULT_SIZES); +} + +/* + * gistbuildempty() -- build an empty gist index in the initialization fork + */ +void +gistbuildempty(Relation index) +{ + Buffer buffer; + + /* Initialize the root page */ + buffer = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + /* Initialize and xlog buffer */ + START_CRIT_SECTION(); + GISTInitBuffer(buffer, F_LEAF); + MarkBufferDirty(buffer); + log_newpage_buffer(buffer, true); + END_CRIT_SECTION(); + + /* Unlock and release the buffer */ + UnlockReleaseBuffer(buffer); +} + +/* + * gistinsert -- wrapper for GiST tuple insertion. + * + * This is the public interface routine for tuple insertion in GiSTs. + * It doesn't do any work; just locks the relation and passes the buck. + */ +bool +gistinsert(Relation r, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + IndexInfo *indexInfo) +{ + GISTSTATE *giststate = (GISTSTATE *) indexInfo->ii_AmCache; + IndexTuple itup; + MemoryContext oldCxt; + + /* Initialize GISTSTATE cache if first call in this statement */ + if (giststate == NULL) + { + oldCxt = MemoryContextSwitchTo(indexInfo->ii_Context); + giststate = initGISTstate(r); + giststate->tempCxt = createTempGistContext(); + indexInfo->ii_AmCache = (void *) giststate; + MemoryContextSwitchTo(oldCxt); + } + + oldCxt = MemoryContextSwitchTo(giststate->tempCxt); + + itup = gistFormTuple(giststate, r, + values, isnull, true /* size is currently bogus */ ); + itup->t_tid = *ht_ctid; + + gistdoinsert(r, itup, 0, giststate, heapRel, false); + + /* cleanup */ + MemoryContextSwitchTo(oldCxt); + MemoryContextReset(giststate->tempCxt); + + return false; +} + + +/* + * Place tuples from 'itup' to 'buffer'. If 'oldoffnum' is valid, the tuple + * at that offset is atomically removed along with inserting the new tuples. + * This is used to replace a tuple with a new one. + * + * If 'leftchildbuf' is valid, we're inserting the downlink for the page + * to the right of 'leftchildbuf', or updating the downlink for 'leftchildbuf'. + * F_FOLLOW_RIGHT flag on 'leftchildbuf' is cleared and NSN is set. + * + * If 'markfollowright' is true and the page is split, the left child is + * marked with F_FOLLOW_RIGHT flag. That is the normal case. During buffered + * index build, however, there is no concurrent access and the page splitting + * is done in a slightly simpler fashion, and false is passed. + * + * If there is not enough room on the page, it is split. All the split + * pages are kept pinned and locked and returned in *splitinfo, the caller + * is responsible for inserting the downlinks for them. However, if + * 'buffer' is the root page and it needs to be split, gistplacetopage() + * performs the split as one atomic operation, and *splitinfo is set to NIL. + * In that case, we continue to hold the root page locked, and the child + * pages are released; note that new tuple(s) are *not* on the root page + * but in one of the new child pages. 
+ * + * If 'newblkno' is not NULL, returns the block number of page the first + * new/updated tuple was inserted to. Usually it's the given page, but could + * be its right sibling if the page was split. + * + * Returns 'true' if the page was split, 'false' otherwise. + */ +bool +gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, + Buffer buffer, + IndexTuple *itup, int ntup, OffsetNumber oldoffnum, + BlockNumber *newblkno, + Buffer leftchildbuf, + List **splitinfo, + bool markfollowright, + Relation heapRel, + bool is_build) +{ + BlockNumber blkno = BufferGetBlockNumber(buffer); + Page page = BufferGetPage(buffer); + bool is_leaf = (GistPageIsLeaf(page)) ? true : false; + XLogRecPtr recptr; + int i; + bool is_split; + + /* + * Refuse to modify a page that's incompletely split. This should not + * happen because we finish any incomplete splits while we walk down the + * tree. However, it's remotely possible that another concurrent inserter + * splits a parent page, and errors out before completing the split. We + * will just throw an error in that case, and leave any split we had in + * progress unfinished too. The next insert that comes along will clean up + * the mess. + */ + if (GistFollowRight(page)) + elog(ERROR, "concurrent GiST page split was incomplete"); + + /* should never try to insert to a deleted page */ + Assert(!GistPageIsDeleted(page)); + + *splitinfo = NIL; + + /* + * if isupdate, remove old key: This node's key has been modified, either + * because a child split occurred or because we needed to adjust our key + * for an insert in a child node. Therefore, remove the old version of + * this node's key. + * + * for WAL replay, in the non-split case we handle this by setting up a + * one-element todelete array; in the split case, it's handled implicitly + * because the tuple vector passed to gistSplit won't include this tuple. + */ + is_split = gistnospace(page, itup, ntup, oldoffnum, freespace); + + /* + * If leaf page is full, try at first to delete dead tuples. And then + * check again. + */ + if (is_split && GistPageIsLeaf(page) && GistPageHasGarbage(page)) + { + gistprunepage(rel, page, buffer, heapRel); + is_split = gistnospace(page, itup, ntup, oldoffnum, freespace); + } + + if (is_split) + { + /* no space for insertion */ + IndexTuple *itvec; + int tlen; + SplitedPageLayout *dist = NULL, + *ptr; + BlockNumber oldrlink = InvalidBlockNumber; + GistNSN oldnsn = 0; + SplitedPageLayout rootpg; + bool is_rootsplit; + int npage; + + is_rootsplit = (blkno == GIST_ROOT_BLKNO); + + /* + * Form index tuples vector to split. If we're replacing an old tuple, + * remove the old version from the vector. + */ + itvec = gistextractpage(page, &tlen); + if (OffsetNumberIsValid(oldoffnum)) + { + /* on inner page we should remove old tuple */ + int pos = oldoffnum - FirstOffsetNumber; + + tlen--; + if (pos != tlen) + memmove(itvec + pos, itvec + pos + 1, sizeof(IndexTuple) * (tlen - pos)); + } + itvec = gistjoinvector(itvec, &tlen, itup, ntup); + dist = gistSplit(rel, page, itvec, tlen, giststate); + + /* + * Check that split didn't produce too many pages. + */ + npage = 0; + for (ptr = dist; ptr; ptr = ptr->next) + npage++; + /* in a root split, we'll add one more page to the list below */ + if (is_rootsplit) + npage++; + if (npage > GIST_MAX_SPLIT_PAGES) + elog(ERROR, "GiST page split into too many halves (%d, maximum %d)", + npage, GIST_MAX_SPLIT_PAGES); + + /* + * Set up pages to work with. Allocate new buffers for all but the + * leftmost page. 
The original page becomes the new leftmost page, and + * is just replaced with the new contents. + * + * For a root-split, allocate new buffers for all child pages, the + * original page is overwritten with new root page containing + * downlinks to the new child pages. + */ + ptr = dist; + if (!is_rootsplit) + { + /* save old rightlink and NSN */ + oldrlink = GistPageGetOpaque(page)->rightlink; + oldnsn = GistPageGetNSN(page); + + dist->buffer = buffer; + dist->block.blkno = BufferGetBlockNumber(buffer); + dist->page = PageGetTempPageCopySpecial(BufferGetPage(buffer)); + + /* clean all flags except F_LEAF */ + GistPageGetOpaque(dist->page)->flags = (is_leaf) ? F_LEAF : 0; + + ptr = ptr->next; + } + for (; ptr; ptr = ptr->next) + { + /* Allocate new page */ + ptr->buffer = gistNewBuffer(rel); + GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0); + ptr->page = BufferGetPage(ptr->buffer); + ptr->block.blkno = BufferGetBlockNumber(ptr->buffer); + PredicateLockPageSplit(rel, + BufferGetBlockNumber(buffer), + BufferGetBlockNumber(ptr->buffer)); + } + + /* + * Now that we know which blocks the new pages go to, set up downlink + * tuples to point to them. + */ + for (ptr = dist; ptr; ptr = ptr->next) + { + ItemPointerSetBlockNumber(&(ptr->itup->t_tid), ptr->block.blkno); + GistTupleSetValid(ptr->itup); + } + + /* + * If this is a root split, we construct the new root page with the + * downlinks here directly, instead of requiring the caller to insert + * them. Add the new root page to the list along with the child pages. + */ + if (is_rootsplit) + { + IndexTuple *downlinks; + int ndownlinks = 0; + int i; + + rootpg.buffer = buffer; + rootpg.page = PageGetTempPageCopySpecial(BufferGetPage(rootpg.buffer)); + GistPageGetOpaque(rootpg.page)->flags = 0; + + /* Prepare a vector of all the downlinks */ + for (ptr = dist; ptr; ptr = ptr->next) + ndownlinks++; + downlinks = palloc(sizeof(IndexTuple) * ndownlinks); + for (i = 0, ptr = dist; ptr; ptr = ptr->next) + downlinks[i++] = ptr->itup; + + rootpg.block.blkno = GIST_ROOT_BLKNO; + rootpg.block.num = ndownlinks; + rootpg.list = gistfillitupvec(downlinks, ndownlinks, + &(rootpg.lenlist)); + rootpg.itup = NULL; + + rootpg.next = dist; + dist = &rootpg; + } + else + { + /* Prepare split-info to be returned to caller */ + for (ptr = dist; ptr; ptr = ptr->next) + { + GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo)); + + si->buf = ptr->buffer; + si->downlink = ptr->itup; + *splitinfo = lappend(*splitinfo, si); + } + } + + /* + * Fill all pages. All the pages are new, ie. freshly allocated empty + * pages, or a temporary copy of the old page. + */ + for (ptr = dist; ptr; ptr = ptr->next) + { + char *data = (char *) (ptr->list); + + for (i = 0; i < ptr->block.num; i++) + { + IndexTuple thistup = (IndexTuple) data; + + if (PageAddItem(ptr->page, (Item) data, IndexTupleSize(thistup), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(rel)); + + /* + * If this is the first inserted/updated tuple, let the caller + * know which page it landed on. 
+ */ + if (newblkno && ItemPointerEquals(&thistup->t_tid, &(*itup)->t_tid)) + *newblkno = ptr->block.blkno; + + data += IndexTupleSize(thistup); + } + + /* Set up rightlinks */ + if (ptr->next && ptr->block.blkno != GIST_ROOT_BLKNO) + GistPageGetOpaque(ptr->page)->rightlink = + ptr->next->block.blkno; + else + GistPageGetOpaque(ptr->page)->rightlink = oldrlink; + + /* + * Mark the all but the right-most page with the follow-right + * flag. It will be cleared as soon as the downlink is inserted + * into the parent, but this ensures that if we error out before + * that, the index is still consistent. (in buffering build mode, + * any error will abort the index build anyway, so this is not + * needed.) + */ + if (ptr->next && !is_rootsplit && markfollowright) + GistMarkFollowRight(ptr->page); + else + GistClearFollowRight(ptr->page); + + /* + * Copy the NSN of the original page to all pages. The + * F_FOLLOW_RIGHT flags ensure that scans will follow the + * rightlinks until the downlinks are inserted. + */ + GistPageSetNSN(ptr->page, oldnsn); + } + + /* + * gistXLogSplit() needs to WAL log a lot of pages, prepare WAL + * insertion for that. NB: The number of pages and data segments + * specified here must match the calculations in gistXLogSplit()! + */ + if (!is_build && RelationNeedsWAL(rel)) + XLogEnsureRecordSpace(npage, 1 + npage * 2); + + START_CRIT_SECTION(); + + /* + * Must mark buffers dirty before XLogInsert, even though we'll still + * be changing their opaque fields below. + */ + for (ptr = dist; ptr; ptr = ptr->next) + MarkBufferDirty(ptr->buffer); + if (BufferIsValid(leftchildbuf)) + MarkBufferDirty(leftchildbuf); + + /* + * The first page in the chain was a temporary working copy meant to + * replace the old page. Copy it over the old page. + */ + PageRestoreTempPage(dist->page, BufferGetPage(dist->buffer)); + dist->page = BufferGetPage(dist->buffer); + + /* + * Write the WAL record. + * + * If we're building a new index, however, we don't WAL-log changes + * yet. The LSN-NSN interlock between parent and child requires that + * LSNs never move backwards, so set the LSNs to a value that's + * smaller than any real or fake unlogged LSN that might be generated + * later. (There can't be any concurrent scans during index build, so + * we don't need to be able to detect concurrent splits yet.) + */ + if (is_build) + recptr = GistBuildLSN; + else + { + if (RelationNeedsWAL(rel)) + recptr = gistXLogSplit(is_leaf, + dist, oldrlink, oldnsn, leftchildbuf, + markfollowright); + else + recptr = gistGetFakeLSN(rel); + } + + for (ptr = dist; ptr; ptr = ptr->next) + PageSetLSN(ptr->page, recptr); + + /* + * Return the new child buffers to the caller. + * + * If this was a root split, we've already inserted the downlink + * pointers, in the form of a new root page. Therefore we can release + * all the new buffers, and keep just the root page locked. + */ + if (is_rootsplit) + { + for (ptr = dist->next; ptr; ptr = ptr->next) + UnlockReleaseBuffer(ptr->buffer); + } + } + else + { + /* + * Enough space. We always get here if ntup==0. + */ + START_CRIT_SECTION(); + + /* + * Delete old tuple if any, then insert new tuple(s) if any. If + * possible, use the fast path of PageIndexTupleOverwrite. 
+ */ + if (OffsetNumberIsValid(oldoffnum)) + { + if (ntup == 1) + { + /* One-for-one replacement, so use PageIndexTupleOverwrite */ + if (!PageIndexTupleOverwrite(page, oldoffnum, (Item) *itup, + IndexTupleSize(*itup))) + elog(ERROR, "failed to add item to index page in \"%s\"", + RelationGetRelationName(rel)); + } + else + { + /* Delete old, then append new tuple(s) to page */ + PageIndexTupleDelete(page, oldoffnum); + gistfillbuffer(page, itup, ntup, InvalidOffsetNumber); + } + } + else + { + /* Just append new tuples at the end of the page */ + gistfillbuffer(page, itup, ntup, InvalidOffsetNumber); + } + + MarkBufferDirty(buffer); + + if (BufferIsValid(leftchildbuf)) + MarkBufferDirty(leftchildbuf); + + if (is_build) + recptr = GistBuildLSN; + else + { + if (RelationNeedsWAL(rel)) + { + OffsetNumber ndeloffs = 0, + deloffs[1]; + + if (OffsetNumberIsValid(oldoffnum)) + { + deloffs[0] = oldoffnum; + ndeloffs = 1; + } + + recptr = gistXLogUpdate(buffer, + deloffs, ndeloffs, itup, ntup, + leftchildbuf); + } + else + recptr = gistGetFakeLSN(rel); + } + PageSetLSN(page, recptr); + + if (newblkno) + *newblkno = blkno; + } + + /* + * If we inserted the downlink for a child page, set NSN and clear + * F_FOLLOW_RIGHT flag on the left child, so that concurrent scans know to + * follow the rightlink if and only if they looked at the parent page + * before we inserted the downlink. + * + * Note that we do this *after* writing the WAL record. That means that + * the possible full page image in the WAL record does not include these + * changes, and they must be replayed even if the page is restored from + * the full page image. There's a chicken-and-egg problem: if we updated + * the child pages first, we wouldn't know the recptr of the WAL record + * we're about to write. + */ + if (BufferIsValid(leftchildbuf)) + { + Page leftpg = BufferGetPage(leftchildbuf); + + GistPageSetNSN(leftpg, recptr); + GistClearFollowRight(leftpg); + + PageSetLSN(leftpg, recptr); + } + + END_CRIT_SECTION(); + + return is_split; +} + +/* + * Workhouse routine for doing insertion into a GiST index. Note that + * this routine assumes it is invoked in a short-lived memory context, + * so it does not bother releasing palloc'd allocations. + */ +void +gistdoinsert(Relation r, IndexTuple itup, Size freespace, + GISTSTATE *giststate, Relation heapRel, bool is_build) +{ + ItemId iid; + IndexTuple idxtuple; + GISTInsertStack firststack; + GISTInsertStack *stack; + GISTInsertState state; + bool xlocked = false; + + memset(&state, 0, sizeof(GISTInsertState)); + state.freespace = freespace; + state.r = r; + state.heapRel = heapRel; + state.is_build = is_build; + + /* Start from the root */ + firststack.blkno = GIST_ROOT_BLKNO; + firststack.lsn = 0; + firststack.retry_from_parent = false; + firststack.parent = NULL; + firststack.downlinkoffnum = InvalidOffsetNumber; + state.stack = stack = &firststack; + + /* + * Walk down along the path of smallest penalty, updating the parent + * pointers with the key we're inserting as we go. If we crash in the + * middle, the tree is consistent, although the possible parent updates + * were a waste. + */ + for (;;) + { + /* + * If we split an internal page while descending the tree, we have to + * retry at the parent. (Normally, the LSN-NSN interlock below would + * also catch this and cause us to retry. But LSNs are not updated + * during index build.) 
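As an aside on the LSN-NSN interlock mentioned in the comment above: the split-detection test itself is just a comparison. The following standalone sketch (plain C with a stand-in LSN type; none of it is taken from this patch) spells out the condition that makes the descent back up and retry from the parent:

#include <stdbool.h>
#include <stdint.h>

typedef uint64_t SketchLSN;		/* stand-in for an LSN/NSN value */

/*
 * The parent's LSN is remembered when the parent page is read; a child's NSN
 * is stamped when the child is split.  If the child's NSN is newer than the
 * remembered parent LSN, the split happened after the parent was read, so
 * the cached downlink may be stale and the descent must back up and retry
 * from the parent.
 */
static bool
sketch_concurrent_child_split(SketchLSN parent_lsn_as_read, SketchLSN child_nsn)
{
	return parent_lsn_as_read < child_nsn;
}

int
main(void)
{
	/* parent read at LSN 100, child split stamped with NSN 120 => retry */
	return sketch_concurrent_child_split(100, 120) ? 0 : 1;
}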
+ */ + while (stack->retry_from_parent) + { + if (xlocked) + LockBuffer(stack->buffer, GIST_UNLOCK); + xlocked = false; + ReleaseBuffer(stack->buffer); + state.stack = stack = stack->parent; + } + + if (XLogRecPtrIsInvalid(stack->lsn)) + stack->buffer = ReadBuffer(state.r, stack->blkno); + + /* + * Be optimistic and grab shared lock first. Swap it for an exclusive + * lock later if we need to update the page. + */ + if (!xlocked) + { + LockBuffer(stack->buffer, GIST_SHARE); + gistcheckpage(state.r, stack->buffer); + } + + stack->page = (Page) BufferGetPage(stack->buffer); + stack->lsn = xlocked ? + PageGetLSN(stack->page) : BufferGetLSNAtomic(stack->buffer); + Assert(!RelationNeedsWAL(state.r) || !XLogRecPtrIsInvalid(stack->lsn)); + + /* + * If this page was split but the downlink was never inserted to the + * parent because the inserting backend crashed before doing that, fix + * that now. + */ + if (GistFollowRight(stack->page)) + { + if (!xlocked) + { + LockBuffer(stack->buffer, GIST_UNLOCK); + LockBuffer(stack->buffer, GIST_EXCLUSIVE); + xlocked = true; + /* someone might've completed the split when we unlocked */ + if (!GistFollowRight(stack->page)) + continue; + } + gistfixsplit(&state, giststate); + + UnlockReleaseBuffer(stack->buffer); + xlocked = false; + state.stack = stack = stack->parent; + continue; + } + + if ((stack->blkno != GIST_ROOT_BLKNO && + stack->parent->lsn < GistPageGetNSN(stack->page)) || + GistPageIsDeleted(stack->page)) + { + /* + * Concurrent split or page deletion detected. There's no + * guarantee that the downlink for this page is consistent with + * the tuple we're inserting anymore, so go back to parent and + * rechoose the best child. + */ + UnlockReleaseBuffer(stack->buffer); + xlocked = false; + state.stack = stack = stack->parent; + continue; + } + + if (!GistPageIsLeaf(stack->page)) + { + /* + * This is an internal page so continue to walk down the tree. + * Find the child node that has the minimum insertion penalty. + */ + BlockNumber childblkno; + IndexTuple newtup; + GISTInsertStack *item; + OffsetNumber downlinkoffnum; + + downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate); + iid = PageGetItemId(stack->page, downlinkoffnum); + idxtuple = (IndexTuple) PageGetItem(stack->page, iid); + childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); + + /* + * Check that it's not a leftover invalid tuple from pre-9.1 + */ + if (GistTupleIsInvalid(idxtuple)) + ereport(ERROR, + (errmsg("index \"%s\" contains an inner tuple marked as invalid", + RelationGetRelationName(r)), + errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."), + errhint("Please REINDEX it."))); + + /* + * Check that the key representing the target child node is + * consistent with the key we're inserting. Update it if it's not. + */ + newtup = gistgetadjusted(state.r, idxtuple, itup, giststate); + if (newtup) + { + /* + * Swap shared lock for an exclusive one. Beware, the page may + * change while we unlock/lock the page... + */ + if (!xlocked) + { + LockBuffer(stack->buffer, GIST_UNLOCK); + LockBuffer(stack->buffer, GIST_EXCLUSIVE); + xlocked = true; + stack->page = (Page) BufferGetPage(stack->buffer); + + if (PageGetLSN(stack->page) != stack->lsn) + { + /* the page was changed while we unlocked it, retry */ + continue; + } + } + + /* + * Update the tuple. + * + * We still hold the lock after gistinserttuple(), but it + * might have to split the page to make the updated tuple fit. 
+ * In that case the updated tuple might migrate to the other + * half of the split, so we have to go back to the parent and + * descend back to the half that's a better fit for the new + * tuple. + */ + if (gistinserttuple(&state, stack, giststate, newtup, + downlinkoffnum)) + { + /* + * If this was a root split, the root page continues to be + * the parent and the updated tuple went to one of the + * child pages, so we just need to retry from the root + * page. + */ + if (stack->blkno != GIST_ROOT_BLKNO) + { + UnlockReleaseBuffer(stack->buffer); + xlocked = false; + state.stack = stack = stack->parent; + } + continue; + } + } + LockBuffer(stack->buffer, GIST_UNLOCK); + xlocked = false; + + /* descend to the chosen child */ + item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); + item->blkno = childblkno; + item->parent = stack; + item->downlinkoffnum = downlinkoffnum; + state.stack = stack = item; + } + else + { + /* + * Leaf page. Insert the new key. We've already updated all the + * parents on the way down, but we might have to split the page if + * it doesn't fit. gistinserttuple() will take care of that. + */ + + /* + * Swap shared lock for an exclusive one. Be careful, the page may + * change while we unlock/lock the page... + */ + if (!xlocked) + { + LockBuffer(stack->buffer, GIST_UNLOCK); + LockBuffer(stack->buffer, GIST_EXCLUSIVE); + xlocked = true; + stack->page = (Page) BufferGetPage(stack->buffer); + stack->lsn = PageGetLSN(stack->page); + + if (stack->blkno == GIST_ROOT_BLKNO) + { + /* + * the only page that can become inner instead of leaf is + * the root page, so for root we should recheck it + */ + if (!GistPageIsLeaf(stack->page)) + { + /* + * very rare situation: during unlock/lock index with + * number of pages = 1 was increased + */ + LockBuffer(stack->buffer, GIST_UNLOCK); + xlocked = false; + continue; + } + + /* + * we don't need to check root split, because checking + * leaf/inner is enough to recognize split for root + */ + } + else if ((GistFollowRight(stack->page) || + stack->parent->lsn < GistPageGetNSN(stack->page)) || + GistPageIsDeleted(stack->page)) + { + /* + * The page was split or deleted while we momentarily + * unlocked the page. Go back to parent. + */ + UnlockReleaseBuffer(stack->buffer); + xlocked = false; + state.stack = stack = stack->parent; + continue; + } + } + + /* now state.stack->(page, buffer and blkno) points to leaf page */ + + gistinserttuple(&state, stack, giststate, itup, + InvalidOffsetNumber); + LockBuffer(stack->buffer, GIST_UNLOCK); + + /* Release any pins we might still hold before exiting */ + for (; stack; stack = stack->parent) + ReleaseBuffer(stack->buffer); + break; + } + } +} + +/* + * Traverse the tree to find path from root page to specified "child" block. + * + * returns a new insertion stack, starting from the parent of "child", up + * to the root. *downlinkoffnum is set to the offset of the downlink in the + * direct parent of child. + * + * To prevent deadlocks, this should lock only one page at a time. 
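The parent re-finding described here is, at its core, a breadth-first search from the root that remembers each visited node's parent. A toy standalone sketch of that shape, using an invented adjacency table instead of GiST pages and ignoring locking, rightlinks and concurrency entirely, might look like this:

#include <stdio.h>
#include <stdlib.h>

/*
 * Toy model: block 0 is the root; children[b] lists the child blocks of
 * block b (-1 means no child).  Real pages, buffers and locks are left out.
 */
static const int children[4][2] = {
	{1, 2},						/* block 0 (root) */
	{3, -1},					/* block 1 */
	{-1, -1},					/* block 2 (leaf) */
	{-1, -1},					/* block 3 (leaf) */
};

typedef struct SketchStack
{
	int			blkno;
	struct SketchStack *parent;
} SketchStack;

/* breadth-first search; returns the parent chain of 'child', up to the root */
static SketchStack *
sketch_find_path(int child)
{
	SketchStack *queue[16];
	int			head = 0,
				tail = 0;
	SketchStack *root = calloc(1, sizeof(SketchStack));

	root->blkno = 0;
	queue[tail++] = root;

	while (head < tail)
	{
		SketchStack *top = queue[head++];

		for (int i = 0; i < 2; i++)
		{
			int			blkno = children[top->blkno][i];

			if (blkno < 0)
				continue;
			if (blkno == child)
				return top;		/* found: 'top' is the direct parent */

			SketchStack *ptr = calloc(1, sizeof(SketchStack));

			ptr->blkno = blkno;
			ptr->parent = top;
			queue[tail++] = ptr;
		}
	}
	return NULL;				/* not found */
}

int
main(void)
{
	for (SketchStack *p = sketch_find_path(3); p; p = p->parent)
		printf("ancestor block %d\n", p->blkno);
	return 0;					/* allocations are leaked; fine for a sketch */
}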
+ */ +static GISTInsertStack * +gistFindPath(Relation r, BlockNumber child, OffsetNumber *downlinkoffnum) +{ + Page page; + Buffer buffer; + OffsetNumber i, + maxoff; + ItemId iid; + IndexTuple idxtuple; + List *fifo; + GISTInsertStack *top, + *ptr; + BlockNumber blkno; + + top = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); + top->blkno = GIST_ROOT_BLKNO; + top->downlinkoffnum = InvalidOffsetNumber; + + fifo = list_make1(top); + while (fifo != NIL) + { + /* Get next page to visit */ + top = linitial(fifo); + fifo = list_delete_first(fifo); + + buffer = ReadBuffer(r, top->blkno); + LockBuffer(buffer, GIST_SHARE); + gistcheckpage(r, buffer); + page = (Page) BufferGetPage(buffer); + + if (GistPageIsLeaf(page)) + { + /* + * Because we scan the index top-down, all the rest of the pages + * in the queue must be leaf pages as well. + */ + UnlockReleaseBuffer(buffer); + break; + } + + /* currently, internal pages are never deleted */ + Assert(!GistPageIsDeleted(page)); + + top->lsn = BufferGetLSNAtomic(buffer); + + /* + * If F_FOLLOW_RIGHT is set, the page to the right doesn't have a + * downlink. This should not normally happen.. + */ + if (GistFollowRight(page)) + elog(ERROR, "concurrent GiST page split was incomplete"); + + if (top->parent && top->parent->lsn < GistPageGetNSN(page) && + GistPageGetOpaque(page)->rightlink != InvalidBlockNumber /* sanity check */ ) + { + /* + * Page was split while we looked elsewhere. We didn't see the + * downlink to the right page when we scanned the parent, so add + * it to the queue now. + * + * Put the right page ahead of the queue, so that we visit it + * next. That's important, because if this is the lowest internal + * level, just above leaves, we might already have queued up some + * leaf pages, and we assume that there can't be any non-leaf + * pages behind leaf pages. + */ + ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); + ptr->blkno = GistPageGetOpaque(page)->rightlink; + ptr->downlinkoffnum = InvalidOffsetNumber; + ptr->parent = top->parent; + + fifo = lcons(ptr, fifo); + } + + maxoff = PageGetMaxOffsetNumber(page); + + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + iid = PageGetItemId(page, i); + idxtuple = (IndexTuple) PageGetItem(page, iid); + blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); + if (blkno == child) + { + /* Found it! */ + UnlockReleaseBuffer(buffer); + *downlinkoffnum = i; + return top; + } + else + { + /* Append this child to the list of pages to visit later */ + ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); + ptr->blkno = blkno; + ptr->downlinkoffnum = i; + ptr->parent = top; + + fifo = lappend(fifo, ptr); + } + } + + UnlockReleaseBuffer(buffer); + } + + elog(ERROR, "failed to re-find parent of a page in index \"%s\", block %u", + RelationGetRelationName(r), child); + return NULL; /* keep compiler quiet */ +} + +/* + * Updates the stack so that child->parent is the correct parent of the + * child. child->parent must be exclusively locked on entry, and will + * remain so at exit, but it might not be the same page anymore. 
+ */
+static void
+gistFindCorrectParent(Relation r, GISTInsertStack *child)
+{
+ GISTInsertStack *parent = child->parent;
+
+ gistcheckpage(r, parent->buffer);
+ parent->page = (Page) BufferGetPage(parent->buffer);
+
+ /* here we don't need to distinguish between split and page update */
+ if (child->downlinkoffnum == InvalidOffsetNumber ||
+ parent->lsn != PageGetLSN(parent->page))
+ {
+ /* the parent has changed; follow its rightlinks until we find the child */
+ OffsetNumber i,
+ maxoff;
+ ItemId iid;
+ IndexTuple idxtuple;
+ GISTInsertStack *ptr;
+
+ while (true)
+ {
+ maxoff = PageGetMaxOffsetNumber(parent->page);
+ for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+ {
+ iid = PageGetItemId(parent->page, i);
+ idxtuple = (IndexTuple) PageGetItem(parent->page, iid);
+ if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno)
+ {
+ /* found it */
+ child->downlinkoffnum = i;
+ return;
+ }
+ }
+
+ parent->blkno = GistPageGetOpaque(parent->page)->rightlink;
+ UnlockReleaseBuffer(parent->buffer);
+ if (parent->blkno == InvalidBlockNumber)
+ {
+ /*
+ * End of chain and still didn't find the parent. This is a very
+ * rare situation that can only happen when the root was split.
+ */
+ break;
+ }
+ parent->buffer = ReadBuffer(r, parent->blkno);
+ LockBuffer(parent->buffer, GIST_EXCLUSIVE);
+ gistcheckpage(r, parent->buffer);
+ parent->page = (Page) BufferGetPage(parent->buffer);
+ }
+
+ /*
+ * Worst case: we have to search the whole tree to re-find the parent.
+ * But before doing that, release all the old parent buffers.
+ */
+
+ ptr = child->parent->parent; /* child->parent already released
+ * above */
+ while (ptr)
+ {
+ ReleaseBuffer(ptr->buffer);
+ ptr = ptr->parent;
+ }
+
+ /* ok, find the new path */
+ ptr = parent = gistFindPath(r, child->blkno, &child->downlinkoffnum);
+
+ /* read all buffers as expected by caller */
+ /* note we don't lock them or gistcheckpage them here! */
+ while (ptr)
+ {
+ ptr->buffer = ReadBuffer(r, ptr->blkno);
+ ptr->page = (Page) BufferGetPage(ptr->buffer);
+ ptr = ptr->parent;
+ }
+
+ /* install the new chain of parents into the stack */
+ child->parent = parent;
+
+ /* make a recursive call to do the normal processing */
+ LockBuffer(child->parent->buffer, GIST_EXCLUSIVE);
+ gistFindCorrectParent(r, child);
+ }
+}
+
+/*
+ * Form a downlink pointer for the page in 'buf'.
+ */
+static IndexTuple
+gistformdownlink(Relation rel, Buffer buf, GISTSTATE *giststate,
+ GISTInsertStack *stack)
+{
+ Page page = BufferGetPage(buf);
+ OffsetNumber maxoff;
+ OffsetNumber offset;
+ IndexTuple downlink = NULL;
+
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (offset = FirstOffsetNumber; offset <= maxoff; offset = OffsetNumberNext(offset))
+ {
+ IndexTuple ituple = (IndexTuple)
+ PageGetItem(page, PageGetItemId(page, offset));
+
+ if (downlink == NULL)
+ downlink = CopyIndexTuple(ituple);
+ else
+ {
+ IndexTuple newdownlink;
+
+ newdownlink = gistgetadjusted(rel, downlink, ituple,
+ giststate);
+ if (newdownlink)
+ downlink = newdownlink;
+ }
+ }
+
+ /*
+ * If the page is completely empty, we can't form a meaningful downlink
+ * for it. But we have to insert a downlink for the page. Any key will do,
+ * as long as it's consistent with the downlink of the parent page, so that
+ * we can legally insert it into the parent. A minimal one that matches as
+ * few scans as possible would be best, to keep scans from doing useless
+ * work, but we don't know how to construct that. So we just use the
+ * downlink of the original page that was split - that's as far from
+ * optimal as it can get, but it will do.
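Conceptually, the loop above folds a union ("adjust") operation over every tuple on the page so that the resulting downlink covers them all. A minimal standalone illustration of that fold, using invented integer ranges instead of index tuples and a hand-written union step in place of gistgetadjusted(), could look like this:

#include <stdio.h>

typedef struct
{
	int			lower;
	int			upper;
} SketchRange;

/* enlarge 'dl' just enough to also cover 'entry' (the adjust/union step) */
static void
sketch_adjust(SketchRange *dl, const SketchRange *entry)
{
	if (entry->lower < dl->lower)
		dl->lower = entry->lower;
	if (entry->upper > dl->upper)
		dl->upper = entry->upper;
}

int
main(void)
{
	SketchRange entries[] = {{4, 9}, {1, 3}, {7, 12}};
	SketchRange downlink = entries[0];

	for (int i = 1; i < 3; i++)
		sketch_adjust(&downlink, &entries[i]);

	printf("downlink covers [%d, %d]\n", downlink.lower, downlink.upper);
	return 0;
}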
+ */ + if (!downlink) + { + ItemId iid; + + LockBuffer(stack->parent->buffer, GIST_EXCLUSIVE); + gistFindCorrectParent(rel, stack); + iid = PageGetItemId(stack->parent->page, stack->downlinkoffnum); + downlink = (IndexTuple) PageGetItem(stack->parent->page, iid); + downlink = CopyIndexTuple(downlink); + LockBuffer(stack->parent->buffer, GIST_UNLOCK); + } + + ItemPointerSetBlockNumber(&(downlink->t_tid), BufferGetBlockNumber(buf)); + GistTupleSetValid(downlink); + + return downlink; +} + + +/* + * Complete the incomplete split of state->stack->page. + */ +static void +gistfixsplit(GISTInsertState *state, GISTSTATE *giststate) +{ + GISTInsertStack *stack = state->stack; + Buffer buf; + Page page; + List *splitinfo = NIL; + + ereport(LOG, + (errmsg("fixing incomplete split in index \"%s\", block %u", + RelationGetRelationName(state->r), stack->blkno))); + + Assert(GistFollowRight(stack->page)); + Assert(OffsetNumberIsValid(stack->downlinkoffnum)); + + buf = stack->buffer; + + /* + * Read the chain of split pages, following the rightlinks. Construct a + * downlink tuple for each page. + */ + for (;;) + { + GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo)); + IndexTuple downlink; + + page = BufferGetPage(buf); + + /* Form the new downlink tuples to insert to parent */ + downlink = gistformdownlink(state->r, buf, giststate, stack); + + si->buf = buf; + si->downlink = downlink; + + splitinfo = lappend(splitinfo, si); + + if (GistFollowRight(page)) + { + /* lock next page */ + buf = ReadBuffer(state->r, GistPageGetOpaque(page)->rightlink); + LockBuffer(buf, GIST_EXCLUSIVE); + } + else + break; + } + + /* Insert the downlinks */ + gistfinishsplit(state, stack, giststate, splitinfo, false); +} + +/* + * Insert or replace a tuple in stack->buffer. If 'oldoffnum' is valid, the + * tuple at 'oldoffnum' is replaced, otherwise the tuple is inserted as new. + * 'stack' represents the path from the root to the page being updated. + * + * The caller must hold an exclusive lock on stack->buffer. The lock is still + * held on return, but the page might not contain the inserted tuple if the + * page was split. The function returns true if the page was split, false + * otherwise. + */ +static bool +gistinserttuple(GISTInsertState *state, GISTInsertStack *stack, + GISTSTATE *giststate, IndexTuple tuple, OffsetNumber oldoffnum) +{ + return gistinserttuples(state, stack, giststate, &tuple, 1, oldoffnum, + InvalidBuffer, InvalidBuffer, false, false); +} + +/* ---------------- + * An extended workhorse version of gistinserttuple(). This version allows + * inserting multiple tuples, or replacing a single tuple with multiple tuples. + * This is used to recursively update the downlinks in the parent when a page + * is split. + * + * If leftchild and rightchild are valid, we're inserting/replacing the + * downlink for rightchild, and leftchild is its left sibling. We clear the + * F_FOLLOW_RIGHT flag and update NSN on leftchild, atomically with the + * insertion of the downlink. + * + * To avoid holding locks for longer than necessary, when recursing up the + * tree to update the parents, the locking is a bit peculiar here. On entry, + * the caller must hold an exclusive lock on stack->buffer, as well as + * leftchild and rightchild if given. On return: + * + * - Lock on stack->buffer is released, if 'unlockbuf' is true. The page is + * always kept pinned, however. + * - Lock on 'leftchild' is released, if 'unlockleftchild' is true. The page + * is kept pinned. 
+ * - Lock and pin on 'rightchild' are always released. + * + * Returns 'true' if the page had to be split. Note that if the page was + * split, the inserted/updated tuples might've been inserted to a right + * sibling of stack->buffer instead of stack->buffer itself. + */ +static bool +gistinserttuples(GISTInsertState *state, GISTInsertStack *stack, + GISTSTATE *giststate, + IndexTuple *tuples, int ntup, OffsetNumber oldoffnum, + Buffer leftchild, Buffer rightchild, + bool unlockbuf, bool unlockleftchild) +{ + List *splitinfo; + bool is_split; + + /* + * Check for any rw conflicts (in serializable isolation level) just + * before we intend to modify the page + */ + CheckForSerializableConflictIn(state->r, NULL, BufferGetBlockNumber(stack->buffer)); + + /* Insert the tuple(s) to the page, splitting the page if necessary */ + is_split = gistplacetopage(state->r, state->freespace, giststate, + stack->buffer, + tuples, ntup, + oldoffnum, NULL, + leftchild, + &splitinfo, + true, + state->heapRel, + state->is_build); + + /* + * Before recursing up in case the page was split, release locks on the + * child pages. We don't need to keep them locked when updating the + * parent. + */ + if (BufferIsValid(rightchild)) + UnlockReleaseBuffer(rightchild); + if (BufferIsValid(leftchild) && unlockleftchild) + LockBuffer(leftchild, GIST_UNLOCK); + + /* + * If we had to split, insert/update the downlinks in the parent. If the + * caller requested us to release the lock on stack->buffer, tell + * gistfinishsplit() to do that as soon as it's safe to do so. If we + * didn't have to split, release it ourselves. + */ + if (splitinfo) + gistfinishsplit(state, stack, giststate, splitinfo, unlockbuf); + else if (unlockbuf) + LockBuffer(stack->buffer, GIST_UNLOCK); + + return is_split; +} + +/* + * Finish an incomplete split by inserting/updating the downlinks in parent + * page. 'splitinfo' contains all the child pages involved in the split, + * from left-to-right. + * + * On entry, the caller must hold a lock on stack->buffer and all the child + * pages in 'splitinfo'. If 'unlockbuf' is true, the lock on stack->buffer is + * released on return. The child pages are always unlocked and unpinned. + */ +static void +gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack, + GISTSTATE *giststate, List *splitinfo, bool unlockbuf) +{ + GISTPageSplitInfo *right; + GISTPageSplitInfo *left; + IndexTuple tuples[2]; + + /* A split always contains at least two halves */ + Assert(list_length(splitinfo) >= 2); + + /* + * We need to insert downlinks for each new page, and update the downlink + * for the original (leftmost) page in the split. Begin at the rightmost + * page, inserting one downlink at a time until there's only two pages + * left. Finally insert the downlink for the last new page and update the + * downlink for the original page as one operation. + */ + LockBuffer(stack->parent->buffer, GIST_EXCLUSIVE); + + /* + * Insert downlinks for the siblings from right to left, until there are + * only two siblings left. + */ + for (int pos = list_length(splitinfo) - 1; pos > 1; pos--) + { + right = (GISTPageSplitInfo *) list_nth(splitinfo, pos); + left = (GISTPageSplitInfo *) list_nth(splitinfo, pos - 1); + + gistFindCorrectParent(state->r, stack); + if (gistinserttuples(state, stack->parent, giststate, + &right->downlink, 1, + InvalidOffsetNumber, + left->buf, right->buf, false, false)) + { + /* + * If the parent page was split, the existing downlink might have + * moved. 
+ */ + stack->downlinkoffnum = InvalidOffsetNumber; + } + /* gistinserttuples() released the lock on right->buf. */ + } + + right = (GISTPageSplitInfo *) lsecond(splitinfo); + left = (GISTPageSplitInfo *) linitial(splitinfo); + + /* + * Finally insert downlink for the remaining right page and update the + * downlink for the original page to not contain the tuples that were + * moved to the new pages. + */ + tuples[0] = left->downlink; + tuples[1] = right->downlink; + gistFindCorrectParent(state->r, stack); + if (gistinserttuples(state, stack->parent, giststate, + tuples, 2, + stack->downlinkoffnum, + left->buf, right->buf, + true, /* Unlock parent */ + unlockbuf /* Unlock stack->buffer if caller wants + * that */ + )) + { + /* + * If the parent page was split, the downlink might have moved. + */ + stack->downlinkoffnum = InvalidOffsetNumber; + } + + Assert(left->buf == stack->buffer); + + /* + * If we split the page because we had to adjust the downlink on an + * internal page, while descending the tree for inserting a new tuple, + * then this might no longer be the correct page for the new tuple. The + * downlink to this page might not cover the new tuple anymore, it might + * need to go to the newly-created right sibling instead. Tell the caller + * to walk back up the stack, to re-check at the parent which page to + * insert to. + * + * Normally, the LSN-NSN interlock during the tree descend would also + * detect that a concurrent split happened (by ourselves), and cause us to + * retry at the parent. But that mechanism doesn't work during index + * build, because we don't do WAL-logging, and don't update LSNs, during + * index build. + */ + stack->retry_from_parent = true; +} + +/* + * gistSplit -- split a page in the tree and fill struct + * used for XLOG and real writes buffers. Function is recursive, ie + * it will split page until keys will fit in every page. + */ +SplitedPageLayout * +gistSplit(Relation r, + Page page, + IndexTuple *itup, /* contains compressed entry */ + int len, + GISTSTATE *giststate) +{ + IndexTuple *lvectup, + *rvectup; + GistSplitVector v; + int i; + SplitedPageLayout *res = NULL; + + /* this should never recurse very deeply, but better safe than sorry */ + check_stack_depth(); + + /* there's no point in splitting an empty page */ + Assert(len > 0); + + /* + * If a single tuple doesn't fit on a page, no amount of splitting will + * help. 
+ */ + if (len == 1) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("index row size %zu exceeds maximum %zu for index \"%s\"", + IndexTupleSize(itup[0]), GiSTPageSize, + RelationGetRelationName(r)))); + + memset(v.spl_lisnull, true, + sizeof(bool) * giststate->nonLeafTupdesc->natts); + memset(v.spl_risnull, true, + sizeof(bool) * giststate->nonLeafTupdesc->natts); + gistSplitByKey(r, page, itup, len, giststate, &v, 0); + + /* form left and right vector */ + lvectup = (IndexTuple *) palloc(sizeof(IndexTuple) * (len + 1)); + rvectup = (IndexTuple *) palloc(sizeof(IndexTuple) * (len + 1)); + + for (i = 0; i < v.splitVector.spl_nleft; i++) + lvectup[i] = itup[v.splitVector.spl_left[i] - 1]; + + for (i = 0; i < v.splitVector.spl_nright; i++) + rvectup[i] = itup[v.splitVector.spl_right[i] - 1]; + + /* finalize splitting (may need another split) */ + if (!gistfitpage(rvectup, v.splitVector.spl_nright)) + { + res = gistSplit(r, page, rvectup, v.splitVector.spl_nright, giststate); + } + else + { + ROTATEDIST(res); + res->block.num = v.splitVector.spl_nright; + res->list = gistfillitupvec(rvectup, v.splitVector.spl_nright, &(res->lenlist)); + res->itup = gistFormTuple(giststate, r, v.spl_rattr, v.spl_risnull, false); + } + + if (!gistfitpage(lvectup, v.splitVector.spl_nleft)) + { + SplitedPageLayout *resptr, + *subres; + + resptr = subres = gistSplit(r, page, lvectup, v.splitVector.spl_nleft, giststate); + + /* install on list's tail */ + while (resptr->next) + resptr = resptr->next; + + resptr->next = res; + res = subres; + } + else + { + ROTATEDIST(res); + res->block.num = v.splitVector.spl_nleft; + res->list = gistfillitupvec(lvectup, v.splitVector.spl_nleft, &(res->lenlist)); + res->itup = gistFormTuple(giststate, r, v.spl_lattr, v.spl_lisnull, false); + } + + return res; +} + +/* + * Create a GISTSTATE and fill it with information about the index + */ +GISTSTATE * +initGISTstate(Relation index) +{ + GISTSTATE *giststate; + MemoryContext scanCxt; + MemoryContext oldCxt; + int i; + + /* safety check to protect fixed-size arrays in GISTSTATE */ + if (index->rd_att->natts > INDEX_MAX_KEYS) + elog(ERROR, "numberOfAttributes %d > %d", + index->rd_att->natts, INDEX_MAX_KEYS); + + /* Create the memory context that will hold the GISTSTATE */ + scanCxt = AllocSetContextCreate(CurrentMemoryContext, + "GiST scan context", + ALLOCSET_DEFAULT_SIZES); + oldCxt = MemoryContextSwitchTo(scanCxt); + + /* Create and fill in the GISTSTATE */ + giststate = (GISTSTATE *) palloc(sizeof(GISTSTATE)); + + giststate->scanCxt = scanCxt; + giststate->tempCxt = scanCxt; /* caller must change this if needed */ + giststate->leafTupdesc = index->rd_att; + + /* + * The truncated tupdesc for non-leaf index tuples, which doesn't contain + * the INCLUDE attributes. + * + * It is used to form tuples during tuple adjustment and page split. + * B-tree creates shortened tuple descriptor for every truncated tuple, + * because it is doing this less often: it does not have to form truncated + * tuples during page split. Also, B-tree is not adjusting tuples on + * internal pages the way GiST does. 
+ */ + giststate->nonLeafTupdesc = CreateTupleDescCopyConstr(index->rd_att); + giststate->nonLeafTupdesc->natts = + IndexRelationGetNumberOfKeyAttributes(index); + + for (i = 0; i < IndexRelationGetNumberOfKeyAttributes(index); i++) + { + fmgr_info_copy(&(giststate->consistentFn[i]), + index_getprocinfo(index, i + 1, GIST_CONSISTENT_PROC), + scanCxt); + fmgr_info_copy(&(giststate->unionFn[i]), + index_getprocinfo(index, i + 1, GIST_UNION_PROC), + scanCxt); + + /* opclasses are not required to provide a Compress method */ + if (OidIsValid(index_getprocid(index, i + 1, GIST_COMPRESS_PROC))) + fmgr_info_copy(&(giststate->compressFn[i]), + index_getprocinfo(index, i + 1, GIST_COMPRESS_PROC), + scanCxt); + else + giststate->compressFn[i].fn_oid = InvalidOid; + + /* opclasses are not required to provide a Decompress method */ + if (OidIsValid(index_getprocid(index, i + 1, GIST_DECOMPRESS_PROC))) + fmgr_info_copy(&(giststate->decompressFn[i]), + index_getprocinfo(index, i + 1, GIST_DECOMPRESS_PROC), + scanCxt); + else + giststate->decompressFn[i].fn_oid = InvalidOid; + + fmgr_info_copy(&(giststate->penaltyFn[i]), + index_getprocinfo(index, i + 1, GIST_PENALTY_PROC), + scanCxt); + fmgr_info_copy(&(giststate->picksplitFn[i]), + index_getprocinfo(index, i + 1, GIST_PICKSPLIT_PROC), + scanCxt); + fmgr_info_copy(&(giststate->equalFn[i]), + index_getprocinfo(index, i + 1, GIST_EQUAL_PROC), + scanCxt); + + /* opclasses are not required to provide a Distance method */ + if (OidIsValid(index_getprocid(index, i + 1, GIST_DISTANCE_PROC))) + fmgr_info_copy(&(giststate->distanceFn[i]), + index_getprocinfo(index, i + 1, GIST_DISTANCE_PROC), + scanCxt); + else + giststate->distanceFn[i].fn_oid = InvalidOid; + + /* opclasses are not required to provide a Fetch method */ + if (OidIsValid(index_getprocid(index, i + 1, GIST_FETCH_PROC))) + fmgr_info_copy(&(giststate->fetchFn[i]), + index_getprocinfo(index, i + 1, GIST_FETCH_PROC), + scanCxt); + else + giststate->fetchFn[i].fn_oid = InvalidOid; + + /* + * If the index column has a specified collation, we should honor that + * while doing comparisons. However, we may have a collatable storage + * type for a noncollatable indexed data type. If there's no index + * collation then specify default collation in case the support + * functions need collation. This is harmless if the support + * functions don't care about collation, so we just do it + * unconditionally. (We could alternatively call get_typcollation, + * but that seems like expensive overkill --- there aren't going to be + * any cases where a GiST storage type has a nondefault collation.) 
+ */ + if (OidIsValid(index->rd_indcollation[i])) + giststate->supportCollation[i] = index->rd_indcollation[i]; + else + giststate->supportCollation[i] = DEFAULT_COLLATION_OID; + } + + /* No opclass information for INCLUDE attributes */ + for (; i < index->rd_att->natts; i++) + { + giststate->consistentFn[i].fn_oid = InvalidOid; + giststate->unionFn[i].fn_oid = InvalidOid; + giststate->compressFn[i].fn_oid = InvalidOid; + giststate->decompressFn[i].fn_oid = InvalidOid; + giststate->penaltyFn[i].fn_oid = InvalidOid; + giststate->picksplitFn[i].fn_oid = InvalidOid; + giststate->equalFn[i].fn_oid = InvalidOid; + giststate->distanceFn[i].fn_oid = InvalidOid; + giststate->fetchFn[i].fn_oid = InvalidOid; + giststate->supportCollation[i] = InvalidOid; + } + + MemoryContextSwitchTo(oldCxt); + + return giststate; +} + +void +freeGISTstate(GISTSTATE *giststate) +{ + /* It's sufficient to delete the scanCxt */ + MemoryContextDelete(giststate->scanCxt); +} + +/* + * gistprunepage() -- try to remove LP_DEAD items from the given page. + * Function assumes that buffer is exclusively locked. + */ +static void +gistprunepage(Relation rel, Page page, Buffer buffer, Relation heapRel) +{ + OffsetNumber deletable[MaxIndexTuplesPerPage]; + int ndeletable = 0; + OffsetNumber offnum, + maxoff; + + Assert(GistPageIsLeaf(page)); + + /* + * Scan over all items to see which ones need to be deleted according to + * LP_DEAD flags. + */ + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemId = PageGetItemId(page, offnum); + + if (ItemIdIsDead(itemId)) + deletable[ndeletable++] = offnum; + } + + if (ndeletable > 0) + { + TransactionId latestRemovedXid = InvalidTransactionId; + + if (XLogStandbyInfoActive() && RelationNeedsWAL(rel)) + latestRemovedXid = + index_compute_xid_horizon_for_tuples(rel, heapRel, buffer, + deletable, ndeletable); + + START_CRIT_SECTION(); + + PageIndexMultiDelete(page, deletable, ndeletable); + + /* + * Mark the page as not containing any LP_DEAD items. This is not + * certainly true (there might be some that have recently been marked, + * but weren't included in our target-item list), but it will almost + * always be true and it doesn't seem worth an additional page scan to + * check it. Remember that F_HAS_GARBAGE is only a hint anyway. + */ + GistClearPageHasGarbage(page); + + MarkBufferDirty(buffer); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + + recptr = gistXLogDelete(buffer, + deletable, ndeletable, + latestRemovedXid); + + PageSetLSN(page, recptr); + } + else + PageSetLSN(page, gistGetFakeLSN(rel)); + + END_CRIT_SECTION(); + } + + /* + * Note: if we didn't find any LP_DEAD items, then the page's + * F_HAS_GARBAGE hint bit is falsely set. We do not bother expending a + * separate write to clear it, however. We will clear it when we split + * the page. + */ +} diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c new file mode 100644 index 0000000..ec28bfe --- /dev/null +++ b/src/backend/access/gist/gistbuild.c @@ -0,0 +1,1566 @@ +/*------------------------------------------------------------------------- + * + * gistbuild.c + * build algorithm for GiST indexes implementation. + * + * There are two different strategies: + * + * 1. Sort all input tuples, pack them into GiST leaf pages in the sorted + * order, and create downlinks and internal pages as we go. 
This builds
+ * the index from the bottom up, similar to how B-tree index build
+ * works.
+ *
+ * 2. Start with an empty index, and insert all tuples one by one.
+ *
+ * The sorted method is used if the operator classes for all columns have
+ * a 'sortsupport' defined. Otherwise, we resort to the second strategy.
+ *
+ * The second strategy can optionally use buffers at different levels of
+ * the tree to reduce I/O; see "Buffering build algorithm" in the README
+ * for a more detailed explanation. It initially calls insert over and
+ * over, but switches to the buffered algorithm after a certain number of
+ * tuples (unless buffering mode is disabled).
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gist/gistbuild.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <math.h>
+
+#include "access/genam.h"
+#include "access/gist_private.h"
+#include "access/gistxlog.h"
+#include "access/tableam.h"
+#include "access/xloginsert.h"
+#include "catalog/index.h"
+#include "miscadmin.h"
+#include "optimizer/optimizer.h"
+#include "storage/bufmgr.h"
+#include "storage/smgr.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+#include "utils/tuplesort.h"
+
+/* Interval, in index tuples, at which to check whether to switch to buffering build mode */
+#define BUFFERING_MODE_SWITCH_CHECK_STEP 256
+
+/*
+ * Number of tuples to process in the slow way before switching to buffering
+ * mode, when buffering is explicitly turned on. Also, the number of tuples
+ * to process between readjusting the buffer size parameter, while in
+ * buffering mode.
+ */
+#define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096
+
+/*
+ * Strategy used to build the index. It can change between the
+ * GIST_BUFFERING_* modes on the fly, but if the Sorted method is used,
+ * that needs to be decided up-front and cannot be changed afterwards.
+ */
+typedef enum
+{
+ GIST_SORTED_BUILD, /* bottom-up build by sorting */
+ GIST_BUFFERING_DISABLED, /* in regular build mode and aren't going to
+ * switch */
+ GIST_BUFFERING_AUTO, /* in regular build mode, but will switch to
+ * buffering build mode if the index grows too
+ * big */
+ GIST_BUFFERING_STATS, /* gathering statistics of index tuple size
+ * before switching to the buffering build
+ * mode */
+ GIST_BUFFERING_ACTIVE /* in buffering build mode */
+} GistBuildMode;
+
+/* Working state for gistbuild and its callback */
+typedef struct
+{
+ Relation indexrel;
+ Relation heaprel;
+ GISTSTATE *giststate;
+
+ Size freespace; /* amount of free space to leave on pages */
+
+ GistBuildMode buildMode;
+
+ int64 indtuples; /* number of tuples indexed */
+
+ /*
+ * Extra data structures used during a buffering build. 'gfbb' contains
+ * information related to managing the build buffers. 'parentMap' is a
+ * lookup table of the parent of each internal page.
+ */
+ int64 indtuplesSize; /* total size of all indexed tuples */
+ GISTBuildBuffers *gfbb;
+ HTAB *parentMap;
+
+ /*
+ * Extra data structures used during a sorting build.
+ */ + Tuplesortstate *sortstate; /* state data for tuplesort.c */ + + BlockNumber pages_allocated; + BlockNumber pages_written; + + int ready_num_pages; + BlockNumber ready_blknos[XLR_MAX_BLOCK_ID]; + Page ready_pages[XLR_MAX_BLOCK_ID]; +} GISTBuildState; + +/* + * In sorted build, we use a stack of these structs, one for each level, + * to hold an in-memory buffer of the rightmost page at the level. When the + * page fills up, it is written out and a new page is allocated. + */ +typedef struct GistSortedBuildPageState +{ + Page page; + struct GistSortedBuildPageState *parent; /* Upper level, if any */ +} GistSortedBuildPageState; + +/* prototypes for private functions */ + +static void gistSortedBuildCallback(Relation index, ItemPointer tid, + Datum *values, bool *isnull, + bool tupleIsAlive, void *state); +static void gist_indexsortbuild(GISTBuildState *state); +static void gist_indexsortbuild_pagestate_add(GISTBuildState *state, + GistSortedBuildPageState *pagestate, + IndexTuple itup); +static void gist_indexsortbuild_pagestate_flush(GISTBuildState *state, + GistSortedBuildPageState *pagestate); +static void gist_indexsortbuild_flush_ready_pages(GISTBuildState *state); + +static void gistInitBuffering(GISTBuildState *buildstate); +static int calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep); +static void gistBuildCallback(Relation index, + ItemPointer tid, + Datum *values, + bool *isnull, + bool tupleIsAlive, + void *state); +static void gistBufferingBuildInsert(GISTBuildState *buildstate, + IndexTuple itup); +static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup, + BlockNumber startblkno, int startlevel); +static BlockNumber gistbufferinginserttuples(GISTBuildState *buildstate, + Buffer buffer, int level, + IndexTuple *itup, int ntup, OffsetNumber oldoffnum, + BlockNumber parentblk, OffsetNumber downlinkoffnum); +static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate, + BlockNumber childblkno, int level, + BlockNumber *parentblk, + OffsetNumber *downlinkoffnum); +static void gistProcessEmptyingQueue(GISTBuildState *buildstate); +static void gistEmptyAllBuffers(GISTBuildState *buildstate); +static int gistGetMaxLevel(Relation index); + +static void gistInitParentMap(GISTBuildState *buildstate); +static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, + BlockNumber parent); +static void gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parent); +static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child); + + +/* + * Main entry point to GiST index build. + */ +IndexBuildResult * +gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) +{ + IndexBuildResult *result; + double reltuples; + GISTBuildState buildstate; + MemoryContext oldcxt = CurrentMemoryContext; + int fillfactor; + Oid SortSupportFnOids[INDEX_MAX_KEYS]; + GiSTOptions *options = (GiSTOptions *) index->rd_options; + + /* + * We expect to be called exactly once for any index relation. If that's + * not the case, big trouble's what we have. + */ + if (RelationGetNumberOfBlocks(index) != 0) + elog(ERROR, "index \"%s\" already contains data", + RelationGetRelationName(index)); + + buildstate.indexrel = index; + buildstate.heaprel = heap; + buildstate.sortstate = NULL; + buildstate.giststate = initGISTstate(index); + + /* + * Create a temporary memory context that is reset once for each tuple + * processed. 
(Note: we don't bother to make this a child of the + * giststate's scanCxt, so we have to delete it separately at the end.) + */ + buildstate.giststate->tempCxt = createTempGistContext(); + + /* + * Choose build strategy. First check whether the user specified to use + * buffering mode. (The use-case for that in the field is somewhat + * questionable perhaps, but it's important for testing purposes.) + */ + if (options) + { + if (options->buffering_mode == GIST_OPTION_BUFFERING_ON) + buildstate.buildMode = GIST_BUFFERING_STATS; + else if (options->buffering_mode == GIST_OPTION_BUFFERING_OFF) + buildstate.buildMode = GIST_BUFFERING_DISABLED; + else /* must be "auto" */ + buildstate.buildMode = GIST_BUFFERING_AUTO; + } + else + { + buildstate.buildMode = GIST_BUFFERING_AUTO; + } + + /* + * Unless buffering mode was forced, see if we can use sorting instead. + */ + if (buildstate.buildMode != GIST_BUFFERING_STATS) + { + bool hasallsortsupports = true; + int keyscount = IndexRelationGetNumberOfKeyAttributes(index); + + for (int i = 0; i < keyscount; i++) + { + SortSupportFnOids[i] = index_getprocid(index, i + 1, + GIST_SORTSUPPORT_PROC); + if (!OidIsValid(SortSupportFnOids[i])) + { + hasallsortsupports = false; + break; + } + } + if (hasallsortsupports) + buildstate.buildMode = GIST_SORTED_BUILD; + } + + /* + * Calculate target amount of free space to leave on pages. + */ + fillfactor = options ? options->fillfactor : GIST_DEFAULT_FILLFACTOR; + buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100; + + /* + * Build the index using the chosen strategy. + */ + buildstate.indtuples = 0; + buildstate.indtuplesSize = 0; + + if (buildstate.buildMode == GIST_SORTED_BUILD) + { + /* + * Sort all data, build the index from bottom up. + */ + buildstate.sortstate = tuplesort_begin_index_gist(heap, + index, + maintenance_work_mem, + NULL, + false); + + /* Scan the table, adding all tuples to the tuplesort */ + reltuples = table_index_build_scan(heap, index, indexInfo, true, true, + gistSortedBuildCallback, + (void *) &buildstate, NULL); + + /* + * Perform the sort and build index pages. + */ + tuplesort_performsort(buildstate.sortstate); + + gist_indexsortbuild(&buildstate); + + tuplesort_end(buildstate.sortstate); + } + else + { + /* + * Initialize an empty index and insert all tuples, possibly using + * buffers on intermediate levels. + */ + Buffer buffer; + Page page; + + /* initialize the root page */ + buffer = gistNewBuffer(index); + Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO); + page = BufferGetPage(buffer); + + START_CRIT_SECTION(); + + GISTInitBuffer(buffer, F_LEAF); + + MarkBufferDirty(buffer); + PageSetLSN(page, GistBuildLSN); + + UnlockReleaseBuffer(buffer); + + END_CRIT_SECTION(); + + /* Scan the table, inserting all the tuples to the index. */ + reltuples = table_index_build_scan(heap, index, indexInfo, true, true, + gistBuildCallback, + (void *) &buildstate, NULL); + + /* + * If buffering was used, flush out all the tuples that are still in + * the buffers. + */ + if (buildstate.buildMode == GIST_BUFFERING_ACTIVE) + { + elog(DEBUG1, "all tuples processed, emptying buffers"); + gistEmptyAllBuffers(&buildstate); + gistFreeBuildBuffers(buildstate.gfbb); + } + + /* + * We didn't write WAL records as we built the index, so if + * WAL-logging is required, write all pages to the WAL now. 
+ */ + if (RelationNeedsWAL(index)) + { + log_newpage_range(index, MAIN_FORKNUM, + 0, RelationGetNumberOfBlocks(index), + true); + } + } + + /* okay, all heap tuples are indexed */ + MemoryContextSwitchTo(oldcxt); + MemoryContextDelete(buildstate.giststate->tempCxt); + + freeGISTstate(buildstate.giststate); + + /* + * Return statistics + */ + result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); + + result->heap_tuples = reltuples; + result->index_tuples = (double) buildstate.indtuples; + + return result; +} + +/*------------------------------------------------------------------------- + * Routines for sorted build + *------------------------------------------------------------------------- + */ + +/* + * Per-tuple callback for table_index_build_scan. + */ +static void +gistSortedBuildCallback(Relation index, + ItemPointer tid, + Datum *values, + bool *isnull, + bool tupleIsAlive, + void *state) +{ + GISTBuildState *buildstate = (GISTBuildState *) state; + MemoryContext oldCtx; + Datum compressed_values[INDEX_MAX_KEYS]; + + oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt); + + /* Form an index tuple and point it at the heap tuple */ + gistCompressValues(buildstate->giststate, index, + values, isnull, + true, compressed_values); + + tuplesort_putindextuplevalues(buildstate->sortstate, + buildstate->indexrel, + tid, + compressed_values, isnull); + + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(buildstate->giststate->tempCxt); + + /* Update tuple count. */ + buildstate->indtuples += 1; +} + +/* + * Build GiST index from bottom up from pre-sorted tuples. + */ +static void +gist_indexsortbuild(GISTBuildState *state) +{ + IndexTuple itup; + GistSortedBuildPageState *leafstate; + GistSortedBuildPageState *pagestate; + Page page; + + state->pages_allocated = 0; + state->pages_written = 0; + state->ready_num_pages = 0; + + /* + * Write an empty page as a placeholder for the root page. It will be + * replaced with the real root page at the end. + */ + page = palloc0(BLCKSZ); + RelationOpenSmgr(state->indexrel); + smgrextend(state->indexrel->rd_smgr, MAIN_FORKNUM, GIST_ROOT_BLKNO, + page, true); + state->pages_allocated++; + state->pages_written++; + + /* Allocate a temporary buffer for the first leaf page. */ + leafstate = palloc(sizeof(GistSortedBuildPageState)); + leafstate->page = page; + leafstate->parent = NULL; + gistinitpage(page, F_LEAF); + + /* + * Fill index pages with tuples in the sorted order. + */ + while ((itup = tuplesort_getindextuple(state->sortstate, true)) != NULL) + { + gist_indexsortbuild_pagestate_add(state, leafstate, itup); + MemoryContextReset(state->giststate->tempCxt); + } + + /* + * Write out the partially full non-root pages. + * + * Keep in mind that flush can build a new root. 
+ */ + pagestate = leafstate; + while (pagestate->parent != NULL) + { + GistSortedBuildPageState *parent; + + gist_indexsortbuild_pagestate_flush(state, pagestate); + parent = pagestate->parent; + pfree(pagestate->page); + pfree(pagestate); + pagestate = parent; + } + + gist_indexsortbuild_flush_ready_pages(state); + + /* Write out the root */ + RelationOpenSmgr(state->indexrel); + PageSetLSN(pagestate->page, GistBuildLSN); + PageSetChecksumInplace(pagestate->page, GIST_ROOT_BLKNO); + smgrwrite(state->indexrel->rd_smgr, MAIN_FORKNUM, GIST_ROOT_BLKNO, + pagestate->page, true); + if (RelationNeedsWAL(state->indexrel)) + log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO, + pagestate->page, true); + + pfree(pagestate->page); + pfree(pagestate); + + /* + * When we WAL-logged index pages, we must nonetheless fsync index files. + * Since we're building outside shared buffers, a CHECKPOINT occurring + * during the build has no way to flush the previously written data to + * disk (indeed it won't know the index even exists). A crash later on + * would replay WAL from the checkpoint, therefore it wouldn't replay our + * earlier WAL entries. If we do not fsync those pages here, they might + * still not be on disk when the crash occurs. + */ + if (RelationNeedsWAL(state->indexrel)) + { + RelationOpenSmgr(state->indexrel); + smgrimmedsync(state->indexrel->rd_smgr, MAIN_FORKNUM); + } +} + +/* + * Add tuple to a page. If the pages is full, write it out and re-initialize + * a new page first. + */ +static void +gist_indexsortbuild_pagestate_add(GISTBuildState *state, + GistSortedBuildPageState *pagestate, + IndexTuple itup) +{ + Size sizeNeeded; + + /* Does the tuple fit? If not, flush */ + sizeNeeded = IndexTupleSize(itup) + sizeof(ItemIdData) + state->freespace; + if (PageGetFreeSpace(pagestate->page) < sizeNeeded) + gist_indexsortbuild_pagestate_flush(state, pagestate); + + gistfillbuffer(pagestate->page, &itup, 1, InvalidOffsetNumber); +} + +static void +gist_indexsortbuild_pagestate_flush(GISTBuildState *state, + GistSortedBuildPageState *pagestate) +{ + GistSortedBuildPageState *parent; + IndexTuple *itvec; + IndexTuple union_tuple; + int vect_len; + bool isleaf; + BlockNumber blkno; + MemoryContext oldCtx; + + /* check once per page */ + CHECK_FOR_INTERRUPTS(); + + if (state->ready_num_pages == XLR_MAX_BLOCK_ID) + gist_indexsortbuild_flush_ready_pages(state); + + /* + * The page is now complete. Assign a block number to it, and add it to + * the list of finished pages. (We don't write it out immediately, because + * we want to WAL-log the pages in batches.) + */ + blkno = state->pages_allocated++; + state->ready_blknos[state->ready_num_pages] = blkno; + state->ready_pages[state->ready_num_pages] = pagestate->page; + state->ready_num_pages++; + + isleaf = GistPageIsLeaf(pagestate->page); + + /* + * Form a downlink tuple to represent all the tuples on the page. + */ + oldCtx = MemoryContextSwitchTo(state->giststate->tempCxt); + itvec = gistextractpage(pagestate->page, &vect_len); + union_tuple = gistunion(state->indexrel, itvec, vect_len, + state->giststate); + ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno); + MemoryContextSwitchTo(oldCtx); + + /* + * Insert the downlink to the parent page. If this was the root, create a + * new page as the parent, which becomes the new root. 
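The flush-and-cascade behaviour described above (a filled page at one level pushes a downlink into the level above, creating that level on demand) can be modelled with a few counters. The sketch below is a toy standalone model with an invented fanout and tuple count, not the patch's code; it only shows how many pages each level accumulates:

#include <stdio.h>

#define MAX_LEVELS 16
#define FANOUT 4				/* invented tuples-per-page for the model */

int
main(void)
{
	int			pages[MAX_LEVELS] = {0};	/* finished pages per level */
	int			fill[MAX_LEVELS] = {0}; /* tuples on the current page */
	int			nlevels = 1;

	for (int tup = 0; tup < 100; tup++)
	{
		int			level = 0;

		/* append to the leaf level; each page flush cascades a downlink up */
		while (++fill[level] == FANOUT)
		{
			fill[level] = 0;
			pages[level]++;
			level++;			/* the flushed page's downlink goes here */
			if (level + 1 > nlevels)
				nlevels = level + 1;
		}
	}

	for (int level = 0; level < nlevels; level++)
		printf("level %d: %d full pages, %d tuples on the current page\n",
			   level, pages[level], fill[level]);
	return 0;
}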
+ */ + parent = pagestate->parent; + if (parent == NULL) + { + parent = palloc(sizeof(GistSortedBuildPageState)); + parent->page = (Page) palloc(BLCKSZ); + parent->parent = NULL; + gistinitpage(parent->page, 0); + + pagestate->parent = parent; + } + gist_indexsortbuild_pagestate_add(state, parent, union_tuple); + + /* Re-initialize the page buffer for next page on this level. */ + pagestate->page = palloc(BLCKSZ); + gistinitpage(pagestate->page, isleaf ? F_LEAF : 0); + + /* + * Set the right link to point to the previous page. This is just for + * debugging purposes: GiST only follows the right link if a page is split + * concurrently to a scan, and that cannot happen during index build. + * + * It's a bit counterintuitive that we set the right link on the new page + * to point to the previous page, and not the other way round. But GiST + * pages are not ordered like B-tree pages are, so as long as the + * right-links form a chain through all the pages in the same level, the + * order doesn't matter. + */ + GistPageGetOpaque(pagestate->page)->rightlink = blkno; +} + +static void +gist_indexsortbuild_flush_ready_pages(GISTBuildState *state) +{ + if (state->ready_num_pages == 0) + return; + + RelationOpenSmgr(state->indexrel); + + for (int i = 0; i < state->ready_num_pages; i++) + { + Page page = state->ready_pages[i]; + BlockNumber blkno = state->ready_blknos[i]; + + /* Currently, the blocks must be buffered in order. */ + if (blkno != state->pages_written) + elog(ERROR, "unexpected block number to flush GiST sorting build"); + + PageSetLSN(page, GistBuildLSN); + PageSetChecksumInplace(page, blkno); + smgrextend(state->indexrel->rd_smgr, MAIN_FORKNUM, blkno, page, true); + + state->pages_written++; + } + + if (RelationNeedsWAL(state->indexrel)) + log_newpages(&state->indexrel->rd_node, MAIN_FORKNUM, state->ready_num_pages, + state->ready_blknos, state->ready_pages, true); + + for (int i = 0; i < state->ready_num_pages; i++) + pfree(state->ready_pages[i]); + + state->ready_num_pages = 0; +} + + +/*------------------------------------------------------------------------- + * Routines for non-sorted build + *------------------------------------------------------------------------- + */ + +/* + * Attempt to switch to buffering mode. + * + * If there is not enough memory for buffering build, sets bufferingMode + * to GIST_BUFFERING_DISABLED, so that we don't bother to try the switch + * anymore. Otherwise initializes the build buffers, and sets bufferingMode to + * GIST_BUFFERING_ACTIVE. + */ +static void +gistInitBuffering(GISTBuildState *buildstate) +{ + Relation index = buildstate->indexrel; + int pagesPerBuffer; + Size pageFreeSpace; + Size itupAvgSize, + itupMinSize; + double avgIndexTuplesPerPage, + maxIndexTuplesPerPage; + int i; + int levelStep; + + /* Calc space of index page which is available for index tuples */ + pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData) + - sizeof(ItemIdData) + - buildstate->freespace; + + /* + * Calculate average size of already inserted index tuples using gathered + * statistics. + */ + itupAvgSize = (double) buildstate->indtuplesSize / + (double) buildstate->indtuples; + + /* + * Calculate minimal possible size of index tuple by index metadata. + * Minimal possible size of varlena is VARHDRSZ. + * + * XXX: that's not actually true, as a short varlen can be just 2 bytes. + * And we should take padding into account here. 
+ */
+ itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData));
+ for (i = 0; i < index->rd_att->natts; i++)
+ {
+ if (TupleDescAttr(index->rd_att, i)->attlen < 0)
+ itupMinSize += VARHDRSZ;
+ else
+ itupMinSize += TupleDescAttr(index->rd_att, i)->attlen;
+ }
+
+ /* Calculate the average and maximal number of index tuples that fit on a page */
+ avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
+ maxIndexTuplesPerPage = pageFreeSpace / itupMinSize;
+
+ /*
+ * We need to calculate two parameters for the buffering algorithm:
+ * levelStep and pagesPerBuffer.
+ *
+ * levelStep determines the size of subtree that we operate on, while
+ * emptying a buffer. A higher value is better, as you need fewer buffer
+ * emptying steps to build the index. However, if you set it too high, the
+ * subtree doesn't fit in cache anymore, and you quickly lose the benefit
+ * of the buffers.
+ *
+ * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is
+ * the number of tuples on page (i.e. fanout), and M is the amount of
+ * internal memory available. Curiously, they don't explain *why* that
+ * setting is optimal. We calculate it by taking the highest levelStep so
+ * that a subtree still fits in cache. For a small B, our way of
+ * calculating levelStep is very close to Arge et al's formula. For a
+ * large B, our formula gives a value that is 2x higher.
+ *
+ * The average size (in pages) of a subtree of depth n can be calculated
+ * as a geometric series:
+ *
+ * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B)
+ *
+ * where B is the average number of index tuples on page. The subtree is
+ * cached in the shared buffer cache and the OS cache, so we choose
+ * levelStep so that the subtree size is comfortably smaller than
+ * effective_cache_size, with a safety factor of 4.
+ *
+ * The estimate on the average number of index tuples on page is based on
+ * average tuple sizes observed before switching to buffered build, so the
+ * real subtree size can be somewhat larger. Also, it would be selfish to
+ * gobble the whole cache for our index build. The safety factor of 4
+ * should account for those effects.
+ *
+ * The other limiting factor for setting levelStep is that while
+ * processing a subtree, we need to hold one page for each buffer at the
+ * next lower buffered level. The max. number of buffers needed for that
+ * is maxIndexTuplesPerPage^levelStep. This is very conservative, but
+ * hopefully maintenance_work_mem is set high enough that you're
+ * constrained by effective_cache_size rather than maintenance_work_mem.
+ *
+ * XXX: the buffer hash table consumes a fair amount of memory too per
+ * buffer, but that is not currently taken into account. That scales with
+ * the total number of buffers used, i.e. with the index size and with
+ * levelStep. Note that a higher levelStep *reduces* the amount of memory
+ * needed for the hash table.
+ */
+ levelStep = 1;
+ for (;;)
+ {
+ double subtreesize;
+ double maxlowestlevelpages;
+
+ /* size of an average subtree at this levelStep (in pages).
*/ + subtreesize = + (1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) / + (1 - avgIndexTuplesPerPage); + + /* max number of pages at the lowest level of a subtree */ + maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep); + + /* subtree must fit in cache (with safety factor of 4) */ + if (subtreesize > effective_cache_size / 4) + break; + + /* each node in the lowest level of a subtree has one page in memory */ + if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ) + break; + + /* Good, we can handle this levelStep. See if we can go one higher. */ + levelStep++; + } + + /* + * We just reached an unacceptable value of levelStep in previous loop. + * So, decrease levelStep to get last acceptable value. + */ + levelStep--; + + /* + * If there's not enough cache or maintenance_work_mem, fall back to plain + * inserts. + */ + if (levelStep <= 0) + { + elog(DEBUG1, "failed to switch to buffered GiST build"); + buildstate->buildMode = GIST_BUFFERING_DISABLED; + return; + } + + /* + * The second parameter to set is pagesPerBuffer, which determines the + * size of each buffer. We adjust pagesPerBuffer also during the build, + * which is why this calculation is in a separate function. + */ + pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep); + + /* Initialize GISTBuildBuffers with these parameters */ + buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep, + gistGetMaxLevel(index)); + + gistInitParentMap(buildstate); + + buildstate->buildMode = GIST_BUFFERING_ACTIVE; + + elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d", + levelStep, pagesPerBuffer); +} + +/* + * Calculate pagesPerBuffer parameter for the buffering algorithm. + * + * Buffer size is chosen so that assuming that tuples are distributed + * randomly, emptying half a buffer fills on average one page in every buffer + * at the next lower level. + */ +static int +calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep) +{ + double pagesPerBuffer; + double avgIndexTuplesPerPage; + double itupAvgSize; + Size pageFreeSpace; + + /* Calc space of index page which is available for index tuples */ + pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData) + - sizeof(ItemIdData) + - buildstate->freespace; + + /* + * Calculate average size of already inserted index tuples using gathered + * statistics. + */ + itupAvgSize = (double) buildstate->indtuplesSize / + (double) buildstate->indtuples; + + avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize; + + /* + * Recalculate required size of buffers. + */ + pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep); + + return (int) rint(pagesPerBuffer); +} + +/* + * Per-tuple callback for table_index_build_scan. + */ +static void +gistBuildCallback(Relation index, + ItemPointer tid, + Datum *values, + bool *isnull, + bool tupleIsAlive, + void *state) +{ + GISTBuildState *buildstate = (GISTBuildState *) state; + IndexTuple itup; + MemoryContext oldCtx; + + oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt); + + /* form an index tuple and point it at the heap tuple */ + itup = gistFormTuple(buildstate->giststate, index, + values, isnull, + true); + itup->t_tid = *tid; + + if (buildstate->buildMode == GIST_BUFFERING_ACTIVE) + { + /* We have buffers, so use them. */ + gistBufferingBuildInsert(buildstate, itup); + } + else + { + /* + * There's no buffers (yet). Since we already have the index relation + * locked, we call gistdoinsert directly. 
+ */ + gistdoinsert(index, itup, buildstate->freespace, + buildstate->giststate, buildstate->heaprel, true); + } + + /* Update tuple count and total size. */ + buildstate->indtuples += 1; + buildstate->indtuplesSize += IndexTupleSize(itup); + + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(buildstate->giststate->tempCxt); + + if (buildstate->buildMode == GIST_BUFFERING_ACTIVE && + buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0) + { + /* Adjust the target buffer size now */ + buildstate->gfbb->pagesPerBuffer = + calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep); + } + + /* + * In 'auto' mode, check if the index has grown too large to fit in cache, + * and switch to buffering mode if it has. + * + * To avoid excessive calls to smgrnblocks(), only check this every + * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples. + * + * In 'stats' state, switch as soon as we have seen enough tuples to have + * some idea of the average tuple size. + */ + if ((buildstate->buildMode == GIST_BUFFERING_AUTO && + buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 && + effective_cache_size < smgrnblocks(index->rd_smgr, MAIN_FORKNUM)) || + (buildstate->buildMode == GIST_BUFFERING_STATS && + buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET)) + { + /* + * Index doesn't fit in effective cache anymore. Try to switch to + * buffering build mode. + */ + gistInitBuffering(buildstate); + } +} + +/* + * Insert function for buffering index build. + */ +static void +gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup) +{ + /* Insert the tuple to buffers. */ + gistProcessItup(buildstate, itup, 0, buildstate->gfbb->rootlevel); + + /* If we filled up (half of a) buffer, process buffer emptying. */ + gistProcessEmptyingQueue(buildstate); +} + +/* + * Process an index tuple. Runs the tuple down the tree until we reach a leaf + * page or node buffer, and inserts the tuple there. Returns true if we have + * to stop buffer emptying process (because one of child buffers can't take + * index tuples anymore). + */ +static bool +gistProcessItup(GISTBuildState *buildstate, IndexTuple itup, + BlockNumber startblkno, int startlevel) +{ + GISTSTATE *giststate = buildstate->giststate; + GISTBuildBuffers *gfbb = buildstate->gfbb; + Relation indexrel = buildstate->indexrel; + BlockNumber childblkno; + Buffer buffer; + bool result = false; + BlockNumber blkno; + int level; + OffsetNumber downlinkoffnum = InvalidOffsetNumber; + BlockNumber parentblkno = InvalidBlockNumber; + + CHECK_FOR_INTERRUPTS(); + + /* + * Loop until we reach a leaf page (level == 0) or a level with buffers + * (not including the level we start at, because we would otherwise make + * no progress). + */ + blkno = startblkno; + level = startlevel; + for (;;) + { + ItemId iid; + IndexTuple idxtuple, + newtup; + Page page; + OffsetNumber childoffnum; + + /* Have we reached a level with buffers? */ + if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel) + break; + + /* Have we reached a leaf page? */ + if (level == 0) + break; + + /* + * Nope. Descend down to the next level then. Choose a child to + * descend down to. 
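+		 *
+		 * As in a regular insert, gistchoose() picks the downlink whose key
+		 * needs the least penalty increase to absorb the new tuple.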
+ */ + + buffer = ReadBuffer(indexrel, blkno); + LockBuffer(buffer, GIST_EXCLUSIVE); + + page = (Page) BufferGetPage(buffer); + childoffnum = gistchoose(indexrel, page, itup, giststate); + iid = PageGetItemId(page, childoffnum); + idxtuple = (IndexTuple) PageGetItem(page, iid); + childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); + + if (level > 1) + gistMemorizeParent(buildstate, childblkno, blkno); + + /* + * Check that the key representing the target child node is consistent + * with the key we're inserting. Update it if it's not. + */ + newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate); + if (newtup) + { + blkno = gistbufferinginserttuples(buildstate, + buffer, + level, + &newtup, + 1, + childoffnum, + InvalidBlockNumber, + InvalidOffsetNumber); + /* gistbufferinginserttuples() released the buffer */ + } + else + UnlockReleaseBuffer(buffer); + + /* Descend to the child */ + parentblkno = blkno; + blkno = childblkno; + downlinkoffnum = childoffnum; + Assert(level > 0); + level--; + } + + if (LEVEL_HAS_BUFFERS(level, gfbb)) + { + /* + * We've reached level with buffers. Place the index tuple to the + * buffer, and add the buffer to the emptying queue if it overflows. + */ + GISTNodeBuffer *childNodeBuffer; + + /* Find the buffer or create a new one */ + childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level); + + /* Add index tuple to it */ + gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup); + + if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb)) + result = true; + } + else + { + /* + * We've reached a leaf page. Place the tuple here. + */ + Assert(level == 0); + buffer = ReadBuffer(indexrel, blkno); + LockBuffer(buffer, GIST_EXCLUSIVE); + gistbufferinginserttuples(buildstate, buffer, level, + &itup, 1, InvalidOffsetNumber, + parentblkno, downlinkoffnum); + /* gistbufferinginserttuples() released the buffer */ + } + + return result; +} + +/* + * Insert tuples to a given page. + * + * This is analogous with gistinserttuples() in the regular insertion code. + * + * Returns the block number of the page where the (first) new or updated tuple + * was inserted. Usually that's the original page, but might be a sibling page + * if the original page was split. + * + * Caller should hold a lock on 'buffer' on entry. This function will unlock + * and unpin it. + */ +static BlockNumber +gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level, + IndexTuple *itup, int ntup, OffsetNumber oldoffnum, + BlockNumber parentblk, OffsetNumber downlinkoffnum) +{ + GISTBuildBuffers *gfbb = buildstate->gfbb; + List *splitinfo; + bool is_split; + BlockNumber placed_to_blk = InvalidBlockNumber; + + is_split = gistplacetopage(buildstate->indexrel, + buildstate->freespace, + buildstate->giststate, + buffer, + itup, ntup, oldoffnum, &placed_to_blk, + InvalidBuffer, + &splitinfo, + false, + buildstate->heaprel, true); + + /* + * If this is a root split, update the root path item kept in memory. This + * ensures that all path stacks are always complete, including all parent + * nodes up to the root. That simplifies the algorithm to re-find correct + * parent. + */ + if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO) + { + Page page = BufferGetPage(buffer); + OffsetNumber off; + OffsetNumber maxoff; + + Assert(level == gfbb->rootlevel); + gfbb->rootlevel++; + + elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel); + + /* + * All the downlinks on the old root page are now on one of the child + * pages. 
Visit all the new child pages to memorize the parents of the + * grandchildren. + */ + if (gfbb->rootlevel > 1) + { + maxoff = PageGetMaxOffsetNumber(page); + for (off = FirstOffsetNumber; off <= maxoff; off++) + { + ItemId iid = PageGetItemId(page, off); + IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); + BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); + Buffer childbuf = ReadBuffer(buildstate->indexrel, childblkno); + + LockBuffer(childbuf, GIST_SHARE); + gistMemorizeAllDownlinks(buildstate, childbuf); + UnlockReleaseBuffer(childbuf); + + /* + * Also remember that the parent of the new child page is the + * root block. + */ + gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO); + } + } + } + + if (splitinfo) + { + /* + * Insert the downlinks to the parent. This is analogous with + * gistfinishsplit() in the regular insertion code, but the locking is + * simpler, and we have to maintain the buffers on internal nodes and + * the parent map. + */ + IndexTuple *downlinks; + int ndownlinks, + i; + Buffer parentBuffer; + ListCell *lc; + + /* Parent may have changed since we memorized this path. */ + parentBuffer = + gistBufferingFindCorrectParent(buildstate, + BufferGetBlockNumber(buffer), + level, + &parentblk, + &downlinkoffnum); + + /* + * If there's a buffer associated with this page, that needs to be + * split too. gistRelocateBuildBuffersOnSplit() will also adjust the + * downlinks in 'splitinfo', to make sure they're consistent not only + * with the tuples already on the pages, but also the tuples in the + * buffers that will eventually be inserted to them. + */ + gistRelocateBuildBuffersOnSplit(gfbb, + buildstate->giststate, + buildstate->indexrel, + level, + buffer, splitinfo); + + /* Create an array of all the downlink tuples */ + ndownlinks = list_length(splitinfo); + downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks); + i = 0; + foreach(lc, splitinfo) + { + GISTPageSplitInfo *splitinfo = lfirst(lc); + + /* + * Remember the parent of each new child page in our parent map. + * This assumes that the downlinks fit on the parent page. If the + * parent page is split, too, when we recurse up to insert the + * downlinks, the recursive gistbufferinginserttuples() call will + * update the map again. + */ + if (level > 0) + gistMemorizeParent(buildstate, + BufferGetBlockNumber(splitinfo->buf), + BufferGetBlockNumber(parentBuffer)); + + /* + * Also update the parent map for all the downlinks that got moved + * to a different page. (actually this also loops through the + * downlinks that stayed on the original page, but it does no + * harm). + */ + if (level > 1) + gistMemorizeAllDownlinks(buildstate, splitinfo->buf); + + /* + * Since there's no concurrent access, we can release the lower + * level buffers immediately. This includes the original page. + */ + UnlockReleaseBuffer(splitinfo->buf); + downlinks[i++] = splitinfo->downlink; + } + + /* Insert them into parent. */ + gistbufferinginserttuples(buildstate, parentBuffer, level + 1, + downlinks, ndownlinks, downlinkoffnum, + InvalidBlockNumber, InvalidOffsetNumber); + + list_free_deep(splitinfo); /* we don't need this anymore */ + } + else + UnlockReleaseBuffer(buffer); + + return placed_to_blk; +} + +/* + * Find the downlink pointing to a child page. + * + * 'childblkno' indicates the child page to find the parent for. 'level' is + * the level of the child. 
On entry, *parentblkno and *downlinkoffnum can + * point to a location where the downlink used to be - we will check that + * location first, and save some cycles if it hasn't moved. The function + * returns a buffer containing the downlink, exclusively-locked, and + * *parentblkno and *downlinkoffnum are set to the real location of the + * downlink. + * + * If the child page is a leaf (level == 0), the caller must supply a correct + * parentblkno. Otherwise we use the parent map hash table to find the parent + * block. + * + * This function serves the same purpose as gistFindCorrectParent() during + * normal index inserts, but this is simpler because we don't need to deal + * with concurrent inserts. + */ +static Buffer +gistBufferingFindCorrectParent(GISTBuildState *buildstate, + BlockNumber childblkno, int level, + BlockNumber *parentblkno, + OffsetNumber *downlinkoffnum) +{ + BlockNumber parent; + Buffer buffer; + Page page; + OffsetNumber maxoff; + OffsetNumber off; + + if (level > 0) + parent = gistGetParent(buildstate, childblkno); + else + { + /* + * For a leaf page, the caller must supply a correct parent block + * number. + */ + if (*parentblkno == InvalidBlockNumber) + elog(ERROR, "no parent buffer provided of child %u", childblkno); + parent = *parentblkno; + } + + buffer = ReadBuffer(buildstate->indexrel, parent); + page = BufferGetPage(buffer); + LockBuffer(buffer, GIST_EXCLUSIVE); + gistcheckpage(buildstate->indexrel, buffer); + maxoff = PageGetMaxOffsetNumber(page); + + /* Check if it was not moved */ + if (parent == *parentblkno && *parentblkno != InvalidBlockNumber && + *downlinkoffnum != InvalidOffsetNumber && *downlinkoffnum <= maxoff) + { + ItemId iid = PageGetItemId(page, *downlinkoffnum); + IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); + + if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno) + { + /* Still there */ + return buffer; + } + } + + /* + * Downlink was not at the offset where it used to be. Scan the page to + * find it. During normal gist insertions, it might've moved to another + * page, to the right, but during a buffering build, we keep track of the + * parent of each page in the lookup table so we should always know what + * page it's on. + */ + for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off)) + { + ItemId iid = PageGetItemId(page, off); + IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); + + if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno) + { + /* yes!!, found it */ + *downlinkoffnum = off; + return buffer; + } + } + + elog(ERROR, "failed to re-find parent for block %u", childblkno); + return InvalidBuffer; /* keep compiler quiet */ +} + +/* + * Process buffers emptying stack. Emptying of one buffer can cause emptying + * of other buffers. This function iterates until this cascading emptying + * process finished, e.g. until buffers emptying stack is empty. + */ +static void +gistProcessEmptyingQueue(GISTBuildState *buildstate) +{ + GISTBuildBuffers *gfbb = buildstate->gfbb; + + /* Iterate while we have elements in buffers emptying stack. */ + while (gfbb->bufferEmptyingQueue != NIL) + { + GISTNodeBuffer *emptyingNodeBuffer; + + /* Get node buffer from emptying stack. */ + emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue); + gfbb->bufferEmptyingQueue = list_delete_first(gfbb->bufferEmptyingQueue); + emptyingNodeBuffer->queuedForEmptying = false; + + /* + * We are going to load last pages of buffers where emptying will be + * to. 
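+		 * (That is, emptying this buffer pulls the last pages of its target
+		 * child buffers into memory.)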
So let's unload any previously loaded buffers. + */ + gistUnloadNodeBuffers(gfbb); + + /* + * Pop tuples from the buffer and run them down to the buffers at + * lower level, or leaf pages. We continue until one of the lower + * level buffers fills up, or this buffer runs empty. + * + * In Arge et al's paper, the buffer emptying is stopped after + * processing 1/2 node buffer worth of tuples, to avoid overfilling + * any of the lower level buffers. However, it's more efficient to + * keep going until one of the lower level buffers actually fills up, + * so that's what we do. This doesn't need to be exact, if a buffer + * overfills by a few tuples, there's no harm done. + */ + while (true) + { + IndexTuple itup; + + /* Get next index tuple from the buffer */ + if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup)) + break; + + /* + * Run it down to the underlying node buffer or leaf page. + * + * Note: it's possible that the buffer we're emptying splits as a + * result of this call. If that happens, our emptyingNodeBuffer + * points to the left half of the split. After split, it's very + * likely that the new left buffer is no longer over the half-full + * threshold, but we might as well keep flushing tuples from it + * until we fill a lower-level buffer. + */ + if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->nodeBlocknum, emptyingNodeBuffer->level)) + { + /* + * A lower level buffer filled up. Stop emptying this buffer, + * to avoid overflowing the lower level buffer. + */ + break; + } + + /* Free all the memory allocated during index tuple processing */ + MemoryContextReset(buildstate->giststate->tempCxt); + } + } +} + +/* + * Empty all node buffers, from top to bottom. This is done at the end of + * index build to flush all remaining tuples to the index. + * + * Note: This destroys the buffersOnLevels lists, so the buffers should not + * be inserted to after this call. + */ +static void +gistEmptyAllBuffers(GISTBuildState *buildstate) +{ + GISTBuildBuffers *gfbb = buildstate->gfbb; + MemoryContext oldCtx; + int i; + + oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt); + + /* + * Iterate through the levels from top to bottom. + */ + for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--) + { + /* + * Empty all buffers on this level. Note that new buffers can pop up + * in the list during the processing, as a result of page splits, so a + * simple walk through the list won't work. We remove buffers from the + * list when we see them empty; a buffer can't become non-empty once + * it's been fully emptied. + */ + while (gfbb->buffersOnLevels[i] != NIL) + { + GISTNodeBuffer *nodeBuffer; + + nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]); + + if (nodeBuffer->blocksCount != 0) + { + /* + * Add this buffer to the emptying queue, and proceed to empty + * the queue. + */ + if (!nodeBuffer->queuedForEmptying) + { + MemoryContextSwitchTo(gfbb->context); + nodeBuffer->queuedForEmptying = true; + gfbb->bufferEmptyingQueue = + lcons(nodeBuffer, gfbb->bufferEmptyingQueue); + MemoryContextSwitchTo(buildstate->giststate->tempCxt); + } + gistProcessEmptyingQueue(buildstate); + } + else + gfbb->buffersOnLevels[i] = + list_delete_first(gfbb->buffersOnLevels[i]); + } + elog(DEBUG2, "emptied all buffers at level %d", i); + } + MemoryContextSwitchTo(oldCtx); +} + +/* + * Get the depth of the GiST index. 
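+ *
+ * Leaf pages count as level 0, so an index whose root is still a leaf has
+ * depth 0; in general, the root page sits at the returned level.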
+ */ +static int +gistGetMaxLevel(Relation index) +{ + int maxLevel; + BlockNumber blkno; + + /* + * Traverse down the tree, starting from the root, until we hit the leaf + * level. + */ + maxLevel = 0; + blkno = GIST_ROOT_BLKNO; + while (true) + { + Buffer buffer; + Page page; + IndexTuple itup; + + buffer = ReadBuffer(index, blkno); + + /* + * There's no concurrent access during index build, so locking is just + * pro forma. + */ + LockBuffer(buffer, GIST_SHARE); + page = (Page) BufferGetPage(buffer); + + if (GistPageIsLeaf(page)) + { + /* We hit the bottom, so we're done. */ + UnlockReleaseBuffer(buffer); + break; + } + + /* + * Pick the first downlink on the page, and follow it. It doesn't + * matter which downlink we choose, the tree has the same depth + * everywhere, so we just pick the first one. + */ + itup = (IndexTuple) PageGetItem(page, + PageGetItemId(page, FirstOffsetNumber)); + blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); + UnlockReleaseBuffer(buffer); + + /* + * We're going down on the tree. It means that there is yet one more + * level in the tree. + */ + maxLevel++; + } + return maxLevel; +} + + +/* + * Routines for managing the parent map. + * + * Whenever a page is split, we need to insert the downlinks into the parent. + * We need to somehow find the parent page to do that. In normal insertions, + * we keep a stack of nodes visited when we descend the tree. However, in + * buffering build, we can start descending the tree from any internal node, + * when we empty a buffer by cascading tuples to its children. So we don't + * have a full stack up to the root available at that time. + * + * So instead, we maintain a hash table to track the parent of every internal + * page. We don't need to track the parents of leaf nodes, however. Whenever + * we insert to a leaf, we've just descended down from its parent, so we know + * its immediate parent already. This helps a lot to limit the memory used + * by this hash table. + * + * Whenever an internal node is split, the parent map needs to be updated. + * the parent of the new child page needs to be recorded, and also the + * entries for all page whose downlinks are moved to a new page at the split + * needs to be updated. + * + * We also update the parent map whenever we descend the tree. That might seem + * unnecessary, because we maintain the map whenever a downlink is moved or + * created, but it is needed because we switch to buffering mode after + * creating a tree with regular index inserts. Any pages created before + * switching to buffering mode will not be present in the parent map initially, + * but will be added there the first time we visit them. + */ + +typedef struct +{ + BlockNumber childblkno; /* hash key */ + BlockNumber parentblkno; +} ParentMapEntry; + +static void +gistInitParentMap(GISTBuildState *buildstate) +{ + HASHCTL hashCtl; + + hashCtl.keysize = sizeof(BlockNumber); + hashCtl.entrysize = sizeof(ParentMapEntry); + hashCtl.hcxt = CurrentMemoryContext; + buildstate->parentMap = hash_create("gistbuild parent map", + 1024, + &hashCtl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); +} + +static void +gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent) +{ + ParentMapEntry *entry; + bool found; + + entry = (ParentMapEntry *) hash_search(buildstate->parentMap, + (const void *) &child, + HASH_ENTER, + &found); + entry->parentblkno = parent; +} + +/* + * Scan all downlinks on a page, and memorize their parent. 
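+ *
+ * The given page must be an internal page; each downlink's child block is
+ * recorded in the parent map with this page as its parent.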
+ */ +static void +gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parentbuf) +{ + OffsetNumber maxoff; + OffsetNumber off; + BlockNumber parentblkno = BufferGetBlockNumber(parentbuf); + Page page = BufferGetPage(parentbuf); + + Assert(!GistPageIsLeaf(page)); + + maxoff = PageGetMaxOffsetNumber(page); + for (off = FirstOffsetNumber; off <= maxoff; off++) + { + ItemId iid = PageGetItemId(page, off); + IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); + BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); + + gistMemorizeParent(buildstate, childblkno, parentblkno); + } +} + +static BlockNumber +gistGetParent(GISTBuildState *buildstate, BlockNumber child) +{ + ParentMapEntry *entry; + bool found; + + /* Find node buffer in hash table */ + entry = (ParentMapEntry *) hash_search(buildstate->parentMap, + (const void *) &child, + HASH_FIND, + &found); + if (!found) + elog(ERROR, "could not find parent of block %u in lookup table", child); + + return entry->parentblkno; +} diff --git a/src/backend/access/gist/gistbuildbuffers.c b/src/backend/access/gist/gistbuildbuffers.c new file mode 100644 index 0000000..95cc334 --- /dev/null +++ b/src/backend/access/gist/gistbuildbuffers.c @@ -0,0 +1,775 @@ +/*------------------------------------------------------------------------- + * + * gistbuildbuffers.c + * node buffer management functions for GiST buffering build algorithm. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gist/gistbuildbuffers.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/gist_private.h" +#include "catalog/index.h" +#include "miscadmin.h" +#include "storage/buffile.h" +#include "storage/bufmgr.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +static GISTNodeBufferPage *gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb); +static void gistAddLoadedBuffer(GISTBuildBuffers *gfbb, + GISTNodeBuffer *nodeBuffer); +static void gistLoadNodeBuffer(GISTBuildBuffers *gfbb, + GISTNodeBuffer *nodeBuffer); +static void gistUnloadNodeBuffer(GISTBuildBuffers *gfbb, + GISTNodeBuffer *nodeBuffer); +static void gistPlaceItupToPage(GISTNodeBufferPage *pageBuffer, + IndexTuple item); +static void gistGetItupFromPage(GISTNodeBufferPage *pageBuffer, + IndexTuple *item); +static long gistBuffersGetFreeBlock(GISTBuildBuffers *gfbb); +static void gistBuffersReleaseBlock(GISTBuildBuffers *gfbb, long blocknum); + +static void ReadTempFileBlock(BufFile *file, long blknum, void *ptr); +static void WriteTempFileBlock(BufFile *file, long blknum, void *ptr); + + +/* + * Initialize GiST build buffers. + */ +GISTBuildBuffers * +gistInitBuildBuffers(int pagesPerBuffer, int levelStep, int maxLevel) +{ + GISTBuildBuffers *gfbb; + HASHCTL hashCtl; + + gfbb = palloc(sizeof(GISTBuildBuffers)); + gfbb->pagesPerBuffer = pagesPerBuffer; + gfbb->levelStep = levelStep; + + /* + * Create a temporary file to hold buffer pages that are swapped out of + * memory. + */ + gfbb->pfile = BufFileCreateTemp(false); + gfbb->nFileBlocks = 0; + + /* Initialize free page management. 
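+	 * The free-block list starts empty, with room for 32 entries; it is
+	 * enlarged on demand as temp-file blocks are released for reuse.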
*/ + gfbb->nFreeBlocks = 0; + gfbb->freeBlocksLen = 32; + gfbb->freeBlocks = (long *) palloc(gfbb->freeBlocksLen * sizeof(long)); + + /* + * Current memory context will be used for all in-memory data structures + * of buffers which are persistent during buffering build. + */ + gfbb->context = CurrentMemoryContext; + + /* + * nodeBuffersTab hash is association between index blocks and it's + * buffers. + */ + hashCtl.keysize = sizeof(BlockNumber); + hashCtl.entrysize = sizeof(GISTNodeBuffer); + hashCtl.hcxt = CurrentMemoryContext; + gfbb->nodeBuffersTab = hash_create("gistbuildbuffers", + 1024, + &hashCtl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + gfbb->bufferEmptyingQueue = NIL; + + /* + * Per-level node buffers lists for final buffers emptying process. Node + * buffers are inserted here when they are created. + */ + gfbb->buffersOnLevelsLen = 1; + gfbb->buffersOnLevels = (List **) palloc(sizeof(List *) * + gfbb->buffersOnLevelsLen); + gfbb->buffersOnLevels[0] = NIL; + + /* + * Block numbers of node buffers which last pages are currently loaded + * into main memory. + */ + gfbb->loadedBuffersLen = 32; + gfbb->loadedBuffers = (GISTNodeBuffer **) palloc(gfbb->loadedBuffersLen * + sizeof(GISTNodeBuffer *)); + gfbb->loadedBuffersCount = 0; + + gfbb->rootlevel = maxLevel; + + return gfbb; +} + +/* + * Returns a node buffer for given block. The buffer is created if it + * doesn't exist yet. + */ +GISTNodeBuffer * +gistGetNodeBuffer(GISTBuildBuffers *gfbb, GISTSTATE *giststate, + BlockNumber nodeBlocknum, int level) +{ + GISTNodeBuffer *nodeBuffer; + bool found; + + /* Find node buffer in hash table */ + nodeBuffer = (GISTNodeBuffer *) hash_search(gfbb->nodeBuffersTab, + (const void *) &nodeBlocknum, + HASH_ENTER, + &found); + if (!found) + { + /* + * Node buffer wasn't found. Initialize the new buffer as empty. + */ + MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context); + + /* nodeBuffer->nodeBlocknum is the hash key and was filled in already */ + nodeBuffer->blocksCount = 0; + nodeBuffer->pageBlocknum = InvalidBlockNumber; + nodeBuffer->pageBuffer = NULL; + nodeBuffer->queuedForEmptying = false; + nodeBuffer->isTemp = false; + nodeBuffer->level = level; + + /* + * Add this buffer to the list of buffers on this level. Enlarge + * buffersOnLevels array if needed. + */ + if (level >= gfbb->buffersOnLevelsLen) + { + int i; + + gfbb->buffersOnLevels = + (List **) repalloc(gfbb->buffersOnLevels, + (level + 1) * sizeof(List *)); + + /* initialize the enlarged portion */ + for (i = gfbb->buffersOnLevelsLen; i <= level; i++) + gfbb->buffersOnLevels[i] = NIL; + gfbb->buffersOnLevelsLen = level + 1; + } + + /* + * Prepend the new buffer to the list of buffers on this level. It's + * not arbitrary that the new buffer is put to the beginning of the + * list: in the final emptying phase we loop through all buffers at + * each level, and flush them. If a page is split during the emptying, + * it's more efficient to flush the new splitted pages first, before + * moving on to pre-existing pages on the level. The buffers just + * created during the page split are likely still in cache, so + * flushing them immediately is more efficient than putting them to + * the end of the queue. + */ + gfbb->buffersOnLevels[level] = lcons(nodeBuffer, + gfbb->buffersOnLevels[level]); + + MemoryContextSwitchTo(oldcxt); + } + + return nodeBuffer; +} + +/* + * Allocate memory for a buffer page. 
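+ *
+ * The result is a zeroed BLCKSZ-sized chunk in gfbb->context, with the
+ * free-space counter initially covering everything after the page header.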
+ */ +static GISTNodeBufferPage * +gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb) +{ + GISTNodeBufferPage *pageBuffer; + + pageBuffer = (GISTNodeBufferPage *) MemoryContextAllocZero(gfbb->context, + BLCKSZ); + pageBuffer->prev = InvalidBlockNumber; + + /* Set page free space */ + PAGE_FREE_SPACE(pageBuffer) = BLCKSZ - BUFFER_PAGE_DATA_OFFSET; + return pageBuffer; +} + +/* + * Add specified buffer into loadedBuffers array. + */ +static void +gistAddLoadedBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer) +{ + /* Never add a temporary buffer to the array */ + if (nodeBuffer->isTemp) + return; + + /* Enlarge the array if needed */ + if (gfbb->loadedBuffersCount >= gfbb->loadedBuffersLen) + { + gfbb->loadedBuffersLen *= 2; + gfbb->loadedBuffers = (GISTNodeBuffer **) + repalloc(gfbb->loadedBuffers, + gfbb->loadedBuffersLen * sizeof(GISTNodeBuffer *)); + } + + gfbb->loadedBuffers[gfbb->loadedBuffersCount] = nodeBuffer; + gfbb->loadedBuffersCount++; +} + +/* + * Load last page of node buffer into main memory. + */ +static void +gistLoadNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer) +{ + /* Check if we really should load something */ + if (!nodeBuffer->pageBuffer && nodeBuffer->blocksCount > 0) + { + /* Allocate memory for page */ + nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb); + + /* Read block from temporary file */ + ReadTempFileBlock(gfbb->pfile, nodeBuffer->pageBlocknum, + nodeBuffer->pageBuffer); + + /* Mark file block as free */ + gistBuffersReleaseBlock(gfbb, nodeBuffer->pageBlocknum); + + /* Mark node buffer as loaded */ + gistAddLoadedBuffer(gfbb, nodeBuffer); + nodeBuffer->pageBlocknum = InvalidBlockNumber; + } +} + +/* + * Write last page of node buffer to the disk. + */ +static void +gistUnloadNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer) +{ + /* Check if we have something to write */ + if (nodeBuffer->pageBuffer) + { + BlockNumber blkno; + + /* Get free file block */ + blkno = gistBuffersGetFreeBlock(gfbb); + + /* Write block to the temporary file */ + WriteTempFileBlock(gfbb->pfile, blkno, nodeBuffer->pageBuffer); + + /* Free memory of that page */ + pfree(nodeBuffer->pageBuffer); + nodeBuffer->pageBuffer = NULL; + + /* Save block number */ + nodeBuffer->pageBlocknum = blkno; + } +} + +/* + * Write last pages of all node buffers to the disk. + */ +void +gistUnloadNodeBuffers(GISTBuildBuffers *gfbb) +{ + int i; + + /* Unload all the buffers that have a page loaded in memory. */ + for (i = 0; i < gfbb->loadedBuffersCount; i++) + gistUnloadNodeBuffer(gfbb, gfbb->loadedBuffers[i]); + + /* Now there are no node buffers with loaded last page */ + gfbb->loadedBuffersCount = 0; +} + +/* + * Add index tuple to buffer page. + */ +static void +gistPlaceItupToPage(GISTNodeBufferPage *pageBuffer, IndexTuple itup) +{ + Size itupsz = IndexTupleSize(itup); + char *ptr; + + /* There should be enough of space. */ + Assert(PAGE_FREE_SPACE(pageBuffer) >= MAXALIGN(itupsz)); + + /* Reduce free space value of page to reserve a spot for the tuple. */ + PAGE_FREE_SPACE(pageBuffer) -= MAXALIGN(itupsz); + + /* Get pointer to the spot we reserved (ie. end of free space). */ + ptr = (char *) pageBuffer + BUFFER_PAGE_DATA_OFFSET + + PAGE_FREE_SPACE(pageBuffer); + + /* Copy the index tuple there. */ + memcpy(ptr, itup, itupsz); +} + +/* + * Get last item from buffer page and remove it from page. 
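+ *
+ * Tuples are stacked from the end of the page towards the header, so the
+ * most recently added tuple sits at the free-space boundary; a buffer page
+ * therefore behaves as a LIFO stack.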
+ */ +static void +gistGetItupFromPage(GISTNodeBufferPage *pageBuffer, IndexTuple *itup) +{ + IndexTuple ptr; + Size itupsz; + + Assert(!PAGE_IS_EMPTY(pageBuffer)); /* Page shouldn't be empty */ + + /* Get pointer to last index tuple */ + ptr = (IndexTuple) ((char *) pageBuffer + + BUFFER_PAGE_DATA_OFFSET + + PAGE_FREE_SPACE(pageBuffer)); + itupsz = IndexTupleSize(ptr); + + /* Make a copy of the tuple */ + *itup = (IndexTuple) palloc(itupsz); + memcpy(*itup, ptr, itupsz); + + /* Mark the space used by the tuple as free */ + PAGE_FREE_SPACE(pageBuffer) += MAXALIGN(itupsz); +} + +/* + * Push an index tuple to node buffer. + */ +void +gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer, + IndexTuple itup) +{ + /* + * Most part of memory operations will be in buffering build persistent + * context. So, let's switch to it. + */ + MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context); + + /* + * If the buffer is currently empty, create the first page. + */ + if (nodeBuffer->blocksCount == 0) + { + nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb); + nodeBuffer->blocksCount = 1; + gistAddLoadedBuffer(gfbb, nodeBuffer); + } + + /* Load last page of node buffer if it wasn't in memory already */ + if (!nodeBuffer->pageBuffer) + gistLoadNodeBuffer(gfbb, nodeBuffer); + + /* + * Check if there is enough space on the last page for the tuple. + */ + if (PAGE_NO_SPACE(nodeBuffer->pageBuffer, itup)) + { + /* + * Nope. Swap previous block to disk and allocate a new one. + */ + BlockNumber blkno; + + /* Write filled page to the disk */ + blkno = gistBuffersGetFreeBlock(gfbb); + WriteTempFileBlock(gfbb->pfile, blkno, nodeBuffer->pageBuffer); + + /* + * Reset the in-memory page as empty, and link the previous block to + * the new page by storing its block number in the prev-link. + */ + PAGE_FREE_SPACE(nodeBuffer->pageBuffer) = + BLCKSZ - MAXALIGN(offsetof(GISTNodeBufferPage, tupledata)); + nodeBuffer->pageBuffer->prev = blkno; + + /* We've just added one more page */ + nodeBuffer->blocksCount++; + } + + gistPlaceItupToPage(nodeBuffer->pageBuffer, itup); + + /* + * If the buffer just overflowed, add it to the emptying queue. + */ + if (BUFFER_HALF_FILLED(nodeBuffer, gfbb) && !nodeBuffer->queuedForEmptying) + { + gfbb->bufferEmptyingQueue = lcons(nodeBuffer, + gfbb->bufferEmptyingQueue); + nodeBuffer->queuedForEmptying = true; + } + + /* Restore memory context */ + MemoryContextSwitchTo(oldcxt); +} + +/* + * Removes one index tuple from node buffer. Returns true if success and false + * if node buffer is empty. + */ +bool +gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer, + IndexTuple *itup) +{ + /* + * If node buffer is empty then return false. + */ + if (nodeBuffer->blocksCount <= 0) + return false; + + /* Load last page of node buffer if needed */ + if (!nodeBuffer->pageBuffer) + gistLoadNodeBuffer(gfbb, nodeBuffer); + + /* + * Get index tuple from last non-empty page. + */ + gistGetItupFromPage(nodeBuffer->pageBuffer, itup); + + /* + * If we just removed the last tuple from the page, fetch previous page on + * this node buffer (if any). + */ + if (PAGE_IS_EMPTY(nodeBuffer->pageBuffer)) + { + BlockNumber prevblkno; + + /* + * blocksCount includes the page in pageBuffer, so decrease it now. + */ + nodeBuffer->blocksCount--; + + /* + * If there's more pages, fetch previous one. + */ + prevblkno = nodeBuffer->pageBuffer->prev; + if (prevblkno != InvalidBlockNumber) + { + /* There is a previous page. Fetch it. 
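+			 * It is read back into the same in-memory page buffer,
+			 * replacing the page we have just emptied.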
*/ + Assert(nodeBuffer->blocksCount > 0); + ReadTempFileBlock(gfbb->pfile, prevblkno, nodeBuffer->pageBuffer); + + /* + * Now that we've read the block in memory, we can release its + * on-disk block for reuse. + */ + gistBuffersReleaseBlock(gfbb, prevblkno); + } + else + { + /* No more pages. Free memory. */ + Assert(nodeBuffer->blocksCount == 0); + pfree(nodeBuffer->pageBuffer); + nodeBuffer->pageBuffer = NULL; + } + } + return true; +} + +/* + * Select a currently unused block for writing to. + */ +static long +gistBuffersGetFreeBlock(GISTBuildBuffers *gfbb) +{ + /* + * If there are multiple free blocks, we select the one appearing last in + * freeBlocks[]. If there are none, assign the next block at the end of + * the file (causing the file to be extended). + */ + if (gfbb->nFreeBlocks > 0) + return gfbb->freeBlocks[--gfbb->nFreeBlocks]; + else + return gfbb->nFileBlocks++; +} + +/* + * Return a block# to the freelist. + */ +static void +gistBuffersReleaseBlock(GISTBuildBuffers *gfbb, long blocknum) +{ + int ndx; + + /* Enlarge freeBlocks array if full. */ + if (gfbb->nFreeBlocks >= gfbb->freeBlocksLen) + { + gfbb->freeBlocksLen *= 2; + gfbb->freeBlocks = (long *) repalloc(gfbb->freeBlocks, + gfbb->freeBlocksLen * + sizeof(long)); + } + + /* Add blocknum to array */ + ndx = gfbb->nFreeBlocks++; + gfbb->freeBlocks[ndx] = blocknum; +} + +/* + * Free buffering build data structure. + */ +void +gistFreeBuildBuffers(GISTBuildBuffers *gfbb) +{ + /* Close buffers file. */ + BufFileClose(gfbb->pfile); + + /* All other things will be freed on memory context release */ +} + +/* + * Data structure representing information about node buffer for index tuples + * relocation from splitted node buffer. + */ +typedef struct +{ + GISTENTRY entry[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + GISTPageSplitInfo *splitinfo; + GISTNodeBuffer *nodeBuffer; +} RelocationBufferInfo; + +/* + * At page split, distribute tuples from the buffer of the split page to + * new buffers for the created page halves. This also adjusts the downlinks + * in 'splitinfo' to include the tuples in the buffers. + */ +void +gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate, + Relation r, int level, + Buffer buffer, List *splitinfo) +{ + RelocationBufferInfo *relocationBuffersInfos; + bool found; + GISTNodeBuffer *nodeBuffer; + BlockNumber blocknum; + IndexTuple itup; + int splitPagesCount = 0, + i; + GISTENTRY entry[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + GISTNodeBuffer oldBuf; + ListCell *lc; + + /* If the splitted page doesn't have buffers, we have nothing to do. */ + if (!LEVEL_HAS_BUFFERS(level, gfbb)) + return; + + /* + * Get the node buffer of the splitted page. + */ + blocknum = BufferGetBlockNumber(buffer); + nodeBuffer = hash_search(gfbb->nodeBuffersTab, &blocknum, + HASH_FIND, &found); + if (!found) + { + /* The page has no buffer, so we have nothing to do. */ + return; + } + + /* + * Make a copy of the old buffer, as we're going reuse it as the buffer + * for the new left page, which is on the same block as the old page. + * That's not true for the root page, but that's fine because we never + * have a buffer on the root page anyway. The original algorithm as + * described by Arge et al did, but it's of no use, as you might as well + * read the tuples straight from the heap instead of the root buffer. 
+ */ + Assert(blocknum != GIST_ROOT_BLKNO); + memcpy(&oldBuf, nodeBuffer, sizeof(GISTNodeBuffer)); + oldBuf.isTemp = true; + + /* Reset the old buffer, used for the new left page from now on */ + nodeBuffer->blocksCount = 0; + nodeBuffer->pageBuffer = NULL; + nodeBuffer->pageBlocknum = InvalidBlockNumber; + + /* + * Allocate memory for information about relocation buffers. + */ + splitPagesCount = list_length(splitinfo); + relocationBuffersInfos = + (RelocationBufferInfo *) palloc(sizeof(RelocationBufferInfo) * + splitPagesCount); + + /* + * Fill relocation buffers information for node buffers of pages produced + * by split. + */ + i = 0; + foreach(lc, splitinfo) + { + GISTPageSplitInfo *si = (GISTPageSplitInfo *) lfirst(lc); + GISTNodeBuffer *newNodeBuffer; + + /* Decompress parent index tuple of node buffer page. */ + gistDeCompressAtt(giststate, r, + si->downlink, NULL, (OffsetNumber) 0, + relocationBuffersInfos[i].entry, + relocationBuffersInfos[i].isnull); + + /* + * Create a node buffer for the page. The leftmost half is on the same + * block as the old page before split, so for the leftmost half this + * will return the original buffer. The tuples on the original buffer + * were relinked to the temporary buffer, so the original one is now + * empty. + */ + newNodeBuffer = gistGetNodeBuffer(gfbb, giststate, BufferGetBlockNumber(si->buf), level); + + relocationBuffersInfos[i].nodeBuffer = newNodeBuffer; + relocationBuffersInfos[i].splitinfo = si; + + i++; + } + + /* + * Loop through all index tuples in the buffer of the page being split, + * moving them to buffers for the new pages. We try to move each tuple to + * the page that will result in the lowest penalty for the leading column + * or, in the case of a tie, the lowest penalty for the earliest column + * that is not tied. + * + * The page searching logic is very similar to gistchoose(). + */ + while (gistPopItupFromNodeBuffer(gfbb, &oldBuf, &itup)) + { + float best_penalty[INDEX_MAX_KEYS]; + int i, + which; + IndexTuple newtup; + RelocationBufferInfo *targetBufferInfo; + + gistDeCompressAtt(giststate, r, + itup, NULL, (OffsetNumber) 0, entry, isnull); + + /* default to using first page (shouldn't matter) */ + which = 0; + + /* + * best_penalty[j] is the best penalty we have seen so far for column + * j, or -1 when we haven't yet examined column j. Array entries to + * the right of the first -1 are undefined. + */ + best_penalty[0] = -1; + + /* + * Loop over possible target pages, looking for one to move this tuple + * to. + */ + for (i = 0; i < splitPagesCount; i++) + { + RelocationBufferInfo *splitPageInfo = &relocationBuffersInfos[i]; + bool zero_penalty; + int j; + + zero_penalty = true; + + /* Loop over index attributes. */ + for (j = 0; j < IndexRelationGetNumberOfKeyAttributes(r); j++) + { + float usize; + + /* Compute penalty for this column. */ + usize = gistpenalty(giststate, j, + &splitPageInfo->entry[j], + splitPageInfo->isnull[j], + &entry[j], isnull[j]); + if (usize > 0) + zero_penalty = false; + + if (best_penalty[j] < 0 || usize < best_penalty[j]) + { + /* + * New best penalty for column. Tentatively select this + * page as the target, and record the best penalty. Then + * reset the next column's penalty to "unknown" (and + * indirectly, the same for all the ones to its right). + * This will force us to adopt this page's penalty values + * as the best for all the remaining columns during + * subsequent loop iterations. 
+ */ + which = i; + best_penalty[j] = usize; + + if (j < IndexRelationGetNumberOfKeyAttributes(r) - 1) + best_penalty[j + 1] = -1; + } + else if (best_penalty[j] == usize) + { + /* + * The current page is exactly as good for this column as + * the best page seen so far. The next iteration of this + * loop will compare the next column. + */ + } + else + { + /* + * The current page is worse for this column than the best + * page seen so far. Skip the remaining columns and move + * on to the next page, if any. + */ + zero_penalty = false; /* so outer loop won't exit */ + break; + } + } + + /* + * If we find a page with zero penalty for all columns, there's no + * need to examine remaining pages; just break out of the loop and + * return it. + */ + if (zero_penalty) + break; + } + + /* OK, "which" is the page index to push the tuple to */ + targetBufferInfo = &relocationBuffersInfos[which]; + + /* Push item to selected node buffer */ + gistPushItupToNodeBuffer(gfbb, targetBufferInfo->nodeBuffer, itup); + + /* Adjust the downlink for this page, if needed. */ + newtup = gistgetadjusted(r, targetBufferInfo->splitinfo->downlink, + itup, giststate); + if (newtup) + { + gistDeCompressAtt(giststate, r, + newtup, NULL, (OffsetNumber) 0, + targetBufferInfo->entry, + targetBufferInfo->isnull); + + targetBufferInfo->splitinfo->downlink = newtup; + } + } + + pfree(relocationBuffersInfos); +} + + +/* + * Wrappers around BufFile operations. The main difference is that these + * wrappers report errors with ereport(), so that the callers don't need + * to check the return code. + */ + +static void +ReadTempFileBlock(BufFile *file, long blknum, void *ptr) +{ + size_t nread; + + if (BufFileSeekBlock(file, blknum) != 0) + elog(ERROR, "could not seek to block %ld in temporary file", blknum); + nread = BufFileRead(file, ptr, BLCKSZ); + if (nread != BLCKSZ) + elog(ERROR, "could not read temporary file: read only %zu of %zu bytes", + nread, (size_t) BLCKSZ); +} + +static void +WriteTempFileBlock(BufFile *file, long blknum, void *ptr) +{ + if (BufFileSeekBlock(file, blknum) != 0) + elog(ERROR, "could not seek to block %ld in temporary file", blknum); + BufFileWrite(file, ptr, BLCKSZ); +} diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c new file mode 100644 index 0000000..c8f7e78 --- /dev/null +++ b/src/backend/access/gist/gistget.c @@ -0,0 +1,803 @@ +/*------------------------------------------------------------------------- + * + * gistget.c + * fetch tuples from a GiST scan. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gist/gistget.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/gist_private.h" +#include "access/relscan.h" +#include "lib/pairingheap.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "utils/float.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +/* + * gistkillitems() -- set LP_DEAD state for items an indexscan caller has + * told us were killed. + * + * We re-read page here, so it's important to check page LSN. 
If the page + * has been modified since the last read (as determined by LSN), we cannot + * flag any entries because it is possible that the old entry was vacuumed + * away and the TID was re-used by a completely different heap tuple. + */ +static void +gistkillitems(IndexScanDesc scan) +{ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId iid; + int i; + bool killedsomething = false; + + Assert(so->curBlkno != InvalidBlockNumber); + Assert(!XLogRecPtrIsInvalid(so->curPageLSN)); + Assert(so->killedItems != NULL); + + buffer = ReadBuffer(scan->indexRelation, so->curBlkno); + if (!BufferIsValid(buffer)) + return; + + LockBuffer(buffer, GIST_SHARE); + gistcheckpage(scan->indexRelation, buffer); + page = BufferGetPage(buffer); + + /* + * If page LSN differs it means that the page was modified since the last + * read. killedItems could be not valid so LP_DEAD hints applying is not + * safe. + */ + if (BufferGetLSNAtomic(buffer) != so->curPageLSN) + { + UnlockReleaseBuffer(buffer); + so->numKilled = 0; /* reset counter */ + return; + } + + Assert(GistPageIsLeaf(page)); + + /* + * Mark all killedItems as dead. We need no additional recheck, because, + * if page was modified, curPageLSN must have changed. + */ + for (i = 0; i < so->numKilled; i++) + { + offnum = so->killedItems[i]; + iid = PageGetItemId(page, offnum); + ItemIdMarkDead(iid); + killedsomething = true; + } + + if (killedsomething) + { + GistMarkPageHasGarbage(page); + MarkBufferDirtyHint(buffer, true); + } + + UnlockReleaseBuffer(buffer); + + /* + * Always reset the scan state, so we don't look for same items on other + * pages. + */ + so->numKilled = 0; +} + +/* + * gistindex_keytest() -- does this index tuple satisfy the scan key(s)? + * + * The index tuple might represent either a heap tuple or a lower index page, + * depending on whether the containing page is a leaf page or not. + * + * On success return for a heap tuple, *recheck_p is set to indicate whether + * the quals need to be rechecked. We recheck if any of the consistent() + * functions request it. recheck is not interesting when examining a non-leaf + * entry, since we must visit the lower index page if there's any doubt. + * Similarly, *recheck_distances_p is set to indicate whether the distances + * need to be rechecked, and it is also ignored for non-leaf entries. + * + * If we are doing an ordered scan, so->distances[] is filled with distance + * data from the distance() functions before returning success. + * + * We must decompress the key in the IndexTuple before passing it to the + * sk_funcs (which actually are the opclass Consistent or Distance methods). + * + * Note that this function is always invoked in a short-lived memory context, + * so we don't need to worry about cleaning up allocated memory, either here + * or in the implementation of any Consistent or Distance methods. + */ +static bool +gistindex_keytest(IndexScanDesc scan, + IndexTuple tuple, + Page page, + OffsetNumber offset, + bool *recheck_p, + bool *recheck_distances_p) +{ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + GISTSTATE *giststate = so->giststate; + ScanKey key = scan->keyData; + int keySize = scan->numberOfKeys; + IndexOrderByDistance *distance_p; + Relation r = scan->indexRelation; + + *recheck_p = false; + *recheck_distances_p = false; + + /* + * If it's a leftover invalid tuple from pre-9.1, treat it as a match with + * minimum possible distances. This means we'll always follow it to the + * referenced page. 
+ */ + if (GistTupleIsInvalid(tuple)) + { + int i; + + if (GistPageIsLeaf(page)) /* shouldn't happen */ + elog(ERROR, "invalid GiST tuple found on leaf page"); + for (i = 0; i < scan->numberOfOrderBys; i++) + { + so->distances[i].value = -get_float8_infinity(); + so->distances[i].isnull = false; + } + return true; + } + + /* Check whether it matches according to the Consistent functions */ + while (keySize > 0) + { + Datum datum; + bool isNull; + + datum = index_getattr(tuple, + key->sk_attno, + giststate->leafTupdesc, + &isNull); + + if (key->sk_flags & SK_ISNULL) + { + /* + * On non-leaf page we can't conclude that child hasn't NULL + * values because of assumption in GiST: union (VAL, NULL) is VAL. + * But if on non-leaf page key IS NULL, then all children are + * NULL. + */ + if (key->sk_flags & SK_SEARCHNULL) + { + if (GistPageIsLeaf(page) && !isNull) + return false; + } + else + { + Assert(key->sk_flags & SK_SEARCHNOTNULL); + if (isNull) + return false; + } + } + else if (isNull) + { + return false; + } + else + { + Datum test; + bool recheck; + GISTENTRY de; + + gistdentryinit(giststate, key->sk_attno - 1, &de, + datum, r, page, offset, + false, isNull); + + /* + * Call the Consistent function to evaluate the test. The + * arguments are the index datum (as a GISTENTRY*), the comparison + * datum, the comparison operator's strategy number and subtype + * from pg_amop, and the recheck flag. + * + * (Presently there's no need to pass the subtype since it'll + * always be zero, but might as well pass it for possible future + * use.) + * + * We initialize the recheck flag to true (the safest assumption) + * in case the Consistent function forgets to set it. + */ + recheck = true; + + test = FunctionCall5Coll(&key->sk_func, + key->sk_collation, + PointerGetDatum(&de), + key->sk_argument, + Int16GetDatum(key->sk_strategy), + ObjectIdGetDatum(key->sk_subtype), + PointerGetDatum(&recheck)); + + if (!DatumGetBool(test)) + return false; + *recheck_p |= recheck; + } + + key++; + keySize--; + } + + /* OK, it passes --- now let's compute the distances */ + key = scan->orderByData; + distance_p = so->distances; + keySize = scan->numberOfOrderBys; + while (keySize > 0) + { + Datum datum; + bool isNull; + + datum = index_getattr(tuple, + key->sk_attno, + giststate->leafTupdesc, + &isNull); + + if ((key->sk_flags & SK_ISNULL) || isNull) + { + /* Assume distance computes as null */ + distance_p->value = 0.0; + distance_p->isnull = true; + } + else + { + Datum dist; + bool recheck; + GISTENTRY de; + + gistdentryinit(giststate, key->sk_attno - 1, &de, + datum, r, page, offset, + false, isNull); + + /* + * Call the Distance function to evaluate the distance. The + * arguments are the index datum (as a GISTENTRY*), the comparison + * datum, the ordering operator's strategy number and subtype from + * pg_amop, and the recheck flag. + * + * (Presently there's no need to pass the subtype since it'll + * always be zero, but might as well pass it for possible future + * use.) + * + * If the function sets the recheck flag, the returned distance is + * a lower bound on the true distance and needs to be rechecked. + * We initialize the flag to 'false'. This flag was added in + * version 9.5; distance functions written before that won't know + * about the flag, but are expected to never be lossy. 
+ */ + recheck = false; + dist = FunctionCall5Coll(&key->sk_func, + key->sk_collation, + PointerGetDatum(&de), + key->sk_argument, + Int16GetDatum(key->sk_strategy), + ObjectIdGetDatum(key->sk_subtype), + PointerGetDatum(&recheck)); + *recheck_distances_p |= recheck; + distance_p->value = DatumGetFloat8(dist); + distance_p->isnull = false; + } + + key++; + distance_p++; + keySize--; + } + + return true; +} + +/* + * Scan all items on the GiST index page identified by *pageItem, and insert + * them into the queue (or directly to output areas) + * + * scan: index scan we are executing + * pageItem: search queue item identifying an index page to scan + * myDistances: distances array associated with pageItem, or NULL at the root + * tbm: if not NULL, gistgetbitmap's output bitmap + * ntids: if not NULL, gistgetbitmap's output tuple counter + * + * If tbm/ntids aren't NULL, we are doing an amgetbitmap scan, and heap + * tuples should be reported directly into the bitmap. If they are NULL, + * we're doing a plain or ordered indexscan. For a plain indexscan, heap + * tuple TIDs are returned into so->pageData[]. For an ordered indexscan, + * heap tuple TIDs are pushed into individual search queue items. In an + * index-only scan, reconstructed index tuples are returned along with the + * TIDs. + * + * If we detect that the index page has split since we saw its downlink + * in the parent, we push its new right sibling onto the queue so the + * sibling will be processed next. + */ +static void +gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, + IndexOrderByDistance *myDistances, TIDBitmap *tbm, int64 *ntids) +{ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + GISTSTATE *giststate = so->giststate; + Relation r = scan->indexRelation; + Buffer buffer; + Page page; + GISTPageOpaque opaque; + OffsetNumber maxoff; + OffsetNumber i; + MemoryContext oldcxt; + + Assert(!GISTSearchItemIsHeap(*pageItem)); + + buffer = ReadBuffer(scan->indexRelation, pageItem->blkno); + LockBuffer(buffer, GIST_SHARE); + PredicateLockPage(r, BufferGetBlockNumber(buffer), scan->xs_snapshot); + gistcheckpage(scan->indexRelation, buffer); + page = BufferGetPage(buffer); + TestForOldSnapshot(scan->xs_snapshot, r, page); + opaque = GistPageGetOpaque(page); + + /* + * Check if we need to follow the rightlink. We need to follow it if the + * page was concurrently split since we visited the parent (in which case + * parentlsn < nsn), or if the system crashed after a page split but + * before the downlink was inserted into the parent. + */ + if (!XLogRecPtrIsInvalid(pageItem->data.parentlsn) && + (GistFollowRight(page) || + pageItem->data.parentlsn < GistPageGetNSN(page)) && + opaque->rightlink != InvalidBlockNumber /* sanity check */ ) + { + /* There was a page split, follow right link to add pages */ + GISTSearchItem *item; + + /* This can't happen when starting at the root */ + Assert(myDistances != NULL); + + oldcxt = MemoryContextSwitchTo(so->queueCxt); + + /* Create new GISTSearchItem for the right sibling index page */ + item = palloc(SizeOfGISTSearchItem(scan->numberOfOrderBys)); + item->blkno = opaque->rightlink; + item->data.parentlsn = pageItem->data.parentlsn; + + /* Insert it into the queue using same distances as for this page */ + memcpy(item->distances, myDistances, + sizeof(item->distances[0]) * scan->numberOfOrderBys); + + pairingheap_add(so->queue, &item->phNode); + + MemoryContextSwitchTo(oldcxt); + } + + /* + * Check if the page was deleted after we saw the downlink. 
There's + * nothing of interest on a deleted page. Note that we must do this after + * checking the NSN for concurrent splits! It's possible that the page + * originally contained some tuples that are visible to us, but was split + * so that all the visible tuples were moved to another page, and then + * this page was deleted. + */ + if (GistPageIsDeleted(page)) + { + UnlockReleaseBuffer(buffer); + return; + } + + so->nPageData = so->curPageData = 0; + scan->xs_hitup = NULL; /* might point into pageDataCxt */ + if (so->pageDataCxt) + MemoryContextReset(so->pageDataCxt); + + /* + * We save the LSN of the page as we read it, so that we know whether it + * safe to apply LP_DEAD hints to the page later. This allows us to drop + * the pin for MVCC scans, which allows vacuum to avoid blocking. + */ + so->curPageLSN = BufferGetLSNAtomic(buffer); + + /* + * check all tuples on page + */ + maxoff = PageGetMaxOffsetNumber(page); + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + ItemId iid = PageGetItemId(page, i); + IndexTuple it; + bool match; + bool recheck; + bool recheck_distances; + + /* + * If the scan specifies not to return killed tuples, then we treat a + * killed tuple as not passing the qual. + */ + if (scan->ignore_killed_tuples && ItemIdIsDead(iid)) + continue; + + it = (IndexTuple) PageGetItem(page, iid); + + /* + * Must call gistindex_keytest in tempCxt, and clean up any leftover + * junk afterward. + */ + oldcxt = MemoryContextSwitchTo(so->giststate->tempCxt); + + match = gistindex_keytest(scan, it, page, i, + &recheck, &recheck_distances); + + MemoryContextSwitchTo(oldcxt); + MemoryContextReset(so->giststate->tempCxt); + + /* Ignore tuple if it doesn't match */ + if (!match) + continue; + + if (tbm && GistPageIsLeaf(page)) + { + /* + * getbitmap scan, so just push heap tuple TIDs into the bitmap + * without worrying about ordering + */ + tbm_add_tuples(tbm, &it->t_tid, 1, recheck); + (*ntids)++; + } + else if (scan->numberOfOrderBys == 0 && GistPageIsLeaf(page)) + { + /* + * Non-ordered scan, so report tuples in so->pageData[] + */ + so->pageData[so->nPageData].heapPtr = it->t_tid; + so->pageData[so->nPageData].recheck = recheck; + so->pageData[so->nPageData].offnum = i; + + /* + * In an index-only scan, also fetch the data from the tuple. The + * reconstructed tuples are stored in pageDataCxt. + */ + if (scan->xs_want_itup) + { + oldcxt = MemoryContextSwitchTo(so->pageDataCxt); + so->pageData[so->nPageData].recontup = + gistFetchTuple(giststate, r, it); + MemoryContextSwitchTo(oldcxt); + } + so->nPageData++; + } + else + { + /* + * Must push item into search queue. We get here for any lower + * index page, and also for heap tuples if doing an ordered + * search. + */ + GISTSearchItem *item; + int nOrderBys = scan->numberOfOrderBys; + + oldcxt = MemoryContextSwitchTo(so->queueCxt); + + /* Create new GISTSearchItem for this item */ + item = palloc(SizeOfGISTSearchItem(scan->numberOfOrderBys)); + + if (GistPageIsLeaf(page)) + { + /* Creating heap-tuple GISTSearchItem */ + item->blkno = InvalidBlockNumber; + item->data.heap.heapPtr = it->t_tid; + item->data.heap.recheck = recheck; + item->data.heap.recheckDistances = recheck_distances; + + /* + * In an index-only scan, also fetch the data from the tuple. 
+ */ + if (scan->xs_want_itup) + item->data.heap.recontup = gistFetchTuple(giststate, r, it); + } + else + { + /* Creating index-page GISTSearchItem */ + item->blkno = ItemPointerGetBlockNumber(&it->t_tid); + + /* + * LSN of current page is lsn of parent page for child. We + * only have a shared lock, so we need to get the LSN + * atomically. + */ + item->data.parentlsn = BufferGetLSNAtomic(buffer); + } + + /* Insert it into the queue using new distance data */ + memcpy(item->distances, so->distances, + sizeof(item->distances[0]) * nOrderBys); + + pairingheap_add(so->queue, &item->phNode); + + MemoryContextSwitchTo(oldcxt); + } + } + + UnlockReleaseBuffer(buffer); +} + +/* + * Extract next item (in order) from search queue + * + * Returns a GISTSearchItem or NULL. Caller must pfree item when done with it. + */ +static GISTSearchItem * +getNextGISTSearchItem(GISTScanOpaque so) +{ + GISTSearchItem *item; + + if (!pairingheap_is_empty(so->queue)) + { + item = (GISTSearchItem *) pairingheap_remove_first(so->queue); + } + else + { + /* Done when both heaps are empty */ + item = NULL; + } + + /* Return item; caller is responsible to pfree it */ + return item; +} + +/* + * Fetch next heap tuple in an ordered search + */ +static bool +getNextNearest(IndexScanDesc scan) +{ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + bool res = false; + + if (scan->xs_hitup) + { + /* free previously returned tuple */ + pfree(scan->xs_hitup); + scan->xs_hitup = NULL; + } + + do + { + GISTSearchItem *item = getNextGISTSearchItem(so); + + if (!item) + break; + + if (GISTSearchItemIsHeap(*item)) + { + /* found a heap item at currently minimal distance */ + scan->xs_heaptid = item->data.heap.heapPtr; + scan->xs_recheck = item->data.heap.recheck; + + index_store_float8_orderby_distances(scan, so->orderByTypes, + item->distances, + item->data.heap.recheckDistances); + + /* in an index-only scan, also return the reconstructed tuple. 
*/ + if (scan->xs_want_itup) + scan->xs_hitup = item->data.heap.recontup; + res = true; + } + else + { + /* visit an index page, extract its items into queue */ + CHECK_FOR_INTERRUPTS(); + + gistScanPage(scan, item, item->distances, NULL, NULL); + } + + pfree(item); + } while (!res); + + return res; +} + +/* + * gistgettuple() -- Get the next tuple in the scan + */ +bool +gistgettuple(IndexScanDesc scan, ScanDirection dir) +{ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + + if (dir != ForwardScanDirection) + elog(ERROR, "GiST only supports forward scan direction"); + + if (!so->qual_ok) + return false; + + if (so->firstCall) + { + /* Begin the scan by processing the root page */ + GISTSearchItem fakeItem; + + pgstat_count_index_scan(scan->indexRelation); + + so->firstCall = false; + so->curPageData = so->nPageData = 0; + scan->xs_hitup = NULL; + if (so->pageDataCxt) + MemoryContextReset(so->pageDataCxt); + + fakeItem.blkno = GIST_ROOT_BLKNO; + memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN)); + gistScanPage(scan, &fakeItem, NULL, NULL, NULL); + } + + if (scan->numberOfOrderBys > 0) + { + /* Must fetch tuples in strict distance order */ + return getNextNearest(scan); + } + else + { + /* Fetch tuples index-page-at-a-time */ + for (;;) + { + if (so->curPageData < so->nPageData) + { + if (scan->kill_prior_tuple && so->curPageData > 0) + { + + if (so->killedItems == NULL) + { + MemoryContext oldCxt = + MemoryContextSwitchTo(so->giststate->scanCxt); + + so->killedItems = + (OffsetNumber *) palloc(MaxIndexTuplesPerPage + * sizeof(OffsetNumber)); + + MemoryContextSwitchTo(oldCxt); + } + if (so->numKilled < MaxIndexTuplesPerPage) + so->killedItems[so->numKilled++] = + so->pageData[so->curPageData - 1].offnum; + } + /* continuing to return tuples from a leaf page */ + scan->xs_heaptid = so->pageData[so->curPageData].heapPtr; + scan->xs_recheck = so->pageData[so->curPageData].recheck; + + /* in an index-only scan, also return the reconstructed tuple */ + if (scan->xs_want_itup) + scan->xs_hitup = so->pageData[so->curPageData].recontup; + + so->curPageData++; + + return true; + } + + /* + * Check the last returned tuple and add it to killedItems if + * necessary + */ + if (scan->kill_prior_tuple + && so->curPageData > 0 + && so->curPageData == so->nPageData) + { + + if (so->killedItems == NULL) + { + MemoryContext oldCxt = + MemoryContextSwitchTo(so->giststate->scanCxt); + + so->killedItems = + (OffsetNumber *) palloc(MaxIndexTuplesPerPage + * sizeof(OffsetNumber)); + + MemoryContextSwitchTo(oldCxt); + } + if (so->numKilled < MaxIndexTuplesPerPage) + so->killedItems[so->numKilled++] = + so->pageData[so->curPageData - 1].offnum; + } + /* find and process the next index page */ + do + { + GISTSearchItem *item; + + if ((so->curBlkno != InvalidBlockNumber) && (so->numKilled > 0)) + gistkillitems(scan); + + item = getNextGISTSearchItem(so); + + if (!item) + return false; + + CHECK_FOR_INTERRUPTS(); + + /* save current item BlockNumber for next gistkillitems() call */ + so->curBlkno = item->blkno; + + /* + * While scanning a leaf page, ItemPointers of matching heap + * tuples are stored in so->pageData. If there are any on + * this page, we fall out of the inner "do" and loop around to + * return them. 
+ */ + gistScanPage(scan, item, item->distances, NULL, NULL); + + pfree(item); + } while (so->nPageData == 0); + } + } +} + +/* + * gistgetbitmap() -- Get a bitmap of all heap tuple locations + */ +int64 +gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) +{ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + int64 ntids = 0; + GISTSearchItem fakeItem; + + if (!so->qual_ok) + return 0; + + pgstat_count_index_scan(scan->indexRelation); + + /* Begin the scan by processing the root page */ + so->curPageData = so->nPageData = 0; + scan->xs_hitup = NULL; + if (so->pageDataCxt) + MemoryContextReset(so->pageDataCxt); + + fakeItem.blkno = GIST_ROOT_BLKNO; + memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN)); + gistScanPage(scan, &fakeItem, NULL, tbm, &ntids); + + /* + * While scanning a leaf page, ItemPointers of matching heap tuples will + * be stored directly into tbm, so we don't need to deal with them here. + */ + for (;;) + { + GISTSearchItem *item = getNextGISTSearchItem(so); + + if (!item) + break; + + CHECK_FOR_INTERRUPTS(); + + gistScanPage(scan, item, item->distances, tbm, &ntids); + + pfree(item); + } + + return ntids; +} + +/* + * Can we do index-only scans on the given index column? + * + * Opclasses that implement a fetch function support index-only scans. + * Opclasses without compression functions also support index-only scans. + * Included attributes always can be fetched for index-only scans. + */ +bool +gistcanreturn(Relation index, int attno) +{ + if (attno > IndexRelationGetNumberOfKeyAttributes(index) || + OidIsValid(index_getprocid(index, attno, GIST_FETCH_PROC)) || + !OidIsValid(index_getprocid(index, attno, GIST_COMPRESS_PROC))) + return true; + else + return false; +} diff --git a/src/backend/access/gist/gistproc.c b/src/backend/access/gist/gistproc.c new file mode 100644 index 0000000..d474612 --- /dev/null +++ b/src/backend/access/gist/gistproc.c @@ -0,0 +1,1777 @@ +/*------------------------------------------------------------------------- + * + * gistproc.c + * Support procedures for GiSTs over 2-D objects (boxes, polygons, circles, + * points). + * + * This gives R-tree behavior, with Guttman's poly-time split algorithm. 
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gist/gistproc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include + +#include "access/gist.h" +#include "access/stratnum.h" +#include "utils/builtins.h" +#include "utils/float.h" +#include "utils/geo_decls.h" +#include "utils/sortsupport.h" + + +static bool gist_box_leaf_consistent(BOX *key, BOX *query, + StrategyNumber strategy); +static bool rtree_internal_consistent(BOX *key, BOX *query, + StrategyNumber strategy); + +static uint64 point_zorder_internal(float4 x, float4 y); +static uint64 part_bits32_by2(uint32 x); +static uint32 ieee_float32_to_uint32(float f); +static int gist_bbox_zorder_cmp(Datum a, Datum b, SortSupport ssup); +static Datum gist_bbox_zorder_abbrev_convert(Datum original, SortSupport ssup); +static int gist_bbox_zorder_cmp_abbrev(Datum z1, Datum z2, SortSupport ssup); +static bool gist_bbox_zorder_abbrev_abort(int memtupcount, SortSupport ssup); + + +/* Minimum accepted ratio of split */ +#define LIMIT_RATIO 0.3 + + +/************************************************** + * Box ops + **************************************************/ + +/* + * Calculates union of two boxes, a and b. The result is stored in *n. + */ +static void +rt_box_union(BOX *n, const BOX *a, const BOX *b) +{ + n->high.x = float8_max(a->high.x, b->high.x); + n->high.y = float8_max(a->high.y, b->high.y); + n->low.x = float8_min(a->low.x, b->low.x); + n->low.y = float8_min(a->low.y, b->low.y); +} + +/* + * Size of a BOX for penalty-calculation purposes. + * The result can be +Infinity, but not NaN. + */ +static float8 +size_box(const BOX *box) +{ + /* + * Check for zero-width cases. Note that we define the size of a zero- + * by-infinity box as zero. It's important to special-case this somehow, + * as naively multiplying infinity by zero will produce NaN. + * + * The less-than cases should not happen, but if they do, say "zero". + */ + if (float8_le(box->high.x, box->low.x) || + float8_le(box->high.y, box->low.y)) + return 0.0; + + /* + * We treat NaN as larger than +Infinity, so any distance involving a NaN + * and a non-NaN is infinite. Note the previous check eliminated the + * possibility that the low fields are NaNs. + */ + if (isnan(box->high.x) || isnan(box->high.y)) + return get_float8_infinity(); + return float8_mul(float8_mi(box->high.x, box->low.x), + float8_mi(box->high.y, box->low.y)); +} + +/* + * Return amount by which the union of the two boxes is larger than + * the original BOX's area. The result can be +Infinity, but not NaN. + */ +static float8 +box_penalty(const BOX *original, const BOX *new) +{ + BOX unionbox; + + rt_box_union(&unionbox, original, new); + return float8_mi(size_box(&unionbox), size_box(original)); +} + +/* + * The GiST Consistent method for boxes + * + * Should return false if for all data items x below entry, + * the predicate x op query must be false, where op is the oper + * corresponding to strategy in the pg_amop table. 
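+ * For example, with the overlap strategy (&&): if the query box does not
+ * overlap the union box stored in an internal entry, then no box in that
+ * subtree can overlap the query either, so returning false lets the scan
+ * skip the whole subtree.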
+ */ +Datum +gist_box_consistent(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + BOX *query = PG_GETARG_BOX_P(1); + StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); + + /* Oid subtype = PG_GETARG_OID(3); */ + bool *recheck = (bool *) PG_GETARG_POINTER(4); + + /* All cases served by this function are exact */ + *recheck = false; + + if (DatumGetBoxP(entry->key) == NULL || query == NULL) + PG_RETURN_BOOL(false); + + /* + * if entry is not leaf, use rtree_internal_consistent, else use + * gist_box_leaf_consistent + */ + if (GIST_LEAF(entry)) + PG_RETURN_BOOL(gist_box_leaf_consistent(DatumGetBoxP(entry->key), + query, + strategy)); + else + PG_RETURN_BOOL(rtree_internal_consistent(DatumGetBoxP(entry->key), + query, + strategy)); +} + +/* + * Increase BOX b to include addon. + */ +static void +adjustBox(BOX *b, const BOX *addon) +{ + if (float8_lt(b->high.x, addon->high.x)) + b->high.x = addon->high.x; + if (float8_gt(b->low.x, addon->low.x)) + b->low.x = addon->low.x; + if (float8_lt(b->high.y, addon->high.y)) + b->high.y = addon->high.y; + if (float8_gt(b->low.y, addon->low.y)) + b->low.y = addon->low.y; +} + +/* + * The GiST Union method for boxes + * + * returns the minimal bounding box that encloses all the entries in entryvec + */ +Datum +gist_box_union(PG_FUNCTION_ARGS) +{ + GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0); + int *sizep = (int *) PG_GETARG_POINTER(1); + int numranges, + i; + BOX *cur, + *pageunion; + + numranges = entryvec->n; + pageunion = (BOX *) palloc(sizeof(BOX)); + cur = DatumGetBoxP(entryvec->vector[0].key); + memcpy((void *) pageunion, (void *) cur, sizeof(BOX)); + + for (i = 1; i < numranges; i++) + { + cur = DatumGetBoxP(entryvec->vector[i].key); + adjustBox(pageunion, cur); + } + *sizep = sizeof(BOX); + + PG_RETURN_POINTER(pageunion); +} + +/* + * We store boxes as boxes in GiST indexes, so we do not need + * compress, decompress, or fetch functions. 
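+ * (The leaf datum is the indexed BOX value itself, which also means it can
+ * be returned directly in an index-only scan; gistcanreturn treats the
+ * absence of a compress function as sufficient for that.)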
+ */ + +/* + * The GiST Penalty method for boxes (also used for points) + * + * As in the R-tree paper, we use change in area as our penalty metric + */ +Datum +gist_box_penalty(PG_FUNCTION_ARGS) +{ + GISTENTRY *origentry = (GISTENTRY *) PG_GETARG_POINTER(0); + GISTENTRY *newentry = (GISTENTRY *) PG_GETARG_POINTER(1); + float *result = (float *) PG_GETARG_POINTER(2); + BOX *origbox = DatumGetBoxP(origentry->key); + BOX *newbox = DatumGetBoxP(newentry->key); + + *result = (float) box_penalty(origbox, newbox); + PG_RETURN_POINTER(result); +} + +/* + * Trivial split: half of entries will be placed on one page + * and another half - to another + */ +static void +fallbackSplit(GistEntryVector *entryvec, GIST_SPLITVEC *v) +{ + OffsetNumber i, + maxoff; + BOX *unionL = NULL, + *unionR = NULL; + int nbytes; + + maxoff = entryvec->n - 1; + + nbytes = (maxoff + 2) * sizeof(OffsetNumber); + v->spl_left = (OffsetNumber *) palloc(nbytes); + v->spl_right = (OffsetNumber *) palloc(nbytes); + v->spl_nleft = v->spl_nright = 0; + + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + BOX *cur = DatumGetBoxP(entryvec->vector[i].key); + + if (i <= (maxoff - FirstOffsetNumber + 1) / 2) + { + v->spl_left[v->spl_nleft] = i; + if (unionL == NULL) + { + unionL = (BOX *) palloc(sizeof(BOX)); + *unionL = *cur; + } + else + adjustBox(unionL, cur); + + v->spl_nleft++; + } + else + { + v->spl_right[v->spl_nright] = i; + if (unionR == NULL) + { + unionR = (BOX *) palloc(sizeof(BOX)); + *unionR = *cur; + } + else + adjustBox(unionR, cur); + + v->spl_nright++; + } + } + + v->spl_ldatum = BoxPGetDatum(unionL); + v->spl_rdatum = BoxPGetDatum(unionR); +} + +/* + * Represents information about an entry that can be placed to either group + * without affecting overlap over selected axis ("common entry"). + */ +typedef struct +{ + /* Index of entry in the initial array */ + int index; + /* Delta between penalties of entry insertion into different groups */ + float8 delta; +} CommonEntry; + +/* + * Context for g_box_consider_split. Contains information about currently + * selected split and some general information. + */ +typedef struct +{ + int entriesCount; /* total number of entries being split */ + BOX boundingBox; /* minimum bounding box across all entries */ + + /* Information about currently selected split follows */ + + bool first; /* true if no split was selected yet */ + + float8 leftUpper; /* upper bound of left interval */ + float8 rightLower; /* lower bound of right interval */ + + float4 ratio; + float4 overlap; + int dim; /* axis of this split */ + float8 range; /* width of general MBR projection to the + * selected axis */ +} ConsiderSplitContext; + +/* + * Interval represents projection of box to axis. + */ +typedef struct +{ + float8 lower, + upper; +} SplitInterval; + +/* + * Interval comparison function by lower bound of the interval; + */ +static int +interval_cmp_lower(const void *i1, const void *i2) +{ + float8 lower1 = ((const SplitInterval *) i1)->lower, + lower2 = ((const SplitInterval *) i2)->lower; + + return float8_cmp_internal(lower1, lower2); +} + +/* + * Interval comparison function by upper bound of the interval; + */ +static int +interval_cmp_upper(const void *i1, const void *i2) +{ + float8 upper1 = ((const SplitInterval *) i1)->upper, + upper2 = ((const SplitInterval *) i2)->upper; + + return float8_cmp_internal(upper1, upper2); +} + +/* + * Replace negative (or NaN) value with zero. 
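+ * (The overlap of a candidate split is negative when the two groups do not
+ * overlap at all; clamping it to zero makes all non-overlapping splits rank
+ * equally when splits from different dimensions are compared in
+ * g_box_consider_split.)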
+ */ +static inline float +non_negative(float val) +{ + if (val >= 0.0f) + return val; + else + return 0.0f; +} + +/* + * Consider replacement of currently selected split with the better one. + */ +static inline void +g_box_consider_split(ConsiderSplitContext *context, int dimNum, + float8 rightLower, int minLeftCount, + float8 leftUpper, int maxLeftCount) +{ + int leftCount, + rightCount; + float4 ratio, + overlap; + float8 range; + + /* + * Calculate entries distribution ratio assuming most uniform distribution + * of common entries. + */ + if (minLeftCount >= (context->entriesCount + 1) / 2) + { + leftCount = minLeftCount; + } + else + { + if (maxLeftCount <= context->entriesCount / 2) + leftCount = maxLeftCount; + else + leftCount = context->entriesCount / 2; + } + rightCount = context->entriesCount - leftCount; + + /* + * Ratio of split - quotient between size of lesser group and total + * entries count. + */ + ratio = float4_div(Min(leftCount, rightCount), context->entriesCount); + + if (ratio > LIMIT_RATIO) + { + bool selectthis = false; + + /* + * The ratio is acceptable, so compare current split with previously + * selected one. Between splits of one dimension we search for minimal + * overlap (allowing negative values) and minimal ration (between same + * overlaps. We switch dimension if find less overlap (non-negative) + * or less range with same overlap. + */ + if (dimNum == 0) + range = float8_mi(context->boundingBox.high.x, + context->boundingBox.low.x); + else + range = float8_mi(context->boundingBox.high.y, + context->boundingBox.low.y); + + overlap = float8_div(float8_mi(leftUpper, rightLower), range); + + /* If there is no previous selection, select this */ + if (context->first) + selectthis = true; + else if (context->dim == dimNum) + { + /* + * Within the same dimension, choose the new split if it has a + * smaller overlap, or same overlap but better ratio. + */ + if (overlap < context->overlap || + (overlap == context->overlap && ratio > context->ratio)) + selectthis = true; + } + else + { + /* + * Across dimensions, choose the new split if it has a smaller + * *non-negative* overlap, or same *non-negative* overlap but + * bigger range. This condition differs from the one described in + * the article. On the datasets where leaf MBRs don't overlap + * themselves, non-overlapping splits (i.e. splits which have zero + * *non-negative* overlap) are frequently possible. In this case + * splits tends to be along one dimension, because most distant + * non-overlapping splits (i.e. having lowest negative overlap) + * appears to be in the same dimension as in the previous split. + * Therefore MBRs appear to be very prolonged along another + * dimension, which leads to bad search performance. Using range + * as the second split criteria makes MBRs more quadratic. Using + * *non-negative* overlap instead of overlap as the first split + * criteria gives to range criteria a chance to matter, because + * non-overlapping splits are equivalent in this criteria. + */ + if (non_negative(overlap) < non_negative(context->overlap) || + (range > context->range && + non_negative(overlap) <= non_negative(context->overlap))) + selectthis = true; + } + + if (selectthis) + { + /* save information about selected split */ + context->first = false; + context->ratio = ratio; + context->range = range; + context->overlap = overlap; + context->rightLower = rightLower; + context->leftUpper = leftUpper; + context->dim = dimNum; + } + } +} + +/* + * Compare common entries by their deltas. 
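+ * (Sorting ascending by delta means the entries for which the choice of
+ * group matters least are distributed first.)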
+ */ +static int +common_entry_cmp(const void *i1, const void *i2) +{ + float8 delta1 = ((const CommonEntry *) i1)->delta, + delta2 = ((const CommonEntry *) i2)->delta; + + return float8_cmp_internal(delta1, delta2); +} + +/* + * -------------------------------------------------------------------------- + * Double sorting split algorithm. This is used for both boxes and points. + * + * The algorithm finds split of boxes by considering splits along each axis. + * Each entry is first projected as an interval on the X-axis, and different + * ways to split the intervals into two groups are considered, trying to + * minimize the overlap of the groups. Then the same is repeated for the + * Y-axis, and the overall best split is chosen. The quality of a split is + * determined by overlap along that axis and some other criteria (see + * g_box_consider_split). + * + * After that, all the entries are divided into three groups: + * + * 1) Entries which should be placed to the left group + * 2) Entries which should be placed to the right group + * 3) "Common entries" which can be placed to any of groups without affecting + * of overlap along selected axis. + * + * The common entries are distributed by minimizing penalty. + * + * For details see: + * "A new double sorting-based node splitting algorithm for R-tree", A. Korotkov + * http://syrcose.ispras.ru/2011/files/SYRCoSE2011_Proceedings.pdf#page=36 + * -------------------------------------------------------------------------- + */ +Datum +gist_box_picksplit(PG_FUNCTION_ARGS) +{ + GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0); + GIST_SPLITVEC *v = (GIST_SPLITVEC *) PG_GETARG_POINTER(1); + OffsetNumber i, + maxoff; + ConsiderSplitContext context; + BOX *box, + *leftBox, + *rightBox; + int dim, + commonEntriesCount; + SplitInterval *intervalsLower, + *intervalsUpper; + CommonEntry *commonEntries; + int nentries; + + memset(&context, 0, sizeof(ConsiderSplitContext)); + + maxoff = entryvec->n - 1; + nentries = context.entriesCount = maxoff - FirstOffsetNumber + 1; + + /* Allocate arrays for intervals along axes */ + intervalsLower = (SplitInterval *) palloc(nentries * sizeof(SplitInterval)); + intervalsUpper = (SplitInterval *) palloc(nentries * sizeof(SplitInterval)); + + /* + * Calculate the overall minimum bounding box over all the entries. + */ + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + box = DatumGetBoxP(entryvec->vector[i].key); + if (i == FirstOffsetNumber) + context.boundingBox = *box; + else + adjustBox(&context.boundingBox, box); + } + + /* + * Iterate over axes for optimal split searching. + */ + context.first = true; /* nothing selected yet */ + for (dim = 0; dim < 2; dim++) + { + float8 leftUpper, + rightLower; + int i1, + i2; + + /* Project each entry as an interval on the selected axis. */ + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + box = DatumGetBoxP(entryvec->vector[i].key); + if (dim == 0) + { + intervalsLower[i - FirstOffsetNumber].lower = box->low.x; + intervalsLower[i - FirstOffsetNumber].upper = box->high.x; + } + else + { + intervalsLower[i - FirstOffsetNumber].lower = box->low.y; + intervalsLower[i - FirstOffsetNumber].upper = box->high.y; + } + } + + /* + * Make two arrays of intervals: one sorted by lower bound and another + * sorted by upper bound. 
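+		 * (The lower-bound ordering drives the scan over candidate lower
+		 * bounds of the right group below, and the upper-bound ordering
+		 * drives the scan over candidate upper bounds of the left group.)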
+ */ + memcpy(intervalsUpper, intervalsLower, + sizeof(SplitInterval) * nentries); + qsort(intervalsLower, nentries, sizeof(SplitInterval), + interval_cmp_lower); + qsort(intervalsUpper, nentries, sizeof(SplitInterval), + interval_cmp_upper); + + /*---- + * The goal is to form a left and right interval, so that every entry + * interval is contained by either left or right interval (or both). + * + * For example, with the intervals (0,1), (1,3), (2,3), (2,4): + * + * 0 1 2 3 4 + * +-+ + * +---+ + * +-+ + * +---+ + * + * The left and right intervals are of the form (0,a) and (b,4). + * We first consider splits where b is the lower bound of an entry. + * We iterate through all entries, and for each b, calculate the + * smallest possible a. Then we consider splits where a is the + * upper bound of an entry, and for each a, calculate the greatest + * possible b. + * + * In the above example, the first loop would consider splits: + * b=0: (0,1)-(0,4) + * b=1: (0,1)-(1,4) + * b=2: (0,3)-(2,4) + * + * And the second loop: + * a=1: (0,1)-(1,4) + * a=3: (0,3)-(2,4) + * a=4: (0,4)-(2,4) + */ + + /* + * Iterate over lower bound of right group, finding smallest possible + * upper bound of left group. + */ + i1 = 0; + i2 = 0; + rightLower = intervalsLower[i1].lower; + leftUpper = intervalsUpper[i2].lower; + while (true) + { + /* + * Find next lower bound of right group. + */ + while (i1 < nentries && + float8_eq(rightLower, intervalsLower[i1].lower)) + { + if (float8_lt(leftUpper, intervalsLower[i1].upper)) + leftUpper = intervalsLower[i1].upper; + i1++; + } + if (i1 >= nentries) + break; + rightLower = intervalsLower[i1].lower; + + /* + * Find count of intervals which anyway should be placed to the + * left group. + */ + while (i2 < nentries && + float8_le(intervalsUpper[i2].upper, leftUpper)) + i2++; + + /* + * Consider found split. + */ + g_box_consider_split(&context, dim, rightLower, i1, leftUpper, i2); + } + + /* + * Iterate over upper bound of left group finding greatest possible + * lower bound of right group. + */ + i1 = nentries - 1; + i2 = nentries - 1; + rightLower = intervalsLower[i1].upper; + leftUpper = intervalsUpper[i2].upper; + while (true) + { + /* + * Find next upper bound of left group. + */ + while (i2 >= 0 && float8_eq(leftUpper, intervalsUpper[i2].upper)) + { + if (float8_gt(rightLower, intervalsUpper[i2].lower)) + rightLower = intervalsUpper[i2].lower; + i2--; + } + if (i2 < 0) + break; + leftUpper = intervalsUpper[i2].upper; + + /* + * Find count of intervals which anyway should be placed to the + * right group. + */ + while (i1 >= 0 && float8_ge(intervalsLower[i1].lower, rightLower)) + i1--; + + /* + * Consider found split. + */ + g_box_consider_split(&context, dim, + rightLower, i1 + 1, leftUpper, i2 + 1); + } + } + + /* + * If we failed to find any acceptable splits, use trivial split. + */ + if (context.first) + { + fallbackSplit(entryvec, v); + PG_RETURN_POINTER(v); + } + + /* + * Ok, we have now selected the split across one axis. + * + * While considering the splits, we already determined that there will be + * enough entries in both groups to reach the desired ratio, but we did + * not memorize which entries go to which group. So determine that now. 
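+	 * (Entries whose projection fits only one of the selected intervals are
+	 * placed immediately; the rest become "common entries" and are
+	 * distributed afterwards by penalty.)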
+ */ + + /* Allocate vectors for results */ + v->spl_left = (OffsetNumber *) palloc(nentries * sizeof(OffsetNumber)); + v->spl_right = (OffsetNumber *) palloc(nentries * sizeof(OffsetNumber)); + v->spl_nleft = 0; + v->spl_nright = 0; + + /* Allocate bounding boxes of left and right groups */ + leftBox = palloc0(sizeof(BOX)); + rightBox = palloc0(sizeof(BOX)); + + /* + * Allocate an array for "common entries" - entries which can be placed to + * either group without affecting overlap along selected axis. + */ + commonEntriesCount = 0; + commonEntries = (CommonEntry *) palloc(nentries * sizeof(CommonEntry)); + + /* Helper macros to place an entry in the left or right group */ +#define PLACE_LEFT(box, off) \ + do { \ + if (v->spl_nleft > 0) \ + adjustBox(leftBox, box); \ + else \ + *leftBox = *(box); \ + v->spl_left[v->spl_nleft++] = off; \ + } while(0) + +#define PLACE_RIGHT(box, off) \ + do { \ + if (v->spl_nright > 0) \ + adjustBox(rightBox, box); \ + else \ + *rightBox = *(box); \ + v->spl_right[v->spl_nright++] = off; \ + } while(0) + + /* + * Distribute entries which can be distributed unambiguously, and collect + * common entries. + */ + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + float8 lower, + upper; + + /* + * Get upper and lower bounds along selected axis. + */ + box = DatumGetBoxP(entryvec->vector[i].key); + if (context.dim == 0) + { + lower = box->low.x; + upper = box->high.x; + } + else + { + lower = box->low.y; + upper = box->high.y; + } + + if (float8_le(upper, context.leftUpper)) + { + /* Fits to the left group */ + if (float8_ge(lower, context.rightLower)) + { + /* Fits also to the right group, so "common entry" */ + commonEntries[commonEntriesCount++].index = i; + } + else + { + /* Doesn't fit to the right group, so join to the left group */ + PLACE_LEFT(box, i); + } + } + else + { + /* + * Each entry should fit on either left or right group. Since this + * entry didn't fit on the left group, it better fit in the right + * group. + */ + Assert(float8_ge(lower, context.rightLower)); + + /* Doesn't fit to the left group, so join to the right group */ + PLACE_RIGHT(box, i); + } + } + + /* + * Distribute "common entries", if any. + */ + if (commonEntriesCount > 0) + { + /* + * Calculate minimum number of entries that must be placed in both + * groups, to reach LIMIT_RATIO. + */ + int m = ceil(LIMIT_RATIO * nentries); + + /* + * Calculate delta between penalties of join "common entries" to + * different groups. + */ + for (i = 0; i < commonEntriesCount; i++) + { + box = DatumGetBoxP(entryvec->vector[commonEntries[i].index].key); + commonEntries[i].delta = Abs(float8_mi(box_penalty(leftBox, box), + box_penalty(rightBox, box))); + } + + /* + * Sort "common entries" by calculated deltas in order to distribute + * the most ambiguous entries first. + */ + qsort(commonEntries, commonEntriesCount, sizeof(CommonEntry), common_entry_cmp); + + /* + * Distribute "common entries" between groups. + */ + for (i = 0; i < commonEntriesCount; i++) + { + box = DatumGetBoxP(entryvec->vector[commonEntries[i].index].key); + + /* + * Check if we have to place this entry in either group to achieve + * LIMIT_RATIO. 
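+			 * (If even placing all remaining common entries into one group
+			 * would not push it past the required minimum m, that group
+			 * needs every one of them, so the penalty comparison is
+			 * skipped.)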
+ */ + if (v->spl_nleft + (commonEntriesCount - i) <= m) + PLACE_LEFT(box, commonEntries[i].index); + else if (v->spl_nright + (commonEntriesCount - i) <= m) + PLACE_RIGHT(box, commonEntries[i].index); + else + { + /* Otherwise select the group by minimal penalty */ + if (box_penalty(leftBox, box) < box_penalty(rightBox, box)) + PLACE_LEFT(box, commonEntries[i].index); + else + PLACE_RIGHT(box, commonEntries[i].index); + } + } + } + + v->spl_ldatum = PointerGetDatum(leftBox); + v->spl_rdatum = PointerGetDatum(rightBox); + PG_RETURN_POINTER(v); +} + +/* + * Equality method + * + * This is used for boxes, points, circles, and polygons, all of which store + * boxes as GiST index entries. + * + * Returns true only when boxes are exactly the same. We can't use fuzzy + * comparisons here without breaking index consistency; therefore, this isn't + * equivalent to box_same(). + */ +Datum +gist_box_same(PG_FUNCTION_ARGS) +{ + BOX *b1 = PG_GETARG_BOX_P(0); + BOX *b2 = PG_GETARG_BOX_P(1); + bool *result = (bool *) PG_GETARG_POINTER(2); + + if (b1 && b2) + *result = (float8_eq(b1->low.x, b2->low.x) && + float8_eq(b1->low.y, b2->low.y) && + float8_eq(b1->high.x, b2->high.x) && + float8_eq(b1->high.y, b2->high.y)); + else + *result = (b1 == NULL && b2 == NULL); + PG_RETURN_POINTER(result); +} + +/* + * Leaf-level consistency for boxes: just apply the query operator + */ +static bool +gist_box_leaf_consistent(BOX *key, BOX *query, StrategyNumber strategy) +{ + bool retval; + + switch (strategy) + { + case RTLeftStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_left, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTOverLeftStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_overleft, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTOverlapStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_overlap, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTOverRightStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_overright, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTRightStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_right, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTSameStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_same, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTContainsStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_contain, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTContainedByStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_contained, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTOverBelowStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_overbelow, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTBelowStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_below, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTAboveStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_above, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTOverAboveStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_overabove, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + default: + elog(ERROR, "unrecognized strategy number: %d", strategy); + retval = false; /* keep compiler quiet */ + break; + } + return retval; +} + +/***************************************** + * Common rtree functions (for boxes, polygons, and 
circles) + *****************************************/ + +/* + * Internal-page consistency for all these types + * + * We can use the same function since all types use bounding boxes as the + * internal-page representation. + */ +static bool +rtree_internal_consistent(BOX *key, BOX *query, StrategyNumber strategy) +{ + bool retval; + + switch (strategy) + { + case RTLeftStrategyNumber: + retval = !DatumGetBool(DirectFunctionCall2(box_overright, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTOverLeftStrategyNumber: + retval = !DatumGetBool(DirectFunctionCall2(box_right, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTOverlapStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_overlap, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTOverRightStrategyNumber: + retval = !DatumGetBool(DirectFunctionCall2(box_left, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTRightStrategyNumber: + retval = !DatumGetBool(DirectFunctionCall2(box_overleft, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTSameStrategyNumber: + case RTContainsStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_contain, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTContainedByStrategyNumber: + retval = DatumGetBool(DirectFunctionCall2(box_overlap, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTOverBelowStrategyNumber: + retval = !DatumGetBool(DirectFunctionCall2(box_above, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTBelowStrategyNumber: + retval = !DatumGetBool(DirectFunctionCall2(box_overabove, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTAboveStrategyNumber: + retval = !DatumGetBool(DirectFunctionCall2(box_overbelow, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + case RTOverAboveStrategyNumber: + retval = !DatumGetBool(DirectFunctionCall2(box_below, + PointerGetDatum(key), + PointerGetDatum(query))); + break; + default: + elog(ERROR, "unrecognized strategy number: %d", strategy); + retval = false; /* keep compiler quiet */ + break; + } + return retval; +} + +/************************************************** + * Polygon ops + **************************************************/ + +/* + * GiST compress for polygons: represent a polygon by its bounding box + */ +Datum +gist_poly_compress(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + GISTENTRY *retval; + + if (entry->leafkey) + { + POLYGON *in = DatumGetPolygonP(entry->key); + BOX *r; + + r = (BOX *) palloc(sizeof(BOX)); + memcpy((void *) r, (void *) &(in->boundbox), sizeof(BOX)); + + retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + gistentryinit(*retval, PointerGetDatum(r), + entry->rel, entry->page, + entry->offset, false); + } + else + retval = entry; + PG_RETURN_POINTER(retval); +} + +/* + * The GiST Consistent method for polygons + */ +Datum +gist_poly_consistent(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + POLYGON *query = PG_GETARG_POLYGON_P(1); + StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); + + /* Oid subtype = PG_GETARG_OID(3); */ + bool *recheck = (bool *) PG_GETARG_POINTER(4); + bool result; + + /* All cases served by this function are inexact */ + *recheck = true; + + if (DatumGetBoxP(entry->key) == NULL || query == NULL) + PG_RETURN_BOOL(false); + + /* + * Since the operators require recheck anyway, we can just use + * 
rtree_internal_consistent even at leaf nodes. (This works in part + * because the index entries are bounding boxes not polygons.) + */ + result = rtree_internal_consistent(DatumGetBoxP(entry->key), + &(query->boundbox), strategy); + + /* Avoid memory leak if supplied poly is toasted */ + PG_FREE_IF_COPY(query, 1); + + PG_RETURN_BOOL(result); +} + +/************************************************** + * Circle ops + **************************************************/ + +/* + * GiST compress for circles: represent a circle by its bounding box + */ +Datum +gist_circle_compress(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + GISTENTRY *retval; + + if (entry->leafkey) + { + CIRCLE *in = DatumGetCircleP(entry->key); + BOX *r; + + r = (BOX *) palloc(sizeof(BOX)); + r->high.x = float8_pl(in->center.x, in->radius); + r->low.x = float8_mi(in->center.x, in->radius); + r->high.y = float8_pl(in->center.y, in->radius); + r->low.y = float8_mi(in->center.y, in->radius); + + retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + gistentryinit(*retval, PointerGetDatum(r), + entry->rel, entry->page, + entry->offset, false); + } + else + retval = entry; + PG_RETURN_POINTER(retval); +} + +/* + * The GiST Consistent method for circles + */ +Datum +gist_circle_consistent(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + CIRCLE *query = PG_GETARG_CIRCLE_P(1); + StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); + + /* Oid subtype = PG_GETARG_OID(3); */ + bool *recheck = (bool *) PG_GETARG_POINTER(4); + BOX bbox; + bool result; + + /* All cases served by this function are inexact */ + *recheck = true; + + if (DatumGetBoxP(entry->key) == NULL || query == NULL) + PG_RETURN_BOOL(false); + + /* + * Since the operators require recheck anyway, we can just use + * rtree_internal_consistent even at leaf nodes. (This works in part + * because the index entries are bounding boxes not circles.) + */ + bbox.high.x = float8_pl(query->center.x, query->radius); + bbox.low.x = float8_mi(query->center.x, query->radius); + bbox.high.y = float8_pl(query->center.y, query->radius); + bbox.low.y = float8_mi(query->center.y, query->radius); + + result = rtree_internal_consistent(DatumGetBoxP(entry->key), + &bbox, strategy); + + PG_RETURN_BOOL(result); +} + +/************************************************** + * Point ops + **************************************************/ + +Datum +gist_point_compress(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + + if (entry->leafkey) /* Point, actually */ + { + BOX *box = palloc(sizeof(BOX)); + Point *point = DatumGetPointP(entry->key); + GISTENTRY *retval = palloc(sizeof(GISTENTRY)); + + box->high = box->low = *point; + + gistentryinit(*retval, BoxPGetDatum(box), + entry->rel, entry->page, entry->offset, false); + + PG_RETURN_POINTER(retval); + } + + PG_RETURN_POINTER(entry); +} + +/* + * GiST Fetch method for point + * + * Get point coordinates from its bounding box coordinates and form new + * gistentry. 
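+ * (gist_point_compress stores a point as a zero-area BOX whose high and low
+ * corners coincide, so reading either corner reproduces the original
+ * coordinates.)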
+ */ +Datum +gist_point_fetch(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + BOX *in = DatumGetBoxP(entry->key); + Point *r; + GISTENTRY *retval; + + retval = palloc(sizeof(GISTENTRY)); + + r = (Point *) palloc(sizeof(Point)); + r->x = in->high.x; + r->y = in->high.y; + gistentryinit(*retval, PointerGetDatum(r), + entry->rel, entry->page, + entry->offset, false); + + PG_RETURN_POINTER(retval); +} + + +#define point_point_distance(p1,p2) \ + DatumGetFloat8(DirectFunctionCall2(point_distance, \ + PointPGetDatum(p1), PointPGetDatum(p2))) + +static float8 +computeDistance(bool isLeaf, BOX *box, Point *point) +{ + float8 result = 0.0; + + if (isLeaf) + { + /* simple point to point distance */ + result = point_point_distance(point, &box->low); + } + else if (point->x <= box->high.x && point->x >= box->low.x && + point->y <= box->high.y && point->y >= box->low.y) + { + /* point inside the box */ + result = 0.0; + } + else if (point->x <= box->high.x && point->x >= box->low.x) + { + /* point is over or below box */ + Assert(box->low.y <= box->high.y); + if (point->y > box->high.y) + result = float8_mi(point->y, box->high.y); + else if (point->y < box->low.y) + result = float8_mi(box->low.y, point->y); + else + elog(ERROR, "inconsistent point values"); + } + else if (point->y <= box->high.y && point->y >= box->low.y) + { + /* point is to left or right of box */ + Assert(box->low.x <= box->high.x); + if (point->x > box->high.x) + result = float8_mi(point->x, box->high.x); + else if (point->x < box->low.x) + result = float8_mi(box->low.x, point->x); + else + elog(ERROR, "inconsistent point values"); + } + else + { + /* closest point will be a vertex */ + Point p; + float8 subresult; + + result = point_point_distance(point, &box->low); + + subresult = point_point_distance(point, &box->high); + if (result > subresult) + result = subresult; + + p.x = box->low.x; + p.y = box->high.y; + subresult = point_point_distance(point, &p); + if (result > subresult) + result = subresult; + + p.x = box->high.x; + p.y = box->low.y; + subresult = point_point_distance(point, &p); + if (result > subresult) + result = subresult; + } + + return result; +} + +static bool +gist_point_consistent_internal(StrategyNumber strategy, + bool isLeaf, BOX *key, Point *query) +{ + bool result = false; + + switch (strategy) + { + case RTLeftStrategyNumber: + result = FPlt(key->low.x, query->x); + break; + case RTRightStrategyNumber: + result = FPgt(key->high.x, query->x); + break; + case RTAboveStrategyNumber: + result = FPgt(key->high.y, query->y); + break; + case RTBelowStrategyNumber: + result = FPlt(key->low.y, query->y); + break; + case RTSameStrategyNumber: + if (isLeaf) + { + /* key.high must equal key.low, so we can disregard it */ + result = (FPeq(key->low.x, query->x) && + FPeq(key->low.y, query->y)); + } + else + { + result = (FPle(query->x, key->high.x) && + FPge(query->x, key->low.x) && + FPle(query->y, key->high.y) && + FPge(query->y, key->low.y)); + } + break; + default: + elog(ERROR, "unrecognized strategy number: %d", strategy); + result = false; /* keep compiler quiet */ + break; + } + + return result; +} + +#define GeoStrategyNumberOffset 20 +#define PointStrategyNumberGroup 0 +#define BoxStrategyNumberGroup 1 +#define PolygonStrategyNumberGroup 2 +#define CircleStrategyNumberGroup 3 + +Datum +gist_point_consistent(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); + bool *recheck = (bool *) 
PG_GETARG_POINTER(4); + bool result; + StrategyNumber strategyGroup; + + /* + * We have to remap these strategy numbers to get this klugy + * classification logic to work. + */ + if (strategy == RTOldBelowStrategyNumber) + strategy = RTBelowStrategyNumber; + else if (strategy == RTOldAboveStrategyNumber) + strategy = RTAboveStrategyNumber; + + strategyGroup = strategy / GeoStrategyNumberOffset; + switch (strategyGroup) + { + case PointStrategyNumberGroup: + result = gist_point_consistent_internal(strategy % GeoStrategyNumberOffset, + GIST_LEAF(entry), + DatumGetBoxP(entry->key), + PG_GETARG_POINT_P(1)); + *recheck = false; + break; + case BoxStrategyNumberGroup: + { + /* + * The only operator in this group is point <@ box (on_pb), so + * we needn't examine strategy again. + * + * For historical reasons, on_pb uses exact rather than fuzzy + * comparisons. We could use box_overlap when at an internal + * page, but that would lead to possibly visiting child pages + * uselessly, because box_overlap uses fuzzy comparisons. + * Instead we write a non-fuzzy overlap test. The same code + * will also serve for leaf-page tests, since leaf keys have + * high == low. + */ + BOX *query, + *key; + + query = PG_GETARG_BOX_P(1); + key = DatumGetBoxP(entry->key); + + result = (key->high.x >= query->low.x && + key->low.x <= query->high.x && + key->high.y >= query->low.y && + key->low.y <= query->high.y); + *recheck = false; + } + break; + case PolygonStrategyNumberGroup: + { + POLYGON *query = PG_GETARG_POLYGON_P(1); + + result = DatumGetBool(DirectFunctionCall5(gist_poly_consistent, + PointerGetDatum(entry), + PolygonPGetDatum(query), + Int16GetDatum(RTOverlapStrategyNumber), + 0, PointerGetDatum(recheck))); + + if (GIST_LEAF(entry) && result) + { + /* + * We are on leaf page and quick check shows overlapping + * of polygon's bounding box and point + */ + BOX *box = DatumGetBoxP(entry->key); + + Assert(box->high.x == box->low.x + && box->high.y == box->low.y); + result = DatumGetBool(DirectFunctionCall2(poly_contain_pt, + PolygonPGetDatum(query), + PointPGetDatum(&box->high))); + *recheck = false; + } + } + break; + case CircleStrategyNumberGroup: + { + CIRCLE *query = PG_GETARG_CIRCLE_P(1); + + result = DatumGetBool(DirectFunctionCall5(gist_circle_consistent, + PointerGetDatum(entry), + CirclePGetDatum(query), + Int16GetDatum(RTOverlapStrategyNumber), + 0, PointerGetDatum(recheck))); + + if (GIST_LEAF(entry) && result) + { + /* + * We are on leaf page and quick check shows overlapping + * of polygon's bounding box and point + */ + BOX *box = DatumGetBoxP(entry->key); + + Assert(box->high.x == box->low.x + && box->high.y == box->low.y); + result = DatumGetBool(DirectFunctionCall2(circle_contain_pt, + CirclePGetDatum(query), + PointPGetDatum(&box->high))); + *recheck = false; + } + } + break; + default: + elog(ERROR, "unrecognized strategy number: %d", strategy); + result = false; /* keep compiler quiet */ + break; + } + + PG_RETURN_BOOL(result); +} + +Datum +gist_point_distance(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); + float8 distance; + StrategyNumber strategyGroup = strategy / GeoStrategyNumberOffset; + + switch (strategyGroup) + { + case PointStrategyNumberGroup: + distance = computeDistance(GIST_LEAF(entry), + DatumGetBoxP(entry->key), + PG_GETARG_POINT_P(1)); + break; + default: + elog(ERROR, "unrecognized strategy number: %d", strategy); + distance = 0.0; /* keep compiler quiet */ + break; + } + + 
PG_RETURN_FLOAT8(distance); +} + +static float8 +gist_bbox_distance(GISTENTRY *entry, Datum query, StrategyNumber strategy) +{ + float8 distance; + StrategyNumber strategyGroup = strategy / GeoStrategyNumberOffset; + + switch (strategyGroup) + { + case PointStrategyNumberGroup: + distance = computeDistance(false, + DatumGetBoxP(entry->key), + DatumGetPointP(query)); + break; + default: + elog(ERROR, "unrecognized strategy number: %d", strategy); + distance = 0.0; /* keep compiler quiet */ + } + + return distance; +} + +Datum +gist_box_distance(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + Datum query = PG_GETARG_DATUM(1); + StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); + + /* Oid subtype = PG_GETARG_OID(3); */ + /* bool *recheck = (bool *) PG_GETARG_POINTER(4); */ + float8 distance; + + distance = gist_bbox_distance(entry, query, strategy); + + PG_RETURN_FLOAT8(distance); +} + +/* + * The inexact GiST distance methods for geometric types that store bounding + * boxes. + * + * Compute lossy distance from point to index entries. The result is inexact + * because index entries are bounding boxes, not the exact shapes of the + * indexed geometric types. We use distance from point to MBR of index entry. + * This is a lower bound estimate of distance from point to indexed geometric + * type. + */ +Datum +gist_circle_distance(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + Datum query = PG_GETARG_DATUM(1); + StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); + + /* Oid subtype = PG_GETARG_OID(3); */ + bool *recheck = (bool *) PG_GETARG_POINTER(4); + float8 distance; + + distance = gist_bbox_distance(entry, query, strategy); + *recheck = true; + + PG_RETURN_FLOAT8(distance); +} + +Datum +gist_poly_distance(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + Datum query = PG_GETARG_DATUM(1); + StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); + + /* Oid subtype = PG_GETARG_OID(3); */ + bool *recheck = (bool *) PG_GETARG_POINTER(4); + float8 distance; + + distance = gist_bbox_distance(entry, query, strategy); + *recheck = true; + + PG_RETURN_FLOAT8(distance); +} + +/* + * Z-order routines for fast index build + */ + +/* + * Compute Z-value of a point + * + * Z-order (also known as Morton Code) maps a two-dimensional point to a + * single integer, in a way that preserves locality. Points that are close in + * the two-dimensional space are mapped to integer that are not far from each + * other. We do that by interleaving the bits in the X and Y components. + * + * Morton Code is normally defined only for integers, but the X and Y values + * of a point are floating point. We expect floats to be in IEEE format. 
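+ * For example, interleaving X bits x3 x2 x1 x0 with Y bits y3 y2 y1 y0
+ * yields y3 x3 y2 x2 y1 x1 y0 x0: X ends up in the even bit positions and
+ * Y in the odd ones.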
+ */ +static uint64 +point_zorder_internal(float4 x, float4 y) +{ + uint32 ix = ieee_float32_to_uint32(x); + uint32 iy = ieee_float32_to_uint32(y); + + /* Interleave the bits */ + return part_bits32_by2(ix) | (part_bits32_by2(iy) << 1); +} + +/* Interleave 32 bits with zeroes */ +static uint64 +part_bits32_by2(uint32 x) +{ + uint64 n = x; + + n = (n | (n << 16)) & UINT64CONST(0x0000FFFF0000FFFF); + n = (n | (n << 8)) & UINT64CONST(0x00FF00FF00FF00FF); + n = (n | (n << 4)) & UINT64CONST(0x0F0F0F0F0F0F0F0F); + n = (n | (n << 2)) & UINT64CONST(0x3333333333333333); + n = (n | (n << 1)) & UINT64CONST(0x5555555555555555); + + return n; +} + +/* + * Convert a 32-bit IEEE float to uint32 in a way that preserves the ordering + */ +static uint32 +ieee_float32_to_uint32(float f) +{ + /*---- + * + * IEEE 754 floating point format + * ------------------------------ + * + * IEEE 754 floating point numbers have this format: + * + * exponent (8 bits) + * | + * s eeeeeeee mmmmmmmmmmmmmmmmmmmmmmm + * | | + * sign mantissa (23 bits) + * + * Infinity has all bits in the exponent set and the mantissa is all + * zeros. Negative infinity is the same but with the sign bit set. + * + * NaNs are represented with all bits in the exponent set, and the least + * significant bit in the mantissa also set. The rest of the mantissa bits + * can be used to distinguish different kinds of NaNs. + * + * The IEEE format has the nice property that when you take the bit + * representation and interpret it as an integer, the order is preserved, + * except for the sign. That holds for the +-Infinity values too. + * + * Mapping to uint32 + * ----------------- + * + * In order to have a smooth transition from negative to positive numbers, + * we map floats to unsigned integers like this: + * + * x < 0 to range 0-7FFFFFFF + * x = 0 to value 8000000 (both positive and negative zero) + * x > 0 to range 8000001-FFFFFFFF + * + * We don't care to distinguish different kind of NaNs, so they are all + * mapped to the same arbitrary value, FFFFFFFF. Because of the IEEE bit + * representation of NaNs, there aren't any non-NaN values that would be + * mapped to FFFFFFFF. In fact, there is a range of unused values on both + * ends of the uint32 space. + */ + if (isnan(f)) + return 0xFFFFFFFF; + else + { + union + { + float f; + uint32 i; + } u; + + u.f = f; + + /* Check the sign bit */ + if ((u.i & 0x80000000) != 0) + { + /* + * Map the negative value to range 0-7FFFFFFF. This flips the sign + * bit to 0 in the same instruction. + */ + Assert(f <= 0); /* can be -0 */ + u.i ^= 0xFFFFFFFF; + } + else + { + /* Map the positive value (or 0) to range 80000000-FFFFFFFF */ + u.i |= 0x80000000; + } + + return u.i; + } +} + +/* + * Compare the Z-order of points + */ +static int +gist_bbox_zorder_cmp(Datum a, Datum b, SortSupport ssup) +{ + Point *p1 = &(DatumGetBoxP(a)->low); + Point *p2 = &(DatumGetBoxP(b)->low); + uint64 z1; + uint64 z2; + + /* + * Do a quick check for equality first. It's not clear if this is worth it + * in general, but certainly is when used as tie-breaker with abbreviated + * keys, + */ + if (p1->x == p2->x && p1->y == p2->y) + return 0; + + z1 = point_zorder_internal(p1->x, p1->y); + z2 = point_zorder_internal(p2->x, p2->y); + if (z1 > z2) + return 1; + else if (z1 < z2) + return -1; + else + return 0; +} + +/* + * Abbreviated version of Z-order comparison + * + * The abbreviated format is a Z-order value computed from the two 32-bit + * floats. 
If SIZEOF_DATUM == 8, the 64-bit Z-order value fits fully in the + * abbreviated Datum, otherwise use its most significant bits. + */ +static Datum +gist_bbox_zorder_abbrev_convert(Datum original, SortSupport ssup) +{ + Point *p = &(DatumGetBoxP(original)->low); + uint64 z; + + z = point_zorder_internal(p->x, p->y); + +#if SIZEOF_DATUM == 8 + return (Datum) z; +#else + return (Datum) (z >> 32); +#endif +} + +static int +gist_bbox_zorder_cmp_abbrev(Datum z1, Datum z2, SortSupport ssup) +{ + /* + * Compare the pre-computed Z-orders as unsigned integers. Datum is a + * typedef for 'uintptr_t', so no casting is required. + */ + if (z1 > z2) + return 1; + else if (z1 < z2) + return -1; + else + return 0; +} + +/* + * We never consider aborting the abbreviation. + * + * On 64-bit systems, the abbreviation is not lossy so it is always + * worthwhile. (Perhaps it's not on 32-bit systems, but we don't bother + * with logic to decide.) + */ +static bool +gist_bbox_zorder_abbrev_abort(int memtupcount, SortSupport ssup) +{ + return false; +} + +/* + * Sort support routine for fast GiST index build by sorting. + */ +Datum +gist_point_sortsupport(PG_FUNCTION_ARGS) +{ + SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); + + if (ssup->abbreviate) + { + ssup->comparator = gist_bbox_zorder_cmp_abbrev; + ssup->abbrev_converter = gist_bbox_zorder_abbrev_convert; + ssup->abbrev_abort = gist_bbox_zorder_abbrev_abort; + ssup->abbrev_full_comparator = gist_bbox_zorder_cmp; + } + else + { + ssup->comparator = gist_bbox_zorder_cmp; + } + PG_RETURN_VOID(); +} diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c new file mode 100644 index 0000000..61e92cf --- /dev/null +++ b/src/backend/access/gist/gistscan.c @@ -0,0 +1,358 @@ +/*------------------------------------------------------------------------- + * + * gistscan.c + * routines to manage scans on GiST index relations + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gist/gistscan.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/gist_private.h" +#include "access/gistscan.h" +#include "access/relscan.h" +#include "utils/float.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/rel.h" + + +/* + * Pairing heap comparison function for the GISTSearchItem queue + */ +static int +pairingheap_GISTSearchItem_cmp(const pairingheap_node *a, const pairingheap_node *b, void *arg) +{ + const GISTSearchItem *sa = (const GISTSearchItem *) a; + const GISTSearchItem *sb = (const GISTSearchItem *) b; + IndexScanDesc scan = (IndexScanDesc) arg; + int i; + + /* Order according to distance comparison */ + for (i = 0; i < scan->numberOfOrderBys; i++) + { + if (sa->distances[i].isnull) + { + if (!sb->distances[i].isnull) + return -1; + } + else if (sb->distances[i].isnull) + { + return 1; + } + else + { + int cmp = -float8_cmp_internal(sa->distances[i].value, + sb->distances[i].value); + + if (cmp != 0) + return cmp; + } + } + + /* Heap items go before inner pages, to ensure a depth-first search */ + if (GISTSearchItemIsHeap(*sa) && !GISTSearchItemIsHeap(*sb)) + return 1; + if (!GISTSearchItemIsHeap(*sa) && GISTSearchItemIsHeap(*sb)) + return -1; + + return 0; +} + + +/* + * Index AM API functions for scanning GiST indexes + */ + +IndexScanDesc +gistbeginscan(Relation r, int nkeys, int 
norderbys) +{ + IndexScanDesc scan; + GISTSTATE *giststate; + GISTScanOpaque so; + MemoryContext oldCxt; + + scan = RelationGetIndexScan(r, nkeys, norderbys); + + /* First, set up a GISTSTATE with a scan-lifespan memory context */ + giststate = initGISTstate(scan->indexRelation); + + /* + * Everything made below is in the scanCxt, or is a child of the scanCxt, + * so it'll all go away automatically in gistendscan. + */ + oldCxt = MemoryContextSwitchTo(giststate->scanCxt); + + /* initialize opaque data */ + so = (GISTScanOpaque) palloc0(sizeof(GISTScanOpaqueData)); + so->giststate = giststate; + giststate->tempCxt = createTempGistContext(); + so->queue = NULL; + so->queueCxt = giststate->scanCxt; /* see gistrescan */ + + /* workspaces with size dependent on numberOfOrderBys: */ + so->distances = palloc(sizeof(so->distances[0]) * scan->numberOfOrderBys); + so->qual_ok = true; /* in case there are zero keys */ + if (scan->numberOfOrderBys > 0) + { + scan->xs_orderbyvals = palloc0(sizeof(Datum) * scan->numberOfOrderBys); + scan->xs_orderbynulls = palloc(sizeof(bool) * scan->numberOfOrderBys); + memset(scan->xs_orderbynulls, true, sizeof(bool) * scan->numberOfOrderBys); + } + + so->killedItems = NULL; /* until needed */ + so->numKilled = 0; + so->curBlkno = InvalidBlockNumber; + so->curPageLSN = InvalidXLogRecPtr; + + scan->opaque = so; + + /* + * All fields required for index-only scans are initialized in gistrescan, + * as we don't know yet if we're doing an index-only scan or not. + */ + + MemoryContextSwitchTo(oldCxt); + + return scan; +} + +void +gistrescan(IndexScanDesc scan, ScanKey key, int nkeys, + ScanKey orderbys, int norderbys) +{ + /* nkeys and norderbys arguments are ignored */ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + bool first_time; + int i; + MemoryContext oldCxt; + + /* rescan an existing indexscan --- reset state */ + + /* + * The first time through, we create the search queue in the scanCxt. + * Subsequent times through, we create the queue in a separate queueCxt, + * which is created on the second call and reset on later calls. Thus, in + * the common case where a scan is only rescan'd once, we just put the + * queue in scanCxt and don't pay the overhead of making a second memory + * context. If we do rescan more than once, the first queue is just left + * for dead until end of scan; this small wastage seems worth the savings + * in the common case. + */ + if (so->queue == NULL) + { + /* first time through */ + Assert(so->queueCxt == so->giststate->scanCxt); + first_time = true; + } + else if (so->queueCxt == so->giststate->scanCxt) + { + /* second time through */ + so->queueCxt = AllocSetContextCreate(so->giststate->scanCxt, + "GiST queue context", + ALLOCSET_DEFAULT_SIZES); + first_time = false; + } + else + { + /* third or later time through */ + MemoryContextReset(so->queueCxt); + first_time = false; + } + + /* + * If we're doing an index-only scan, on the first call, also initialize a + * tuple descriptor to represent the returned index tuples and create a + * memory context to hold them during the scan. + */ + if (scan->xs_want_itup && !scan->xs_hitupdesc) + { + int natts; + int nkeyatts; + int attno; + + /* + * The storage type of the index can be different from the original + * datatype being indexed, so we cannot just grab the index's tuple + * descriptor. Instead, construct a descriptor with the original data + * types. 
+ */ + natts = RelationGetNumberOfAttributes(scan->indexRelation); + nkeyatts = IndexRelationGetNumberOfKeyAttributes(scan->indexRelation); + so->giststate->fetchTupdesc = CreateTemplateTupleDesc(natts); + for (attno = 1; attno <= nkeyatts; attno++) + { + TupleDescInitEntry(so->giststate->fetchTupdesc, attno, NULL, + scan->indexRelation->rd_opcintype[attno - 1], + -1, 0); + } + + for (; attno <= natts; attno++) + { + /* taking opcintype from giststate->tupdesc */ + TupleDescInitEntry(so->giststate->fetchTupdesc, attno, NULL, + TupleDescAttr(so->giststate->leafTupdesc, + attno - 1)->atttypid, + -1, 0); + } + scan->xs_hitupdesc = so->giststate->fetchTupdesc; + + /* Also create a memory context that will hold the returned tuples */ + so->pageDataCxt = AllocSetContextCreate(so->giststate->scanCxt, + "GiST page data context", + ALLOCSET_DEFAULT_SIZES); + } + + /* create new, empty pairing heap for search queue */ + oldCxt = MemoryContextSwitchTo(so->queueCxt); + so->queue = pairingheap_allocate(pairingheap_GISTSearchItem_cmp, scan); + MemoryContextSwitchTo(oldCxt); + + so->firstCall = true; + + /* Update scan key, if a new one is given */ + if (key && scan->numberOfKeys > 0) + { + void **fn_extras = NULL; + + /* + * If this isn't the first time through, preserve the fn_extra + * pointers, so that if the consistentFns are using them to cache + * data, that data is not leaked across a rescan. + */ + if (!first_time) + { + fn_extras = (void **) palloc(scan->numberOfKeys * sizeof(void *)); + for (i = 0; i < scan->numberOfKeys; i++) + fn_extras[i] = scan->keyData[i].sk_func.fn_extra; + } + + memmove(scan->keyData, key, + scan->numberOfKeys * sizeof(ScanKeyData)); + + /* + * Modify the scan key so that the Consistent method is called for all + * comparisons. The original operator is passed to the Consistent + * function in the form of its strategy number, which is available + * from the sk_strategy field, and its subtype from the sk_subtype + * field. + * + * Next, if any of keys is a NULL and that key is not marked with + * SK_SEARCHNULL/SK_SEARCHNOTNULL then nothing can be found (ie, we + * assume all indexable operators are strict). + */ + so->qual_ok = true; + + for (i = 0; i < scan->numberOfKeys; i++) + { + ScanKey skey = scan->keyData + i; + + /* + * Copy consistent support function to ScanKey structure instead + * of function implementing filtering operator. + */ + fmgr_info_copy(&(skey->sk_func), + &(so->giststate->consistentFn[skey->sk_attno - 1]), + so->giststate->scanCxt); + + /* Restore prior fn_extra pointers, if not first time */ + if (!first_time) + skey->sk_func.fn_extra = fn_extras[i]; + + if (skey->sk_flags & SK_ISNULL) + { + if (!(skey->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL))) + so->qual_ok = false; + } + } + + if (!first_time) + pfree(fn_extras); + } + + /* Update order-by key, if a new one is given */ + if (orderbys && scan->numberOfOrderBys > 0) + { + void **fn_extras = NULL; + + /* As above, preserve fn_extra if not first time through */ + if (!first_time) + { + fn_extras = (void **) palloc(scan->numberOfOrderBys * sizeof(void *)); + for (i = 0; i < scan->numberOfOrderBys; i++) + fn_extras[i] = scan->orderByData[i].sk_func.fn_extra; + } + + memmove(scan->orderByData, orderbys, + scan->numberOfOrderBys * sizeof(ScanKeyData)); + + so->orderByTypes = (Oid *) palloc(scan->numberOfOrderBys * sizeof(Oid)); + + /* + * Modify the order-by key so that the Distance method is called for + * all comparisons. 
The original operator is passed to the Distance + * function in the form of its strategy number, which is available + * from the sk_strategy field, and its subtype from the sk_subtype + * field. + */ + for (i = 0; i < scan->numberOfOrderBys; i++) + { + ScanKey skey = scan->orderByData + i; + FmgrInfo *finfo = &(so->giststate->distanceFn[skey->sk_attno - 1]); + + /* Check we actually have a distance function ... */ + if (!OidIsValid(finfo->fn_oid)) + elog(ERROR, "missing support function %d for attribute %d of index \"%s\"", + GIST_DISTANCE_PROC, skey->sk_attno, + RelationGetRelationName(scan->indexRelation)); + + /* + * Look up the datatype returned by the original ordering + * operator. GiST always uses a float8 for the distance function, + * but the ordering operator could be anything else. + * + * XXX: The distance function is only allowed to be lossy if the + * ordering operator's result type is float4 or float8. Otherwise + * we don't know how to return the distance to the executor. But + * we cannot check that here, as we won't know if the distance + * function is lossy until it returns *recheck = true for the + * first time. + */ + so->orderByTypes[i] = get_func_rettype(skey->sk_func.fn_oid); + + /* + * Copy distance support function to ScanKey structure instead of + * function implementing ordering operator. + */ + fmgr_info_copy(&(skey->sk_func), finfo, so->giststate->scanCxt); + + /* Restore prior fn_extra pointers, if not first time */ + if (!first_time) + skey->sk_func.fn_extra = fn_extras[i]; + } + + if (!first_time) + pfree(fn_extras); + } + + /* any previous xs_hitup will have been pfree'd in context resets above */ + scan->xs_hitup = NULL; +} + +void +gistendscan(IndexScanDesc scan) +{ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + + /* + * freeGISTstate is enough to clean up everything made by gistbeginscan, + * as well as the queueCxt if there is a separate context for it. + */ + freeGISTstate(so->giststate); +} diff --git a/src/backend/access/gist/gistsplit.c b/src/backend/access/gist/gistsplit.c new file mode 100644 index 0000000..526ed12 --- /dev/null +++ b/src/backend/access/gist/gistsplit.c @@ -0,0 +1,779 @@ +/*------------------------------------------------------------------------- + * + * gistsplit.c + * Multi-column page splitting algorithm + * + * This file is concerned with making good page-split decisions in multi-column + * GiST indexes. The opclass-specific picksplit functions can only be expected + * to produce answers based on a single column. We first run the picksplit + * function for column 1; then, if there are more columns, we check if any of + * the tuples are "don't cares" so far as the column 1 split is concerned + * (that is, they could go to either side for no additional penalty). If so, + * we try to redistribute those tuples on the basis of the next column. + * Repeat till we're out of columns. + * + * gistSplitByKey() is the entry point to this file. 
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gist/gistsplit.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/gist_private.h" +#include "utils/rel.h" + +typedef struct +{ + OffsetNumber *entries; + int len; + Datum *attr; + bool *isnull; + bool *dontcare; +} GistSplitUnion; + + +/* + * Form unions of subkeys in itvec[] entries listed in gsvp->entries[], + * ignoring any tuples that are marked in gsvp->dontcare[]. Subroutine for + * gistunionsubkey. + */ +static void +gistunionsubkeyvec(GISTSTATE *giststate, IndexTuple *itvec, + GistSplitUnion *gsvp) +{ + IndexTuple *cleanedItVec; + int i, + cleanedLen = 0; + + cleanedItVec = (IndexTuple *) palloc(sizeof(IndexTuple) * gsvp->len); + + for (i = 0; i < gsvp->len; i++) + { + if (gsvp->dontcare && gsvp->dontcare[gsvp->entries[i]]) + continue; + + cleanedItVec[cleanedLen++] = itvec[gsvp->entries[i] - 1]; + } + + gistMakeUnionItVec(giststate, cleanedItVec, cleanedLen, + gsvp->attr, gsvp->isnull); + + pfree(cleanedItVec); +} + +/* + * Recompute unions of left- and right-side subkeys after a page split, + * ignoring any tuples that are marked in spl->spl_dontcare[]. + * + * Note: we always recompute union keys for all index columns. In some cases + * this might represent duplicate work for the leftmost column(s), but it's + * not safe to assume that "zero penalty to move a tuple" means "the union + * key doesn't change at all". Penalty functions aren't 100% accurate. + */ +static void +gistunionsubkey(GISTSTATE *giststate, IndexTuple *itvec, GistSplitVector *spl) +{ + GistSplitUnion gsvp; + + gsvp.dontcare = spl->spl_dontcare; + + gsvp.entries = spl->splitVector.spl_left; + gsvp.len = spl->splitVector.spl_nleft; + gsvp.attr = spl->spl_lattr; + gsvp.isnull = spl->spl_lisnull; + + gistunionsubkeyvec(giststate, itvec, &gsvp); + + gsvp.entries = spl->splitVector.spl_right; + gsvp.len = spl->splitVector.spl_nright; + gsvp.attr = spl->spl_rattr; + gsvp.isnull = spl->spl_risnull; + + gistunionsubkeyvec(giststate, itvec, &gsvp); +} + +/* + * Find tuples that are "don't cares", that is could be moved to the other + * side of the split with zero penalty, so far as the attno column is + * concerned. + * + * Don't-care tuples are marked by setting the corresponding entry in + * spl->spl_dontcare[] to "true". Caller must have initialized that array + * to zeroes. + * + * Returns number of don't-cares found. + */ +static int +findDontCares(Relation r, GISTSTATE *giststate, GISTENTRY *valvec, + GistSplitVector *spl, int attno) +{ + int i; + GISTENTRY entry; + int NumDontCare = 0; + + /* + * First, search the left-side tuples to see if any have zero penalty to + * be added to the right-side union key. 
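To make the zero-penalty test concrete, here is a hedged standalone sketch using one-dimensional intervals, with interval enlargement playing the role of the opclass penalty function; the Interval type and penalty() helper are invented for the example and are not server APIs.

#include <stdbool.h>
#include <stdio.h>

typedef struct { double lo, hi; } Interval;

/* Penalty = how much union "u" must grow to absorb "item" */
static double penalty(Interval u, Interval item)
{
    double lo = item.lo < u.lo ? item.lo : u.lo;
    double hi = item.hi > u.hi ? item.hi : u.hi;

    return (hi - lo) - (u.hi - u.lo);
}

int main(void)
{
    Interval right_union = {5, 20};
    Interval left_items[] = {{0, 3}, {6, 9}, {1, 8}};
    bool dontcare[3] = {false, false, false};
    int ndontcare = 0;

    /*
     * A left-side item is a "don't care" if the right-side union would not
     * have to grow at all to take it, i.e. it incurs zero penalty.
     */
    for (int i = 0; i < 3; i++)
    {
        if (penalty(right_union, left_items[i]) == 0.0)
        {
            dontcare[i] = true;
            ndontcare++;
        }
    }

    printf("%d of 3 left-side items could move right for free\n", ndontcare);
    for (int i = 0; i < 3; i++)
        printf("item [%g,%g]: %s\n", left_items[i].lo, left_items[i].hi,
               dontcare[i] ? "don't care" : "must stay");
    return 0;
}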
+ * + * attno column is known all-not-null (see gistSplitByKey), so we need not + * check for nulls + */ + gistentryinit(entry, spl->splitVector.spl_rdatum, r, NULL, + (OffsetNumber) 0, false); + for (i = 0; i < spl->splitVector.spl_nleft; i++) + { + int j = spl->splitVector.spl_left[i]; + float penalty = gistpenalty(giststate, attno, &entry, false, + &valvec[j], false); + + if (penalty == 0.0) + { + spl->spl_dontcare[j] = true; + NumDontCare++; + } + } + + /* And conversely for the right-side tuples */ + gistentryinit(entry, spl->splitVector.spl_ldatum, r, NULL, + (OffsetNumber) 0, false); + for (i = 0; i < spl->splitVector.spl_nright; i++) + { + int j = spl->splitVector.spl_right[i]; + float penalty = gistpenalty(giststate, attno, &entry, false, + &valvec[j], false); + + if (penalty == 0.0) + { + spl->spl_dontcare[j] = true; + NumDontCare++; + } + } + + return NumDontCare; +} + +/* + * Remove tuples that are marked don't-cares from the tuple index array a[] + * of length *len. This is applied separately to the spl_left and spl_right + * arrays. + */ +static void +removeDontCares(OffsetNumber *a, int *len, const bool *dontcare) +{ + int origlen, + newlen, + i; + OffsetNumber *curwpos; + + origlen = newlen = *len; + curwpos = a; + for (i = 0; i < origlen; i++) + { + OffsetNumber ai = a[i]; + + if (dontcare[ai] == false) + { + /* re-emit item into a[] */ + *curwpos = ai; + curwpos++; + } + else + newlen--; + } + + *len = newlen; +} + +/* + * Place a single don't-care tuple into either the left or right side of the + * split, according to which has least penalty for merging the tuple into + * the previously-computed union keys. We need consider only columns starting + * at attno. + */ +static void +placeOne(Relation r, GISTSTATE *giststate, GistSplitVector *v, + IndexTuple itup, OffsetNumber off, int attno) +{ + GISTENTRY identry[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + bool toLeft = true; + + gistDeCompressAtt(giststate, r, itup, NULL, (OffsetNumber) 0, + identry, isnull); + + for (; attno < giststate->nonLeafTupdesc->natts; attno++) + { + float lpenalty, + rpenalty; + GISTENTRY entry; + + gistentryinit(entry, v->spl_lattr[attno], r, NULL, 0, false); + lpenalty = gistpenalty(giststate, attno, &entry, v->spl_lisnull[attno], + identry + attno, isnull[attno]); + gistentryinit(entry, v->spl_rattr[attno], r, NULL, 0, false); + rpenalty = gistpenalty(giststate, attno, &entry, v->spl_risnull[attno], + identry + attno, isnull[attno]); + + if (lpenalty != rpenalty) + { + if (lpenalty > rpenalty) + toLeft = false; + break; + } + } + + if (toLeft) + v->splitVector.spl_left[v->splitVector.spl_nleft++] = off; + else + v->splitVector.spl_right[v->splitVector.spl_nright++] = off; +} + +#define SWAPVAR( s, d, t ) \ +do { \ + (t) = (s); \ + (s) = (d); \ + (d) = (t); \ +} while(0) + +/* + * Clean up when we did a secondary split but the user-defined PickSplit + * method didn't support it (leaving spl_ldatum_exists or spl_rdatum_exists + * true). + * + * We consider whether to swap the left and right outputs of the secondary + * split; this can be worthwhile if the penalty for merging those tuples into + * the previously chosen sets is less that way. + * + * In any case we must update the union datums for the current column by + * adding in the previous union keys (oldL/oldR), since the user-defined + * PickSplit method didn't do so. 
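The keep-or-swap decision described in this comment reduces to comparing two penalty sums: the cost of merging the secondary split's sides as they are versus the cost of the crossed assignment. A small standalone illustration, reusing the invented interval-enlargement penalty from the previous sketch (none of these names exist in the server):

#include <stdio.h>

typedef struct { double lo, hi; } Interval;

static double penalty(Interval u, Interval item)
{
    double lo = item.lo < u.lo ? item.lo : u.lo;
    double hi = item.hi > u.hi ? item.hi : u.hi;

    return (hi - lo) - (u.hi - u.lo);
}

int main(void)
{
    /* Union keys handed down by the previous column's split */
    Interval oldL = {0, 10};
    Interval oldR = {50, 60};

    /* Union keys produced by the secondary split of the current column */
    Interval SL = {55, 58};
    Interval SR = {2, 7};

    double keep = penalty(oldL, SL) + penalty(oldR, SR);
    double swap = penalty(oldL, SR) + penalty(oldR, SL);

    /* Swap the secondary split's sides when the crossed assignment is cheaper */
    printf("keep=%g swap=%g -> %s\n", keep, swap,
           keep > swap ? "swap sides" : "keep orientation");
    return 0;
}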
+ */ +static void +supportSecondarySplit(Relation r, GISTSTATE *giststate, int attno, + GIST_SPLITVEC *sv, Datum oldL, Datum oldR) +{ + bool leaveOnLeft = true, + tmpBool; + GISTENTRY entryL, + entryR, + entrySL, + entrySR; + + gistentryinit(entryL, oldL, r, NULL, 0, false); + gistentryinit(entryR, oldR, r, NULL, 0, false); + gistentryinit(entrySL, sv->spl_ldatum, r, NULL, 0, false); + gistentryinit(entrySR, sv->spl_rdatum, r, NULL, 0, false); + + if (sv->spl_ldatum_exists && sv->spl_rdatum_exists) + { + float penalty1, + penalty2; + + penalty1 = gistpenalty(giststate, attno, &entryL, false, &entrySL, false) + + gistpenalty(giststate, attno, &entryR, false, &entrySR, false); + penalty2 = gistpenalty(giststate, attno, &entryL, false, &entrySR, false) + + gistpenalty(giststate, attno, &entryR, false, &entrySL, false); + + if (penalty1 > penalty2) + leaveOnLeft = false; + } + else + { + GISTENTRY *entry1 = (sv->spl_ldatum_exists) ? &entryL : &entryR; + float penalty1, + penalty2; + + /* + * There is only one previously defined union, so we just choose swap + * or not by lowest penalty for that side. We can only get here if a + * secondary split happened to have all NULLs in its column in the + * tuples that the outer recursion level had assigned to one side. + * (Note that the null checks in gistSplitByKey don't prevent the + * case, because they'll only be checking tuples that were considered + * don't-cares at the outer recursion level, not the tuples that went + * into determining the passed-down left and right union keys.) + */ + penalty1 = gistpenalty(giststate, attno, entry1, false, &entrySL, false); + penalty2 = gistpenalty(giststate, attno, entry1, false, &entrySR, false); + + if (penalty1 < penalty2) + leaveOnLeft = (sv->spl_ldatum_exists) ? true : false; + else + leaveOnLeft = (sv->spl_rdatum_exists) ? true : false; + } + + if (leaveOnLeft == false) + { + /* + * swap left and right + */ + OffsetNumber *off, + noff; + Datum datum; + + SWAPVAR(sv->spl_left, sv->spl_right, off); + SWAPVAR(sv->spl_nleft, sv->spl_nright, noff); + SWAPVAR(sv->spl_ldatum, sv->spl_rdatum, datum); + gistentryinit(entrySL, sv->spl_ldatum, r, NULL, 0, false); + gistentryinit(entrySR, sv->spl_rdatum, r, NULL, 0, false); + } + + if (sv->spl_ldatum_exists) + gistMakeUnionKey(giststate, attno, &entryL, false, &entrySL, false, + &sv->spl_ldatum, &tmpBool); + + if (sv->spl_rdatum_exists) + gistMakeUnionKey(giststate, attno, &entryR, false, &entrySR, false, + &sv->spl_rdatum, &tmpBool); + + sv->spl_ldatum_exists = sv->spl_rdatum_exists = false; +} + +/* + * Trivial picksplit implementation. Function called only + * if user-defined picksplit puts all keys on the same side of the split. + * That is a bug of user-defined picksplit but we don't want to fail. 
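A rough standalone rendition of that fallback: send the first half of the entries left and the rest right, then form each side's union. Intervals stand in for opclass keys and make_union() is an invented helper, so this illustrates the idea rather than the function below.

#include <stdio.h>

typedef struct { double lo, hi; } Interval;

/* Union of a contiguous run of keys */
static Interval make_union(const Interval *keys, int n)
{
    Interval u = keys[0];

    for (int i = 1; i < n; i++)
    {
        if (keys[i].lo < u.lo)
            u.lo = keys[i].lo;
        if (keys[i].hi > u.hi)
            u.hi = keys[i].hi;
    }
    return u;
}

int main(void)
{
    Interval keys[] = {{0, 2}, {1, 4}, {3, 9}, {8, 12}, {10, 11}};
    int n = 5;
    int nleft = n / 2;          /* roughly the first half goes left */
    Interval lunion = make_union(keys, nleft);
    Interval runion = make_union(keys + nleft, n - nleft);

    printf("left:  %d entries, union [%g,%g]\n", nleft, lunion.lo, lunion.hi);
    printf("right: %d entries, union [%g,%g]\n", n - nleft, runion.lo, runion.hi);
    return 0;
}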
+ */ +static void +genericPickSplit(GISTSTATE *giststate, GistEntryVector *entryvec, GIST_SPLITVEC *v, int attno) +{ + OffsetNumber i, + maxoff; + int nbytes; + GistEntryVector *evec; + + maxoff = entryvec->n - 1; + + nbytes = (maxoff + 2) * sizeof(OffsetNumber); + + v->spl_left = (OffsetNumber *) palloc(nbytes); + v->spl_right = (OffsetNumber *) palloc(nbytes); + v->spl_nleft = v->spl_nright = 0; + + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + if (i <= (maxoff - FirstOffsetNumber + 1) / 2) + { + v->spl_left[v->spl_nleft] = i; + v->spl_nleft++; + } + else + { + v->spl_right[v->spl_nright] = i; + v->spl_nright++; + } + } + + /* + * Form union datums for each side + */ + evec = palloc(sizeof(GISTENTRY) * entryvec->n + GEVHDRSZ); + + evec->n = v->spl_nleft; + memcpy(evec->vector, entryvec->vector + FirstOffsetNumber, + sizeof(GISTENTRY) * evec->n); + v->spl_ldatum = FunctionCall2Coll(&giststate->unionFn[attno], + giststate->supportCollation[attno], + PointerGetDatum(evec), + PointerGetDatum(&nbytes)); + + evec->n = v->spl_nright; + memcpy(evec->vector, entryvec->vector + FirstOffsetNumber + v->spl_nleft, + sizeof(GISTENTRY) * evec->n); + v->spl_rdatum = FunctionCall2Coll(&giststate->unionFn[attno], + giststate->supportCollation[attno], + PointerGetDatum(evec), + PointerGetDatum(&nbytes)); +} + +/* + * Calls user picksplit method for attno column to split tuples into + * two vectors. + * + * Returns false if split is complete (there are no more index columns, or + * there is no need to consider them because split is optimal already). + * + * Returns true and v->spl_dontcare = NULL if the picksplit result is + * degenerate (all tuples seem to be don't-cares), so we should just + * disregard this column and split on the next column(s) instead. + * + * Returns true and v->spl_dontcare != NULL if there are don't-care tuples + * that could be relocated based on the next column(s). The don't-care + * tuples have been removed from the split and must be reinserted by caller. + * There is at least one non-don't-care tuple on each side of the split, + * and union keys for all columns are updated to include just those tuples. + * + * A true result implies there is at least one more index column. + */ +static bool +gistUserPicksplit(Relation r, GistEntryVector *entryvec, int attno, GistSplitVector *v, + IndexTuple *itup, int len, GISTSTATE *giststate) +{ + GIST_SPLITVEC *sv = &v->splitVector; + + /* + * Prepare spl_ldatum/spl_rdatum/spl_ldatum_exists/spl_rdatum_exists in + * case we are doing a secondary split (see comments in gist.h). + */ + sv->spl_ldatum_exists = (v->spl_lisnull[attno]) ? false : true; + sv->spl_rdatum_exists = (v->spl_risnull[attno]) ? false : true; + sv->spl_ldatum = v->spl_lattr[attno]; + sv->spl_rdatum = v->spl_rattr[attno]; + + /* + * Let the opclass-specific PickSplit method do its thing. Note that at + * this point we know there are no null keys in the entryvec. + */ + FunctionCall2Coll(&giststate->picksplitFn[attno], + giststate->supportCollation[attno], + PointerGetDatum(entryvec), + PointerGetDatum(sv)); + + if (sv->spl_nleft == 0 || sv->spl_nright == 0) + { + /* + * User-defined picksplit failed to create an actual split, ie it put + * everything on the same side. Complain but cope. + */ + ereport(DEBUG1, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("picksplit method for column %d of index \"%s\" failed", + attno + 1, RelationGetRelationName(r)), + errhint("The index is not optimal. 
To optimize it, contact a developer, or try to use the column as the second one in the CREATE INDEX command."))); + + /* + * Reinit GIST_SPLITVEC. Although these fields are not used by + * genericPickSplit(), set them up for further processing + */ + sv->spl_ldatum_exists = (v->spl_lisnull[attno]) ? false : true; + sv->spl_rdatum_exists = (v->spl_risnull[attno]) ? false : true; + sv->spl_ldatum = v->spl_lattr[attno]; + sv->spl_rdatum = v->spl_rattr[attno]; + + /* Do a generic split */ + genericPickSplit(giststate, entryvec, sv, attno); + } + else + { + /* hack for compatibility with old picksplit API */ + if (sv->spl_left[sv->spl_nleft - 1] == InvalidOffsetNumber) + sv->spl_left[sv->spl_nleft - 1] = (OffsetNumber) (entryvec->n - 1); + if (sv->spl_right[sv->spl_nright - 1] == InvalidOffsetNumber) + sv->spl_right[sv->spl_nright - 1] = (OffsetNumber) (entryvec->n - 1); + } + + /* Clean up if PickSplit didn't take care of a secondary split */ + if (sv->spl_ldatum_exists || sv->spl_rdatum_exists) + supportSecondarySplit(r, giststate, attno, sv, + v->spl_lattr[attno], v->spl_rattr[attno]); + + /* emit union datums computed by PickSplit back to v arrays */ + v->spl_lattr[attno] = sv->spl_ldatum; + v->spl_rattr[attno] = sv->spl_rdatum; + v->spl_lisnull[attno] = false; + v->spl_risnull[attno] = false; + + /* + * If index columns remain, then consider whether we can improve the split + * by using them. + */ + v->spl_dontcare = NULL; + + if (attno + 1 < giststate->nonLeafTupdesc->natts) + { + int NumDontCare; + + /* + * Make a quick check to see if left and right union keys are equal; + * if so, the split is certainly degenerate, so tell caller to + * re-split with the next column. + */ + if (gistKeyIsEQ(giststate, attno, sv->spl_ldatum, sv->spl_rdatum)) + return true; + + /* + * Locate don't-care tuples, if any. If there are none, the split is + * optimal, so just fall out and return false. + */ + v->spl_dontcare = (bool *) palloc0(sizeof(bool) * (entryvec->n + 1)); + + NumDontCare = findDontCares(r, giststate, entryvec->vector, v, attno); + + if (NumDontCare > 0) + { + /* + * Remove don't-cares from spl_left[] and spl_right[]. + */ + removeDontCares(sv->spl_left, &sv->spl_nleft, v->spl_dontcare); + removeDontCares(sv->spl_right, &sv->spl_nright, v->spl_dontcare); + + /* + * If all tuples on either side were don't-cares, the split is + * degenerate, and we're best off to ignore it and split on the + * next column. (We used to try to press on with a secondary + * split by forcing a random tuple on each side to be treated as + * non-don't-care, but it seems unlikely that that technique + * really gives a better result. Note that we don't want to try a + * secondary split with empty left or right primary split sides, + * because then there is no union key on that side for the + * PickSplit function to try to expand, so it can have no good + * figure of merit for what it's doing. Also note that this check + * ensures we can't produce a bogus one-side-only split in the + * NumDontCare == 1 special case below.) + */ + if (sv->spl_nleft == 0 || sv->spl_nright == 0) + { + v->spl_dontcare = NULL; + return true; + } + + /* + * Recompute union keys, considering only non-don't-care tuples. + * NOTE: this will set union keys for remaining index columns, + * which will cause later calls of gistUserPicksplit to pass those + * values down to user-defined PickSplit methods with + * spl_ldatum_exists/spl_rdatum_exists set true. 
+ */ + gistunionsubkey(giststate, itup, v); + + if (NumDontCare == 1) + { + /* + * If there's only one don't-care tuple then we can't do a + * PickSplit on it, so just choose whether to send it left or + * right by comparing penalties. We needed the + * gistunionsubkey step anyway so that we have appropriate + * union keys for figuring the penalties. + */ + OffsetNumber toMove; + + /* find it ... */ + for (toMove = FirstOffsetNumber; toMove < entryvec->n; toMove++) + { + if (v->spl_dontcare[toMove]) + break; + } + Assert(toMove < entryvec->n); + + /* ... and assign it to cheaper side */ + placeOne(r, giststate, v, itup[toMove - 1], toMove, attno + 1); + + /* + * At this point the union keys are wrong, but we don't care + * because we're done splitting. The outermost recursion + * level of gistSplitByKey will fix things before returning. + */ + } + else + return true; + } + } + + return false; +} + +/* + * simply split page in half + */ +static void +gistSplitHalf(GIST_SPLITVEC *v, int len) +{ + int i; + + v->spl_nright = v->spl_nleft = 0; + v->spl_left = (OffsetNumber *) palloc(len * sizeof(OffsetNumber)); + v->spl_right = (OffsetNumber *) palloc(len * sizeof(OffsetNumber)); + for (i = 1; i <= len; i++) + if (i < len / 2) + v->spl_right[v->spl_nright++] = i; + else + v->spl_left[v->spl_nleft++] = i; + + /* we need not compute union keys, caller took care of it */ +} + +/* + * gistSplitByKey: main entry point for page-splitting algorithm + * + * r: index relation + * page: page being split + * itup: array of IndexTuples to be processed + * len: number of IndexTuples to be processed (must be at least 2) + * giststate: additional info about index + * v: working state and output area + * attno: column we are working on (zero-based index) + * + * Outside caller must initialize v->spl_lisnull and v->spl_risnull arrays + * to all-true. On return, spl_left/spl_nleft contain indexes of tuples + * to go left, spl_right/spl_nright contain indexes of tuples to go right, + * spl_lattr/spl_lisnull contain left-side union key values, and + * spl_rattr/spl_risnull contain right-side union key values. Other fields + * in this struct are workspace for this file. + * + * Outside caller must pass zero for attno. The function may internally + * recurse to the next column by passing attno+1. + */ +void +gistSplitByKey(Relation r, Page page, IndexTuple *itup, int len, + GISTSTATE *giststate, GistSplitVector *v, int attno) +{ + GistEntryVector *entryvec; + OffsetNumber *offNullTuples; + int nOffNullTuples = 0; + int i; + + /* generate the item array, and identify tuples with null keys */ + /* note that entryvec->vector[0] goes unused in this code */ + entryvec = palloc(GEVHDRSZ + (len + 1) * sizeof(GISTENTRY)); + entryvec->n = len + 1; + offNullTuples = (OffsetNumber *) palloc(len * sizeof(OffsetNumber)); + + for (i = 1; i <= len; i++) + { + Datum datum; + bool IsNull; + + datum = index_getattr(itup[i - 1], attno + 1, giststate->leafTupdesc, + &IsNull); + gistdentryinit(giststate, attno, &(entryvec->vector[i]), + datum, r, page, i, + false, IsNull); + if (IsNull) + offNullTuples[nOffNullTuples++] = i; + } + + if (nOffNullTuples == len) + { + /* + * Corner case: All keys in attno column are null, so just transfer + * our attention to the next column. If there's no next column, just + * split page in half. 
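When exactly one don't-care tuple remains, it is handed to placeOne, which walks the remaining columns and lets the first column whose penalties differ pick the cheaper side. A standalone sketch of that rule, again with invented interval-based names:

#include <stdio.h>

#define NCOLS 2

typedef struct { double lo, hi; } Interval;

static double penalty(Interval u, Interval item)
{
    double lo = item.lo < u.lo ? item.lo : u.lo;
    double hi = item.hi > u.hi ? item.hi : u.hi;

    return (hi - lo) - (u.hi - u.lo);
}

int main(void)
{
    /* Union keys of the two halves, one entry per column */
    Interval left_union[NCOLS] = {{0, 10}, {0, 5}};
    Interval right_union[NCOLS] = {{0, 10}, {6, 12}};

    /* The single leftover "don't care" tuple */
    Interval item[NCOLS] = {{3, 4}, {7, 8}};

    int to_left = 1;

    /* The first column whose penalties differ decides the side */
    for (int col = 0; col < NCOLS; col++)
    {
        double lp = penalty(left_union[col], item[col]);
        double rp = penalty(right_union[col], item[col]);

        if (lp != rp)
        {
            to_left = (lp < rp);
            break;
        }
    }

    printf("place the leftover tuple on the %s side\n",
           to_left ? "left" : "right");
    return 0;
}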
+ */ + v->spl_risnull[attno] = v->spl_lisnull[attno] = true; + + if (attno + 1 < giststate->nonLeafTupdesc->natts) + gistSplitByKey(r, page, itup, len, giststate, v, attno + 1); + else + gistSplitHalf(&v->splitVector, len); + } + else if (nOffNullTuples > 0) + { + int j = 0; + + /* + * We don't want to mix NULL and not-NULL keys on one page, so split + * nulls to right page and not-nulls to left. + */ + v->splitVector.spl_right = offNullTuples; + v->splitVector.spl_nright = nOffNullTuples; + v->spl_risnull[attno] = true; + + v->splitVector.spl_left = (OffsetNumber *) palloc(len * sizeof(OffsetNumber)); + v->splitVector.spl_nleft = 0; + for (i = 1; i <= len; i++) + if (j < v->splitVector.spl_nright && offNullTuples[j] == i) + j++; + else + v->splitVector.spl_left[v->splitVector.spl_nleft++] = i; + + /* Compute union keys, unless outer recursion level will handle it */ + if (attno == 0 && giststate->nonLeafTupdesc->natts == 1) + { + v->spl_dontcare = NULL; + gistunionsubkey(giststate, itup, v); + } + } + else + { + /* + * All keys are not-null, so apply user-defined PickSplit method + */ + if (gistUserPicksplit(r, entryvec, attno, v, itup, len, giststate)) + { + /* + * Splitting on attno column is not optimal, so consider + * redistributing don't-care tuples according to the next column + */ + Assert(attno + 1 < giststate->nonLeafTupdesc->natts); + + if (v->spl_dontcare == NULL) + { + /* + * This split was actually degenerate, so ignore it altogether + * and just split according to the next column. + */ + gistSplitByKey(r, page, itup, len, giststate, v, attno + 1); + } + else + { + /* + * Form an array of just the don't-care tuples to pass to a + * recursive invocation of this function for the next column. + */ + IndexTuple *newitup = (IndexTuple *) palloc(len * sizeof(IndexTuple)); + OffsetNumber *map = (OffsetNumber *) palloc(len * sizeof(OffsetNumber)); + int newlen = 0; + GIST_SPLITVEC backupSplit; + + for (i = 0; i < len; i++) + { + if (v->spl_dontcare[i + 1]) + { + newitup[newlen] = itup[i]; + map[newlen] = i + 1; + newlen++; + } + } + + Assert(newlen > 0); + + /* + * Make a backup copy of v->splitVector, since the recursive + * call will overwrite that with its own result. + */ + backupSplit = v->splitVector; + backupSplit.spl_left = (OffsetNumber *) palloc(sizeof(OffsetNumber) * len); + memcpy(backupSplit.spl_left, v->splitVector.spl_left, sizeof(OffsetNumber) * v->splitVector.spl_nleft); + backupSplit.spl_right = (OffsetNumber *) palloc(sizeof(OffsetNumber) * len); + memcpy(backupSplit.spl_right, v->splitVector.spl_right, sizeof(OffsetNumber) * v->splitVector.spl_nright); + + /* Recursively decide how to split the don't-care tuples */ + gistSplitByKey(r, page, newitup, newlen, giststate, v, attno + 1); + + /* Merge result of subsplit with non-don't-care tuples */ + for (i = 0; i < v->splitVector.spl_nleft; i++) + backupSplit.spl_left[backupSplit.spl_nleft++] = map[v->splitVector.spl_left[i] - 1]; + for (i = 0; i < v->splitVector.spl_nright; i++) + backupSplit.spl_right[backupSplit.spl_nright++] = map[v->splitVector.spl_right[i] - 1]; + + v->splitVector = backupSplit; + } + } + } + + /* + * If we're handling a multicolumn index, at the end of the recursion + * recompute the left and right union datums for all index columns. 
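The merge step at the end of this recursion translates the sub-split's positions back into the parent's offsets through map[]. The sketch below isolates just that bookkeeping; the data and array names are made up, and fixed-size arrays replace palloc'd ones.

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
    /* Parent split over offsets 1..6; offsets 2 and 5 were "don't cares" */
    bool dontcare[7] = {false, false, true, false, false, true, false};
    int parent_left[6] = {1, 4}, nleft = 2;
    int parent_right[6] = {3, 6}, nright = 2;

    /* Compact the don't-cares and remember each one's parent offset */
    int map[6], newlen = 0;

    for (int off = 1; off <= 6; off++)
        if (dontcare[off])
            map[newlen++] = off;

    /* Suppose the recursive call split the don't-cares: first left, second right */
    int sub_left[6] = {1}, sub_nleft = 1;
    int sub_right[6] = {2}, sub_nright = 1;

    /* Merge: translate sub-split positions back to parent offsets via map[] */
    for (int i = 0; i < sub_nleft; i++)
        parent_left[nleft++] = map[sub_left[i] - 1];
    for (int i = 0; i < sub_nright; i++)
        parent_right[nright++] = map[sub_right[i] - 1];

    printf("left: ");
    for (int i = 0; i < nleft; i++)
        printf("%d ", parent_left[i]);
    printf("\nright: ");
    for (int i = 0; i < nright; i++)
        printf("%d ", parent_right[i]);
    printf("\n");
    return 0;
}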
This + * makes sure we hand back correct union datums in all corner cases, + * including when we haven't processed all columns to start with, or when + * a secondary split moved "don't care" tuples from one side to the other + * (we really shouldn't assume that that didn't change the union datums). + * + * Note: when we're in an internal recursion (attno > 0), we do not worry + * about whether the union datums we return with are sensible, since + * calling levels won't care. Also, in a single-column index, we expect + * that PickSplit (or the special cases above) produced correct union + * datums. + */ + if (attno == 0 && giststate->nonLeafTupdesc->natts > 1) + { + v->spl_dontcare = NULL; + gistunionsubkey(giststate, itup, v); + } +} diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c new file mode 100644 index 0000000..43ba03b --- /dev/null +++ b/src/backend/access/gist/gistutil.c @@ -0,0 +1,1066 @@ +/*------------------------------------------------------------------------- + * + * gistutil.c + * utility routines for the postgres GiST index access method. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gist/gistutil.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <math.h> + +#include "access/gist_private.h" +#include "access/htup_details.h" +#include "access/reloptions.h" +#include "catalog/pg_opclass.h" +#include "storage/indexfsm.h" +#include "storage/lmgr.h" +#include "utils/float.h" +#include "utils/lsyscache.h" +#include "utils/snapmgr.h" +#include "utils/syscache.h" + +/* + * Write itup vector to page; does not check for free space. + */ +void +gistfillbuffer(Page page, IndexTuple *itup, int len, OffsetNumber off) +{ + int i; + + if (off == InvalidOffsetNumber) + off = (PageIsEmpty(page)) ? 
FirstOffsetNumber : + OffsetNumberNext(PageGetMaxOffsetNumber(page)); + + for (i = 0; i < len; i++) + { + Size sz = IndexTupleSize(itup[i]); + OffsetNumber l; + + l = PageAddItem(page, (Item) itup[i], sz, off, false, false); + if (l == InvalidOffsetNumber) + elog(ERROR, "failed to add item to GiST index page, item %d out of %d, size %d bytes", + i, len, (int) sz); + off++; + } +} + +/* + * Check space for itup vector on page + */ +bool +gistnospace(Page page, IndexTuple *itvec, int len, OffsetNumber todelete, Size freespace) +{ + unsigned int size = freespace, + deleted = 0; + int i; + + for (i = 0; i < len; i++) + size += IndexTupleSize(itvec[i]) + sizeof(ItemIdData); + + if (todelete != InvalidOffsetNumber) + { + IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, todelete)); + + deleted = IndexTupleSize(itup) + sizeof(ItemIdData); + } + + return (PageGetFreeSpace(page) + deleted < size); +} + +bool +gistfitpage(IndexTuple *itvec, int len) +{ + int i; + Size size = 0; + + for (i = 0; i < len; i++) + size += IndexTupleSize(itvec[i]) + sizeof(ItemIdData); + + /* TODO: Consider fillfactor */ + return (size <= GiSTPageSize); +} + +/* + * Read buffer into itup vector + */ +IndexTuple * +gistextractpage(Page page, int *len /* out */ ) +{ + OffsetNumber i, + maxoff; + IndexTuple *itvec; + + maxoff = PageGetMaxOffsetNumber(page); + *len = maxoff; + itvec = palloc(sizeof(IndexTuple) * maxoff); + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + itvec[i - FirstOffsetNumber] = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); + + return itvec; +} + +/* + * join two vectors into one + */ +IndexTuple * +gistjoinvector(IndexTuple *itvec, int *len, IndexTuple *additvec, int addlen) +{ + itvec = (IndexTuple *) repalloc((void *) itvec, sizeof(IndexTuple) * ((*len) + addlen)); + memmove(&itvec[*len], additvec, sizeof(IndexTuple) * addlen); + *len += addlen; + return itvec; +} + +/* + * make plain IndexTuple vector + */ + +IndexTupleData * +gistfillitupvec(IndexTuple *vec, int veclen, int *memlen) +{ + char *ptr, + *ret; + int i; + + *memlen = 0; + + for (i = 0; i < veclen; i++) + *memlen += IndexTupleSize(vec[i]); + + ptr = ret = palloc(*memlen); + + for (i = 0; i < veclen; i++) + { + memcpy(ptr, vec[i], IndexTupleSize(vec[i])); + ptr += IndexTupleSize(vec[i]); + } + + return (IndexTupleData *) ret; +} + +/* + * Make unions of keys in IndexTuple vector (one union datum per index column). + * Union Datums are returned into the attr/isnull arrays. + * Resulting Datums aren't compressed. 
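The space checks above are plain arithmetic: every tuple costs its own size plus one line-pointer slot, and a tuple slated for deletion gives its space back. A hedged standalone sketch (ITEM_ID_SIZE approximates sizeof(ItemIdData), and tuples_fit() is an invented helper rather than the server's gistnospace):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define ITEM_ID_SIZE 4          /* stand-in for one line-pointer slot */

/*
 * Would tuples of the given sizes fit, given the page's current free space,
 * some reserved headroom, and the space handed back by deleting one existing
 * tuple of size "deleted"?
 */
static bool tuples_fit(size_t page_free, size_t reserved,
                       const size_t *sizes, int ntuples, size_t deleted)
{
    size_t needed = reserved;

    for (int i = 0; i < ntuples; i++)
        needed += sizes[i] + ITEM_ID_SIZE;

    return page_free + deleted >= needed;
}

int main(void)
{
    size_t sizes[] = {120, 80, 256};

    printf("fits without replacing anything: %s\n",
           tuples_fit(400, 0, sizes, 3, 0) ? "yes" : "no");
    printf("fits when replacing a 200-byte tuple: %s\n",
           tuples_fit(400, 0, sizes, 3, 200 + ITEM_ID_SIZE) ? "yes" : "no");
    return 0;
}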
+ */ +void +gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len, + Datum *attr, bool *isnull) +{ + int i; + GistEntryVector *evec; + int attrsize; + + evec = (GistEntryVector *) palloc((len + 2) * sizeof(GISTENTRY) + GEVHDRSZ); + + for (i = 0; i < giststate->nonLeafTupdesc->natts; i++) + { + int j; + + /* Collect non-null datums for this column */ + evec->n = 0; + for (j = 0; j < len; j++) + { + Datum datum; + bool IsNull; + + datum = index_getattr(itvec[j], i + 1, giststate->leafTupdesc, + &IsNull); + if (IsNull) + continue; + + gistdentryinit(giststate, i, + evec->vector + evec->n, + datum, + NULL, NULL, (OffsetNumber) 0, + false, IsNull); + evec->n++; + } + + /* If this column was all NULLs, the union is NULL */ + if (evec->n == 0) + { + attr[i] = (Datum) 0; + isnull[i] = true; + } + else + { + if (evec->n == 1) + { + /* unionFn may expect at least two inputs */ + evec->n = 2; + evec->vector[1] = evec->vector[0]; + } + + /* Make union and store in attr array */ + attr[i] = FunctionCall2Coll(&giststate->unionFn[i], + giststate->supportCollation[i], + PointerGetDatum(evec), + PointerGetDatum(&attrsize)); + + isnull[i] = false; + } + } +} + +/* + * Return an IndexTuple containing the result of applying the "union" + * method to the specified IndexTuple vector. + */ +IndexTuple +gistunion(Relation r, IndexTuple *itvec, int len, GISTSTATE *giststate) +{ + Datum attr[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + + gistMakeUnionItVec(giststate, itvec, len, attr, isnull); + + return gistFormTuple(giststate, r, attr, isnull, false); +} + +/* + * makes union of two key + */ +void +gistMakeUnionKey(GISTSTATE *giststate, int attno, + GISTENTRY *entry1, bool isnull1, + GISTENTRY *entry2, bool isnull2, + Datum *dst, bool *dstisnull) +{ + /* we need a GistEntryVector with room for exactly 2 elements */ + union + { + GistEntryVector gev; + char padding[2 * sizeof(GISTENTRY) + GEVHDRSZ]; + } storage; + GistEntryVector *evec = &storage.gev; + int dstsize; + + evec->n = 2; + + if (isnull1 && isnull2) + { + *dstisnull = true; + *dst = (Datum) 0; + } + else + { + if (isnull1 == false && isnull2 == false) + { + evec->vector[0] = *entry1; + evec->vector[1] = *entry2; + } + else if (isnull1 == false) + { + evec->vector[0] = *entry1; + evec->vector[1] = *entry1; + } + else + { + evec->vector[0] = *entry2; + evec->vector[1] = *entry2; + } + + *dstisnull = false; + *dst = FunctionCall2Coll(&giststate->unionFn[attno], + giststate->supportCollation[attno], + PointerGetDatum(evec), + PointerGetDatum(&dstsize)); + } +} + +bool +gistKeyIsEQ(GISTSTATE *giststate, int attno, Datum a, Datum b) +{ + bool result; + + FunctionCall3Coll(&giststate->equalFn[attno], + giststate->supportCollation[attno], + a, b, + PointerGetDatum(&result)); + return result; +} + +/* + * Decompress all keys in tuple + */ +void +gistDeCompressAtt(GISTSTATE *giststate, Relation r, IndexTuple tuple, Page p, + OffsetNumber o, GISTENTRY *attdata, bool *isnull) +{ + int i; + + for (i = 0; i < IndexRelationGetNumberOfKeyAttributes(r); i++) + { + Datum datum; + + datum = index_getattr(tuple, i + 1, giststate->leafTupdesc, &isnull[i]); + gistdentryinit(giststate, i, &attdata[i], + datum, r, p, o, + false, isnull[i]); + } +} + +/* + * Forms union of oldtup and addtup, if union == oldtup then return NULL + */ +IndexTuple +gistgetadjusted(Relation r, IndexTuple oldtup, IndexTuple addtup, GISTSTATE *giststate) +{ + bool neednew = false; + GISTENTRY oldentries[INDEX_MAX_KEYS], + addentries[INDEX_MAX_KEYS]; + bool oldisnull[INDEX_MAX_KEYS], 
+ addisnull[INDEX_MAX_KEYS]; + Datum attr[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + IndexTuple newtup = NULL; + int i; + + gistDeCompressAtt(giststate, r, oldtup, NULL, + (OffsetNumber) 0, oldentries, oldisnull); + + gistDeCompressAtt(giststate, r, addtup, NULL, + (OffsetNumber) 0, addentries, addisnull); + + for (i = 0; i < IndexRelationGetNumberOfKeyAttributes(r); i++) + { + gistMakeUnionKey(giststate, i, + oldentries + i, oldisnull[i], + addentries + i, addisnull[i], + attr + i, isnull + i); + + if (neednew) + /* we already need new key, so we can skip check */ + continue; + + if (isnull[i]) + /* union of key may be NULL if and only if both keys are NULL */ + continue; + + if (!addisnull[i]) + { + if (oldisnull[i] || + !gistKeyIsEQ(giststate, i, oldentries[i].key, attr[i])) + neednew = true; + } + } + + if (neednew) + { + /* need to update key */ + newtup = gistFormTuple(giststate, r, attr, isnull, false); + newtup->t_tid = oldtup->t_tid; + } + + return newtup; +} + +/* + * Search an upper index page for the entry with lowest penalty for insertion + * of the new index key contained in "it". + * + * Returns the index of the page entry to insert into. + */ +OffsetNumber +gistchoose(Relation r, Page p, IndexTuple it, /* it has compressed entry */ + GISTSTATE *giststate) +{ + OffsetNumber result; + OffsetNumber maxoff; + OffsetNumber i; + float best_penalty[INDEX_MAX_KEYS]; + GISTENTRY entry, + identry[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + int keep_current_best; + + Assert(!GistPageIsLeaf(p)); + + gistDeCompressAtt(giststate, r, + it, NULL, (OffsetNumber) 0, + identry, isnull); + + /* we'll return FirstOffsetNumber if page is empty (shouldn't happen) */ + result = FirstOffsetNumber; + + /* + * The index may have multiple columns, and there's a penalty value for + * each column. The penalty associated with a column that appears earlier + * in the index definition is strictly more important than the penalty of + * a column that appears later in the index definition. + * + * best_penalty[j] is the best penalty we have seen so far for column j, + * or -1 when we haven't yet examined column j. Array entries to the + * right of the first -1 are undefined. + */ + best_penalty[0] = -1; + + /* + * If we find a tuple that's exactly as good as the currently best one, we + * could use either one. When inserting a lot of tuples with the same or + * similar keys, it's preferable to descend down the same path when + * possible, as that's more cache-friendly. On the other hand, if all + * inserts land on the same leaf page after a split, we're never going to + * insert anything to the other half of the split, and will end up using + * only 50% of the available space. Distributing the inserts evenly would + * lead to better space usage, but that hurts cache-locality during + * insertion. To get the best of both worlds, when we find a tuple that's + * exactly as good as the previous best, choose randomly whether to stick + * to the old best, or use the new one. Once we decide to stick to the + * old best, we keep sticking to it for any subsequent equally good tuples + * we might find. This favors tuples with low offsets, but still allows + * some inserts to go to other equally-good subtrees. + * + * keep_current_best is -1 if we haven't yet had to make a random choice + * whether to keep the current best tuple. If we have done so, and + * decided to keep it, keep_current_best is 1; if we've decided to + * replace, keep_current_best is 0. 
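The comparison rule described here treats the per-column penalties as a vector compared left to right: a strictly smaller penalty in an earlier column wins outright, ties fall through to the next column, and an exact tie across all columns is broken by a one-time random keep-or-replace choice. The standalone sketch below follows that shape for a two-column index with invented interval keys; rand() stands in for the server's random(), and the zero-penalty early exit is omitted for brevity.

#include <stdio.h>
#include <stdlib.h>

#define NCOLS 2

typedef struct { double lo, hi; } Interval;

static double penalty(Interval u, Interval item)
{
    double lo = item.lo < u.lo ? item.lo : u.lo;
    double hi = item.hi > u.hi ? item.hi : u.hi;

    return (hi - lo) - (u.hi - u.lo);
}

int main(void)
{
    /* Downlink union keys on an internal page, two columns per downlink */
    Interval page[][NCOLS] = {
        {{0, 10}, {0, 5}},
        {{0, 10}, {4, 9}},
        {{20, 30}, {0, 9}},
    };
    int ntuples = 3;
    Interval newkey[NCOLS] = {{2, 3}, {4, 5}};

    double best_penalty[NCOLS];
    int result = 0;
    int keep_current_best = -1;     /* -1 = random choice not made yet */

    best_penalty[0] = -1;           /* nothing examined yet */

    for (int i = 0; i < ntuples; i++)
    {
        int j;

        for (j = 0; j < NCOLS; j++)
        {
            double usize = penalty(page[i][j], newkey[j]);

            if (best_penalty[j] < 0 || usize < best_penalty[j])
            {
                /* strictly better on column j: adopt this downlink */
                result = i;
                best_penalty[j] = usize;
                if (j + 1 < NCOLS)
                    best_penalty[j + 1] = -1;
                keep_current_best = -1;
            }
            else if (best_penalty[j] == usize)
            {
                /* tied on this column; let the next column decide */
            }
            else
                break;              /* strictly worse: try the next downlink */
        }

        /* exact tie on every column: keep or replace, decided at random once */
        if (j == NCOLS && result != i)
        {
            if (keep_current_best == -1)
                keep_current_best = rand() & 1;
            if (keep_current_best == 0)
            {
                result = i;
                keep_current_best = -1;
            }
        }
    }

    printf("insert under downlink %d (penalties %g, %g)\n",
           result, best_penalty[0], best_penalty[1]);
    return 0;
}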
(This state will be reset to -1 as + * soon as we've made the replacement, but sometimes we make the choice in + * advance of actually finding a replacement best tuple.) + */ + keep_current_best = -1; + + /* + * Loop over tuples on page. + */ + maxoff = PageGetMaxOffsetNumber(p); + Assert(maxoff >= FirstOffsetNumber); + + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + IndexTuple itup = (IndexTuple) PageGetItem(p, PageGetItemId(p, i)); + bool zero_penalty; + int j; + + zero_penalty = true; + + /* Loop over index attributes. */ + for (j = 0; j < IndexRelationGetNumberOfKeyAttributes(r); j++) + { + Datum datum; + float usize; + bool IsNull; + + /* Compute penalty for this column. */ + datum = index_getattr(itup, j + 1, giststate->leafTupdesc, + &IsNull); + gistdentryinit(giststate, j, &entry, datum, r, p, i, + false, IsNull); + usize = gistpenalty(giststate, j, &entry, IsNull, + &identry[j], isnull[j]); + if (usize > 0) + zero_penalty = false; + + if (best_penalty[j] < 0 || usize < best_penalty[j]) + { + /* + * New best penalty for column. Tentatively select this tuple + * as the target, and record the best penalty. Then reset the + * next column's penalty to "unknown" (and indirectly, the + * same for all the ones to its right). This will force us to + * adopt this tuple's penalty values as the best for all the + * remaining columns during subsequent loop iterations. + */ + result = i; + best_penalty[j] = usize; + + if (j < IndexRelationGetNumberOfKeyAttributes(r) - 1) + best_penalty[j + 1] = -1; + + /* we have new best, so reset keep-it decision */ + keep_current_best = -1; + } + else if (best_penalty[j] == usize) + { + /* + * The current tuple is exactly as good for this column as the + * best tuple seen so far. The next iteration of this loop + * will compare the next column. + */ + } + else + { + /* + * The current tuple is worse for this column than the best + * tuple seen so far. Skip the remaining columns and move on + * to the next tuple, if any. + */ + zero_penalty = false; /* so outer loop won't exit */ + break; + } + } + + /* + * If we looped past the last column, and did not update "result", + * then this tuple is exactly as good as the prior best tuple. + */ + if (j == IndexRelationGetNumberOfKeyAttributes(r) && result != i) + { + if (keep_current_best == -1) + { + /* we didn't make the random choice yet for this old best */ + keep_current_best = (random() <= (MAX_RANDOM_VALUE / 2)) ? 1 : 0; + } + if (keep_current_best == 0) + { + /* we choose to use the new tuple */ + result = i; + /* choose again if there are even more exactly-as-good ones */ + keep_current_best = -1; + } + } + + /* + * If we find a tuple with zero penalty for all columns, and we've + * decided we don't want to search for another tuple with equal + * penalty, there's no need to examine remaining tuples; just break + * out of the loop and return it. + */ + if (zero_penalty) + { + if (keep_current_best == -1) + { + /* we didn't make the random choice yet for this old best */ + keep_current_best = (random() <= (MAX_RANDOM_VALUE / 2)) ? 
1 : 0; + } + if (keep_current_best == 1) + break; + } + } + + return result; +} + +/* + * initialize a GiST entry with a decompressed version of key + */ +void +gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e, + Datum k, Relation r, Page pg, OffsetNumber o, + bool l, bool isNull) +{ + if (!isNull) + { + GISTENTRY *dep; + + gistentryinit(*e, k, r, pg, o, l); + + /* there may not be a decompress function in opclass */ + if (!OidIsValid(giststate->decompressFn[nkey].fn_oid)) + return; + + dep = (GISTENTRY *) + DatumGetPointer(FunctionCall1Coll(&giststate->decompressFn[nkey], + giststate->supportCollation[nkey], + PointerGetDatum(e))); + /* decompressFn may just return the given pointer */ + if (dep != e) + gistentryinit(*e, dep->key, dep->rel, dep->page, dep->offset, + dep->leafkey); + } + else + gistentryinit(*e, (Datum) 0, r, pg, o, l); +} + +IndexTuple +gistFormTuple(GISTSTATE *giststate, Relation r, + Datum *attdata, bool *isnull, bool isleaf) +{ + Datum compatt[INDEX_MAX_KEYS]; + IndexTuple res; + + gistCompressValues(giststate, r, attdata, isnull, isleaf, compatt); + + res = index_form_tuple(isleaf ? giststate->leafTupdesc : + giststate->nonLeafTupdesc, + compatt, isnull); + + /* + * The offset number on tuples on internal pages is unused. For historical + * reasons, it is set to 0xffff. + */ + ItemPointerSetOffsetNumber(&(res->t_tid), 0xffff); + return res; +} + +void +gistCompressValues(GISTSTATE *giststate, Relation r, + Datum *attdata, bool *isnull, bool isleaf, Datum *compatt) +{ + int i; + + /* + * Call the compress method on each attribute. + */ + for (i = 0; i < IndexRelationGetNumberOfKeyAttributes(r); i++) + { + if (isnull[i]) + compatt[i] = (Datum) 0; + else + { + GISTENTRY centry; + GISTENTRY *cep; + + gistentryinit(centry, attdata[i], r, NULL, (OffsetNumber) 0, + isleaf); + /* there may not be a compress function in opclass */ + if (OidIsValid(giststate->compressFn[i].fn_oid)) + cep = (GISTENTRY *) + DatumGetPointer(FunctionCall1Coll(&giststate->compressFn[i], + giststate->supportCollation[i], + PointerGetDatum(¢ry))); + else + cep = ¢ry; + compatt[i] = cep->key; + } + } + + if (isleaf) + { + /* + * Emplace each included attribute if any. + */ + for (; i < r->rd_att->natts; i++) + { + if (isnull[i]) + compatt[i] = (Datum) 0; + else + compatt[i] = attdata[i]; + } + } +} + +/* + * initialize a GiST entry with fetched value in key field + */ +static Datum +gistFetchAtt(GISTSTATE *giststate, int nkey, Datum k, Relation r) +{ + GISTENTRY fentry; + GISTENTRY *fep; + + gistentryinit(fentry, k, r, NULL, (OffsetNumber) 0, false); + + fep = (GISTENTRY *) + DatumGetPointer(FunctionCall1Coll(&giststate->fetchFn[nkey], + giststate->supportCollation[nkey], + PointerGetDatum(&fentry))); + + /* fetchFn set 'key', return it to the caller */ + return fep->key; +} + +/* + * Fetch all keys in tuple. + * Returns a new HeapTuple containing the originally-indexed data. 
+ */ +HeapTuple +gistFetchTuple(GISTSTATE *giststate, Relation r, IndexTuple tuple) +{ + MemoryContext oldcxt = MemoryContextSwitchTo(giststate->tempCxt); + Datum fetchatt[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + int i; + + for (i = 0; i < IndexRelationGetNumberOfKeyAttributes(r); i++) + { + Datum datum; + + datum = index_getattr(tuple, i + 1, giststate->leafTupdesc, &isnull[i]); + + if (giststate->fetchFn[i].fn_oid != InvalidOid) + { + if (!isnull[i]) + fetchatt[i] = gistFetchAtt(giststate, i, datum, r); + else + fetchatt[i] = (Datum) 0; + } + else if (giststate->compressFn[i].fn_oid == InvalidOid) + { + /* + * If opclass does not provide compress method that could change + * original value, att is necessarily stored in original form. + */ + if (!isnull[i]) + fetchatt[i] = datum; + else + fetchatt[i] = (Datum) 0; + } + else + { + /* + * Index-only scans not supported for this column. Since the + * planner chose an index-only scan anyway, it is not interested + * in this column, and we can replace it with a NULL. + */ + isnull[i] = true; + fetchatt[i] = (Datum) 0; + } + } + + /* + * Get each included attribute. + */ + for (; i < r->rd_att->natts; i++) + { + fetchatt[i] = index_getattr(tuple, i + 1, giststate->leafTupdesc, + &isnull[i]); + } + MemoryContextSwitchTo(oldcxt); + + return heap_form_tuple(giststate->fetchTupdesc, fetchatt, isnull); +} + +float +gistpenalty(GISTSTATE *giststate, int attno, + GISTENTRY *orig, bool isNullOrig, + GISTENTRY *add, bool isNullAdd) +{ + float penalty = 0.0; + + if (giststate->penaltyFn[attno].fn_strict == false || + (isNullOrig == false && isNullAdd == false)) + { + FunctionCall3Coll(&giststate->penaltyFn[attno], + giststate->supportCollation[attno], + PointerGetDatum(orig), + PointerGetDatum(add), + PointerGetDatum(&penalty)); + /* disallow negative or NaN penalty */ + if (isnan(penalty) || penalty < 0.0) + penalty = 0.0; + } + else if (isNullOrig && isNullAdd) + penalty = 0.0; + else + { + /* try to prevent mixing null and non-null values */ + penalty = get_float4_infinity(); + } + + return penalty; +} + +/* + * Initialize a new index page + */ +void +gistinitpage(Page page, uint32 f) +{ + GISTPageOpaque opaque; + + PageInit(page, BLCKSZ, sizeof(GISTPageOpaqueData)); + + opaque = GistPageGetOpaque(page); + opaque->rightlink = InvalidBlockNumber; + opaque->flags = f; + opaque->gist_page_id = GIST_PAGE_ID; +} + +/* + * Initialize a new index buffer + */ +void +GISTInitBuffer(Buffer b, uint32 f) +{ + Page page; + + page = BufferGetPage(b); + gistinitpage(page, f); +} + +/* + * Verify that a freshly-read page looks sane. + */ +void +gistcheckpage(Relation rel, Buffer buf) +{ + Page page = BufferGetPage(buf); + + /* + * ReadBuffer verifies that every newly-read page passes + * PageHeaderIsValid, which means it either contains a reasonably sane + * page header or is all-zero. We have to defend against the all-zero + * case, however. + */ + if (PageIsNew(page)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" contains unexpected zero page at block %u", + RelationGetRelationName(rel), + BufferGetBlockNumber(buf)), + errhint("Please REINDEX it."))); + + /* + * Additionally check that the special area looks sane. 
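gistpenalty's NULL handling can be summarized as: call the user penalty function only when that is safe (non-strict function, or both inputs non-NULL) and clamp NaN or negative results to zero, report zero when both inputs are NULL, and return +infinity otherwise to discourage mixing NULL and non-NULL keys. A hedged standalone sketch of that policy, with an invented interval penalty in place of the opclass function and the strict-function case assumed:

#include <math.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct { double lo, hi; } Interval;

/* Stand-in user penalty function: enlargement of a 1-D interval */
static float interval_penalty(const Interval *orig, const Interval *add)
{
    double lo = add->lo < orig->lo ? add->lo : orig->lo;
    double hi = add->hi > orig->hi ? add->hi : orig->hi;

    return (float) ((hi - lo) - (orig->hi - orig->lo));
}

/*
 * Never return a negative or NaN penalty, treat two NULLs as a perfect
 * match, and penalize mixing NULL with non-NULL by returning +infinity.
 */
static float penalty_with_nulls(const Interval *orig, bool orig_isnull,
                                const Interval *add, bool add_isnull)
{
    float p;

    if (!orig_isnull && !add_isnull)
    {
        p = interval_penalty(orig, add);
        if (isnan(p) || p < 0.0f)
            p = 0.0f;
        return p;
    }
    if (orig_isnull && add_isnull)
        return 0.0f;
    return INFINITY;
}

int main(void)
{
    Interval a = {0, 10}, b = {5, 20};

    printf("both non-null: %g\n", penalty_with_nulls(&a, false, &b, false));
    printf("both null:     %g\n", penalty_with_nulls(NULL, true, NULL, true));
    printf("mixed:         %g\n", penalty_with_nulls(&a, false, NULL, true));
    return 0;
}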
+ */ + if (PageGetSpecialSize(page) != MAXALIGN(sizeof(GISTPageOpaqueData))) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" contains corrupted page at block %u", + RelationGetRelationName(rel), + BufferGetBlockNumber(buf)), + errhint("Please REINDEX it."))); +} + + +/* + * Allocate a new page (either by recycling, or by extending the index file) + * + * The returned buffer is already pinned and exclusive-locked + * + * Caller is responsible for initializing the page by calling GISTInitBuffer + */ +Buffer +gistNewBuffer(Relation r) +{ + Buffer buffer; + bool needLock; + + /* First, try to get a page from FSM */ + for (;;) + { + BlockNumber blkno = GetFreeIndexPage(r); + + if (blkno == InvalidBlockNumber) + break; /* nothing left in FSM */ + + buffer = ReadBuffer(r, blkno); + + /* + * We have to guard against the possibility that someone else already + * recycled this page; the buffer may be locked if so. + */ + if (ConditionalLockBuffer(buffer)) + { + Page page = BufferGetPage(buffer); + + /* + * If the page was never initialized, it's OK to use. + */ + if (PageIsNew(page)) + return buffer; + + gistcheckpage(r, buffer); + + /* + * Otherwise, recycle it if deleted, and too old to have any + * processes interested in it. + */ + if (gistPageRecyclable(page)) + { + /* + * If we are generating WAL for Hot Standby then create a WAL + * record that will allow us to conflict with queries running + * on standby, in case they have snapshots older than the + * page's deleteXid. + */ + if (XLogStandbyInfoActive() && RelationNeedsWAL(r)) + gistXLogPageReuse(r, blkno, GistPageGetDeleteXid(page)); + + return buffer; + } + + LockBuffer(buffer, GIST_UNLOCK); + } + + /* Can't use it, so release buffer and try again */ + ReleaseBuffer(buffer); + } + + /* Must extend the file */ + needLock = !RELATION_IS_LOCAL(r); + + if (needLock) + LockRelationForExtension(r, ExclusiveLock); + + buffer = ReadBuffer(r, P_NEW); + LockBuffer(buffer, GIST_EXCLUSIVE); + + if (needLock) + UnlockRelationForExtension(r, ExclusiveLock); + + return buffer; +} + +/* Can this page be recycled yet? */ +bool +gistPageRecyclable(Page page) +{ + if (PageIsNew(page)) + return true; + if (GistPageIsDeleted(page)) + { + /* + * The page was deleted, but when? If it was just deleted, a scan + * might have seen the downlink to it, and will read the page later. + * As long as that can happen, we must keep the deleted page around as + * a tombstone. + * + * For that check if the deletion XID could still be visible to + * anyone. If not, then no scan that's still in progress could have + * seen its downlink, and we can recycle it. + */ + FullTransactionId deletexid_full = GistPageGetDeleteXid(page); + + return GlobalVisCheckRemovableFullXid(NULL, deletexid_full); + } + return false; +} + +bytea * +gistoptions(Datum reloptions, bool validate) +{ + static const relopt_parse_elt tab[] = { + {"fillfactor", RELOPT_TYPE_INT, offsetof(GiSTOptions, fillfactor)}, + {"buffering", RELOPT_TYPE_ENUM, offsetof(GiSTOptions, buffering_mode)} + }; + + return (bytea *) build_reloptions(reloptions, validate, + RELOPT_KIND_GIST, + sizeof(GiSTOptions), + tab, lengthof(tab)); +} + +/* + * gistproperty() -- Check boolean properties of indexes. + * + * This is optional for most AMs, but is required for GiST because the core + * property code doesn't support AMPROP_DISTANCE_ORDERABLE. We also handle + * AMPROP_RETURNABLE here to save opening the rel to call gistcanreturn. 
+ */ +bool +gistproperty(Oid index_oid, int attno, + IndexAMProperty prop, const char *propname, + bool *res, bool *isnull) +{ + Oid opclass, + opfamily, + opcintype; + int16 procno; + + /* Only answer column-level inquiries */ + if (attno == 0) + return false; + + /* + * Currently, GiST distance-ordered scans require that there be a distance + * function in the opclass with the default types (i.e. the one loaded + * into the relcache entry, see initGISTstate). So we assume that if such + * a function exists, then there's a reason for it (rather than grubbing + * through all the opfamily's operators to find an ordered one). + * + * Essentially the same code can test whether we support returning the + * column data, since that's true if the opclass provides a fetch proc. + */ + + switch (prop) + { + case AMPROP_DISTANCE_ORDERABLE: + procno = GIST_DISTANCE_PROC; + break; + case AMPROP_RETURNABLE: + procno = GIST_FETCH_PROC; + break; + default: + return false; + } + + /* First we need to know the column's opclass. */ + opclass = get_index_column_opclass(index_oid, attno); + if (!OidIsValid(opclass)) + { + *isnull = true; + return true; + } + + /* Now look up the opclass family and input datatype. */ + if (!get_opclass_opfamily_and_input_type(opclass, &opfamily, &opcintype)) + { + *isnull = true; + return true; + } + + /* And now we can check whether the function is provided. */ + + *res = SearchSysCacheExists4(AMPROCNUM, + ObjectIdGetDatum(opfamily), + ObjectIdGetDatum(opcintype), + ObjectIdGetDatum(opcintype), + Int16GetDatum(procno)); + + /* + * Special case: even without a fetch function, AMPROP_RETURNABLE is true + * if the opclass has no compress function. + */ + if (prop == AMPROP_RETURNABLE && !*res) + { + *res = !SearchSysCacheExists4(AMPROCNUM, + ObjectIdGetDatum(opfamily), + ObjectIdGetDatum(opcintype), + ObjectIdGetDatum(opcintype), + Int16GetDatum(GIST_COMPRESS_PROC)); + } + + *isnull = false; + + return true; +} + +/* + * Some indexes are not WAL-logged, but we need LSNs to detect concurrent page + * splits anyway. This function provides a fake sequence of LSNs for that + * purpose. + */ +XLogRecPtr +gistGetFakeLSN(Relation rel) +{ + if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP) + { + /* + * Temporary relations are only accessible in our session, so a simple + * backend-local counter will do. + */ + static XLogRecPtr counter = FirstNormalUnloggedLSN; + + return counter++; + } + else if (RelationIsPermanent(rel)) + { + /* + * WAL-logging on this relation will start after commit, so its LSNs + * must be distinct numbers smaller than the LSN at the next commit. + * Emit a dummy WAL record if insert-LSN hasn't advanced after the + * last call. + */ + static XLogRecPtr lastlsn = InvalidXLogRecPtr; + XLogRecPtr currlsn = GetXLogInsertRecPtr(); + + /* Shouldn't be called for WAL-logging relations */ + Assert(!RelationNeedsWAL(rel)); + + /* No need for an actual record if we already have a distinct LSN */ + if (!XLogRecPtrIsInvalid(lastlsn) && lastlsn == currlsn) + currlsn = gistXLogAssignLSN(); + + lastlsn = currlsn; + return currlsn; + } + else + { + /* + * Unlogged relations are accessible from other backends, and survive + * (clean) restarts. GetFakeLSNForUnloggedRel() handles that for us. 
+ */ + Assert(rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED); + return GetFakeLSNForUnloggedRel(); + } +} diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c new file mode 100644 index 0000000..0663193 --- /dev/null +++ b/src/backend/access/gist/gistvacuum.c @@ -0,0 +1,668 @@ +/*------------------------------------------------------------------------- + * + * gistvacuum.c + * vacuuming routines for the postgres GiST index access method. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gist/gistvacuum.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/gist_private.h" +#include "access/transam.h" +#include "commands/vacuum.h" +#include "lib/integerset.h" +#include "miscadmin.h" +#include "storage/indexfsm.h" +#include "storage/lmgr.h" +#include "utils/memutils.h" + +/* Working state needed by gistbulkdelete */ +typedef struct +{ + IndexVacuumInfo *info; + IndexBulkDeleteResult *stats; + IndexBulkDeleteCallback callback; + void *callback_state; + GistNSN startNSN; + + /* + * These are used to memorize all internal and empty leaf pages. They are + * used for deleting all the empty pages. + */ + IntegerSet *internal_page_set; + IntegerSet *empty_leaf_set; + MemoryContext page_set_context; +} GistVacState; + +static void gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, void *callback_state); +static void gistvacuumpage(GistVacState *vstate, BlockNumber blkno, + BlockNumber orig_blkno); +static void gistvacuum_delete_empty_pages(IndexVacuumInfo *info, + GistVacState *vstate); +static bool gistdeletepage(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + Buffer buffer, OffsetNumber downlink, + Buffer leafBuffer); + +/* + * VACUUM bulkdelete stage: remove index entries. + */ +IndexBulkDeleteResult * +gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, void *callback_state) +{ + /* allocate stats if first time through, else re-use existing struct */ + if (stats == NULL) + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + + gistvacuumscan(info, stats, callback, callback_state); + + return stats; +} + +/* + * VACUUM cleanup stage: delete empty pages, and update index statistics. + */ +IndexBulkDeleteResult * +gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) +{ + /* No-op in ANALYZE ONLY mode */ + if (info->analyze_only) + return stats; + + /* + * If gistbulkdelete was called, we need not do anything, just return the + * stats from the latest gistbulkdelete call. If it wasn't called, we + * still need to do a pass over the index, to obtain index statistics. + */ + if (stats == NULL) + { + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + gistvacuumscan(info, stats, NULL, NULL); + } + + /* + * It's quite possible for us to be fooled by concurrent page splits into + * double-counting some index tuples, so disbelieve any total that exceeds + * the underlying heap's count ... if we know that accurately. Otherwise + * this might just make matters worse. 
+ */ + if (!info->estimated_count) + { + if (stats->num_index_tuples > info->num_heap_tuples) + stats->num_index_tuples = info->num_heap_tuples; + } + + return stats; +} + +/* + * gistvacuumscan --- scan the index for VACUUMing purposes + * + * This scans the index for leaf tuples that are deletable according to the + * vacuum callback, and updates the stats. Both btbulkdelete and + * btvacuumcleanup invoke this (the latter only if no btbulkdelete call + * occurred). + * + * This also makes note of any empty leaf pages, as well as all internal + * pages while looping over all index pages. After scanning all the pages, we + * remove the empty pages so that they can be reused. Any deleted pages are + * added directly to the free space map. (They should've been added there + * when they were originally deleted, already, but it's possible that the FSM + * was lost at a crash, for example.) + * + * The caller is responsible for initially allocating/zeroing a stats struct. + */ +static void +gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, void *callback_state) +{ + Relation rel = info->index; + GistVacState vstate; + BlockNumber num_pages; + bool needLock; + BlockNumber blkno; + MemoryContext oldctx; + + /* + * Reset fields that track information about the entire index now. This + * avoids double-counting in the case where a single VACUUM command + * requires multiple scans of the index. + * + * Avoid resetting the tuples_removed and pages_newly_deleted fields here, + * since they track information about the VACUUM command, and so must last + * across each call to gistvacuumscan(). + * + * (Note that pages_free is treated as state about the whole index, not + * the current VACUUM. This is appropriate because RecordFreeIndexPage() + * calls are idempotent, and get repeated for the same deleted pages in + * some scenarios. The point for us is to track the number of recyclable + * pages in the index at the end of the VACUUM command.) + */ + stats->num_pages = 0; + stats->estimated_count = false; + stats->num_index_tuples = 0; + stats->pages_deleted = 0; + stats->pages_free = 0; + + /* + * Create the integer sets to remember all the internal and the empty leaf + * pages in page_set_context. Internally, the integer set will remember + * this context so that the subsequent allocations for these integer sets + * will be done from the same context. + */ + vstate.page_set_context = GenerationContextCreate(CurrentMemoryContext, + "GiST VACUUM page set context", + 16 * 1024); + oldctx = MemoryContextSwitchTo(vstate.page_set_context); + vstate.internal_page_set = intset_create(); + vstate.empty_leaf_set = intset_create(); + MemoryContextSwitchTo(oldctx); + + /* Set up info to pass down to gistvacuumpage */ + vstate.info = info; + vstate.stats = stats; + vstate.callback = callback; + vstate.callback_state = callback_state; + if (RelationNeedsWAL(rel)) + vstate.startNSN = GetInsertRecPtr(); + else + vstate.startNSN = gistGetFakeLSN(rel); + + /* + * The outer loop iterates over all index pages, in physical order (we + * hope the kernel will cooperate in providing read-ahead for speed). It + * is critical that we visit all leaf pages, including ones added after we + * start the scan, else we might fail to delete some deletable tuples. + * Hence, we must repeatedly check the relation length. 
We must acquire + * the relation-extension lock while doing so to avoid a race condition: + * if someone else is extending the relation, there is a window where + * bufmgr/smgr have created a new all-zero page but it hasn't yet been + * write-locked by gistNewBuffer(). If we manage to scan such a page + * here, we'll improperly assume it can be recycled. Taking the lock + * synchronizes things enough to prevent a problem: either num_pages won't + * include the new page, or gistNewBuffer already has write lock on the + * buffer and it will be fully initialized before we can examine it. (See + * also vacuumlazy.c, which has the same issue.) Also, we need not worry + * if a page is added immediately after we look; the page splitting code + * already has write-lock on the left page before it adds a right page, so + * we must already have processed any tuples due to be moved into such a + * page. + * + * We can skip locking for new or temp relations, however, since no one + * else could be accessing them. + */ + needLock = !RELATION_IS_LOCAL(rel); + + blkno = GIST_ROOT_BLKNO; + for (;;) + { + /* Get the current relation length */ + if (needLock) + LockRelationForExtension(rel, ExclusiveLock); + num_pages = RelationGetNumberOfBlocks(rel); + if (needLock) + UnlockRelationForExtension(rel, ExclusiveLock); + + /* Quit if we've scanned the whole relation */ + if (blkno >= num_pages) + break; + /* Iterate over pages, then loop back to recheck length */ + for (; blkno < num_pages; blkno++) + gistvacuumpage(&vstate, blkno, blkno); + } + + /* + * If we found any recyclable pages (and recorded them in the FSM), then + * forcibly update the upper-level FSM pages to ensure that searchers can + * find them. It's possible that the pages were also found during + * previous scans and so this is a waste of time, but it's cheap enough + * relative to scanning the index that it shouldn't matter much, and + * making sure that free pages are available sooner not later seems + * worthwhile. + * + * Note that if no recyclable pages exist, we don't bother vacuuming the + * FSM at all. + */ + if (stats->pages_free > 0) + IndexFreeSpaceMapVacuum(rel); + + /* update statistics */ + stats->num_pages = num_pages; + + /* + * If we saw any empty pages, try to unlink them from the tree so that + * they can be reused. + */ + gistvacuum_delete_empty_pages(info, &vstate); + + /* we don't need the internal and empty page sets anymore */ + MemoryContextDelete(vstate.page_set_context); + vstate.page_set_context = NULL; + vstate.internal_page_set = NULL; + vstate.empty_leaf_set = NULL; +} + +/* + * gistvacuumpage --- VACUUM one page + * + * This processes a single page for gistbulkdelete(). In some cases we + * must go back and re-examine previously-scanned pages; this routine + * recurses when necessary to handle that case. + * + * blkno is the page to process. orig_blkno is the highest block number + * reached by the outer gistvacuumscan loop (the same as blkno, unless we + * are recursing to re-examine a previous page). 
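+ * (orig_blkno also controls, below, whether this page's block number may be
+ * added to the empty-leaf and internal-page integer sets, which must be
+ * filled in ascending block-number order.)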
+ */ +static void +gistvacuumpage(GistVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno) +{ + IndexVacuumInfo *info = vstate->info; + IndexBulkDeleteCallback callback = vstate->callback; + void *callback_state = vstate->callback_state; + Relation rel = info->index; + Buffer buffer; + Page page; + BlockNumber recurse_to; + +restart: + recurse_to = InvalidBlockNumber; + + /* call vacuum_delay_point while not holding any buffer lock */ + vacuum_delay_point(); + + buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, + info->strategy); + + /* + * We are not going to stay here for a long time, aggressively grab an + * exclusive lock. + */ + LockBuffer(buffer, GIST_EXCLUSIVE); + page = (Page) BufferGetPage(buffer); + + if (gistPageRecyclable(page)) + { + /* Okay to recycle this page */ + RecordFreeIndexPage(rel, blkno); + vstate->stats->pages_deleted++; + vstate->stats->pages_free++; + } + else if (GistPageIsDeleted(page)) + { + /* Already deleted, but can't recycle yet */ + vstate->stats->pages_deleted++; + } + else if (GistPageIsLeaf(page)) + { + OffsetNumber todelete[MaxOffsetNumber]; + int ntodelete = 0; + int nremain; + GISTPageOpaque opaque = GistPageGetOpaque(page); + OffsetNumber maxoff = PageGetMaxOffsetNumber(page); + + /* + * Check whether we need to recurse back to earlier pages. What we + * are concerned about is a page split that happened since we started + * the vacuum scan. If the split moved some tuples to a lower page + * then we might have missed 'em. If so, set up for tail recursion. + * + * This is similar to the checks we do during searches, when following + * a downlink, but we don't need to jump to higher-numbered pages, + * because we will process them later, anyway. + */ + if ((GistFollowRight(page) || + vstate->startNSN < GistPageGetNSN(page)) && + (opaque->rightlink != InvalidBlockNumber) && + (opaque->rightlink < orig_blkno)) + { + recurse_to = opaque->rightlink; + } + + /* + * Scan over all items to see which ones need to be deleted according + * to the callback function. + */ + if (callback) + { + OffsetNumber off; + + for (off = FirstOffsetNumber; + off <= maxoff; + off = OffsetNumberNext(off)) + { + ItemId iid = PageGetItemId(page, off); + IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); + + if (callback(&(idxtuple->t_tid), callback_state)) + todelete[ntodelete++] = off; + } + } + + /* + * Apply any needed deletes. We issue just one WAL record per page, + * so as to minimize WAL traffic. + */ + if (ntodelete > 0) + { + START_CRIT_SECTION(); + + MarkBufferDirty(buffer); + + PageIndexMultiDelete(page, todelete, ntodelete); + GistMarkTuplesDeleted(page); + + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + + recptr = gistXLogUpdate(buffer, + todelete, ntodelete, + NULL, 0, InvalidBuffer); + PageSetLSN(page, recptr); + } + else + PageSetLSN(page, gistGetFakeLSN(rel)); + + END_CRIT_SECTION(); + + vstate->stats->tuples_removed += ntodelete; + /* must recompute maxoff */ + maxoff = PageGetMaxOffsetNumber(page); + } + + nremain = maxoff - FirstOffsetNumber + 1; + if (nremain == 0) + { + /* + * The page is now completely empty. Remember its block number, + * so that we will try to delete the page in the second stage. + * + * Skip this when recursing, because IntegerSet requires that the + * values are added in ascending order. The next VACUUM will pick + * it up. 
+ */ + if (blkno == orig_blkno) + intset_add_member(vstate->empty_leaf_set, blkno); + } + else + vstate->stats->num_index_tuples += nremain; + } + else + { + /* + * On an internal page, check for "invalid tuples", left behind by an + * incomplete page split on PostgreSQL 9.0 or below. These are not + * created by newer PostgreSQL versions, but unfortunately, there is + * no version number anywhere in a GiST index, so we don't know + * whether this index might still contain invalid tuples or not. + */ + OffsetNumber maxoff = PageGetMaxOffsetNumber(page); + OffsetNumber off; + + for (off = FirstOffsetNumber; + off <= maxoff; + off = OffsetNumberNext(off)) + { + ItemId iid = PageGetItemId(page, off); + IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); + + if (GistTupleIsInvalid(idxtuple)) + ereport(LOG, + (errmsg("index \"%s\" contains an inner tuple marked as invalid", + RelationGetRelationName(rel)), + errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."), + errhint("Please REINDEX it."))); + } + + /* + * Remember the block number of this page, so that we can revisit it + * later in gistvacuum_delete_empty_pages(), when we search for + * parents of empty leaf pages. + */ + if (blkno == orig_blkno) + intset_add_member(vstate->internal_page_set, blkno); + } + + UnlockReleaseBuffer(buffer); + + /* + * This is really tail recursion, but if the compiler is too stupid to + * optimize it as such, we'd eat an uncomfortably large amount of stack + * space per recursion level (due to the deletable[] array). A failure is + * improbable since the number of levels isn't likely to be large ... but + * just in case, let's hand-optimize into a loop. + */ + if (recurse_to != InvalidBlockNumber) + { + blkno = recurse_to; + goto restart; + } +} + +/* + * Scan all internal pages, and try to delete their empty child pages. + */ +static void +gistvacuum_delete_empty_pages(IndexVacuumInfo *info, GistVacState *vstate) +{ + Relation rel = info->index; + BlockNumber empty_pages_remaining; + uint64 blkno; + + /* + * Rescan all inner pages to find those that have empty child pages. + */ + empty_pages_remaining = intset_num_entries(vstate->empty_leaf_set); + intset_begin_iterate(vstate->internal_page_set); + while (empty_pages_remaining > 0 && + intset_iterate_next(vstate->internal_page_set, &blkno)) + { + Buffer buffer; + Page page; + OffsetNumber off, + maxoff; + OffsetNumber todelete[MaxOffsetNumber]; + BlockNumber leafs_to_delete[MaxOffsetNumber]; + int ntodelete; + int deleted; + + buffer = ReadBufferExtended(rel, MAIN_FORKNUM, (BlockNumber) blkno, + RBM_NORMAL, info->strategy); + + LockBuffer(buffer, GIST_SHARE); + page = (Page) BufferGetPage(buffer); + + if (PageIsNew(page) || GistPageIsDeleted(page) || GistPageIsLeaf(page)) + { + /* + * This page was an internal page earlier, but now it's something + * else. Shouldn't happen... + */ + Assert(false); + UnlockReleaseBuffer(buffer); + continue; + } + + /* + * Scan all the downlinks, and see if any of them point to empty leaf + * pages. 
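+		 * (Note the ntodelete < maxoff - 1 guard in the loop below: we never
+		 * collect every downlink on the page, so the parent always keeps at
+		 * least one.)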
+ */ + maxoff = PageGetMaxOffsetNumber(page); + ntodelete = 0; + for (off = FirstOffsetNumber; + off <= maxoff && ntodelete < maxoff - 1; + off = OffsetNumberNext(off)) + { + ItemId iid = PageGetItemId(page, off); + IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); + BlockNumber leafblk; + + leafblk = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); + if (intset_is_member(vstate->empty_leaf_set, leafblk)) + { + leafs_to_delete[ntodelete] = leafblk; + todelete[ntodelete++] = off; + } + } + + /* + * In order to avoid deadlock, child page must be locked before + * parent, so we must release the lock on the parent, lock the child, + * and then re-acquire the lock the parent. (And we wouldn't want to + * do I/O, while holding a lock, anyway.) + * + * At the instant that we're not holding a lock on the parent, the + * downlink might get moved by a concurrent insert, so we must + * re-check that it still points to the same child page after we have + * acquired both locks. Also, another backend might have inserted a + * tuple to the page, so that it is no longer empty. gistdeletepage() + * re-checks all these conditions. + */ + LockBuffer(buffer, GIST_UNLOCK); + + deleted = 0; + for (int i = 0; i < ntodelete; i++) + { + Buffer leafbuf; + + /* + * Don't remove the last downlink from the parent. That would + * confuse the insertion code. + */ + if (PageGetMaxOffsetNumber(page) == FirstOffsetNumber) + break; + + leafbuf = ReadBufferExtended(rel, MAIN_FORKNUM, leafs_to_delete[i], + RBM_NORMAL, info->strategy); + LockBuffer(leafbuf, GIST_EXCLUSIVE); + gistcheckpage(rel, leafbuf); + + LockBuffer(buffer, GIST_EXCLUSIVE); + if (gistdeletepage(info, vstate->stats, + buffer, todelete[i] - deleted, + leafbuf)) + deleted++; + LockBuffer(buffer, GIST_UNLOCK); + + UnlockReleaseBuffer(leafbuf); + } + + ReleaseBuffer(buffer); + + /* + * We can stop the scan as soon as we have seen the downlinks, even if + * we were not able to remove them all. + */ + empty_pages_remaining -= ntodelete; + } +} + +/* + * gistdeletepage takes a leaf page, and its parent, and tries to delete the + * leaf. Both pages must be locked. + * + * Even if the page was empty when we first saw it, a concurrent inserter might + * have added a tuple to it since. Similarly, the downlink might have moved. + * We re-check all the conditions, to make sure the page is still deletable, + * before modifying anything. + * + * Returns true, if the page was deleted, and false if a concurrent update + * prevented it. + */ +static bool +gistdeletepage(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + Buffer parentBuffer, OffsetNumber downlink, + Buffer leafBuffer) +{ + Page parentPage = BufferGetPage(parentBuffer); + Page leafPage = BufferGetPage(leafBuffer); + ItemId iid; + IndexTuple idxtuple; + XLogRecPtr recptr; + FullTransactionId txid; + + /* + * Check that the leaf is still empty and deletable. + */ + if (!GistPageIsLeaf(leafPage)) + { + /* a leaf page should never become a non-leaf page */ + Assert(false); + return false; + } + + if (GistFollowRight(leafPage)) + return false; /* don't mess with a concurrent page split */ + + if (PageGetMaxOffsetNumber(leafPage) != InvalidOffsetNumber) + return false; /* not empty anymore */ + + /* + * Ok, the leaf is deletable. Is the downlink in the parent page still + * valid? It might have been moved by a concurrent insert. We could try + * to re-find it by scanning the page again, possibly moving right if the + * was split. But for now, let's keep it simple and just give up. 
The + * next VACUUM will pick it up. + */ + if (PageIsNew(parentPage) || GistPageIsDeleted(parentPage) || + GistPageIsLeaf(parentPage)) + { + /* shouldn't happen, internal pages are never deleted */ + Assert(false); + return false; + } + + if (PageGetMaxOffsetNumber(parentPage) < downlink + || PageGetMaxOffsetNumber(parentPage) <= FirstOffsetNumber) + return false; + + iid = PageGetItemId(parentPage, downlink); + idxtuple = (IndexTuple) PageGetItem(parentPage, iid); + if (BufferGetBlockNumber(leafBuffer) != + ItemPointerGetBlockNumber(&(idxtuple->t_tid))) + return false; + + /* + * All good, proceed with the deletion. + * + * The page cannot be immediately recycled, because in-progress scans that + * saw the downlink might still visit it. Mark the page with the current + * next-XID counter, so that we know when it can be recycled. Once that + * XID becomes older than GlobalXmin, we know that all scans that are + * currently in progress must have ended. (That's much more conservative + * than needed, but let's keep it safe and simple.) + */ + txid = ReadNextFullTransactionId(); + + START_CRIT_SECTION(); + + /* mark the page as deleted */ + MarkBufferDirty(leafBuffer); + GistPageSetDeleted(leafPage, txid); + stats->pages_newly_deleted++; + stats->pages_deleted++; + + /* remove the downlink from the parent */ + MarkBufferDirty(parentBuffer); + PageIndexTupleDelete(parentPage, downlink); + + if (RelationNeedsWAL(info->index)) + recptr = gistXLogPageDelete(leafBuffer, txid, parentBuffer, downlink); + else + recptr = gistGetFakeLSN(info->index); + PageSetLSN(parentPage, recptr); + PageSetLSN(leafPage, recptr); + + END_CRIT_SECTION(); + + return true; +} diff --git a/src/backend/access/gist/gistvalidate.c b/src/backend/access/gist/gistvalidate.c new file mode 100644 index 0000000..b885fa2 --- /dev/null +++ b/src/backend/access/gist/gistvalidate.c @@ -0,0 +1,355 @@ +/*------------------------------------------------------------------------- + * + * gistvalidate.c + * Opclass validator for GiST. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gist/gistvalidate.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/amvalidate.h" +#include "access/gist_private.h" +#include "access/htup_details.h" +#include "catalog/pg_amop.h" +#include "catalog/pg_amproc.h" +#include "catalog/pg_opclass.h" +#include "catalog/pg_opfamily.h" +#include "catalog/pg_type.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/regproc.h" +#include "utils/syscache.h" + + +/* + * Validator for a GiST opclass. 
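+ * (Problems are reported via ereport(INFO) and reflected in a false return
+ * value rather than raised as errors, so that one call can flag every issue
+ * it finds.)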
+ */ +bool +gistvalidate(Oid opclassoid) +{ + bool result = true; + HeapTuple classtup; + Form_pg_opclass classform; + Oid opfamilyoid; + Oid opcintype; + Oid opckeytype; + char *opclassname; + HeapTuple familytup; + Form_pg_opfamily familyform; + char *opfamilyname; + CatCList *proclist, + *oprlist; + List *grouplist; + OpFamilyOpFuncGroup *opclassgroup; + int i; + ListCell *lc; + + /* Fetch opclass information */ + classtup = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclassoid)); + if (!HeapTupleIsValid(classtup)) + elog(ERROR, "cache lookup failed for operator class %u", opclassoid); + classform = (Form_pg_opclass) GETSTRUCT(classtup); + + opfamilyoid = classform->opcfamily; + opcintype = classform->opcintype; + opckeytype = classform->opckeytype; + if (!OidIsValid(opckeytype)) + opckeytype = opcintype; + opclassname = NameStr(classform->opcname); + + /* Fetch opfamily information */ + familytup = SearchSysCache1(OPFAMILYOID, ObjectIdGetDatum(opfamilyoid)); + if (!HeapTupleIsValid(familytup)) + elog(ERROR, "cache lookup failed for operator family %u", opfamilyoid); + familyform = (Form_pg_opfamily) GETSTRUCT(familytup); + + opfamilyname = NameStr(familyform->opfname); + + /* Fetch all operators and support functions of the opfamily */ + oprlist = SearchSysCacheList1(AMOPSTRATEGY, ObjectIdGetDatum(opfamilyoid)); + proclist = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(opfamilyoid)); + + /* Check individual support functions */ + for (i = 0; i < proclist->n_members; i++) + { + HeapTuple proctup = &proclist->members[i]->tuple; + Form_pg_amproc procform = (Form_pg_amproc) GETSTRUCT(proctup); + bool ok; + + /* + * All GiST support functions should be registered with matching + * left/right types + */ + if (procform->amproclefttype != procform->amprocrighttype) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains support function %s with different left and right input types", + opfamilyname, "gist", + format_procedure(procform->amproc)))); + result = false; + } + + /* + * We can't check signatures except within the specific opclass, since + * we need to know the associated opckeytype in many cases. 
+ */ + if (procform->amproclefttype != opcintype) + continue; + + /* Check procedure numbers and function signatures */ + switch (procform->amprocnum) + { + case GIST_CONSISTENT_PROC: + ok = check_amproc_signature(procform->amproc, BOOLOID, false, + 5, 5, INTERNALOID, opcintype, + INT2OID, OIDOID, INTERNALOID); + break; + case GIST_UNION_PROC: + ok = check_amproc_signature(procform->amproc, opckeytype, false, + 2, 2, INTERNALOID, INTERNALOID); + break; + case GIST_COMPRESS_PROC: + case GIST_DECOMPRESS_PROC: + case GIST_FETCH_PROC: + ok = check_amproc_signature(procform->amproc, INTERNALOID, true, + 1, 1, INTERNALOID); + break; + case GIST_PENALTY_PROC: + ok = check_amproc_signature(procform->amproc, INTERNALOID, true, + 3, 3, INTERNALOID, + INTERNALOID, INTERNALOID); + break; + case GIST_PICKSPLIT_PROC: + ok = check_amproc_signature(procform->amproc, INTERNALOID, true, + 2, 2, INTERNALOID, INTERNALOID); + break; + case GIST_EQUAL_PROC: + ok = check_amproc_signature(procform->amproc, INTERNALOID, false, + 3, 3, opckeytype, opckeytype, + INTERNALOID); + break; + case GIST_DISTANCE_PROC: + ok = check_amproc_signature(procform->amproc, FLOAT8OID, false, + 5, 5, INTERNALOID, opcintype, + INT2OID, OIDOID, INTERNALOID); + break; + case GIST_OPTIONS_PROC: + ok = check_amoptsproc_signature(procform->amproc); + break; + case GIST_SORTSUPPORT_PROC: + ok = check_amproc_signature(procform->amproc, VOIDOID, true, + 1, 1, INTERNALOID); + break; + default: + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains function %s with invalid support number %d", + opfamilyname, "gist", + format_procedure(procform->amproc), + procform->amprocnum))); + result = false; + continue; /* don't want additional message */ + } + + if (!ok) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains function %s with wrong signature for support number %d", + opfamilyname, "gist", + format_procedure(procform->amproc), + procform->amprocnum))); + result = false; + } + } + + /* Check individual operators */ + for (i = 0; i < oprlist->n_members; i++) + { + HeapTuple oprtup = &oprlist->members[i]->tuple; + Form_pg_amop oprform = (Form_pg_amop) GETSTRUCT(oprtup); + Oid op_rettype; + + /* TODO: Check that only allowed strategy numbers exist */ + if (oprform->amopstrategy < 1) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains operator %s with invalid strategy number %d", + opfamilyname, "gist", + format_operator(oprform->amopopr), + oprform->amopstrategy))); + result = false; + } + + /* GiST supports ORDER BY operators */ + if (oprform->amoppurpose != AMOP_SEARCH) + { + /* ... but must have matching distance proc */ + if (!OidIsValid(get_opfamily_proc(opfamilyoid, + oprform->amoplefttype, + oprform->amoplefttype, + GIST_DISTANCE_PROC))) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains unsupported ORDER BY specification for operator %s", + opfamilyname, "gist", + format_operator(oprform->amopopr)))); + result = false; + } + /* ... 
and operator result must match the claimed btree opfamily */ + op_rettype = get_op_rettype(oprform->amopopr); + if (!opfamily_can_sort_type(oprform->amopsortfamily, op_rettype)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains incorrect ORDER BY opfamily specification for operator %s", + opfamilyname, "gist", + format_operator(oprform->amopopr)))); + result = false; + } + } + else + { + /* Search operators must always return bool */ + op_rettype = BOOLOID; + } + + /* Check operator signature */ + if (!check_amop_signature(oprform->amopopr, op_rettype, + oprform->amoplefttype, + oprform->amoprighttype)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains operator %s with wrong signature", + opfamilyname, "gist", + format_operator(oprform->amopopr)))); + result = false; + } + } + + /* Now check for inconsistent groups of operators/functions */ + grouplist = identify_opfamily_groups(oprlist, proclist); + opclassgroup = NULL; + foreach(lc, grouplist) + { + OpFamilyOpFuncGroup *thisgroup = (OpFamilyOpFuncGroup *) lfirst(lc); + + /* Remember the group exactly matching the test opclass */ + if (thisgroup->lefttype == opcintype && + thisgroup->righttype == opcintype) + opclassgroup = thisgroup; + + /* + * There is not a lot we can do to check the operator sets, since each + * GiST opclass is more or less a law unto itself, and some contain + * only operators that are binary-compatible with the opclass datatype + * (meaning that empty operator sets can be OK). That case also means + * that we shouldn't insist on nonempty function sets except for the + * opclass's own group. + */ + } + + /* Check that the originally-named opclass is complete */ + for (i = 1; i <= GISTNProcs; i++) + { + if (opclassgroup && + (opclassgroup->functionset & (((uint64) 1) << i)) != 0) + continue; /* got it */ + if (i == GIST_DISTANCE_PROC || i == GIST_FETCH_PROC || + i == GIST_COMPRESS_PROC || i == GIST_DECOMPRESS_PROC || + i == GIST_OPTIONS_PROC || i == GIST_SORTSUPPORT_PROC) + continue; /* optional methods */ + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator class \"%s\" of access method %s is missing support function %d", + opclassname, "gist", i))); + result = false; + } + + ReleaseCatCacheList(proclist); + ReleaseCatCacheList(oprlist); + ReleaseSysCache(familytup); + ReleaseSysCache(classtup); + + return result; +} + +/* + * Prechecking function for adding operators/functions to a GiST opfamily. + */ +void +gistadjustmembers(Oid opfamilyoid, + Oid opclassoid, + List *operators, + List *functions) +{ + ListCell *lc; + + /* + * Operator members of a GiST opfamily should never have hard + * dependencies, since their connection to the opfamily depends only on + * what the support functions think, and that can be altered. For + * consistency, we make all soft dependencies point to the opfamily, + * though a soft dependency on the opclass would work as well in the + * CREATE OPERATOR CLASS case. + */ + foreach(lc, operators) + { + OpFamilyMember *op = (OpFamilyMember *) lfirst(lc); + + op->ref_is_hard = false; + op->ref_is_family = true; + op->refobjid = opfamilyoid; + } + + /* + * Required support functions should have hard dependencies. Preferably + * those are just dependencies on the opclass, but if we're in ALTER + * OPERATOR FAMILY, we leave the dependency pointing at the whole + * opfamily. 
(Given that GiST opclasses generally don't share opfamilies, + * it seems unlikely to be worth working harder.) + */ + foreach(lc, functions) + { + OpFamilyMember *op = (OpFamilyMember *) lfirst(lc); + + switch (op->number) + { + case GIST_CONSISTENT_PROC: + case GIST_UNION_PROC: + case GIST_PENALTY_PROC: + case GIST_PICKSPLIT_PROC: + case GIST_EQUAL_PROC: + /* Required support function */ + op->ref_is_hard = true; + break; + case GIST_COMPRESS_PROC: + case GIST_DECOMPRESS_PROC: + case GIST_DISTANCE_PROC: + case GIST_FETCH_PROC: + case GIST_OPTIONS_PROC: + case GIST_SORTSUPPORT_PROC: + /* Optional, so force it to be a soft family dependency */ + op->ref_is_hard = false; + op->ref_is_family = true; + op->refobjid = opfamilyoid; + break; + default: + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("support function number %d is invalid for access method %s", + op->number, "gist"))); + break; + } + } +} diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c new file mode 100644 index 0000000..6464cb9 --- /dev/null +++ b/src/backend/access/gist/gistxlog.c @@ -0,0 +1,696 @@ +/*------------------------------------------------------------------------- + * + * gistxlog.c + * WAL replay logic for GiST. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gist/gistxlog.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/bufmask.h" +#include "access/gist_private.h" +#include "access/gistxlog.h" +#include "access/heapam_xlog.h" +#include "access/transam.h" +#include "access/xloginsert.h" +#include "access/xlogutils.h" +#include "miscadmin.h" +#include "storage/procarray.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +static MemoryContext opCtx; /* working memory for operations */ + +/* + * Replay the clearing of F_FOLLOW_RIGHT flag on a child page. + * + * Even if the WAL record includes a full-page image, we have to update the + * follow-right flag, because that change is not included in the full-page + * image. To be sure that the intermediate state with the wrong flag value is + * not visible to concurrent Hot Standby queries, this function handles + * restoring the full-page image as well as updating the flag. (Note that + * we never need to do anything else to the child page in the current WAL + * action.) + */ +static void +gistRedoClearFollowRight(XLogReaderState *record, uint8 block_id) +{ + XLogRecPtr lsn = record->EndRecPtr; + Buffer buffer; + Page page; + XLogRedoAction action; + + /* + * Note that we still update the page even if it was restored from a full + * page image, because the updated NSN is not included in the image. 
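+	 * (That is also why the code below acts on BLK_RESTORED, not just
+	 * BLK_NEEDS_REDO.)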
+ */ + action = XLogReadBufferForRedo(record, block_id, &buffer); + if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) + { + page = BufferGetPage(buffer); + + GistPageSetNSN(page, lsn); + GistClearFollowRight(page); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +/* + * redo any page update (except page split) + */ +static void +gistRedoPageUpdateRecord(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) XLogRecGetData(record); + Buffer buffer; + Page page; + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + char *begin; + char *data; + Size datalen; + int ninserted = 0; + + data = begin = XLogRecGetBlockData(record, 0, &datalen); + + page = (Page) BufferGetPage(buffer); + + if (xldata->ntodelete == 1 && xldata->ntoinsert == 1) + { + /* + * When replacing one tuple with one other tuple, we must use + * PageIndexTupleOverwrite for consistency with gistplacetopage. + */ + OffsetNumber offnum = *((OffsetNumber *) data); + IndexTuple itup; + Size itupsize; + + data += sizeof(OffsetNumber); + itup = (IndexTuple) data; + itupsize = IndexTupleSize(itup); + if (!PageIndexTupleOverwrite(page, offnum, (Item) itup, itupsize)) + elog(ERROR, "failed to add item to GiST index page, size %d bytes", + (int) itupsize); + data += itupsize; + /* should be nothing left after consuming 1 tuple */ + Assert(data - begin == datalen); + /* update insertion count for assert check below */ + ninserted++; + } + else if (xldata->ntodelete > 0) + { + /* Otherwise, delete old tuples if any */ + OffsetNumber *todelete = (OffsetNumber *) data; + + data += sizeof(OffsetNumber) * xldata->ntodelete; + + PageIndexMultiDelete(page, todelete, xldata->ntodelete); + if (GistPageIsLeaf(page)) + GistMarkTuplesDeleted(page); + } + + /* Add new tuples if any */ + if (data - begin < datalen) + { + OffsetNumber off = (PageIsEmpty(page)) ? FirstOffsetNumber : + OffsetNumberNext(PageGetMaxOffsetNumber(page)); + + while (data - begin < datalen) + { + IndexTuple itup = (IndexTuple) data; + Size sz = IndexTupleSize(itup); + OffsetNumber l; + + data += sz; + + l = PageAddItem(page, (Item) itup, sz, off, false, false); + if (l == InvalidOffsetNumber) + elog(ERROR, "failed to add item to GiST index page, size %d bytes", + (int) sz); + off++; + ninserted++; + } + } + + /* Check that XLOG record contained expected number of tuples */ + Assert(ninserted == xldata->ntoinsert); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + + /* + * Fix follow-right data on left child page + * + * This must be done while still holding the lock on the target page. Note + * that even if the target page no longer exists, we still attempt to + * replay the change on the child page. + */ + if (XLogRecHasBlockRef(record, 1)) + gistRedoClearFollowRight(record, 1); + + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + + +/* + * redo delete on gist index page to remove tuples marked as DEAD during index + * tuple insertion + */ +static void +gistRedoDeleteRecord(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + gistxlogDelete *xldata = (gistxlogDelete *) XLogRecGetData(record); + Buffer buffer; + Page page; + + /* + * If we have any conflict processing to do, it must happen before we + * update the page. + * + * GiST delete records can conflict with standby queries. 
You might think + * that vacuum records would conflict as well, but we've handled that + * already. XLOG_HEAP2_PRUNE records provide the highest xid cleaned by + * the vacuum of the heap and so we can resolve any conflicts just once + * when that arrives. After that we know that no conflicts exist from + * individual gist vacuum records on that index. + */ + if (InHotStandby) + { + RelFileNode rnode; + + XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL); + + ResolveRecoveryConflictWithSnapshot(xldata->latestRemovedXid, rnode); + } + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = (Page) BufferGetPage(buffer); + + if (XLogRecGetDataLen(record) > SizeOfGistxlogDelete) + { + OffsetNumber *todelete; + + todelete = (OffsetNumber *) ((char *) xldata + SizeOfGistxlogDelete); + + PageIndexMultiDelete(page, todelete, xldata->ntodelete); + } + + GistClearPageHasGarbage(page); + GistMarkTuplesDeleted(page); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +/* + * Returns an array of index pointers. + */ +static IndexTuple * +decodePageSplitRecord(char *begin, int len, int *n) +{ + char *ptr; + int i = 0; + IndexTuple *tuples; + + /* extract the number of tuples */ + memcpy(n, begin, sizeof(int)); + ptr = begin + sizeof(int); + + tuples = palloc(*n * sizeof(IndexTuple)); + + for (i = 0; i < *n; i++) + { + Assert(ptr - begin < len); + tuples[i] = (IndexTuple) ptr; + ptr += IndexTupleSize((IndexTuple) ptr); + } + Assert(ptr - begin == len); + + return tuples; +} + +static void +gistRedoPageSplitRecord(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + gistxlogPageSplit *xldata = (gistxlogPageSplit *) XLogRecGetData(record); + Buffer firstbuffer = InvalidBuffer; + Buffer buffer; + Page page; + int i; + bool isrootsplit = false; + + /* + * We must hold lock on the first-listed page throughout the action, + * including while updating the left child page (if any). We can unlock + * remaining pages in the list as soon as they've been written, because + * there is no path for concurrent queries to reach those pages without + * first visiting the first-listed page. 
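+	 * (Hence firstbuffer, set in the loop below, is only unlocked and
+	 * released at the very end of this function.)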
+ */ + + /* loop around all pages */ + for (i = 0; i < xldata->npage; i++) + { + int flags; + char *data; + Size datalen; + int num; + BlockNumber blkno; + IndexTuple *tuples; + + XLogRecGetBlockTag(record, i + 1, NULL, NULL, &blkno); + if (blkno == GIST_ROOT_BLKNO) + { + Assert(i == 0); + isrootsplit = true; + } + + buffer = XLogInitBufferForRedo(record, i + 1); + page = (Page) BufferGetPage(buffer); + data = XLogRecGetBlockData(record, i + 1, &datalen); + + tuples = decodePageSplitRecord(data, datalen, &num); + + /* ok, clear buffer */ + if (xldata->origleaf && blkno != GIST_ROOT_BLKNO) + flags = F_LEAF; + else + flags = 0; + GISTInitBuffer(buffer, flags); + + /* and fill it */ + gistfillbuffer(page, tuples, num, FirstOffsetNumber); + + if (blkno == GIST_ROOT_BLKNO) + { + GistPageGetOpaque(page)->rightlink = InvalidBlockNumber; + GistPageSetNSN(page, xldata->orignsn); + GistClearFollowRight(page); + } + else + { + if (i < xldata->npage - 1) + { + BlockNumber nextblkno; + + XLogRecGetBlockTag(record, i + 2, NULL, NULL, &nextblkno); + GistPageGetOpaque(page)->rightlink = nextblkno; + } + else + GistPageGetOpaque(page)->rightlink = xldata->origrlink; + GistPageSetNSN(page, xldata->orignsn); + if (i < xldata->npage - 1 && !isrootsplit && + xldata->markfollowright) + GistMarkFollowRight(page); + else + GistClearFollowRight(page); + } + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + + if (i == 0) + firstbuffer = buffer; + else + UnlockReleaseBuffer(buffer); + } + + /* Fix follow-right data on left child page, if any */ + if (XLogRecHasBlockRef(record, 0)) + gistRedoClearFollowRight(record, 0); + + /* Finally, release lock on the first page */ + UnlockReleaseBuffer(firstbuffer); +} + +/* redo page deletion */ +static void +gistRedoPageDelete(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + gistxlogPageDelete *xldata = (gistxlogPageDelete *) XLogRecGetData(record); + Buffer parentBuffer; + Buffer leafBuffer; + + if (XLogReadBufferForRedo(record, 0, &leafBuffer) == BLK_NEEDS_REDO) + { + Page page = (Page) BufferGetPage(leafBuffer); + + GistPageSetDeleted(page, xldata->deleteXid); + + PageSetLSN(page, lsn); + MarkBufferDirty(leafBuffer); + } + + if (XLogReadBufferForRedo(record, 1, &parentBuffer) == BLK_NEEDS_REDO) + { + Page page = (Page) BufferGetPage(parentBuffer); + + PageIndexTupleDelete(page, xldata->downlinkOffset); + + PageSetLSN(page, lsn); + MarkBufferDirty(parentBuffer); + } + + if (BufferIsValid(parentBuffer)) + UnlockReleaseBuffer(parentBuffer); + if (BufferIsValid(leafBuffer)) + UnlockReleaseBuffer(leafBuffer); +} + +static void +gistRedoPageReuse(XLogReaderState *record) +{ + gistxlogPageReuse *xlrec = (gistxlogPageReuse *) XLogRecGetData(record); + + /* + * PAGE_REUSE records exist to provide a conflict point when we reuse + * pages in the index via the FSM. That's all they do though. + * + * latestRemovedXid was the page's deleteXid. The + * GlobalVisCheckRemovableFullXid(deleteXid) test in gistPageRecyclable() + * conceptually mirrors the PGPROC->xmin > limitXmin test in + * GetConflictingVirtualXIDs(). Consequently, one XID value achieves the + * same exclusion effect on primary and standby. + */ + if (InHotStandby) + ResolveRecoveryConflictWithSnapshotFullXid(xlrec->latestRemovedFullXid, + xlrec->node); +} + +void +gist_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + MemoryContext oldCxt; + + /* + * GiST indexes do not require any conflict processing. 
NB: If we ever + * implement a similar optimization we have in b-tree, and remove killed + * tuples outside VACUUM, we'll need to handle that here. + */ + + oldCxt = MemoryContextSwitchTo(opCtx); + switch (info) + { + case XLOG_GIST_PAGE_UPDATE: + gistRedoPageUpdateRecord(record); + break; + case XLOG_GIST_DELETE: + gistRedoDeleteRecord(record); + break; + case XLOG_GIST_PAGE_REUSE: + gistRedoPageReuse(record); + break; + case XLOG_GIST_PAGE_SPLIT: + gistRedoPageSplitRecord(record); + break; + case XLOG_GIST_PAGE_DELETE: + gistRedoPageDelete(record); + break; + case XLOG_GIST_ASSIGN_LSN: + /* nop. See gistGetFakeLSN(). */ + break; + default: + elog(PANIC, "gist_redo: unknown op code %u", info); + } + + MemoryContextSwitchTo(oldCxt); + MemoryContextReset(opCtx); +} + +void +gist_xlog_startup(void) +{ + opCtx = createTempGistContext(); +} + +void +gist_xlog_cleanup(void) +{ + MemoryContextDelete(opCtx); +} + +/* + * Mask a Gist page before running consistency checks on it. + */ +void +gist_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + + mask_page_lsn_and_checksum(page); + + mask_page_hint_bits(page); + mask_unused_space(page); + + /* + * NSN is nothing but a special purpose LSN. Hence, mask it for the same + * reason as mask_page_lsn_and_checksum. + */ + GistPageSetNSN(page, (uint64) MASK_MARKER); + + /* + * We update F_FOLLOW_RIGHT flag on the left child after writing WAL + * record. Hence, mask this flag. See gistplacetopage() for details. + */ + GistMarkFollowRight(page); + + if (GistPageIsLeaf(page)) + { + /* + * In gist leaf pages, it is possible to modify the LP_FLAGS without + * emitting any WAL record. Hence, mask the line pointer flags. See + * gistkillitems() for details. + */ + mask_lp_flags(page); + } + + /* + * During gist redo, we never mark a page as garbage. Hence, mask it to + * ignore any differences. + */ + GistClearPageHasGarbage(page); +} + +/* + * Write WAL record of a page split. + */ +XLogRecPtr +gistXLogSplit(bool page_is_leaf, + SplitedPageLayout *dist, + BlockNumber origrlink, GistNSN orignsn, + Buffer leftchildbuf, bool markfollowright) +{ + gistxlogPageSplit xlrec; + SplitedPageLayout *ptr; + int npage = 0; + XLogRecPtr recptr; + int i; + + for (ptr = dist; ptr; ptr = ptr->next) + npage++; + + xlrec.origrlink = origrlink; + xlrec.orignsn = orignsn; + xlrec.origleaf = page_is_leaf; + xlrec.npage = (uint16) npage; + xlrec.markfollowright = markfollowright; + + XLogBeginInsert(); + + /* + * Include a full page image of the child buf. (only necessary if a + * checkpoint happened since the child page was split) + */ + if (BufferIsValid(leftchildbuf)) + XLogRegisterBuffer(0, leftchildbuf, REGBUF_STANDARD); + + /* + * NOTE: We register a lot of data. The caller must've called + * XLogEnsureRecordSpace() to prepare for that. We cannot do it here, + * because we're already in a critical section. If you change the number + * of buffer or data registrations here, make sure you modify the + * XLogEnsureRecordSpace() calls accordingly! + */ + XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageSplit)); + + i = 1; + for (ptr = dist; ptr; ptr = ptr->next) + { + XLogRegisterBuffer(i, ptr->buffer, REGBUF_WILL_INIT); + XLogRegisterBufData(i, (char *) &(ptr->block.num), sizeof(int)); + XLogRegisterBufData(i, (char *) ptr->list, ptr->lenlist); + i++; + } + + recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_SPLIT); + + return recptr; +} + +/* + * Write XLOG record describing a page deletion. This also includes removal of + * downlink from the parent page. 
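+ * (The deleted leaf is registered as block 0 and its parent as block 1,
+ * both as standard buffers.)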
+ */ +XLogRecPtr +gistXLogPageDelete(Buffer buffer, FullTransactionId xid, + Buffer parentBuffer, OffsetNumber downlinkOffset) +{ + gistxlogPageDelete xlrec; + XLogRecPtr recptr; + + xlrec.deleteXid = xid; + xlrec.downlinkOffset = downlinkOffset; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfGistxlogPageDelete); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + XLogRegisterBuffer(1, parentBuffer, REGBUF_STANDARD); + + recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_DELETE); + + return recptr; +} + +/* + * Write an empty XLOG record to assign a distinct LSN. + */ +XLogRecPtr +gistXLogAssignLSN(void) +{ + int dummy = 0; + + /* + * Records other than SWITCH_WAL must have content. We use an integer 0 to + * follow the restriction. + */ + XLogBeginInsert(); + XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT); + XLogRegisterData((char *) &dummy, sizeof(dummy)); + return XLogInsert(RM_GIST_ID, XLOG_GIST_ASSIGN_LSN); +} + +/* + * Write XLOG record about reuse of a deleted page. + */ +void +gistXLogPageReuse(Relation rel, BlockNumber blkno, FullTransactionId latestRemovedXid) +{ + gistxlogPageReuse xlrec_reuse; + + /* + * Note that we don't register the buffer with the record, because this + * operation doesn't modify the page. This record only exists to provide a + * conflict point for Hot Standby. + */ + + /* XLOG stuff */ + xlrec_reuse.node = rel->rd_node; + xlrec_reuse.block = blkno; + xlrec_reuse.latestRemovedFullXid = latestRemovedXid; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec_reuse, SizeOfGistxlogPageReuse); + + XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_REUSE); +} + +/* + * Write XLOG record describing a page update. The update can include any + * number of deletions and/or insertions of tuples on a single index page. + * + * If this update inserts a downlink for a split page, also record that + * the F_FOLLOW_RIGHT flag on the child page is cleared and NSN set. + * + * Note that both the todelete array and the tuples are marked as belonging + * to the target buffer; they need not be stored in XLOG if XLogInsert decides + * to log the whole buffer contents instead. + */ +XLogRecPtr +gistXLogUpdate(Buffer buffer, + OffsetNumber *todelete, int ntodelete, + IndexTuple *itup, int ituplen, + Buffer leftchildbuf) +{ + gistxlogPageUpdate xlrec; + int i; + XLogRecPtr recptr; + + xlrec.ntodelete = ntodelete; + xlrec.ntoinsert = ituplen; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageUpdate)); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + XLogRegisterBufData(0, (char *) todelete, sizeof(OffsetNumber) * ntodelete); + + /* new tuples */ + for (i = 0; i < ituplen; i++) + XLogRegisterBufData(0, (char *) (itup[i]), IndexTupleSize(itup[i])); + + /* + * Include a full page image of the child buf. (only necessary if a + * checkpoint happened since the child page was split) + */ + if (BufferIsValid(leftchildbuf)) + XLogRegisterBuffer(1, leftchildbuf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE); + + return recptr; +} + +/* + * Write XLOG record describing a delete of leaf index tuples marked as DEAD + * during new tuple insertion. One may think that this case is already covered + * by gistXLogUpdate(). But deletion of index tuples might conflict with + * standby queries and needs special handling. 
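+ * (The conflict is resolved at redo time: the latestRemovedXid carried in
+ * the record lets gistRedoDeleteRecord() call
+ * ResolveRecoveryConflictWithSnapshot() on a hot-standby server.)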
+ */ +XLogRecPtr +gistXLogDelete(Buffer buffer, OffsetNumber *todelete, int ntodelete, + TransactionId latestRemovedXid) +{ + gistxlogDelete xlrec; + XLogRecPtr recptr; + + xlrec.latestRemovedXid = latestRemovedXid; + xlrec.ntodelete = ntodelete; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfGistxlogDelete); + + /* + * We need the target-offsets array whether or not we store the whole + * buffer, to allow us to find the latestRemovedXid on a standby server. + */ + XLogRegisterData((char *) todelete, ntodelete * sizeof(OffsetNumber)); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + + recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_DELETE); + + return recptr; +} diff --git a/src/backend/access/hash/Makefile b/src/backend/access/hash/Makefile new file mode 100644 index 0000000..75bf365 --- /dev/null +++ b/src/backend/access/hash/Makefile @@ -0,0 +1,27 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/hash +# +# IDENTIFICATION +# src/backend/access/hash/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/hash +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + hash.o \ + hash_xlog.o \ + hashfunc.o \ + hashinsert.o \ + hashovfl.o \ + hashpage.o \ + hashsearch.o \ + hashsort.o \ + hashutil.o \ + hashvalidate.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/hash/README b/src/backend/access/hash/README new file mode 100644 index 0000000..2227ebf --- /dev/null +++ b/src/backend/access/hash/README @@ -0,0 +1,651 @@ +src/backend/access/hash/README + +Hash Indexing +============= + +This directory contains an implementation of hash indexing for Postgres. +Most of the core ideas are taken from Margo Seltzer and Ozan Yigit, +A New Hashing Package for UNIX, Proceedings of the Winter USENIX Conference, +January 1991. (Our in-memory hashtable implementation, +src/backend/utils/hash/dynahash.c, also relies on some of the same concepts; +it is derived from code written by Esmond Pitt and later improved by Margo +among others.) + +A hash index consists of two or more "buckets", into which tuples are +placed whenever their hash key maps to the bucket number. The +key-to-bucket-number mapping is chosen so that the index can be +incrementally expanded. When a new bucket is to be added to the index, +exactly one existing bucket will need to be "split", with some of its +tuples being transferred to the new bucket according to the updated +key-to-bucket-number mapping. This is essentially the same hash table +management technique embodied in src/backend/utils/hash/dynahash.c for +in-memory hash tables. + +Each bucket in the hash index comprises one or more index pages. The +bucket's first page is permanently assigned to it when the bucket is +created. Additional pages, called "overflow pages", are added if the +bucket receives too many tuples to fit in the primary bucket page. +The pages of a bucket are chained together in a doubly-linked list +using fields in the index page special space. + +There is currently no provision to shrink a hash index, other than by +rebuilding it with REINDEX. Overflow pages can be recycled for reuse +in other buckets, but we never give them back to the operating system. +There is no provision for reducing the number of buckets, either. + +As of PostgreSQL 8.4, hash index entries store only the hash code, not the +actual data value, for each indexed item. 
This makes the index entries +smaller (perhaps very substantially so) and speeds up various operations. +In particular, we can speed searches by keeping the index entries in any +one index page sorted by hash code, thus allowing binary search to be used +within an index page. Note however that there is *no* assumption about the +relative ordering of hash codes across different index pages of a bucket. + + +Page Addressing +--------------- + +There are four kinds of pages in a hash index: the meta page (page zero), +which contains statically allocated control information; primary bucket +pages; overflow pages; and bitmap pages, which keep track of overflow +pages that have been freed and are available for re-use. For addressing +purposes, bitmap pages are regarded as a subset of the overflow pages. + +Primary bucket pages and overflow pages are allocated independently (since +any given index might need more or fewer overflow pages relative to its +number of buckets). The hash code uses an interesting set of addressing +rules to support a variable number of overflow pages while not having to +move primary bucket pages around after they are created. + +Primary bucket pages (henceforth just "bucket pages") are allocated in +power-of-2 groups, called "split points" in the code. That means at every new +splitpoint we double the existing number of buckets. Allocating huge chunks +of bucket pages all at once isn't optimal and we will take ages to consume +those. To avoid this exponential growth of index size, we did use a trick to +break up allocation of buckets at the splitpoint into 4 equal phases. If +(2 ^ x) are the total buckets need to be allocated at a splitpoint (from now on +we shall call this as a splitpoint group), then we allocate 1/4th (2 ^ (x - 2)) +of total buckets at each phase of splitpoint group. Next quarter of allocation +will only happen if buckets of the previous phase have been already consumed. +For the initial splitpoint groups < 10 we will allocate all of their buckets in +single phase only, as number of buckets allocated at initial groups are small +in numbers. And for the groups >= 10 the allocation process is distributed +among four equal phases. At group 10 we allocate (2 ^ 9) buckets in 4 +different phases {2 ^ 7, 2 ^ 7, 2 ^ 7, 2 ^ 7}, the numbers in curly braces +indicate the number of buckets allocated within each phase of splitpoint group +10. And, for splitpoint group 11 and 12 allocation phases will be +{2 ^ 8, 2 ^ 8, 2 ^ 8, 2 ^ 8} and {2 ^ 9, 2 ^ 9, 2 ^ 9, 2 ^ 9} respectively. We +can see that at each splitpoint group we double the total number of buckets +from the previous group but in an incremental phase. The bucket pages +allocated within one phase of a splitpoint group will appear consecutively in +the index. This addressing scheme allows the physical location of a bucket +page to be computed from the bucket number relatively easily, using only a +small amount of control information. If we look at the function +_hash_spareindex for a given bucket number we first compute the +splitpoint group it belongs to and then the phase to which the bucket belongs +to. Adding them we get the global splitpoint phase number S to which the +bucket belongs and then simply add "hashm_spares[S] + 1" (where hashm_spares[] +is an array stored in the metapage) with given bucket number to compute its +physical address. The hashm_spares[S] can be interpreted as the total number +of overflow pages that have been allocated before the bucket pages of +splitpoint phase S. 
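+As an illustration, the group/phase arithmetic described above can be
+modelled in standalone C roughly as follows.  This is a simplified sketch of
+the scheme as described here, not the actual _hash_spareindex code, and the
+helper names are invented for the example; bucket numbers are taken as
+0-based:
+
+    #include <stdint.h>
+    #include <stdio.h>
+
+    #define GROUPS_WITH_ONE_PHASE 10
+    #define PHASES_PER_GROUP      4
+
+    /* splitpoint group of a bucket: smallest g with bucket_no < 2^g */
+    static uint32_t
+    bucket_group(uint32_t bucket_no)
+    {
+        uint32_t    g = 0;
+
+        while (((uint64_t) 1 << g) <= bucket_no)
+            g++;
+        return g;
+    }
+
+    /* global splitpoint phase number S to which a bucket belongs */
+    static uint32_t
+    bucket_phase(uint32_t bucket_no)
+    {
+        uint32_t    g = bucket_group(bucket_no);
+        uint32_t    offset_in_group;
+        uint32_t    buckets_per_phase;
+
+        if (g < GROUPS_WITH_ONE_PHASE)
+            return g;           /* early groups have a single phase */
+
+        /* group g adds 2^(g-1) new buckets, split into four equal phases */
+        offset_in_group = bucket_no - ((uint32_t) 1 << (g - 1));
+        buckets_per_phase = (uint32_t) 1 << (g - 3);
+        return GROUPS_WITH_ONE_PHASE +
+            PHASES_PER_GROUP * (g - GROUPS_WITH_ONE_PHASE) +
+            offset_in_group / buckets_per_phase;
+    }
+
+    int
+    main(void)
+    {
+        /* bucket 600 lies in group 10 (buckets 512..1023), first phase */
+        printf("S = %u\n", (unsigned) bucket_phase(600));   /* S = 10 */
+        return 0;
+    }
+
+Given S, the physical address then follows by adding hashm_spares[S] + 1 to
+the bucket number, as described above.
+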
The hashm_spares[0] is always 0, so that buckets 0 and 1 +always appear at block numbers 1 and 2, just after the meta page. We always +have hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the +former. The difference between the two represents the number of overflow pages +appearing between the bucket page groups of splitpoints phase N and N+1. +(Note: the above describes what happens when filling an initially minimally +sized hash index. In practice, we try to estimate the required index size and +allocate a suitable number of splitpoints phases immediately, to avoid +expensive re-splitting during initial index build.) + +When S splitpoints exist altogether, the array entries hashm_spares[0] +through hashm_spares[S] are valid; hashm_spares[S] records the current +total number of overflow pages. New overflow pages are created as needed +at the end of the index, and recorded by incrementing hashm_spares[S]. +When it is time to create a new splitpoint phase's worth of bucket pages, we +copy hashm_spares[S] into hashm_spares[S+1] and increment S (which is +stored in the hashm_ovflpoint field of the meta page). This has the +effect of reserving the correct number of bucket pages at the end of the +index, and preparing to allocate additional overflow pages after those +bucket pages. hashm_spares[] entries before S cannot change anymore, +since that would require moving already-created bucket pages. + +The last page nominally used by the index is always determinable from +hashm_spares[S]. To avoid complaints from smgr, the logical EOF as seen by +the filesystem and smgr must always be greater than or equal to this page. +We have to allow the case "greater than" because it's possible that during +an index extension we crash after allocating filesystem space and before +updating the metapage. Note that on filesystems that allow "holes" in +files, it's entirely likely that pages before the logical EOF are not yet +allocated: when we allocate a new splitpoint phase's worth of bucket pages, we +physically zero the last such page to force the EOF up, and the first such +page will be used immediately, but the intervening pages are not written +until needed. + +Since overflow pages may be recycled if enough tuples are deleted from +their bucket, we need a way to keep track of currently-free overflow +pages. The state of each overflow page (0 = available, 1 = not available) +is recorded in "bitmap" pages dedicated to this purpose. The entries in +the bitmap are indexed by "bit number", a zero-based count in which every +overflow page has a unique entry. We can convert between an overflow +page's physical block number and its bit number using the information in +hashm_spares[] (see hashovfl.c for details). The bit number sequence +includes the bitmap pages, which is the reason for saying that bitmap +pages are a subset of the overflow pages. It turns out in fact that each +bitmap page's first bit represents itself --- this is not an essential +property, but falls out of the fact that we only allocate another bitmap +page when we really need one. Bit number zero always corresponds to the +first bitmap page, which is allocated during index creation just after all +the initially created buckets. + + +Lock Definitions +---------------- + +Concurrency control for hash indexes is provided using buffer content +locks, buffer pins, and cleanup locks. 
Here as elsewhere in PostgreSQL,
+cleanup lock means that we hold an exclusive lock on the buffer and have
+observed at some point after acquiring the lock that we hold the only pin
+on that buffer. For hash indexes, a cleanup lock on a primary bucket page
+represents the right to perform an arbitrary reorganization of the entire
+bucket. Therefore, scans retain a pin on the primary bucket page for the
+bucket they are currently scanning. Splitting a bucket requires a cleanup
+lock on both the old and new primary bucket pages. VACUUM therefore takes
+a cleanup lock on every bucket page in order to remove tuples. It can also
+remove tuples copied to a new bucket by any previous split operation, because
+the cleanup lock taken on the primary bucket page guarantees that no scans
+which started prior to the most recent split can still be in progress. After
+cleaning each page individually, it attempts to take a cleanup lock on the
+primary bucket page in order to "squeeze" the bucket down to the minimum
+possible number of pages.
+
+To avoid deadlocks, we must be consistent about the lock order in which we
+lock the buckets for operations that require locks on two different buckets.
+We choose to always lock the lower-numbered bucket first. The metapage is
+only ever locked after all bucket locks have been taken.
+
+
+Metapage Caching
+----------------
+
+Both scanning the index and inserting tuples require locating the bucket
+where a given tuple ought to be located. To do this, we need the bucket
+count, highmask, and lowmask from the metapage; however, it's undesirable
+for performance reasons to have to lock and pin the metapage for
+every such operation. Instead, we retain a cached copy of the metapage
+in each backend's relcache entry. This will produce the correct
+bucket mapping as long as the target bucket hasn't been split since the
+last cache refresh.
+
+To guard against the possibility that such a split has occurred, the
+primary page of each bucket chain stores the number of buckets that
+existed as of the time the bucket was last split, or if never split as
+of the time it was created, in the space normally used for the
+previous block number (that is, hasho_prevblkno). This doesn't cost
+anything because the primary bucket page is always the first page in
+the chain, and the previous block number is therefore always, in
+reality, InvalidBlockNumber.
+
+After computing the ostensibly-correct bucket number based on our cached
+copy of the metapage, we lock the corresponding primary bucket page and
+check whether the bucket count stored in hasho_prevblkno is greater than
+the number of buckets stored in our cached copy of the metapage. If
+so, the bucket has certainly been split, because the count must originally
+have been less than the number of buckets that existed at that time and
+can't have increased except due to a split. If not, the bucket can't have
+been split, because a split would have created a new bucket with a higher
+bucket number than any we'd seen previously. In the latter case, we've
+locked the correct bucket and can proceed; in the former case, we must
+release the lock on this bucket, lock the metapage, update our cache,
+unlock the metapage, and retry.
+
+Needing to retry occasionally might seem expensive, but the number of times
+any given bucket can be split is limited to a few dozen no matter how
+many times the hash index is accessed, because the total number of
+buckets is limited to less than 2^32.
On the other hand, the number of
+times we access a bucket is unbounded and will be several orders of
+magnitude larger even in unsympathetic cases.
+
+(The metapage cache is new in v10. Older hash indexes had the primary
+bucket page's hasho_prevblkno initialized to InvalidBuffer.)
+
+Pseudocode Algorithms
+---------------------
+
+The various flags used in hash index operations are described below:
+
+The bucket-being-split and bucket-being-populated flags indicate that a split
+operation is in progress for a bucket. During a split, the bucket-being-split
+flag is set on the old bucket and the bucket-being-populated flag is set on
+the new bucket. These flags are cleared once the split operation is finished.
+
+The split-cleanup flag indicates that a bucket which has been recently split
+still contains tuples that were also copied to the new bucket; it essentially
+marks the split as incomplete. Once we're certain that no scans which
+started before the new bucket was fully populated are still in progress, we
+can remove the copies from the old bucket and clear the flag. We insist that
+this flag must be clear before splitting a bucket; thus, a bucket can't be
+split again until the previous split is totally complete.
+
+The moved-by-split flag on a tuple indicates that the tuple was moved from the
+old to the new bucket. Concurrent scans will skip such tuples until the split
+operation is finished. Once a tuple is marked as moved-by-split, it will
+remain so forever, but that does no harm. We intentionally do not clear it,
+since clearing it would generate additional, unnecessary I/O.
+
+The operations we need to support are: readers scanning the index for
+entries of a particular hash code (which by definition are all in the same
+bucket); insertion of a new tuple into the correct bucket; enlarging the
+hash table by splitting an existing bucket; and garbage collection
+(deletion of dead tuples and compaction of buckets). Bucket splitting is
+done at the conclusion of any insertion that leaves the hash table more full
+than the target load factor, but it is convenient to consider it as an
+independent operation. Note that we do not have a bucket-merge operation
+--- the number of buckets never shrinks. Insertion, splitting, and
+garbage collection may all need access to freelist management, which keeps
+track of available overflow pages.
+
+The reader algorithm is:
+
+    lock the primary bucket page of the target bucket
+    if the target bucket is still being populated by a split:
+        release the buffer content lock on current bucket page
+        pin and acquire the buffer content lock on old bucket in shared mode
+        release the buffer content lock on old bucket, but not pin
+        retake the buffer content lock on new bucket
+        arrange to scan the old bucket normally and the new bucket for
+         tuples which are not moved-by-split
+-- then, per read request:
+    reacquire content lock on current page
+    step to next page if necessary (no chaining of content locks, but keep
+     the pin on the primary bucket throughout the scan)
+    save all the matching tuples from current index page into an items array
+    release pin and content lock (but if it is primary bucket page retain
+     its pin till the end of the scan)
+    get tuple from an item array
+-- at scan shutdown:
+    release all pins still held
+
+Holding the buffer pin on the primary bucket page for the whole scan prevents
+the reader's current-tuple pointer from being invalidated by splits or
+compactions. (Of course, other buckets can still be split or compacted.)
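+
+As an illustration only, the "save all the matching tuples ... into an items
+array" step above might look roughly like the sketch below. This is not the
+actual hashsearch.c code; SketchItem and sketch_read_page are hypothetical
+names, the struct merely mirrors HashScanPosItem, and the usual backend
+headers (such as access/hash.h and storage/bufpage.h) are assumed.
+
+typedef struct SketchItem
+{
+    ItemPointerData heapTid;        /* TID of the referenced heap tuple */
+    OffsetNumber    indexOffset;    /* entry's position within the page */
+} SketchItem;
+
+static int
+sketch_read_page(Page page, uint32 hashkey, SketchItem *items, int maxitems)
+{
+    OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
+    OffsetNumber off;
+    int          nitems = 0;
+
+    /* the caller holds the page's buffer content lock here */
+    for (off = FirstOffsetNumber;
+         off <= maxoff && nitems < maxitems;
+         off = OffsetNumberNext(off))
+    {
+        IndexTuple  itup = (IndexTuple) PageGetItem(page,
+                                                    PageGetItemId(page, off));
+
+        /* entries are kept sorted by hash code, so a binary search also works */
+        if (_hash_get_indextuple_hashkey(itup) == hashkey)
+        {
+            items[nitems].heapTid = itup->t_tid;
+            items[nitems].indexOffset = off;
+            nitems++;
+        }
+    }
+
+    /* the caller now drops the content lock, keeping only the bucket pin */
+    return nitems;
+}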
+
+To minimize lock/unlock traffic, a hash index scan always searches the entire
+hash page to identify all the matching items at once, copying their heap tuple
+IDs into backend-local storage. The heap tuple IDs are then processed while
+not holding any page lock within the index, thereby allowing concurrent
+insertions to happen on the same index page without requiring the reader to
+re-find its current scan position. We do continue to hold a pin on the
+bucket page, to protect against concurrent deletions and bucket splits.
+
+To allow scans during a bucket split, if the bucket is marked as
+bucket-being-populated at the start of the scan, we scan all the tuples in
+that bucket except those marked as moved-by-split. Once the scan of all the
+tuples in the current bucket finishes, we scan the old bucket from which this
+bucket was formed by the split.
+
+The insertion algorithm is rather similar:
+
+    lock the primary bucket page of the target bucket
+-- (so far same as reader, except for acquisition of buffer content lock in
+    exclusive mode on primary bucket page)
+    if the bucket-being-split flag is set for a bucket and pin count on it is
+     one, then finish the split
+        release the buffer content lock on current bucket
+        get the "new" bucket which was being populated by the split
+        scan the new bucket and form the hash table of TIDs
+        conditionally get the cleanup lock on old and new buckets
+        if we get the lock on both the buckets
+            finish the split using algorithm mentioned below for split
+        release the pin on old bucket and restart the insert from beginning.
+    if current page is full, first check if this page contains any dead tuples.
+     if yes, remove dead tuples from the current page and again check for the
+     availability of the space. If enough space found, insert the tuple else
+     release lock but not pin, read/exclusive-lock
+     next page; repeat as needed
+    >> see below if no space in any page of bucket
+    take buffer content lock in exclusive mode on metapage
+    insert tuple at appropriate place in page
+    mark current page dirty
+    increment tuple count, decide if split needed
+    mark meta page dirty
+    write WAL for insertion of tuple
+    release the buffer content lock on metapage
+    release buffer content lock on current page
+    if current page is not a bucket page, release the pin on bucket page
+    if split is needed, enter Split algorithm below
+    release the pin on metapage
+
+To speed searches, the index entries within any individual index page are
+kept sorted by hash code; the insertion code must take care to insert new
+entries in the right place. It is okay for an insertion to take place in a
+bucket that is being actively scanned, because readers can cope with this
+as explained above. We only need the short-term buffer locks to ensure
+that readers do not see a partially-updated page.
+
+To avoid deadlock between readers and inserters, whenever there is a need
+to lock multiple buckets, we always take the locks in the order suggested in
+Lock Definitions above. This allows a very high degree of concurrency. (The
+exclusive metapage lock taken to update the tuple count is stronger than
+necessary, since readers do not care about the tuple count, but the lock is
+held for such a short time that this is probably not an issue.)
+
+When an inserter cannot find space in any existing page of a bucket, it
+must obtain an overflow page and add that page to the bucket's chain.
+Details of that part of the algorithm appear later.
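+
+Both the reader and insertion algorithms above begin by locating the target
+bucket from the bucket count, highmask, and lowmask kept in the (possibly
+cached) metapage. As an illustration only, that mapping is essentially the
+following sketch (compare _hash_hashkey2bucket(), which hashbucketcleanup()
+calls later in this patch; sketch_key_to_bucket is a hypothetical name):
+
+static Bucket
+sketch_key_to_bucket(uint32 hashkey, uint32 maxbucket,
+                     uint32 highmask, uint32 lowmask)
+{
+    Bucket      bucket = hashkey & highmask;
+
+    /*
+     * If the masked value points past the buckets created so far, that
+     * bucket does not exist yet; fall back to the smaller mask, which maps
+     * the key to the lower-numbered bucket it would eventually be split from.
+     */
+    if (bucket > maxbucket)
+        bucket = bucket & lowmask;
+
+    return bucket;
+}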
+ +The page split algorithm is entered whenever an inserter observes that the +index is overfull (has a higher-than-wanted ratio of tuples to buckets). +The algorithm attempts, but does not necessarily succeed, to split one +existing bucket in two, thereby lowering the fill ratio: + + pin meta page and take buffer content lock in exclusive mode + check split still needed + if split not needed anymore, drop buffer content lock and pin and exit + decide which bucket to split + try to take a cleanup lock on that bucket; if fail, give up + if that bucket is still being split or has split-cleanup work: + try to finish the split and the cleanup work + if that succeeds, start over; if it fails, give up + mark the old and new buckets indicating split is in progress + mark both old and new buckets as dirty + write WAL for allocation of new page for split + copy the tuples that belongs to new bucket from old bucket, marking + them as moved-by-split + write WAL record for moving tuples to new page once the new page is full + or all the pages of old bucket are finished + release lock but not pin for primary bucket page of old bucket, + read/shared-lock next page; repeat as needed + clear the bucket-being-split and bucket-being-populated flags + mark the old bucket indicating split-cleanup + write WAL for changing the flags on both old and new buckets + +The split operation's attempt to acquire cleanup-lock on the old bucket number +could fail if another process holds any lock or pin on it. We do not want to +wait if that happens, because we don't want to wait while holding the metapage +exclusive-lock. So, this is a conditional LWLockAcquire operation, and if +it fails we just abandon the attempt to split. This is all right since the +index is overfull but perfectly functional. Every subsequent inserter will +try to split, and eventually one will succeed. If multiple inserters failed +to split, the index might still be overfull, but eventually, the index will +not be overfull and split attempts will stop. (We could make a successful +splitter loop to see if the index is still overfull, but it seems better to +distribute the split overhead across successive insertions.) + +If a split fails partway through (e.g. due to insufficient disk space or an +interrupt), the index will not be corrupted. Instead, we'll retry the split +every time a tuple is inserted into the old bucket prior to inserting the new +tuple; eventually, we should succeed. The fact that a split is left +unfinished doesn't prevent subsequent buckets from being split, but we won't +try to split the bucket again until the prior split is finished. In other +words, a bucket can be in the middle of being split for some time, but it can't +be in the middle of two splits at the same time. 
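+
+The "conditionally get the cleanup lock ... if fail, give up" rule above is a
+non-blocking lock attempt. A minimal, hypothetical sketch of just that rule is
+shown below (sketch_try_start_split is not a real function and the split work
+itself is elided; compare _hash_expandtable(), which the hash.c comments later
+in this patch refer to):
+
+/*
+ * Caller holds a pin on obuf (the old bucket's primary page) and an
+ * exclusive content lock on metabuf.
+ */
+static bool
+sketch_try_start_split(Relation rel, Buffer metabuf, Buffer obuf)
+{
+    /*
+     * Waiting here while holding the metapage lock could stall every other
+     * backend, so use the conditional form and give up if the bucket is busy.
+     */
+    if (!ConditionalLockBufferForCleanup(obuf))
+    {
+        /* the index stays overfull but functional; a later inserter retries */
+        _hash_relbuf(rel, metabuf);     /* drop the metapage lock and pin */
+        return false;
+    }
+
+    /* ... perform the split steps listed above ... */
+    return true;
+}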
+ +The fourth operation is garbage collection (bulk deletion): + + next bucket := 0 + pin metapage and take buffer content lock in exclusive mode + fetch current max bucket number + release meta page buffer content lock and pin + while next bucket <= max bucket do + acquire cleanup lock on primary bucket page + loop: + scan and remove tuples + mark the target page dirty + write WAL for deleting tuples from target page + if this is the last bucket page, break out of loop + pin and x-lock next page + release prior lock and pin (except keep pin on primary bucket page) + if the page we have locked is not the primary bucket page: + release lock and take exclusive lock on primary bucket page + if there are no other pins on the primary bucket page: + squeeze the bucket to remove free space + release the pin on primary bucket page + next bucket ++ + end loop + pin metapage and take buffer content lock in exclusive mode + check if number of buckets changed + if so, release content lock and pin and return to for-each-bucket loop + else update metapage tuple count + mark meta page dirty and write WAL for update of metapage + release buffer content lock and pin + +Note that this is designed to allow concurrent splits and scans. If a split +occurs, tuples relocated into the new bucket will be visited twice by the +scan, but that does no harm. See also "Interlocking Between Scans and +VACUUM", below. + +We must be careful about the statistics reported by the VACUUM operation. +What we can do is count the number of tuples scanned, and believe this in +preference to the stored tuple count if the stored tuple count and number of +buckets did *not* change at any time during the scan. This provides a way of +correcting the stored tuple count if it gets out of sync for some reason. But +if a split or insertion does occur concurrently, the scan count is +untrustworthy; instead, subtract the number of tuples deleted from the stored +tuple count and use that. + +Interlocking Between Scans and VACUUM +------------------------------------- + +Since we release the lock on bucket page during a cleanup scan of a bucket, a +concurrent scan could start in that bucket before we've finished vacuuming it. +If a scan gets ahead of cleanup, we could have the following problem: (1) the +scan sees heap TIDs that are about to be removed before they are processed by +VACUUM, (2) the scan decides that one or more of those TIDs are dead, (3) +VACUUM completes, (4) one or more of the TIDs the scan decided were dead are +reused for an unrelated tuple, and finally (5) the scan wakes up and +erroneously kills the new tuple. + +Note that this requires VACUUM and a scan to be active in the same bucket at +the same time. If VACUUM completes before the scan starts, the scan never has +a chance to see the dead tuples; if the scan completes before the VACUUM +starts, the heap TIDs can't have been reused meanwhile. Furthermore, VACUUM +can't start on a bucket that has an active scan, because the scan holds a pin +on the primary bucket page, and VACUUM must take a cleanup lock on that page +in order to begin cleanup. Therefore, the only way this problem can occur is +for a scan to start after VACUUM has released the cleanup lock on the bucket +but before it has processed the entire bucket and then overtake the cleanup +operation. + +Currently, we prevent this using lock chaining: cleanup locks the next page +in the chain before releasing the lock and pin on the page just processed. 
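+
+As an illustration only, the lock-chaining walk described above might look
+roughly like the sketch below (sketch_chained_cleanup is a hypothetical name;
+the real loop, including the tuple removal and WAL logging elided here, is
+hashbucketcleanup() in hash.c later in this patch):
+
+static void
+sketch_chained_cleanup(Relation rel, Buffer bucket_buf,
+                       BufferAccessStrategy bstrategy)
+{
+    Buffer      buf = bucket_buf;   /* caller holds cleanup lock and pin */
+
+    for (;;)
+    {
+        Page            page = BufferGetPage(buf);
+        HashPageOpaque  opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+        BlockNumber     nextblkno = opaque->hasho_nextblkno;
+        Buffer          next_buf;
+
+        /* ... remove dead tuples from this page and WAL-log the change ... */
+
+        if (!BlockNumberIsValid(nextblkno))
+            break;              /* caller finishes up with the last page */
+
+        /* chain: lock the next overflow page before letting go of this one */
+        next_buf = _hash_getbuf_with_strategy(rel, nextblkno, HASH_WRITE,
+                                              LH_OVERFLOW_PAGE, bstrategy);
+
+        if (buf == bucket_buf)
+            LockBuffer(buf, BUFFER_LOCK_UNLOCK);    /* keep the primary pin */
+        else
+            _hash_relbuf(rel, buf);
+
+        buf = next_buf;
+    }
+}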
+ +Free Space Management +--------------------- + +(Question: why is this so complicated? Why not just have a linked list +of free pages with the list head in the metapage? It's not like we +avoid needing to modify the metapage with all this.) + +Free space management consists of two sub-algorithms, one for reserving +an overflow page to add to a bucket chain, and one for returning an empty +overflow page to the free pool. + +Obtaining an overflow page: + + take metapage content lock in exclusive mode + determine next bitmap page number; if none, exit loop + release meta page content lock + pin bitmap page and take content lock in exclusive mode + search for a free page (zero bit in bitmap) + if found: + set bit in bitmap + mark bitmap page dirty + take metapage buffer content lock in exclusive mode + if first-free-bit value did not change, + update it and mark meta page dirty + else (not found): + release bitmap page buffer content lock + loop back to try next bitmap page, if any +-- here when we have checked all bitmap pages; we hold meta excl. lock + extend index to add another overflow page; update meta information + mark meta page dirty + return page number + +It is slightly annoying to release and reacquire the metapage lock +multiple times, but it seems best to do it that way to minimize loss of +concurrency against processes just entering the index. We don't want +to hold the metapage exclusive lock while reading in a bitmap page. +(We can at least avoid repeated buffer pin/unpin here.) + +The normal path for extending the index does not require doing I/O while +holding the metapage lock. We do have to do I/O when the extension +requires adding a new bitmap page as well as the required overflow page +... but that is an infrequent case, so the loss of concurrency seems +acceptable. + +The portion of tuple insertion that calls the above subroutine looks +like this: + + -- having determined that no space is free in the target bucket: + remember last page of bucket, drop write lock on it + re-write-lock last page of bucket + if it is not last anymore, step to the last page + execute free-page-acquire (obtaining an overflow page) mechanism + described above + update (former) last page to point to the new page and mark buffer dirty + write-lock and initialize new page, with back link to former last page + write WAL for addition of overflow page + release the locks on meta page and bitmap page acquired in + free-page-acquire algorithm + release the lock on former last page + release the lock on new overflow page + insert tuple into new page + -- etc. + +Notice this handles the case where two concurrent inserters try to extend +the same bucket. They will end up with a valid, though perhaps +space-inefficient, configuration: two overflow pages will be added to the +bucket, each containing one tuple. + +The last part of this violates the rule about holding write lock on two +pages concurrently, but it should be okay to write-lock the previously +free page; there can be no other process holding lock on it. + +Bucket splitting uses a similar algorithm if it has to extend the new +bucket, but it need not worry about concurrent extension since it has +buffer content lock in exclusive mode on the new bucket. + +Freeing an overflow page requires the process to hold buffer content lock in +exclusive mode on the containing bucket, so need not worry about other +accessors of pages in the bucket. 
The algorithm is:
+
+    delink overflow page from bucket chain
+    (this requires read/update/write/release of fore and aft siblings)
+    pin meta page and take buffer content lock in shared mode
+    determine which bitmap page contains the free space bit for page
+    release meta page buffer content lock
+    pin bitmap page and take buffer content lock in exclusive mode
+    retake meta page buffer content lock in exclusive mode
+    move (insert) tuples that belong to the overflow page being freed
+    update bitmap bit
+    mark bitmap page dirty
+    if page number is still less than first-free-bit,
+        update first-free-bit field and mark meta page dirty
+    write WAL for delinking overflow page operation
+    release buffer content lock and pin
+    release meta page buffer content lock and pin
+
+We have to do it this way because we must clear the bitmap bit before
+changing the first-free-bit field (hashm_firstfree). It is possible that
+we set first-free-bit too small (because someone has already reused the
+page we just freed), but that is okay; the only cost is that the next overflow
+page acquirer will scan more bitmap bits than it needs to. What must be
+avoided is having first-free-bit greater than the actual first free bit,
+because then that free page would never be found by searchers.
+
+The reason for moving tuples from the overflow page while delinking the latter
+is to make this one atomic operation. Not doing so could lead to spurious
+reads on a standby; basically, the user might see the same tuple twice.
+
+
+WAL Considerations
+------------------
+
+Hash index operations such as create index, insert, delete, bucket split,
+allocate overflow page, and squeeze don't in themselves guarantee hash index
+consistency after a crash. To provide robustness, we write WAL for each of
+these operations.
+
+CREATE INDEX writes multiple WAL records. First, we write a record to cover
+the initialization of the metapage, followed by one for each new bucket
+created, followed by one for the initial bitmap page. It's not important for
+index creation to appear atomic, because the index isn't yet visible to any
+other transaction, and the creating transaction will roll back in the event of
+a crash. It would be difficult to cover the whole operation with a single
+write-ahead log record anyway, because we can log only a fixed number of
+pages, as given by XLR_MAX_BLOCK_ID (32), with current XLog machinery.
+
+Ordinary item insertions (that don't force a page split or need a new overflow
+page) are single WAL entries. They touch a single bucket page and the
+metapage. The metapage is updated during replay just as it is updated during
+the original operation.
+
+If an insertion causes the addition of an overflow page, there will be one
+WAL entry for the new overflow page and a second entry for the insert itself.
+
+If an insertion causes a bucket split, there will be one WAL entry for the
+insert itself, followed by a WAL entry for allocating a new bucket, followed
+by a WAL entry for each overflow bucket page in the new bucket to which the
+tuples are moved from the old bucket, followed by a WAL entry to indicate that
+the split is complete for both old and new buckets. A split operation which
+requires overflow pages to complete the operation will need to write a WAL
+record for each new allocation of an overflow page.
+
+As splitting involves multiple atomic actions, it's possible that the system
+crashes between moving tuples from bucket pages of the old bucket to the new
+bucket.
In such a case, after recovery, the old and new buckets will be
+marked with the bucket-being-split and bucket-being-populated flags
+respectively, which indicates that a split is in progress for those buckets.
+The reader algorithm works correctly, as it will scan both the old and new
+buckets when the split is in progress, as explained in the reader algorithm
+section above.
+
+We finish the split at the next insert or split operation on the old bucket,
+as explained in the insert and split algorithms above. It could be done during
+searches, too, but it seems best not to put any extra updates in what would
+otherwise be a read-only operation (updating is not possible in hot standby
+mode anyway). It would seem natural to complete the split in VACUUM, but since
+splitting a bucket might require allocating a new page, it might fail if you
+run out of disk space. That would be bad during VACUUM - the reason for
+running VACUUM in the first place might be that you ran out of disk space,
+and now VACUUM won't finish because you're out of disk space. In contrast,
+an insertion can require enlarging the physical file anyway.
+
+Deletion of tuples from a bucket is performed for two reasons: to remove dead
+tuples, and to remove tuples that were moved by a bucket split. A WAL entry
+is made for each bucket page from which tuples are removed, and then another
+WAL entry is made when we clear the needs-split-cleanup flag. If dead tuples
+are removed, a separate WAL entry is made to update the metapage.
+
+As deletion involves multiple atomic operations, it is quite possible that the
+system crashes (a) after removing tuples from some of the bucket pages, (b)
+before clearing the garbage flag, or (c) before updating the metapage. If the
+system crashes before completing (b), it will again try to clean the bucket
+during the next vacuum or insert after recovery, which can have some
+performance impact, but it will work fine. If the system crashes before
+completing (c), after recovery there could be some additional splits until the
+next vacuum updates the metapage, but the other operations like insert, delete
+and scan will work correctly. We could fix this problem by actually updating
+the metapage based on the delete operation during replay, but it's not clear
+whether it's worth the complication.
+
+A squeeze operation moves tuples from one of the buckets later in the chain to
+one of the buckets earlier in the chain, and writes a WAL record when either
+the bucket to which it is writing tuples is filled or the bucket from which it
+is removing tuples becomes empty.
+
+As a squeeze operation involves multiple atomic operations, it is quite
+possible that the system crashes before completing the operation on the entire
+bucket. After recovery, the operations will work correctly, but the index will
+remain bloated, and this can impact the performance of read and insert
+operations until the next vacuum squeezes the bucket completely.
+
+
+Other Notes
+-----------
+
+Cleanup locks prevent a split from occurring while *another* process is
+stopped in a given bucket. They also ensure that one of our *own* backend's
+scans is not stopped in the bucket.
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
new file mode 100644
index 0000000..0752fb3
--- /dev/null
+++ b/src/backend/access/hash/hash.c
@@ -0,0 +1,918 @@
+/*-------------------------------------------------------------------------
+ *
+ * hash.c
+ *    Implementation of Margo Seltzer's Hashing package for postgres.
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/hash/hash.c + * + * NOTES + * This file contains only the public interface routines. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/hash.h" +#include "access/hash_xlog.h" +#include "access/relscan.h" +#include "access/tableam.h" +#include "catalog/index.h" +#include "commands/progress.h" +#include "commands/vacuum.h" +#include "miscadmin.h" +#include "optimizer/plancat.h" +#include "pgstat.h" +#include "utils/builtins.h" +#include "utils/index_selfuncs.h" +#include "utils/rel.h" + +/* Working state for hashbuild and its callback */ +typedef struct +{ + HSpool *spool; /* NULL if not using spooling */ + double indtuples; /* # tuples accepted into index */ + Relation heapRel; /* heap relation descriptor */ +} HashBuildState; + +static void hashbuildCallback(Relation index, + ItemPointer tid, + Datum *values, + bool *isnull, + bool tupleIsAlive, + void *state); + + +/* + * Hash handler function: return IndexAmRoutine with access method parameters + * and callbacks. + */ +Datum +hashhandler(PG_FUNCTION_ARGS) +{ + IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); + + amroutine->amstrategies = HTMaxStrategyNumber; + amroutine->amsupport = HASHNProcs; + amroutine->amoptsprocnum = HASHOPTIONS_PROC; + amroutine->amcanorder = false; + amroutine->amcanorderbyop = false; + amroutine->amcanbackward = true; + amroutine->amcanunique = false; + amroutine->amcanmulticol = false; + amroutine->amoptionalkey = false; + amroutine->amsearcharray = false; + amroutine->amsearchnulls = false; + amroutine->amstorage = false; + amroutine->amclusterable = false; + amroutine->ampredlocks = true; + amroutine->amcanparallel = false; + amroutine->amcaninclude = false; + amroutine->amusemaintenanceworkmem = false; + amroutine->amparallelvacuumoptions = + VACUUM_OPTION_PARALLEL_BULKDEL; + amroutine->amkeytype = INT4OID; + + amroutine->ambuild = hashbuild; + amroutine->ambuildempty = hashbuildempty; + amroutine->aminsert = hashinsert; + amroutine->ambulkdelete = hashbulkdelete; + amroutine->amvacuumcleanup = hashvacuumcleanup; + amroutine->amcanreturn = NULL; + amroutine->amcostestimate = hashcostestimate; + amroutine->amoptions = hashoptions; + amroutine->amproperty = NULL; + amroutine->ambuildphasename = NULL; + amroutine->amvalidate = hashvalidate; + amroutine->amadjustmembers = hashadjustmembers; + amroutine->ambeginscan = hashbeginscan; + amroutine->amrescan = hashrescan; + amroutine->amgettuple = hashgettuple; + amroutine->amgetbitmap = hashgetbitmap; + amroutine->amendscan = hashendscan; + amroutine->ammarkpos = NULL; + amroutine->amrestrpos = NULL; + amroutine->amestimateparallelscan = NULL; + amroutine->aminitparallelscan = NULL; + amroutine->amparallelrescan = NULL; + + PG_RETURN_POINTER(amroutine); +} + +/* + * hashbuild() -- build a new hash index. + */ +IndexBuildResult * +hashbuild(Relation heap, Relation index, IndexInfo *indexInfo) +{ + IndexBuildResult *result; + BlockNumber relpages; + double reltuples; + double allvisfrac; + uint32 num_buckets; + long sort_threshold; + HashBuildState buildstate; + + /* + * We expect to be called exactly once for any index relation. If that's + * not the case, big trouble's what we have. 
+ */ + if (RelationGetNumberOfBlocks(index) != 0) + elog(ERROR, "index \"%s\" already contains data", + RelationGetRelationName(index)); + + /* Estimate the number of rows currently present in the table */ + estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac); + + /* Initialize the hash index metadata page and initial buckets */ + num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM); + + /* + * If we just insert the tuples into the index in scan order, then + * (assuming their hash codes are pretty random) there will be no locality + * of access to the index, and if the index is bigger than available RAM + * then we'll thrash horribly. To prevent that scenario, we can sort the + * tuples by (expected) bucket number. However, such a sort is useless + * overhead when the index does fit in RAM. We choose to sort if the + * initial index size exceeds maintenance_work_mem, or the number of + * buffers usable for the index, whichever is less. (Limiting by the + * number of buffers should reduce thrashing between PG buffers and kernel + * buffers, which seems useful even if no physical I/O results. Limiting + * by maintenance_work_mem is useful to allow easy testing of the sort + * code path, and may be useful to DBAs as an additional control knob.) + * + * NOTE: this test will need adjustment if a bucket is ever different from + * one page. Also, "initial index size" accounting does not include the + * metapage, nor the first bitmap page. + */ + sort_threshold = (maintenance_work_mem * 1024L) / BLCKSZ; + if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP) + sort_threshold = Min(sort_threshold, NBuffers); + else + sort_threshold = Min(sort_threshold, NLocBuffer); + + if (num_buckets >= (uint32) sort_threshold) + buildstate.spool = _h_spoolinit(heap, index, num_buckets); + else + buildstate.spool = NULL; + + /* prepare to build the index */ + buildstate.indtuples = 0; + buildstate.heapRel = heap; + + /* do the heap scan */ + reltuples = table_index_build_scan(heap, index, indexInfo, true, true, + hashbuildCallback, + (void *) &buildstate, NULL); + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_TOTAL, + buildstate.indtuples); + + if (buildstate.spool) + { + /* sort the tuples and insert them into the index */ + _h_indexbuild(buildstate.spool, buildstate.heapRel); + _h_spooldestroy(buildstate.spool); + } + + /* + * Return statistics + */ + result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); + + result->heap_tuples = reltuples; + result->index_tuples = buildstate.indtuples; + + return result; +} + +/* + * hashbuildempty() -- build an empty hash index in the initialization fork + */ +void +hashbuildempty(Relation index) +{ + _hash_init(index, 0, INIT_FORKNUM); +} + +/* + * Per-tuple callback for table_index_build_scan + */ +static void +hashbuildCallback(Relation index, + ItemPointer tid, + Datum *values, + bool *isnull, + bool tupleIsAlive, + void *state) +{ + HashBuildState *buildstate = (HashBuildState *) state; + Datum index_values[1]; + bool index_isnull[1]; + IndexTuple itup; + + /* convert data to a hash key; on failure, do not insert anything */ + if (!_hash_convert_tuple(index, + values, isnull, + index_values, index_isnull)) + return; + + /* Either spool the tuple for sorting, or just put it into the index */ + if (buildstate->spool) + _h_spool(buildstate->spool, tid, index_values, index_isnull); + else + { + /* form an index tuple and point it at the heap tuple */ + itup = index_form_tuple(RelationGetDescr(index), + index_values, index_isnull); + 
itup->t_tid = *tid; + _hash_doinsert(index, itup, buildstate->heapRel); + pfree(itup); + } + + buildstate->indtuples += 1; +} + +/* + * hashinsert() -- insert an index tuple into a hash table. + * + * Hash on the heap tuple's key, form an index tuple with hash code. + * Find the appropriate location for the new tuple, and put it there. + */ +bool +hashinsert(Relation rel, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + IndexInfo *indexInfo) +{ + Datum index_values[1]; + bool index_isnull[1]; + IndexTuple itup; + + /* convert data to a hash key; on failure, do not insert anything */ + if (!_hash_convert_tuple(rel, + values, isnull, + index_values, index_isnull)) + return false; + + /* form an index tuple and point it at the heap tuple */ + itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull); + itup->t_tid = *ht_ctid; + + _hash_doinsert(rel, itup, heapRel); + + pfree(itup); + + return false; +} + + +/* + * hashgettuple() -- Get the next tuple in the scan. + */ +bool +hashgettuple(IndexScanDesc scan, ScanDirection dir) +{ + HashScanOpaque so = (HashScanOpaque) scan->opaque; + bool res; + + /* Hash indexes are always lossy since we store only the hash code */ + scan->xs_recheck = true; + + /* + * If we've already initialized this scan, we can just advance it in the + * appropriate direction. If we haven't done so yet, we call a routine to + * get the first item in the scan. + */ + if (!HashScanPosIsValid(so->currPos)) + res = _hash_first(scan, dir); + else + { + /* + * Check to see if we should kill the previously-fetched tuple. + */ + if (scan->kill_prior_tuple) + { + /* + * Yes, so remember it for later. (We'll deal with all such tuples + * at once right after leaving the index page or at end of scan.) + * In case if caller reverses the indexscan direction it is quite + * possible that the same item might get entered multiple times. + * But, we don't detect that; instead, we just forget any excess + * entries. + */ + if (so->killedItems == NULL) + so->killedItems = (int *) + palloc(MaxIndexTuplesPerPage * sizeof(int)); + + if (so->numKilled < MaxIndexTuplesPerPage) + so->killedItems[so->numKilled++] = so->currPos.itemIndex; + } + + /* + * Now continue the scan. + */ + res = _hash_next(scan, dir); + } + + return res; +} + + +/* + * hashgetbitmap() -- get all tuples at once + */ +int64 +hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) +{ + HashScanOpaque so = (HashScanOpaque) scan->opaque; + bool res; + int64 ntids = 0; + HashScanPosItem *currItem; + + res = _hash_first(scan, ForwardScanDirection); + + while (res) + { + currItem = &so->currPos.items[so->currPos.itemIndex]; + + /* + * _hash_first and _hash_next handle eliminate dead index entries + * whenever scan->ignore_killed_tuples is true. Therefore, there's + * nothing to do here except add the results to the TIDBitmap. 
+ */ + tbm_add_tuples(tbm, &(currItem->heapTid), 1, true); + ntids++; + + res = _hash_next(scan, ForwardScanDirection); + } + + return ntids; +} + + +/* + * hashbeginscan() -- start a scan on a hash index + */ +IndexScanDesc +hashbeginscan(Relation rel, int nkeys, int norderbys) +{ + IndexScanDesc scan; + HashScanOpaque so; + + /* no order by operators allowed */ + Assert(norderbys == 0); + + scan = RelationGetIndexScan(rel, nkeys, norderbys); + + so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData)); + HashScanPosInvalidate(so->currPos); + so->hashso_bucket_buf = InvalidBuffer; + so->hashso_split_bucket_buf = InvalidBuffer; + + so->hashso_buc_populated = false; + so->hashso_buc_split = false; + + so->killedItems = NULL; + so->numKilled = 0; + + scan->opaque = so; + + return scan; +} + +/* + * hashrescan() -- rescan an index relation + */ +void +hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, + ScanKey orderbys, int norderbys) +{ + HashScanOpaque so = (HashScanOpaque) scan->opaque; + Relation rel = scan->indexRelation; + + if (HashScanPosIsValid(so->currPos)) + { + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0) + _hash_kill_items(scan); + } + + _hash_dropscanbuf(rel, so); + + /* set position invalid (this will cause _hash_first call) */ + HashScanPosInvalidate(so->currPos); + + /* Update scan key, if a new one is given */ + if (scankey && scan->numberOfKeys > 0) + { + memmove(scan->keyData, + scankey, + scan->numberOfKeys * sizeof(ScanKeyData)); + } + + so->hashso_buc_populated = false; + so->hashso_buc_split = false; +} + +/* + * hashendscan() -- close down a scan + */ +void +hashendscan(IndexScanDesc scan) +{ + HashScanOpaque so = (HashScanOpaque) scan->opaque; + Relation rel = scan->indexRelation; + + if (HashScanPosIsValid(so->currPos)) + { + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0) + _hash_kill_items(scan); + } + + _hash_dropscanbuf(rel, so); + + if (so->killedItems != NULL) + pfree(so->killedItems); + pfree(so); + scan->opaque = NULL; +} + +/* + * Bulk deletion of all index entries pointing to a set of heap tuples. + * The set of target tuples is specified via a callback routine that tells + * whether any given heap tuple (identified by ItemPointer) is being deleted. + * + * This function also deletes the tuples that are moved by split to other + * bucket. + * + * Result: a palloc'd struct containing statistical info for VACUUM displays. + */ +IndexBulkDeleteResult * +hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, void *callback_state) +{ + Relation rel = info->index; + double tuples_removed; + double num_index_tuples; + double orig_ntuples; + Bucket orig_maxbucket; + Bucket cur_maxbucket; + Bucket cur_bucket; + Buffer metabuf = InvalidBuffer; + HashMetaPage metap; + HashMetaPage cachedmetap; + + tuples_removed = 0; + num_index_tuples = 0; + + /* + * We need a copy of the metapage so that we can use its hashm_spares[] + * values to compute bucket page addresses, but a cached copy should be + * good enough. (If not, we'll detect that further down and refresh the + * cache as necessary.) 
+ */ + cachedmetap = _hash_getcachedmetap(rel, &metabuf, false); + Assert(cachedmetap != NULL); + + orig_maxbucket = cachedmetap->hashm_maxbucket; + orig_ntuples = cachedmetap->hashm_ntuples; + + /* Scan the buckets that we know exist */ + cur_bucket = 0; + cur_maxbucket = orig_maxbucket; + +loop_top: + while (cur_bucket <= cur_maxbucket) + { + BlockNumber bucket_blkno; + BlockNumber blkno; + Buffer bucket_buf; + Buffer buf; + HashPageOpaque bucket_opaque; + Page page; + bool split_cleanup = false; + + /* Get address of bucket's start page */ + bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket); + + blkno = bucket_blkno; + + /* + * We need to acquire a cleanup lock on the primary bucket page to out + * wait concurrent scans before deleting the dead tuples. + */ + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); + LockBufferForCleanup(buf); + _hash_checkpage(rel, buf, LH_BUCKET_PAGE); + + page = BufferGetPage(buf); + bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); + + /* + * If the bucket contains tuples that are moved by split, then we need + * to delete such tuples. We can't delete such tuples if the split + * operation on bucket is not finished as those are needed by scans. + */ + if (!H_BUCKET_BEING_SPLIT(bucket_opaque) && + H_NEEDS_SPLIT_CLEANUP(bucket_opaque)) + { + split_cleanup = true; + + /* + * This bucket might have been split since we last held a lock on + * the metapage. If so, hashm_maxbucket, hashm_highmask and + * hashm_lowmask might be old enough to cause us to fail to remove + * tuples left behind by the most recent split. To prevent that, + * now that the primary page of the target bucket has been locked + * (and thus can't be further split), check whether we need to + * update our cached metapage data. + */ + Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber); + if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket) + { + cachedmetap = _hash_getcachedmetap(rel, &metabuf, true); + Assert(cachedmetap != NULL); + } + } + + bucket_buf = buf; + + hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy, + cachedmetap->hashm_maxbucket, + cachedmetap->hashm_highmask, + cachedmetap->hashm_lowmask, &tuples_removed, + &num_index_tuples, split_cleanup, + callback, callback_state); + + _hash_dropbuf(rel, bucket_buf); + + /* Advance to next bucket */ + cur_bucket++; + } + + if (BufferIsInvalid(metabuf)) + metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE); + + /* Write-lock metapage and check for split since we started */ + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + metap = HashPageGetMeta(BufferGetPage(metabuf)); + + if (cur_maxbucket != metap->hashm_maxbucket) + { + /* There's been a split, so process the additional bucket(s) */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + cachedmetap = _hash_getcachedmetap(rel, &metabuf, true); + Assert(cachedmetap != NULL); + cur_maxbucket = cachedmetap->hashm_maxbucket; + goto loop_top; + } + + /* Okay, we're really done. Update tuple count in metapage. */ + START_CRIT_SECTION(); + + if (orig_maxbucket == metap->hashm_maxbucket && + orig_ntuples == metap->hashm_ntuples) + { + /* + * No one has split or inserted anything since start of scan, so + * believe our count as gospel. + */ + metap->hashm_ntuples = num_index_tuples; + } + else + { + /* + * Otherwise, our count is untrustworthy since we may have + * double-scanned tuples in split buckets. Proceed by dead-reckoning. 
+ * (Note: we still return estimated_count = false, because using this + * count is better than not updating reltuples at all.) + */ + if (metap->hashm_ntuples > tuples_removed) + metap->hashm_ntuples -= tuples_removed; + else + metap->hashm_ntuples = 0; + num_index_tuples = metap->hashm_ntuples; + } + + MarkBufferDirty(metabuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_hash_update_meta_page xlrec; + XLogRecPtr recptr; + + xlrec.ntuples = metap->hashm_ntuples; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage); + + XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE); + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + END_CRIT_SECTION(); + + _hash_relbuf(rel, metabuf); + + /* return statistics */ + if (stats == NULL) + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + stats->estimated_count = false; + stats->num_index_tuples = num_index_tuples; + stats->tuples_removed += tuples_removed; + /* hashvacuumcleanup will fill in num_pages */ + + return stats; +} + +/* + * Post-VACUUM cleanup. + * + * Result: a palloc'd struct containing statistical info for VACUUM displays. + */ +IndexBulkDeleteResult * +hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) +{ + Relation rel = info->index; + BlockNumber num_pages; + + /* If hashbulkdelete wasn't called, return NULL signifying no change */ + /* Note: this covers the analyze_only case too */ + if (stats == NULL) + return NULL; + + /* update statistics */ + num_pages = RelationGetNumberOfBlocks(rel); + stats->num_pages = num_pages; + + return stats; +} + +/* + * Helper function to perform deletion of index entries from a bucket. + * + * This function expects that the caller has acquired a cleanup lock on the + * primary bucket page, and will return with a write lock again held on the + * primary bucket page. The lock won't necessarily be held continuously, + * though, because we'll release it when visiting overflow pages. + * + * There can't be any concurrent scans in progress when we first enter this + * function because of the cleanup lock we hold on the primary bucket page, + * but as soon as we release that lock, there might be. If those scans got + * ahead of our cleanup scan, they might see a tuple before we kill it and + * wake up only after VACUUM has completed and the TID has been recycled for + * an unrelated tuple. To avoid that calamity, we prevent scans from passing + * our cleanup scan by locking the next page in the bucket chain before + * releasing the lock on the previous page. (This type of lock chaining is not + * ideal, so we might want to look for a better solution at some point.) + * + * We need to retain a pin on the primary bucket to ensure that no concurrent + * split can start. 
+ */ +void +hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, + BlockNumber bucket_blkno, BufferAccessStrategy bstrategy, + uint32 maxbucket, uint32 highmask, uint32 lowmask, + double *tuples_removed, double *num_index_tuples, + bool split_cleanup, + IndexBulkDeleteCallback callback, void *callback_state) +{ + BlockNumber blkno; + Buffer buf; + Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket; + bool bucket_dirty = false; + + blkno = bucket_blkno; + buf = bucket_buf; + + if (split_cleanup) + new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket, + lowmask, maxbucket); + + /* Scan each page in bucket */ + for (;;) + { + HashPageOpaque opaque; + OffsetNumber offno; + OffsetNumber maxoffno; + Buffer next_buf; + Page page; + OffsetNumber deletable[MaxOffsetNumber]; + int ndeletable = 0; + bool retain_pin = false; + bool clear_dead_marking = false; + + vacuum_delay_point(); + + page = BufferGetPage(buf); + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + + /* Scan each tuple in page */ + maxoffno = PageGetMaxOffsetNumber(page); + for (offno = FirstOffsetNumber; + offno <= maxoffno; + offno = OffsetNumberNext(offno)) + { + ItemPointer htup; + IndexTuple itup; + Bucket bucket; + bool kill_tuple = false; + + itup = (IndexTuple) PageGetItem(page, + PageGetItemId(page, offno)); + htup = &(itup->t_tid); + + /* + * To remove the dead tuples, we strictly want to rely on results + * of callback function. refer btvacuumpage for detailed reason. + */ + if (callback && callback(htup, callback_state)) + { + kill_tuple = true; + if (tuples_removed) + *tuples_removed += 1; + } + else if (split_cleanup) + { + /* delete the tuples that are moved by split. */ + bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), + maxbucket, + highmask, + lowmask); + /* mark the item for deletion */ + if (bucket != cur_bucket) + { + /* + * We expect tuples to either belong to current bucket or + * new_bucket. This is ensured because we don't allow + * further splits from bucket that contains garbage. See + * comments in _hash_expandtable. + */ + Assert(bucket == new_bucket); + kill_tuple = true; + } + } + + if (kill_tuple) + { + /* mark the item for deletion */ + deletable[ndeletable++] = offno; + } + else + { + /* we're keeping it, so count it */ + if (num_index_tuples) + *num_index_tuples += 1; + } + } + + /* retain the pin on primary bucket page till end of bucket scan */ + if (blkno == bucket_blkno) + retain_pin = true; + else + retain_pin = false; + + blkno = opaque->hasho_nextblkno; + + /* + * Apply deletions, advance to next page and write page if needed. + */ + if (ndeletable > 0) + { + /* No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + PageIndexMultiDelete(page, deletable, ndeletable); + bucket_dirty = true; + + /* + * Let us mark the page as clean if vacuum removes the DEAD tuples + * from an index page. We do this by clearing + * LH_PAGE_HAS_DEAD_TUPLES flag. + */ + if (tuples_removed && *tuples_removed > 0 && + H_HAS_DEAD_TUPLES(opaque)) + { + opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; + clear_dead_marking = true; + } + + MarkBufferDirty(buf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_hash_delete xlrec; + XLogRecPtr recptr; + + xlrec.clear_dead_marking = clear_dead_marking; + xlrec.is_primary_bucket_page = (buf == bucket_buf) ? 
true : false; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashDelete); + + /* + * bucket buffer needs to be registered to ensure that we can + * acquire a cleanup lock on it during replay. + */ + if (!xlrec.is_primary_bucket_page) + XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE); + + XLogRegisterBuffer(1, buf, REGBUF_STANDARD); + XLogRegisterBufData(1, (char *) deletable, + ndeletable * sizeof(OffsetNumber)); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE); + PageSetLSN(BufferGetPage(buf), recptr); + } + + END_CRIT_SECTION(); + } + + /* bail out if there are no more pages to scan. */ + if (!BlockNumberIsValid(blkno)) + break; + + next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE, + LH_OVERFLOW_PAGE, + bstrategy); + + /* + * release the lock on previous page after acquiring the lock on next + * page + */ + if (retain_pin) + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + else + _hash_relbuf(rel, buf); + + buf = next_buf; + } + + /* + * lock the bucket page to clear the garbage flag and squeeze the bucket. + * if the current buffer is same as bucket buffer, then we already have + * lock on bucket page. + */ + if (buf != bucket_buf) + { + _hash_relbuf(rel, buf); + LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE); + } + + /* + * Clear the garbage flag from bucket after deleting the tuples that are + * moved by split. We purposefully clear the flag before squeeze bucket, + * so that after restart, vacuum shouldn't again try to delete the moved + * by split tuples. + */ + if (split_cleanup) + { + HashPageOpaque bucket_opaque; + Page page; + + page = BufferGetPage(bucket_buf); + bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); + + /* No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP; + MarkBufferDirty(bucket_buf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + + XLogBeginInsert(); + XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP); + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + } + + /* + * If we have deleted anything, try to compact free space. For squeezing + * the bucket, we must have a cleanup lock, else it can impact the + * ordering of tuples for a scan that has started before it. + */ + if (bucket_dirty && IsBufferCleanupOK(bucket_buf)) + _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf, + bstrategy); + else + LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK); +} diff --git a/src/backend/access/hash/hash_xlog.c b/src/backend/access/hash/hash_xlog.c new file mode 100644 index 0000000..af35a99 --- /dev/null +++ b/src/backend/access/hash/hash_xlog.c @@ -0,0 +1,1145 @@ +/*------------------------------------------------------------------------- + * + * hash_xlog.c + * WAL replay logic for hash index. 
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/hash/hash_xlog.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/bufmask.h" +#include "access/hash.h" +#include "access/hash_xlog.h" +#include "access/transam.h" +#include "access/xlog.h" +#include "access/xlogutils.h" +#include "miscadmin.h" +#include "storage/procarray.h" + +/* + * replay a hash index meta page + */ +static void +hash_xlog_init_meta_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + Page page; + Buffer metabuf; + ForkNumber forknum; + + xl_hash_init_meta_page *xlrec = (xl_hash_init_meta_page *) XLogRecGetData(record); + + /* create the index' metapage */ + metabuf = XLogInitBufferForRedo(record, 0); + Assert(BufferIsValid(metabuf)); + _hash_init_metabuffer(metabuf, xlrec->num_tuples, xlrec->procid, + xlrec->ffactor, true); + page = (Page) BufferGetPage(metabuf); + PageSetLSN(page, lsn); + MarkBufferDirty(metabuf); + + /* + * Force the on-disk state of init forks to always be in sync with the + * state in shared buffers. See XLogReadBufferForRedoExtended. We need + * special handling for init forks as create index operations don't log a + * full page image of the metapage. + */ + XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL); + if (forknum == INIT_FORKNUM) + FlushOneBuffer(metabuf); + + /* all done */ + UnlockReleaseBuffer(metabuf); +} + +/* + * replay a hash index bitmap page + */ +static void +hash_xlog_init_bitmap_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + Buffer bitmapbuf; + Buffer metabuf; + Page page; + HashMetaPage metap; + uint32 num_buckets; + ForkNumber forknum; + + xl_hash_init_bitmap_page *xlrec = (xl_hash_init_bitmap_page *) XLogRecGetData(record); + + /* + * Initialize bitmap page + */ + bitmapbuf = XLogInitBufferForRedo(record, 0); + _hash_initbitmapbuffer(bitmapbuf, xlrec->bmsize, true); + PageSetLSN(BufferGetPage(bitmapbuf), lsn); + MarkBufferDirty(bitmapbuf); + + /* + * Force the on-disk state of init forks to always be in sync with the + * state in shared buffers. See XLogReadBufferForRedoExtended. We need + * special handling for init forks as create index operations don't log a + * full page image of the metapage. + */ + XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL); + if (forknum == INIT_FORKNUM) + FlushOneBuffer(bitmapbuf); + UnlockReleaseBuffer(bitmapbuf); + + /* add the new bitmap page to the metapage's list of bitmaps */ + if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO) + { + /* + * Note: in normal operation, we'd update the metapage while still + * holding lock on the bitmap page. But during replay it's not + * necessary to hold that lock, since nobody can see it yet; the + * creating transaction hasn't yet committed. 
+ */ + page = BufferGetPage(metabuf); + metap = HashPageGetMeta(page); + + num_buckets = metap->hashm_maxbucket + 1; + metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1; + metap->hashm_nmaps++; + + PageSetLSN(page, lsn); + MarkBufferDirty(metabuf); + + XLogRecGetBlockTag(record, 1, NULL, &forknum, NULL); + if (forknum == INIT_FORKNUM) + FlushOneBuffer(metabuf); + } + if (BufferIsValid(metabuf)) + UnlockReleaseBuffer(metabuf); +} + +/* + * replay a hash index insert without split + */ +static void +hash_xlog_insert(XLogReaderState *record) +{ + HashMetaPage metap; + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_insert *xlrec = (xl_hash_insert *) XLogRecGetData(record); + Buffer buffer; + Page page; + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + Size datalen; + char *datapos = XLogRecGetBlockData(record, 0, &datalen); + + page = BufferGetPage(buffer); + + if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum, + false, false) == InvalidOffsetNumber) + elog(PANIC, "hash_xlog_insert: failed to add item"); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) + { + /* + * Note: in normal operation, we'd update the metapage while still + * holding lock on the page we inserted into. But during replay it's + * not necessary to hold that lock, since no other index updates can + * be happening concurrently. + */ + page = BufferGetPage(buffer); + metap = HashPageGetMeta(page); + metap->hashm_ntuples += 1; + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +/* + * replay addition of overflow page for hash index + */ +static void +hash_xlog_add_ovfl_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_add_ovfl_page *xlrec = (xl_hash_add_ovfl_page *) XLogRecGetData(record); + Buffer leftbuf; + Buffer ovflbuf; + Buffer metabuf; + BlockNumber leftblk; + BlockNumber rightblk; + BlockNumber newmapblk = InvalidBlockNumber; + Page ovflpage; + HashPageOpaque ovflopaque; + uint32 *num_bucket; + char *data; + Size datalen PG_USED_FOR_ASSERTS_ONLY; + bool new_bmpage = false; + + XLogRecGetBlockTag(record, 0, NULL, NULL, &rightblk); + XLogRecGetBlockTag(record, 1, NULL, NULL, &leftblk); + + ovflbuf = XLogInitBufferForRedo(record, 0); + Assert(BufferIsValid(ovflbuf)); + + data = XLogRecGetBlockData(record, 0, &datalen); + num_bucket = (uint32 *) data; + Assert(datalen == sizeof(uint32)); + _hash_initbuf(ovflbuf, InvalidBlockNumber, *num_bucket, LH_OVERFLOW_PAGE, + true); + /* update backlink */ + ovflpage = BufferGetPage(ovflbuf); + ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); + ovflopaque->hasho_prevblkno = leftblk; + + PageSetLSN(ovflpage, lsn); + MarkBufferDirty(ovflbuf); + + if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO) + { + Page leftpage; + HashPageOpaque leftopaque; + + leftpage = BufferGetPage(leftbuf); + leftopaque = (HashPageOpaque) PageGetSpecialPointer(leftpage); + leftopaque->hasho_nextblkno = rightblk; + + PageSetLSN(leftpage, lsn); + MarkBufferDirty(leftbuf); + } + + if (BufferIsValid(leftbuf)) + UnlockReleaseBuffer(leftbuf); + UnlockReleaseBuffer(ovflbuf); + + /* + * Note: in normal operation, we'd update the bitmap and meta page while + * still holding lock on the overflow pages. 
But during replay it's not + * necessary to hold those locks, since no other index updates can be + * happening concurrently. + */ + if (XLogRecHasBlockRef(record, 2)) + { + Buffer mapbuffer; + + if (XLogReadBufferForRedo(record, 2, &mapbuffer) == BLK_NEEDS_REDO) + { + Page mappage = (Page) BufferGetPage(mapbuffer); + uint32 *freep = NULL; + char *data; + uint32 *bitmap_page_bit; + + freep = HashPageGetBitmap(mappage); + + data = XLogRecGetBlockData(record, 2, &datalen); + bitmap_page_bit = (uint32 *) data; + + SETBIT(freep, *bitmap_page_bit); + + PageSetLSN(mappage, lsn); + MarkBufferDirty(mapbuffer); + } + if (BufferIsValid(mapbuffer)) + UnlockReleaseBuffer(mapbuffer); + } + + if (XLogRecHasBlockRef(record, 3)) + { + Buffer newmapbuf; + + newmapbuf = XLogInitBufferForRedo(record, 3); + + _hash_initbitmapbuffer(newmapbuf, xlrec->bmsize, true); + + new_bmpage = true; + newmapblk = BufferGetBlockNumber(newmapbuf); + + MarkBufferDirty(newmapbuf); + PageSetLSN(BufferGetPage(newmapbuf), lsn); + + UnlockReleaseBuffer(newmapbuf); + } + + if (XLogReadBufferForRedo(record, 4, &metabuf) == BLK_NEEDS_REDO) + { + HashMetaPage metap; + Page page; + uint32 *firstfree_ovflpage; + + data = XLogRecGetBlockData(record, 4, &datalen); + firstfree_ovflpage = (uint32 *) data; + + page = BufferGetPage(metabuf); + metap = HashPageGetMeta(page); + metap->hashm_firstfree = *firstfree_ovflpage; + + if (!xlrec->bmpage_found) + { + metap->hashm_spares[metap->hashm_ovflpoint]++; + + if (new_bmpage) + { + Assert(BlockNumberIsValid(newmapblk)); + + metap->hashm_mapp[metap->hashm_nmaps] = newmapblk; + metap->hashm_nmaps++; + metap->hashm_spares[metap->hashm_ovflpoint]++; + } + } + + PageSetLSN(page, lsn); + MarkBufferDirty(metabuf); + } + if (BufferIsValid(metabuf)) + UnlockReleaseBuffer(metabuf); +} + +/* + * replay allocation of page for split operation + */ +static void +hash_xlog_split_allocate_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_split_allocate_page *xlrec = (xl_hash_split_allocate_page *) XLogRecGetData(record); + Buffer oldbuf; + Buffer newbuf; + Buffer metabuf; + Size datalen PG_USED_FOR_ASSERTS_ONLY; + char *data; + XLogRedoAction action; + + /* + * To be consistent with normal operation, here we take cleanup locks on + * both the old and new buckets even though there can't be any concurrent + * inserts. + */ + + /* replay the record for old bucket */ + action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &oldbuf); + + /* + * Note that we still update the page even if it was restored from a full + * page image, because the special space is not included in the image. + */ + if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) + { + Page oldpage; + HashPageOpaque oldopaque; + + oldpage = BufferGetPage(oldbuf); + oldopaque = (HashPageOpaque) PageGetSpecialPointer(oldpage); + + oldopaque->hasho_flag = xlrec->old_bucket_flag; + oldopaque->hasho_prevblkno = xlrec->new_bucket; + + PageSetLSN(oldpage, lsn); + MarkBufferDirty(oldbuf); + } + + /* replay the record for new bucket */ + newbuf = XLogInitBufferForRedo(record, 1); + _hash_initbuf(newbuf, xlrec->new_bucket, xlrec->new_bucket, + xlrec->new_bucket_flag, true); + if (!IsBufferCleanupOK(newbuf)) + elog(PANIC, "hash_xlog_split_allocate_page: failed to acquire cleanup lock"); + MarkBufferDirty(newbuf); + PageSetLSN(BufferGetPage(newbuf), lsn); + + /* + * We can release the lock on old bucket early as well but doing here to + * consistent with normal operation. 
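/*
 * A minimal sketch (the function name is illustrative, not from the upstream
 * sources) of how the small payloads attached to a registered buffer with
 * XLogRegisterBufData() at insert time are read back by the redo branches
 * above, for example the bucket number carried with block 0 or the bitmap
 * bit carried with block 2 of an ADD_OVFL_PAGE record.
 */
static uint32
hash_xlog_example_block_payload(XLogReaderState *record, uint8 block_id)
{
	Size		len;
	char	   *payload = XLogRecGetBlockData(record, block_id, &len);
	uint32		value;

	Assert(len == sizeof(uint32));
	memcpy(&value, payload, sizeof(uint32));
	return value;
}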
+ */ + if (BufferIsValid(oldbuf)) + UnlockReleaseBuffer(oldbuf); + if (BufferIsValid(newbuf)) + UnlockReleaseBuffer(newbuf); + + /* + * Note: in normal operation, we'd update the meta page while still + * holding lock on the old and new bucket pages. But during replay it's + * not necessary to hold those locks, since no other bucket splits can be + * happening concurrently. + */ + + /* replay the record for metapage changes */ + if (XLogReadBufferForRedo(record, 2, &metabuf) == BLK_NEEDS_REDO) + { + Page page; + HashMetaPage metap; + + page = BufferGetPage(metabuf); + metap = HashPageGetMeta(page); + metap->hashm_maxbucket = xlrec->new_bucket; + + data = XLogRecGetBlockData(record, 2, &datalen); + + if (xlrec->flags & XLH_SPLIT_META_UPDATE_MASKS) + { + uint32 lowmask; + uint32 *highmask; + + /* extract low and high masks. */ + memcpy(&lowmask, data, sizeof(uint32)); + highmask = (uint32 *) ((char *) data + sizeof(uint32)); + + /* update metapage */ + metap->hashm_lowmask = lowmask; + metap->hashm_highmask = *highmask; + + data += sizeof(uint32) * 2; + } + + if (xlrec->flags & XLH_SPLIT_META_UPDATE_SPLITPOINT) + { + uint32 ovflpoint; + uint32 *ovflpages; + + /* extract information of overflow pages. */ + memcpy(&ovflpoint, data, sizeof(uint32)); + ovflpages = (uint32 *) ((char *) data + sizeof(uint32)); + + /* update metapage */ + metap->hashm_spares[ovflpoint] = *ovflpages; + metap->hashm_ovflpoint = ovflpoint; + } + + MarkBufferDirty(metabuf); + PageSetLSN(BufferGetPage(metabuf), lsn); + } + + if (BufferIsValid(metabuf)) + UnlockReleaseBuffer(metabuf); +} + +/* + * replay of split operation + */ +static void +hash_xlog_split_page(XLogReaderState *record) +{ + Buffer buf; + + if (XLogReadBufferForRedo(record, 0, &buf) != BLK_RESTORED) + elog(ERROR, "Hash split record did not contain a full-page image"); + + UnlockReleaseBuffer(buf); +} + +/* + * replay completion of split operation + */ +static void +hash_xlog_split_complete(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_split_complete *xlrec = (xl_hash_split_complete *) XLogRecGetData(record); + Buffer oldbuf; + Buffer newbuf; + XLogRedoAction action; + + /* replay the record for old bucket */ + action = XLogReadBufferForRedo(record, 0, &oldbuf); + + /* + * Note that we still update the page even if it was restored from a full + * page image, because the bucket flag is not included in the image. + */ + if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) + { + Page oldpage; + HashPageOpaque oldopaque; + + oldpage = BufferGetPage(oldbuf); + oldopaque = (HashPageOpaque) PageGetSpecialPointer(oldpage); + + oldopaque->hasho_flag = xlrec->old_bucket_flag; + + PageSetLSN(oldpage, lsn); + MarkBufferDirty(oldbuf); + } + if (BufferIsValid(oldbuf)) + UnlockReleaseBuffer(oldbuf); + + /* replay the record for new bucket */ + action = XLogReadBufferForRedo(record, 1, &newbuf); + + /* + * Note that we still update the page even if it was restored from a full + * page image, because the bucket flag is not included in the image. 
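/*
 * The lowmask/highmask update replayed above matters because of the way a
 * hash value is mapped to a bucket.  A sketch of that mapping, essentially
 * what _hash_hashkey2bucket() in hashutil.c (not shown in this hunk) does;
 * the function name here is illustrative, worked numbers are in the comment.
 */
static Bucket
hash_example_hashkey_to_bucket(uint32 hashkey, Bucket maxbucket,
							   uint32 highmask, uint32 lowmask)
{
	Bucket		bucket = hashkey & highmask;

	/*
	 * A bucket that does not exist yet is still served by the bucket it will
	 * eventually be split from.  Example: maxbucket = 5, highmask = 7,
	 * lowmask = 3; a hash key ending in binary 110 first maps to 6, which is
	 * beyond maxbucket, so it folds back to bucket 2 (6 & 3).
	 */
	if (bucket > maxbucket)
		bucket = bucket & lowmask;

	return bucket;
}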
+ */ + if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) + { + Page newpage; + HashPageOpaque nopaque; + + newpage = BufferGetPage(newbuf); + nopaque = (HashPageOpaque) PageGetSpecialPointer(newpage); + + nopaque->hasho_flag = xlrec->new_bucket_flag; + + PageSetLSN(newpage, lsn); + MarkBufferDirty(newbuf); + } + if (BufferIsValid(newbuf)) + UnlockReleaseBuffer(newbuf); +} + +/* + * replay move of page contents for squeeze operation of hash index + */ +static void +hash_xlog_move_page_contents(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_move_page_contents *xldata = (xl_hash_move_page_contents *) XLogRecGetData(record); + Buffer bucketbuf = InvalidBuffer; + Buffer writebuf = InvalidBuffer; + Buffer deletebuf = InvalidBuffer; + XLogRedoAction action; + + /* + * Ensure we have a cleanup lock on primary bucket page before we start + * with the actual replay operation. This is to ensure that neither a + * scan can start nor a scan can be already-in-progress during the replay + * of this operation. If we allow scans during this operation, then they + * can miss some records or show the same record multiple times. + */ + if (xldata->is_prim_bucket_same_wrt) + action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf); + else + { + /* + * we don't care for return value as the purpose of reading bucketbuf + * is to ensure a cleanup lock on primary bucket page. + */ + (void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf); + + action = XLogReadBufferForRedo(record, 1, &writebuf); + } + + /* replay the record for adding entries in overflow buffer */ + if (action == BLK_NEEDS_REDO) + { + Page writepage; + char *begin; + char *data; + Size datalen; + uint16 ninserted = 0; + + data = begin = XLogRecGetBlockData(record, 1, &datalen); + + writepage = (Page) BufferGetPage(writebuf); + + if (xldata->ntups > 0) + { + OffsetNumber *towrite = (OffsetNumber *) data; + + data += sizeof(OffsetNumber) * xldata->ntups; + + while (data - begin < datalen) + { + IndexTuple itup = (IndexTuple) data; + Size itemsz; + OffsetNumber l; + + itemsz = IndexTupleSize(itup); + itemsz = MAXALIGN(itemsz); + + data += itemsz; + + l = PageAddItem(writepage, (Item) itup, itemsz, towrite[ninserted], false, false); + if (l == InvalidOffsetNumber) + elog(ERROR, "hash_xlog_move_page_contents: failed to add item to hash index page, size %d bytes", + (int) itemsz); + + ninserted++; + } + } + + /* + * number of tuples inserted must be same as requested in REDO record. + */ + Assert(ninserted == xldata->ntups); + + PageSetLSN(writepage, lsn); + MarkBufferDirty(writebuf); + } + + /* replay the record for deleting entries from overflow buffer */ + if (XLogReadBufferForRedo(record, 2, &deletebuf) == BLK_NEEDS_REDO) + { + Page page; + char *ptr; + Size len; + + ptr = XLogRecGetBlockData(record, 2, &len); + + page = (Page) BufferGetPage(deletebuf); + + if (len > 0) + { + OffsetNumber *unused; + OffsetNumber *unend; + + unused = (OffsetNumber *) ptr; + unend = (OffsetNumber *) ((char *) ptr + len); + + if ((unend - unused) > 0) + PageIndexMultiDelete(page, unused, unend - unused); + } + + PageSetLSN(page, lsn); + MarkBufferDirty(deletebuf); + } + + /* + * Replay is complete, now we can release the buffers. We release locks at + * end of replay operation to ensure that we hold lock on primary bucket + * page till end of operation. 
We can optimize by releasing the lock on + * write buffer as soon as the operation for same is complete, if it is + * not same as primary bucket page, but that doesn't seem to be worth + * complicating the code. + */ + if (BufferIsValid(deletebuf)) + UnlockReleaseBuffer(deletebuf); + + if (BufferIsValid(writebuf)) + UnlockReleaseBuffer(writebuf); + + if (BufferIsValid(bucketbuf)) + UnlockReleaseBuffer(bucketbuf); +} + +/* + * replay squeeze page operation of hash index + */ +static void +hash_xlog_squeeze_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_squeeze_page *xldata = (xl_hash_squeeze_page *) XLogRecGetData(record); + Buffer bucketbuf = InvalidBuffer; + Buffer writebuf; + Buffer ovflbuf; + Buffer prevbuf = InvalidBuffer; + Buffer mapbuf; + XLogRedoAction action; + + /* + * Ensure we have a cleanup lock on primary bucket page before we start + * with the actual replay operation. This is to ensure that neither a + * scan can start nor a scan can be already-in-progress during the replay + * of this operation. If we allow scans during this operation, then they + * can miss some records or show the same record multiple times. + */ + if (xldata->is_prim_bucket_same_wrt) + action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf); + else + { + /* + * we don't care for return value as the purpose of reading bucketbuf + * is to ensure a cleanup lock on primary bucket page. + */ + (void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf); + + action = XLogReadBufferForRedo(record, 1, &writebuf); + } + + /* replay the record for adding entries in overflow buffer */ + if (action == BLK_NEEDS_REDO) + { + Page writepage; + char *begin; + char *data; + Size datalen; + uint16 ninserted = 0; + + data = begin = XLogRecGetBlockData(record, 1, &datalen); + + writepage = (Page) BufferGetPage(writebuf); + + if (xldata->ntups > 0) + { + OffsetNumber *towrite = (OffsetNumber *) data; + + data += sizeof(OffsetNumber) * xldata->ntups; + + while (data - begin < datalen) + { + IndexTuple itup = (IndexTuple) data; + Size itemsz; + OffsetNumber l; + + itemsz = IndexTupleSize(itup); + itemsz = MAXALIGN(itemsz); + + data += itemsz; + + l = PageAddItem(writepage, (Item) itup, itemsz, towrite[ninserted], false, false); + if (l == InvalidOffsetNumber) + elog(ERROR, "hash_xlog_squeeze_page: failed to add item to hash index page, size %d bytes", + (int) itemsz); + + ninserted++; + } + } + + /* + * number of tuples inserted must be same as requested in REDO record. + */ + Assert(ninserted == xldata->ntups); + + /* + * if the page on which are adding tuples is a page previous to freed + * overflow page, then update its nextblkno. 
+ */ + if (xldata->is_prev_bucket_same_wrt) + { + HashPageOpaque writeopaque = (HashPageOpaque) PageGetSpecialPointer(writepage); + + writeopaque->hasho_nextblkno = xldata->nextblkno; + } + + PageSetLSN(writepage, lsn); + MarkBufferDirty(writebuf); + } + + /* replay the record for initializing overflow buffer */ + if (XLogReadBufferForRedo(record, 2, &ovflbuf) == BLK_NEEDS_REDO) + { + Page ovflpage; + HashPageOpaque ovflopaque; + + ovflpage = BufferGetPage(ovflbuf); + + _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf)); + + ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); + + ovflopaque->hasho_prevblkno = InvalidBlockNumber; + ovflopaque->hasho_nextblkno = InvalidBlockNumber; + ovflopaque->hasho_bucket = -1; + ovflopaque->hasho_flag = LH_UNUSED_PAGE; + ovflopaque->hasho_page_id = HASHO_PAGE_ID; + + PageSetLSN(ovflpage, lsn); + MarkBufferDirty(ovflbuf); + } + if (BufferIsValid(ovflbuf)) + UnlockReleaseBuffer(ovflbuf); + + /* replay the record for page previous to the freed overflow page */ + if (!xldata->is_prev_bucket_same_wrt && + XLogReadBufferForRedo(record, 3, &prevbuf) == BLK_NEEDS_REDO) + { + Page prevpage = BufferGetPage(prevbuf); + HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage); + + prevopaque->hasho_nextblkno = xldata->nextblkno; + + PageSetLSN(prevpage, lsn); + MarkBufferDirty(prevbuf); + } + if (BufferIsValid(prevbuf)) + UnlockReleaseBuffer(prevbuf); + + /* replay the record for page next to the freed overflow page */ + if (XLogRecHasBlockRef(record, 4)) + { + Buffer nextbuf; + + if (XLogReadBufferForRedo(record, 4, &nextbuf) == BLK_NEEDS_REDO) + { + Page nextpage = BufferGetPage(nextbuf); + HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage); + + nextopaque->hasho_prevblkno = xldata->prevblkno; + + PageSetLSN(nextpage, lsn); + MarkBufferDirty(nextbuf); + } + if (BufferIsValid(nextbuf)) + UnlockReleaseBuffer(nextbuf); + } + + if (BufferIsValid(writebuf)) + UnlockReleaseBuffer(writebuf); + + if (BufferIsValid(bucketbuf)) + UnlockReleaseBuffer(bucketbuf); + + /* + * Note: in normal operation, we'd update the bitmap and meta page while + * still holding lock on the primary bucket page and overflow pages. But + * during replay it's not necessary to hold those locks, since no other + * index updates can be happening concurrently. 
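	 *
	 * For reference, the blocks of an XLOG_HASH_SQUEEZE_PAGE record as they
	 * are consumed by this routine:
	 *	0	primary bucket page, registered only so that replay can hold a
	 *		cleanup lock on it when it is not itself the write page
	 *	1	the "write" page receiving the moved tuples (its data is the
	 *		array of target offsets followed by the tuples themselves)
	 *	2	the freed overflow page, re-initialized as an unused page
	 *	3	the page preceding the freed page, when distinct from the write
	 *		page
	 *	4	the page following the freed page, if any
	 *	5	the bitmap page whose bit gets cleared (data: the bit number)
	 *	6	the metapage, present when hashm_firstfree changes (data: the
	 *		new first-free bit number)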
+ */ + /* replay the record for bitmap page */ + if (XLogReadBufferForRedo(record, 5, &mapbuf) == BLK_NEEDS_REDO) + { + Page mappage = (Page) BufferGetPage(mapbuf); + uint32 *freep = NULL; + char *data; + uint32 *bitmap_page_bit; + Size datalen; + + freep = HashPageGetBitmap(mappage); + + data = XLogRecGetBlockData(record, 5, &datalen); + bitmap_page_bit = (uint32 *) data; + + CLRBIT(freep, *bitmap_page_bit); + + PageSetLSN(mappage, lsn); + MarkBufferDirty(mapbuf); + } + if (BufferIsValid(mapbuf)) + UnlockReleaseBuffer(mapbuf); + + /* replay the record for meta page */ + if (XLogRecHasBlockRef(record, 6)) + { + Buffer metabuf; + + if (XLogReadBufferForRedo(record, 6, &metabuf) == BLK_NEEDS_REDO) + { + HashMetaPage metap; + Page page; + char *data; + uint32 *firstfree_ovflpage; + Size datalen; + + data = XLogRecGetBlockData(record, 6, &datalen); + firstfree_ovflpage = (uint32 *) data; + + page = BufferGetPage(metabuf); + metap = HashPageGetMeta(page); + metap->hashm_firstfree = *firstfree_ovflpage; + + PageSetLSN(page, lsn); + MarkBufferDirty(metabuf); + } + if (BufferIsValid(metabuf)) + UnlockReleaseBuffer(metabuf); + } +} + +/* + * replay delete operation of hash index + */ +static void +hash_xlog_delete(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_delete *xldata = (xl_hash_delete *) XLogRecGetData(record); + Buffer bucketbuf = InvalidBuffer; + Buffer deletebuf; + Page page; + XLogRedoAction action; + + /* + * Ensure we have a cleanup lock on primary bucket page before we start + * with the actual replay operation. This is to ensure that neither a + * scan can start nor a scan can be already-in-progress during the replay + * of this operation. If we allow scans during this operation, then they + * can miss some records or show the same record multiple times. + */ + if (xldata->is_primary_bucket_page) + action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &deletebuf); + else + { + /* + * we don't care for return value as the purpose of reading bucketbuf + * is to ensure a cleanup lock on primary bucket page. + */ + (void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf); + + action = XLogReadBufferForRedo(record, 1, &deletebuf); + } + + /* replay the record for deleting entries in bucket page */ + if (action == BLK_NEEDS_REDO) + { + char *ptr; + Size len; + + ptr = XLogRecGetBlockData(record, 1, &len); + + page = (Page) BufferGetPage(deletebuf); + + if (len > 0) + { + OffsetNumber *unused; + OffsetNumber *unend; + + unused = (OffsetNumber *) ptr; + unend = (OffsetNumber *) ((char *) ptr + len); + + if ((unend - unused) > 0) + PageIndexMultiDelete(page, unused, unend - unused); + } + + /* + * Mark the page as not containing any LP_DEAD items only if + * clear_dead_marking flag is set to true. See comments in + * hashbucketcleanup() for details. + */ + if (xldata->clear_dead_marking) + { + HashPageOpaque pageopaque; + + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); + pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; + } + + PageSetLSN(page, lsn); + MarkBufferDirty(deletebuf); + } + if (BufferIsValid(deletebuf)) + UnlockReleaseBuffer(deletebuf); + + if (BufferIsValid(bucketbuf)) + UnlockReleaseBuffer(bucketbuf); +} + +/* + * replay split cleanup flag operation for primary bucket page. 
+ */ +static void +hash_xlog_split_cleanup(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + Buffer buffer; + Page page; + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + HashPageOpaque bucket_opaque; + + page = (Page) BufferGetPage(buffer); + + bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); + bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP; + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +/* + * replay for update meta page + */ +static void +hash_xlog_update_meta_page(XLogReaderState *record) +{ + HashMetaPage metap; + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_update_meta_page *xldata = (xl_hash_update_meta_page *) XLogRecGetData(record); + Buffer metabuf; + Page page; + + if (XLogReadBufferForRedo(record, 0, &metabuf) == BLK_NEEDS_REDO) + { + page = BufferGetPage(metabuf); + metap = HashPageGetMeta(page); + + metap->hashm_ntuples = xldata->ntuples; + + PageSetLSN(page, lsn); + MarkBufferDirty(metabuf); + } + if (BufferIsValid(metabuf)) + UnlockReleaseBuffer(metabuf); +} + +/* + * replay delete operation in hash index to remove + * tuples marked as DEAD during index tuple insertion. + */ +static void +hash_xlog_vacuum_one_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_vacuum_one_page *xldata; + Buffer buffer; + Buffer metabuf; + Page page; + XLogRedoAction action; + HashPageOpaque pageopaque; + + xldata = (xl_hash_vacuum_one_page *) XLogRecGetData(record); + + /* + * If we have any conflict processing to do, it must happen before we + * update the page. + * + * Hash index records that are marked as LP_DEAD and being removed during + * hash index tuple insertion can conflict with standby queries. You might + * think that vacuum records would conflict as well, but we've handled + * that already. XLOG_HEAP2_PRUNE records provide the highest xid cleaned + * by the vacuum of the heap and so we can resolve any conflicts just once + * when that arrives. After that we know that no conflicts exist from + * individual hash index vacuum records on that index. + */ + if (InHotStandby) + { + RelFileNode rnode; + + XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL); + ResolveRecoveryConflictWithSnapshot(xldata->latestRemovedXid, rnode); + } + + action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer); + + if (action == BLK_NEEDS_REDO) + { + page = (Page) BufferGetPage(buffer); + + if (XLogRecGetDataLen(record) > SizeOfHashVacuumOnePage) + { + OffsetNumber *unused; + + unused = (OffsetNumber *) ((char *) xldata + SizeOfHashVacuumOnePage); + + PageIndexMultiDelete(page, unused, xldata->ntuples); + } + + /* + * Mark the page as not containing any LP_DEAD items. See comments in + * _hash_vacuum_one_page() for details. 
+ */ + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); + pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO) + { + Page metapage; + HashMetaPage metap; + + metapage = BufferGetPage(metabuf); + metap = HashPageGetMeta(metapage); + + metap->hashm_ntuples -= xldata->ntuples; + + PageSetLSN(metapage, lsn); + MarkBufferDirty(metabuf); + } + if (BufferIsValid(metabuf)) + UnlockReleaseBuffer(metabuf); +} + +void +hash_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_HASH_INIT_META_PAGE: + hash_xlog_init_meta_page(record); + break; + case XLOG_HASH_INIT_BITMAP_PAGE: + hash_xlog_init_bitmap_page(record); + break; + case XLOG_HASH_INSERT: + hash_xlog_insert(record); + break; + case XLOG_HASH_ADD_OVFL_PAGE: + hash_xlog_add_ovfl_page(record); + break; + case XLOG_HASH_SPLIT_ALLOCATE_PAGE: + hash_xlog_split_allocate_page(record); + break; + case XLOG_HASH_SPLIT_PAGE: + hash_xlog_split_page(record); + break; + case XLOG_HASH_SPLIT_COMPLETE: + hash_xlog_split_complete(record); + break; + case XLOG_HASH_MOVE_PAGE_CONTENTS: + hash_xlog_move_page_contents(record); + break; + case XLOG_HASH_SQUEEZE_PAGE: + hash_xlog_squeeze_page(record); + break; + case XLOG_HASH_DELETE: + hash_xlog_delete(record); + break; + case XLOG_HASH_SPLIT_CLEANUP: + hash_xlog_split_cleanup(record); + break; + case XLOG_HASH_UPDATE_META_PAGE: + hash_xlog_update_meta_page(record); + break; + case XLOG_HASH_VACUUM_ONE_PAGE: + hash_xlog_vacuum_one_page(record); + break; + default: + elog(PANIC, "hash_redo: unknown op code %u", info); + } +} + +/* + * Mask a hash page before performing consistency checks on it. + */ +void +hash_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + HashPageOpaque opaque; + int pagetype; + + mask_page_lsn_and_checksum(page); + + mask_page_hint_bits(page); + mask_unused_space(page); + + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + + pagetype = opaque->hasho_flag & LH_PAGE_TYPE; + if (pagetype == LH_UNUSED_PAGE) + { + /* + * Mask everything on a UNUSED page. + */ + mask_page_content(page); + } + else if (pagetype == LH_BUCKET_PAGE || + pagetype == LH_OVERFLOW_PAGE) + { + /* + * In hash bucket and overflow pages, it is possible to modify the + * LP_FLAGS without emitting any WAL record. Hence, mask the line + * pointer flags. See hashgettuple(), _hash_kill_items() for details. + */ + mask_lp_flags(page); + } + + /* + * It is possible that the hint bit LH_PAGE_HAS_DEAD_TUPLES may remain + * unlogged. So, mask it. See _hash_kill_items() for details. + */ + opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; +} diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c new file mode 100644 index 0000000..2423339 --- /dev/null +++ b/src/backend/access/hash/hashfunc.c @@ -0,0 +1,411 @@ +/*------------------------------------------------------------------------- + * + * hashfunc.c + * Support functions for hash access method. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/hash/hashfunc.c + * + * NOTES + * These functions are stored in pg_amproc. For each operator class + * defined for hash indexes, they compute the hash value of the argument. 
+ * + * Additional hash functions appear in /utils/adt/ files for various + * specialized datatypes. + * + * It is expected that every bit of a hash function's 32-bit result is + * as random as every other; failure to ensure this is likely to lead + * to poor performance of hash joins, for example. In most cases a hash + * function should use hash_any() or its variant hash_uint32(). + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/hash.h" +#include "catalog/pg_collation.h" +#include "common/hashfn.h" +#include "utils/builtins.h" +#include "utils/float.h" +#include "utils/pg_locale.h" + +/* + * Datatype-specific hash functions. + * + * These support both hash indexes and hash joins. + * + * NOTE: some of these are also used by catcache operations, without + * any direct connection to hash indexes. Also, the common hash_any + * routine is also used by dynahash tables. + */ + +/* Note: this is used for both "char" and boolean datatypes */ +Datum +hashchar(PG_FUNCTION_ARGS) +{ + return hash_uint32((int32) PG_GETARG_CHAR(0)); +} + +Datum +hashcharextended(PG_FUNCTION_ARGS) +{ + return hash_uint32_extended((int32) PG_GETARG_CHAR(0), PG_GETARG_INT64(1)); +} + +Datum +hashint2(PG_FUNCTION_ARGS) +{ + return hash_uint32((int32) PG_GETARG_INT16(0)); +} + +Datum +hashint2extended(PG_FUNCTION_ARGS) +{ + return hash_uint32_extended((int32) PG_GETARG_INT16(0), PG_GETARG_INT64(1)); +} + +Datum +hashint4(PG_FUNCTION_ARGS) +{ + return hash_uint32(PG_GETARG_INT32(0)); +} + +Datum +hashint4extended(PG_FUNCTION_ARGS) +{ + return hash_uint32_extended(PG_GETARG_INT32(0), PG_GETARG_INT64(1)); +} + +Datum +hashint8(PG_FUNCTION_ARGS) +{ + /* + * The idea here is to produce a hash value compatible with the values + * produced by hashint4 and hashint2 for logically equal inputs; this is + * necessary to support cross-type hash joins across these input types. + * Since all three types are signed, we can xor the high half of the int8 + * value if the sign is positive, or the complement of the high half when + * the sign is negative. + */ + int64 val = PG_GETARG_INT64(0); + uint32 lohalf = (uint32) val; + uint32 hihalf = (uint32) (val >> 32); + + lohalf ^= (val >= 0) ? hihalf : ~hihalf; + + return hash_uint32(lohalf); +} + +Datum +hashint8extended(PG_FUNCTION_ARGS) +{ + /* Same approach as hashint8 */ + int64 val = PG_GETARG_INT64(0); + uint32 lohalf = (uint32) val; + uint32 hihalf = (uint32) (val >> 32); + + lohalf ^= (val >= 0) ? hihalf : ~hihalf; + + return hash_uint32_extended(lohalf, PG_GETARG_INT64(1)); +} + +Datum +hashoid(PG_FUNCTION_ARGS) +{ + return hash_uint32((uint32) PG_GETARG_OID(0)); +} + +Datum +hashoidextended(PG_FUNCTION_ARGS) +{ + return hash_uint32_extended((uint32) PG_GETARG_OID(0), PG_GETARG_INT64(1)); +} + +Datum +hashenum(PG_FUNCTION_ARGS) +{ + return hash_uint32((uint32) PG_GETARG_OID(0)); +} + +Datum +hashenumextended(PG_FUNCTION_ARGS) +{ + return hash_uint32_extended((uint32) PG_GETARG_OID(0), PG_GETARG_INT64(1)); +} + +Datum +hashfloat4(PG_FUNCTION_ARGS) +{ + float4 key = PG_GETARG_FLOAT4(0); + float8 key8; + + /* + * On IEEE-float machines, minus zero and zero have different bit patterns + * but should compare as equal. 
We must ensure that they have the same + * hash value, which is most reliably done this way: + */ + if (key == (float4) 0) + PG_RETURN_UINT32(0); + + /* + * To support cross-type hashing of float8 and float4, we want to return + * the same hash value hashfloat8 would produce for an equal float8 value. + * So, widen the value to float8 and hash that. (We must do this rather + * than have hashfloat8 try to narrow its value to float4; that could fail + * on overflow.) + */ + key8 = key; + + /* + * Similarly, NaNs can have different bit patterns but they should all + * compare as equal. For backwards-compatibility reasons we force them to + * have the hash value of a standard float8 NaN. (You'd think we could + * replace key with a float4 NaN and then widen it; but on some old + * platforms, that way produces a different bit pattern.) + */ + if (isnan(key8)) + key8 = get_float8_nan(); + + return hash_any((unsigned char *) &key8, sizeof(key8)); +} + +Datum +hashfloat4extended(PG_FUNCTION_ARGS) +{ + float4 key = PG_GETARG_FLOAT4(0); + uint64 seed = PG_GETARG_INT64(1); + float8 key8; + + /* Same approach as hashfloat4 */ + if (key == (float4) 0) + PG_RETURN_UINT64(seed); + key8 = key; + if (isnan(key8)) + key8 = get_float8_nan(); + + return hash_any_extended((unsigned char *) &key8, sizeof(key8), seed); +} + +Datum +hashfloat8(PG_FUNCTION_ARGS) +{ + float8 key = PG_GETARG_FLOAT8(0); + + /* + * On IEEE-float machines, minus zero and zero have different bit patterns + * but should compare as equal. We must ensure that they have the same + * hash value, which is most reliably done this way: + */ + if (key == (float8) 0) + PG_RETURN_UINT32(0); + + /* + * Similarly, NaNs can have different bit patterns but they should all + * compare as equal. For backwards-compatibility reasons we force them to + * have the hash value of a standard NaN. 
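/*
 * A hypothetical self-check, not part of the upstream file, spelling out the
 * cross-type guarantees described in the comments above: an int8 value that
 * fits in int32 hashes like the corresponding int4, a float4 value hashes
 * like the float8 it widens to, and minus zero hashes like zero.
 */
static void
hash_example_crosstype_checks(void)
{
	/* hashint8 xors the high half away for values that fit in 32 bits */
	Assert(DatumGetUInt32(DirectFunctionCall1(hashint8, Int64GetDatum(-7))) ==
		   DatumGetUInt32(DirectFunctionCall1(hashint4, Int32GetDatum(-7))));

	/* hashfloat4 widens its input and hashes the resulting float8 bytes */
	Assert(DatumGetUInt32(DirectFunctionCall1(hashfloat4, Float4GetDatum(1.5f))) ==
		   DatumGetUInt32(DirectFunctionCall1(hashfloat8, Float8GetDatum(1.5))));

	/* both zeros are hashed to 0 before the bit pattern is ever examined */
	Assert(DatumGetUInt32(DirectFunctionCall1(hashfloat8, Float8GetDatum(-0.0))) == 0);
}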
+ */ + if (isnan(key)) + key = get_float8_nan(); + + return hash_any((unsigned char *) &key, sizeof(key)); +} + +Datum +hashfloat8extended(PG_FUNCTION_ARGS) +{ + float8 key = PG_GETARG_FLOAT8(0); + uint64 seed = PG_GETARG_INT64(1); + + /* Same approach as hashfloat8 */ + if (key == (float8) 0) + PG_RETURN_UINT64(seed); + if (isnan(key)) + key = get_float8_nan(); + + return hash_any_extended((unsigned char *) &key, sizeof(key), seed); +} + +Datum +hashoidvector(PG_FUNCTION_ARGS) +{ + oidvector *key = (oidvector *) PG_GETARG_POINTER(0); + + return hash_any((unsigned char *) key->values, key->dim1 * sizeof(Oid)); +} + +Datum +hashoidvectorextended(PG_FUNCTION_ARGS) +{ + oidvector *key = (oidvector *) PG_GETARG_POINTER(0); + + return hash_any_extended((unsigned char *) key->values, + key->dim1 * sizeof(Oid), + PG_GETARG_INT64(1)); +} + +Datum +hashname(PG_FUNCTION_ARGS) +{ + char *key = NameStr(*PG_GETARG_NAME(0)); + + return hash_any((unsigned char *) key, strlen(key)); +} + +Datum +hashnameextended(PG_FUNCTION_ARGS) +{ + char *key = NameStr(*PG_GETARG_NAME(0)); + + return hash_any_extended((unsigned char *) key, strlen(key), + PG_GETARG_INT64(1)); +} + +Datum +hashtext(PG_FUNCTION_ARGS) +{ + text *key = PG_GETARG_TEXT_PP(0); + Oid collid = PG_GET_COLLATION(); + pg_locale_t mylocale = 0; + Datum result; + + if (!collid) + ereport(ERROR, + (errcode(ERRCODE_INDETERMINATE_COLLATION), + errmsg("could not determine which collation to use for string hashing"), + errhint("Use the COLLATE clause to set the collation explicitly."))); + + if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID) + mylocale = pg_newlocale_from_collation(collid); + + if (!mylocale || mylocale->deterministic) + { + result = hash_any((unsigned char *) VARDATA_ANY(key), + VARSIZE_ANY_EXHDR(key)); + } + else + { +#ifdef USE_ICU + if (mylocale->provider == COLLPROVIDER_ICU) + { + int32_t ulen = -1; + UChar *uchar = NULL; + Size bsize; + uint8_t *buf; + + ulen = icu_to_uchar(&uchar, VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key)); + + bsize = ucol_getSortKey(mylocale->info.icu.ucol, + uchar, ulen, NULL, 0); + buf = palloc(bsize); + ucol_getSortKey(mylocale->info.icu.ucol, + uchar, ulen, buf, bsize); + + result = hash_any(buf, bsize); + + pfree(buf); + } + else +#endif + /* shouldn't happen */ + elog(ERROR, "unsupported collprovider: %c", mylocale->provider); + } + + /* Avoid leaking memory for toasted inputs */ + PG_FREE_IF_COPY(key, 0); + + return result; +} + +Datum +hashtextextended(PG_FUNCTION_ARGS) +{ + text *key = PG_GETARG_TEXT_PP(0); + Oid collid = PG_GET_COLLATION(); + pg_locale_t mylocale = 0; + Datum result; + + if (!collid) + ereport(ERROR, + (errcode(ERRCODE_INDETERMINATE_COLLATION), + errmsg("could not determine which collation to use for string hashing"), + errhint("Use the COLLATE clause to set the collation explicitly."))); + + if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID) + mylocale = pg_newlocale_from_collation(collid); + + if (!mylocale || mylocale->deterministic) + { + result = hash_any_extended((unsigned char *) VARDATA_ANY(key), + VARSIZE_ANY_EXHDR(key), + PG_GETARG_INT64(1)); + } + else + { +#ifdef USE_ICU + if (mylocale->provider == COLLPROVIDER_ICU) + { + int32_t ulen = -1; + UChar *uchar = NULL; + Size bsize; + uint8_t *buf; + + ulen = icu_to_uchar(&uchar, VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key)); + + bsize = ucol_getSortKey(mylocale->info.icu.ucol, + uchar, ulen, NULL, 0); + buf = palloc(bsize); + ucol_getSortKey(mylocale->info.icu.ucol, + uchar, ulen, buf, bsize); + + result = 
hash_any_extended(buf, bsize, PG_GETARG_INT64(1)); + + pfree(buf); + } + else +#endif + /* shouldn't happen */ + elog(ERROR, "unsupported collprovider: %c", mylocale->provider); + } + + PG_FREE_IF_COPY(key, 0); + + return result; +} + +/* + * hashvarlena() can be used for any varlena datatype in which there are + * no non-significant bits, ie, distinct bitpatterns never compare as equal. + */ +Datum +hashvarlena(PG_FUNCTION_ARGS) +{ + struct varlena *key = PG_GETARG_VARLENA_PP(0); + Datum result; + + result = hash_any((unsigned char *) VARDATA_ANY(key), + VARSIZE_ANY_EXHDR(key)); + + /* Avoid leaking memory for toasted inputs */ + PG_FREE_IF_COPY(key, 0); + + return result; +} + +Datum +hashvarlenaextended(PG_FUNCTION_ARGS) +{ + struct varlena *key = PG_GETARG_VARLENA_PP(0); + Datum result; + + result = hash_any_extended((unsigned char *) VARDATA_ANY(key), + VARSIZE_ANY_EXHDR(key), + PG_GETARG_INT64(1)); + + PG_FREE_IF_COPY(key, 0); + + return result; +} diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c new file mode 100644 index 0000000..d254a00 --- /dev/null +++ b/src/backend/access/hash/hashinsert.c @@ -0,0 +1,432 @@ +/*------------------------------------------------------------------------- + * + * hashinsert.c + * Item insertion in hash tables for Postgres. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/hash/hashinsert.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/hash.h" +#include "access/hash_xlog.h" +#include "miscadmin.h" +#include "storage/buf_internals.h" +#include "storage/lwlock.h" +#include "storage/predicate.h" +#include "utils/rel.h" + +static void _hash_vacuum_one_page(Relation rel, Relation hrel, + Buffer metabuf, Buffer buf); + +/* + * _hash_doinsert() -- Handle insertion of a single index tuple. + * + * This routine is called by the public interface routines, hashbuild + * and hashinsert. By here, itup is completely filled in. + */ +void +_hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel) +{ + Buffer buf = InvalidBuffer; + Buffer bucket_buf; + Buffer metabuf; + HashMetaPage metap; + HashMetaPage usedmetap = NULL; + Page metapage; + Page page; + HashPageOpaque pageopaque; + Size itemsz; + bool do_expand; + uint32 hashkey; + Bucket bucket; + OffsetNumber itup_off; + + /* + * Get the hash key for the item (it's stored in the index tuple itself). + */ + hashkey = _hash_get_indextuple_hashkey(itup); + + /* compute item size too */ + itemsz = IndexTupleSize(itup); + itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we + * need to be consistent */ + +restart_insert: + + /* + * Read the metapage. We don't lock it yet; HashMaxItemSize() will + * examine pd_pagesize_version, but that can't change so we can examine it + * without a lock. + */ + metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE); + metapage = BufferGetPage(metabuf); + + /* + * Check whether the item can fit on a hash page at all. (Eventually, we + * ought to try to apply TOAST methods if not.) Note that at this point, + * itemsz doesn't include the ItemId. + * + * XXX this is useless code if we are only storing hash keys. 
+ */ + if (itemsz > HashMaxItemSize(metapage)) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("index row size %zu exceeds hash maximum %zu", + itemsz, HashMaxItemSize(metapage)), + errhint("Values larger than a buffer page cannot be indexed."))); + + /* Lock the primary bucket page for the target bucket. */ + buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_WRITE, + &usedmetap); + Assert(usedmetap != NULL); + + CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(buf)); + + /* remember the primary bucket buffer to release the pin on it at end. */ + bucket_buf = buf; + + page = BufferGetPage(buf); + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); + bucket = pageopaque->hasho_bucket; + + /* + * If this bucket is in the process of being split, try to finish the + * split before inserting, because that might create room for the + * insertion to proceed without allocating an additional overflow page. + * It's only interesting to finish the split if we're trying to insert + * into the bucket from which we're removing tuples (the "old" bucket), + * not if we're trying to insert into the bucket into which tuples are + * being moved (the "new" bucket). + */ + if (H_BUCKET_BEING_SPLIT(pageopaque) && IsBufferCleanupOK(buf)) + { + /* release the lock on bucket buffer, before completing the split. */ + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + _hash_finish_split(rel, metabuf, buf, bucket, + usedmetap->hashm_maxbucket, + usedmetap->hashm_highmask, + usedmetap->hashm_lowmask); + + /* release the pin on old and meta buffer. retry for insert. */ + _hash_dropbuf(rel, buf); + _hash_dropbuf(rel, metabuf); + goto restart_insert; + } + + /* Do the insertion */ + while (PageGetFreeSpace(page) < itemsz) + { + BlockNumber nextblkno; + + /* + * Check if current page has any DEAD tuples. If yes, delete these + * tuples and see if we can get a space for the new item to be + * inserted before moving to the next page in the bucket chain. + */ + if (H_HAS_DEAD_TUPLES(pageopaque)) + { + + if (IsBufferCleanupOK(buf)) + { + _hash_vacuum_one_page(rel, heapRel, metabuf, buf); + + if (PageGetFreeSpace(page) >= itemsz) + break; /* OK, now we have enough space */ + } + } + + /* + * no space on this page; check for an overflow page + */ + nextblkno = pageopaque->hasho_nextblkno; + + if (BlockNumberIsValid(nextblkno)) + { + /* + * ovfl page exists; go get it. if it doesn't have room, we'll + * find out next pass through the loop test above. we always + * release both the lock and pin if this is an overflow page, but + * only the lock if this is the primary bucket page, since the pin + * on the primary bucket must be retained throughout the scan. + */ + if (buf != bucket_buf) + _hash_relbuf(rel, buf); + else + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE); + page = BufferGetPage(buf); + } + else + { + /* + * we're at the end of the bucket chain and we haven't found a + * page with enough room. allocate a new overflow page. + */ + + /* release our write lock without modifying buffer */ + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + /* chain to a new overflow page */ + buf = _hash_addovflpage(rel, metabuf, buf, (buf == bucket_buf) ? 
true : false); + page = BufferGetPage(buf); + + /* should fit now, given test above */ + Assert(PageGetFreeSpace(page) >= itemsz); + } + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); + Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE); + Assert(pageopaque->hasho_bucket == bucket); + } + + /* + * Write-lock the metapage so we can increment the tuple count. After + * incrementing it, check to see if it's time for a split. + */ + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + + /* Do the update. No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + /* found page with enough space, so add the item here */ + itup_off = _hash_pgaddtup(rel, buf, itemsz, itup); + MarkBufferDirty(buf); + + /* metapage operations */ + metap = HashPageGetMeta(metapage); + metap->hashm_ntuples += 1; + + /* Make sure this stays in sync with _hash_expandtable() */ + do_expand = metap->hashm_ntuples > + (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1); + + MarkBufferDirty(metabuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_hash_insert xlrec; + XLogRecPtr recptr; + + xlrec.offnum = itup_off; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashInsert); + + XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD); + + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup)); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INSERT); + + PageSetLSN(BufferGetPage(buf), recptr); + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + END_CRIT_SECTION(); + + /* drop lock on metapage, but keep pin */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + + /* + * Release the modified page and ensure to release the pin on primary + * page. + */ + _hash_relbuf(rel, buf); + if (buf != bucket_buf) + _hash_dropbuf(rel, bucket_buf); + + /* Attempt to split if a split is needed */ + if (do_expand) + _hash_expandtable(rel, metabuf); + + /* Finally drop our pin on the metapage */ + _hash_dropbuf(rel, metabuf); +} + +/* + * _hash_pgaddtup() -- add a tuple to a particular page in the index. + * + * This routine adds the tuple to the page as requested; it does not write out + * the page. It is an error to call this function without pin and write lock + * on the target buffer. + * + * Returns the offset number at which the tuple was inserted. This function + * is responsible for preserving the condition that tuples in a hash index + * page are sorted by hashkey value. + */ +OffsetNumber +_hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple itup) +{ + OffsetNumber itup_off; + Page page; + uint32 hashkey; + + _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); + page = BufferGetPage(buf); + + /* Find where to insert the tuple (preserving page's hashkey ordering) */ + hashkey = _hash_get_indextuple_hashkey(itup); + itup_off = _hash_binsearch(page, hashkey); + + if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false) + == InvalidOffsetNumber) + elog(ERROR, "failed to add index item to \"%s\"", + RelationGetRelationName(rel)); + + return itup_off; +} + +/* + * _hash_pgaddmultitup() -- add a tuple vector to a particular page in the + * index. + * + * This routine has same requirements for locking and tuple ordering as + * _hash_pgaddtup(). + * + * Returns the offset number array at which the tuples were inserted. 
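 *
 * As with _hash_pgaddtup(), the point of the hashkey-ordered placement is
 * that each bucket and overflow page stays sorted by hash value, which is
 * what allows _hash_binsearch() to find positions by binary search rather
 * than a linear scan of the page.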
+ */ +void +_hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups, + OffsetNumber *itup_offsets, uint16 nitups) +{ + OffsetNumber itup_off; + Page page; + uint32 hashkey; + int i; + + _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); + page = BufferGetPage(buf); + + for (i = 0; i < nitups; i++) + { + Size itemsize; + + itemsize = IndexTupleSize(itups[i]); + itemsize = MAXALIGN(itemsize); + + /* Find where to insert the tuple (preserving page's hashkey ordering) */ + hashkey = _hash_get_indextuple_hashkey(itups[i]); + itup_off = _hash_binsearch(page, hashkey); + + itup_offsets[i] = itup_off; + + if (PageAddItem(page, (Item) itups[i], itemsize, itup_off, false, false) + == InvalidOffsetNumber) + elog(ERROR, "failed to add index item to \"%s\"", + RelationGetRelationName(rel)); + } +} + +/* + * _hash_vacuum_one_page - vacuum just one index page. + * + * Try to remove LP_DEAD items from the given page. We must acquire cleanup + * lock on the page being modified before calling this function. + */ + +static void +_hash_vacuum_one_page(Relation rel, Relation hrel, Buffer metabuf, Buffer buf) +{ + OffsetNumber deletable[MaxOffsetNumber]; + int ndeletable = 0; + OffsetNumber offnum, + maxoff; + Page page = BufferGetPage(buf); + HashPageOpaque pageopaque; + HashMetaPage metap; + + /* Scan each tuple in page to see if it is marked as LP_DEAD */ + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemId = PageGetItemId(page, offnum); + + if (ItemIdIsDead(itemId)) + deletable[ndeletable++] = offnum; + } + + if (ndeletable > 0) + { + TransactionId latestRemovedXid; + + latestRemovedXid = + index_compute_xid_horizon_for_tuples(rel, hrel, buf, + deletable, ndeletable); + + /* + * Write-lock the meta page so that we can decrement tuple count. + */ + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + + /* No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + PageIndexMultiDelete(page, deletable, ndeletable); + + /* + * Mark the page as not containing any LP_DEAD items. This is not + * certainly true (there might be some that have recently been marked, + * but weren't included in our target-item list), but it will almost + * always be true and it doesn't seem worth an additional page scan to + * check it. Remember that LH_PAGE_HAS_DEAD_TUPLES is only a hint + * anyway. + */ + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); + pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; + + metap = HashPageGetMeta(BufferGetPage(metabuf)); + metap->hashm_ntuples -= ndeletable; + + MarkBufferDirty(buf); + MarkBufferDirty(metabuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_hash_vacuum_one_page xlrec; + XLogRecPtr recptr; + + xlrec.latestRemovedXid = latestRemovedXid; + xlrec.ntuples = ndeletable; + + XLogBeginInsert(); + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterData((char *) &xlrec, SizeOfHashVacuumOnePage); + + /* + * We need the target-offsets array whether or not we store the + * whole buffer, to allow us to find the latestRemovedXid on a + * standby server. 
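			 *
			 * The main data of an XLOG_HASH_VACUUM_ONE_PAGE record is thus
			 * the fixed xl_hash_vacuum_one_page header (latestRemovedXid and
			 * ntuples) followed immediately by ntuples OffsetNumbers; the
			 * replay side locates the array the same way, at
			 * (char *) xldata + SizeOfHashVacuumOnePage.  Block 0 is the
			 * page being cleaned (replayed under a cleanup lock) and block 1
			 * is the metapage, whose tuple count is decremented to match.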
+ */ + XLogRegisterData((char *) deletable, + ndeletable * sizeof(OffsetNumber)); + + XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_VACUUM_ONE_PAGE); + + PageSetLSN(BufferGetPage(buf), recptr); + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + END_CRIT_SECTION(); + + /* + * Releasing write lock on meta page as we have updated the tuple + * count. + */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + } +} diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c new file mode 100644 index 0000000..1ff2e0c --- /dev/null +++ b/src/backend/access/hash/hashovfl.c @@ -0,0 +1,1083 @@ +/*------------------------------------------------------------------------- + * + * hashovfl.c + * Overflow page management code for the Postgres hash access method + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/hash/hashovfl.c + * + * NOTES + * Overflow pages look like ordinary relation pages. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/hash.h" +#include "access/hash_xlog.h" +#include "miscadmin.h" +#include "utils/rel.h" + + +static uint32 _hash_firstfreebit(uint32 map); + + +/* + * Convert overflow page bit number (its index in the free-page bitmaps) + * to block number within the index. + */ +static BlockNumber +bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum) +{ + uint32 splitnum = metap->hashm_ovflpoint; + uint32 i; + + /* Convert zero-based bitnumber to 1-based page number */ + ovflbitnum += 1; + + /* Determine the split number for this page (must be >= 1) */ + for (i = 1; + i < splitnum && ovflbitnum > metap->hashm_spares[i]; + i++) + /* loop */ ; + + /* + * Convert to absolute page number by adding the number of bucket pages + * that exist before this split point. + */ + return (BlockNumber) (_hash_get_totalbuckets(i) + ovflbitnum); +} + +/* + * _hash_ovflblkno_to_bitno + * + * Convert overflow page block number to bit number for free-page bitmap. + */ +uint32 +_hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno) +{ + uint32 splitnum = metap->hashm_ovflpoint; + uint32 i; + uint32 bitnum; + + /* Determine the split number containing this page */ + for (i = 1; i <= splitnum; i++) + { + if (ovflblkno <= (BlockNumber) _hash_get_totalbuckets(i)) + break; /* oops */ + bitnum = ovflblkno - _hash_get_totalbuckets(i); + + /* + * bitnum has to be greater than number of overflow page added in + * previous split point. The overflow page at this splitnum (i) if any + * should start from (_hash_get_totalbuckets(i) + + * metap->hashm_spares[i - 1] + 1). + */ + if (bitnum > metap->hashm_spares[i - 1] && + bitnum <= metap->hashm_spares[i]) + return bitnum - 1; /* -1 to convert 1-based to 0-based */ + } + + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid overflow block number %u", ovflblkno))); + return 0; /* keep compiler quiet */ +} + +/* + * _hash_addovflpage + * + * Add an overflow page to the bucket whose last page is pointed to by 'buf'. + * + * On entry, the caller must hold a pin but no lock on 'buf'. The pin is + * dropped before exiting (we assume the caller is not interested in 'buf' + * anymore) if not asked to retain. The pin will be retained only for the + * primary bucket. 
The returned overflow page will be pinned and + * write-locked; it is guaranteed to be empty. + * + * The caller must hold a pin, but no lock, on the metapage buffer. + * That buffer is returned in the same state. + * + * NB: since this could be executed concurrently by multiple processes, + * one should not assume that the returned overflow page will be the + * immediate successor of the originally passed 'buf'. Additional overflow + * pages might have been added to the bucket chain in between. + */ +Buffer +_hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin) +{ + Buffer ovflbuf; + Page page; + Page ovflpage; + HashPageOpaque pageopaque; + HashPageOpaque ovflopaque; + HashMetaPage metap; + Buffer mapbuf = InvalidBuffer; + Buffer newmapbuf = InvalidBuffer; + BlockNumber blkno; + uint32 orig_firstfree; + uint32 splitnum; + uint32 *freep = NULL; + uint32 max_ovflpg; + uint32 bit; + uint32 bitmap_page_bit; + uint32 first_page; + uint32 last_bit; + uint32 last_page; + uint32 i, + j; + bool page_found = false; + + /* + * Write-lock the tail page. Here, we need to maintain locking order such + * that, first acquire the lock on tail page of bucket, then on meta page + * to find and lock the bitmap page and if it is found, then lock on meta + * page is released, then finally acquire the lock on new overflow buffer. + * We need this locking order to avoid deadlock with backends that are + * doing inserts. + * + * Note: We could have avoided locking many buffers here if we made two + * WAL records for acquiring an overflow page (one to allocate an overflow + * page and another to add it to overflow bucket chain). However, doing + * so can leak an overflow page, if the system crashes after allocation. + * Needless to say, it is better to have a single record from a + * performance point of view as well. + */ + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* probably redundant... 
*/ + _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); + + /* loop to find current tail page, in case someone else inserted too */ + for (;;) + { + BlockNumber nextblkno; + + page = BufferGetPage(buf); + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); + nextblkno = pageopaque->hasho_nextblkno; + + if (!BlockNumberIsValid(nextblkno)) + break; + + /* we assume we do not need to write the unmodified page */ + if (retain_pin) + { + /* pin will be retained only for the primary bucket page */ + Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_BUCKET_PAGE); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + } + else + _hash_relbuf(rel, buf); + + retain_pin = false; + + buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE); + } + + /* Get exclusive lock on the meta page */ + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + + _hash_checkpage(rel, metabuf, LH_META_PAGE); + metap = HashPageGetMeta(BufferGetPage(metabuf)); + + /* start search at hashm_firstfree */ + orig_firstfree = metap->hashm_firstfree; + first_page = orig_firstfree >> BMPG_SHIFT(metap); + bit = orig_firstfree & BMPG_MASK(metap); + i = first_page; + j = bit / BITS_PER_MAP; + bit &= ~(BITS_PER_MAP - 1); + + /* outer loop iterates once per bitmap page */ + for (;;) + { + BlockNumber mapblkno; + Page mappage; + uint32 last_inpage; + + /* want to end search with the last existing overflow page */ + splitnum = metap->hashm_ovflpoint; + max_ovflpg = metap->hashm_spares[splitnum] - 1; + last_page = max_ovflpg >> BMPG_SHIFT(metap); + last_bit = max_ovflpg & BMPG_MASK(metap); + + if (i > last_page) + break; + + Assert(i < metap->hashm_nmaps); + mapblkno = metap->hashm_mapp[i]; + + if (i == last_page) + last_inpage = last_bit; + else + last_inpage = BMPGSZ_BIT(metap) - 1; + + /* Release exclusive lock on metapage while reading bitmap page */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + + mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE); + mappage = BufferGetPage(mapbuf); + freep = HashPageGetBitmap(mappage); + + for (; bit <= last_inpage; j++, bit += BITS_PER_MAP) + { + if (freep[j] != ALL_SET) + { + page_found = true; + + /* Reacquire exclusive lock on the meta page */ + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + + /* convert bit to bit number within page */ + bit += _hash_firstfreebit(freep[j]); + bitmap_page_bit = bit; + + /* convert bit to absolute bit number */ + bit += (i << BMPG_SHIFT(metap)); + /* Calculate address of the recycled overflow page */ + blkno = bitno_to_blkno(metap, bit); + + /* Fetch and init the recycled page */ + ovflbuf = _hash_getinitbuf(rel, blkno); + + goto found; + } + } + + /* No free space here, try to advance to next map page */ + _hash_relbuf(rel, mapbuf); + mapbuf = InvalidBuffer; + i++; + j = 0; /* scan from start of next map page */ + bit = 0; + + /* Reacquire exclusive lock on the meta page */ + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + } + + /* + * No free pages --- have to extend the relation to add an overflow page. + * First, check to see if we have to add a new bitmap page too. + */ + if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1)) + { + /* + * We create the new bitmap page with all pages marked "in use". + * Actually two pages in the new bitmap's range will exist + * immediately: the bitmap page itself, and the following page which + * is the one we return to the caller. Both of these are correctly + * marked "in use". Subsequent pages do not exist yet, but it is + * convenient to pre-mark them as "in use" too. 
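/*
 * Illustrative helper, not part of the upstream file: the free-space bitmaps
 * searched above devote one bit to each overflow page, with a set bit
 * meaning "in use" and a clear bit meaning "free".  A bit number within a
 * bitmap page breaks down into a uint32 word and a bit inside that word in
 * the same way the search loop above does it.
 */
static inline bool
hash_example_ovflpage_is_free(uint32 *freep, uint32 bit_in_page)
{
	uint32		word = bit_in_page / BITS_PER_MAP;
	uint32		mask = ((uint32) 1) << (bit_in_page % BITS_PER_MAP);

	return (freep[word] & mask) == 0;
}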
+ */ + bit = metap->hashm_spares[splitnum]; + + /* metapage already has a write lock */ + if (metap->hashm_nmaps >= HASH_MAX_BITMAPS) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("out of overflow pages in hash index \"%s\"", + RelationGetRelationName(rel)))); + + newmapbuf = _hash_getnewbuf(rel, bitno_to_blkno(metap, bit), MAIN_FORKNUM); + } + else + { + /* + * Nothing to do here; since the page will be past the last used page, + * we know its bitmap bit was preinitialized to "in use". + */ + } + + /* Calculate address of the new overflow page */ + bit = BufferIsValid(newmapbuf) ? + metap->hashm_spares[splitnum] + 1 : metap->hashm_spares[splitnum]; + blkno = bitno_to_blkno(metap, bit); + + /* + * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the + * relation length stays in sync with ours. XXX It's annoying to do this + * with metapage write lock held; would be better to use a lock that + * doesn't block incoming searches. + * + * It is okay to hold two buffer locks here (one on tail page of bucket + * and other on new overflow page) since there cannot be anyone else + * contending for access to ovflbuf. + */ + ovflbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM); + +found: + + /* + * Do the update. No ereport(ERROR) until changes are logged. We want to + * log the changes for bitmap page and overflow page together to avoid + * loss of pages in case the new page is added. + */ + START_CRIT_SECTION(); + + if (page_found) + { + Assert(BufferIsValid(mapbuf)); + + /* mark page "in use" in the bitmap */ + SETBIT(freep, bitmap_page_bit); + MarkBufferDirty(mapbuf); + } + else + { + /* update the count to indicate new overflow page is added */ + metap->hashm_spares[splitnum]++; + + if (BufferIsValid(newmapbuf)) + { + _hash_initbitmapbuffer(newmapbuf, metap->hashm_bmsize, false); + MarkBufferDirty(newmapbuf); + + /* add the new bitmap page to the metapage's list of bitmaps */ + metap->hashm_mapp[metap->hashm_nmaps] = BufferGetBlockNumber(newmapbuf); + metap->hashm_nmaps++; + metap->hashm_spares[splitnum]++; + } + + MarkBufferDirty(metabuf); + + /* + * for new overflow page, we don't need to explicitly set the bit in + * bitmap page, as by default that will be set to "in use". + */ + } + + /* + * Adjust hashm_firstfree to avoid redundant searches. But don't risk + * changing it if someone moved it while we were searching bitmap pages. 
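+ *
+ * (At this point "bit" is the absolute bit number of the page we just
+ * allocated, so advancing hashm_firstfree to bit + 1 lets future
+ * searches start after it.)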
+ */ + if (metap->hashm_firstfree == orig_firstfree) + { + metap->hashm_firstfree = bit + 1; + MarkBufferDirty(metabuf); + } + + /* initialize new overflow page */ + ovflpage = BufferGetPage(ovflbuf); + ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); + ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf); + ovflopaque->hasho_nextblkno = InvalidBlockNumber; + ovflopaque->hasho_bucket = pageopaque->hasho_bucket; + ovflopaque->hasho_flag = LH_OVERFLOW_PAGE; + ovflopaque->hasho_page_id = HASHO_PAGE_ID; + + MarkBufferDirty(ovflbuf); + + /* logically chain overflow page to previous page */ + pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf); + + MarkBufferDirty(buf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + xl_hash_add_ovfl_page xlrec; + + xlrec.bmpage_found = page_found; + xlrec.bmsize = metap->hashm_bmsize; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashAddOvflPage); + + XLogRegisterBuffer(0, ovflbuf, REGBUF_WILL_INIT); + XLogRegisterBufData(0, (char *) &pageopaque->hasho_bucket, sizeof(Bucket)); + + XLogRegisterBuffer(1, buf, REGBUF_STANDARD); + + if (BufferIsValid(mapbuf)) + { + XLogRegisterBuffer(2, mapbuf, REGBUF_STANDARD); + XLogRegisterBufData(2, (char *) &bitmap_page_bit, sizeof(uint32)); + } + + if (BufferIsValid(newmapbuf)) + XLogRegisterBuffer(3, newmapbuf, REGBUF_WILL_INIT); + + XLogRegisterBuffer(4, metabuf, REGBUF_STANDARD); + XLogRegisterBufData(4, (char *) &metap->hashm_firstfree, sizeof(uint32)); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_ADD_OVFL_PAGE); + + PageSetLSN(BufferGetPage(ovflbuf), recptr); + PageSetLSN(BufferGetPage(buf), recptr); + + if (BufferIsValid(mapbuf)) + PageSetLSN(BufferGetPage(mapbuf), recptr); + + if (BufferIsValid(newmapbuf)) + PageSetLSN(BufferGetPage(newmapbuf), recptr); + + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + END_CRIT_SECTION(); + + if (retain_pin) + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + else + _hash_relbuf(rel, buf); + + if (BufferIsValid(mapbuf)) + _hash_relbuf(rel, mapbuf); + + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + + if (BufferIsValid(newmapbuf)) + _hash_relbuf(rel, newmapbuf); + + return ovflbuf; +} + +/* + * _hash_firstfreebit() + * + * Return the number of the first bit that is not set in the word 'map'. + */ +static uint32 +_hash_firstfreebit(uint32 map) +{ + uint32 i, + mask; + + mask = 0x1; + for (i = 0; i < BITS_PER_MAP; i++) + { + if (!(mask & map)) + return i; + mask <<= 1; + } + + elog(ERROR, "firstfreebit found no free bit"); + + return 0; /* keep compiler quiet */ +} + +/* + * _hash_freeovflpage() - + * + * Remove this overflow page from its bucket's chain, and mark the page as + * free. On entry, ovflbuf is write-locked; it is released before exiting. + * + * Add the tuples (itups) to wbuf in this function. We could do that in the + * caller as well, but the advantage of doing it here is we can easily write + * the WAL for XLOG_HASH_SQUEEZE_PAGE operation. Addition of tuples and + * removal of overflow page has to done as an atomic operation, otherwise + * during replay on standby users might find duplicate records. + * + * Since this function is invoked in VACUUM, we provide an access strategy + * parameter that controls fetches of the bucket pages. + * + * Returns the block number of the page that followed the given page + * in the bucket, or InvalidBlockNumber if no following page. + * + * NB: caller must not hold lock on metapage, nor on page, that's next to + * ovflbuf in the bucket chain. 
We don't acquire the lock on page that's + * prior to ovflbuf in chain if it is same as wbuf because the caller already + * has a lock on same. + */ +BlockNumber +_hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf, + Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets, + Size *tups_size, uint16 nitups, + BufferAccessStrategy bstrategy) +{ + HashMetaPage metap; + Buffer metabuf; + Buffer mapbuf; + BlockNumber ovflblkno; + BlockNumber prevblkno; + BlockNumber blkno; + BlockNumber nextblkno; + BlockNumber writeblkno; + HashPageOpaque ovflopaque; + Page ovflpage; + Page mappage; + uint32 *freep; + uint32 ovflbitno; + int32 bitmappage, + bitmapbit; + Bucket bucket PG_USED_FOR_ASSERTS_ONLY; + Buffer prevbuf = InvalidBuffer; + Buffer nextbuf = InvalidBuffer; + bool update_metap = false; + + /* Get information from the doomed page */ + _hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE); + ovflblkno = BufferGetBlockNumber(ovflbuf); + ovflpage = BufferGetPage(ovflbuf); + ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); + nextblkno = ovflopaque->hasho_nextblkno; + prevblkno = ovflopaque->hasho_prevblkno; + writeblkno = BufferGetBlockNumber(wbuf); + bucket = ovflopaque->hasho_bucket; + + /* + * Fix up the bucket chain. this is a doubly-linked list, so we must fix + * up the bucket chain members behind and ahead of the overflow page being + * deleted. Concurrency issues are avoided by using lock chaining as + * described atop hashbucketcleanup. + */ + if (BlockNumberIsValid(prevblkno)) + { + if (prevblkno == writeblkno) + prevbuf = wbuf; + else + prevbuf = _hash_getbuf_with_strategy(rel, + prevblkno, + HASH_WRITE, + LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, + bstrategy); + } + if (BlockNumberIsValid(nextblkno)) + nextbuf = _hash_getbuf_with_strategy(rel, + nextblkno, + HASH_WRITE, + LH_OVERFLOW_PAGE, + bstrategy); + + /* Note: bstrategy is intentionally not used for metapage and bitmap */ + + /* Read the metapage so we can determine which bitmap page to use */ + metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); + metap = HashPageGetMeta(BufferGetPage(metabuf)); + + /* Identify which bit to set */ + ovflbitno = _hash_ovflblkno_to_bitno(metap, ovflblkno); + + bitmappage = ovflbitno >> BMPG_SHIFT(metap); + bitmapbit = ovflbitno & BMPG_MASK(metap); + + if (bitmappage >= metap->hashm_nmaps) + elog(ERROR, "invalid overflow bit number %u", ovflbitno); + blkno = metap->hashm_mapp[bitmappage]; + + /* Release metapage lock while we access the bitmap page */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + + /* read the bitmap page to clear the bitmap bit */ + mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE); + mappage = BufferGetPage(mapbuf); + freep = HashPageGetBitmap(mappage); + Assert(ISSET(freep, bitmapbit)); + + /* Get write-lock on metapage to update firstfree */ + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + + /* This operation needs to log multiple tuples, prepare WAL for that */ + if (RelationNeedsWAL(rel)) + XLogEnsureRecordSpace(HASH_XLOG_FREE_OVFL_BUFS, 4 + nitups); + + START_CRIT_SECTION(); + + /* + * we have to insert tuples on the "write" page, being careful to preserve + * hashkey ordering. (If we insert many tuples into the same "write" page + * it would be worth qsort'ing them). + */ + if (nitups > 0) + { + _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups); + MarkBufferDirty(wbuf); + } + + /* + * Reinitialize the freed overflow page. Just zeroing the page won't + * work, because WAL replay routines expect pages to be initialized. 
See + * explanation of RBM_NORMAL mode atop XLogReadBufferExtended. We are + * careful to make the special space valid here so that tools like + * pageinspect won't get confused. + */ + _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf)); + + ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); + + ovflopaque->hasho_prevblkno = InvalidBlockNumber; + ovflopaque->hasho_nextblkno = InvalidBlockNumber; + ovflopaque->hasho_bucket = -1; + ovflopaque->hasho_flag = LH_UNUSED_PAGE; + ovflopaque->hasho_page_id = HASHO_PAGE_ID; + + MarkBufferDirty(ovflbuf); + + if (BufferIsValid(prevbuf)) + { + Page prevpage = BufferGetPage(prevbuf); + HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage); + + Assert(prevopaque->hasho_bucket == bucket); + prevopaque->hasho_nextblkno = nextblkno; + MarkBufferDirty(prevbuf); + } + if (BufferIsValid(nextbuf)) + { + Page nextpage = BufferGetPage(nextbuf); + HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage); + + Assert(nextopaque->hasho_bucket == bucket); + nextopaque->hasho_prevblkno = prevblkno; + MarkBufferDirty(nextbuf); + } + + /* Clear the bitmap bit to indicate that this overflow page is free */ + CLRBIT(freep, bitmapbit); + MarkBufferDirty(mapbuf); + + /* if this is now the first free page, update hashm_firstfree */ + if (ovflbitno < metap->hashm_firstfree) + { + metap->hashm_firstfree = ovflbitno; + update_metap = true; + MarkBufferDirty(metabuf); + } + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_hash_squeeze_page xlrec; + XLogRecPtr recptr; + int i; + + xlrec.prevblkno = prevblkno; + xlrec.nextblkno = nextblkno; + xlrec.ntups = nitups; + xlrec.is_prim_bucket_same_wrt = (wbuf == bucketbuf); + xlrec.is_prev_bucket_same_wrt = (wbuf == prevbuf); + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashSqueezePage); + + /* + * bucket buffer needs to be registered to ensure that we can acquire + * a cleanup lock on it during replay. + */ + if (!xlrec.is_prim_bucket_same_wrt) + XLogRegisterBuffer(0, bucketbuf, REGBUF_STANDARD | REGBUF_NO_IMAGE); + + XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD); + if (xlrec.ntups > 0) + { + XLogRegisterBufData(1, (char *) itup_offsets, + nitups * sizeof(OffsetNumber)); + for (i = 0; i < nitups; i++) + XLogRegisterBufData(1, (char *) itups[i], tups_size[i]); + } + + XLogRegisterBuffer(2, ovflbuf, REGBUF_STANDARD); + + /* + * If prevpage and the writepage (block in which we are moving tuples + * from overflow) are same, then no need to separately register + * prevpage. During replay, we can directly update the nextblock in + * writepage. 
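+ *
+ * Likewise, nextbuf is registered only if the freed page actually had
+ * a successor, and the metapage only if hashm_firstfree was updated
+ * above (update_metap).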
+ */ + if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt) + XLogRegisterBuffer(3, prevbuf, REGBUF_STANDARD); + + if (BufferIsValid(nextbuf)) + XLogRegisterBuffer(4, nextbuf, REGBUF_STANDARD); + + XLogRegisterBuffer(5, mapbuf, REGBUF_STANDARD); + XLogRegisterBufData(5, (char *) &bitmapbit, sizeof(uint32)); + + if (update_metap) + { + XLogRegisterBuffer(6, metabuf, REGBUF_STANDARD); + XLogRegisterBufData(6, (char *) &metap->hashm_firstfree, sizeof(uint32)); + } + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SQUEEZE_PAGE); + + PageSetLSN(BufferGetPage(wbuf), recptr); + PageSetLSN(BufferGetPage(ovflbuf), recptr); + + if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt) + PageSetLSN(BufferGetPage(prevbuf), recptr); + if (BufferIsValid(nextbuf)) + PageSetLSN(BufferGetPage(nextbuf), recptr); + + PageSetLSN(BufferGetPage(mapbuf), recptr); + + if (update_metap) + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + END_CRIT_SECTION(); + + /* release previous bucket if it is not same as write bucket */ + if (BufferIsValid(prevbuf) && prevblkno != writeblkno) + _hash_relbuf(rel, prevbuf); + + if (BufferIsValid(ovflbuf)) + _hash_relbuf(rel, ovflbuf); + + if (BufferIsValid(nextbuf)) + _hash_relbuf(rel, nextbuf); + + _hash_relbuf(rel, mapbuf); + _hash_relbuf(rel, metabuf); + + return nextblkno; +} + + +/* + * _hash_initbitmapbuffer() + * + * Initialize a new bitmap page. All bits in the new bitmap page are set to + * "1", indicating "in use". + */ +void +_hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage) +{ + Page pg; + HashPageOpaque op; + uint32 *freep; + + pg = BufferGetPage(buf); + + /* initialize the page */ + if (initpage) + _hash_pageinit(pg, BufferGetPageSize(buf)); + + /* initialize the page's special space */ + op = (HashPageOpaque) PageGetSpecialPointer(pg); + op->hasho_prevblkno = InvalidBlockNumber; + op->hasho_nextblkno = InvalidBlockNumber; + op->hasho_bucket = -1; + op->hasho_flag = LH_BITMAP_PAGE; + op->hasho_page_id = HASHO_PAGE_ID; + + /* set all of the bits to 1 */ + freep = HashPageGetBitmap(pg); + MemSet(freep, 0xFF, bmsize); + + /* + * Set pd_lower just past the end of the bitmap page data. We could even + * set pd_lower equal to pd_upper, but this is more precise and makes the + * page look compressible to xlog.c. + */ + ((PageHeader) pg)->pd_lower = ((char *) freep + bmsize) - (char *) pg; +} + + +/* + * _hash_squeezebucket(rel, bucket) + * + * Try to squeeze the tuples onto pages occurring earlier in the + * bucket chain in an attempt to free overflow pages. When we start + * the "squeezing", the page from which we start taking tuples (the + * "read" page) is the last bucket in the bucket chain and the page + * onto which we start squeezing tuples (the "write" page) is the + * first page in the bucket chain. The read page works backward and + * the write page works forward; the procedure terminates when the + * read page and write page are the same page. + * + * At completion of this procedure, it is guaranteed that all pages in + * the bucket are nonempty, unless the bucket is totally empty (in + * which case all overflow pages will be freed). The original implementation + * required that to be true on entry as well, but it's a lot easier for + * callers to leave empty overflow pages and let this guy clean it up. 
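+ *
+ * As a rough illustration: for a chain P -> A -> B -> C (P being the
+ * primary bucket page), tuples are read from C and packed into P, then
+ * into A and B as the earlier pages fill up; once C is empty it is
+ * freed, B becomes the new "read" page, and so on until the read and
+ * write pages meet.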
+ * + * Caller must acquire cleanup lock on the primary page of the target + * bucket to exclude any scans that are in progress, which could easily + * be confused into returning the same tuple more than once or some tuples + * not at all by the rearrangement we are performing here. To prevent + * any concurrent scan to cross the squeeze scan we use lock chaining + * similar to hashbucketcleanup. Refer comments atop hashbucketcleanup. + * + * We need to retain a pin on the primary bucket to ensure that no concurrent + * split can start. + * + * Since this function is invoked in VACUUM, we provide an access strategy + * parameter that controls fetches of the bucket pages. + */ +void +_hash_squeezebucket(Relation rel, + Bucket bucket, + BlockNumber bucket_blkno, + Buffer bucket_buf, + BufferAccessStrategy bstrategy) +{ + BlockNumber wblkno; + BlockNumber rblkno; + Buffer wbuf; + Buffer rbuf; + Page wpage; + Page rpage; + HashPageOpaque wopaque; + HashPageOpaque ropaque; + + /* + * start squeezing into the primary bucket page. + */ + wblkno = bucket_blkno; + wbuf = bucket_buf; + wpage = BufferGetPage(wbuf); + wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); + + /* + * if there aren't any overflow pages, there's nothing to squeeze. caller + * is responsible for releasing the pin on primary bucket page. + */ + if (!BlockNumberIsValid(wopaque->hasho_nextblkno)) + { + LockBuffer(wbuf, BUFFER_LOCK_UNLOCK); + return; + } + + /* + * Find the last page in the bucket chain by starting at the base bucket + * page and working forward. Note: we assume that a hash bucket chain is + * usually smaller than the buffer ring being used by VACUUM, else using + * the access strategy here would be counterproductive. + */ + rbuf = InvalidBuffer; + ropaque = wopaque; + do + { + rblkno = ropaque->hasho_nextblkno; + if (rbuf != InvalidBuffer) + _hash_relbuf(rel, rbuf); + rbuf = _hash_getbuf_with_strategy(rel, + rblkno, + HASH_WRITE, + LH_OVERFLOW_PAGE, + bstrategy); + rpage = BufferGetPage(rbuf); + ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage); + Assert(ropaque->hasho_bucket == bucket); + } while (BlockNumberIsValid(ropaque->hasho_nextblkno)); + + /* + * squeeze the tuples. + */ + for (;;) + { + OffsetNumber roffnum; + OffsetNumber maxroffnum; + OffsetNumber deletable[MaxOffsetNumber]; + IndexTuple itups[MaxIndexTuplesPerPage]; + Size tups_size[MaxIndexTuplesPerPage]; + OffsetNumber itup_offsets[MaxIndexTuplesPerPage]; + uint16 ndeletable = 0; + uint16 nitups = 0; + Size all_tups_size = 0; + int i; + bool retain_pin = false; + +readpage: + /* Scan each tuple in "read" page */ + maxroffnum = PageGetMaxOffsetNumber(rpage); + for (roffnum = FirstOffsetNumber; + roffnum <= maxroffnum; + roffnum = OffsetNumberNext(roffnum)) + { + IndexTuple itup; + Size itemsz; + + /* skip dead tuples */ + if (ItemIdIsDead(PageGetItemId(rpage, roffnum))) + continue; + + itup = (IndexTuple) PageGetItem(rpage, + PageGetItemId(rpage, roffnum)); + itemsz = IndexTupleSize(itup); + itemsz = MAXALIGN(itemsz); + + /* + * Walk up the bucket chain, looking for a page big enough for + * this item and all other accumulated items. Exit if we reach + * the read page. 
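+ *
+ * ("Walking up" here means following hasho_nextblkno from the current
+ * write page toward the read page, flushing any accumulated tuples to
+ * the current write page before releasing it and advancing.)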
+ */ + while (PageGetFreeSpaceForMultipleTuples(wpage, nitups + 1) < (all_tups_size + itemsz)) + { + Buffer next_wbuf = InvalidBuffer; + bool tups_moved = false; + + Assert(!PageIsEmpty(wpage)); + + if (wblkno == bucket_blkno) + retain_pin = true; + + wblkno = wopaque->hasho_nextblkno; + Assert(BlockNumberIsValid(wblkno)); + + /* don't need to move to next page if we reached the read page */ + if (wblkno != rblkno) + next_wbuf = _hash_getbuf_with_strategy(rel, + wblkno, + HASH_WRITE, + LH_OVERFLOW_PAGE, + bstrategy); + + if (nitups > 0) + { + Assert(nitups == ndeletable); + + /* + * This operation needs to log multiple tuples, prepare + * WAL for that. + */ + if (RelationNeedsWAL(rel)) + XLogEnsureRecordSpace(0, 3 + nitups); + + START_CRIT_SECTION(); + + /* + * we have to insert tuples on the "write" page, being + * careful to preserve hashkey ordering. (If we insert + * many tuples into the same "write" page it would be + * worth qsort'ing them). + */ + _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups); + MarkBufferDirty(wbuf); + + /* Delete tuples we already moved off read page */ + PageIndexMultiDelete(rpage, deletable, ndeletable); + MarkBufferDirty(rbuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + xl_hash_move_page_contents xlrec; + + xlrec.ntups = nitups; + xlrec.is_prim_bucket_same_wrt = (wbuf == bucket_buf) ? true : false; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashMovePageContents); + + /* + * bucket buffer needs to be registered to ensure that + * we can acquire a cleanup lock on it during replay. + */ + if (!xlrec.is_prim_bucket_same_wrt) + XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE); + + XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD); + XLogRegisterBufData(1, (char *) itup_offsets, + nitups * sizeof(OffsetNumber)); + for (i = 0; i < nitups; i++) + XLogRegisterBufData(1, (char *) itups[i], tups_size[i]); + + XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD); + XLogRegisterBufData(2, (char *) deletable, + ndeletable * sizeof(OffsetNumber)); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_MOVE_PAGE_CONTENTS); + + PageSetLSN(BufferGetPage(wbuf), recptr); + PageSetLSN(BufferGetPage(rbuf), recptr); + } + + END_CRIT_SECTION(); + + tups_moved = true; + } + + /* + * release the lock on previous page after acquiring the lock + * on next page + */ + if (retain_pin) + LockBuffer(wbuf, BUFFER_LOCK_UNLOCK); + else + _hash_relbuf(rel, wbuf); + + /* nothing more to do if we reached the read page */ + if (rblkno == wblkno) + { + _hash_relbuf(rel, rbuf); + return; + } + + wbuf = next_wbuf; + wpage = BufferGetPage(wbuf); + wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); + Assert(wopaque->hasho_bucket == bucket); + retain_pin = false; + + /* be tidy */ + for (i = 0; i < nitups; i++) + pfree(itups[i]); + nitups = 0; + all_tups_size = 0; + ndeletable = 0; + + /* + * after moving the tuples, rpage would have been compacted, + * so we need to rescan it. + */ + if (tups_moved) + goto readpage; + } + + /* remember tuple for deletion from "read" page */ + deletable[ndeletable++] = roffnum; + + /* + * we need a copy of index tuples as they can be freed as part of + * overflow page, however we need them to write a WAL record in + * _hash_freeovflpage. + */ + itups[nitups] = CopyIndexTuple(itup); + tups_size[nitups++] = itemsz; + all_tups_size += itemsz; + } + + /* + * If we reach here, there are no live tuples on the "read" page --- + * it was empty when we got to it, or we moved them all. 
So we can + * just free the page without bothering with deleting tuples + * individually. Then advance to the previous "read" page. + * + * Tricky point here: if our read and write pages are adjacent in the + * bucket chain, our write lock on wbuf will conflict with + * _hash_freeovflpage's attempt to update the sibling links of the + * removed page. In that case, we don't need to lock it again. + */ + rblkno = ropaque->hasho_prevblkno; + Assert(BlockNumberIsValid(rblkno)); + + /* free this overflow page (releases rbuf) */ + _hash_freeovflpage(rel, bucket_buf, rbuf, wbuf, itups, itup_offsets, + tups_size, nitups, bstrategy); + + /* be tidy */ + for (i = 0; i < nitups; i++) + pfree(itups[i]); + + /* are we freeing the page adjacent to wbuf? */ + if (rblkno == wblkno) + { + /* retain the pin on primary bucket page till end of bucket scan */ + if (wblkno == bucket_blkno) + LockBuffer(wbuf, BUFFER_LOCK_UNLOCK); + else + _hash_relbuf(rel, wbuf); + return; + } + + rbuf = _hash_getbuf_with_strategy(rel, + rblkno, + HASH_WRITE, + LH_OVERFLOW_PAGE, + bstrategy); + rpage = BufferGetPage(rbuf); + ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage); + Assert(ropaque->hasho_bucket == bucket); + } + + /* NOTREACHED */ +} diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c new file mode 100644 index 0000000..49a9867 --- /dev/null +++ b/src/backend/access/hash/hashpage.c @@ -0,0 +1,1612 @@ +/*------------------------------------------------------------------------- + * + * hashpage.c + * Hash table page management code for the Postgres hash access method + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/hash/hashpage.c + * + * NOTES + * Postgres hash pages look like ordinary relation pages. The opaque + * data at high addresses includes information about the page including + * whether a page is an overflow page or a true bucket, the bucket + * number, and the block numbers of the preceding and following pages + * in the same bucket. + * + * The first page in a hash relation, page zero, is special -- it stores + * information describing the hash table; it is referred to as the + * "meta page." Pages one and higher store the actual data. + * + * There are also bitmap pages, which are not manipulated here; + * see hashovfl.c. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/hash.h" +#include "access/hash_xlog.h" +#include "miscadmin.h" +#include "port/pg_bitutils.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "storage/smgr.h" + +static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock, + uint32 nblocks); +static void _hash_splitbucket(Relation rel, Buffer metabuf, + Bucket obucket, Bucket nbucket, + Buffer obuf, + Buffer nbuf, + HTAB *htab, + uint32 maxbucket, + uint32 highmask, uint32 lowmask); +static void log_split_page(Relation rel, Buffer buf); + + +/* + * _hash_getbuf() -- Get a buffer by block number for read or write. + * + * 'access' must be HASH_READ, HASH_WRITE, or HASH_NOLOCK. + * 'flags' is a bitwise OR of the allowed page types. + * + * This must be used only to fetch pages that are expected to be valid + * already. _hash_checkpage() is applied using the given flags. 
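+ *
+ * A typical call, as used when stepping along a bucket chain, looks
+ * like
+ *		buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
+ * with HASH_READ used instead when only a shared lock is needed.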
+ * + * When this routine returns, the appropriate lock is set on the + * requested buffer and its reference count has been incremented + * (ie, the buffer is "locked and pinned"). + * + * P_NEW is disallowed because this routine can only be used + * to access pages that are known to be before the filesystem EOF. + * Extending the index should be done with _hash_getnewbuf. + */ +Buffer +_hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags) +{ + Buffer buf; + + if (blkno == P_NEW) + elog(ERROR, "hash AM does not use P_NEW"); + + buf = ReadBuffer(rel, blkno); + + if (access != HASH_NOLOCK) + LockBuffer(buf, access); + + /* ref count and lock type are correct */ + + _hash_checkpage(rel, buf, flags); + + return buf; +} + +/* + * _hash_getbuf_with_condlock_cleanup() -- Try to get a buffer for cleanup. + * + * We read the page and try to acquire a cleanup lock. If we get it, + * we return the buffer; otherwise, we return InvalidBuffer. + */ +Buffer +_hash_getbuf_with_condlock_cleanup(Relation rel, BlockNumber blkno, int flags) +{ + Buffer buf; + + if (blkno == P_NEW) + elog(ERROR, "hash AM does not use P_NEW"); + + buf = ReadBuffer(rel, blkno); + + if (!ConditionalLockBufferForCleanup(buf)) + { + ReleaseBuffer(buf); + return InvalidBuffer; + } + + /* ref count and lock type are correct */ + + _hash_checkpage(rel, buf, flags); + + return buf; +} + +/* + * _hash_getinitbuf() -- Get and initialize a buffer by block number. + * + * This must be used only to fetch pages that are known to be before + * the index's filesystem EOF, but are to be filled from scratch. + * _hash_pageinit() is applied automatically. Otherwise it has + * effects similar to _hash_getbuf() with access = HASH_WRITE. + * + * When this routine returns, a write lock is set on the + * requested buffer and its reference count has been incremented + * (ie, the buffer is "locked and pinned"). + * + * P_NEW is disallowed because this routine can only be used + * to access pages that are known to be before the filesystem EOF. + * Extending the index should be done with _hash_getnewbuf. + */ +Buffer +_hash_getinitbuf(Relation rel, BlockNumber blkno) +{ + Buffer buf; + + if (blkno == P_NEW) + elog(ERROR, "hash AM does not use P_NEW"); + + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK, + NULL); + + /* ref count and lock type are correct */ + + /* initialize the page */ + _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf)); + + return buf; +} + +/* + * _hash_initbuf() -- Get and initialize a buffer by bucket number. + */ +void +_hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket, uint32 flag, + bool initpage) +{ + HashPageOpaque pageopaque; + Page page; + + page = BufferGetPage(buf); + + /* initialize the page */ + if (initpage) + _hash_pageinit(page, BufferGetPageSize(buf)); + + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); + + /* + * Set hasho_prevblkno with current hashm_maxbucket. This value will be + * used to validate cached HashMetaPageData. See + * _hash_getbucketbuf_from_hashkey(). + */ + pageopaque->hasho_prevblkno = max_bucket; + pageopaque->hasho_nextblkno = InvalidBlockNumber; + pageopaque->hasho_bucket = num_bucket; + pageopaque->hasho_flag = flag; + pageopaque->hasho_page_id = HASHO_PAGE_ID; +} + +/* + * _hash_getnewbuf() -- Get a new page at the end of the index. + * + * This has the same API as _hash_getinitbuf, except that we are adding + * a page to the index, and hence expect the page to be past the + * logical EOF. 
(However, we have to support the case where it isn't, + * since a prior try might have crashed after extending the filesystem + * EOF but before updating the metapage to reflect the added page.) + * + * It is caller's responsibility to ensure that only one process can + * extend the index at a time. In practice, this function is called + * only while holding write lock on the metapage, because adding a page + * is always associated with an update of metapage data. + */ +Buffer +_hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum) +{ + BlockNumber nblocks = RelationGetNumberOfBlocksInFork(rel, forkNum); + Buffer buf; + + if (blkno == P_NEW) + elog(ERROR, "hash AM does not use P_NEW"); + if (blkno > nblocks) + elog(ERROR, "access to noncontiguous page in hash index \"%s\"", + RelationGetRelationName(rel)); + + /* smgr insists we use P_NEW to extend the relation */ + if (blkno == nblocks) + { + buf = ReadBufferExtended(rel, forkNum, P_NEW, RBM_NORMAL, NULL); + if (BufferGetBlockNumber(buf) != blkno) + elog(ERROR, "unexpected hash relation size: %u, should be %u", + BufferGetBlockNumber(buf), blkno); + LockBuffer(buf, HASH_WRITE); + } + else + { + buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO_AND_LOCK, + NULL); + } + + /* ref count and lock type are correct */ + + /* initialize the page */ + _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf)); + + return buf; +} + +/* + * _hash_getbuf_with_strategy() -- Get a buffer with nondefault strategy. + * + * This is identical to _hash_getbuf() but also allows a buffer access + * strategy to be specified. We use this for VACUUM operations. + */ +Buffer +_hash_getbuf_with_strategy(Relation rel, BlockNumber blkno, + int access, int flags, + BufferAccessStrategy bstrategy) +{ + Buffer buf; + + if (blkno == P_NEW) + elog(ERROR, "hash AM does not use P_NEW"); + + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); + + if (access != HASH_NOLOCK) + LockBuffer(buf, access); + + /* ref count and lock type are correct */ + + _hash_checkpage(rel, buf, flags); + + return buf; +} + +/* + * _hash_relbuf() -- release a locked buffer. + * + * Lock and pin (refcount) are both dropped. + */ +void +_hash_relbuf(Relation rel, Buffer buf) +{ + UnlockReleaseBuffer(buf); +} + +/* + * _hash_dropbuf() -- release an unlocked buffer. + * + * This is used to unpin a buffer on which we hold no lock. + */ +void +_hash_dropbuf(Relation rel, Buffer buf) +{ + ReleaseBuffer(buf); +} + +/* + * _hash_dropscanbuf() -- release buffers used in scan. + * + * This routine unpins the buffers used during scan on which we + * hold no lock. 
+ */ +void +_hash_dropscanbuf(Relation rel, HashScanOpaque so) +{ + /* release pin we hold on primary bucket page */ + if (BufferIsValid(so->hashso_bucket_buf) && + so->hashso_bucket_buf != so->currPos.buf) + _hash_dropbuf(rel, so->hashso_bucket_buf); + so->hashso_bucket_buf = InvalidBuffer; + + /* release pin we hold on primary bucket page of bucket being split */ + if (BufferIsValid(so->hashso_split_bucket_buf) && + so->hashso_split_bucket_buf != so->currPos.buf) + _hash_dropbuf(rel, so->hashso_split_bucket_buf); + so->hashso_split_bucket_buf = InvalidBuffer; + + /* release any pin we still hold */ + if (BufferIsValid(so->currPos.buf)) + _hash_dropbuf(rel, so->currPos.buf); + so->currPos.buf = InvalidBuffer; + + /* reset split scan */ + so->hashso_buc_populated = false; + so->hashso_buc_split = false; +} + + +/* + * _hash_init() -- Initialize the metadata page of a hash index, + * the initial buckets, and the initial bitmap page. + * + * The initial number of buckets is dependent on num_tuples, an estimate + * of the number of tuples to be loaded into the index initially. The + * chosen number of buckets is returned. + * + * We are fairly cavalier about locking here, since we know that no one else + * could be accessing this index. In particular the rule about not holding + * multiple buffer locks is ignored. + */ +uint32 +_hash_init(Relation rel, double num_tuples, ForkNumber forkNum) +{ + Buffer metabuf; + Buffer buf; + Buffer bitmapbuf; + Page pg; + HashMetaPage metap; + RegProcedure procid; + int32 data_width; + int32 item_width; + int32 ffactor; + uint32 num_buckets; + uint32 i; + bool use_wal; + + /* safety check */ + if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0) + elog(ERROR, "cannot initialize non-empty hash index \"%s\"", + RelationGetRelationName(rel)); + + /* + * WAL log creation of pages if the relation is persistent, or this is the + * init fork. Init forks for unlogged relations always need to be WAL + * logged. + */ + use_wal = RelationNeedsWAL(rel) || forkNum == INIT_FORKNUM; + + /* + * Determine the target fill factor (in tuples per bucket) for this index. + * The idea is to make the fill factor correspond to pages about as full + * as the user-settable fillfactor parameter says. We can compute it + * exactly since the index datatype (i.e. uint32 hash key) is fixed-width. + */ + data_width = sizeof(uint32); + item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) + + sizeof(ItemIdData); /* include the line pointer */ + ffactor = HashGetTargetPageUsage(rel) / item_width; + /* keep to a sane range */ + if (ffactor < 10) + ffactor = 10; + + procid = index_getprocid(rel, 1, HASHSTANDARD_PROC); + + /* + * We initialize the metapage, the first N bucket pages, and the first + * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend() + * calls to occur. This ensures that the smgr level has the right idea of + * the physical index length. + * + * Critical section not required, because on error the creation of the + * whole relation will be rolled back. 
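+ *
+ * The resulting physical layout is: block 0 holds the metapage, blocks
+ * 1 .. N the initial bucket pages, and block N+1 the first bitmap page
+ * (see also the layout notes in _hash_init_metabuffer).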
+ */ + metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum); + _hash_init_metabuffer(metabuf, num_tuples, procid, ffactor, false); + MarkBufferDirty(metabuf); + + pg = BufferGetPage(metabuf); + metap = HashPageGetMeta(pg); + + /* XLOG stuff */ + if (use_wal) + { + xl_hash_init_meta_page xlrec; + XLogRecPtr recptr; + + xlrec.num_tuples = num_tuples; + xlrec.procid = metap->hashm_procid; + xlrec.ffactor = metap->hashm_ffactor; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashInitMetaPage); + XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_META_PAGE); + + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + num_buckets = metap->hashm_maxbucket + 1; + + /* + * Release buffer lock on the metapage while we initialize buckets. + * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS + * won't accomplish anything. It's a bad idea to hold buffer locks for + * long intervals in any case, since that can block the bgwriter. + */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + + /* + * Initialize and WAL Log the first N buckets + */ + for (i = 0; i < num_buckets; i++) + { + BlockNumber blkno; + + /* Allow interrupts, in case N is huge */ + CHECK_FOR_INTERRUPTS(); + + blkno = BUCKET_TO_BLKNO(metap, i); + buf = _hash_getnewbuf(rel, blkno, forkNum); + _hash_initbuf(buf, metap->hashm_maxbucket, i, LH_BUCKET_PAGE, false); + MarkBufferDirty(buf); + + if (use_wal) + log_newpage(&rel->rd_node, + forkNum, + blkno, + BufferGetPage(buf), + true); + _hash_relbuf(rel, buf); + } + + /* Now reacquire buffer lock on metapage */ + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + + /* + * Initialize bitmap page + */ + bitmapbuf = _hash_getnewbuf(rel, num_buckets + 1, forkNum); + _hash_initbitmapbuffer(bitmapbuf, metap->hashm_bmsize, false); + MarkBufferDirty(bitmapbuf); + + /* add the new bitmap page to the metapage's list of bitmaps */ + /* metapage already has a write lock */ + if (metap->hashm_nmaps >= HASH_MAX_BITMAPS) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("out of overflow pages in hash index \"%s\"", + RelationGetRelationName(rel)))); + + metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1; + + metap->hashm_nmaps++; + MarkBufferDirty(metabuf); + + /* XLOG stuff */ + if (use_wal) + { + xl_hash_init_bitmap_page xlrec; + XLogRecPtr recptr; + + xlrec.bmsize = metap->hashm_bmsize; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashInitBitmapPage); + XLogRegisterBuffer(0, bitmapbuf, REGBUF_WILL_INIT); + + /* + * This is safe only because nobody else can be modifying the index at + * this stage; it's only visible to the transaction that is creating + * it. + */ + XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_BITMAP_PAGE); + + PageSetLSN(BufferGetPage(bitmapbuf), recptr); + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + /* all done */ + _hash_relbuf(rel, bitmapbuf); + _hash_relbuf(rel, metabuf); + + return num_buckets; +} + +/* + * _hash_init_metabuffer() -- Initialize the metadata page of a hash index. + */ +void +_hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid, + uint16 ffactor, bool initpage) +{ + HashMetaPage metap; + HashPageOpaque pageopaque; + Page page; + double dnumbuckets; + uint32 num_buckets; + uint32 spare_index; + uint32 lshift; + + /* + * Choose the number of initial bucket pages to match the fill factor + * given the estimated number of tuples. 
We round up the result to the + * total number of buckets which has to be allocated before using its + * hashm_spares element. However always force at least 2 bucket pages. The + * upper limit is determined by considerations explained in + * _hash_expandtable(). + */ + dnumbuckets = num_tuples / ffactor; + if (dnumbuckets <= 2.0) + num_buckets = 2; + else if (dnumbuckets >= (double) 0x40000000) + num_buckets = 0x40000000; + else + num_buckets = _hash_get_totalbuckets(_hash_spareindex(dnumbuckets)); + + spare_index = _hash_spareindex(num_buckets); + Assert(spare_index < HASH_MAX_SPLITPOINTS); + + page = BufferGetPage(buf); + if (initpage) + _hash_pageinit(page, BufferGetPageSize(buf)); + + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); + pageopaque->hasho_prevblkno = InvalidBlockNumber; + pageopaque->hasho_nextblkno = InvalidBlockNumber; + pageopaque->hasho_bucket = -1; + pageopaque->hasho_flag = LH_META_PAGE; + pageopaque->hasho_page_id = HASHO_PAGE_ID; + + metap = HashPageGetMeta(page); + + metap->hashm_magic = HASH_MAGIC; + metap->hashm_version = HASH_VERSION; + metap->hashm_ntuples = 0; + metap->hashm_nmaps = 0; + metap->hashm_ffactor = ffactor; + metap->hashm_bsize = HashGetMaxBitmapSize(page); + + /* find largest bitmap array size that will fit in page size */ + lshift = pg_leftmost_one_pos32(metap->hashm_bsize); + Assert(lshift > 0); + metap->hashm_bmsize = 1 << lshift; + metap->hashm_bmshift = lshift + BYTE_TO_BIT; + Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1)); + + /* + * Label the index with its primary hash support function's OID. This is + * pretty useless for normal operation (in fact, hashm_procid is not used + * anywhere), but it might be handy for forensic purposes so we keep it. + */ + metap->hashm_procid = procid; + + /* + * We initialize the index with N buckets, 0 .. N-1, occupying physical + * blocks 1 to N. The first freespace bitmap page is in block N+1. + */ + metap->hashm_maxbucket = num_buckets - 1; + + /* + * Set highmask as next immediate ((2 ^ x) - 1), which should be + * sufficient to cover num_buckets. + */ + metap->hashm_highmask = pg_nextpower2_32(num_buckets + 1) - 1; + metap->hashm_lowmask = (metap->hashm_highmask >> 1); + + MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares)); + MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp)); + + /* Set up mapping for one spare page after the initial splitpoints */ + metap->hashm_spares[spare_index] = 1; + metap->hashm_ovflpoint = spare_index; + metap->hashm_firstfree = 0; + + /* + * Set pd_lower just past the end of the metadata. This is essential, + * because without doing so, metadata will be lost if xlog.c compresses + * the page. + */ + ((PageHeader) page)->pd_lower = + ((char *) metap + sizeof(HashMetaPageData)) - (char *) page; +} + +/* + * _hash_pageinit() -- Initialize a new hash index page. + */ +void +_hash_pageinit(Page page, Size size) +{ + PageInit(page, size, sizeof(HashPageOpaqueData)); +} + +/* + * Attempt to expand the hash table by creating one new bucket. + * + * This will silently do nothing if we don't get cleanup lock on old or + * new bucket. + * + * Complete the pending splits and remove the tuples from old bucket, + * if there are any left over from the previous split. + * + * The caller must hold a pin, but no lock, on the metapage buffer. + * The buffer is returned in the same state. 
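+ *
+ * "One new bucket" here means bucket number hashm_maxbucket + 1; the
+ * bucket it splits from is (new_bucket & hashm_lowmask), and that old
+ * bucket's tuples are redistributed between the two according to their
+ * hash values under the enlarged bucket masks.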
+ */ +void +_hash_expandtable(Relation rel, Buffer metabuf) +{ + HashMetaPage metap; + Bucket old_bucket; + Bucket new_bucket; + uint32 spare_ndx; + BlockNumber start_oblkno; + BlockNumber start_nblkno; + Buffer buf_nblkno; + Buffer buf_oblkno; + Page opage; + Page npage; + HashPageOpaque oopaque; + HashPageOpaque nopaque; + uint32 maxbucket; + uint32 highmask; + uint32 lowmask; + bool metap_update_masks = false; + bool metap_update_splitpoint = false; + +restart_expand: + + /* + * Write-lock the meta page. It used to be necessary to acquire a + * heavyweight lock to begin a split, but that is no longer required. + */ + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + + _hash_checkpage(rel, metabuf, LH_META_PAGE); + metap = HashPageGetMeta(BufferGetPage(metabuf)); + + /* + * Check to see if split is still needed; someone else might have already + * done one while we waited for the lock. + * + * Make sure this stays in sync with _hash_doinsert() + */ + if (metap->hashm_ntuples <= + (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1)) + goto fail; + + /* + * Can't split anymore if maxbucket has reached its maximum possible + * value. + * + * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because + * the calculation maxbucket+1 mustn't overflow). Currently we restrict + * to half that to prevent failure of pg_ceil_log2_32() and insufficient + * space in hashm_spares[]. It's moot anyway because an index with 2^32 + * buckets would certainly overflow BlockNumber and hence + * _hash_alloc_buckets() would fail, but if we supported buckets smaller + * than a disk block then this would be an independent constraint. + * + * If you change this, see also the maximum initial number of buckets in + * _hash_init(). + */ + if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE) + goto fail; + + /* + * Determine which bucket is to be split, and attempt to take cleanup lock + * on the old bucket. If we can't get the lock, give up. + * + * The cleanup lock protects us not only against other backends, but + * against our own backend as well. + * + * The cleanup lock is mainly to protect the split from concurrent + * inserts. See src/backend/access/hash/README, Lock Definitions for + * further details. Due to this locking restriction, if there is any + * pending scan, the split will give up which is not good, but harmless. + */ + new_bucket = metap->hashm_maxbucket + 1; + + old_bucket = (new_bucket & metap->hashm_lowmask); + + start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket); + + buf_oblkno = _hash_getbuf_with_condlock_cleanup(rel, start_oblkno, LH_BUCKET_PAGE); + if (!buf_oblkno) + goto fail; + + opage = BufferGetPage(buf_oblkno); + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); + + /* + * We want to finish the split from a bucket as there is no apparent + * benefit by not doing so and it will make the code complicated to finish + * the split that involves multiple buckets considering the case where new + * split also fails. We don't need to consider the new bucket for + * completing the split here as it is not possible that a re-split of new + * bucket starts when there is still a pending split from old bucket. + */ + if (H_BUCKET_BEING_SPLIT(oopaque)) + { + /* + * Copy bucket mapping info now; refer the comment in code below where + * we copy this information before calling _hash_splitbucket to see + * why this is okay. 
+ */ + maxbucket = metap->hashm_maxbucket; + highmask = metap->hashm_highmask; + lowmask = metap->hashm_lowmask; + + /* + * Release the lock on metapage and old_bucket, before completing the + * split. + */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + LockBuffer(buf_oblkno, BUFFER_LOCK_UNLOCK); + + _hash_finish_split(rel, metabuf, buf_oblkno, old_bucket, maxbucket, + highmask, lowmask); + + /* release the pin on old buffer and retry for expand. */ + _hash_dropbuf(rel, buf_oblkno); + + goto restart_expand; + } + + /* + * Clean the tuples remained from the previous split. This operation + * requires cleanup lock and we already have one on the old bucket, so + * let's do it. We also don't want to allow further splits from the bucket + * till the garbage of previous split is cleaned. This has two + * advantages; first, it helps in avoiding the bloat due to garbage and + * second is, during cleanup of bucket, we are always sure that the + * garbage tuples belong to most recently split bucket. On the contrary, + * if we allow cleanup of bucket after meta page is updated to indicate + * the new split and before the actual split, the cleanup operation won't + * be able to decide whether the tuple has been moved to the newly created + * bucket and ended up deleting such tuples. + */ + if (H_NEEDS_SPLIT_CLEANUP(oopaque)) + { + /* + * Copy bucket mapping info now; refer to the comment in code below + * where we copy this information before calling _hash_splitbucket to + * see why this is okay. + */ + maxbucket = metap->hashm_maxbucket; + highmask = metap->hashm_highmask; + lowmask = metap->hashm_lowmask; + + /* Release the metapage lock. */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + + hashbucketcleanup(rel, old_bucket, buf_oblkno, start_oblkno, NULL, + maxbucket, highmask, lowmask, NULL, NULL, true, + NULL, NULL); + + _hash_dropbuf(rel, buf_oblkno); + + goto restart_expand; + } + + /* + * There shouldn't be any active scan on new bucket. + * + * Note: it is safe to compute the new bucket's blkno here, even though we + * may still need to update the BUCKET_TO_BLKNO mapping. This is because + * the current value of hashm_spares[hashm_ovflpoint] correctly shows + * where we are going to put a new splitpoint's worth of buckets. + */ + start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket); + + /* + * If the split point is increasing we need to allocate a new batch of + * bucket pages. + */ + spare_ndx = _hash_spareindex(new_bucket + 1); + if (spare_ndx > metap->hashm_ovflpoint) + { + uint32 buckets_to_add; + + Assert(spare_ndx == metap->hashm_ovflpoint + 1); + + /* + * We treat allocation of buckets as a separate WAL-logged action. + * Even if we fail after this operation, won't leak bucket pages; + * rather, the next split will consume this space. In any case, even + * without failure we don't use all the space in one split operation. + */ + buckets_to_add = _hash_get_totalbuckets(spare_ndx) - new_bucket; + if (!_hash_alloc_buckets(rel, start_nblkno, buckets_to_add)) + { + /* can't split due to BlockNumber overflow */ + _hash_relbuf(rel, buf_oblkno); + goto fail; + } + } + + /* + * Physically allocate the new bucket's primary page. We want to do this + * before changing the metapage's mapping info, in case we can't get the + * disk space. Ideally, we don't need to check for cleanup lock on new + * bucket as no other backend could find this bucket unless meta page is + * updated. However, it is good to be consistent with old bucket locking. 
+ */ + buf_nblkno = _hash_getnewbuf(rel, start_nblkno, MAIN_FORKNUM); + if (!IsBufferCleanupOK(buf_nblkno)) + { + _hash_relbuf(rel, buf_oblkno); + _hash_relbuf(rel, buf_nblkno); + goto fail; + } + + /* + * Since we are scribbling on the pages in the shared buffers, establish a + * critical section. Any failure in this next code leaves us with a big + * problem: the metapage is effectively corrupt but could get written back + * to disk. + */ + START_CRIT_SECTION(); + + /* + * Okay to proceed with split. Update the metapage bucket mapping info. + */ + metap->hashm_maxbucket = new_bucket; + + if (new_bucket > metap->hashm_highmask) + { + /* Starting a new doubling */ + metap->hashm_lowmask = metap->hashm_highmask; + metap->hashm_highmask = new_bucket | metap->hashm_lowmask; + metap_update_masks = true; + } + + /* + * If the split point is increasing we need to adjust the hashm_spares[] + * array and hashm_ovflpoint so that future overflow pages will be created + * beyond this new batch of bucket pages. + */ + if (spare_ndx > metap->hashm_ovflpoint) + { + metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint]; + metap->hashm_ovflpoint = spare_ndx; + metap_update_splitpoint = true; + } + + MarkBufferDirty(metabuf); + + /* + * Copy bucket mapping info now; this saves re-accessing the meta page + * inside _hash_splitbucket's inner loop. Note that once we drop the + * split lock, other splits could begin, so these values might be out of + * date before _hash_splitbucket finishes. That's okay, since all it + * needs is to tell which of these two buckets to map hashkeys into. + */ + maxbucket = metap->hashm_maxbucket; + highmask = metap->hashm_highmask; + lowmask = metap->hashm_lowmask; + + opage = BufferGetPage(buf_oblkno); + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); + + /* + * Mark the old bucket to indicate that split is in progress. (At + * operation end, we will clear the split-in-progress flag.) Also, for a + * primary bucket page, hasho_prevblkno stores the number of buckets that + * existed as of the last split, so we must update that value here. + */ + oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT; + oopaque->hasho_prevblkno = maxbucket; + + MarkBufferDirty(buf_oblkno); + + npage = BufferGetPage(buf_nblkno); + + /* + * initialize the new bucket's primary page and mark it to indicate that + * split is in progress. 
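+ *
+ * As with the old bucket above, hasho_prevblkno of a primary bucket
+ * page caches the hashm_maxbucket value current at the time of the
+ * split, which _hash_getbucketbuf_from_hashkey() uses to validate its
+ * cached copy of the metapage.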
+ */ + nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); + nopaque->hasho_prevblkno = maxbucket; + nopaque->hasho_nextblkno = InvalidBlockNumber; + nopaque->hasho_bucket = new_bucket; + nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED; + nopaque->hasho_page_id = HASHO_PAGE_ID; + + MarkBufferDirty(buf_nblkno); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_hash_split_allocate_page xlrec; + XLogRecPtr recptr; + + xlrec.new_bucket = maxbucket; + xlrec.old_bucket_flag = oopaque->hasho_flag; + xlrec.new_bucket_flag = nopaque->hasho_flag; + xlrec.flags = 0; + + XLogBeginInsert(); + + XLogRegisterBuffer(0, buf_oblkno, REGBUF_STANDARD); + XLogRegisterBuffer(1, buf_nblkno, REGBUF_WILL_INIT); + XLogRegisterBuffer(2, metabuf, REGBUF_STANDARD); + + if (metap_update_masks) + { + xlrec.flags |= XLH_SPLIT_META_UPDATE_MASKS; + XLogRegisterBufData(2, (char *) &metap->hashm_lowmask, sizeof(uint32)); + XLogRegisterBufData(2, (char *) &metap->hashm_highmask, sizeof(uint32)); + } + + if (metap_update_splitpoint) + { + xlrec.flags |= XLH_SPLIT_META_UPDATE_SPLITPOINT; + XLogRegisterBufData(2, (char *) &metap->hashm_ovflpoint, + sizeof(uint32)); + XLogRegisterBufData(2, + (char *) &metap->hashm_spares[metap->hashm_ovflpoint], + sizeof(uint32)); + } + + XLogRegisterData((char *) &xlrec, SizeOfHashSplitAllocPage); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_ALLOCATE_PAGE); + + PageSetLSN(BufferGetPage(buf_oblkno), recptr); + PageSetLSN(BufferGetPage(buf_nblkno), recptr); + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + END_CRIT_SECTION(); + + /* drop lock, but keep pin */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + + /* Relocate records to the new bucket */ + _hash_splitbucket(rel, metabuf, + old_bucket, new_bucket, + buf_oblkno, buf_nblkno, NULL, + maxbucket, highmask, lowmask); + + /* all done, now release the pins on primary buckets. */ + _hash_dropbuf(rel, buf_oblkno); + _hash_dropbuf(rel, buf_nblkno); + + return; + + /* Here if decide not to split or fail to acquire old bucket lock */ +fail: + + /* We didn't write the metapage, so just drop lock */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); +} + + +/* + * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages + * + * This does not need to initialize the new bucket pages; we'll do that as + * each one is used by _hash_expandtable(). But we have to extend the logical + * EOF to the end of the splitpoint; this keeps smgr's idea of the EOF in + * sync with ours, so that we don't get complaints from smgr. + * + * We do this by writing a page of zeroes at the end of the splitpoint range. + * We expect that the filesystem will ensure that the intervening pages read + * as zeroes too. On many filesystems this "hole" will not be allocated + * immediately, which means that the index file may end up more fragmented + * than if we forced it all to be allocated now; but since we don't scan + * hash indexes sequentially anyway, that probably doesn't matter. + * + * XXX It's annoying that this code is executed with the metapage lock held. + * We need to interlock against _hash_addovflpage() adding a new overflow page + * concurrently, but it'd likely be better to use LockRelationForExtension + * for the purpose. OTOH, adding a splitpoint is a very infrequent operation, + * so it may not be worth worrying about. + * + * Returns true if successful, or false if allocation failed due to + * BlockNumber overflow. 
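+ *
+ * For example, if a new splitpoint calls for nblocks more bucket pages
+ * starting at firstblock, we smgrextend() just one page, at block
+ * firstblock + nblocks - 1; the filesystem is expected to return
+ * zeroes for the intervening blocks until they are actually used.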
+ */ +static bool +_hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks) +{ + BlockNumber lastblock; + PGAlignedBlock zerobuf; + Page page; + HashPageOpaque ovflopaque; + + lastblock = firstblock + nblocks - 1; + + /* + * Check for overflow in block number calculation; if so, we cannot extend + * the index anymore. + */ + if (lastblock < firstblock || lastblock == InvalidBlockNumber) + return false; + + page = (Page) zerobuf.data; + + /* + * Initialize the page. Just zeroing the page won't work; see + * _hash_freeovflpage for similar usage. We take care to make the special + * space valid for the benefit of tools such as pageinspect. + */ + _hash_pageinit(page, BLCKSZ); + + ovflopaque = (HashPageOpaque) PageGetSpecialPointer(page); + + ovflopaque->hasho_prevblkno = InvalidBlockNumber; + ovflopaque->hasho_nextblkno = InvalidBlockNumber; + ovflopaque->hasho_bucket = -1; + ovflopaque->hasho_flag = LH_UNUSED_PAGE; + ovflopaque->hasho_page_id = HASHO_PAGE_ID; + + if (RelationNeedsWAL(rel)) + log_newpage(&rel->rd_node, + MAIN_FORKNUM, + lastblock, + zerobuf.data, + true); + + RelationOpenSmgr(rel); + PageSetChecksumInplace(page, lastblock); + smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf.data, false); + + return true; +} + + +/* + * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket' + * + * This routine is used to partition the tuples between old and new bucket and + * is used to finish the incomplete split operations. To finish the previously + * interrupted split operation, the caller needs to fill htab. If htab is set, + * then we skip the movement of tuples that exists in htab, otherwise NULL + * value of htab indicates movement of all the tuples that belong to the new + * bucket. + * + * We are splitting a bucket that consists of a base bucket page and zero + * or more overflow (bucket chain) pages. We must relocate tuples that + * belong in the new bucket. + * + * The caller must hold cleanup locks on both buckets to ensure that + * no one else is trying to access them (see README). + * + * The caller must hold a pin, but no lock, on the metapage buffer. + * The buffer is returned in the same state. (The metapage is only + * touched if it becomes necessary to add or remove overflow pages.) + * + * Split needs to retain pin on primary bucket pages of both old and new + * buckets till end of operation. This is to prevent vacuum from starting + * while a split is in progress. + * + * In addition, the caller must have created the new bucket's base page, + * which is passed in buffer nbuf, pinned and write-locked. The lock will be + * released here and pin must be released by the caller. (The API is set up + * this way because we must do _hash_getnewbuf() before releasing the metapage + * write lock. So instead of passing the new bucket's start block number, we + * pass an actual buffer.) 
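+ *
+ * Tuples are routed by recomputing each item's bucket with
+ * _hash_hashkey2bucket(hashkey, maxbucket, highmask, lowmask); items
+ * that now map to nbucket are copied there (and marked with
+ * INDEX_MOVED_BY_SPLIT_MASK), while the rest stay in obucket.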
+ */ +static void +_hash_splitbucket(Relation rel, + Buffer metabuf, + Bucket obucket, + Bucket nbucket, + Buffer obuf, + Buffer nbuf, + HTAB *htab, + uint32 maxbucket, + uint32 highmask, + uint32 lowmask) +{ + Buffer bucket_obuf; + Buffer bucket_nbuf; + Page opage; + Page npage; + HashPageOpaque oopaque; + HashPageOpaque nopaque; + OffsetNumber itup_offsets[MaxIndexTuplesPerPage]; + IndexTuple itups[MaxIndexTuplesPerPage]; + Size all_tups_size = 0; + int i; + uint16 nitups = 0; + + bucket_obuf = obuf; + opage = BufferGetPage(obuf); + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); + + bucket_nbuf = nbuf; + npage = BufferGetPage(nbuf); + nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); + + /* Copy the predicate locks from old bucket to new bucket. */ + PredicateLockPageSplit(rel, + BufferGetBlockNumber(bucket_obuf), + BufferGetBlockNumber(bucket_nbuf)); + + /* + * Partition the tuples in the old bucket between the old bucket and the + * new bucket, advancing along the old bucket's overflow bucket chain and + * adding overflow pages to the new bucket as needed. Outer loop iterates + * once per page in old bucket. + */ + for (;;) + { + BlockNumber oblkno; + OffsetNumber ooffnum; + OffsetNumber omaxoffnum; + + /* Scan each tuple in old page */ + omaxoffnum = PageGetMaxOffsetNumber(opage); + for (ooffnum = FirstOffsetNumber; + ooffnum <= omaxoffnum; + ooffnum = OffsetNumberNext(ooffnum)) + { + IndexTuple itup; + Size itemsz; + Bucket bucket; + bool found = false; + + /* skip dead tuples */ + if (ItemIdIsDead(PageGetItemId(opage, ooffnum))) + continue; + + /* + * Before inserting a tuple, probe the hash table containing TIDs + * of tuples belonging to new bucket, if we find a match, then + * skip that tuple, else fetch the item's hash key (conveniently + * stored in the item) and determine which bucket it now belongs + * in. + */ + itup = (IndexTuple) PageGetItem(opage, + PageGetItemId(opage, ooffnum)); + + if (htab) + (void) hash_search(htab, &itup->t_tid, HASH_FIND, &found); + + if (found) + continue; + + bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), + maxbucket, highmask, lowmask); + + if (bucket == nbucket) + { + IndexTuple new_itup; + + /* + * make a copy of index tuple as we have to scribble on it. + */ + new_itup = CopyIndexTuple(itup); + + /* + * mark the index tuple as moved by split, such tuples are + * skipped by scan if there is split in progress for a bucket. + */ + new_itup->t_info |= INDEX_MOVED_BY_SPLIT_MASK; + + /* + * insert the tuple into the new bucket. if it doesn't fit on + * the current page in the new bucket, we must allocate a new + * overflow page and place the tuple on that page instead. + */ + itemsz = IndexTupleSize(new_itup); + itemsz = MAXALIGN(itemsz); + + if (PageGetFreeSpaceForMultipleTuples(npage, nitups + 1) < (all_tups_size + itemsz)) + { + /* + * Change the shared buffer state in critical section, + * otherwise any error could make it unrecoverable. + */ + START_CRIT_SECTION(); + + _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups); + MarkBufferDirty(nbuf); + /* log the split operation before releasing the lock */ + log_split_page(rel, nbuf); + + END_CRIT_SECTION(); + + /* drop lock, but keep pin */ + LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); + + /* be tidy */ + for (i = 0; i < nitups; i++) + pfree(itups[i]); + nitups = 0; + all_tups_size = 0; + + /* chain to a new overflow page */ + nbuf = _hash_addovflpage(rel, metabuf, nbuf, (nbuf == bucket_nbuf) ? 
true : false); + npage = BufferGetPage(nbuf); + nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); + } + + itups[nitups++] = new_itup; + all_tups_size += itemsz; + } + else + { + /* + * the tuple stays on this page, so nothing to do. + */ + Assert(bucket == obucket); + } + } + + oblkno = oopaque->hasho_nextblkno; + + /* retain the pin on the old primary bucket */ + if (obuf == bucket_obuf) + LockBuffer(obuf, BUFFER_LOCK_UNLOCK); + else + _hash_relbuf(rel, obuf); + + /* Exit loop if no more overflow pages in old bucket */ + if (!BlockNumberIsValid(oblkno)) + { + /* + * Change the shared buffer state in critical section, otherwise + * any error could make it unrecoverable. + */ + START_CRIT_SECTION(); + + _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups); + MarkBufferDirty(nbuf); + /* log the split operation before releasing the lock */ + log_split_page(rel, nbuf); + + END_CRIT_SECTION(); + + if (nbuf == bucket_nbuf) + LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); + else + _hash_relbuf(rel, nbuf); + + /* be tidy */ + for (i = 0; i < nitups; i++) + pfree(itups[i]); + break; + } + + /* Else, advance to next old page */ + obuf = _hash_getbuf(rel, oblkno, HASH_READ, LH_OVERFLOW_PAGE); + opage = BufferGetPage(obuf); + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); + } + + /* + * We're at the end of the old bucket chain, so we're done partitioning + * the tuples. Mark the old and new buckets to indicate split is + * finished. + * + * To avoid deadlocks due to locking order of buckets, first lock the old + * bucket and then the new bucket. + */ + LockBuffer(bucket_obuf, BUFFER_LOCK_EXCLUSIVE); + opage = BufferGetPage(bucket_obuf); + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); + + LockBuffer(bucket_nbuf, BUFFER_LOCK_EXCLUSIVE); + npage = BufferGetPage(bucket_nbuf); + nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); + + START_CRIT_SECTION(); + + oopaque->hasho_flag &= ~LH_BUCKET_BEING_SPLIT; + nopaque->hasho_flag &= ~LH_BUCKET_BEING_POPULATED; + + /* + * After the split is finished, mark the old bucket to indicate that it + * contains deletable tuples. We will clear split-cleanup flag after + * deleting such tuples either at the end of split or at the next split + * from old bucket or at the time of vacuum. + */ + oopaque->hasho_flag |= LH_BUCKET_NEEDS_SPLIT_CLEANUP; + + /* + * now write the buffers, here we don't release the locks as caller is + * responsible to release locks. + */ + MarkBufferDirty(bucket_obuf); + MarkBufferDirty(bucket_nbuf); + + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + xl_hash_split_complete xlrec; + + xlrec.old_bucket_flag = oopaque->hasho_flag; + xlrec.new_bucket_flag = nopaque->hasho_flag; + + XLogBeginInsert(); + + XLogRegisterData((char *) &xlrec, SizeOfHashSplitComplete); + + XLogRegisterBuffer(0, bucket_obuf, REGBUF_STANDARD); + XLogRegisterBuffer(1, bucket_nbuf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_COMPLETE); + + PageSetLSN(BufferGetPage(bucket_obuf), recptr); + PageSetLSN(BufferGetPage(bucket_nbuf), recptr); + } + + END_CRIT_SECTION(); + + /* + * If possible, clean up the old bucket. We might not be able to do this + * if someone else has a pin on it, but if not then we can go ahead. This + * isn't absolutely necessary, but it reduces bloat; if we don't do it + * now, VACUUM will do it eventually, but maybe not until new overflow + * pages have been allocated. Note that there's no need to clean up the + * new bucket. 
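The relocation loop above batches its work: tuples destined for the new bucket are accumulated and written with _hash_pgaddmultitup only when the next tuple would no longer fit. Below is a stripped-down sketch of that accumulate-then-flush pattern; the free-space figure and flush routine are placeholders, not PostgreSQL APIs, and the real code additionally accounts for line-pointer overhead via PageGetFreeSpaceForMultipleTuples.

    /* Illustrative sketch only; sizes and names are invented. */
    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_FREE_SPACE 8192    /* pretend free space on the target page */
    #define MAX_BATCH       128

    static void
    flush_batch(const size_t *sizes, int n)
    {
        /* stand-in for "add the whole batch to the page, then start fresh" */
        size_t total = 0;

        for (int i = 0; i < n; i++)
            total += sizes[i];
        printf("flushed %d items (%zu bytes)\n", n, total);
    }

    int
    main(void)
    {
        size_t batch[MAX_BATCH];
        size_t batched_size = 0;
        int nbatched = 0;
        /* a made-up stream of (aligned) tuple sizes to relocate */
        size_t incoming[] = {180, 2200, 970, 4100, 512, 3000, 128};

        for (size_t i = 0; i < sizeof(incoming) / sizeof(incoming[0]); i++)
        {
            size_t itemsz = incoming[i];

            /* flush if this item would overflow the page or the batch array */
            if (nbatched == MAX_BATCH ||
                batched_size + itemsz > PAGE_FREE_SPACE)
            {
                flush_batch(batch, nbatched);
                nbatched = 0;
                batched_size = 0;
            }
            batch[nbatched++] = itemsz;
            batched_size += itemsz;
        }
        if (nbatched > 0)
            flush_batch(batch, nbatched);   /* final partial batch */
        return 0;
    }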
+ */ + if (IsBufferCleanupOK(bucket_obuf)) + { + LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK); + hashbucketcleanup(rel, obucket, bucket_obuf, + BufferGetBlockNumber(bucket_obuf), NULL, + maxbucket, highmask, lowmask, NULL, NULL, true, + NULL, NULL); + } + else + { + LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK); + LockBuffer(bucket_obuf, BUFFER_LOCK_UNLOCK); + } +} + +/* + * _hash_finish_split() -- Finish the previously interrupted split operation + * + * To complete the split operation, we form the hash table of TIDs in new + * bucket which is then used by split operation to skip tuples that are + * already moved before the split operation was previously interrupted. + * + * The caller must hold a pin, but no lock, on the metapage and old bucket's + * primary page buffer. The buffers are returned in the same state. (The + * metapage is only touched if it becomes necessary to add or remove overflow + * pages.) + */ +void +_hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket, + uint32 maxbucket, uint32 highmask, uint32 lowmask) +{ + HASHCTL hash_ctl; + HTAB *tidhtab; + Buffer bucket_nbuf = InvalidBuffer; + Buffer nbuf; + Page npage; + BlockNumber nblkno; + BlockNumber bucket_nblkno; + HashPageOpaque npageopaque; + Bucket nbucket; + bool found; + + /* Initialize hash tables used to track TIDs */ + hash_ctl.keysize = sizeof(ItemPointerData); + hash_ctl.entrysize = sizeof(ItemPointerData); + hash_ctl.hcxt = CurrentMemoryContext; + + tidhtab = + hash_create("bucket ctids", + 256, /* arbitrary initial size */ + &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + bucket_nblkno = nblkno = _hash_get_newblock_from_oldbucket(rel, obucket); + + /* + * Scan the new bucket and build hash table of TIDs + */ + for (;;) + { + OffsetNumber noffnum; + OffsetNumber nmaxoffnum; + + nbuf = _hash_getbuf(rel, nblkno, HASH_READ, + LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); + + /* remember the primary bucket buffer to acquire cleanup lock on it. */ + if (nblkno == bucket_nblkno) + bucket_nbuf = nbuf; + + npage = BufferGetPage(nbuf); + npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage); + + /* Scan each tuple in new page */ + nmaxoffnum = PageGetMaxOffsetNumber(npage); + for (noffnum = FirstOffsetNumber; + noffnum <= nmaxoffnum; + noffnum = OffsetNumberNext(noffnum)) + { + IndexTuple itup; + + /* Fetch the item's TID and insert it in hash table. */ + itup = (IndexTuple) PageGetItem(npage, + PageGetItemId(npage, noffnum)); + + (void) hash_search(tidhtab, &itup->t_tid, HASH_ENTER, &found); + + Assert(!found); + } + + nblkno = npageopaque->hasho_nextblkno; + + /* + * release our write lock without modifying buffer and ensure to + * retain the pin on primary bucket. + */ + if (nbuf == bucket_nbuf) + LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); + else + _hash_relbuf(rel, nbuf); + + /* Exit loop if no more overflow pages in new bucket */ + if (!BlockNumberIsValid(nblkno)) + break; + } + + /* + * Conditionally get the cleanup lock on old and new buckets to perform + * the split operation. If we don't get the cleanup locks, silently give + * up and next insertion on old bucket will try again to complete the + * split. 
+ */ + if (!ConditionalLockBufferForCleanup(obuf)) + { + hash_destroy(tidhtab); + return; + } + if (!ConditionalLockBufferForCleanup(bucket_nbuf)) + { + LockBuffer(obuf, BUFFER_LOCK_UNLOCK); + hash_destroy(tidhtab); + return; + } + + npage = BufferGetPage(bucket_nbuf); + npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage); + nbucket = npageopaque->hasho_bucket; + + _hash_splitbucket(rel, metabuf, obucket, + nbucket, obuf, bucket_nbuf, tidhtab, + maxbucket, highmask, lowmask); + + _hash_dropbuf(rel, bucket_nbuf); + hash_destroy(tidhtab); +} + +/* + * log_split_page() -- Log the split operation + * + * We log the split operation when the new page in new bucket gets full, + * so we log the entire page. + * + * 'buf' must be locked by the caller which is also responsible for unlocking + * it. + */ +static void +log_split_page(Relation rel, Buffer buf) +{ + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + + XLogBeginInsert(); + + XLogRegisterBuffer(0, buf, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_PAGE); + + PageSetLSN(BufferGetPage(buf), recptr); + } +} + +/* + * _hash_getcachedmetap() -- Returns cached metapage data. + * + * If metabuf is not InvalidBuffer, caller must hold a pin, but no lock, on + * the metapage. If not set, we'll set it before returning if we have to + * refresh the cache, and return with a pin but no lock on it; caller is + * responsible for releasing the pin. + * + * We refresh the cache if it's not initialized yet or force_refresh is true. + */ +HashMetaPage +_hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh) +{ + Page page; + + Assert(metabuf); + if (force_refresh || rel->rd_amcache == NULL) + { + char *cache = NULL; + + /* + * It's important that we don't set rd_amcache to an invalid value. + * Either MemoryContextAlloc or _hash_getbuf could fail, so don't + * install a pointer to the newly-allocated storage in the actual + * relcache entry until both have succeeded. + */ + if (rel->rd_amcache == NULL) + cache = MemoryContextAlloc(rel->rd_indexcxt, + sizeof(HashMetaPageData)); + + /* Read the metapage. */ + if (BufferIsValid(*metabuf)) + LockBuffer(*metabuf, BUFFER_LOCK_SHARE); + else + *metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, + LH_META_PAGE); + page = BufferGetPage(*metabuf); + + /* Populate the cache. */ + if (rel->rd_amcache == NULL) + rel->rd_amcache = cache; + memcpy(rel->rd_amcache, HashPageGetMeta(page), + sizeof(HashMetaPageData)); + + /* Release metapage lock, but keep the pin. */ + LockBuffer(*metabuf, BUFFER_LOCK_UNLOCK); + } + + return (HashMetaPage) rel->rd_amcache; +} + +/* + * _hash_getbucketbuf_from_hashkey() -- Get the bucket's buffer for the given + * hashkey. + * + * Bucket pages do not move or get removed once they are allocated. This give + * us an opportunity to use the previously saved metapage contents to reach + * the target bucket buffer, instead of reading from the metapage every time. + * This saves one buffer access every time we want to reach the target bucket + * buffer, which is very helpful savings in bufmgr traffic and contention. + * + * The access type parameter (HASH_READ or HASH_WRITE) indicates whether the + * bucket buffer has to be locked for reading or writing. + * + * The out parameter cachedmetap is set with metapage contents used for + * hashkey to bucket buffer mapping. Some callers need this info to reach the + * old bucket in case of bucket split, see _hash_doinsert(). 
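The retry loop this comment describes is easy to model in isolation: map the key with the cached masks, and if the primary bucket page says it was split past the cached maxbucket, refresh the cache and try again. The structs and "shared" state below are illustrative stand-ins, not the PostgreSQL buffer or metapage APIs; the max_at_last_split field plays the role that hasho_prevblkno plays on primary bucket pages.

    /* Illustrative sketch only; types and values are invented. */
    #include <stdint.h>
    #include <stdio.h>

    typedef struct
    {
        uint32_t maxbucket;
        uint32_t highmask;
        uint32_t lowmask;
    } MetaSnapshot;                 /* stand-in for the cached metapage fields */

    typedef struct
    {
        uint32_t bucket;
        uint32_t max_at_last_split; /* role of hasho_prevblkno on primary pages */
    } BucketPage;

    /* "shared" state: the real metapage and bucket pages, already split once */
    static MetaSnapshot shared_meta = {5, 7, 3};
    static BucketPage   bucket_pages[8];

    static uint32_t
    key_to_bucket(uint32_t hash, const MetaSnapshot *m)
    {
        uint32_t b = hash & m->highmask;

        return (b > m->maxbucket) ? (b & m->lowmask) : b;
    }

    int
    main(void)
    {
        /* bucket 1 was split after our cache was taken, creating bucket 5 */
        for (int i = 0; i < 8; i++)
            bucket_pages[i].bucket = i;
        bucket_pages[1].max_at_last_split = 5;
        bucket_pages[5].max_at_last_split = 5;

        MetaSnapshot cache = {3, 3, 1};     /* stale snapshot, pre-split */
        uint32_t hash = 0x25;               /* now belongs in bucket 5 */

        for (;;)
        {
            uint32_t b = key_to_bucket(hash, &cache);
            BucketPage *page = &bucket_pages[b];

            if (page->max_at_last_split <= cache.maxbucket)
            {
                printf("settled on bucket %u\n", b);
                break;
            }
            printf("bucket %u split past cached maxbucket %u; refreshing\n",
                   b, cache.maxbucket);
            cache = shared_meta;            /* re-read the metapage */
        }
        return 0;
    }

As the _hash_getcachedmetap comment above notes, the real code also takes care to publish rd_amcache only after both the allocation and the metapage read have succeeded.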
+ */ +Buffer +_hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, int access, + HashMetaPage *cachedmetap) +{ + HashMetaPage metap; + Buffer buf; + Buffer metabuf = InvalidBuffer; + Page page; + Bucket bucket; + BlockNumber blkno; + HashPageOpaque opaque; + + /* We read from target bucket buffer, hence locking is must. */ + Assert(access == HASH_READ || access == HASH_WRITE); + + metap = _hash_getcachedmetap(rel, &metabuf, false); + Assert(metap != NULL); + + /* + * Loop until we get a lock on the correct target bucket. + */ + for (;;) + { + /* + * Compute the target bucket number, and convert to block number. + */ + bucket = _hash_hashkey2bucket(hashkey, + metap->hashm_maxbucket, + metap->hashm_highmask, + metap->hashm_lowmask); + + blkno = BUCKET_TO_BLKNO(metap, bucket); + + /* Fetch the primary bucket page for the bucket */ + buf = _hash_getbuf(rel, blkno, access, LH_BUCKET_PAGE); + page = BufferGetPage(buf); + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + Assert(opaque->hasho_bucket == bucket); + Assert(opaque->hasho_prevblkno != InvalidBlockNumber); + + /* + * If this bucket hasn't been split, we're done. + */ + if (opaque->hasho_prevblkno <= metap->hashm_maxbucket) + break; + + /* Drop lock on this buffer, update cached metapage, and retry. */ + _hash_relbuf(rel, buf); + metap = _hash_getcachedmetap(rel, &metabuf, true); + Assert(metap != NULL); + } + + if (BufferIsValid(metabuf)) + _hash_dropbuf(rel, metabuf); + + if (cachedmetap) + *cachedmetap = metap; + + return buf; +} diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c new file mode 100644 index 0000000..2ffa28e --- /dev/null +++ b/src/backend/access/hash/hashsearch.c @@ -0,0 +1,721 @@ +/*------------------------------------------------------------------------- + * + * hashsearch.c + * search code for postgres hash tables + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/hash/hashsearch.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/hash.h" +#include "access/relscan.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/predicate.h" +#include "utils/rel.h" + +static bool _hash_readpage(IndexScanDesc scan, Buffer *bufP, + ScanDirection dir); +static int _hash_load_qualified_items(IndexScanDesc scan, Page page, + OffsetNumber offnum, ScanDirection dir); +static inline void _hash_saveitem(HashScanOpaque so, int itemIndex, + OffsetNumber offnum, IndexTuple itup); +static void _hash_readnext(IndexScanDesc scan, Buffer *bufp, + Page *pagep, HashPageOpaque *opaquep); + +/* + * _hash_next() -- Get the next item in a scan. + * + * On entry, so->currPos describes the current page, which may + * be pinned but not locked, and so->currPos.itemIndex identifies + * which item was previously returned. + * + * On successful exit, scan->xs_ctup.t_self is set to the TID + * of the next heap tuple. so->currPos is updated as needed. + * + * On failure exit (no more tuples), we return false with pin + * held on bucket page but no pins or locks held on overflow + * page. 
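The contract just described means _hash_next mostly moves a cursor through a batch of matching items already saved in so->currPos, touching index pages only when the batch runs out. A tiny standalone model of that cursor movement follows (forward direction only; the page-fetch step is a stub, and the fields are simplified stand-ins for the real scan-position struct).

    /* Illustrative sketch only; not the PostgreSQL scan machinery. */
    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_ITEMS 8

    typedef struct
    {
        int items[MAX_ITEMS];   /* stand-ins for saved (TID, offset) entries */
        int firstItem;
        int lastItem;
        int itemIndex;          /* index of the item previously returned */
        int nextPage;           /* -1 means no further page */
    } ScanPos;

    /* Stub for "read the next page": loads a new batch, or reports the end. */
    static bool
    load_next_batch(ScanPos *pos)
    {
        if (pos->nextPage < 0)
            return false;
        for (int i = 0; i < 3; i++)
            pos->items[i] = 100 * pos->nextPage + i;
        pos->firstItem = 0;
        pos->lastItem = 2;
        pos->itemIndex = 0;
        pos->nextPage = -1;     /* pretend that was the last page */
        return true;
    }

    static bool
    scan_next(ScanPos *pos, int *out)
    {
        if (++pos->itemIndex > pos->lastItem)
        {
            if (!load_next_batch(pos))
                return false;   /* end of scan */
        }
        *out = pos->items[pos->itemIndex];
        return true;
    }

    int
    main(void)
    {
        ScanPos pos = {{10, 11}, 0, 1, 0, 4};   /* first batch already loaded */
        int val = pos.items[pos.itemIndex];     /* first call returned items[0] */

        printf("%d\n", val);
        while (scan_next(&pos, &val))
            printf("%d\n", val);
        return 0;
    }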
+ */ +bool +_hash_next(IndexScanDesc scan, ScanDirection dir) +{ + Relation rel = scan->indexRelation; + HashScanOpaque so = (HashScanOpaque) scan->opaque; + HashScanPosItem *currItem; + BlockNumber blkno; + Buffer buf; + bool end_of_scan = false; + + /* + * Advance to the next tuple on the current page; or if done, try to read + * data from the next or previous page based on the scan direction. Before + * moving to the next or previous page make sure that we deal with all the + * killed items. + */ + if (ScanDirectionIsForward(dir)) + { + if (++so->currPos.itemIndex > so->currPos.lastItem) + { + if (so->numKilled > 0) + _hash_kill_items(scan); + + blkno = so->currPos.nextPage; + if (BlockNumberIsValid(blkno)) + { + buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE); + TestForOldSnapshot(scan->xs_snapshot, rel, BufferGetPage(buf)); + if (!_hash_readpage(scan, &buf, dir)) + end_of_scan = true; + } + else + end_of_scan = true; + } + } + else + { + if (--so->currPos.itemIndex < so->currPos.firstItem) + { + if (so->numKilled > 0) + _hash_kill_items(scan); + + blkno = so->currPos.prevPage; + if (BlockNumberIsValid(blkno)) + { + buf = _hash_getbuf(rel, blkno, HASH_READ, + LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); + TestForOldSnapshot(scan->xs_snapshot, rel, BufferGetPage(buf)); + + /* + * We always maintain the pin on bucket page for whole scan + * operation, so releasing the additional pin we have acquired + * here. + */ + if (buf == so->hashso_bucket_buf || + buf == so->hashso_split_bucket_buf) + _hash_dropbuf(rel, buf); + + if (!_hash_readpage(scan, &buf, dir)) + end_of_scan = true; + } + else + end_of_scan = true; + } + } + + if (end_of_scan) + { + _hash_dropscanbuf(rel, so); + HashScanPosInvalidate(so->currPos); + return false; + } + + /* OK, itemIndex says what to return */ + currItem = &so->currPos.items[so->currPos.itemIndex]; + scan->xs_heaptid = currItem->heapTid; + + return true; +} + +/* + * Advance to next page in a bucket, if any. If we are scanning the bucket + * being populated during split operation then this function advances to the + * bucket being split after the last bucket page of bucket being populated. + */ +static void +_hash_readnext(IndexScanDesc scan, + Buffer *bufp, Page *pagep, HashPageOpaque *opaquep) +{ + BlockNumber blkno; + Relation rel = scan->indexRelation; + HashScanOpaque so = (HashScanOpaque) scan->opaque; + bool block_found = false; + + blkno = (*opaquep)->hasho_nextblkno; + + /* + * Retain the pin on primary bucket page till the end of scan. Refer the + * comments in _hash_first to know the reason of retaining pin. + */ + if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf) + LockBuffer(*bufp, BUFFER_LOCK_UNLOCK); + else + _hash_relbuf(rel, *bufp); + + *bufp = InvalidBuffer; + /* check for interrupts while we're not holding any buffer lock */ + CHECK_FOR_INTERRUPTS(); + if (BlockNumberIsValid(blkno)) + { + *bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE); + block_found = true; + } + else if (so->hashso_buc_populated && !so->hashso_buc_split) + { + /* + * end of bucket, scan bucket being split if there was a split in + * progress at the start of scan. + */ + *bufp = so->hashso_split_bucket_buf; + + /* + * buffer for bucket being split must be valid as we acquire the pin + * on it before the start of scan and retain it till end of scan. 
+ */ + Assert(BufferIsValid(*bufp)); + + LockBuffer(*bufp, BUFFER_LOCK_SHARE); + PredicateLockPage(rel, BufferGetBlockNumber(*bufp), scan->xs_snapshot); + + /* + * setting hashso_buc_split to true indicates that we are scanning + * bucket being split. + */ + so->hashso_buc_split = true; + + block_found = true; + } + + if (block_found) + { + *pagep = BufferGetPage(*bufp); + TestForOldSnapshot(scan->xs_snapshot, rel, *pagep); + *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep); + } +} + +/* + * Advance to previous page in a bucket, if any. If the current scan has + * started during split operation then this function advances to bucket + * being populated after the first bucket page of bucket being split. + */ +static void +_hash_readprev(IndexScanDesc scan, + Buffer *bufp, Page *pagep, HashPageOpaque *opaquep) +{ + BlockNumber blkno; + Relation rel = scan->indexRelation; + HashScanOpaque so = (HashScanOpaque) scan->opaque; + bool haveprevblk; + + blkno = (*opaquep)->hasho_prevblkno; + + /* + * Retain the pin on primary bucket page till the end of scan. Refer the + * comments in _hash_first to know the reason of retaining pin. + */ + if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf) + { + LockBuffer(*bufp, BUFFER_LOCK_UNLOCK); + haveprevblk = false; + } + else + { + _hash_relbuf(rel, *bufp); + haveprevblk = true; + } + + *bufp = InvalidBuffer; + /* check for interrupts while we're not holding any buffer lock */ + CHECK_FOR_INTERRUPTS(); + + if (haveprevblk) + { + Assert(BlockNumberIsValid(blkno)); + *bufp = _hash_getbuf(rel, blkno, HASH_READ, + LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); + *pagep = BufferGetPage(*bufp); + TestForOldSnapshot(scan->xs_snapshot, rel, *pagep); + *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep); + + /* + * We always maintain the pin on bucket page for whole scan operation, + * so releasing the additional pin we have acquired here. + */ + if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf) + _hash_dropbuf(rel, *bufp); + } + else if (so->hashso_buc_populated && so->hashso_buc_split) + { + /* + * end of bucket, scan bucket being populated if there was a split in + * progress at the start of scan. + */ + *bufp = so->hashso_bucket_buf; + + /* + * buffer for bucket being populated must be valid as we acquire the + * pin on it before the start of scan and retain it till end of scan. + */ + Assert(BufferIsValid(*bufp)); + + LockBuffer(*bufp, BUFFER_LOCK_SHARE); + *pagep = BufferGetPage(*bufp); + *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep); + + /* move to the end of bucket chain */ + while (BlockNumberIsValid((*opaquep)->hasho_nextblkno)) + _hash_readnext(scan, bufp, pagep, opaquep); + + /* + * setting hashso_buc_split to false indicates that we are scanning + * bucket being populated. + */ + so->hashso_buc_split = false; + } +} + +/* + * _hash_first() -- Find the first item in a scan. + * + * We find the first item (or, if backward scan, the last item) in the + * index that satisfies the qualification associated with the scan + * descriptor. + * + * On successful exit, if the page containing current index tuple is an + * overflow page, both pin and lock are released whereas if it is a bucket + * page then it is pinned but not locked and data about the matching + * tuple(s) on the page has been loaded into so->currPos, + * scan->xs_ctup.t_self is set to the heap TID of the current tuple. 
+ * + * On failure exit (no more tuples), we return false, with pin held on + * bucket page but no pins or locks held on overflow page. + */ +bool +_hash_first(IndexScanDesc scan, ScanDirection dir) +{ + Relation rel = scan->indexRelation; + HashScanOpaque so = (HashScanOpaque) scan->opaque; + ScanKey cur; + uint32 hashkey; + Bucket bucket; + Buffer buf; + Page page; + HashPageOpaque opaque; + HashScanPosItem *currItem; + + pgstat_count_index_scan(rel); + + /* + * We do not support hash scans with no index qualification, because we + * would have to read the whole index rather than just one bucket. That + * creates a whole raft of problems, since we haven't got a practical way + * to lock all the buckets against splits or compactions. + */ + if (scan->numberOfKeys < 1) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("hash indexes do not support whole-index scans"))); + + /* There may be more than one index qual, but we hash only the first */ + cur = &scan->keyData[0]; + + /* We support only single-column hash indexes */ + Assert(cur->sk_attno == 1); + /* And there's only one operator strategy, too */ + Assert(cur->sk_strategy == HTEqualStrategyNumber); + + /* + * If the constant in the index qual is NULL, assume it cannot match any + * items in the index. + */ + if (cur->sk_flags & SK_ISNULL) + return false; + + /* + * Okay to compute the hash key. We want to do this before acquiring any + * locks, in case a user-defined hash function happens to be slow. + * + * If scankey operator is not a cross-type comparison, we can use the + * cached hash function; otherwise gotta look it up in the catalogs. + * + * We support the convention that sk_subtype == InvalidOid means the + * opclass input type; this is a hack to simplify life for ScanKeyInit(). + */ + if (cur->sk_subtype == rel->rd_opcintype[0] || + cur->sk_subtype == InvalidOid) + hashkey = _hash_datum2hashkey(rel, cur->sk_argument); + else + hashkey = _hash_datum2hashkey_type(rel, cur->sk_argument, + cur->sk_subtype); + + so->hashso_sk_hash = hashkey; + + buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_READ, NULL); + PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot); + page = BufferGetPage(buf); + TestForOldSnapshot(scan->xs_snapshot, rel, page); + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + bucket = opaque->hasho_bucket; + + so->hashso_bucket_buf = buf; + + /* + * If a bucket split is in progress, then while scanning the bucket being + * populated, we need to skip tuples that were copied from bucket being + * split. We also need to maintain a pin on the bucket being split to + * ensure that split-cleanup work done by vacuum doesn't remove tuples + * from it till this scan is done. We need to maintain a pin on the + * bucket being populated to ensure that vacuum doesn't squeeze that + * bucket till this scan is complete; otherwise, the ordering of tuples + * can't be maintained during forward and backward scans. Here, we have + * to be cautious about locking order: first, acquire the lock on bucket + * being split; then, release the lock on it but not the pin; then, + * acquire a lock on bucket being populated and again re-verify whether + * the bucket split is still in progress. Acquiring the lock on bucket + * being split first ensures that the vacuum waits for this scan to + * finish. 
+ */ + if (H_BUCKET_BEING_POPULATED(opaque)) + { + BlockNumber old_blkno; + Buffer old_buf; + + old_blkno = _hash_get_oldblock_from_newbucket(rel, bucket); + + /* + * release the lock on new bucket and re-acquire it after acquiring + * the lock on old bucket. + */ + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + old_buf = _hash_getbuf(rel, old_blkno, HASH_READ, LH_BUCKET_PAGE); + TestForOldSnapshot(scan->xs_snapshot, rel, BufferGetPage(old_buf)); + + /* + * remember the split bucket buffer so as to use it later for + * scanning. + */ + so->hashso_split_bucket_buf = old_buf; + LockBuffer(old_buf, BUFFER_LOCK_UNLOCK); + + LockBuffer(buf, BUFFER_LOCK_SHARE); + page = BufferGetPage(buf); + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + Assert(opaque->hasho_bucket == bucket); + + if (H_BUCKET_BEING_POPULATED(opaque)) + so->hashso_buc_populated = true; + else + { + _hash_dropbuf(rel, so->hashso_split_bucket_buf); + so->hashso_split_bucket_buf = InvalidBuffer; + } + } + + /* If a backwards scan is requested, move to the end of the chain */ + if (ScanDirectionIsBackward(dir)) + { + /* + * Backward scans that start during split needs to start from end of + * bucket being split. + */ + while (BlockNumberIsValid(opaque->hasho_nextblkno) || + (so->hashso_buc_populated && !so->hashso_buc_split)) + _hash_readnext(scan, &buf, &page, &opaque); + } + + /* remember which buffer we have pinned, if any */ + Assert(BufferIsInvalid(so->currPos.buf)); + so->currPos.buf = buf; + + /* Now find all the tuples satisfying the qualification from a page */ + if (!_hash_readpage(scan, &buf, dir)) + return false; + + /* OK, itemIndex says what to return */ + currItem = &so->currPos.items[so->currPos.itemIndex]; + scan->xs_heaptid = currItem->heapTid; + + /* if we're here, _hash_readpage found a valid tuples */ + return true; +} + +/* + * _hash_readpage() -- Load data from current index page into so->currPos + * + * We scan all the items in the current index page and save them into + * so->currPos if it satisfies the qualification. If no matching items + * are found in the current page, we move to the next or previous page + * in a bucket chain as indicated by the direction. + * + * Return true if any matching items are found else return false. + */ +static bool +_hash_readpage(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) +{ + Relation rel = scan->indexRelation; + HashScanOpaque so = (HashScanOpaque) scan->opaque; + Buffer buf; + Page page; + HashPageOpaque opaque; + OffsetNumber offnum; + uint16 itemIndex; + + buf = *bufP; + Assert(BufferIsValid(buf)); + _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); + page = BufferGetPage(buf); + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + + so->currPos.buf = buf; + so->currPos.currPage = BufferGetBlockNumber(buf); + + if (ScanDirectionIsForward(dir)) + { + BlockNumber prev_blkno = InvalidBlockNumber; + + for (;;) + { + /* new page, locate starting position by binary search */ + offnum = _hash_binsearch(page, so->hashso_sk_hash); + + itemIndex = _hash_load_qualified_items(scan, page, offnum, dir); + + if (itemIndex != 0) + break; + + /* + * Could not find any matching tuples in the current page, move to + * the next page. Before leaving the current page, deal with any + * killed items. + */ + if (so->numKilled > 0) + _hash_kill_items(scan); + + /* + * If this is a primary bucket page, hasho_prevblkno is not a real + * block number. 
+ */ + if (so->currPos.buf == so->hashso_bucket_buf || + so->currPos.buf == so->hashso_split_bucket_buf) + prev_blkno = InvalidBlockNumber; + else + prev_blkno = opaque->hasho_prevblkno; + + _hash_readnext(scan, &buf, &page, &opaque); + if (BufferIsValid(buf)) + { + so->currPos.buf = buf; + so->currPos.currPage = BufferGetBlockNumber(buf); + } + else + { + /* + * Remember next and previous block numbers for scrollable + * cursors to know the start position and return false + * indicating that no more matching tuples were found. Also, + * don't reset currPage or lsn, because we expect + * _hash_kill_items to be called for the old page after this + * function returns. + */ + so->currPos.prevPage = prev_blkno; + so->currPos.nextPage = InvalidBlockNumber; + so->currPos.buf = buf; + return false; + } + } + + so->currPos.firstItem = 0; + so->currPos.lastItem = itemIndex - 1; + so->currPos.itemIndex = 0; + } + else + { + BlockNumber next_blkno = InvalidBlockNumber; + + for (;;) + { + /* new page, locate starting position by binary search */ + offnum = _hash_binsearch_last(page, so->hashso_sk_hash); + + itemIndex = _hash_load_qualified_items(scan, page, offnum, dir); + + if (itemIndex != MaxIndexTuplesPerPage) + break; + + /* + * Could not find any matching tuples in the current page, move to + * the previous page. Before leaving the current page, deal with + * any killed items. + */ + if (so->numKilled > 0) + _hash_kill_items(scan); + + if (so->currPos.buf == so->hashso_bucket_buf || + so->currPos.buf == so->hashso_split_bucket_buf) + next_blkno = opaque->hasho_nextblkno; + + _hash_readprev(scan, &buf, &page, &opaque); + if (BufferIsValid(buf)) + { + so->currPos.buf = buf; + so->currPos.currPage = BufferGetBlockNumber(buf); + } + else + { + /* + * Remember next and previous block numbers for scrollable + * cursors to know the start position and return false + * indicating that no more matching tuples were found. Also, + * don't reset currPage or lsn, because we expect + * _hash_kill_items to be called for the old page after this + * function returns. + */ + so->currPos.prevPage = InvalidBlockNumber; + so->currPos.nextPage = next_blkno; + so->currPos.buf = buf; + return false; + } + } + + so->currPos.firstItem = itemIndex; + so->currPos.lastItem = MaxIndexTuplesPerPage - 1; + so->currPos.itemIndex = MaxIndexTuplesPerPage - 1; + } + + if (so->currPos.buf == so->hashso_bucket_buf || + so->currPos.buf == so->hashso_split_bucket_buf) + { + so->currPos.prevPage = InvalidBlockNumber; + so->currPos.nextPage = opaque->hasho_nextblkno; + LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + } + else + { + so->currPos.prevPage = opaque->hasho_prevblkno; + so->currPos.nextPage = opaque->hasho_nextblkno; + _hash_relbuf(rel, so->currPos.buf); + so->currPos.buf = InvalidBuffer; + } + + Assert(so->currPos.firstItem <= so->currPos.lastItem); + return true; +} + +/* + * Load all the qualified items from a current index page + * into so->currPos. Helper function for _hash_readpage. 
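_hash_load_qualified_items, defined next, starts at the position found by the binary search and keeps saving entries while their hash key still matches, skipping entries flagged as moved by a concurrent split. Here is a simplified standalone version over plain arrays; it ignores killed items and the qual recheck that the real function also performs.

    /* Illustrative sketch only; arrays stand in for page items. */
    #include <stdint.h>
    #include <stdio.h>

    #define MAX_SAVED 16

    /*
     * Collect the run of entries whose hash key equals 'target', starting at
     * 'start' (the lower-bound position). Entries flagged as moved-by-split
     * are skipped, so a scan that also visits the bucket being split sees
     * each tuple only once.
     */
    static int
    load_matches(const uint32_t *keys, const int *moved_by_split, size_t nkeys,
                 size_t start, uint32_t target, size_t *saved)
    {
        int nsaved = 0;

        for (size_t i = start; i < nkeys && keys[i] == target; i++)
        {
            if (moved_by_split[i])
                continue;
            if (nsaved < MAX_SAVED)
                saved[nsaved++] = i;
        }
        return nsaved;
    }

    int
    main(void)
    {
        uint32_t keys[] = {3, 7, 7, 7, 12, 40};
        int moved[] = {0, 0, 1, 0, 0, 0};
        size_t saved[MAX_SAVED];
        int n = load_matches(keys, moved, 6, 1, 7, saved);

        for (int i = 0; i < n; i++)
            printf("saved position %zu\n", saved[i]);
        return 0;
    }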
+ */ +static int +_hash_load_qualified_items(IndexScanDesc scan, Page page, + OffsetNumber offnum, ScanDirection dir) +{ + HashScanOpaque so = (HashScanOpaque) scan->opaque; + IndexTuple itup; + int itemIndex; + OffsetNumber maxoff; + + maxoff = PageGetMaxOffsetNumber(page); + + if (ScanDirectionIsForward(dir)) + { + /* load items[] in ascending order */ + itemIndex = 0; + + while (offnum <= maxoff) + { + Assert(offnum >= FirstOffsetNumber); + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + + /* + * skip the tuples that are moved by split operation for the scan + * that has started when split was in progress. Also, skip the + * tuples that are marked as dead. + */ + if ((so->hashso_buc_populated && !so->hashso_buc_split && + (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) || + (scan->ignore_killed_tuples && + (ItemIdIsDead(PageGetItemId(page, offnum))))) + { + offnum = OffsetNumberNext(offnum); /* move forward */ + continue; + } + + if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup) && + _hash_checkqual(scan, itup)) + { + /* tuple is qualified, so remember it */ + _hash_saveitem(so, itemIndex, offnum, itup); + itemIndex++; + } + else + { + /* + * No more matching tuples exist in this page. so, exit while + * loop. + */ + break; + } + + offnum = OffsetNumberNext(offnum); + } + + Assert(itemIndex <= MaxIndexTuplesPerPage); + return itemIndex; + } + else + { + /* load items[] in descending order */ + itemIndex = MaxIndexTuplesPerPage; + + while (offnum >= FirstOffsetNumber) + { + Assert(offnum <= maxoff); + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + + /* + * skip the tuples that are moved by split operation for the scan + * that has started when split was in progress. Also, skip the + * tuples that are marked as dead. + */ + if ((so->hashso_buc_populated && !so->hashso_buc_split && + (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) || + (scan->ignore_killed_tuples && + (ItemIdIsDead(PageGetItemId(page, offnum))))) + { + offnum = OffsetNumberPrev(offnum); /* move back */ + continue; + } + + if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup) && + _hash_checkqual(scan, itup)) + { + itemIndex--; + /* tuple is qualified, so remember it */ + _hash_saveitem(so, itemIndex, offnum, itup); + } + else + { + /* + * No more matching tuples exist in this page. so, exit while + * loop. + */ + break; + } + + offnum = OffsetNumberPrev(offnum); + } + + Assert(itemIndex >= 0); + return itemIndex; + } +} + +/* Save an index item into so->currPos.items[itemIndex] */ +static inline void +_hash_saveitem(HashScanOpaque so, int itemIndex, + OffsetNumber offnum, IndexTuple itup) +{ + HashScanPosItem *currItem = &so->currPos.items[itemIndex]; + + currItem->heapTid = itup->t_tid; + currItem->indexOffset = offnum; +} diff --git a/src/backend/access/hash/hashsort.c b/src/backend/access/hash/hashsort.c new file mode 100644 index 0000000..3ce4248 --- /dev/null +++ b/src/backend/access/hash/hashsort.c @@ -0,0 +1,152 @@ +/*------------------------------------------------------------------------- + * + * hashsort.c + * Sort tuples for insertion into a new hash index. + * + * When building a very large hash index, we pre-sort the tuples by bucket + * number to improve locality of access to the index, and thereby avoid + * thrashing. We use tuplesort.c to sort the given index tuples into order. + * + * Note: if the number of rows in the table has been underestimated, + * bucket splits may occur during the index build. 
In that case we'd + * be inserting into two or more buckets for each possible masked-off + * hash code value. That's no big problem though, since we'll still have + * plenty of locality of access. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/hash/hashsort.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/hash.h" +#include "commands/progress.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "port/pg_bitutils.h" +#include "utils/tuplesort.h" + + +/* + * Status record for spooling/sorting phase. + */ +struct HSpool +{ + Tuplesortstate *sortstate; /* state data for tuplesort.c */ + Relation index; + + /* + * We sort the hash keys based on the buckets they belong to. Below masks + * are used in _hash_hashkey2bucket to determine the bucket of given hash + * key. + */ + uint32 high_mask; + uint32 low_mask; + uint32 max_buckets; +}; + + +/* + * create and initialize a spool structure + */ +HSpool * +_h_spoolinit(Relation heap, Relation index, uint32 num_buckets) +{ + HSpool *hspool = (HSpool *) palloc0(sizeof(HSpool)); + + hspool->index = index; + + /* + * Determine the bitmask for hash code values. Since there are currently + * num_buckets buckets in the index, the appropriate mask can be computed + * as follows. + * + * NOTE : This hash mask calculation should be in sync with similar + * calculation in _hash_init_metabuffer. + */ + hspool->high_mask = pg_nextpower2_32(num_buckets + 1) - 1; + hspool->low_mask = (hspool->high_mask >> 1); + hspool->max_buckets = num_buckets - 1; + + /* + * We size the sort area as maintenance_work_mem rather than work_mem to + * speed index creation. This should be OK since a single backend can't + * run multiple index creations in parallel. + */ + hspool->sortstate = tuplesort_begin_index_hash(heap, + index, + hspool->high_mask, + hspool->low_mask, + hspool->max_buckets, + maintenance_work_mem, + NULL, + false); + + return hspool; +} + +/* + * clean up a spool structure and its substructures. + */ +void +_h_spooldestroy(HSpool *hspool) +{ + tuplesort_end(hspool->sortstate); + pfree(hspool); +} + +/* + * spool an index entry into the sort file. + */ +void +_h_spool(HSpool *hspool, ItemPointer self, Datum *values, bool *isnull) +{ + tuplesort_putindextuplevalues(hspool->sortstate, hspool->index, + self, values, isnull); +} + +/* + * given a spool loaded by successive calls to _h_spool, + * create an entire index. + */ +void +_h_indexbuild(HSpool *hspool, Relation heapRel) +{ + IndexTuple itup; + int64 tups_done = 0; +#ifdef USE_ASSERT_CHECKING + uint32 hashkey = 0; +#endif + + tuplesort_performsort(hspool->sortstate); + + while ((itup = tuplesort_getindextuple(hspool->sortstate, true)) != NULL) + { + /* + * Technically, it isn't critical that hash keys be found in sorted + * order, since this sorting is only used to increase locality of + * access as a performance optimization. It still seems like a good + * idea to test tuplesort.c's handling of hash index tuple sorts + * through an assertion, though. 
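_h_spoolinit above derives high_mask, low_mask and max_buckets from the estimated bucket count so tuples can be sorted by the bucket they will land in. The standalone sketch below repeats that derivation with a portable next-power-of-two helper standing in for pg_nextpower2_32, and sorts a few made-up hash codes by bucket, which is the non-decreasing order the assertion just below relies on.

    /* Illustrative sketch only; helper names are not PostgreSQL's. */
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* smallest power of 2 >= v, for v >= 1 (stand-in for pg_nextpower2_32) */
    static uint32_t
    next_pow2(uint32_t v)
    {
        uint32_t p = 1;

        while (p < v)
            p <<= 1;
        return p;
    }

    static uint32_t high_mask, low_mask, max_buckets;

    static uint32_t
    key_to_bucket(uint32_t hashkey)
    {
        uint32_t bucket = hashkey & high_mask;

        return (bucket > max_buckets) ? (bucket & low_mask) : bucket;
    }

    static int
    cmp_by_bucket(const void *a, const void *b)
    {
        uint32_t ba = key_to_bucket(*(const uint32_t *) a);
        uint32_t bb = key_to_bucket(*(const uint32_t *) b);

        return (ba > bb) - (ba < bb);
    }

    int
    main(void)
    {
        uint32_t num_buckets = 10;          /* estimated bucket count */
        uint32_t keys[] = {0xdeadbeef, 0x12345678, 0x0badcafe, 0x00000007,
                           0xffffffff, 0x13572468};
        size_t nkeys = sizeof(keys) / sizeof(keys[0]);

        /* same formulas as _h_spoolinit */
        high_mask = next_pow2(num_buckets + 1) - 1;     /* 15 for 10 buckets */
        low_mask = high_mask >> 1;                      /* 7 */
        max_buckets = num_buckets - 1;                  /* 9 */

        qsort(keys, nkeys, sizeof(uint32_t), cmp_by_bucket);

        for (size_t i = 0; i < nkeys; i++)
            printf("key 0x%08x -> bucket %u\n", keys[i], key_to_bucket(keys[i]));
        return 0;
    }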
+ */ +#ifdef USE_ASSERT_CHECKING + uint32 lasthashkey = hashkey; + + hashkey = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), + hspool->max_buckets, hspool->high_mask, + hspool->low_mask); + Assert(hashkey >= lasthashkey); +#endif + + _hash_doinsert(hspool->index, itup, heapRel); + + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, + ++tups_done); + } +} diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c new file mode 100644 index 0000000..5198728 --- /dev/null +++ b/src/backend/access/hash/hashutil.c @@ -0,0 +1,622 @@ +/*------------------------------------------------------------------------- + * + * hashutil.c + * Utility code for Postgres hash implementation. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/hash/hashutil.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/hash.h" +#include "access/reloptions.h" +#include "access/relscan.h" +#include "port/pg_bitutils.h" +#include "storage/buf_internals.h" +#include "utils/lsyscache.h" +#include "utils/rel.h" + +#define CALC_NEW_BUCKET(old_bucket, lowmask) \ + old_bucket | (lowmask + 1) + +/* + * _hash_checkqual -- does the index tuple satisfy the scan conditions? + */ +bool +_hash_checkqual(IndexScanDesc scan, IndexTuple itup) +{ + /* + * Currently, we can't check any of the scan conditions since we do not + * have the original index entry value to supply to the sk_func. Always + * return true; we expect that hashgettuple already set the recheck flag + * to make the main indexscan code do it. + */ +#ifdef NOT_USED + TupleDesc tupdesc = RelationGetDescr(scan->indexRelation); + ScanKey key = scan->keyData; + int scanKeySize = scan->numberOfKeys; + + while (scanKeySize > 0) + { + Datum datum; + bool isNull; + Datum test; + + datum = index_getattr(itup, + key->sk_attno, + tupdesc, + &isNull); + + /* assume sk_func is strict */ + if (isNull) + return false; + if (key->sk_flags & SK_ISNULL) + return false; + + test = FunctionCall2Coll(&key->sk_func, key->sk_collation, + datum, key->sk_argument); + + if (!DatumGetBool(test)) + return false; + + key++; + scanKeySize--; + } +#endif + + return true; +} + +/* + * _hash_datum2hashkey -- given a Datum, call the index's hash function + * + * The Datum is assumed to be of the index's column type, so we can use the + * "primary" hash function that's tracked for us by the generic index code. + */ +uint32 +_hash_datum2hashkey(Relation rel, Datum key) +{ + FmgrInfo *procinfo; + Oid collation; + + /* XXX assumes index has only one attribute */ + procinfo = index_getprocinfo(rel, 1, HASHSTANDARD_PROC); + collation = rel->rd_indcollation[0]; + + return DatumGetUInt32(FunctionCall1Coll(procinfo, collation, key)); +} + +/* + * _hash_datum2hashkey_type -- given a Datum of a specified type, + * hash it in a fashion compatible with this index + * + * This is much more expensive than _hash_datum2hashkey, so use it only in + * cross-type situations. 
+ */ +uint32 +_hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype) +{ + RegProcedure hash_proc; + Oid collation; + + /* XXX assumes index has only one attribute */ + hash_proc = get_opfamily_proc(rel->rd_opfamily[0], + keytype, + keytype, + HASHSTANDARD_PROC); + if (!RegProcedureIsValid(hash_proc)) + elog(ERROR, "missing support function %d(%u,%u) for index \"%s\"", + HASHSTANDARD_PROC, keytype, keytype, + RelationGetRelationName(rel)); + collation = rel->rd_indcollation[0]; + + return DatumGetUInt32(OidFunctionCall1Coll(hash_proc, collation, key)); +} + +/* + * _hash_hashkey2bucket -- determine which bucket the hashkey maps to. + */ +Bucket +_hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket, + uint32 highmask, uint32 lowmask) +{ + Bucket bucket; + + bucket = hashkey & highmask; + if (bucket > maxbucket) + bucket = bucket & lowmask; + + return bucket; +} + +/* + * _hash_spareindex -- returns spare index / global splitpoint phase of the + * bucket + */ +uint32 +_hash_spareindex(uint32 num_bucket) +{ + uint32 splitpoint_group; + uint32 splitpoint_phases; + + splitpoint_group = pg_ceil_log2_32(num_bucket); + + if (splitpoint_group < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) + return splitpoint_group; + + /* account for single-phase groups */ + splitpoint_phases = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE; + + /* account for multi-phase groups before splitpoint_group */ + splitpoint_phases += + ((splitpoint_group - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) << + HASH_SPLITPOINT_PHASE_BITS); + + /* account for phases within current group */ + splitpoint_phases += + (((num_bucket - 1) >> + (splitpoint_group - (HASH_SPLITPOINT_PHASE_BITS + 1))) & + HASH_SPLITPOINT_PHASE_MASK); /* to 0-based value. */ + + return splitpoint_phases; +} + +/* + * _hash_get_totalbuckets -- returns total number of buckets allocated till + * the given splitpoint phase. + */ +uint32 +_hash_get_totalbuckets(uint32 splitpoint_phase) +{ + uint32 splitpoint_group; + uint32 total_buckets; + uint32 phases_within_splitpoint_group; + + if (splitpoint_phase < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) + return (1 << splitpoint_phase); + + /* get splitpoint's group */ + splitpoint_group = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE; + splitpoint_group += + ((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) >> + HASH_SPLITPOINT_PHASE_BITS); + + /* account for buckets before splitpoint_group */ + total_buckets = (1 << (splitpoint_group - 1)); + + /* account for buckets within splitpoint_group */ + phases_within_splitpoint_group = + (((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) & + HASH_SPLITPOINT_PHASE_MASK) + 1); /* from 0-based to 1-based */ + total_buckets += + (((1 << (splitpoint_group - 1)) >> HASH_SPLITPOINT_PHASE_BITS) * + phases_within_splitpoint_group); + + return total_buckets; +} + +/* + * _hash_checkpage -- sanity checks on the format of all hash pages + * + * If flags is not zero, it is a bitwise OR of the acceptable page types + * (values of hasho_flag & LH_PAGE_TYPE). + */ +void +_hash_checkpage(Relation rel, Buffer buf, int flags) +{ + Page page = BufferGetPage(buf); + + /* + * ReadBuffer verifies that every newly-read page passes + * PageHeaderIsValid, which means it either contains a reasonably sane + * page header or is all-zero. We have to defend against the all-zero + * case, however. 
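Stepping back to the splitpoint arithmetic above, _hash_get_totalbuckets is easiest to follow with concrete numbers. The sketch below mirrors its formula; the two constants are assumed here for illustration, the authoritative values being the HASH_SPLITPOINT_* macros in access/hash.h.

    /* Illustrative sketch; constant values are assumptions, see hash.h. */
    #include <stdint.h>
    #include <stdio.h>

    #define ONE_PHASE_GROUPS 10     /* HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE */
    #define PHASE_BITS       2      /* HASH_SPLITPOINT_PHASE_BITS */
    #define PHASE_MASK       ((1 << PHASE_BITS) - 1)

    /* Mirror of the _hash_get_totalbuckets arithmetic shown above. */
    static uint32_t
    total_buckets_for_phase(uint32_t phase)
    {
        uint32_t group;
        uint32_t total;
        uint32_t phases_in_group;

        if (phase < ONE_PHASE_GROUPS)
            return (uint32_t) 1 << phase;

        group = ONE_PHASE_GROUPS + ((phase - ONE_PHASE_GROUPS) >> PHASE_BITS);

        total = (uint32_t) 1 << (group - 1);            /* buckets before group */
        phases_in_group = ((phase - ONE_PHASE_GROUPS) & PHASE_MASK) + 1;
        total += (((uint32_t) 1 << (group - 1)) >> PHASE_BITS) * phases_in_group;

        return total;
    }

    int
    main(void)
    {
        /* early splitpoints allocate a full doubling at once ... */
        printf("%u %u %u\n", total_buckets_for_phase(0),
               total_buckets_for_phase(4), total_buckets_for_phase(9));
        /* ... later ones allocate a doubling in four phases */
        printf("%u %u %u %u\n", total_buckets_for_phase(10),
               total_buckets_for_phase(11), total_buckets_for_phase(12),
               total_buckets_for_phase(13));
        return 0;
    }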
+ */ + if (PageIsNew(page)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" contains unexpected zero page at block %u", + RelationGetRelationName(rel), + BufferGetBlockNumber(buf)), + errhint("Please REINDEX it."))); + + /* + * Additionally check that the special area looks sane. + */ + if (PageGetSpecialSize(page) != MAXALIGN(sizeof(HashPageOpaqueData))) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" contains corrupted page at block %u", + RelationGetRelationName(rel), + BufferGetBlockNumber(buf)), + errhint("Please REINDEX it."))); + + if (flags) + { + HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page); + + if ((opaque->hasho_flag & flags) == 0) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" contains corrupted page at block %u", + RelationGetRelationName(rel), + BufferGetBlockNumber(buf)), + errhint("Please REINDEX it."))); + } + + /* + * When checking the metapage, also verify magic number and version. + */ + if (flags == LH_META_PAGE) + { + HashMetaPage metap = HashPageGetMeta(page); + + if (metap->hashm_magic != HASH_MAGIC) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" is not a hash index", + RelationGetRelationName(rel)))); + + if (metap->hashm_version != HASH_VERSION) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" has wrong hash version", + RelationGetRelationName(rel)), + errhint("Please REINDEX it."))); + } +} + +bytea * +hashoptions(Datum reloptions, bool validate) +{ + static const relopt_parse_elt tab[] = { + {"fillfactor", RELOPT_TYPE_INT, offsetof(HashOptions, fillfactor)}, + }; + + return (bytea *) build_reloptions(reloptions, validate, + RELOPT_KIND_HASH, + sizeof(HashOptions), + tab, lengthof(tab)); +} + +/* + * _hash_get_indextuple_hashkey - get the hash index tuple's hash key value + */ +uint32 +_hash_get_indextuple_hashkey(IndexTuple itup) +{ + char *attp; + + /* + * We assume the hash key is the first attribute and can't be null, so + * this can be done crudely but very very cheaply ... + */ + attp = (char *) itup + IndexInfoFindDataOffset(itup->t_info); + return *((uint32 *) attp); +} + +/* + * _hash_convert_tuple - convert raw index data to hash key + * + * Inputs: values and isnull arrays for the user data column(s) + * Outputs: values and isnull arrays for the index tuple, suitable for + * passing to index_form_tuple(). + * + * Returns true if successful, false if not (because there are null values). + * On a false result, the given data need not be indexed. + * + * Note: callers know that the index-column arrays are always of length 1. + * In principle, there could be more than one input column, though we do not + * currently support that. + */ +bool +_hash_convert_tuple(Relation index, + Datum *user_values, bool *user_isnull, + Datum *index_values, bool *index_isnull) +{ + uint32 hashkey; + + /* + * We do not insert null values into hash indexes. This is okay because + * the only supported search operator is '=', and we assume it is strict. + */ + if (user_isnull[0]) + return false; + + hashkey = _hash_datum2hashkey(index, user_values[0]); + index_values[0] = UInt32GetDatum(hashkey); + index_isnull[0] = false; + return true; +} + +/* + * _hash_binsearch - Return the offset number in the page where the + * specified hash value should be sought or inserted. + * + * We use binary search, relying on the assumption that the existing entries + * are ordered by hash key. 
+ * + * Returns the offset of the first index entry having hashkey >= hash_value, + * or the page's max offset plus one if hash_value is greater than all + * existing hash keys in the page. This is the appropriate place to start + * a search, or to insert a new item. + */ +OffsetNumber +_hash_binsearch(Page page, uint32 hash_value) +{ + OffsetNumber upper; + OffsetNumber lower; + + /* Loop invariant: lower <= desired place <= upper */ + upper = PageGetMaxOffsetNumber(page) + 1; + lower = FirstOffsetNumber; + + while (upper > lower) + { + OffsetNumber off; + IndexTuple itup; + uint32 hashkey; + + off = (upper + lower) / 2; + Assert(OffsetNumberIsValid(off)); + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off)); + hashkey = _hash_get_indextuple_hashkey(itup); + if (hashkey < hash_value) + lower = off + 1; + else + upper = off; + } + + return lower; +} + +/* + * _hash_binsearch_last + * + * Same as above, except that if there are multiple matching items in the + * page, we return the offset of the last one instead of the first one, + * and the possible range of outputs is 0..maxoffset not 1..maxoffset+1. + * This is handy for starting a new page in a backwards scan. + */ +OffsetNumber +_hash_binsearch_last(Page page, uint32 hash_value) +{ + OffsetNumber upper; + OffsetNumber lower; + + /* Loop invariant: lower <= desired place <= upper */ + upper = PageGetMaxOffsetNumber(page); + lower = FirstOffsetNumber - 1; + + while (upper > lower) + { + IndexTuple itup; + OffsetNumber off; + uint32 hashkey; + + off = (upper + lower + 1) / 2; + Assert(OffsetNumberIsValid(off)); + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off)); + hashkey = _hash_get_indextuple_hashkey(itup); + if (hashkey > hash_value) + upper = off - 1; + else + lower = off; + } + + return lower; +} + +/* + * _hash_get_oldblock_from_newbucket() -- get the block number of a bucket + * from which current (new) bucket is being split. + */ +BlockNumber +_hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket) +{ + Bucket old_bucket; + uint32 mask; + Buffer metabuf; + HashMetaPage metap; + BlockNumber blkno; + + /* + * To get the old bucket from the current bucket, we need a mask to modulo + * into lower half of table. This mask is stored in meta page as + * hashm_lowmask, but here we can't rely on the same, because we need a + * value of lowmask that was prevalent at the time when bucket split was + * started. Masking the most significant bit of new bucket would give us + * old bucket. + */ + mask = (((uint32) 1) << (fls(new_bucket) - 1)) - 1; + old_bucket = new_bucket & mask; + + metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); + metap = HashPageGetMeta(BufferGetPage(metabuf)); + + blkno = BUCKET_TO_BLKNO(metap, old_bucket); + + _hash_relbuf(rel, metabuf); + + return blkno; +} + +/* + * _hash_get_newblock_from_oldbucket() -- get the block number of a bucket + * that will be generated after split from old bucket. + * + * This is used to find the new bucket from old bucket based on current table + * half. It is mainly required to finish the incomplete splits where we are + * sure that not more than one bucket could have split in progress from old + * bucket. 
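The two binary searches above are ordinary lower/upper-bound searches over hash keys kept sorted within a page. A standalone lower-bound version over a plain sorted array (array indices stand in for offset numbers) shows the convention _hash_binsearch uses:

    /* Illustrative sketch only; a plain array stands in for a page. */
    #include <stdint.h>
    #include <stdio.h>

    /*
     * Return the index of the first element with value >= target, or n if
     * every element is smaller -- the same convention _hash_binsearch uses
     * with offset numbers ("max offset plus one" meaning past the end).
     */
    static size_t
    lower_bound(const uint32_t *keys, size_t n, uint32_t target)
    {
        size_t lower = 0;
        size_t upper = n;

        while (upper > lower)
        {
            size_t mid = lower + (upper - lower) / 2;

            if (keys[mid] < target)
                lower = mid + 1;
            else
                upper = mid;
        }
        return lower;
    }

    int
    main(void)
    {
        uint32_t hashkeys[] = {3, 7, 7, 7, 12, 40, 41};
        size_t n = sizeof(hashkeys) / sizeof(hashkeys[0]);

        printf("%zu\n", lower_bound(hashkeys, n, 7));    /* 1: first of the 7s */
        printf("%zu\n", lower_bound(hashkeys, n, 13));   /* 5 */
        printf("%zu\n", lower_bound(hashkeys, n, 99));   /* 7: past the end */
        return 0;
    }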
+ */ +BlockNumber +_hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket) +{ + Bucket new_bucket; + Buffer metabuf; + HashMetaPage metap; + BlockNumber blkno; + + metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); + metap = HashPageGetMeta(BufferGetPage(metabuf)); + + new_bucket = _hash_get_newbucket_from_oldbucket(rel, old_bucket, + metap->hashm_lowmask, + metap->hashm_maxbucket); + blkno = BUCKET_TO_BLKNO(metap, new_bucket); + + _hash_relbuf(rel, metabuf); + + return blkno; +} + +/* + * _hash_get_newbucket_from_oldbucket() -- get the new bucket that will be + * generated after split from current (old) bucket. + * + * This is used to find the new bucket from old bucket. New bucket can be + * obtained by OR'ing old bucket with most significant bit of current table + * half (lowmask passed in this function can be used to identify msb of + * current table half). There could be multiple buckets that could have + * been split from current bucket. We need the first such bucket that exists. + * Caller must ensure that no more than one split has happened from old + * bucket. + */ +Bucket +_hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket, + uint32 lowmask, uint32 maxbucket) +{ + Bucket new_bucket; + + new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask); + if (new_bucket > maxbucket) + { + lowmask = lowmask >> 1; + new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask); + } + + return new_bucket; +} + +/* + * _hash_kill_items - set LP_DEAD state for items an indexscan caller has + * told us were killed. + * + * scan->opaque, referenced locally through so, contains information about the + * current page and killed tuples thereon (generally, this should only be + * called if so->numKilled > 0). + * + * The caller does not have a lock on the page and may or may not have the + * page pinned in a buffer. Note that read-lock is sufficient for setting + * LP_DEAD status (which is only a hint). + * + * The caller must have pin on bucket buffer, but may or may not have pin + * on overflow buffer, as indicated by HashScanPosIsPinned(so->currPos). + * + * We match items by heap TID before assuming they are the right ones to + * delete. + * + * There are never any scans active in a bucket at the time VACUUM begins, + * because VACUUM takes a cleanup lock on the primary bucket page and scans + * hold a pin. A scan can begin after VACUUM leaves the primary bucket page + * but before it finishes the entire bucket, but it can never pass VACUUM, + * because VACUUM always locks the next page before releasing the lock on + * the previous one. Therefore, we don't have to worry about accidentally + * killing a TID that has been reused for an unrelated tuple. + */ +void +_hash_kill_items(IndexScanDesc scan) +{ + HashScanOpaque so = (HashScanOpaque) scan->opaque; + Relation rel = scan->indexRelation; + BlockNumber blkno; + Buffer buf; + Page page; + HashPageOpaque opaque; + OffsetNumber offnum, + maxoff; + int numKilled = so->numKilled; + int i; + bool killedsomething = false; + bool havePin = false; + + Assert(so->numKilled > 0); + Assert(so->killedItems != NULL); + Assert(HashScanPosIsValid(so->currPos)); + + /* + * Always reset the scan state, so we don't look for same items on other + * pages. + */ + so->numKilled = 0; + + blkno = so->currPos.currPage; + if (HashScanPosIsPinned(so->currPos)) + { + /* + * We already have pin on this buffer, so, all we need to do is + * acquire lock on it. 
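Returning to the bucket-mapping helpers above: the new bucket is the old one with the current table half's most significant bit OR'ed in (lowmask + 1), and the old bucket is recovered by clearing the new bucket's most significant bit. A worked sketch in plain arithmetic, with no PostgreSQL types:

    /* Illustrative sketch only. */
    #include <stdint.h>
    #include <stdio.h>

    /* position of the highest set bit, 1-based (stand-in for fls()) */
    static int
    highest_bit(uint32_t v)
    {
        int pos = 0;

        while (v)
        {
            pos++;
            v >>= 1;
        }
        return pos;
    }

    /* old bucket = new bucket with its most significant bit cleared */
    static uint32_t
    old_from_new(uint32_t new_bucket)
    {
        uint32_t mask = (((uint32_t) 1) << (highest_bit(new_bucket) - 1)) - 1;

        return new_bucket & mask;
    }

    /* new bucket = old bucket with the current half's MSB OR'ed in */
    static uint32_t
    new_from_old(uint32_t old_bucket, uint32_t lowmask)
    {
        return old_bucket | (lowmask + 1);
    }

    int
    main(void)
    {
        /* with lowmask = 3 (table half of size 4), bucket 1 splits into 5 */
        printf("new(1) = %u\n", new_from_old(1, 3));    /* 5 */
        printf("old(5) = %u\n", old_from_new(5));       /* 1 */
        printf("old(6) = %u\n", old_from_new(6));       /* 2 */
        return 0;
    }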
+ */ + havePin = true; + buf = so->currPos.buf; + LockBuffer(buf, BUFFER_LOCK_SHARE); + } + else + buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE); + + page = BufferGetPage(buf); + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + maxoff = PageGetMaxOffsetNumber(page); + + for (i = 0; i < numKilled; i++) + { + int itemIndex = so->killedItems[i]; + HashScanPosItem *currItem = &so->currPos.items[itemIndex]; + + offnum = currItem->indexOffset; + + Assert(itemIndex >= so->currPos.firstItem && + itemIndex <= so->currPos.lastItem); + + while (offnum <= maxoff) + { + ItemId iid = PageGetItemId(page, offnum); + IndexTuple ituple = (IndexTuple) PageGetItem(page, iid); + + if (ItemPointerEquals(&ituple->t_tid, &currItem->heapTid)) + { + /* found the item */ + ItemIdMarkDead(iid); + killedsomething = true; + break; /* out of inner search loop */ + } + offnum = OffsetNumberNext(offnum); + } + } + + /* + * Since this can be redone later if needed, mark as dirty hint. Whenever + * we mark anything LP_DEAD, we also set the page's + * LH_PAGE_HAS_DEAD_TUPLES flag, which is likewise just a hint. + */ + if (killedsomething) + { + opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES; + MarkBufferDirtyHint(buf, true); + } + + if (so->hashso_bucket_buf == so->currPos.buf || + havePin) + LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + else + _hash_relbuf(rel, buf); +} diff --git a/src/backend/access/hash/hashvalidate.c b/src/backend/access/hash/hashvalidate.c new file mode 100644 index 0000000..1e343df --- /dev/null +++ b/src/backend/access/hash/hashvalidate.c @@ -0,0 +1,439 @@ +/*------------------------------------------------------------------------- + * + * hashvalidate.c + * Opclass validator for hash. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/hash/hashvalidate.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/amvalidate.h" +#include "access/hash.h" +#include "access/htup_details.h" +#include "access/xact.h" +#include "catalog/pg_am.h" +#include "catalog/pg_amop.h" +#include "catalog/pg_amproc.h" +#include "catalog/pg_opclass.h" +#include "catalog/pg_opfamily.h" +#include "catalog/pg_proc.h" +#include "catalog/pg_type.h" +#include "parser/parse_coerce.h" +#include "utils/builtins.h" +#include "utils/fmgroids.h" +#include "utils/lsyscache.h" +#include "utils/regproc.h" +#include "utils/syscache.h" + + +static bool check_hash_func_signature(Oid funcid, int16 amprocnum, Oid argtype); + + +/* + * Validator for a hash opclass. + * + * Some of the checks done here cover the whole opfamily, and therefore are + * redundant when checking each opclass in a family. But they don't run long + * enough to be much of a problem, so we accept the duplication rather than + * complicate the amvalidate API. 
+ */ +bool +hashvalidate(Oid opclassoid) +{ + bool result = true; + HeapTuple classtup; + Form_pg_opclass classform; + Oid opfamilyoid; + Oid opcintype; + char *opclassname; + HeapTuple familytup; + Form_pg_opfamily familyform; + char *opfamilyname; + CatCList *proclist, + *oprlist; + List *grouplist; + OpFamilyOpFuncGroup *opclassgroup; + List *hashabletypes = NIL; + int i; + ListCell *lc; + + /* Fetch opclass information */ + classtup = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclassoid)); + if (!HeapTupleIsValid(classtup)) + elog(ERROR, "cache lookup failed for operator class %u", opclassoid); + classform = (Form_pg_opclass) GETSTRUCT(classtup); + + opfamilyoid = classform->opcfamily; + opcintype = classform->opcintype; + opclassname = NameStr(classform->opcname); + + /* Fetch opfamily information */ + familytup = SearchSysCache1(OPFAMILYOID, ObjectIdGetDatum(opfamilyoid)); + if (!HeapTupleIsValid(familytup)) + elog(ERROR, "cache lookup failed for operator family %u", opfamilyoid); + familyform = (Form_pg_opfamily) GETSTRUCT(familytup); + + opfamilyname = NameStr(familyform->opfname); + + /* Fetch all operators and support functions of the opfamily */ + oprlist = SearchSysCacheList1(AMOPSTRATEGY, ObjectIdGetDatum(opfamilyoid)); + proclist = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(opfamilyoid)); + + /* Check individual support functions */ + for (i = 0; i < proclist->n_members; i++) + { + HeapTuple proctup = &proclist->members[i]->tuple; + Form_pg_amproc procform = (Form_pg_amproc) GETSTRUCT(proctup); + + /* + * All hash functions should be registered with matching left/right + * types + */ + if (procform->amproclefttype != procform->amprocrighttype) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains support function %s with different left and right input types", + opfamilyname, "hash", + format_procedure(procform->amproc)))); + result = false; + } + + /* Check procedure numbers and function signatures */ + switch (procform->amprocnum) + { + case HASHSTANDARD_PROC: + case HASHEXTENDED_PROC: + if (!check_hash_func_signature(procform->amproc, procform->amprocnum, + procform->amproclefttype)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains function %s with wrong signature for support number %d", + opfamilyname, "hash", + format_procedure(procform->amproc), + procform->amprocnum))); + result = false; + } + else + { + /* Remember which types we can hash */ + hashabletypes = + list_append_unique_oid(hashabletypes, + procform->amproclefttype); + } + break; + case HASHOPTIONS_PROC: + if (!check_amoptsproc_signature(procform->amproc)) + result = false; + break; + default: + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains function %s with invalid support number %d", + opfamilyname, "hash", + format_procedure(procform->amproc), + procform->amprocnum))); + result = false; + break; + } + } + + /* Check individual operators */ + for (i = 0; i < oprlist->n_members; i++) + { + HeapTuple oprtup = &oprlist->members[i]->tuple; + Form_pg_amop oprform = (Form_pg_amop) GETSTRUCT(oprtup); + + /* Check that only allowed strategy numbers exist */ + if (oprform->amopstrategy < 1 || + oprform->amopstrategy > HTMaxStrategyNumber) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains operator %s with 
invalid strategy number %d", + opfamilyname, "hash", + format_operator(oprform->amopopr), + oprform->amopstrategy))); + result = false; + } + + /* hash doesn't support ORDER BY operators */ + if (oprform->amoppurpose != AMOP_SEARCH || + OidIsValid(oprform->amopsortfamily)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains invalid ORDER BY specification for operator %s", + opfamilyname, "hash", + format_operator(oprform->amopopr)))); + result = false; + } + + /* Check operator signature --- same for all hash strategies */ + if (!check_amop_signature(oprform->amopopr, BOOLOID, + oprform->amoplefttype, + oprform->amoprighttype)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains operator %s with wrong signature", + opfamilyname, "hash", + format_operator(oprform->amopopr)))); + result = false; + } + + /* There should be relevant hash functions for each datatype */ + if (!list_member_oid(hashabletypes, oprform->amoplefttype) || + !list_member_oid(hashabletypes, oprform->amoprighttype)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s lacks support function for operator %s", + opfamilyname, "hash", + format_operator(oprform->amopopr)))); + result = false; + } + } + + /* Now check for inconsistent groups of operators/functions */ + grouplist = identify_opfamily_groups(oprlist, proclist); + opclassgroup = NULL; + foreach(lc, grouplist) + { + OpFamilyOpFuncGroup *thisgroup = (OpFamilyOpFuncGroup *) lfirst(lc); + + /* Remember the group exactly matching the test opclass */ + if (thisgroup->lefttype == opcintype && + thisgroup->righttype == opcintype) + opclassgroup = thisgroup; + + /* + * Complain if there seems to be an incomplete set of operators for + * this datatype pair (implying that we have a hash function but no + * operator). + */ + if (thisgroup->operatorset != (1 << HTEqualStrategyNumber)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s is missing operator(s) for types %s and %s", + opfamilyname, "hash", + format_type_be(thisgroup->lefttype), + format_type_be(thisgroup->righttype)))); + result = false; + } + } + + /* Check that the originally-named opclass is supported */ + /* (if group is there, we already checked it adequately above) */ + if (!opclassgroup) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator class \"%s\" of access method %s is missing operator(s)", + opclassname, "hash"))); + result = false; + } + + /* + * Complain if the opfamily doesn't have entries for all possible + * combinations of its supported datatypes. While missing cross-type + * operators are not fatal, it seems reasonable to insist that all + * built-in hash opfamilies be complete. + */ + if (list_length(grouplist) != + list_length(hashabletypes) * list_length(hashabletypes)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s is missing cross-type operator(s)", + opfamilyname, "hash"))); + result = false; + } + + ReleaseCatCacheList(proclist); + ReleaseCatCacheList(oprlist); + ReleaseSysCache(familytup); + ReleaseSysCache(classtup); + + return result; +} + + +/* + * We need a custom version of check_amproc_signature because of assorted + * hacks in the core hash opclass definitions. 
+ */ +static bool +check_hash_func_signature(Oid funcid, int16 amprocnum, Oid argtype) +{ + bool result = true; + Oid restype; + int16 nargs; + HeapTuple tp; + Form_pg_proc procform; + + switch (amprocnum) + { + case HASHSTANDARD_PROC: + restype = INT4OID; + nargs = 1; + break; + + case HASHEXTENDED_PROC: + restype = INT8OID; + nargs = 2; + break; + + default: + elog(ERROR, "invalid amprocnum"); + } + + tp = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for function %u", funcid); + procform = (Form_pg_proc) GETSTRUCT(tp); + + if (procform->prorettype != restype || procform->proretset || + procform->pronargs != nargs) + result = false; + + if (!IsBinaryCoercible(argtype, procform->proargtypes.values[0])) + { + /* + * Some of the built-in hash opclasses cheat by using hash functions + * that are different from but physically compatible with the opclass + * datatype. In some of these cases, even a "binary coercible" check + * fails because there's no relevant cast. For the moment, fix it by + * having a list of allowed cases. Test the specific function + * identity, not just its input type, because hashvarlena() takes + * INTERNAL and allowing any such function seems too scary. + */ + if ((funcid == F_HASHINT4 || funcid == F_HASHINT4EXTENDED) && + (argtype == DATEOID || + argtype == XIDOID || argtype == CIDOID)) + /* okay, allowed use of hashint4() */ ; + else if ((funcid == F_HASHINT8 || funcid == F_HASHINT8EXTENDED) && + (argtype == XID8OID)) + /* okay, allowed use of hashint8() */ ; + else if ((funcid == F_TIMESTAMP_HASH || + funcid == F_TIMESTAMP_HASH_EXTENDED) && + argtype == TIMESTAMPTZOID) + /* okay, allowed use of timestamp_hash() */ ; + else if ((funcid == F_HASHCHAR || funcid == F_HASHCHAREXTENDED) && + argtype == BOOLOID) + /* okay, allowed use of hashchar() */ ; + else if ((funcid == F_HASHVARLENA || funcid == F_HASHVARLENAEXTENDED) && + argtype == BYTEAOID) + /* okay, allowed use of hashvarlena() */ ; + else + result = false; + } + + /* If function takes a second argument, it must be for a 64-bit salt. */ + if (nargs == 2 && procform->proargtypes.values[1] != INT8OID) + result = false; + + ReleaseSysCache(tp); + return result; +} + +/* + * Prechecking function for adding operators/functions to a hash opfamily. + */ +void +hashadjustmembers(Oid opfamilyoid, + Oid opclassoid, + List *operators, + List *functions) +{ + Oid opcintype; + ListCell *lc; + + /* + * Hash operators and required support functions are always "loose" + * members of the opfamily if they are cross-type. If they are not + * cross-type, we prefer to tie them to the appropriate opclass ... but if + * the user hasn't created one, we can't do that, and must fall back to + * using the opfamily dependency. (We mustn't force creation of an + * opclass in such a case, as leaving an incomplete opclass laying about + * would be bad. Throwing an error is another undesirable alternative.) + * + * This behavior results in a bit of a dump/reload hazard, in that the + * order of restoring objects could affect what dependencies we end up + * with. pg_dump's existing behavior will preserve the dependency choices + * in most cases, but not if a cross-type operator has been bound tightly + * into an opclass. That's a mistake anyway, so silently "fixing" it + * isn't awful. + * + * Optional support functions are always "loose" family members. + * + * To avoid repeated lookups, we remember the most recently used opclass's + * input type. 
+ */ + if (OidIsValid(opclassoid)) + { + /* During CREATE OPERATOR CLASS, need CCI to see the pg_opclass row */ + CommandCounterIncrement(); + opcintype = get_opclass_input_type(opclassoid); + } + else + opcintype = InvalidOid; + + /* + * We handle operators and support functions almost identically, so rather + * than duplicate this code block, just join the lists. + */ + foreach(lc, list_concat_copy(operators, functions)) + { + OpFamilyMember *op = (OpFamilyMember *) lfirst(lc); + + if (op->is_func && op->number != HASHSTANDARD_PROC) + { + /* Optional support proc, so always a soft family dependency */ + op->ref_is_hard = false; + op->ref_is_family = true; + op->refobjid = opfamilyoid; + } + else if (op->lefttype != op->righttype) + { + /* Cross-type, so always a soft family dependency */ + op->ref_is_hard = false; + op->ref_is_family = true; + op->refobjid = opfamilyoid; + } + else + { + /* Not cross-type; is there a suitable opclass? */ + if (op->lefttype != opcintype) + { + /* Avoid repeating this expensive lookup, even if it fails */ + opcintype = op->lefttype; + opclassoid = opclass_for_family_datatype(HASH_AM_OID, + opfamilyoid, + opcintype); + } + if (OidIsValid(opclassoid)) + { + /* Hard dependency on opclass */ + op->ref_is_hard = true; + op->ref_is_family = false; + op->refobjid = opclassoid; + } + else + { + /* We're stuck, so make a soft dependency on the opfamily */ + op->ref_is_hard = false; + op->ref_is_family = true; + op->refobjid = opfamilyoid; + } + } + } +} diff --git a/src/backend/access/heap/Makefile b/src/backend/access/heap/Makefile new file mode 100644 index 0000000..af0bd18 --- /dev/null +++ b/src/backend/access/heap/Makefile @@ -0,0 +1,26 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/heap +# +# IDENTIFICATION +# src/backend/access/heap/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/heap +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + heapam.o \ + heapam_handler.o \ + heapam_visibility.o \ + heaptoast.o \ + hio.o \ + pruneheap.o \ + rewriteheap.o \ + vacuumlazy.o \ + visibilitymap.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/heap/README.HOT b/src/backend/access/heap/README.HOT new file mode 100644 index 0000000..68c6709 --- /dev/null +++ b/src/backend/access/heap/README.HOT @@ -0,0 +1,499 @@ +src/backend/access/heap/README.HOT + +Heap Only Tuples (HOT) +====================== + +The Heap Only Tuple (HOT) feature eliminates redundant index entries and +allows the re-use of space taken by DELETEd or obsoleted UPDATEd tuples +without performing a table-wide vacuum. It does this by allowing +single-page vacuuming, also called "defragmentation". + +Note: there is a Glossary at the end of this document that may be helpful +for first-time readers. + + +Technical Challenges +-------------------- + +Page-at-a-time vacuuming is normally impractical because of the costs of +finding and removing the index entries that link to the tuples to be +reclaimed. Standard vacuuming scans the indexes to ensure all such index +entries are removed, amortizing the index scan cost across as many dead +tuples as possible; this approach does not scale down well to the case of +reclaiming just a few tuples. 
In principle one could recompute the index +keys and do standard index searches to find the index entries, but this is +risky in the presence of possibly-buggy user-defined functions in +functional indexes. An allegedly immutable function that in fact is not +immutable might prevent us from re-finding an index entry (and we cannot +throw an error for not finding it, in view of the fact that dead index +entries are sometimes reclaimed early). That would lead to a seriously +corrupt index, in the form of entries pointing to tuple slots that by now +contain some unrelated content. In any case we would prefer to be able +to do vacuuming without invoking any user-written code. + +HOT solves this problem for a restricted but useful special case: +where a tuple is repeatedly updated in ways that do not change its +indexed columns. (Here, "indexed column" means any column referenced +at all in an index definition, including for example columns that are +tested in a partial-index predicate but are not stored in the index.) + +An additional property of HOT is that it reduces index size by avoiding +the creation of identically-keyed index entries. This improves search +speeds. + + +Update Chains With a Single Index Entry +--------------------------------------- + +Without HOT, every version of a row in an update chain has its own index +entries, even if all indexed columns are the same. With HOT, a new tuple +placed on the same page and with all indexed columns the same as its +parent row version does not get new index entries. This means there is +only one index entry for the entire update chain on the heap page. +An index-entry-less tuple is marked with the HEAP_ONLY_TUPLE flag. +The prior row version is marked HEAP_HOT_UPDATED, and (as always in an +update chain) its t_ctid field links forward to the newer version. + +For example: + + Index points to 1 + lp [1] [2] + + [111111111]->[2222222222] + +In the above diagram, the index points to line pointer 1, and tuple 1 is +marked as HEAP_HOT_UPDATED. Tuple 2 is a HOT tuple, meaning it has +no index entry pointing to it, and is marked as HEAP_ONLY_TUPLE. +Although tuple 2 is not directly referenced by the index, it can still be +found by an index search: after traversing from the index to tuple 1, +the index search proceeds forward to child tuples as long as it sees the +HEAP_HOT_UPDATED flag set. Since we restrict the HOT chain to lie within +a single page, this requires no additional page fetches and doesn't +introduce much performance penalty. + +Eventually, tuple 1 will no longer be visible to any transaction. +At that point its space could be reclaimed, but its line pointer cannot, +since the index still links to that line pointer and we still need to +be able to find tuple 2 in an index search. HOT handles this by turning +line pointer 1 into a "redirecting line pointer", which links to tuple 2 +but has no actual tuple attached. This state of affairs looks like + + Index points to 1 + lp [1]->[2] + + [2222222222] + +If now the row is updated again, to version 3, the page looks like this: + + Index points to 1 + lp [1]->[2] [3] + + [2222222222]->[3333333333] + +At some later time when no transaction can see tuple 2 in its snapshot, +tuple 2 and its line pointer can be pruned entirely: + + Index points to 1 + lp [1]------>[3] + + [3333333333] + +This is safe because no index entry points to line pointer 2. Subsequent +insertions into the page can now recycle both line pointer 2 and the +space formerly used by tuple 2. 
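+
+To make the traversal concrete, here is a simplified sketch of how a reader
+could walk such a chain within one page.  It is an illustration only, not the
+actual implementation: the names hot_chain_tail, page and root_offnum are
+invented for this example, and the real chain-following code also checks
+tuple visibility and verifies that each member's XMIN matches its parent's
+XMAX (see Abort Cases below), which is omitted here.
+
+    #include "postgres.h"
+    #include "access/htup_details.h"
+    #include "storage/bufpage.h"
+
+    /* Return the offset at which the walk down the HOT chain stops. */
+    static OffsetNumber
+    hot_chain_tail(Page page, OffsetNumber root_offnum)
+    {
+        OffsetNumber offnum = root_offnum;   /* offset the index TID names */
+
+        for (;;)
+        {
+            ItemId          lp = PageGetItemId(page, offnum);
+            HeapTupleHeader htup;
+
+            if (ItemIdIsRedirected(lp))
+            {
+                offnum = ItemIdGetRedirect(lp);  /* follow redirect to chain */
+                continue;
+            }
+            if (!ItemIdIsNormal(lp))
+                break;              /* unused or dead: chain has been pruned */
+
+            htup = (HeapTupleHeader) PageGetItem(page, lp);
+            /* (visibility test against the caller's snapshot would go here) */
+            if (!HeapTupleHeaderIsHotUpdated(htup))
+                break;              /* last member of the chain */
+            offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
+        }
+
+        return offnum;
+    }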
+ +If an update changes any indexed column, or there is not room on the +same page for the new tuple, then the HOT chain ends: the last member +has a regular t_ctid link to the next version and is not marked +HEAP_HOT_UPDATED. (In principle we could continue a HOT chain across +pages, but this would destroy the desired property of being able to +reclaim space with just page-local manipulations. Anyway, we don't +want to have to chase through multiple heap pages to get from an index +entry to the desired tuple, so it seems better to create a new index +entry for the new tuple.) If further updates occur, the next version +could become the root of a new HOT chain. + +Line pointer 1 has to remain as long as there is any non-dead member of +the chain on the page. When there is not, it is marked "dead". +This lets us reclaim the last child line pointer and associated tuple +immediately. The next regular VACUUM pass can reclaim the index entries +pointing at the line pointer and then the line pointer itself. Since a +line pointer is small compared to a tuple, this does not represent an +undue space cost. + +Note: we can use a "dead" line pointer for any DELETEd tuple, +whether it was part of a HOT chain or not. This allows space reclamation +in advance of running VACUUM for plain DELETEs as well as HOT updates. + +The requirement for doing a HOT update is that none of the indexed +columns are changed. This is checked at execution time by comparing the +binary representation of the old and new values. We insist on bitwise +equality rather than using datatype-specific equality routines. The +main reason to avoid the latter is that there might be multiple notions +of equality for a datatype, and we don't know exactly which one is +relevant for the indexes at hand. We assume that bitwise equality +guarantees equality for all purposes. + + +Abort Cases +----------- + +If a heap-only tuple's xmin is aborted, then it can be removed immediately: +it was never visible to any other transaction, and all descendant row +versions must be aborted as well. Therefore we need not consider it part +of a HOT chain. By the same token, if a HOT-updated tuple's xmax is +aborted, there is no need to follow the chain link. However, there is a +race condition here: the transaction that did the HOT update might abort +between the time we inspect the HOT-updated tuple and the time we reach +the descendant heap-only tuple. It is conceivable that someone prunes +the heap-only tuple before that, and even conceivable that the line pointer +is re-used for another purpose. Therefore, when following a HOT chain, +it is always necessary to be prepared for the possibility that the +linked-to line pointer is unused, dead, or redirected; and if it is a +normal line pointer, we still have to check that XMIN of the tuple matches +the XMAX of the tuple we left. Otherwise we should assume that we have +come to the end of the HOT chain. Note that this sort of XMIN/XMAX +matching is required when following ordinary update chains anyway. + +(Early versions of the HOT code assumed that holding pin on the page +buffer while following a HOT link would prevent this type of problem, +but checking XMIN/XMAX matching is a much more robust solution.) + + +Index/Sequential Scans +---------------------- + +When doing an index scan, whenever we reach a HEAP_HOT_UPDATED tuple whose +xmax is not aborted, we need to follow its t_ctid link and check that +entry as well; possibly repeatedly until we reach the end of the HOT +chain. 
(When using an MVCC snapshot it is possible to optimize this a +bit: there can be at most one visible tuple in the chain, so we can stop +when we find it. This rule does not work for non-MVCC snapshots, though.) + +Sequential scans do not need to pay attention to the HOT links because +they scan every line pointer on the page anyway. The same goes for a +bitmap heap scan with a lossy bitmap. + + +Pruning +------- + +HOT pruning means updating line pointers so that HOT chains are +reduced in length, by collapsing out line pointers for intermediate dead +tuples. Although this makes those line pointers available for re-use, +it does not immediately make the space occupied by their tuples available. + + +Defragmentation +--------------- + +Defragmentation centralizes unused space. After we have converted root +line pointers to redirected line pointers and pruned away any dead +intermediate line pointers, the tuples they linked to are free space. +But unless that space is adjacent to the central "hole" on the page +(the pd_lower-to-pd_upper area) it cannot be used by tuple insertion. +Defragmentation moves the surviving tuples to coalesce all the free +space into one "hole". This is done with the same PageRepairFragmentation +function that regular VACUUM uses. + + +When can/should we prune or defragment? +--------------------------------------- + +This is the most interesting question in HOT implementation, since there +is no simple right answer: we must use heuristics to determine when it's +most efficient to perform pruning and/or defragmenting. + +We cannot prune or defragment unless we can get a "buffer cleanup lock" +on the target page; otherwise, pruning might destroy line pointers that +other backends have live references to, and defragmenting might move +tuples that other backends have live pointers to. Thus the general +approach must be to heuristically decide if we should try to prune +or defragment, and if so try to acquire the buffer cleanup lock without +blocking. If we succeed we can proceed with our housekeeping work. +If we cannot get the lock (which should not happen often, except under +very heavy contention) then the housekeeping has to be postponed till +some other time. The worst-case consequence of this is only that an +UPDATE cannot be made HOT but has to link to a new tuple version placed on +some other page, for lack of centralized space on the original page. + +Ideally we would do defragmenting only when we are about to attempt +heap_update on a HOT-safe tuple. The difficulty with this approach +is that the update query has certainly got a pin on the old tuple, and +therefore our attempt to acquire a buffer cleanup lock will always fail. +(This corresponds to the idea that we don't want to move the old tuple +out from under where the query's HeapTuple pointer points. It might +be possible to finesse that, but it seems fragile.) + +Pruning, however, is potentially useful even when we are not about to +insert a new tuple, since shortening a HOT chain reduces the cost of +subsequent index searches. However it is unclear that this gain is +large enough to accept any extra maintenance burden for. + +The currently planned heuristic is to prune and defrag when first accessing +a page that potentially has prunable tuples (as flagged by the pd_prune_xid +page hint field) and that either has free space less than MAX(fillfactor +target free space, BLCKSZ/10) *or* has recently had an UPDATE fail to +find enough free space to store an updated tuple version. 
(These rules +are subject to change.) + +We have effectively implemented the "truncate dead tuples to just line +pointer" idea that has been proposed and rejected before because of fear +of line pointer bloat: we might end up with huge numbers of line pointers +and just a few actual tuples on a page. To limit the damage in the worst +case, and to keep various work arrays as well as the bitmaps in bitmap +scans reasonably sized, the maximum number of line pointers per page +is arbitrarily capped at MaxHeapTuplesPerPage (the most tuples that +could fit without HOT pruning). + +Effectively, space reclamation happens during tuple retrieval when the +page is nearly full (<10% free) and a buffer cleanup lock can be +acquired. This means that UPDATE, DELETE, and SELECT can trigger space +reclamation, but often not during INSERT ... VALUES because it does +not retrieve a row. + + +VACUUM +------ + +There is little change to regular vacuum. It performs pruning to remove +dead heap-only tuples, and cleans up any dead line pointers as if they were +regular dead tuples. + + +Statistics +---------- + +Currently, we count HOT updates the same as cold updates for statistics +purposes, though there is an additional per-table counter that counts +only HOT updates. When a page pruning operation is able to remove a +physical tuple by eliminating an intermediate heap-only tuple or +replacing a physical root tuple by a redirect pointer, a decrement in +the table's number of dead tuples is reported to pgstats, which may +postpone autovacuuming. Note that we do not count replacing a root tuple +by a DEAD line pointer as decrementing n_dead_tuples; we still want +autovacuum to run to clean up the index entries and DEAD item. + +This area probably needs further work ... + + +CREATE INDEX +------------ + +CREATE INDEX presents a problem for HOT updates. While the existing HOT +chains all have the same index values for existing indexes, the columns +in the new index might change within a pre-existing HOT chain, creating +a "broken" chain that can't be indexed properly. + +To address this issue, regular (non-concurrent) CREATE INDEX makes the +new index usable only by new transactions and transactions that don't +have snapshots older than the CREATE INDEX command. This prevents +queries that can see the inconsistent HOT chains from trying to use the +new index and getting incorrect results. Queries that can see the index +can only see the rows that were visible after the index was created, +hence the HOT chains are consistent for them. + +Entries in the new index point to root tuples (tuples with current index +pointers) so that our index uses the same index pointers as all other +indexes on the table. However the row we want to index is actually at +the *end* of the chain, ie, the most recent live tuple on the HOT chain. +That is the one we compute the index entry values for, but the TID +we put into the index is that of the root tuple. Since queries that +will be allowed to use the new index cannot see any of the older tuple +versions in the chain, the fact that they might not match the index entry +isn't a problem. (Such queries will check the tuple visibility +information of the older versions and ignore them, without ever looking at +their contents, so the content inconsistency is OK.) Subsequent updates +to the live tuple will be allowed to extend the HOT chain only if they are +HOT-safe for all the indexes. 
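+
+A rough sketch of the root-TID rule above, under the assumption that the
+caller has already filled a root_offsets[] array for the page, mapping each
+offset to the offset of its chain root, in the way heap_get_root_tuples()
+does during an index build.  The helper name index_tid_for_tuple is invented
+for illustration; the production logic is part of the heap AM's
+index-build scan.
+
+    #include "postgres.h"
+    #include "access/htup_details.h"
+
+    /* Choose the TID a new index entry should carry for heapTuple. */
+    static void
+    index_tid_for_tuple(HeapTuple heapTuple,
+                        const OffsetNumber *root_offsets,
+                        ItemPointer out_tid)
+    {
+        if (HeapTupleIsHeapOnly(heapTuple))
+        {
+            /* Heap-only member: the index must point at the chain's root,
+             * even though the key values come from this live tuple. */
+            OffsetNumber offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self);
+
+            ItemPointerSet(out_tid,
+                           ItemPointerGetBlockNumber(&heapTuple->t_self),
+                           root_offsets[offnum - 1]);
+        }
+        else
+        {
+            /* Ordinary tuple: its own TID goes into the index. */
+            *out_tid = heapTuple->t_self;
+        }
+    }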
+ +Because we have ShareLock on the table, any DELETE_IN_PROGRESS or +INSERT_IN_PROGRESS tuples should have come from our own transaction. +Therefore we can consider them committed since if the CREATE INDEX +commits, they will be committed, and if it aborts the index is discarded. +An exception to this is that early lock release is customary for system +catalog updates, and so we might find such tuples when reindexing a system +catalog. In that case we deal with it by waiting for the source +transaction to commit or roll back. (We could do that for user tables +too, but since the case is unexpected we prefer to throw an error.) + +Practically, we prevent certain transactions from using the new index by +setting pg_index.indcheckxmin to TRUE. Transactions are allowed to use +such an index only after pg_index.xmin is below their TransactionXmin +horizon, thereby ensuring that any incompatible rows in HOT chains are +dead to them. (pg_index.xmin will be the XID of the CREATE INDEX +transaction. The reason for using xmin rather than a normal column is +that the regular vacuum freezing mechanism will take care of converting +xmin to FrozenTransactionId before it can wrap around.) + +This means in particular that the transaction creating the index will be +unable to use the index if the transaction has old snapshots. We +alleviate that problem somewhat by not setting indcheckxmin unless the +table actually contains HOT chains with RECENTLY_DEAD members. + +Another unpleasant consequence is that it is now risky to use SnapshotAny +in an index scan: if the index was created more recently than the last +vacuum, it's possible that some of the visited tuples do not match the +index entry they are linked to. This does not seem to be a fatal +objection, since there are few users of SnapshotAny and most use seqscans. +The only exception at this writing is CLUSTER, which is okay because it +does not require perfect ordering of the indexscan readout (and especially +so because CLUSTER tends to write recently-dead tuples out of order anyway). + + +CREATE INDEX CONCURRENTLY +------------------------- + +In the concurrent case we must take a different approach. We create the +pg_index entry immediately, before we scan the table. The pg_index entry +is marked as "not ready for inserts". Then we commit and wait for any +transactions which have the table open to finish. This ensures that no +new HOT updates will change the key value for our new index, because all +transactions will see the existence of the index and will respect its +constraint on which updates can be HOT. Other transactions must include +such an index when determining HOT-safety of updates, even though they +must ignore it for both insertion and searching purposes. + +We must do this to avoid making incorrect index entries. For example, +suppose we are building an index on column X and we make an index entry for +a non-HOT tuple with X=1. Then some other backend, unaware that X is an +indexed column, HOT-updates the row to have X=2, and commits. We now have +an index entry for X=1 pointing at a HOT chain whose live row has X=2. +We could make an index entry with X=2 during the validation pass, but +there is no nice way to get rid of the wrong entry with X=1. So we must +have the HOT-safety property enforced before we start to build the new +index. + +After waiting for transactions which had the table open, we build the index +for all rows that are valid in a fresh snapshot. 
Any tuples visible in the +snapshot will have only valid forward-growing HOT chains. (They might have +older HOT updates behind them which are broken, but this is OK for the same +reason it's OK in a regular index build.) As above, we point the index +entry at the root of the HOT-update chain but we use the key value from the +live tuple. + +We mark the index open for inserts (but still not ready for reads) then +we again wait for transactions which have the table open. Then we take +a second reference snapshot and validate the index. This searches for +tuples missing from the index, and inserts any missing ones. Again, +the index entries have to have TIDs equal to HOT-chain root TIDs, but +the value to be inserted is the one from the live tuple. + +Then we wait until every transaction that could have a snapshot older than +the second reference snapshot is finished. This ensures that nobody is +alive any longer who could need to see any tuples that might be missing +from the index, as well as ensuring that no one can see any inconsistent +rows in a broken HOT chain (the first condition is stronger than the +second). Finally, we can mark the index valid for searches. + +Note that we do not need to set pg_index.indcheckxmin in this code path, +because we have outwaited any transactions that would need to avoid using +the index. (indcheckxmin is only needed because non-concurrent CREATE +INDEX doesn't want to wait; its stronger lock would create too much risk of +deadlock if it did.) + + +DROP INDEX CONCURRENTLY +----------------------- + +DROP INDEX CONCURRENTLY is sort of the reverse sequence of CREATE INDEX +CONCURRENTLY. We first mark the index as not indisvalid, and then wait for +any transactions that could be using it in queries to end. (During this +time, index updates must still be performed as normal, since such +transactions might expect freshly inserted tuples to be findable.) +Then, we clear indisready and indislive, and again wait for transactions +that could be updating the index to end. Finally we can drop the index +normally (though taking only ShareUpdateExclusiveLock on its parent table). + +The reason we need the pg_index.indislive flag is that after the second +wait step begins, we don't want transactions to be touching the index at +all; otherwise they might suffer errors if the DROP finally commits while +they are reading catalog entries for the index. If we had only indisvalid +and indisready, this state would be indistinguishable from the first stage +of CREATE INDEX CONCURRENTLY --- but in that state, we *do* want +transactions to examine the index, since they must consider it in +HOT-safety checks. + + +Limitations and Restrictions +---------------------------- + +It is worth noting that HOT forever forecloses alternative approaches +to vacuuming, specifically the recompute-the-index-keys approach alluded +to in Technical Challenges above. It'll be tough to recompute the index +keys for a root line pointer you don't have data for anymore ... + + +Glossary +-------- + +Broken HOT Chain + + A HOT chain in which the key value for an index has changed. + + This is not allowed to occur normally but if a new index is created + it can happen. In that case various strategies are used to ensure + that no transaction for which the older tuples are visible can + use the index. + +Cold update + + A normal, non-HOT update, in which index entries are made for + the new version of the tuple. 
+ +Dead line pointer + + A stub line pointer, that does not point to anything, but cannot + be removed or reused yet because there are index pointers to it. + Semantically same as a dead tuple. It has state LP_DEAD. + +Heap-only tuple + + A heap tuple with no index pointers, which can only be reached + from indexes indirectly through its ancestral root tuple. + Marked with HEAP_ONLY_TUPLE flag. + +HOT-safe + + A proposed tuple update is said to be HOT-safe if it changes + none of the tuple's indexed columns. It will only become an + actual HOT update if we can find room on the same page for + the new tuple version. + +HOT update + + An UPDATE where the new tuple becomes a heap-only tuple, and no + new index entries are made. + +HOT-updated tuple + + An updated tuple, for which the next tuple in the chain is a + heap-only tuple. Marked with HEAP_HOT_UPDATED flag. + +Indexed column + + A column used in an index definition. The column might not + actually be stored in the index --- it could be used in a + functional index's expression, or used in a partial index + predicate. HOT treats all these cases alike. + +Redirecting line pointer + + A line pointer that points to another line pointer and has no + associated tuple. It has the special lp_flags state LP_REDIRECT, + and lp_off is the OffsetNumber of the line pointer it links to. + This is used when a root tuple becomes dead but we cannot prune + the line pointer because there are non-dead heap-only tuples + further down the chain. + +Root tuple + + The first tuple in a HOT update chain; the one that indexes point to. + +Update chain + + A chain of updated tuples, in which each tuple's ctid points to + the next tuple in the chain. A HOT update chain is an update chain + (or portion of an update chain) that consists of a root tuple and + one or more heap-only tuples. A complete update chain can contain + both HOT and non-HOT (cold) updated tuples. diff --git a/src/backend/access/heap/README.tuplock b/src/backend/access/heap/README.tuplock new file mode 100644 index 0000000..6441e8b --- /dev/null +++ b/src/backend/access/heap/README.tuplock @@ -0,0 +1,155 @@ +Locking tuples +-------------- + +Locking tuples is not as easy as locking tables or other database objects. +The problem is that transactions might want to lock large numbers of tuples at +any one time, so it's not possible to keep the locks objects in shared memory. +To work around this limitation, we use a two-level mechanism. The first level +is implemented by storing locking information in the tuple header: a tuple is +marked as locked by setting the current transaction's XID as its XMAX, and +setting additional infomask bits to distinguish this case from the more normal +case of having deleted the tuple. When multiple transactions concurrently +lock a tuple, a MultiXact is used; see below. This mechanism can accommodate +arbitrarily large numbers of tuples being locked simultaneously. + +When it is necessary to wait for a tuple-level lock to be released, the basic +delay is provided by XactLockTableWait or MultiXactIdWait on the contents of +the tuple's XMAX. However, that mechanism will release all waiters +concurrently, so there would be a race condition as to which waiter gets the +tuple, potentially leading to indefinite starvation of some waiters. The +possibility of share-locking makes the problem much worse --- a steady stream +of share-lockers can easily block an exclusive locker forever. 
To provide +more reliable semantics about who gets a tuple-level lock first, we use the +standard lock manager, which implements the second level mentioned above. The +protocol for waiting for a tuple-level lock is really + + LockTuple() + XactLockTableWait() + mark tuple as locked by me + UnlockTuple() + +When there are multiple waiters, arbitration of who is to get the lock next +is provided by LockTuple(). However, at most one tuple-level lock will +be held or awaited per backend at any time, so we don't risk overflow +of the lock table. Note that incoming share-lockers are required to +do LockTuple as well, if there is any conflict, to ensure that they don't +starve out waiting exclusive-lockers. However, if there is not any active +conflict for a tuple, we don't incur any extra overhead. + +We make an exception to the above rule for those lockers that already hold +some lock on a tuple and attempt to acquire a stronger one on it. In that +case, we skip the LockTuple() call even when there are conflicts, provided +that the target tuple is being locked, updated or deleted by multiple sessions +concurrently. Failing to skip the lock would risk a deadlock, e.g., between a +session that was first to record its weaker lock in the tuple header and would +be waiting on the LockTuple() call to upgrade to the stronger lock level, and +another session that has already done LockTuple() and is waiting for the first +session transaction to release its tuple header-level lock. + +We provide four levels of tuple locking strength: SELECT FOR UPDATE obtains an +exclusive lock which prevents any kind of modification of the tuple. This is +the lock level that is implicitly taken by DELETE operations, and also by +UPDATE operations if they modify any of the tuple's key fields. SELECT FOR NO +KEY UPDATE likewise obtains an exclusive lock, but only prevents tuple removal +and modifications which might alter the tuple's key. This is the lock that is +implicitly taken by UPDATE operations which leave all key fields unchanged. +SELECT FOR SHARE obtains a shared lock which prevents any kind of tuple +modification. Finally, SELECT FOR KEY SHARE obtains a shared lock which only +prevents tuple removal and modifications of key fields. This lock level is +just strong enough to implement RI checks, i.e. it ensures that tuples do not +go away from under a check, without blocking transactions that want to update +the tuple without changing its key. + +The conflict table is: + + UPDATE NO KEY UPDATE SHARE KEY SHARE +UPDATE conflict conflict conflict conflict +NO KEY UPDATE conflict conflict conflict +SHARE conflict conflict +KEY SHARE conflict + +When there is a single locker in a tuple, we can just store the locking info +in the tuple itself. We do this by storing the locker's Xid in XMAX, and +setting infomask bits specifying the locking strength. There is one exception +here: since infomask space is limited, we do not provide a separate bit +for SELECT FOR SHARE, so we have to use the extended info in a MultiXact in +that case. (The other cases, SELECT FOR UPDATE and SELECT FOR KEY SHARE, are +presumably more commonly used due to being the standards-mandated locking +mechanism, or heavily used by the RI code, so we want to provide fast paths +for those.) + +MultiXacts +---------- + +A tuple header provides very limited space for storing information about tuple +locking and updates: there is room only for a single Xid and a small number of +infomask bits. 
Whenever we need to store more than one lock, we replace the +first locker's Xid with a new MultiXactId. Each MultiXact provides extended +locking data; it comprises an array of Xids plus some flags bits for each one. +The flags are currently used to store the locking strength of each member +transaction. (The flags also distinguish a pure locker from an updater.) + +In earlier PostgreSQL releases, a MultiXact always meant that the tuple was +locked in shared mode by multiple transactions. This is no longer the case; a +MultiXact may contain an update or delete Xid. (Keep in mind that tuple locks +in a transaction do not conflict with other tuple locks in the same +transaction, so it's possible to have otherwise conflicting locks in a +MultiXact if they belong to the same transaction). + +Note that each lock is attributed to the subtransaction that acquires it. +This means that a subtransaction that aborts is seen as though it releases the +locks it acquired; concurrent transactions can then proceed without having to +wait for the main transaction to finish. It also means that a subtransaction +can upgrade to a stronger lock level than an earlier transaction had, and if +the subxact aborts, the earlier, weaker lock is kept. + +The possibility of having an update within a MultiXact means that they must +persist across crashes and restarts: a future reader of the tuple needs to +figure out whether the update committed or aborted. So we have a requirement +that pg_multixact needs to retain pages of its data until we're certain that +the MultiXacts in them are no longer of interest. + +VACUUM is in charge of removing old MultiXacts at the time of tuple freezing. +The lower bound used by vacuum (that is, the value below which all multixacts +are removed) is stored as pg_class.relminmxid for each table; the minimum of +all such values is stored in pg_database.datminmxid. The minimum across +all databases, in turn, is recorded in checkpoint records, and CHECKPOINT +removes pg_multixact/ segments older than that value once the checkpoint +record has been flushed. + +Infomask Bits +------------- + +The following infomask bits are applicable: + +- HEAP_XMAX_INVALID + Any tuple with this bit set does not have a valid value stored in XMAX. + +- HEAP_XMAX_IS_MULTI + This bit is set if the tuple's Xmax is a MultiXactId (as opposed to a + regular TransactionId). + +- HEAP_XMAX_LOCK_ONLY + This bit is set when the XMAX is a locker only; that is, if it's a + multixact, it does not contain an update among its members. It's set when + the XMAX is a plain Xid that locked the tuple, as well. + +- HEAP_XMAX_KEYSHR_LOCK +- HEAP_XMAX_SHR_LOCK +- HEAP_XMAX_EXCL_LOCK + These bits indicate the strength of the lock acquired; they are useful when + the XMAX is not a MultiXactId. If it's a multi, the info is to be found in + the member flags. If HEAP_XMAX_IS_MULTI is not set and HEAP_XMAX_LOCK_ONLY + is set, then one of these *must* be set as well. + + Note that HEAP_XMAX_EXCL_LOCK does not distinguish FOR NO KEY UPDATE from + FOR UPDATE; this is implemented by the HEAP_KEYS_UPDATED bit. + +- HEAP_KEYS_UPDATED + This bit lives in t_infomask2. If set, indicates that the operation(s) done + by the XMAX compromise the tuple key, such as a SELECT FOR UPDATE, an UPDATE + that modifies the columns of the key, or a DELETE. It's set regardless of + whether the XMAX is a TransactionId or a MultiXactId. + +We currently never set the HEAP_XMAX_COMMITTED when the HEAP_XMAX_IS_MULTI bit +is set. 
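+
+To tie the bit descriptions together, here is an illustrative sketch of how
+the lock strength of a single, non-MultiXact locker could be decoded from the
+infomask words.  It is not the actual implementation: single_locker_mode is
+an invented name, and the equivalent production logic lives in heapam.c.
+
+    #include "postgres.h"
+    #include "access/htup_details.h"
+    #include "nodes/lockoptions.h"
+
+    /* Decode the strength recorded for a plain-Xid locker.  Valid only when
+     * HEAP_XMAX_LOCK_ONLY is set and HEAP_XMAX_IS_MULTI is not; for a
+     * MultiXact the strength is kept in the member flags instead. */
+    static LockTupleMode
+    single_locker_mode(uint16 infomask, uint16 infomask2)
+    {
+        Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask));
+        Assert(!(infomask & HEAP_XMAX_IS_MULTI));
+
+        if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
+            return LockTupleKeyShare;
+
+        /* A plain-Xid FOR SHARE lock is normally stored as a MultiXact, per
+         * the text above, but the bit pattern is still defined. */
+        if (HEAP_XMAX_IS_SHR_LOCKED(infomask))
+            return LockTupleShare;
+
+        /* HEAP_XMAX_EXCL_LOCK covers both exclusive flavors; HEAP_KEYS_UPDATED
+         * in t_infomask2 tells FOR UPDATE apart from FOR NO KEY UPDATE. */
+        if (infomask2 & HEAP_KEYS_UPDATED)
+            return LockTupleExclusive;
+        return LockTupleNoKeyExclusive;
+    }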
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c new file mode 100644 index 0000000..64b9ec0 --- /dev/null +++ b/src/backend/access/heap/heapam.c @@ -0,0 +1,9955 @@ +/*------------------------------------------------------------------------- + * + * heapam.c + * heap access method code + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/heap/heapam.c + * + * + * INTERFACE ROUTINES + * heap_beginscan - begin relation scan + * heap_rescan - restart a relation scan + * heap_endscan - end relation scan + * heap_getnext - retrieve next tuple in scan + * heap_fetch - retrieve tuple with given tid + * heap_insert - insert tuple into a relation + * heap_multi_insert - insert multiple tuples into a relation + * heap_delete - delete a tuple from a relation + * heap_update - replace a tuple in a relation with another tuple + * + * NOTES + * This file contains the heap_ routines which implement + * the POSTGRES heap access method used for all POSTGRES + * relations. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/bufmask.h" +#include "access/genam.h" +#include "access/heapam.h" +#include "access/heapam_xlog.h" +#include "access/heaptoast.h" +#include "access/hio.h" +#include "access/multixact.h" +#include "access/parallel.h" +#include "access/relscan.h" +#include "access/subtrans.h" +#include "access/syncscan.h" +#include "access/sysattr.h" +#include "access/tableam.h" +#include "access/transam.h" +#include "access/valid.h" +#include "access/visibilitymap.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "access/xlogutils.h" +#include "catalog/catalog.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "port/atomics.h" +#include "port/pg_bitutils.h" +#include "storage/bufmgr.h" +#include "storage/freespace.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "storage/procarray.h" +#include "storage/smgr.h" +#include "storage/spin.h" +#include "storage/standby.h" +#include "utils/datum.h" +#include "utils/inval.h" +#include "utils/lsyscache.h" +#include "utils/relcache.h" +#include "utils/snapmgr.h" +#include "utils/spccache.h" + + +static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, + TransactionId xid, CommandId cid, int options); +static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, + Buffer newbuf, HeapTuple oldtup, + HeapTuple newtup, HeapTuple old_key_tuple, + bool all_visible_cleared, bool new_all_visible_cleared); +static Bitmapset *HeapDetermineColumnsInfo(Relation relation, + Bitmapset *interesting_cols, + Bitmapset *external_cols, + HeapTuple oldtup, HeapTuple newtup, + bool *has_external); +static bool heap_acquire_tuplock(Relation relation, ItemPointer tid, + LockTupleMode mode, LockWaitPolicy wait_policy, + bool *have_tuple_lock); +static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask, + uint16 old_infomask2, TransactionId add_to_xmax, + LockTupleMode mode, bool is_update, + TransactionId *result_xmax, uint16 *result_infomask, + uint16 *result_infomask2); +static TM_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple, + ItemPointer ctid, TransactionId xid, + LockTupleMode mode); +static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask, + uint16 *new_infomask2); +static 
TransactionId MultiXactIdGetUpdateXid(TransactionId xmax, + uint16 t_infomask); +static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask, + LockTupleMode lockmode, bool *current_is_member); +static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, + Relation rel, ItemPointer ctid, XLTW_Oper oper, + int *remaining); +static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status, + uint16 infomask, Relation rel, int *remaining); +static void index_delete_sort(TM_IndexDeleteOp *delstate); +static int bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate); +static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup); +static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_required, + bool *copy); + + +/* + * Each tuple lock mode has a corresponding heavyweight lock, and one or two + * corresponding MultiXactStatuses (one to merely lock tuples, another one to + * update them). This table (and the macros below) helps us determine the + * heavyweight lock mode and MultiXactStatus values to use for any particular + * tuple lock strength. + * + * Don't look at lockstatus/updstatus directly! Use get_mxact_status_for_lock + * instead. + */ +static const struct +{ + LOCKMODE hwlock; + int lockstatus; + int updstatus; +} + + tupleLockExtraInfo[MaxLockTupleMode + 1] = +{ + { /* LockTupleKeyShare */ + AccessShareLock, + MultiXactStatusForKeyShare, + -1 /* KeyShare does not allow updating tuples */ + }, + { /* LockTupleShare */ + RowShareLock, + MultiXactStatusForShare, + -1 /* Share does not allow updating tuples */ + }, + { /* LockTupleNoKeyExclusive */ + ExclusiveLock, + MultiXactStatusForNoKeyUpdate, + MultiXactStatusNoKeyUpdate + }, + { /* LockTupleExclusive */ + AccessExclusiveLock, + MultiXactStatusForUpdate, + MultiXactStatusUpdate + } +}; + +/* Get the LOCKMODE for a given MultiXactStatus */ +#define LOCKMODE_from_mxstatus(status) \ + (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock) + +/* + * Acquire heavyweight locks on tuples, using a LockTupleMode strength value. + * This is more readable than having every caller translate it to lock.h's + * LOCKMODE. + */ +#define LockTupleTuplock(rel, tup, mode) \ + LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock) +#define UnlockTupleTuplock(rel, tup, mode) \ + UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock) +#define ConditionalLockTupleTuplock(rel, tup, mode) \ + ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock) + +#ifdef USE_PREFETCH +/* + * heap_index_delete_tuples and index_delete_prefetch_buffer use this + * structure to coordinate prefetching activity + */ +typedef struct +{ + BlockNumber cur_hblkno; + int next_item; + int ndeltids; + TM_IndexDelete *deltids; +} IndexDeletePrefetchState; +#endif + +/* heap_index_delete_tuples bottom-up index deletion costing constants */ +#define BOTTOMUP_MAX_NBLOCKS 6 +#define BOTTOMUP_TOLERANCE_NBLOCKS 3 + +/* + * heap_index_delete_tuples uses this when determining which heap blocks it + * must visit to help its bottom-up index deletion caller + */ +typedef struct IndexDeleteCounts +{ + int16 npromisingtids; /* Number of "promising" TIDs in group */ + int16 ntids; /* Number of TIDs in group */ + int16 ifirsttid; /* Offset to group's first deltid */ +} IndexDeleteCounts; + +/* + * This table maps tuple lock strength values for each particular + * MultiXactStatus value. 
+ */ +static const int MultiXactStatusLock[MaxMultiXactStatus + 1] = +{ + LockTupleKeyShare, /* ForKeyShare */ + LockTupleShare, /* ForShare */ + LockTupleNoKeyExclusive, /* ForNoKeyUpdate */ + LockTupleExclusive, /* ForUpdate */ + LockTupleNoKeyExclusive, /* NoKeyUpdate */ + LockTupleExclusive /* Update */ +}; + +/* Get the LockTupleMode for a given MultiXactStatus */ +#define TUPLOCK_from_mxstatus(status) \ + (MultiXactStatusLock[(status)]) + +/* ---------------------------------------------------------------- + * heap support routines + * ---------------------------------------------------------------- + */ + +/* ---------------- + * initscan - scan code common to heap_beginscan and heap_rescan + * ---------------- + */ +static void +initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock) +{ + ParallelBlockTableScanDesc bpscan = NULL; + bool allow_strat; + bool allow_sync; + + /* + * Determine the number of blocks we have to scan. + * + * It is sufficient to do this once at scan start, since any tuples added + * while the scan is in progress will be invisible to my snapshot anyway. + * (That is not true when using a non-MVCC snapshot. However, we couldn't + * guarantee to return tuples added after scan start anyway, since they + * might go into pages we already scanned. To guarantee consistent + * results for a non-MVCC snapshot, the caller must hold some higher-level + * lock that ensures the interesting tuple(s) won't change.) + */ + if (scan->rs_base.rs_parallel != NULL) + { + bpscan = (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel; + scan->rs_nblocks = bpscan->phs_nblocks; + } + else + scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_base.rs_rd); + + /* + * If the table is large relative to NBuffers, use a bulk-read access + * strategy and enable synchronized scanning (see syncscan.c). Although + * the thresholds for these features could be different, we make them the + * same so that there are only two behaviors to tune rather than four. + * (However, some callers need to be able to disable one or both of these + * behaviors, independently of the size of the table; also there is a GUC + * variable that can disable synchronized scanning.) + * + * Note that table_block_parallelscan_initialize has a very similar test; + * if you change this, consider changing that one, too. + */ + if (!RelationUsesLocalBuffers(scan->rs_base.rs_rd) && + scan->rs_nblocks > NBuffers / 4) + { + allow_strat = (scan->rs_base.rs_flags & SO_ALLOW_STRAT) != 0; + allow_sync = (scan->rs_base.rs_flags & SO_ALLOW_SYNC) != 0; + } + else + allow_strat = allow_sync = false; + + if (allow_strat) + { + /* During a rescan, keep the previous strategy object. */ + if (scan->rs_strategy == NULL) + scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD); + } + else + { + if (scan->rs_strategy != NULL) + FreeAccessStrategy(scan->rs_strategy); + scan->rs_strategy = NULL; + } + + if (scan->rs_base.rs_parallel != NULL) + { + /* For parallel scan, believe whatever ParallelTableScanDesc says. */ + if (scan->rs_base.rs_parallel->phs_syncscan) + scan->rs_base.rs_flags |= SO_ALLOW_SYNC; + else + scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC; + } + else if (keep_startblock) + { + /* + * When rescanning, we want to keep the previous startblock setting, + * so that rewinding a cursor doesn't generate surprising results. + * Reset the active syncscan setting, though. 
+ */ + if (allow_sync && synchronize_seqscans) + scan->rs_base.rs_flags |= SO_ALLOW_SYNC; + else + scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC; + } + else if (allow_sync && synchronize_seqscans) + { + scan->rs_base.rs_flags |= SO_ALLOW_SYNC; + scan->rs_startblock = ss_get_location(scan->rs_base.rs_rd, scan->rs_nblocks); + } + else + { + scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC; + scan->rs_startblock = 0; + } + + scan->rs_numblocks = InvalidBlockNumber; + scan->rs_inited = false; + scan->rs_ctup.t_data = NULL; + ItemPointerSetInvalid(&scan->rs_ctup.t_self); + scan->rs_cbuf = InvalidBuffer; + scan->rs_cblock = InvalidBlockNumber; + + /* page-at-a-time fields are always invalid when not rs_inited */ + + /* + * copy the scan key, if appropriate + */ + if (key != NULL && scan->rs_base.rs_nkeys > 0) + memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData)); + + /* + * Currently, we only have a stats counter for sequential heap scans (but + * e.g for bitmap scans the underlying bitmap index scans will be counted, + * and for sample scans we update stats for tuple fetches). + */ + if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN) + pgstat_count_heap_scan(scan->rs_base.rs_rd); +} + +/* + * heap_setscanlimits - restrict range of a heapscan + * + * startBlk is the page to start at + * numBlks is number of pages to scan (InvalidBlockNumber means "all") + */ +void +heap_setscanlimits(TableScanDesc sscan, BlockNumber startBlk, BlockNumber numBlks) +{ + HeapScanDesc scan = (HeapScanDesc) sscan; + + Assert(!scan->rs_inited); /* else too late to change */ + /* else rs_startblock is significant */ + Assert(!(scan->rs_base.rs_flags & SO_ALLOW_SYNC)); + + /* Check startBlk is valid (but allow case of zero blocks...) */ + Assert(startBlk == 0 || startBlk < scan->rs_nblocks); + + scan->rs_startblock = startBlk; + scan->rs_numblocks = numBlks; +} + +/* + * heapgetpage - subroutine for heapgettup() + * + * This routine reads and pins the specified page of the relation. + * In page-at-a-time mode it performs additional work, namely determining + * which tuples on the page are visible. + */ +void +heapgetpage(TableScanDesc sscan, BlockNumber page) +{ + HeapScanDesc scan = (HeapScanDesc) sscan; + Buffer buffer; + Snapshot snapshot; + Page dp; + int lines; + int ntup; + OffsetNumber lineoff; + ItemId lpp; + bool all_visible; + + Assert(page < scan->rs_nblocks); + + /* release previous scan buffer, if any */ + if (BufferIsValid(scan->rs_cbuf)) + { + ReleaseBuffer(scan->rs_cbuf); + scan->rs_cbuf = InvalidBuffer; + } + + /* + * Be sure to check for interrupts at least once per page. Checks at + * higher code levels won't be able to stop a seqscan that encounters many + * pages' worth of consecutive dead tuples. + */ + CHECK_FOR_INTERRUPTS(); + + /* read page using selected strategy */ + scan->rs_cbuf = ReadBufferExtended(scan->rs_base.rs_rd, MAIN_FORKNUM, page, + RBM_NORMAL, scan->rs_strategy); + scan->rs_cblock = page; + + if (!(scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE)) + return; + + buffer = scan->rs_cbuf; + snapshot = scan->rs_base.rs_snapshot; + + /* + * Prune and repair fragmentation for the whole page, if possible. + */ + heap_page_prune_opt(scan->rs_base.rs_rd, buffer); + + /* + * We must hold share lock on the buffer content while examining tuple + * visibility. Afterwards, however, the tuples we have found to be + * visible are guaranteed good as long as we hold the buffer pin. 
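+ * (The pin is what provides that guarantee: physically removing tuples
+ * requires a buffer cleanup lock, which cannot be obtained while any
+ * other backend still holds a pin on the page.)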
+ */ + LockBuffer(buffer, BUFFER_LOCK_SHARE); + + dp = BufferGetPage(buffer); + TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp); + lines = PageGetMaxOffsetNumber(dp); + ntup = 0; + + /* + * If the all-visible flag indicates that all tuples on the page are + * visible to everyone, we can skip the per-tuple visibility tests. + * + * Note: In hot standby, a tuple that's already visible to all + * transactions on the primary might still be invisible to a read-only + * transaction in the standby. We partly handle this problem by tracking + * the minimum xmin of visible tuples as the cut-off XID while marking a + * page all-visible on the primary and WAL log that along with the + * visibility map SET operation. In hot standby, we wait for (or abort) + * all transactions that can potentially may not see one or more tuples on + * the page. That's how index-only scans work fine in hot standby. A + * crucial difference between index-only scans and heap scans is that the + * index-only scan completely relies on the visibility map where as heap + * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if + * the page-level flag can be trusted in the same way, because it might + * get propagated somehow without being explicitly WAL-logged, e.g. via a + * full page write. Until we can prove that beyond doubt, let's check each + * tuple for visibility the hard way. + */ + all_visible = PageIsAllVisible(dp) && !snapshot->takenDuringRecovery; + + for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff); + lineoff <= lines; + lineoff++, lpp++) + { + if (ItemIdIsNormal(lpp)) + { + HeapTupleData loctup; + bool valid; + + loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd); + loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp); + loctup.t_len = ItemIdGetLength(lpp); + ItemPointerSet(&(loctup.t_self), page, lineoff); + + if (all_visible) + valid = true; + else + valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); + + HeapCheckForSerializableConflictOut(valid, scan->rs_base.rs_rd, + &loctup, buffer, snapshot); + + if (valid) + scan->rs_vistuples[ntup++] = lineoff; + } + } + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + Assert(ntup <= MaxHeapTuplesPerPage); + scan->rs_ntuples = ntup; +} + +/* ---------------- + * heapgettup - fetch next heap tuple + * + * Initialize the scan if not already done; then advance to the next + * tuple as indicated by "dir"; return the next tuple in scan->rs_ctup, + * or set scan->rs_ctup.t_data = NULL if no more tuples. + * + * dir == NoMovementScanDirection means "re-fetch the tuple indicated + * by scan->rs_ctup". + * + * Note: the reason nkeys/key are passed separately, even though they are + * kept in the scan descriptor, is that the caller may not want us to check + * the scankeys. + * + * Note: when we fall off the end of the scan in either direction, we + * reset rs_inited. This means that a further request with the same + * scan direction will restart the scan, which is a bit odd, but a + * request with the opposite scan direction will start a fresh scan + * in the proper direction. The latter is required behavior for cursors, + * while the former case is generally undefined behavior in Postgres + * so we don't care too much. 
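+ *
+ * For example, a cursor that has run off the end with FETCH FORWARD can
+ * still be walked back with FETCH BACKWARD: once rs_inited has been
+ * cleared, the next backward request simply starts a fresh scan from the
+ * last page.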
+ * ---------------- + */ +static void +heapgettup(HeapScanDesc scan, + ScanDirection dir, + int nkeys, + ScanKey key) +{ + HeapTuple tuple = &(scan->rs_ctup); + Snapshot snapshot = scan->rs_base.rs_snapshot; + bool backward = ScanDirectionIsBackward(dir); + BlockNumber page; + bool finished; + Page dp; + int lines; + OffsetNumber lineoff; + int linesleft; + ItemId lpp; + + /* + * calculate next starting lineoff, given scan direction + */ + if (ScanDirectionIsForward(dir)) + { + if (!scan->rs_inited) + { + /* + * return null immediately if relation is empty + */ + if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0) + { + Assert(!BufferIsValid(scan->rs_cbuf)); + tuple->t_data = NULL; + return; + } + if (scan->rs_base.rs_parallel != NULL) + { + ParallelBlockTableScanDesc pbscan = + (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel; + ParallelBlockTableScanWorker pbscanwork = + scan->rs_parallelworkerdata; + + table_block_parallelscan_startblock_init(scan->rs_base.rs_rd, + pbscanwork, pbscan); + + page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd, + pbscanwork, pbscan); + + /* Other processes might have already finished the scan. */ + if (page == InvalidBlockNumber) + { + Assert(!BufferIsValid(scan->rs_cbuf)); + tuple->t_data = NULL; + return; + } + } + else + page = scan->rs_startblock; /* first page */ + heapgetpage((TableScanDesc) scan, page); + lineoff = FirstOffsetNumber; /* first offnum */ + scan->rs_inited = true; + } + else + { + /* continue from previously returned page/tuple */ + page = scan->rs_cblock; /* current page */ + lineoff = /* next offnum */ + OffsetNumberNext(ItemPointerGetOffsetNumber(&(tuple->t_self))); + } + + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); + + dp = BufferGetPage(scan->rs_cbuf); + TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp); + lines = PageGetMaxOffsetNumber(dp); + /* page and lineoff now reference the physically next tid */ + + linesleft = lines - lineoff + 1; + } + else if (backward) + { + /* backward parallel scan not supported */ + Assert(scan->rs_base.rs_parallel == NULL); + + if (!scan->rs_inited) + { + /* + * return null immediately if relation is empty + */ + if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0) + { + Assert(!BufferIsValid(scan->rs_cbuf)); + tuple->t_data = NULL; + return; + } + + /* + * Disable reporting to syncscan logic in a backwards scan; it's + * not very likely anyone else is doing the same thing at the same + * time, and much more likely that we'll just bollix things for + * forward scanners. + */ + scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC; + + /* + * Start from last page of the scan. Ensure we take into account + * rs_numblocks if it's been adjusted by heap_setscanlimits(). 
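+ * For example, in a 10-block relation limited to rs_startblock = 4 and
+ * rs_numblocks = 3, the backward scan begins at block (4 + 3 - 1) % 10 = 6,
+ * the last block inside the limited range.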
+ */ + if (scan->rs_numblocks != InvalidBlockNumber) + page = (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks; + else if (scan->rs_startblock > 0) + page = scan->rs_startblock - 1; + else + page = scan->rs_nblocks - 1; + heapgetpage((TableScanDesc) scan, page); + } + else + { + /* continue from previously returned page/tuple */ + page = scan->rs_cblock; /* current page */ + } + + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); + + dp = BufferGetPage(scan->rs_cbuf); + TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp); + lines = PageGetMaxOffsetNumber(dp); + + if (!scan->rs_inited) + { + lineoff = lines; /* final offnum */ + scan->rs_inited = true; + } + else + { + /* + * The previous returned tuple may have been vacuumed since the + * previous scan when we use a non-MVCC snapshot, so we must + * re-establish the lineoff <= PageGetMaxOffsetNumber(dp) + * invariant + */ + lineoff = /* previous offnum */ + Min(lines, + OffsetNumberPrev(ItemPointerGetOffsetNumber(&(tuple->t_self)))); + } + /* page and lineoff now reference the physically previous tid */ + + linesleft = lineoff; + } + else + { + /* + * ``no movement'' scan direction: refetch prior tuple + */ + if (!scan->rs_inited) + { + Assert(!BufferIsValid(scan->rs_cbuf)); + tuple->t_data = NULL; + return; + } + + page = ItemPointerGetBlockNumber(&(tuple->t_self)); + if (page != scan->rs_cblock) + heapgetpage((TableScanDesc) scan, page); + + /* Since the tuple was previously fetched, needn't lock page here */ + dp = BufferGetPage(scan->rs_cbuf); + TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp); + lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self)); + lpp = PageGetItemId(dp, lineoff); + Assert(ItemIdIsNormal(lpp)); + + tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp); + tuple->t_len = ItemIdGetLength(lpp); + + return; + } + + /* + * advance the scan until we find a qualifying tuple or run out of stuff + * to scan + */ + lpp = PageGetItemId(dp, lineoff); + for (;;) + { + /* + * Only continue scanning the page while we have lines left. + * + * Note that this protects us from accessing line pointers past + * PageGetMaxOffsetNumber(); both for forward scans when we resume the + * table scan, and for when we start scanning a new page. + */ + while (linesleft > 0) + { + if (ItemIdIsNormal(lpp)) + { + bool valid; + + tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp); + tuple->t_len = ItemIdGetLength(lpp); + ItemPointerSet(&(tuple->t_self), page, lineoff); + + /* + * if current tuple qualifies, return it. + */ + valid = HeapTupleSatisfiesVisibility(tuple, + snapshot, + scan->rs_cbuf); + + HeapCheckForSerializableConflictOut(valid, scan->rs_base.rs_rd, + tuple, scan->rs_cbuf, + snapshot); + + if (valid && key != NULL) + HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd), + nkeys, key, valid); + + if (valid) + { + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + return; + } + } + + /* + * otherwise move to the next item on the page + */ + --linesleft; + if (backward) + { + --lpp; /* move back in this page's ItemId array */ + --lineoff; + } + else + { + ++lpp; /* move forward in this page's ItemId array */ + ++lineoff; + } + } + + /* + * if we get here, it means we've exhausted the items on this page and + * it's time to move to the next. + */ + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + /* + * advance to next/prior page and detect end of scan + */ + if (backward) + { + finished = (page == scan->rs_startblock) || + (scan->rs_numblocks != InvalidBlockNumber ? 
--scan->rs_numblocks == 0 : false); + if (page == 0) + page = scan->rs_nblocks; + page--; + } + else if (scan->rs_base.rs_parallel != NULL) + { + ParallelBlockTableScanDesc pbscan = + (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel; + ParallelBlockTableScanWorker pbscanwork = + scan->rs_parallelworkerdata; + + page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd, + pbscanwork, pbscan); + finished = (page == InvalidBlockNumber); + } + else + { + page++; + if (page >= scan->rs_nblocks) + page = 0; + finished = (page == scan->rs_startblock) || + (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false); + + /* + * Report our new scan position for synchronization purposes. We + * don't do that when moving backwards, however. That would just + * mess up any other forward-moving scanners. + * + * Note: we do this before checking for end of scan so that the + * final state of the position hint is back at the start of the + * rel. That's not strictly necessary, but otherwise when you run + * the same query multiple times the starting position would shift + * a little bit backwards on every invocation, which is confusing. + * We don't guarantee any specific ordering in general, though. + */ + if (scan->rs_base.rs_flags & SO_ALLOW_SYNC) + ss_report_location(scan->rs_base.rs_rd, page); + } + + /* + * return NULL if we've exhausted all the pages + */ + if (finished) + { + if (BufferIsValid(scan->rs_cbuf)) + ReleaseBuffer(scan->rs_cbuf); + scan->rs_cbuf = InvalidBuffer; + scan->rs_cblock = InvalidBlockNumber; + tuple->t_data = NULL; + scan->rs_inited = false; + return; + } + + heapgetpage((TableScanDesc) scan, page); + + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); + + dp = BufferGetPage(scan->rs_cbuf); + TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp); + lines = PageGetMaxOffsetNumber((Page) dp); + linesleft = lines; + if (backward) + { + lineoff = lines; + lpp = PageGetItemId(dp, lines); + } + else + { + lineoff = FirstOffsetNumber; + lpp = PageGetItemId(dp, FirstOffsetNumber); + } + } +} + +/* ---------------- + * heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode + * + * Same API as heapgettup, but used in page-at-a-time mode + * + * The internal logic is much the same as heapgettup's too, but there are some + * differences: we do not take the buffer content lock (that only needs to + * happen inside heapgetpage), and we iterate through just the tuples listed + * in rs_vistuples[] rather than all tuples on the page. Notice that + * lineindex is 0-based, where the corresponding loop variable lineoff in + * heapgettup is 1-based. 
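+ * In other words, where heapgettup walks raw line pointers directly, the
+ * inner loop below is essentially
+ *
+ *		lineoff = scan->rs_vistuples[lineindex];
+ *		lpp = PageGetItemId(dp, lineoff);
+ *
+ * with lineindex running from 0 to rs_ntuples - 1.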
+ * ---------------- + */ +static void +heapgettup_pagemode(HeapScanDesc scan, + ScanDirection dir, + int nkeys, + ScanKey key) +{ + HeapTuple tuple = &(scan->rs_ctup); + bool backward = ScanDirectionIsBackward(dir); + BlockNumber page; + bool finished; + Page dp; + int lines; + int lineindex; + OffsetNumber lineoff; + int linesleft; + ItemId lpp; + + /* + * calculate next starting lineindex, given scan direction + */ + if (ScanDirectionIsForward(dir)) + { + if (!scan->rs_inited) + { + /* + * return null immediately if relation is empty + */ + if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0) + { + Assert(!BufferIsValid(scan->rs_cbuf)); + tuple->t_data = NULL; + return; + } + if (scan->rs_base.rs_parallel != NULL) + { + ParallelBlockTableScanDesc pbscan = + (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel; + ParallelBlockTableScanWorker pbscanwork = + scan->rs_parallelworkerdata; + + table_block_parallelscan_startblock_init(scan->rs_base.rs_rd, + pbscanwork, pbscan); + + page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd, + pbscanwork, pbscan); + + /* Other processes might have already finished the scan. */ + if (page == InvalidBlockNumber) + { + Assert(!BufferIsValid(scan->rs_cbuf)); + tuple->t_data = NULL; + return; + } + } + else + page = scan->rs_startblock; /* first page */ + heapgetpage((TableScanDesc) scan, page); + lineindex = 0; + scan->rs_inited = true; + } + else + { + /* continue from previously returned page/tuple */ + page = scan->rs_cblock; /* current page */ + lineindex = scan->rs_cindex + 1; + } + + dp = BufferGetPage(scan->rs_cbuf); + TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp); + lines = scan->rs_ntuples; + /* page and lineindex now reference the next visible tid */ + + linesleft = lines - lineindex; + } + else if (backward) + { + /* backward parallel scan not supported */ + Assert(scan->rs_base.rs_parallel == NULL); + + if (!scan->rs_inited) + { + /* + * return null immediately if relation is empty + */ + if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0) + { + Assert(!BufferIsValid(scan->rs_cbuf)); + tuple->t_data = NULL; + return; + } + + /* + * Disable reporting to syncscan logic in a backwards scan; it's + * not very likely anyone else is doing the same thing at the same + * time, and much more likely that we'll just bollix things for + * forward scanners. + */ + scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC; + + /* + * Start from last page of the scan. Ensure we take into account + * rs_numblocks if it's been adjusted by heap_setscanlimits(). 
+ */ + if (scan->rs_numblocks != InvalidBlockNumber) + page = (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks; + else if (scan->rs_startblock > 0) + page = scan->rs_startblock - 1; + else + page = scan->rs_nblocks - 1; + heapgetpage((TableScanDesc) scan, page); + } + else + { + /* continue from previously returned page/tuple */ + page = scan->rs_cblock; /* current page */ + } + + dp = BufferGetPage(scan->rs_cbuf); + TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp); + lines = scan->rs_ntuples; + + if (!scan->rs_inited) + { + lineindex = lines - 1; + scan->rs_inited = true; + } + else + { + lineindex = scan->rs_cindex - 1; + } + /* page and lineindex now reference the previous visible tid */ + + linesleft = lineindex + 1; + } + else + { + /* + * ``no movement'' scan direction: refetch prior tuple + */ + if (!scan->rs_inited) + { + Assert(!BufferIsValid(scan->rs_cbuf)); + tuple->t_data = NULL; + return; + } + + page = ItemPointerGetBlockNumber(&(tuple->t_self)); + if (page != scan->rs_cblock) + heapgetpage((TableScanDesc) scan, page); + + /* Since the tuple was previously fetched, needn't lock page here */ + dp = BufferGetPage(scan->rs_cbuf); + TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp); + lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self)); + lpp = PageGetItemId(dp, lineoff); + Assert(ItemIdIsNormal(lpp)); + + tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp); + tuple->t_len = ItemIdGetLength(lpp); + + /* check that rs_cindex is in sync */ + Assert(scan->rs_cindex < scan->rs_ntuples); + Assert(lineoff == scan->rs_vistuples[scan->rs_cindex]); + + return; + } + + /* + * advance the scan until we find a qualifying tuple or run out of stuff + * to scan + */ + for (;;) + { + while (linesleft > 0) + { + lineoff = scan->rs_vistuples[lineindex]; + lpp = PageGetItemId(dp, lineoff); + Assert(ItemIdIsNormal(lpp)); + + tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp); + tuple->t_len = ItemIdGetLength(lpp); + ItemPointerSet(&(tuple->t_self), page, lineoff); + + /* + * if current tuple qualifies, return it. + */ + if (key != NULL) + { + bool valid; + + HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd), + nkeys, key, valid); + if (valid) + { + scan->rs_cindex = lineindex; + return; + } + } + else + { + scan->rs_cindex = lineindex; + return; + } + + /* + * otherwise move to the next item on the page + */ + --linesleft; + if (backward) + --lineindex; + else + ++lineindex; + } + + /* + * if we get here, it means we've exhausted the items on this page and + * it's time to move to the next. + */ + if (backward) + { + finished = (page == scan->rs_startblock) || + (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false); + if (page == 0) + page = scan->rs_nblocks; + page--; + } + else if (scan->rs_base.rs_parallel != NULL) + { + ParallelBlockTableScanDesc pbscan = + (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel; + ParallelBlockTableScanWorker pbscanwork = + scan->rs_parallelworkerdata; + + page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd, + pbscanwork, pbscan); + finished = (page == InvalidBlockNumber); + } + else + { + page++; + if (page >= scan->rs_nblocks) + page = 0; + finished = (page == scan->rs_startblock) || + (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false); + + /* + * Report our new scan position for synchronization purposes. We + * don't do that when moving backwards, however. 
That would just + * mess up any other forward-moving scanners. + * + * Note: we do this before checking for end of scan so that the + * final state of the position hint is back at the start of the + * rel. That's not strictly necessary, but otherwise when you run + * the same query multiple times the starting position would shift + * a little bit backwards on every invocation, which is confusing. + * We don't guarantee any specific ordering in general, though. + */ + if (scan->rs_base.rs_flags & SO_ALLOW_SYNC) + ss_report_location(scan->rs_base.rs_rd, page); + } + + /* + * return NULL if we've exhausted all the pages + */ + if (finished) + { + if (BufferIsValid(scan->rs_cbuf)) + ReleaseBuffer(scan->rs_cbuf); + scan->rs_cbuf = InvalidBuffer; + scan->rs_cblock = InvalidBlockNumber; + tuple->t_data = NULL; + scan->rs_inited = false; + return; + } + + heapgetpage((TableScanDesc) scan, page); + + dp = BufferGetPage(scan->rs_cbuf); + TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp); + lines = scan->rs_ntuples; + linesleft = lines; + if (backward) + lineindex = lines - 1; + else + lineindex = 0; + } +} + + +#if defined(DISABLE_COMPLEX_MACRO) +/* + * This is formatted so oddly so that the correspondence to the macro + * definition in access/htup_details.h is maintained. + */ +Datum +fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, + bool *isnull) +{ + return ( + (attnum) > 0 ? + ( + (*(isnull) = false), + HeapTupleNoNulls(tup) ? + ( + TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff >= 0 ? + ( + fetchatt(TupleDescAttr((tupleDesc), (attnum) - 1), + (char *) (tup)->t_data + (tup)->t_data->t_hoff + + TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff) + ) + : + nocachegetattr((tup), (attnum), (tupleDesc)) + ) + : + ( + att_isnull((attnum) - 1, (tup)->t_data->t_bits) ? + ( + (*(isnull) = true), + (Datum) NULL + ) + : + ( + nocachegetattr((tup), (attnum), (tupleDesc)) + ) + ) + ) + : + ( + (Datum) NULL + ) + ); +} +#endif /* defined(DISABLE_COMPLEX_MACRO) */ + + +/* ---------------------------------------------------------------- + * heap access method interface + * ---------------------------------------------------------------- + */ + + +TableScanDesc +heap_beginscan(Relation relation, Snapshot snapshot, + int nkeys, ScanKey key, + ParallelTableScanDesc parallel_scan, + uint32 flags) +{ + HeapScanDesc scan; + + /* + * increment relation ref count while scanning relation + * + * This is just to make really sure the relcache entry won't go away while + * the scan has a pointer to it. Caller should be holding the rel open + * anyway, so this is redundant in all normal scenarios... + */ + RelationIncrementReferenceCount(relation); + + /* + * allocate and initialize scan descriptor + */ + scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData)); + + scan->rs_base.rs_rd = relation; + scan->rs_base.rs_snapshot = snapshot; + scan->rs_base.rs_nkeys = nkeys; + scan->rs_base.rs_flags = flags; + scan->rs_base.rs_parallel = parallel_scan; + scan->rs_strategy = NULL; /* set in initscan */ + + /* + * Disable page-at-a-time mode if it's not a MVCC-safe snapshot. + */ + if (!(snapshot && IsMVCCSnapshot(snapshot))) + scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE; + + /* + * For seqscan and sample scans in a serializable transaction, acquire a + * predicate lock on the entire relation. This is required not only to + * lock all the matching tuples, but also to conflict with new insertions + * into the table. 
In an indexscan, we take page locks on the index pages + * covering the range specified in the scan qual, but in a heap scan there + * is nothing more fine-grained to lock. A bitmap scan is a different + * story, there we have already scanned the index and locked the index + * pages covering the predicate. But in that case we still have to lock + * any matching heap tuples. For sample scan we could optimize the locking + * to be at least page-level granularity, but we'd need to add per-tuple + * locking for that. + */ + if (scan->rs_base.rs_flags & (SO_TYPE_SEQSCAN | SO_TYPE_SAMPLESCAN)) + { + /* + * Ensure a missing snapshot is noticed reliably, even if the + * isolation mode means predicate locking isn't performed (and + * therefore the snapshot isn't used here). + */ + Assert(snapshot); + PredicateLockRelation(relation, snapshot); + } + + /* we only need to set this up once */ + scan->rs_ctup.t_tableOid = RelationGetRelid(relation); + + /* + * Allocate memory to keep track of page allocation for parallel workers + * when doing a parallel scan. + */ + if (parallel_scan != NULL) + scan->rs_parallelworkerdata = palloc(sizeof(ParallelBlockTableScanWorkerData)); + else + scan->rs_parallelworkerdata = NULL; + + /* + * we do this here instead of in initscan() because heap_rescan also calls + * initscan() and we don't want to allocate memory again + */ + if (nkeys > 0) + scan->rs_base.rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys); + else + scan->rs_base.rs_key = NULL; + + initscan(scan, key, false); + + return (TableScanDesc) scan; +} + +void +heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params, + bool allow_strat, bool allow_sync, bool allow_pagemode) +{ + HeapScanDesc scan = (HeapScanDesc) sscan; + + if (set_params) + { + if (allow_strat) + scan->rs_base.rs_flags |= SO_ALLOW_STRAT; + else + scan->rs_base.rs_flags &= ~SO_ALLOW_STRAT; + + if (allow_sync) + scan->rs_base.rs_flags |= SO_ALLOW_SYNC; + else + scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC; + + if (allow_pagemode && scan->rs_base.rs_snapshot && + IsMVCCSnapshot(scan->rs_base.rs_snapshot)) + scan->rs_base.rs_flags |= SO_ALLOW_PAGEMODE; + else + scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE; + } + + /* + * unpin scan buffers + */ + if (BufferIsValid(scan->rs_cbuf)) + ReleaseBuffer(scan->rs_cbuf); + + /* + * reinitialize scan descriptor + */ + initscan(scan, key, true); +} + +void +heap_endscan(TableScanDesc sscan) +{ + HeapScanDesc scan = (HeapScanDesc) sscan; + + /* Note: no locking manipulations needed */ + + /* + * unpin scan buffers + */ + if (BufferIsValid(scan->rs_cbuf)) + ReleaseBuffer(scan->rs_cbuf); + + /* + * decrement relation reference count and free scan descriptor storage + */ + RelationDecrementReferenceCount(scan->rs_base.rs_rd); + + if (scan->rs_base.rs_key) + pfree(scan->rs_base.rs_key); + + if (scan->rs_strategy != NULL) + FreeAccessStrategy(scan->rs_strategy); + + if (scan->rs_parallelworkerdata != NULL) + pfree(scan->rs_parallelworkerdata); + + if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT) + UnregisterSnapshot(scan->rs_base.rs_snapshot); + + pfree(scan); +} + +HeapTuple +heap_getnext(TableScanDesc sscan, ScanDirection direction) +{ + HeapScanDesc scan = (HeapScanDesc) sscan; + + /* + * This is still widely used directly, without going through table AM, so + * add a safety check. It's possible we should, at a later point, + * downgrade this to an assert. 
The reason for checking the AM routine, + * rather than the AM oid, is that this allows to write regression tests + * that create another AM reusing the heap handler. + */ + if (unlikely(sscan->rs_rd->rd_tableam != GetHeapamTableAmRoutine())) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg_internal("only heap AM is supported"))); + + /* + * We don't expect direct calls to heap_getnext with valid CheckXidAlive + * for catalog or regular tables. See detailed comments in xact.c where + * these variables are declared. Normally we have such a check at tableam + * level API but this is called from many places so we need to ensure it + * here. + */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected heap_getnext call during logical decoding"); + + /* Note: no locking manipulations needed */ + + if (scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE) + heapgettup_pagemode(scan, direction, + scan->rs_base.rs_nkeys, scan->rs_base.rs_key); + else + heapgettup(scan, direction, + scan->rs_base.rs_nkeys, scan->rs_base.rs_key); + + if (scan->rs_ctup.t_data == NULL) + return NULL; + + /* + * if we get here it means we have a new current scan tuple, so point to + * the proper return buffer and return the tuple. + */ + + pgstat_count_heap_getnext(scan->rs_base.rs_rd); + + return &scan->rs_ctup; +} + +bool +heap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot) +{ + HeapScanDesc scan = (HeapScanDesc) sscan; + + /* Note: no locking manipulations needed */ + + if (sscan->rs_flags & SO_ALLOW_PAGEMODE) + heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key); + else + heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key); + + if (scan->rs_ctup.t_data == NULL) + { + ExecClearTuple(slot); + return false; + } + + /* + * if we get here it means we have a new current scan tuple, so point to + * the proper return buffer and return the tuple. + */ + + pgstat_count_heap_getnext(scan->rs_base.rs_rd); + + ExecStoreBufferHeapTuple(&scan->rs_ctup, slot, + scan->rs_cbuf); + return true; +} + +void +heap_set_tidrange(TableScanDesc sscan, ItemPointer mintid, + ItemPointer maxtid) +{ + HeapScanDesc scan = (HeapScanDesc) sscan; + BlockNumber startBlk; + BlockNumber numBlks; + ItemPointerData highestItem; + ItemPointerData lowestItem; + + /* + * For relations without any pages, we can simply leave the TID range + * unset. There will be no tuples to scan, therefore no tuples outside + * the given TID range. + */ + if (scan->rs_nblocks == 0) + return; + + /* + * Set up some ItemPointers which point to the first and last possible + * tuples in the heap. + */ + ItemPointerSet(&highestItem, scan->rs_nblocks - 1, MaxOffsetNumber); + ItemPointerSet(&lowestItem, 0, FirstOffsetNumber); + + /* + * If the given maximum TID is below the highest possible TID in the + * relation, then restrict the range to that, otherwise we scan to the end + * of the relation. + */ + if (ItemPointerCompare(maxtid, &highestItem) < 0) + ItemPointerCopy(maxtid, &highestItem); + + /* + * If the given minimum TID is above the lowest possible TID in the + * relation, then restrict the range to only scan for TIDs above that. + */ + if (ItemPointerCompare(mintid, &lowestItem) > 0) + ItemPointerCopy(mintid, &lowestItem); + + /* + * Check for an empty range and protect from would be negative results + * from the numBlks calculation below. 
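+ *
+ * For example, in a 10-block relation, mintid = (3,1) with maxtid = (7,20)
+ * yields startBlk = 3 and numBlks = 5 below, whereas a reversed pair such
+ * as mintid = (7,1) with maxtid = (3,1) is caught here and scans nothing.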
+ */ + if (ItemPointerCompare(&highestItem, &lowestItem) < 0) + { + /* Set an empty range of blocks to scan */ + heap_setscanlimits(sscan, 0, 0); + return; + } + + /* + * Calculate the first block and the number of blocks we must scan. We + * could be more aggressive here and perform some more validation to try + * and further narrow the scope of blocks to scan by checking if the + * lowerItem has an offset above MaxOffsetNumber. In this case, we could + * advance startBlk by one. Likewise, if highestItem has an offset of 0 + * we could scan one fewer blocks. However, such an optimization does not + * seem worth troubling over, currently. + */ + startBlk = ItemPointerGetBlockNumberNoCheck(&lowestItem); + + numBlks = ItemPointerGetBlockNumberNoCheck(&highestItem) - + ItemPointerGetBlockNumberNoCheck(&lowestItem) + 1; + + /* Set the start block and number of blocks to scan */ + heap_setscanlimits(sscan, startBlk, numBlks); + + /* Finally, set the TID range in sscan */ + ItemPointerCopy(&lowestItem, &sscan->rs_mintid); + ItemPointerCopy(&highestItem, &sscan->rs_maxtid); +} + +bool +heap_getnextslot_tidrange(TableScanDesc sscan, ScanDirection direction, + TupleTableSlot *slot) +{ + HeapScanDesc scan = (HeapScanDesc) sscan; + ItemPointer mintid = &sscan->rs_mintid; + ItemPointer maxtid = &sscan->rs_maxtid; + + /* Note: no locking manipulations needed */ + for (;;) + { + if (sscan->rs_flags & SO_ALLOW_PAGEMODE) + heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key); + else + heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key); + + if (scan->rs_ctup.t_data == NULL) + { + ExecClearTuple(slot); + return false; + } + + /* + * heap_set_tidrange will have used heap_setscanlimits to limit the + * range of pages we scan to only ones that can contain the TID range + * we're scanning for. Here we must filter out any tuples from these + * pages that are outwith that range. + */ + if (ItemPointerCompare(&scan->rs_ctup.t_self, mintid) < 0) + { + ExecClearTuple(slot); + + /* + * When scanning backwards, the TIDs will be in descending order. + * Future tuples in this direction will be lower still, so we can + * just return false to indicate there will be no more tuples. + */ + if (ScanDirectionIsBackward(direction)) + return false; + + continue; + } + + /* + * Likewise for the final page, we must filter out TIDs greater than + * maxtid. + */ + if (ItemPointerCompare(&scan->rs_ctup.t_self, maxtid) > 0) + { + ExecClearTuple(slot); + + /* + * When scanning forward, the TIDs will be in ascending order. + * Future tuples in this direction will be higher still, so we can + * just return false to indicate there will be no more tuples. + */ + if (ScanDirectionIsForward(direction)) + return false; + continue; + } + + break; + } + + /* + * if we get here it means we have a new current scan tuple, so point to + * the proper return buffer and return the tuple. + */ + pgstat_count_heap_getnext(scan->rs_base.rs_rd); + + ExecStoreBufferHeapTuple(&scan->rs_ctup, slot, scan->rs_cbuf); + return true; +} + +/* + * heap_fetch - retrieve tuple with given tid + * + * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding + * the tuple, fill in the remaining fields of *tuple, and check the tuple + * against the specified snapshot. + * + * If successful (tuple found and passes snapshot time qual), then *userbuf + * is set to the buffer holding the tuple and true is returned. The caller + * must unpin the buffer when done with the tuple. 
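+ *
+ * A typical caller therefore looks roughly like this (assuming an already
+ * open Relation "rel" and a known block/offset):
+ *
+ *		HeapTupleData tuple;
+ *		Buffer		buf;
+ *
+ *		ItemPointerSet(&tuple.t_self, blkno, offnum);
+ *		if (heap_fetch(rel, GetActiveSnapshot(), &tuple, &buf))
+ *		{
+ *			... examine tuple.t_data ...
+ *			ReleaseBuffer(buf);
+ *		}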
+ * + * If the tuple is not found (ie, item number references a deleted slot), + * then tuple->t_data is set to NULL, *userbuf is set to InvalidBuffer, + * and false is returned. + * + * If the tuple is found but fails the time qual check, then false is returned + * and *userbuf is set to InvalidBuffer, but tuple->t_data is left pointing + * to the tuple. (Note that it is unsafe to dereference tuple->t_data in + * this case, but callers might choose to test it for NULL-ness.) + * + * heap_fetch does not follow HOT chains: only the exact TID requested will + * be fetched. + * + * It is somewhat inconsistent that we ereport() on invalid block number but + * return false on invalid item number. There are a couple of reasons though. + * One is that the caller can relatively easily check the block number for + * validity, but cannot check the item number without reading the page + * himself. Another is that when we are following a t_ctid link, we can be + * reasonably confident that the page number is valid (since VACUUM shouldn't + * truncate off the destination page without having killed the referencing + * tuple first), but the item number might well not be good. + */ +bool +heap_fetch(Relation relation, + Snapshot snapshot, + HeapTuple tuple, + Buffer *userbuf) +{ + return heap_fetch_extended(relation, snapshot, tuple, userbuf, false); +} + +/* + * heap_fetch_extended - fetch tuple even if it fails snapshot test + * + * If keep_buf is true, then upon finding a tuple that is valid but fails + * the snapshot check, we return the tuple pointer in tuple->t_data and the + * buffer ID in *userbuf, keeping the buffer pin, just as if it had passed + * the snapshot. (The function result is still "false" though.) + * If keep_buf is false then this behaves identically to heap_fetch(). + */ +bool +heap_fetch_extended(Relation relation, + Snapshot snapshot, + HeapTuple tuple, + Buffer *userbuf, + bool keep_buf) +{ + ItemPointer tid = &(tuple->t_self); + ItemId lp; + Buffer buffer; + Page page; + OffsetNumber offnum; + bool valid; + + /* + * Fetch and pin the appropriate page of the relation. + */ + buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); + + /* + * Need share lock on buffer to examine tuple commit status. + */ + LockBuffer(buffer, BUFFER_LOCK_SHARE); + page = BufferGetPage(buffer); + TestForOldSnapshot(snapshot, relation, page); + + /* + * We'd better check for out-of-range offnum in case of VACUUM since the + * TID was obtained. + */ + offnum = ItemPointerGetOffsetNumber(tid); + if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page)) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + *userbuf = InvalidBuffer; + tuple->t_data = NULL; + return false; + } + + /* + * get the item line pointer corresponding to the requested tid + */ + lp = PageGetItemId(page, offnum); + + /* + * Must check for deleted tuple. 
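+ * (LP_UNUSED, LP_DEAD and LP_REDIRECT line pointers all fail the
+ * ItemIdIsNormal() test; only LP_NORMAL items carry tuple data.)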
+ */ + if (!ItemIdIsNormal(lp)) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + *userbuf = InvalidBuffer; + tuple->t_data = NULL; + return false; + } + + /* + * fill in *tuple fields + */ + tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp); + tuple->t_len = ItemIdGetLength(lp); + tuple->t_tableOid = RelationGetRelid(relation); + + /* + * check tuple visibility, then release lock + */ + valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer); + + if (valid) + PredicateLockTID(relation, &(tuple->t_self), snapshot, + HeapTupleHeaderGetXmin(tuple->t_data)); + + HeapCheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot); + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + if (valid) + { + /* + * All checks passed, so return the tuple as valid. Caller is now + * responsible for releasing the buffer. + */ + *userbuf = buffer; + + return true; + } + + /* Tuple failed time qual, but maybe caller wants to see it anyway. */ + if (keep_buf) + *userbuf = buffer; + else + { + ReleaseBuffer(buffer); + *userbuf = InvalidBuffer; + } + + return false; +} + +/* + * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot + * + * On entry, *tid is the TID of a tuple (either a simple tuple, or the root + * of a HOT chain), and buffer is the buffer holding this tuple. We search + * for the first chain member satisfying the given snapshot. If one is + * found, we update *tid to reference that tuple's offset number, and + * return true. If no match, return false without modifying *tid. + * + * heapTuple is a caller-supplied buffer. When a match is found, we return + * the tuple here, in addition to updating *tid. If no match is found, the + * contents of this buffer on return are undefined. + * + * If all_dead is not NULL, we check non-visible tuples to see if they are + * globally dead; *all_dead is set true if all members of the HOT chain + * are vacuumable, false if not. + * + * Unlike heap_fetch, the caller must already have pin and (at least) share + * lock on the buffer; it is still pinned/locked at exit. + */ +bool +heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, + Snapshot snapshot, HeapTuple heapTuple, + bool *all_dead, bool first_call) +{ + Page dp = (Page) BufferGetPage(buffer); + TransactionId prev_xmax = InvalidTransactionId; + BlockNumber blkno; + OffsetNumber offnum; + bool at_chain_start; + bool valid; + bool skip; + GlobalVisState *vistest = NULL; + + /* If this is not the first call, previous call returned a (live!) 
tuple */ + if (all_dead) + *all_dead = first_call; + + blkno = ItemPointerGetBlockNumber(tid); + offnum = ItemPointerGetOffsetNumber(tid); + at_chain_start = first_call; + skip = !first_call; + + /* XXX: we should assert that a snapshot is pushed or registered */ + Assert(TransactionIdIsValid(RecentXmin)); + Assert(BufferGetBlockNumber(buffer) == blkno); + + /* Scan through possible multiple members of HOT-chain */ + for (;;) + { + ItemId lp; + + /* check for bogus TID */ + if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp)) + break; + + lp = PageGetItemId(dp, offnum); + + /* check for unused, dead, or redirected items */ + if (!ItemIdIsNormal(lp)) + { + /* We should only see a redirect at start of chain */ + if (ItemIdIsRedirected(lp) && at_chain_start) + { + /* Follow the redirect */ + offnum = ItemIdGetRedirect(lp); + at_chain_start = false; + continue; + } + /* else must be end of chain */ + break; + } + + /* + * Update heapTuple to point to the element of the HOT chain we're + * currently investigating. Having t_self set correctly is important + * because the SSI checks and the *Satisfies routine for historical + * MVCC snapshots need the correct tid to decide about the visibility. + */ + heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp); + heapTuple->t_len = ItemIdGetLength(lp); + heapTuple->t_tableOid = RelationGetRelid(relation); + ItemPointerSet(&heapTuple->t_self, blkno, offnum); + + /* + * Shouldn't see a HEAP_ONLY tuple at chain start. + */ + if (at_chain_start && HeapTupleIsHeapOnly(heapTuple)) + break; + + /* + * The xmin should match the previous xmax value, else chain is + * broken. + */ + if (TransactionIdIsValid(prev_xmax) && + !TransactionIdEquals(prev_xmax, + HeapTupleHeaderGetXmin(heapTuple->t_data))) + break; + + /* + * When first_call is true (and thus, skip is initially false) we'll + * return the first tuple we find. But on later passes, heapTuple + * will initially be pointing to the tuple we returned last time. + * Returning it again would be incorrect (and would loop forever), so + * we skip it and return the next match we find. + */ + if (!skip) + { + /* If it's visible per the snapshot, we must return it */ + valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer); + HeapCheckForSerializableConflictOut(valid, relation, heapTuple, + buffer, snapshot); + + if (valid) + { + ItemPointerSetOffsetNumber(tid, offnum); + PredicateLockTID(relation, &heapTuple->t_self, snapshot, + HeapTupleHeaderGetXmin(heapTuple->t_data)); + if (all_dead) + *all_dead = false; + return true; + } + } + skip = false; + + /* + * If we can't see it, maybe no one else can either. At caller + * request, check whether all chain members are dead to all + * transactions. + * + * Note: if you change the criterion here for what is "dead", fix the + * planner's get_actual_variable_range() function to match. + */ + if (all_dead && *all_dead) + { + if (!vistest) + vistest = GlobalVisTestFor(relation); + + if (!HeapTupleIsSurelyDead(heapTuple, vistest)) + *all_dead = false; + } + + /* + * Check to see if HOT chain continues past this tuple; if so fetch + * the next offnum and loop around. 
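+ * (A HOT chain never leaves its page, so the t_ctid we follow always
+ * points at another offset on this same block; the Assert below
+ * double-checks that.)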
+ */ + if (HeapTupleIsHotUpdated(heapTuple)) + { + Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) == + blkno); + offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid); + at_chain_start = false; + prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data); + } + else + break; /* end of chain */ + } + + return false; +} + +/* + * heap_get_latest_tid - get the latest tid of a specified tuple + * + * Actually, this gets the latest version that is visible according to the + * scan's snapshot. Create a scan using SnapshotDirty to get the very latest, + * possibly uncommitted version. + * + * *tid is both an input and an output parameter: it is updated to + * show the latest version of the row. Note that it will not be changed + * if no version of the row passes the snapshot test. + */ +void +heap_get_latest_tid(TableScanDesc sscan, + ItemPointer tid) +{ + Relation relation = sscan->rs_rd; + Snapshot snapshot = sscan->rs_snapshot; + ItemPointerData ctid; + TransactionId priorXmax; + + /* + * table_tuple_get_latest_tid() verified that the passed in tid is valid. + * Assume that t_ctid links are valid however - there shouldn't be invalid + * ones in the table. + */ + Assert(ItemPointerIsValid(tid)); + + /* + * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we + * need to examine, and *tid is the TID we will return if ctid turns out + * to be bogus. + * + * Note that we will loop until we reach the end of the t_ctid chain. + * Depending on the snapshot passed, there might be at most one visible + * version of the row, but we don't try to optimize for that. + */ + ctid = *tid; + priorXmax = InvalidTransactionId; /* cannot check first XMIN */ + for (;;) + { + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId lp; + HeapTupleData tp; + bool valid; + + /* + * Read, pin, and lock the page. + */ + buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid)); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + page = BufferGetPage(buffer); + TestForOldSnapshot(snapshot, relation, page); + + /* + * Check for bogus item number. This is not treated as an error + * condition because it can happen while following a t_ctid link. We + * just assume that the prior tid is OK and return it unchanged. + */ + offnum = ItemPointerGetOffsetNumber(&ctid); + if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page)) + { + UnlockReleaseBuffer(buffer); + break; + } + lp = PageGetItemId(page, offnum); + if (!ItemIdIsNormal(lp)) + { + UnlockReleaseBuffer(buffer); + break; + } + + /* OK to access the tuple */ + tp.t_self = ctid; + tp.t_data = (HeapTupleHeader) PageGetItem(page, lp); + tp.t_len = ItemIdGetLength(lp); + tp.t_tableOid = RelationGetRelid(relation); + + /* + * After following a t_ctid link, we might arrive at an unrelated + * tuple. Check for XMIN match. + */ + if (TransactionIdIsValid(priorXmax) && + !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data))) + { + UnlockReleaseBuffer(buffer); + break; + } + + /* + * Check tuple visibility; if visible, set it as the new result + * candidate. + */ + valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer); + HeapCheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot); + if (valid) + *tid = ctid; + + /* + * If there's a valid t_ctid link, follow it, else we're done. 
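+ * (We are done when xmax is invalid or was only a locker, when the row
+ * was moved to another partition by an update, or when t_ctid simply
+ * points back at the tuple itself.)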
+ */ + if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) || + HeapTupleHeaderIsOnlyLocked(tp.t_data) || + HeapTupleHeaderIndicatesMovedPartitions(tp.t_data) || + ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid)) + { + UnlockReleaseBuffer(buffer); + break; + } + + ctid = tp.t_data->t_ctid; + priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data); + UnlockReleaseBuffer(buffer); + } /* end of loop */ +} + + +/* + * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends + * + * This is called after we have waited for the XMAX transaction to terminate. + * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will + * be set on exit. If the transaction committed, we set the XMAX_COMMITTED + * hint bit if possible --- but beware that that may not yet be possible, + * if the transaction committed asynchronously. + * + * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID + * even if it commits. + * + * Hence callers should look only at XMAX_INVALID. + * + * Note this is not allowed for tuples whose xmax is a multixact. + */ +static void +UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid) +{ + Assert(TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple), xid)); + Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI)); + + if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID))) + { + if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) && + TransactionIdDidCommit(xid)) + HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, + xid); + else + HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + } +} + + +/* + * GetBulkInsertState - prepare status object for a bulk insert + */ +BulkInsertState +GetBulkInsertState(void) +{ + BulkInsertState bistate; + + bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData)); + bistate->strategy = GetAccessStrategy(BAS_BULKWRITE); + bistate->current_buf = InvalidBuffer; + return bistate; +} + +/* + * FreeBulkInsertState - clean up after finishing a bulk insert + */ +void +FreeBulkInsertState(BulkInsertState bistate) +{ + if (bistate->current_buf != InvalidBuffer) + ReleaseBuffer(bistate->current_buf); + FreeAccessStrategy(bistate->strategy); + pfree(bistate); +} + +/* + * ReleaseBulkInsertStatePin - release a buffer currently held in bistate + */ +void +ReleaseBulkInsertStatePin(BulkInsertState bistate) +{ + if (bistate->current_buf != InvalidBuffer) + ReleaseBuffer(bistate->current_buf); + bistate->current_buf = InvalidBuffer; +} + + +/* + * heap_insert - insert tuple into a heap + * + * The new tuple is stamped with current transaction ID and the specified + * command ID. + * + * See table_tuple_insert for comments about most of the input flags, except + * that this routine directly takes a tuple rather than a slot. + * + * There's corresponding HEAP_INSERT_ options to all the TABLE_INSERT_ + * options, and there additionally is HEAP_INSERT_SPECULATIVE which is used to + * implement table_tuple_insert_speculative(). + * + * On return the header fields of *tup are updated to match the stored tuple; + * in particular tup->t_self receives the actual TID where the tuple was + * stored. But note that any toasting of fields within the tuple data is NOT + * reflected into *tup. 
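+ *
+ * Callers normally arrive here via table_tuple_insert(), but a minimal
+ * direct call might look roughly like this (assuming "rel" is open and
+ * values[] / isnull[] match its tuple descriptor):
+ *
+ *		HeapTuple	tup = heap_form_tuple(RelationGetDescr(rel), values, isnull);
+ *
+ *		heap_insert(rel, tup, GetCurrentCommandId(true), 0, NULL);
+ *		... tup->t_self now holds the TID assigned to the new tuple ...
+ *		heap_freetuple(tup);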
+ */ +void +heap_insert(Relation relation, HeapTuple tup, CommandId cid, + int options, BulkInsertState bistate) +{ + TransactionId xid = GetCurrentTransactionId(); + HeapTuple heaptup; + Buffer buffer; + Buffer vmbuffer = InvalidBuffer; + bool all_visible_cleared = false; + + /* Cheap, simplistic check that the tuple matches the rel's rowtype. */ + Assert(HeapTupleHeaderGetNatts(tup->t_data) <= + RelationGetNumberOfAttributes(relation)); + + /* + * Fill in tuple header fields and toast the tuple if necessary. + * + * Note: below this point, heaptup is the data we actually intend to store + * into the relation; tup is the caller's original untoasted data. + */ + heaptup = heap_prepare_insert(relation, tup, xid, cid, options); + + /* + * Find buffer to insert this tuple into. If the page is all visible, + * this will also pin the requisite visibility map page. + */ + buffer = RelationGetBufferForTuple(relation, heaptup->t_len, + InvalidBuffer, options, bistate, + &vmbuffer, NULL); + + /* + * We're about to do the actual insert -- but check for conflict first, to + * avoid possibly having to roll back work we've just done. + * + * This is safe without a recheck as long as there is no possibility of + * another process scanning the page between this check and the insert + * being visible to the scan (i.e., an exclusive buffer content lock is + * continuously held from this point until the tuple insert is visible). + * + * For a heap insert, we only need to check for table-level SSI locks. Our + * new tuple can't possibly conflict with existing tuple locks, and heap + * page locks are only consolidated versions of tuple locks; they do not + * lock "gaps" as index page locks do. So we don't need to specify a + * buffer when making the call, which makes for a faster check. + */ + CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber); + + /* NO EREPORT(ERROR) from here till changes are logged */ + START_CRIT_SECTION(); + + RelationPutHeapTuple(relation, buffer, heaptup, + (options & HEAP_INSERT_SPECULATIVE) != 0); + + if (PageIsAllVisible(BufferGetPage(buffer))) + { + all_visible_cleared = true; + PageClearAllVisible(BufferGetPage(buffer)); + visibilitymap_clear(relation, + ItemPointerGetBlockNumber(&(heaptup->t_self)), + vmbuffer, VISIBILITYMAP_VALID_BITS); + } + + /* + * XXX Should we set PageSetPrunable on this page ? + * + * The inserting transaction may eventually abort thus making this tuple + * DEAD and hence available for pruning. Though we don't want to optimize + * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the + * aborted tuple will never be pruned until next vacuum is triggered. + * + * If you do add PageSetPrunable here, add it in heap_xlog_insert too. + */ + + MarkBufferDirty(buffer); + + /* XLOG stuff */ + if (RelationNeedsWAL(relation)) + { + xl_heap_insert xlrec; + xl_heap_header xlhdr; + XLogRecPtr recptr; + Page page = BufferGetPage(buffer); + uint8 info = XLOG_HEAP_INSERT; + int bufflags = 0; + + /* + * If this is a catalog, we need to transmit combo CIDs to properly + * decode, so log that as well. + */ + if (RelationIsAccessibleInLogicalDecoding(relation)) + log_heap_new_cid(relation, heaptup); + + /* + * If this is the single and first tuple on page, we can reinit the + * page instead of restoring the whole thing. Set flag, and hide + * buffer references from XLogInsert. 
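+ * (REGBUF_WILL_INIT tells XLogInsert that redo recreates the page from
+ * scratch, so no full-page image has to be attached to the WAL record.)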
+ */ + if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber && + PageGetMaxOffsetNumber(page) == FirstOffsetNumber) + { + info |= XLOG_HEAP_INIT_PAGE; + bufflags |= REGBUF_WILL_INIT; + } + + xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self); + xlrec.flags = 0; + if (all_visible_cleared) + xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED; + if (options & HEAP_INSERT_SPECULATIVE) + xlrec.flags |= XLH_INSERT_IS_SPECULATIVE; + Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer)); + + /* + * For logical decoding, we need the tuple even if we're doing a full + * page write, so make sure it's included even if we take a full-page + * image. (XXX We could alternatively store a pointer into the FPW). + */ + if (RelationIsLogicallyLogged(relation) && + !(options & HEAP_INSERT_NO_LOGICAL)) + { + xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE; + bufflags |= REGBUF_KEEP_DATA; + + if (IsToastRelation(relation)) + xlrec.flags |= XLH_INSERT_ON_TOAST_RELATION; + } + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapInsert); + + xlhdr.t_infomask2 = heaptup->t_data->t_infomask2; + xlhdr.t_infomask = heaptup->t_data->t_infomask; + xlhdr.t_hoff = heaptup->t_data->t_hoff; + + /* + * note we mark xlhdr as belonging to buffer; if XLogInsert decides to + * write the whole page to the xlog, we don't need to store + * xl_heap_header in the xlog. + */ + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags); + XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader); + /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */ + XLogRegisterBufData(0, + (char *) heaptup->t_data + SizeofHeapTupleHeader, + heaptup->t_len - SizeofHeapTupleHeader); + + /* filtering by origin on a row level is much more efficient */ + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); + + recptr = XLogInsert(RM_HEAP_ID, info); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + UnlockReleaseBuffer(buffer); + if (vmbuffer != InvalidBuffer) + ReleaseBuffer(vmbuffer); + + /* + * If tuple is cachable, mark it for invalidation from the caches in case + * we abort. Note it is OK to do this after releasing the buffer, because + * the heaptup data structure is all in local memory, not in the shared + * buffer. + */ + CacheInvalidateHeapTuple(relation, heaptup, NULL); + + /* Note: speculative insertions are counted too, even if aborted later */ + pgstat_count_heap_insert(relation, 1); + + /* + * If heaptup is a private copy, release it. Don't forget to copy t_self + * back to the caller's image, too. + */ + if (heaptup != tup) + { + tup->t_self = heaptup->t_self; + heap_freetuple(heaptup); + } +} + +/* + * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the + * tuple header fields and toasts the tuple if necessary. Returns a toasted + * version of the tuple if it was toasted, or the original tuple if not. Note + * that in any case, the header fields are also set in the original tuple. + */ +static HeapTuple +heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, + CommandId cid, int options) +{ + /* + * To allow parallel inserts, we need to ensure that they are safe to be + * performed in workers. We have the infrastructure to allow parallel + * inserts in general except for the cases where inserts generate a new + * CommandId (eg. inserts into a table having a foreign key column). 
+ */ + if (IsParallelWorker()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot insert tuples in a parallel worker"))); + + tup->t_data->t_infomask &= ~(HEAP_XACT_MASK); + tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK); + tup->t_data->t_infomask |= HEAP_XMAX_INVALID; + HeapTupleHeaderSetXmin(tup->t_data, xid); + if (options & HEAP_INSERT_FROZEN) + HeapTupleHeaderSetXminFrozen(tup->t_data); + + HeapTupleHeaderSetCmin(tup->t_data, cid); + HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */ + tup->t_tableOid = RelationGetRelid(relation); + + /* + * If the new tuple is too big for storage or contains already toasted + * out-of-line attributes from some other relation, invoke the toaster. + */ + if (relation->rd_rel->relkind != RELKIND_RELATION && + relation->rd_rel->relkind != RELKIND_MATVIEW) + { + /* toast table entries should never be recursively toasted */ + Assert(!HeapTupleHasExternal(tup)); + return tup; + } + else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD) + return heap_toast_insert_or_update(relation, tup, NULL, options); + else + return tup; +} + +/* + * heap_multi_insert - insert multiple tuples into a heap + * + * This is like heap_insert(), but inserts multiple tuples in one operation. + * That's faster than calling heap_insert() in a loop, because when multiple + * tuples can be inserted on a single page, we can write just a single WAL + * record covering all of them, and only need to lock/unlock the page once. + * + * Note: this leaks memory into the current memory context. You can create a + * temporary context before calling this, if that's a problem. + */ +void +heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, + CommandId cid, int options, BulkInsertState bistate) +{ + TransactionId xid = GetCurrentTransactionId(); + HeapTuple *heaptuples; + int i; + int ndone; + PGAlignedBlock scratch; + Page page; + Buffer vmbuffer = InvalidBuffer; + bool needwal; + Size saveFreeSpace; + bool need_tuple_data = RelationIsLogicallyLogged(relation); + bool need_cids = RelationIsAccessibleInLogicalDecoding(relation); + + /* currently not needed (thus unsupported) for heap_multi_insert() */ + AssertArg(!(options & HEAP_INSERT_NO_LOGICAL)); + + needwal = RelationNeedsWAL(relation); + saveFreeSpace = RelationGetTargetPageFreeSpace(relation, + HEAP_DEFAULT_FILLFACTOR); + + /* Toast and set header data in all the slots */ + heaptuples = palloc(ntuples * sizeof(HeapTuple)); + for (i = 0; i < ntuples; i++) + { + HeapTuple tuple; + + tuple = ExecFetchSlotHeapTuple(slots[i], true, NULL); + slots[i]->tts_tableOid = RelationGetRelid(relation); + tuple->t_tableOid = slots[i]->tts_tableOid; + heaptuples[i] = heap_prepare_insert(relation, tuple, xid, cid, + options); + } + + /* + * We're about to do the actual inserts -- but check for conflict first, + * to minimize the possibility of having to roll back work we've just + * done. + * + * A check here does not definitively prevent a serialization anomaly; + * that check MUST be done at least past the point of acquiring an + * exclusive buffer content lock on every buffer that will be affected, + * and MAY be done after all inserts are reflected in the buffers and + * those locks are released; otherwise there is a race condition. Since + * multiple buffers can be locked and unlocked in the loop below, and it + * would not be feasible to identify and lock all of those buffers before + * the loop, we must do a final check at the end. 
+ * + * The check here could be omitted with no loss of correctness; it is + * present strictly as an optimization. + * + * For heap inserts, we only need to check for table-level SSI locks. Our + * new tuples can't possibly conflict with existing tuple locks, and heap + * page locks are only consolidated versions of tuple locks; they do not + * lock "gaps" as index page locks do. So we don't need to specify a + * buffer when making the call, which makes for a faster check. + */ + CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber); + + ndone = 0; + while (ndone < ntuples) + { + Buffer buffer; + bool starting_with_empty_page; + bool all_visible_cleared = false; + bool all_frozen_set = false; + int nthispage; + + CHECK_FOR_INTERRUPTS(); + + /* + * Find buffer where at least the next tuple will fit. If the page is + * all-visible, this will also pin the requisite visibility map page. + * + * Also pin visibility map page if COPY FREEZE inserts tuples into an + * empty page. See all_frozen_set below. + */ + buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len, + InvalidBuffer, options, bistate, + &vmbuffer, NULL); + page = BufferGetPage(buffer); + + starting_with_empty_page = PageGetMaxOffsetNumber(page) == 0; + + if (starting_with_empty_page && (options & HEAP_INSERT_FROZEN)) + all_frozen_set = true; + + /* NO EREPORT(ERROR) from here till changes are logged */ + START_CRIT_SECTION(); + + /* + * RelationGetBufferForTuple has ensured that the first tuple fits. + * Put that on the page, and then as many other tuples as fit. + */ + RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false); + + /* + * For logical decoding we need combo CIDs to properly decode the + * catalog. + */ + if (needwal && need_cids) + log_heap_new_cid(relation, heaptuples[ndone]); + + for (nthispage = 1; ndone + nthispage < ntuples; nthispage++) + { + HeapTuple heaptup = heaptuples[ndone + nthispage]; + + if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace) + break; + + RelationPutHeapTuple(relation, buffer, heaptup, false); + + /* + * For logical decoding we need combo CIDs to properly decode the + * catalog. + */ + if (needwal && need_cids) + log_heap_new_cid(relation, heaptup); + } + + /* + * If the page is all visible, need to clear that, unless we're only + * going to add further frozen rows to it. + * + * If we're only adding already frozen rows to a previously empty + * page, mark it as all-visible. + */ + if (PageIsAllVisible(page) && !(options & HEAP_INSERT_FROZEN)) + { + all_visible_cleared = true; + PageClearAllVisible(page); + visibilitymap_clear(relation, + BufferGetBlockNumber(buffer), + vmbuffer, VISIBILITYMAP_VALID_BITS); + } + else if (all_frozen_set) + PageSetAllVisible(page); + + /* + * XXX Should we set PageSetPrunable on this page ? See heap_insert() + */ + + MarkBufferDirty(buffer); + + /* XLOG stuff */ + if (needwal) + { + XLogRecPtr recptr; + xl_heap_multi_insert *xlrec; + uint8 info = XLOG_HEAP2_MULTI_INSERT; + char *tupledata; + int totaldatalen; + char *scratchptr = scratch.data; + bool init; + int bufflags = 0; + + /* + * If the page was previously empty, we can reinit the page + * instead of restoring the whole thing. + */ + init = starting_with_empty_page; + + /* allocate xl_heap_multi_insert struct from the scratch area */ + xlrec = (xl_heap_multi_insert *) scratchptr; + scratchptr += SizeOfHeapMultiInsert; + + /* + * Allocate offsets array. 
Unless we're reinitializing the page, + * in that case the tuples are stored in order starting at + * FirstOffsetNumber and we don't need to store the offsets + * explicitly. + */ + if (!init) + scratchptr += nthispage * sizeof(OffsetNumber); + + /* the rest of the scratch space is used for tuple data */ + tupledata = scratchptr; + + /* check that the mutually exclusive flags are not both set */ + Assert(!(all_visible_cleared && all_frozen_set)); + + xlrec->flags = 0; + if (all_visible_cleared) + xlrec->flags = XLH_INSERT_ALL_VISIBLE_CLEARED; + if (all_frozen_set) + xlrec->flags = XLH_INSERT_ALL_FROZEN_SET; + + xlrec->ntuples = nthispage; + + /* + * Write out an xl_multi_insert_tuple and the tuple data itself + * for each tuple. + */ + for (i = 0; i < nthispage; i++) + { + HeapTuple heaptup = heaptuples[ndone + i]; + xl_multi_insert_tuple *tuphdr; + int datalen; + + if (!init) + xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self); + /* xl_multi_insert_tuple needs two-byte alignment. */ + tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr); + scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple; + + tuphdr->t_infomask2 = heaptup->t_data->t_infomask2; + tuphdr->t_infomask = heaptup->t_data->t_infomask; + tuphdr->t_hoff = heaptup->t_data->t_hoff; + + /* write bitmap [+ padding] [+ oid] + data */ + datalen = heaptup->t_len - SizeofHeapTupleHeader; + memcpy(scratchptr, + (char *) heaptup->t_data + SizeofHeapTupleHeader, + datalen); + tuphdr->datalen = datalen; + scratchptr += datalen; + } + totaldatalen = scratchptr - tupledata; + Assert((scratchptr - scratch.data) < BLCKSZ); + + if (need_tuple_data) + xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE; + + /* + * Signal that this is the last xl_heap_multi_insert record + * emitted by this call to heap_multi_insert(). Needed for logical + * decoding so it knows when to cleanup temporary data. + */ + if (ndone + nthispage == ntuples) + xlrec->flags |= XLH_INSERT_LAST_IN_MULTI; + + if (init) + { + info |= XLOG_HEAP_INIT_PAGE; + bufflags |= REGBUF_WILL_INIT; + } + + /* + * If we're doing logical decoding, include the new tuple data + * even if we take a full-page image of the page. + */ + if (need_tuple_data) + bufflags |= REGBUF_KEEP_DATA; + + XLogBeginInsert(); + XLogRegisterData((char *) xlrec, tupledata - scratch.data); + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags); + + XLogRegisterBufData(0, tupledata, totaldatalen); + + /* filtering by origin on a row level is much more efficient */ + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); + + recptr = XLogInsert(RM_HEAP2_ID, info); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + /* + * If we've frozen everything on the page, update the visibilitymap. + * We're already holding pin on the vmbuffer. + */ + if (all_frozen_set) + { + Assert(PageIsAllVisible(page)); + Assert(visibilitymap_pin_ok(BufferGetBlockNumber(buffer), vmbuffer)); + + /* + * It's fine to use InvalidTransactionId here - this is only used + * when HEAP_INSERT_FROZEN is specified, which intentionally + * violates visibility rules. + */ + visibilitymap_set(relation, BufferGetBlockNumber(buffer), buffer, + InvalidXLogRecPtr, vmbuffer, + InvalidTransactionId, + VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN); + } + + UnlockReleaseBuffer(buffer); + ndone += nthispage; + + /* + * NB: Only release vmbuffer after inserting all tuples - it's fairly + * likely that we'll insert into subsequent heap pages that are likely + * to use the same vm page. 
+ */ + } + + /* We're done with inserting all tuples, so release the last vmbuffer. */ + if (vmbuffer != InvalidBuffer) + ReleaseBuffer(vmbuffer); + + /* + * We're done with the actual inserts. Check for conflicts again, to + * ensure that all rw-conflicts in to these inserts are detected. Without + * this final check, a sequential scan of the heap may have locked the + * table after the "before" check, missing one opportunity to detect the + * conflict, and then scanned the table before the new tuples were there, + * missing the other chance to detect the conflict. + * + * For heap inserts, we only need to check for table-level SSI locks. Our + * new tuples can't possibly conflict with existing tuple locks, and heap + * page locks are only consolidated versions of tuple locks; they do not + * lock "gaps" as index page locks do. So we don't need to specify a + * buffer when making the call. + */ + CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber); + + /* + * If tuples are cachable, mark them for invalidation from the caches in + * case we abort. Note it is OK to do this after releasing the buffer, + * because the heaptuples data structure is all in local memory, not in + * the shared buffer. + */ + if (IsCatalogRelation(relation)) + { + for (i = 0; i < ntuples; i++) + CacheInvalidateHeapTuple(relation, heaptuples[i], NULL); + } + + /* copy t_self fields back to the caller's slots */ + for (i = 0; i < ntuples; i++) + slots[i]->tts_tid = heaptuples[i]->t_self; + + pgstat_count_heap_insert(relation, ntuples); +} + +/* + * simple_heap_insert - insert a tuple + * + * Currently, this routine differs from heap_insert only in supplying + * a default command ID and not allowing access to the speedup options. + * + * This should be used rather than using heap_insert directly in most places + * where we are modifying system catalogs. + */ +void +simple_heap_insert(Relation relation, HeapTuple tup) +{ + heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL); +} + +/* + * Given infomask/infomask2, compute the bits that must be saved in the + * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock, + * xl_heap_lock_updated WAL records. + * + * See fix_infomask_from_infobits. + */ +static uint8 +compute_infobits(uint16 infomask, uint16 infomask2) +{ + return + ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) | + ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) | + ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) | + /* note we ignore HEAP_XMAX_SHR_LOCK here */ + ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) | + ((infomask2 & HEAP_KEYS_UPDATED) != 0 ? + XLHL_KEYS_UPDATED : 0); +} + +/* + * Given two versions of the same t_infomask for a tuple, compare them and + * return whether the relevant status for a tuple Xmax has changed. This is + * used after a buffer lock has been released and reacquired: we want to ensure + * that the tuple state continues to be the same it was when we previously + * examined it. + * + * Note the Xmax field itself must be compared separately. 
+ */ +static inline bool +xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask) +{ + const uint16 interesting = + HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_LOCK_MASK; + + if ((new_infomask & interesting) != (old_infomask & interesting)) + return true; + + return false; +} + +/* + * heap_delete - delete a tuple + * + * See table_tuple_delete() for an explanation of the parameters, except that + * this routine directly takes a tuple rather than a slot. + * + * In the failure cases, the routine fills *tmfd with the tuple's t_ctid, + * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last + * only for TM_SelfModified, since we cannot obtain cmax from a combo CID + * generated by another transaction). + */ +TM_Result +heap_delete(Relation relation, ItemPointer tid, + CommandId cid, Snapshot crosscheck, bool wait, + TM_FailureData *tmfd, bool changingPart) +{ + TM_Result result; + TransactionId xid = GetCurrentTransactionId(); + ItemId lp; + HeapTupleData tp; + Page page; + BlockNumber block; + Buffer buffer; + Buffer vmbuffer = InvalidBuffer; + TransactionId new_xmax; + uint16 new_infomask, + new_infomask2; + bool have_tuple_lock = false; + bool iscombo; + bool all_visible_cleared = false; + HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */ + bool old_key_copied = false; + + Assert(ItemPointerIsValid(tid)); + + /* + * Forbid this during a parallel operation, lest it allocate a combo CID. + * Other workers might need that combo CID for visibility checks, and we + * have no provision for broadcasting it to them. + */ + if (IsInParallelMode()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot delete tuples during a parallel operation"))); + + block = ItemPointerGetBlockNumber(tid); + buffer = ReadBuffer(relation, block); + page = BufferGetPage(buffer); + + /* + * Before locking the buffer, pin the visibility map page if it appears to + * be necessary. Since we haven't got the lock yet, someone else might be + * in the middle of changing this, so we'll need to recheck after we have + * the lock. + */ + if (PageIsAllVisible(page)) + visibilitymap_pin(relation, block, &vmbuffer); + + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + /* + * If we didn't pin the visibility map page and the page has become all + * visible while we were busy locking the buffer, we'll have to unlock and + * re-lock, to avoid holding the buffer lock across an I/O. That's a bit + * unfortunate, but hopefully shouldn't happen often. 
+ */ + if (vmbuffer == InvalidBuffer && PageIsAllVisible(page)) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + visibilitymap_pin(relation, block, &vmbuffer); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + } + + lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid)); + Assert(ItemIdIsNormal(lp)); + + tp.t_tableOid = RelationGetRelid(relation); + tp.t_data = (HeapTupleHeader) PageGetItem(page, lp); + tp.t_len = ItemIdGetLength(lp); + tp.t_self = *tid; + +l1: + result = HeapTupleSatisfiesUpdate(&tp, cid, buffer); + + if (result == TM_Invisible) + { + UnlockReleaseBuffer(buffer); + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("attempted to delete invisible tuple"))); + } + else if (result == TM_BeingModified && wait) + { + TransactionId xwait; + uint16 infomask; + + /* must copy state data before unlocking buffer */ + xwait = HeapTupleHeaderGetRawXmax(tp.t_data); + infomask = tp.t_data->t_infomask; + + /* + * Sleep until concurrent transaction ends -- except when there's a + * single locker and it's our own transaction. Note we don't care + * which lock mode the locker has, because we need the strongest one. + * + * Before sleeping, we need to acquire tuple lock to establish our + * priority for the tuple (see heap_lock_tuple). LockTuple will + * release us when we are next-in-line for the tuple. + * + * If we are forced to "start over" below, we keep the tuple lock; + * this arranges that we stay at the head of the line while rechecking + * tuple state. + */ + if (infomask & HEAP_XMAX_IS_MULTI) + { + bool current_is_member = false; + + if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask, + LockTupleExclusive, &current_is_member)) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + /* + * Acquire the lock, if necessary (but skip it when we're + * requesting a lock and already have one; avoids deadlock). + */ + if (!current_is_member) + heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive, + LockWaitBlock, &have_tuple_lock); + + /* wait for multixact */ + MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate, infomask, + relation, &(tp.t_self), XLTW_Delete, + NULL); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + /* + * If xwait had just locked the tuple then some other xact + * could update this tuple before we get to this point. Check + * for xmax change, and start over if so. + */ + if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) || + !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data), + xwait)) + goto l1; + } + + /* + * You might think the multixact is necessarily done here, but not + * so: it could have surviving members, namely our own xact or + * other subxacts of this backend. It is legal for us to delete + * the tuple in either case, however (the latter case is + * essentially a situation of upgrading our former shared lock to + * exclusive). We don't bother changing the on-disk hint bits + * since we are about to overwrite the xmax altogether. + */ + } + else if (!TransactionIdIsCurrentTransactionId(xwait)) + { + /* + * Wait for regular transaction to end; but first, acquire tuple + * lock. + */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive, + LockWaitBlock, &have_tuple_lock); + XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + /* + * xwait is done, but if xwait had just locked the tuple then some + * other xact could update this tuple before we get to this point.
+ * Check for xmax change, and start over if so. + */ + if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) || + !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data), + xwait)) + goto l1; + + /* Otherwise check if it committed or aborted */ + UpdateXmaxHintBits(tp.t_data, buffer, xwait); + } + + /* + * We may overwrite if previous xmax aborted, or if it committed but + * only locked the tuple without updating it. + */ + if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) || + HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask) || + HeapTupleHeaderIsOnlyLocked(tp.t_data)) + result = TM_Ok; + else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid)) + result = TM_Updated; + else + result = TM_Deleted; + } + + if (crosscheck != InvalidSnapshot && result == TM_Ok) + { + /* Perform additional check for transaction-snapshot mode RI updates */ + if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer)) + result = TM_Updated; + } + + if (result != TM_Ok) + { + Assert(result == TM_SelfModified || + result == TM_Updated || + result == TM_Deleted || + result == TM_BeingModified); + Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID)); + Assert(result != TM_Updated || + !ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid)); + tmfd->ctid = tp.t_data->t_ctid; + tmfd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data); + if (result == TM_SelfModified) + tmfd->cmax = HeapTupleHeaderGetCmax(tp.t_data); + else + tmfd->cmax = InvalidCommandId; + UnlockReleaseBuffer(buffer); + if (have_tuple_lock) + UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive); + if (vmbuffer != InvalidBuffer) + ReleaseBuffer(vmbuffer); + return result; + } + + /* + * We're about to do the actual delete -- check for conflict first, to + * avoid possibly having to roll back work we've just done. + * + * This is safe without a recheck as long as there is no possibility of + * another process scanning the page between this check and the delete + * being visible to the scan (i.e., an exclusive buffer content lock is + * continuously held from this point until the tuple delete is visible). + */ + CheckForSerializableConflictIn(relation, tid, BufferGetBlockNumber(buffer)); + + /* replace cid with a combo CID if necessary */ + HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo); + + /* + * Compute replica identity tuple before entering the critical section so + * we don't PANIC upon a memory allocation failure. + */ + old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied); + + /* + * If this is the first possibly-multixact-able operation in the current + * transaction, set my per-backend OldestMemberMXactId setting. We can be + * certain that the transaction will never become a member of any older + * MultiXactIds than that. (We have to do this even if we end up just + * using our own TransactionId below, since some other backend could + * incorporate our XID into a MultiXact immediately afterwards.) + */ + MultiXactIdSetOldestMember(); + + compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tp.t_data), + tp.t_data->t_infomask, tp.t_data->t_infomask2, + xid, LockTupleExclusive, true, + &new_xmax, &new_infomask, &new_infomask2); + + START_CRIT_SECTION(); + + /* + * If this transaction commits, the tuple will become DEAD sooner or + * later. Set flag that this page is a candidate for pruning once our xid + * falls below the OldestXmin horizon. If the transaction finally aborts, + * the subsequent page pruning will be a no-op and the hint will be + * cleared. 
+ */ + PageSetPrunable(page, xid); + + if (PageIsAllVisible(page)) + { + all_visible_cleared = true; + PageClearAllVisible(page); + visibilitymap_clear(relation, BufferGetBlockNumber(buffer), + vmbuffer, VISIBILITYMAP_VALID_BITS); + } + + /* store transaction information of xact deleting the tuple */ + tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; + tp.t_data->t_infomask |= new_infomask; + tp.t_data->t_infomask2 |= new_infomask2; + HeapTupleHeaderClearHotUpdated(tp.t_data); + HeapTupleHeaderSetXmax(tp.t_data, new_xmax); + HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo); + /* Make sure there is no forward chain link in t_ctid */ + tp.t_data->t_ctid = tp.t_self; + + /* Signal that this is actually a move into another partition */ + if (changingPart) + HeapTupleHeaderSetMovedPartitions(tp.t_data); + + MarkBufferDirty(buffer); + + /* + * XLOG stuff + * + * NB: heap_abort_speculative() uses the same xlog record and replay + * routines. + */ + if (RelationNeedsWAL(relation)) + { + xl_heap_delete xlrec; + xl_heap_header xlhdr; + XLogRecPtr recptr; + + /* + * For logical decode we need combo CIDs to properly decode the + * catalog + */ + if (RelationIsAccessibleInLogicalDecoding(relation)) + log_heap_new_cid(relation, &tp); + + xlrec.flags = 0; + if (all_visible_cleared) + xlrec.flags |= XLH_DELETE_ALL_VISIBLE_CLEARED; + if (changingPart) + xlrec.flags |= XLH_DELETE_IS_PARTITION_MOVE; + xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask, + tp.t_data->t_infomask2); + xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); + xlrec.xmax = new_xmax; + + if (old_key_tuple != NULL) + { + if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL) + xlrec.flags |= XLH_DELETE_CONTAINS_OLD_TUPLE; + else + xlrec.flags |= XLH_DELETE_CONTAINS_OLD_KEY; + } + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapDelete); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + + /* + * Log replica identity of the deleted tuple if there is one + */ + if (old_key_tuple != NULL) + { + xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2; + xlhdr.t_infomask = old_key_tuple->t_data->t_infomask; + xlhdr.t_hoff = old_key_tuple->t_data->t_hoff; + + XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader); + XLogRegisterData((char *) old_key_tuple->t_data + + SizeofHeapTupleHeader, + old_key_tuple->t_len + - SizeofHeapTupleHeader); + } + + /* filtering by origin on a row level is much more efficient */ + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); + + recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + if (vmbuffer != InvalidBuffer) + ReleaseBuffer(vmbuffer); + + /* + * If the tuple has toasted out-of-line attributes, we need to delete + * those items too. We have to do this before releasing the buffer + * because we need to look at the contents of the tuple, but it's OK to + * release the content lock on the buffer first. + */ + if (relation->rd_rel->relkind != RELKIND_RELATION && + relation->rd_rel->relkind != RELKIND_MATVIEW) + { + /* toast table entries should never be recursively toasted */ + Assert(!HeapTupleHasExternal(&tp)); + } + else if (HeapTupleHasExternal(&tp)) + heap_toast_delete(relation, &tp, false); + + /* + * Mark tuple for invalidation from system caches at next command + * boundary. We have to do this before releasing the buffer because we + * need to look at the contents of the tuple. 
+ */ + CacheInvalidateHeapTuple(relation, &tp, NULL); + + /* Now we can release the buffer */ + ReleaseBuffer(buffer); + + /* + * Release the lmgr tuple lock, if we had it. + */ + if (have_tuple_lock) + UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive); + + pgstat_count_heap_delete(relation); + + if (old_key_tuple != NULL && old_key_copied) + heap_freetuple(old_key_tuple); + + return TM_Ok; +} + +/* + * simple_heap_delete - delete a tuple + * + * This routine may be used to delete a tuple when concurrent updates of + * the target tuple are not expected (for example, because we have a lock + * on the relation associated with the tuple). Any failure is reported + * via ereport(). + */ +void +simple_heap_delete(Relation relation, ItemPointer tid) +{ + TM_Result result; + TM_FailureData tmfd; + + result = heap_delete(relation, tid, + GetCurrentCommandId(true), InvalidSnapshot, + true /* wait for commit */ , + &tmfd, false /* changingPart */ ); + switch (result) + { + case TM_SelfModified: + /* Tuple was already updated in current command? */ + elog(ERROR, "tuple already updated by self"); + break; + + case TM_Ok: + /* done successfully */ + break; + + case TM_Updated: + elog(ERROR, "tuple concurrently updated"); + break; + + case TM_Deleted: + elog(ERROR, "tuple concurrently deleted"); + break; + + default: + elog(ERROR, "unrecognized heap_delete status: %u", result); + break; + } +} + +/* + * heap_update - replace a tuple + * + * See table_tuple_update() for an explanation of the parameters, except that + * this routine directly takes a tuple rather than a slot. + * + * In the failure cases, the routine fills *tmfd with the tuple's t_ctid, + * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last + * only for TM_SelfModified, since we cannot obtain cmax from a combo CID + * generated by another transaction). + */ +TM_Result +heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, + CommandId cid, Snapshot crosscheck, bool wait, + TM_FailureData *tmfd, LockTupleMode *lockmode) +{ + TM_Result result; + TransactionId xid = GetCurrentTransactionId(); + Bitmapset *hot_attrs; + Bitmapset *key_attrs; + Bitmapset *id_attrs; + Bitmapset *interesting_attrs; + Bitmapset *modified_attrs; + ItemId lp; + HeapTupleData oldtup; + HeapTuple heaptup; + HeapTuple old_key_tuple = NULL; + bool old_key_copied = false; + Page page; + BlockNumber block; + MultiXactStatus mxact_status; + Buffer buffer, + newbuf, + vmbuffer = InvalidBuffer, + vmbuffer_new = InvalidBuffer; + bool need_toast; + Size newtupsize, + pagefree; + bool have_tuple_lock = false; + bool iscombo; + bool use_hot_update = false; + bool hot_attrs_checked = false; + bool key_intact; + bool all_visible_cleared = false; + bool all_visible_cleared_new = false; + bool checked_lockers; + bool locker_remains; + bool id_has_external = false; + TransactionId xmax_new_tuple, + xmax_old_tuple; + uint16 infomask_old_tuple, + infomask2_old_tuple, + infomask_new_tuple, + infomask2_new_tuple; + + Assert(ItemPointerIsValid(otid)); + + /* Cheap, simplistic check that the tuple matches the rel's rowtype. */ + Assert(HeapTupleHeaderGetNatts(newtup->t_data) <= + RelationGetNumberOfAttributes(relation)); + + /* + * Forbid this during a parallel operation, lest it allocate a combo CID. + * Other workers might need that combo CID for visibility checks, and we + * have no provision for broadcasting it to them. 
+ */ + if (IsInParallelMode()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot update tuples during a parallel operation"))); + + /* + * Fetch the list of attributes to be checked for various operations. + * + * For HOT considerations, this is wasted effort if we fail to update or + * have to put the new tuple on a different page. But we must compute the + * list before obtaining buffer lock --- in the worst case, if we are + * doing an update on one of the relevant system catalogs, we could + * deadlock if we try to fetch the list later. In any case, the relcache + * caches the data so this is usually pretty cheap. + * + * We also need columns used by the replica identity and columns that are + * considered the "key" of rows in the table. + * + * Note that we get copies of each bitmap, so we need not worry about + * relcache flush happening midway through. + */ + hot_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_ALL); + key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY); + id_attrs = RelationGetIndexAttrBitmap(relation, + INDEX_ATTR_BITMAP_IDENTITY_KEY); + + + block = ItemPointerGetBlockNumber(otid); + buffer = ReadBuffer(relation, block); + page = BufferGetPage(buffer); + + interesting_attrs = NULL; + + /* + * If the page is already full, there is hardly any chance of doing a HOT + * update on this page. It might be wasteful effort to look for index + * column updates only to later reject HOT updates for lack of space in + * the same page. So we be conservative and only fetch hot_attrs if the + * page is not already full. Since we are already holding a pin on the + * buffer, there is no chance that the buffer can get cleaned up + * concurrently and even if that was possible, in the worst case we lose a + * chance to do a HOT update. + */ + if (!PageIsFull(page)) + { + interesting_attrs = bms_add_members(interesting_attrs, hot_attrs); + hot_attrs_checked = true; + } + interesting_attrs = bms_add_members(interesting_attrs, key_attrs); + interesting_attrs = bms_add_members(interesting_attrs, id_attrs); + + /* + * Before locking the buffer, pin the visibility map page if it appears to + * be necessary. Since we haven't got the lock yet, someone else might be + * in the middle of changing this, so we'll need to recheck after we have + * the lock. + */ + if (PageIsAllVisible(page)) + visibilitymap_pin(relation, block, &vmbuffer); + + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid)); + Assert(ItemIdIsNormal(lp)); + + /* + * Fill in enough data in oldtup for HeapDetermineColumnsInfo to work + * properly. + */ + oldtup.t_tableOid = RelationGetRelid(relation); + oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp); + oldtup.t_len = ItemIdGetLength(lp); + oldtup.t_self = *otid; + + /* the new tuple is ready, except for this: */ + newtup->t_tableOid = RelationGetRelid(relation); + + /* + * Determine columns modified by the update. Additionally, identify + * whether any of the unmodified replica identity key attributes in the + * old tuple is externally stored or not. This is required because for + * such attributes the flattened value won't be WAL logged as part of the + * new tuple so we must include it as part of the old_key_tuple. See + * ExtractReplicaIdentity. 
+ */ + modified_attrs = HeapDetermineColumnsInfo(relation, interesting_attrs, + id_attrs, &oldtup, + newtup, &id_has_external); + + /* + * If we're not updating any "key" column, we can grab a weaker lock type. + * This allows for more concurrency when we are running simultaneously + * with foreign key checks. + * + * Note that if a column gets detoasted while executing the update, but + * the value ends up being the same, this test will fail and we will use + * the stronger lock. This is acceptable; the important case to optimize + * is updates that don't manipulate key columns, not those that + * serendipitously arrive at the same key values. + */ + if (!bms_overlap(modified_attrs, key_attrs)) + { + *lockmode = LockTupleNoKeyExclusive; + mxact_status = MultiXactStatusNoKeyUpdate; + key_intact = true; + + /* + * If this is the first possibly-multixact-able operation in the + * current transaction, set my per-backend OldestMemberMXactId + * setting. We can be certain that the transaction will never become a + * member of any older MultiXactIds than that. (We have to do this + * even if we end up just using our own TransactionId below, since + * some other backend could incorporate our XID into a MultiXact + * immediately afterwards.) + */ + MultiXactIdSetOldestMember(); + } + else + { + *lockmode = LockTupleExclusive; + mxact_status = MultiXactStatusUpdate; + key_intact = false; + } + + /* + * Note: beyond this point, use oldtup not otid to refer to old tuple. + * otid may very well point at newtup->t_self, which we will overwrite + * with the new tuple's location, so there's great risk of confusion if we + * use otid anymore. + */ + +l2: + checked_lockers = false; + locker_remains = false; + result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer); + + /* see below about the "no wait" case */ + Assert(result != TM_BeingModified || wait); + + if (result == TM_Invisible) + { + UnlockReleaseBuffer(buffer); + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("attempted to update invisible tuple"))); + } + else if (result == TM_BeingModified && wait) + { + TransactionId xwait; + uint16 infomask; + bool can_continue = false; + + /* + * XXX note that we don't consider the "no wait" case here. This + * isn't a problem currently because no caller uses that case, but it + * should be fixed if such a caller is introduced. It wasn't a + * problem previously because this code would always wait, but now + * that some tuple locks do not conflict with one of the lock modes we + * use, it is possible that this case is interesting to handle + * specially. + * + * This may cause failures with third-party code that calls + * heap_update directly. + */ + + /* must copy state data before unlocking buffer */ + xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data); + infomask = oldtup.t_data->t_infomask; + + /* + * Now we have to do something about the existing locker. If it's a + * multi, sleep on it; we might be awakened before it is completely + * gone (or even not sleep at all in some cases); we need to preserve + * it as locker, unless it is gone completely. + * + * If it's not a multi, we need to check for sleeping conditions + * before actually going to sleep. If the update doesn't conflict + * with the locks, we just continue without sleeping (but making sure + * it is preserved). + * + * Before sleeping, we need to acquire tuple lock to establish our + * priority for the tuple (see heap_lock_tuple). LockTuple will + * release us when we are next-in-line for the tuple. 
Note we must + * not acquire the tuple lock until we're sure we're going to sleep; + * otherwise we're open for race conditions with other transactions + * holding the tuple lock which sleep on us. + * + * If we are forced to "start over" below, we keep the tuple lock; + * this arranges that we stay at the head of the line while rechecking + * tuple state. + */ + if (infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId update_xact; + int remain; + bool current_is_member = false; + + if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask, + *lockmode, &current_is_member)) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + /* + * Acquire the lock, if necessary (but skip it when we're + * requesting a lock and already have one; avoids deadlock). + */ + if (!current_is_member) + heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode, + LockWaitBlock, &have_tuple_lock); + + /* wait for multixact */ + MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask, + relation, &oldtup.t_self, XLTW_Update, + &remain); + checked_lockers = true; + locker_remains = remain != 0; + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + /* + * If xwait had just locked the tuple then some other xact + * could update this tuple before we get to this point. Check + * for xmax change, and start over if so. + */ + if (xmax_infomask_changed(oldtup.t_data->t_infomask, + infomask) || + !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data), + xwait)) + goto l2; + } + + /* + * Note that the multixact may not be done by now. It could have + * surviving members; our own xact or other subxacts of this + * backend, and also any other concurrent transaction that locked + * the tuple with LockTupleKeyShare if we only got + * LockTupleNoKeyExclusive. If this is the case, we have to be + * careful to mark the updated tuple with the surviving members in + * Xmax. + * + * Note that there could have been another update in the + * MultiXact. In that case, we need to check whether it committed + * or aborted. If it aborted we are safe to update it again; + * otherwise there is an update conflict, and we have to return + * TableTuple{Deleted, Updated} below. + * + * In the LockTupleExclusive case, we still need to preserve the + * surviving members: those would include the tuple locks we had + * before this one, which are important to keep in case this + * subxact aborts. + */ + if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask)) + update_xact = HeapTupleGetUpdateXid(oldtup.t_data); + else + update_xact = InvalidTransactionId; + + /* + * There was no UPDATE in the MultiXact; or it aborted. No + * TransactionIdIsInProgress() call needed here, since we called + * MultiXactIdWait() above. + */ + if (!TransactionIdIsValid(update_xact) || + TransactionIdDidAbort(update_xact)) + can_continue = true; + } + else if (TransactionIdIsCurrentTransactionId(xwait)) + { + /* + * The only locker is ourselves; we can avoid grabbing the tuple + * lock here, but must preserve our locking information. + */ + checked_lockers = true; + locker_remains = true; + can_continue = true; + } + else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact) + { + /* + * If it's just a key-share locker, and we're not changing the key + * columns, we don't need to wait for it to end; but we need to + * preserve it as locker. + */ + checked_lockers = true; + locker_remains = true; + can_continue = true; + } + else + { + /* + * Wait for regular transaction to end; but first, acquire tuple + * lock.
+ */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode, + LockWaitBlock, &have_tuple_lock); + XactLockTableWait(xwait, relation, &oldtup.t_self, + XLTW_Update); + checked_lockers = true; + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + /* + * xwait is done, but if xwait had just locked the tuple then some + * other xact could update this tuple before we get to this point. + * Check for xmax change, and start over if so. + */ + if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) || + !TransactionIdEquals(xwait, + HeapTupleHeaderGetRawXmax(oldtup.t_data))) + goto l2; + + /* Otherwise check if it committed or aborted */ + UpdateXmaxHintBits(oldtup.t_data, buffer, xwait); + if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) + can_continue = true; + } + + if (can_continue) + result = TM_Ok; + else if (!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid)) + result = TM_Updated; + else + result = TM_Deleted; + } + + if (crosscheck != InvalidSnapshot && result == TM_Ok) + { + /* Perform additional check for transaction-snapshot mode RI updates */ + if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer)) + { + result = TM_Updated; + Assert(!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid)); + } + } + + if (result != TM_Ok) + { + Assert(result == TM_SelfModified || + result == TM_Updated || + result == TM_Deleted || + result == TM_BeingModified); + Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)); + Assert(result != TM_Updated || + !ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid)); + tmfd->ctid = oldtup.t_data->t_ctid; + tmfd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data); + if (result == TM_SelfModified) + tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data); + else + tmfd->cmax = InvalidCommandId; + UnlockReleaseBuffer(buffer); + if (have_tuple_lock) + UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode); + if (vmbuffer != InvalidBuffer) + ReleaseBuffer(vmbuffer); + bms_free(hot_attrs); + bms_free(key_attrs); + bms_free(id_attrs); + bms_free(modified_attrs); + bms_free(interesting_attrs); + return result; + } + + /* + * If we didn't pin the visibility map page and the page has become all + * visible while we were busy locking the buffer, or during some + * subsequent window during which we had it unlocked, we'll have to unlock + * and re-lock, to avoid holding the buffer lock across an I/O. That's a + * bit unfortunate, especially since we'll now have to recheck whether the + * tuple has been locked or updated under us, but hopefully it won't + * happen very often. + */ + if (vmbuffer == InvalidBuffer && PageIsAllVisible(page)) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + visibilitymap_pin(relation, block, &vmbuffer); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + goto l2; + } + + /* Fill in transaction status data */ + + /* + * If the tuple we're updating is locked, we need to preserve the locking + * info in the old tuple's Xmax. Prepare a new Xmax value for this. + */ + compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data), + oldtup.t_data->t_infomask, + oldtup.t_data->t_infomask2, + xid, *lockmode, true, + &xmax_old_tuple, &infomask_old_tuple, + &infomask2_old_tuple); + + /* + * And also prepare an Xmax value for the new copy of the tuple. If there + * was no xmax previously, or there was one but all lockers are now gone, + * then use InvalidXid; otherwise, get the xmax from the old tuple. 
(In + * rare cases that might also be InvalidXid and yet not have the + * HEAP_XMAX_INVALID bit set; that's fine.) + */ + if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) || + HEAP_LOCKED_UPGRADED(oldtup.t_data->t_infomask) || + (checked_lockers && !locker_remains)) + xmax_new_tuple = InvalidTransactionId; + else + xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data); + + if (!TransactionIdIsValid(xmax_new_tuple)) + { + infomask_new_tuple = HEAP_XMAX_INVALID; + infomask2_new_tuple = 0; + } + else + { + /* + * If we found a valid Xmax for the new tuple, then the infomask bits + * to use on the new tuple depend on what was there on the old one. + * Note that since we're doing an update, the only possibility is that + * the lockers had FOR KEY SHARE lock. + */ + if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) + { + GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple, + &infomask2_new_tuple); + } + else + { + infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY; + infomask2_new_tuple = 0; + } + } + + /* + * Prepare the new tuple with the appropriate initial values of Xmin and + * Xmax, as well as initial infomask bits as computed above. + */ + newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK); + newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK); + HeapTupleHeaderSetXmin(newtup->t_data, xid); + HeapTupleHeaderSetCmin(newtup->t_data, cid); + newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple; + newtup->t_data->t_infomask2 |= infomask2_new_tuple; + HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple); + + /* + * Replace cid with a combo CID if necessary. Note that we already put + * the plain cid into the new tuple. + */ + HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo); + + /* + * If the toaster needs to be activated, OR if the new tuple will not fit + * on the same page as the old, then we need to release the content lock + * (but not the pin!) on the old tuple's buffer while we are off doing + * TOAST and/or table-file-extension work. We must mark the old tuple to + * show that it's locked, else other processes may try to update it + * themselves. + * + * We need to invoke the toaster if there are already any out-of-line + * toasted values present, or if the new tuple is over-threshold. + */ + if (relation->rd_rel->relkind != RELKIND_RELATION && + relation->rd_rel->relkind != RELKIND_MATVIEW) + { + /* toast table entries should never be recursively toasted */ + Assert(!HeapTupleHasExternal(&oldtup)); + Assert(!HeapTupleHasExternal(newtup)); + need_toast = false; + } + else + need_toast = (HeapTupleHasExternal(&oldtup) || + HeapTupleHasExternal(newtup) || + newtup->t_len > TOAST_TUPLE_THRESHOLD); + + pagefree = PageGetHeapFreeSpace(page); + + newtupsize = MAXALIGN(newtup->t_len); + + if (need_toast || newtupsize > pagefree) + { + TransactionId xmax_lock_old_tuple; + uint16 infomask_lock_old_tuple, + infomask2_lock_old_tuple; + bool cleared_all_frozen = false; + + /* + * To prevent concurrent sessions from updating the tuple, we have to + * temporarily mark it locked, while we release the page-level lock. + * + * To satisfy the rule that any xid potentially appearing in a buffer + * written out to disk, we unfortunately have to WAL log this + * temporary modification. We can reuse xl_heap_lock for this + * purpose. If we crash/error before following through with the + * actual update, xmax will be of an aborted transaction, allowing + * other sessions to proceed. + */ + + /* + * Compute xmax / infomask appropriate for locking the tuple. 
This has + * to be done separately from the combo that's going to be used for + * updating, because the potentially created multixact would otherwise + * be wrong. + */ + compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data), + oldtup.t_data->t_infomask, + oldtup.t_data->t_infomask2, + xid, *lockmode, false, + &xmax_lock_old_tuple, &infomask_lock_old_tuple, + &infomask2_lock_old_tuple); + + Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple)); + + START_CRIT_SECTION(); + + /* Clear obsolete visibility flags ... */ + oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; + HeapTupleClearHotUpdated(&oldtup); + /* ... and store info about transaction updating this tuple */ + Assert(TransactionIdIsValid(xmax_lock_old_tuple)); + HeapTupleHeaderSetXmax(oldtup.t_data, xmax_lock_old_tuple); + oldtup.t_data->t_infomask |= infomask_lock_old_tuple; + oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple; + HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo); + + /* temporarily make it look not-updated, but locked */ + oldtup.t_data->t_ctid = oldtup.t_self; + + /* + * Clear all-frozen bit on visibility map if needed. We could + * immediately reset ALL_VISIBLE, but given that the WAL logging + * overhead would be unchanged, that doesn't seem necessarily + * worthwhile. + */ + if (PageIsAllVisible(page) && + visibilitymap_clear(relation, block, vmbuffer, + VISIBILITYMAP_ALL_FROZEN)) + cleared_all_frozen = true; + + MarkBufferDirty(buffer); + + if (RelationNeedsWAL(relation)) + { + xl_heap_lock xlrec; + XLogRecPtr recptr; + + XLogBeginInsert(); + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + + xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self); + xlrec.locking_xid = xmax_lock_old_tuple; + xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask, + oldtup.t_data->t_infomask2); + xlrec.flags = + cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0; + XLogRegisterData((char *) &xlrec, SizeOfHeapLock); + recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK); + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + /* + * Let the toaster do its thing, if needed. + * + * Note: below this point, heaptup is the data we actually intend to + * store into the relation; newtup is the caller's original untoasted + * data. + */ + if (need_toast) + { + /* Note we always use WAL and FSM during updates */ + heaptup = heap_toast_insert_or_update(relation, newtup, &oldtup, 0); + newtupsize = MAXALIGN(heaptup->t_len); + } + else + heaptup = newtup; + + /* + * Now, do we need a new page for the tuple, or not? This is a bit + * tricky since someone else could have added tuples to the page while + * we weren't looking. We have to recheck the available space after + * reacquiring the buffer lock. But don't bother to do that if the + * former amount of free space is still not enough; it's unlikely + * there's more free now than before. + * + * What's more, if we need to get a new page, we will need to acquire + * buffer locks on both old and new pages. To avoid deadlock against + * some other backend trying to get the same two locks in the other + * order, we must be consistent about the order we get the locks in. + * We use the rule "lock the lower-numbered page of the relation + * first". To implement this, we must do RelationGetBufferForTuple + * while not holding the lock on the old page, and we must rely on it + * to get the locks on both pages in the correct order. 
+ * + * Another consideration is that we need visibility map page pin(s) if + * we will have to clear the all-visible flag on either page. If we + * call RelationGetBufferForTuple, we rely on it to acquire any such + * pins; but if we don't, we have to handle that here. Hence we need + * a loop. + */ + for (;;) + { + if (newtupsize > pagefree) + { + /* It doesn't fit, must use RelationGetBufferForTuple. */ + newbuf = RelationGetBufferForTuple(relation, heaptup->t_len, + buffer, 0, NULL, + &vmbuffer_new, &vmbuffer); + /* We're all done. */ + break; + } + /* Acquire VM page pin if needed and we don't have it. */ + if (vmbuffer == InvalidBuffer && PageIsAllVisible(page)) + visibilitymap_pin(relation, block, &vmbuffer); + /* Re-acquire the lock on the old tuple's page. */ + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + /* Re-check using the up-to-date free space */ + pagefree = PageGetHeapFreeSpace(page); + if (newtupsize > pagefree || + (vmbuffer == InvalidBuffer && PageIsAllVisible(page))) + { + /* + * Rats, it doesn't fit anymore, or somebody just now set the + * all-visible flag. We must now unlock and loop to avoid + * deadlock. Fortunately, this path should seldom be taken. + */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + } + else + { + /* We're all done. */ + newbuf = buffer; + break; + } + } + } + else + { + /* No TOAST work needed, and it'll fit on same page */ + newbuf = buffer; + heaptup = newtup; + } + + /* + * We're about to do the actual update -- check for conflict first, to + * avoid possibly having to roll back work we've just done. + * + * This is safe without a recheck as long as there is no possibility of + * another process scanning the pages between this check and the update + * being visible to the scan (i.e., exclusive buffer content lock(s) are + * continuously held from this point until the tuple update is visible). + * + * For the new tuple the only check needed is at the relation level, but + * since both tuples are in the same relation and the check for oldtup + * will include checking the relation level, there is no benefit to a + * separate check for the new tuple. + */ + CheckForSerializableConflictIn(relation, &oldtup.t_self, + BufferGetBlockNumber(buffer)); + + /* + * At this point newbuf and buffer are both pinned and locked, and newbuf + * has enough space for the new tuple. If they are the same buffer, only + * one pin is held. + */ + + if (newbuf == buffer) + { + /* + * Since the new tuple is going into the same page, we might be able + * to do a HOT update. Check if any of the index columns have been + * changed. If the page was already full, we may have skipped checking + * for index columns, and also can't do a HOT update. + */ + if (hot_attrs_checked && !bms_overlap(modified_attrs, hot_attrs)) + use_hot_update = true; + } + else + { + /* Set a hint that the old page could use prune/defrag */ + PageSetFull(page); + } + + /* + * Compute replica identity tuple before entering the critical section so + * we don't PANIC upon a memory allocation failure. + * ExtractReplicaIdentity() will return NULL if nothing needs to be + * logged. Pass old key required as true only if the replica identity key + * columns are modified or it has external data. 
+ */ + old_key_tuple = ExtractReplicaIdentity(relation, &oldtup, + bms_overlap(modified_attrs, id_attrs) || + id_has_external, + &old_key_copied); + + /* NO EREPORT(ERROR) from here till changes are logged */ + START_CRIT_SECTION(); + + /* + * If this transaction commits, the old tuple will become DEAD sooner or + * later. Set flag that this page is a candidate for pruning once our xid + * falls below the OldestXmin horizon. If the transaction finally aborts, + * the subsequent page pruning will be a no-op and the hint will be + * cleared. + * + * XXX Should we set hint on newbuf as well? If the transaction aborts, + * there would be a prunable tuple in the newbuf; but for now we choose + * not to optimize for aborts. Note that heap_xlog_update must be kept in + * sync if this decision changes. + */ + PageSetPrunable(page, xid); + + if (use_hot_update) + { + /* Mark the old tuple as HOT-updated */ + HeapTupleSetHotUpdated(&oldtup); + /* And mark the new tuple as heap-only */ + HeapTupleSetHeapOnly(heaptup); + /* Mark the caller's copy too, in case different from heaptup */ + HeapTupleSetHeapOnly(newtup); + } + else + { + /* Make sure tuples are correctly marked as not-HOT */ + HeapTupleClearHotUpdated(&oldtup); + HeapTupleClearHeapOnly(heaptup); + HeapTupleClearHeapOnly(newtup); + } + + RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */ + + + /* Clear obsolete visibility flags, possibly set by ourselves above... */ + oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; + /* ... and store info about transaction updating this tuple */ + Assert(TransactionIdIsValid(xmax_old_tuple)); + HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple); + oldtup.t_data->t_infomask |= infomask_old_tuple; + oldtup.t_data->t_infomask2 |= infomask2_old_tuple; + HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo); + + /* record address of new tuple in t_ctid of old one */ + oldtup.t_data->t_ctid = heaptup->t_self; + + /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */ + if (PageIsAllVisible(BufferGetPage(buffer))) + { + all_visible_cleared = true; + PageClearAllVisible(BufferGetPage(buffer)); + visibilitymap_clear(relation, BufferGetBlockNumber(buffer), + vmbuffer, VISIBILITYMAP_VALID_BITS); + } + if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf))) + { + all_visible_cleared_new = true; + PageClearAllVisible(BufferGetPage(newbuf)); + visibilitymap_clear(relation, BufferGetBlockNumber(newbuf), + vmbuffer_new, VISIBILITYMAP_VALID_BITS); + } + + if (newbuf != buffer) + MarkBufferDirty(newbuf); + MarkBufferDirty(buffer); + + /* XLOG stuff */ + if (RelationNeedsWAL(relation)) + { + XLogRecPtr recptr; + + /* + * For logical decoding we need combo CIDs to properly decode the + * catalog. + */ + if (RelationIsAccessibleInLogicalDecoding(relation)) + { + log_heap_new_cid(relation, &oldtup); + log_heap_new_cid(relation, heaptup); + } + + recptr = log_heap_update(relation, buffer, + newbuf, &oldtup, heaptup, + old_key_tuple, + all_visible_cleared, + all_visible_cleared_new); + if (newbuf != buffer) + { + PageSetLSN(BufferGetPage(newbuf), recptr); + } + PageSetLSN(BufferGetPage(buffer), recptr); + } + + END_CRIT_SECTION(); + + if (newbuf != buffer) + LockBuffer(newbuf, BUFFER_LOCK_UNLOCK); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + /* + * Mark old tuple for invalidation from system caches at next command + * boundary, and mark the new tuple for invalidation in case we abort. 
We + * have to do this before releasing the buffer because oldtup is in the + * buffer. (heaptup is all in local memory, but it's necessary to process + * both tuple versions in one call to inval.c so we can avoid redundant + * sinval messages.) + */ + CacheInvalidateHeapTuple(relation, &oldtup, heaptup); + + /* Now we can release the buffer(s) */ + if (newbuf != buffer) + ReleaseBuffer(newbuf); + ReleaseBuffer(buffer); + if (BufferIsValid(vmbuffer_new)) + ReleaseBuffer(vmbuffer_new); + if (BufferIsValid(vmbuffer)) + ReleaseBuffer(vmbuffer); + + /* + * Release the lmgr tuple lock, if we had it. + */ + if (have_tuple_lock) + UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode); + + pgstat_count_heap_update(relation, use_hot_update); + + /* + * If heaptup is a private copy, release it. Don't forget to copy t_self + * back to the caller's image, too. + */ + if (heaptup != newtup) + { + newtup->t_self = heaptup->t_self; + heap_freetuple(heaptup); + } + + if (old_key_tuple != NULL && old_key_copied) + heap_freetuple(old_key_tuple); + + bms_free(hot_attrs); + bms_free(key_attrs); + bms_free(id_attrs); + bms_free(modified_attrs); + bms_free(interesting_attrs); + + return TM_Ok; +} + +/* + * Check if the specified attribute's values are the same. Subroutine for + * HeapDetermineColumnsInfo. + */ +static bool +heap_attr_equals(TupleDesc tupdesc, int attrnum, Datum value1, Datum value2, + bool isnull1, bool isnull2) +{ + Form_pg_attribute att; + + /* + * If one value is NULL and other is not, then they are certainly not + * equal + */ + if (isnull1 != isnull2) + return false; + + /* + * If both are NULL, they can be considered equal. + */ + if (isnull1) + return true; + + /* + * We do simple binary comparison of the two datums. This may be overly + * strict because there can be multiple binary representations for the + * same logical value. But we should be OK as long as there are no false + * positives. Using a type-specific equality operator is messy because + * there could be multiple notions of equality in different operator + * classes; furthermore, we cannot safely invoke user-defined functions + * while holding exclusive buffer lock. + */ + if (attrnum <= 0) + { + /* The only allowed system columns are OIDs, so do this */ + return (DatumGetObjectId(value1) == DatumGetObjectId(value2)); + } + else + { + Assert(attrnum <= tupdesc->natts); + att = TupleDescAttr(tupdesc, attrnum - 1); + return datumIsEqual(value1, value2, att->attbyval, att->attlen); + } +} + +/* + * Check which columns are being updated. + * + * Given an updated tuple, determine (and return into the output bitmapset), + * from those listed as interesting, the set of columns that changed. + * + * has_external indicates if any of the unmodified attributes (from those + * listed as interesting) of the old tuple is a member of external_cols and is + * stored externally. + * + * The input interesting_cols bitmapset is destructively modified; that is OK + * since this is invoked at most once in heap_update. + */ +static Bitmapset * +HeapDetermineColumnsInfo(Relation relation, + Bitmapset *interesting_cols, + Bitmapset *external_cols, + HeapTuple oldtup, HeapTuple newtup, + bool *has_external) +{ + int attrnum; + Bitmapset *modified = NULL; + TupleDesc tupdesc = RelationGetDescr(relation); + + while ((attrnum = bms_first_member(interesting_cols)) >= 0) + { + Datum value1, + value2; + bool isnull1, + isnull2; + + attrnum += FirstLowInvalidHeapAttributeNumber; + + /* + * If it's a whole-tuple reference, say "not equal". 
It's not really + * worth supporting this case, since it could only succeed after a + * no-op update, which is hardly a case worth optimizing for. + */ + if (attrnum == 0) + { + modified = bms_add_member(modified, + attrnum - + FirstLowInvalidHeapAttributeNumber); + continue; + } + + /* + * Likewise, automatically say "not equal" for any system attribute + * other than tableOID; we cannot expect these to be consistent in a + * HOT chain, or even to be set correctly yet in the new tuple. + */ + if (attrnum < 0) + { + if (attrnum != TableOidAttributeNumber) + { + modified = bms_add_member(modified, + attrnum - + FirstLowInvalidHeapAttributeNumber); + continue; + } + } + + /* + * Extract the corresponding values. XXX this is pretty inefficient + * if there are many indexed columns. Should we do a single + * heap_deform_tuple call on each tuple, instead? But that doesn't + * work for system columns ... + */ + value1 = heap_getattr(oldtup, attrnum, tupdesc, &isnull1); + value2 = heap_getattr(newtup, attrnum, tupdesc, &isnull2); + + if (!heap_attr_equals(tupdesc, attrnum, value1, + value2, isnull1, isnull2)) + { + modified = bms_add_member(modified, + attrnum - + FirstLowInvalidHeapAttributeNumber); + continue; + } + + /* + * No need to check attributes that can't be stored externally. Note + * that system attributes can't be stored externally. + */ + if (attrnum < 0 || isnull1 || + TupleDescAttr(tupdesc, attrnum - 1)->attlen != -1) + continue; + + /* + * Check if the old tuple's attribute is stored externally and is a + * member of external_cols. + */ + if (VARATT_IS_EXTERNAL((struct varlena *) DatumGetPointer(value1)) && + bms_is_member(attrnum - FirstLowInvalidHeapAttributeNumber, + external_cols)) + *has_external = true; + } + + return modified; +} + +/* + * simple_heap_update - replace a tuple + * + * This routine may be used to update a tuple when concurrent updates of + * the target tuple are not expected (for example, because we have a lock + * on the relation associated with the tuple). Any failure is reported + * via ereport(). + */ +void +simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup) +{ + TM_Result result; + TM_FailureData tmfd; + LockTupleMode lockmode; + + result = heap_update(relation, otid, tup, + GetCurrentCommandId(true), InvalidSnapshot, + true /* wait for commit */ , + &tmfd, &lockmode); + switch (result) + { + case TM_SelfModified: + /* Tuple was already updated in current command? */ + elog(ERROR, "tuple already updated by self"); + break; + + case TM_Ok: + /* done successfully */ + break; + + case TM_Updated: + elog(ERROR, "tuple concurrently updated"); + break; + + case TM_Deleted: + elog(ERROR, "tuple concurrently deleted"); + break; + + default: + elog(ERROR, "unrecognized heap_update status: %u", result); + break; + } +} + + +/* + * Return the MultiXactStatus corresponding to the given tuple lock mode. + */ +static MultiXactStatus +get_mxact_status_for_lock(LockTupleMode mode, bool is_update) +{ + int retval; + + if (is_update) + retval = tupleLockExtraInfo[mode].updstatus; + else + retval = tupleLockExtraInfo[mode].lockstatus; + + if (retval == -1) + elog(ERROR, "invalid lock tuple mode %d/%s", mode, + is_update ? "true" : "false"); + + return (MultiXactStatus) retval; +} + +/* + * heap_lock_tuple - lock a tuple in shared or exclusive mode + * + * Note that this acquires a buffer pin, which the caller must release. 
+ * + * Input parameters: + * relation: relation containing tuple (caller must hold suitable lock) + * tid: TID of tuple to lock + * cid: current command ID (used for visibility test, and stored into + * tuple's cmax if lock is successful) + * mode: indicates if shared or exclusive tuple lock is desired + * wait_policy: what to do if tuple lock is not available + * follow_updates: if true, follow the update chain to also lock descendant + * tuples. + * + * Output parameters: + * *tuple: all fields filled in + * *buffer: set to buffer holding tuple (pinned but not locked at exit) + * *tmfd: filled in failure cases (see below) + * + * Function results are the same as the ones for table_tuple_lock(). + * + * In the failure cases other than TM_Invisible, the routine fills + * *tmfd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact, + * if necessary), and t_cmax (the last only for TM_SelfModified, + * since we cannot obtain cmax from a combo CID generated by another + * transaction). + * See comments for struct TM_FailureData for additional info. + * + * See README.tuplock for a thorough explanation of this mechanism. + */ +TM_Result +heap_lock_tuple(Relation relation, HeapTuple tuple, + CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, + bool follow_updates, + Buffer *buffer, TM_FailureData *tmfd) +{ + TM_Result result; + ItemPointer tid = &(tuple->t_self); + ItemId lp; + Page page; + Buffer vmbuffer = InvalidBuffer; + BlockNumber block; + TransactionId xid, + xmax; + uint16 old_infomask, + new_infomask, + new_infomask2; + bool first_time = true; + bool skip_tuple_lock = false; + bool have_tuple_lock = false; + bool cleared_all_frozen = false; + + *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); + block = ItemPointerGetBlockNumber(tid); + + /* + * Before locking the buffer, pin the visibility map page if it appears to + * be necessary. Since we haven't got the lock yet, someone else might be + * in the middle of changing this, so we'll need to recheck after we have + * the lock. + */ + if (PageIsAllVisible(BufferGetPage(*buffer))) + visibilitymap_pin(relation, block, &vmbuffer); + + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + + page = BufferGetPage(*buffer); + lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid)); + Assert(ItemIdIsNormal(lp)); + + tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp); + tuple->t_len = ItemIdGetLength(lp); + tuple->t_tableOid = RelationGetRelid(relation); + +l3: + result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer); + + if (result == TM_Invisible) + { + /* + * This is possible, but only when locking a tuple for ON CONFLICT + * UPDATE. We return this value here rather than throwing an error in + * order to give that case the opportunity to throw a more specific + * error. + */ + result = TM_Invisible; + goto out_locked; + } + else if (result == TM_BeingModified || + result == TM_Updated || + result == TM_Deleted) + { + TransactionId xwait; + uint16 infomask; + uint16 infomask2; + bool require_sleep; + ItemPointerData t_ctid; + + /* must copy state data before unlocking buffer */ + xwait = HeapTupleHeaderGetRawXmax(tuple->t_data); + infomask = tuple->t_data->t_infomask; + infomask2 = tuple->t_data->t_infomask2; + ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid); + + LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); + + /* + * If any subtransaction of the current top transaction already holds + * a lock as strong as or stronger than what we're requesting, we + * effectively hold the desired lock already. 
We *must* succeed + * without trying to take the tuple lock, else we will deadlock + * against anyone wanting to acquire a stronger lock. + * + * Note we only do this the first time we loop on the HTSU result; + * there is no point in testing in subsequent passes, because + * evidently our own transaction cannot have acquired a new lock after + * the first time we checked. + */ + if (first_time) + { + first_time = false; + + if (infomask & HEAP_XMAX_IS_MULTI) + { + int i; + int nmembers; + MultiXactMember *members; + + /* + * We don't need to allow old multixacts here; if that had + * been the case, HeapTupleSatisfiesUpdate would have returned + * MayBeUpdated and we wouldn't be here. + */ + nmembers = + GetMultiXactIdMembers(xwait, &members, false, + HEAP_XMAX_IS_LOCKED_ONLY(infomask)); + + for (i = 0; i < nmembers; i++) + { + /* only consider members of our own transaction */ + if (!TransactionIdIsCurrentTransactionId(members[i].xid)) + continue; + + if (TUPLOCK_from_mxstatus(members[i].status) >= mode) + { + pfree(members); + result = TM_Ok; + goto out_unlocked; + } + else + { + /* + * Disable acquisition of the heavyweight tuple lock. + * Otherwise, when promoting a weaker lock, we might + * deadlock with another locker that has acquired the + * heavyweight tuple lock and is waiting for our + * transaction to finish. + * + * Note that in this case we still need to wait for + * the multixact if required, to avoid acquiring + * conflicting locks. + */ + skip_tuple_lock = true; + } + } + + if (members) + pfree(members); + } + else if (TransactionIdIsCurrentTransactionId(xwait)) + { + switch (mode) + { + case LockTupleKeyShare: + Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) || + HEAP_XMAX_IS_SHR_LOCKED(infomask) || + HEAP_XMAX_IS_EXCL_LOCKED(infomask)); + result = TM_Ok; + goto out_unlocked; + case LockTupleShare: + if (HEAP_XMAX_IS_SHR_LOCKED(infomask) || + HEAP_XMAX_IS_EXCL_LOCKED(infomask)) + { + result = TM_Ok; + goto out_unlocked; + } + break; + case LockTupleNoKeyExclusive: + if (HEAP_XMAX_IS_EXCL_LOCKED(infomask)) + { + result = TM_Ok; + goto out_unlocked; + } + break; + case LockTupleExclusive: + if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) && + infomask2 & HEAP_KEYS_UPDATED) + { + result = TM_Ok; + goto out_unlocked; + } + break; + } + } + } + + /* + * Initially assume that we will have to wait for the locking + * transaction(s) to finish. We check various cases below in which + * this can be turned off. + */ + require_sleep = true; + if (mode == LockTupleKeyShare) + { + /* + * If we're requesting KeyShare, and there's no update present, we + * don't need to wait. Even if there is an update, we can still + * continue if the key hasn't been modified. + * + * However, if there are updates, we need to walk the update chain + * to mark future versions of the row as locked, too. That way, + * if somebody deletes that future version, we're protected + * against the key going away. This locking of future versions + * could block momentarily, if a concurrent transaction is + * deleting a key; or it could return a value to the effect that + * the transaction deleting the key has already committed. So we + * do this before re-locking the buffer; otherwise this would be + * prone to deadlocks. + * + * Note that the TID we're locking was grabbed before we unlocked + * the buffer. For it to change while we're not looking, the + * other properties we're testing for below after re-locking the + * buffer would also change, in which case we would restart this + * loop above. 
+ */
+ if (!(infomask2 & HEAP_KEYS_UPDATED))
+ {
+ bool updated;
+
+ updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask);
+
+ /*
+ * If there are updates, follow the update chain; bail out if
+ * that cannot be done.
+ */
+ if (follow_updates && updated)
+ {
+ TM_Result res;
+
+ res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
+ GetCurrentTransactionId(),
+ mode);
+ if (res != TM_Ok)
+ {
+ result = res;
+ /* recovery code expects to have buffer lock held */
+ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+ goto failed;
+ }
+ }
+
+ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+
+ /*
+ * Make sure it's still an appropriate lock, else start over.
+ * Also, if it wasn't updated before we released the lock, but
+ * is updated now, we start over too; the reason is that we
+ * now need to follow the update chain to lock the new
+ * versions.
+ */
+ if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
+ ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
+ !updated))
+ goto l3;
+
+ /* Things look okay, so we can skip sleeping */
+ require_sleep = false;
+
+ /*
+ * Note we allow Xmax to change here; other updaters/lockers
+ * could have modified it before we grabbed the buffer lock.
+ * However, this is not a problem, because with the recheck we
+ * just did we ensure that they still don't conflict with the
+ * lock we want.
+ */
+ }
+ }
+ else if (mode == LockTupleShare)
+ {
+ /*
+ * If we're requesting Share, we can similarly avoid sleeping if
+ * there's no update and no exclusive lock present.
+ */
+ if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
+ !HEAP_XMAX_IS_EXCL_LOCKED(infomask))
+ {
+ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+
+ /*
+ * Make sure it's still an appropriate lock, else start over.
+ * See above about allowing xmax to change.
+ */
+ if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
+ HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask))
+ goto l3;
+ require_sleep = false;
+ }
+ }
+ else if (mode == LockTupleNoKeyExclusive)
+ {
+ /*
+ * If we're requesting NoKeyExclusive, we might also be able to
+ * avoid sleeping; just ensure that there is no conflicting lock
+ * already acquired.
+ */
+ if (infomask & HEAP_XMAX_IS_MULTI)
+ {
+ if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
+ mode, NULL))
+ {
+ /*
+ * No conflict, but if the xmax changed under us in the
+ * meantime, start over.
+ */
+ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+ if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
+ !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
+ xwait))
+ goto l3;
+
+ /* otherwise, we're good */
+ require_sleep = false;
+ }
+ }
+ else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
+ {
+ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+
+ /* if the xmax changed in the meantime, start over */
+ if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
+ !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
+ xwait))
+ goto l3;
+ /* otherwise, we're good */
+ require_sleep = false;
+ }
+ }
+
+ /*
+ * As a check independent from those above, we can also avoid sleeping
+ * if the current transaction is the sole locker of the tuple. Note
+ * that the strength of the lock already held is irrelevant; this is
+ * not about recording the lock in Xmax (which will be done regardless
+ * of this optimization, below). Also, note that the cases where we
+ * hold a lock stronger than we are requesting are already handled
+ * above by not doing anything.
+ * + * Note we only deal with the non-multixact case here; MultiXactIdWait + * is well equipped to deal with this situation on its own. + */ + if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) && + TransactionIdIsCurrentTransactionId(xwait)) + { + /* ... but if the xmax changed in the meantime, start over */ + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || + !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), + xwait)) + goto l3; + Assert(HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask)); + require_sleep = false; + } + + /* + * Time to sleep on the other transaction/multixact, if necessary. + * + * If the other transaction is an update/delete that's already + * committed, then sleeping cannot possibly do any good: if we're + * required to sleep, get out to raise an error instead. + * + * By here, we either have already acquired the buffer exclusive lock, + * or we must wait for the locking transaction or multixact; so below + * we ensure that we grab buffer lock after the sleep. + */ + if (require_sleep && (result == TM_Updated || result == TM_Deleted)) + { + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + goto failed; + } + else if (require_sleep) + { + /* + * Acquire tuple lock to establish our priority for the tuple, or + * die trying. LockTuple will release us when we are next-in-line + * for the tuple. We must do this even if we are share-locking, + * but not if we already have a weaker lock on the tuple. + * + * If we are forced to "start over" below, we keep the tuple lock; + * this arranges that we stay at the head of the line while + * rechecking tuple state. + */ + if (!skip_tuple_lock && + !heap_acquire_tuplock(relation, tid, mode, wait_policy, + &have_tuple_lock)) + { + /* + * This can only happen if wait_policy is Skip and the lock + * couldn't be obtained. + */ + result = TM_WouldBlock; + /* recovery code expects to have buffer lock held */ + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + goto failed; + } + + if (infomask & HEAP_XMAX_IS_MULTI) + { + MultiXactStatus status = get_mxact_status_for_lock(mode, false); + + /* We only ever lock tuples, never update them */ + if (status >= MultiXactStatusNoKeyUpdate) + elog(ERROR, "invalid lock mode in heap_lock_tuple"); + + /* wait for multixact to end, or die trying */ + switch (wait_policy) + { + case LockWaitBlock: + MultiXactIdWait((MultiXactId) xwait, status, infomask, + relation, &tuple->t_self, XLTW_Lock, NULL); + break; + case LockWaitSkip: + if (!ConditionalMultiXactIdWait((MultiXactId) xwait, + status, infomask, relation, + NULL)) + { + result = TM_WouldBlock; + /* recovery code expects to have buffer lock held */ + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + goto failed; + } + break; + case LockWaitError: + if (!ConditionalMultiXactIdWait((MultiXactId) xwait, + status, infomask, relation, + NULL)) + ereport(ERROR, + (errcode(ERRCODE_LOCK_NOT_AVAILABLE), + errmsg("could not obtain lock on row in relation \"%s\"", + RelationGetRelationName(relation)))); + + break; + } + + /* + * Of course, the multixact might not be done here: if we're + * requesting a light lock mode, other transactions with light + * locks could still be alive, as well as locks owned by our + * own xact or other subxacts of this backend. We need to + * preserve the surviving MultiXact members. Note that it + * isn't absolutely necessary in the latter case, but doing so + * is simpler. 
+ */ + } + else + { + /* wait for regular transaction to end, or die trying */ + switch (wait_policy) + { + case LockWaitBlock: + XactLockTableWait(xwait, relation, &tuple->t_self, + XLTW_Lock); + break; + case LockWaitSkip: + if (!ConditionalXactLockTableWait(xwait)) + { + result = TM_WouldBlock; + /* recovery code expects to have buffer lock held */ + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + goto failed; + } + break; + case LockWaitError: + if (!ConditionalXactLockTableWait(xwait)) + ereport(ERROR, + (errcode(ERRCODE_LOCK_NOT_AVAILABLE), + errmsg("could not obtain lock on row in relation \"%s\"", + RelationGetRelationName(relation)))); + break; + } + } + + /* if there are updates, follow the update chain */ + if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask)) + { + TM_Result res; + + res = heap_lock_updated_tuple(relation, tuple, &t_ctid, + GetCurrentTransactionId(), + mode); + if (res != TM_Ok) + { + result = res; + /* recovery code expects to have buffer lock held */ + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + goto failed; + } + } + + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + + /* + * xwait is done, but if xwait had just locked the tuple then some + * other xact could update this tuple before we get to this point. + * Check for xmax change, and start over if so. + */ + if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || + !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), + xwait)) + goto l3; + + if (!(infomask & HEAP_XMAX_IS_MULTI)) + { + /* + * Otherwise check if it committed or aborted. Note we cannot + * be here if the tuple was only locked by somebody who didn't + * conflict with us; that would have been handled above. So + * that transaction must necessarily be gone by now. But + * don't check for this in the multixact case, because some + * locker transactions might still be running. + */ + UpdateXmaxHintBits(tuple->t_data, *buffer, xwait); + } + } + + /* By here, we're certain that we hold buffer exclusive lock again */ + + /* + * We may lock if previous xmax aborted, or if it committed but only + * locked the tuple without updating it; or if we didn't have to wait + * at all for whatever reason. + */ + if (!require_sleep || + (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) || + HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) || + HeapTupleHeaderIsOnlyLocked(tuple->t_data)) + result = TM_Ok; + else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid)) + result = TM_Updated; + else + result = TM_Deleted; + } + +failed: + if (result != TM_Ok) + { + Assert(result == TM_SelfModified || result == TM_Updated || + result == TM_Deleted || result == TM_WouldBlock); + + /* + * When locking a tuple under LockWaitSkip semantics and we fail with + * TM_WouldBlock above, it's possible for concurrent transactions to + * release the lock and set HEAP_XMAX_INVALID in the meantime. So + * this assert is slightly different from the equivalent one in + * heap_delete and heap_update. 
+ */ + Assert((result == TM_WouldBlock) || + !(tuple->t_data->t_infomask & HEAP_XMAX_INVALID)); + Assert(result != TM_Updated || + !ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid)); + tmfd->ctid = tuple->t_data->t_ctid; + tmfd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data); + if (result == TM_SelfModified) + tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data); + else + tmfd->cmax = InvalidCommandId; + goto out_locked; + } + + /* + * If we didn't pin the visibility map page and the page has become all + * visible while we were busy locking the buffer, or during some + * subsequent window during which we had it unlocked, we'll have to unlock + * and re-lock, to avoid holding the buffer lock across I/O. That's a bit + * unfortunate, especially since we'll now have to recheck whether the + * tuple has been locked or updated under us, but hopefully it won't + * happen very often. + */ + if (vmbuffer == InvalidBuffer && PageIsAllVisible(page)) + { + LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); + visibilitymap_pin(relation, block, &vmbuffer); + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + goto l3; + } + + xmax = HeapTupleHeaderGetRawXmax(tuple->t_data); + old_infomask = tuple->t_data->t_infomask; + + /* + * If this is the first possibly-multixact-able operation in the current + * transaction, set my per-backend OldestMemberMXactId setting. We can be + * certain that the transaction will never become a member of any older + * MultiXactIds than that. (We have to do this even if we end up just + * using our own TransactionId below, since some other backend could + * incorporate our XID into a MultiXact immediately afterwards.) + */ + MultiXactIdSetOldestMember(); + + /* + * Compute the new xmax and infomask to store into the tuple. Note we do + * not modify the tuple just yet, because that would leave it in the wrong + * state if multixact.c elogs. + */ + compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2, + GetCurrentTransactionId(), mode, false, + &xid, &new_infomask, &new_infomask2); + + START_CRIT_SECTION(); + + /* + * Store transaction information of xact locking the tuple. + * + * Note: Cmax is meaningless in this context, so don't set it; this avoids + * possibly generating a useless combo CID. Moreover, if we're locking a + * previously updated tuple, it's important to preserve the Cmax. + * + * Also reset the HOT UPDATE bit, but only if there's no update; otherwise + * we would break the HOT chain. + */ + tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS; + tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; + tuple->t_data->t_infomask |= new_infomask; + tuple->t_data->t_infomask2 |= new_infomask2; + if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask)) + HeapTupleHeaderClearHotUpdated(tuple->t_data); + HeapTupleHeaderSetXmax(tuple->t_data, xid); + + /* + * Make sure there is no forward chain link in t_ctid. Note that in the + * cases where the tuple has been updated, we must not overwrite t_ctid, + * because it was set by the updater. Moreover, if the tuple has been + * updated, we need to follow the update chain to lock the new versions of + * the tuple as well. + */ + if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask)) + tuple->t_data->t_ctid = *tid; + + /* Clear only the all-frozen bit on visibility map if needed */ + if (PageIsAllVisible(page) && + visibilitymap_clear(relation, block, vmbuffer, + VISIBILITYMAP_ALL_FROZEN)) + cleared_all_frozen = true; + + + MarkBufferDirty(*buffer); + + /* + * XLOG stuff. 
You might think that we don't need an XLOG record because + * there is no state change worth restoring after a crash. You would be + * wrong however: we have just written either a TransactionId or a + * MultiXactId that may never have been seen on disk before, and we need + * to make sure that there are XLOG entries covering those ID numbers. + * Else the same IDs might be re-used after a crash, which would be + * disastrous if this page made it to disk before the crash. Essentially + * we have to enforce the WAL log-before-data rule even in this case. + * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG + * entries for everything anyway.) + */ + if (RelationNeedsWAL(relation)) + { + xl_heap_lock xlrec; + XLogRecPtr recptr; + + XLogBeginInsert(); + XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD); + + xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self); + xlrec.locking_xid = xid; + xlrec.infobits_set = compute_infobits(new_infomask, + tuple->t_data->t_infomask2); + xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0; + XLogRegisterData((char *) &xlrec, SizeOfHeapLock); + + /* we don't decode row locks atm, so no need to log the origin */ + + recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + result = TM_Ok; + +out_locked: + LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); + +out_unlocked: + if (BufferIsValid(vmbuffer)) + ReleaseBuffer(vmbuffer); + + /* + * Don't update the visibility map here. Locking a tuple doesn't change + * visibility info. + */ + + /* + * Now that we have successfully marked the tuple as locked, we can + * release the lmgr tuple lock, if we had it. + */ + if (have_tuple_lock) + UnlockTupleTuplock(relation, tid, mode); + + return result; +} + +/* + * Acquire heavyweight lock on the given tuple, in preparation for acquiring + * its normal, Xmax-based tuple lock. + * + * have_tuple_lock is an input and output parameter: on input, it indicates + * whether the lock has previously been acquired (and this function does + * nothing in that case). If this function returns success, have_tuple_lock + * has been flipped to true. + * + * Returns false if it was unable to obtain the lock; this can only happen if + * wait_policy is Skip. + */ +static bool +heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode, + LockWaitPolicy wait_policy, bool *have_tuple_lock) +{ + if (*have_tuple_lock) + return true; + + switch (wait_policy) + { + case LockWaitBlock: + LockTupleTuplock(relation, tid, mode); + break; + + case LockWaitSkip: + if (!ConditionalLockTupleTuplock(relation, tid, mode)) + return false; + break; + + case LockWaitError: + if (!ConditionalLockTupleTuplock(relation, tid, mode)) + ereport(ERROR, + (errcode(ERRCODE_LOCK_NOT_AVAILABLE), + errmsg("could not obtain lock on row in relation \"%s\"", + RelationGetRelationName(relation)))); + break; + } + *have_tuple_lock = true; + + return true; +} + +/* + * Given an original set of Xmax and infomask, and a transaction (identified by + * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and + * corresponding infomasks to use on the tuple. + * + * Note that this might have side effects such as creating a new MultiXactId. + * + * Most callers will have called HeapTupleSatisfiesUpdate before this function; + * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId + * but it was not running anymore. 
There is a race condition, which is that the + * MultiXactId may have finished since then, but that uncommon case is handled + * either here, or within MultiXactIdExpand. + * + * There is a similar race condition possible when the old xmax was a regular + * TransactionId. We test TransactionIdIsInProgress again just to narrow the + * window, but it's still possible to end up creating an unnecessary + * MultiXactId. Fortunately this is harmless. + */ +static void +compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask, + uint16 old_infomask2, TransactionId add_to_xmax, + LockTupleMode mode, bool is_update, + TransactionId *result_xmax, uint16 *result_infomask, + uint16 *result_infomask2) +{ + TransactionId new_xmax; + uint16 new_infomask, + new_infomask2; + + Assert(TransactionIdIsCurrentTransactionId(add_to_xmax)); + +l5: + new_infomask = 0; + new_infomask2 = 0; + if (old_infomask & HEAP_XMAX_INVALID) + { + /* + * No previous locker; we just insert our own TransactionId. + * + * Note that it's critical that this case be the first one checked, + * because there are several blocks below that come back to this one + * to implement certain optimizations; old_infomask might contain + * other dirty bits in those cases, but we don't really care. + */ + if (is_update) + { + new_xmax = add_to_xmax; + if (mode == LockTupleExclusive) + new_infomask2 |= HEAP_KEYS_UPDATED; + } + else + { + new_infomask |= HEAP_XMAX_LOCK_ONLY; + switch (mode) + { + case LockTupleKeyShare: + new_xmax = add_to_xmax; + new_infomask |= HEAP_XMAX_KEYSHR_LOCK; + break; + case LockTupleShare: + new_xmax = add_to_xmax; + new_infomask |= HEAP_XMAX_SHR_LOCK; + break; + case LockTupleNoKeyExclusive: + new_xmax = add_to_xmax; + new_infomask |= HEAP_XMAX_EXCL_LOCK; + break; + case LockTupleExclusive: + new_xmax = add_to_xmax; + new_infomask |= HEAP_XMAX_EXCL_LOCK; + new_infomask2 |= HEAP_KEYS_UPDATED; + break; + default: + new_xmax = InvalidTransactionId; /* silence compiler */ + elog(ERROR, "invalid lock mode"); + } + } + } + else if (old_infomask & HEAP_XMAX_IS_MULTI) + { + MultiXactStatus new_status; + + /* + * Currently we don't allow XMAX_COMMITTED to be set for multis, so + * cross-check. + */ + Assert(!(old_infomask & HEAP_XMAX_COMMITTED)); + + /* + * A multixact together with LOCK_ONLY set but neither lock bit set + * (i.e. a pg_upgraded share locked tuple) cannot possibly be running + * anymore. This check is critical for databases upgraded by + * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume + * that such multis are never passed. + */ + if (HEAP_LOCKED_UPGRADED(old_infomask)) + { + old_infomask &= ~HEAP_XMAX_IS_MULTI; + old_infomask |= HEAP_XMAX_INVALID; + goto l5; + } + + /* + * If the XMAX is already a MultiXactId, then we need to expand it to + * include add_to_xmax; but if all the members were lockers and are + * all gone, we can do away with the IS_MULTI bit and just set + * add_to_xmax as the only locker/updater. If all lockers are gone + * and we have an updater that aborted, we can also do without a + * multi. + * + * The cost of doing GetMultiXactIdMembers would be paid by + * MultiXactIdExpand if we weren't to do this, so this check is not + * incurring extra work anyhow. + */ + if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))) + { + if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) || + !TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax, + old_infomask))) + { + /* + * Reset these bits and restart; otherwise fall through to + * create a new multi below. 
+ */ + old_infomask &= ~HEAP_XMAX_IS_MULTI; + old_infomask |= HEAP_XMAX_INVALID; + goto l5; + } + } + + new_status = get_mxact_status_for_lock(mode, is_update); + + new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax, + new_status); + GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); + } + else if (old_infomask & HEAP_XMAX_COMMITTED) + { + /* + * It's a committed update, so we need to preserve him as updater of + * the tuple. + */ + MultiXactStatus status; + MultiXactStatus new_status; + + if (old_infomask2 & HEAP_KEYS_UPDATED) + status = MultiXactStatusUpdate; + else + status = MultiXactStatusNoKeyUpdate; + + new_status = get_mxact_status_for_lock(mode, is_update); + + /* + * since it's not running, it's obviously impossible for the old + * updater to be identical to the current one, so we need not check + * for that case as we do in the block above. + */ + new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status); + GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); + } + else if (TransactionIdIsInProgress(xmax)) + { + /* + * If the XMAX is a valid, in-progress TransactionId, then we need to + * create a new MultiXactId that includes both the old locker or + * updater and our own TransactionId. + */ + MultiXactStatus new_status; + MultiXactStatus old_status; + LockTupleMode old_mode; + + if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)) + { + if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask)) + old_status = MultiXactStatusForKeyShare; + else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask)) + old_status = MultiXactStatusForShare; + else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask)) + { + if (old_infomask2 & HEAP_KEYS_UPDATED) + old_status = MultiXactStatusForUpdate; + else + old_status = MultiXactStatusForNoKeyUpdate; + } + else + { + /* + * LOCK_ONLY can be present alone only when a page has been + * upgraded by pg_upgrade. But in that case, + * TransactionIdIsInProgress() should have returned false. We + * assume it's no longer locked in this case. + */ + elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax); + old_infomask |= HEAP_XMAX_INVALID; + old_infomask &= ~HEAP_XMAX_LOCK_ONLY; + goto l5; + } + } + else + { + /* it's an update, but which kind? */ + if (old_infomask2 & HEAP_KEYS_UPDATED) + old_status = MultiXactStatusUpdate; + else + old_status = MultiXactStatusNoKeyUpdate; + } + + old_mode = TUPLOCK_from_mxstatus(old_status); + + /* + * If the lock to be acquired is for the same TransactionId as the + * existing lock, there's an optimization possible: consider only the + * strongest of both locks as the only one present, and restart. + */ + if (xmax == add_to_xmax) + { + /* + * Note that it's not possible for the original tuple to be + * updated: we wouldn't be here because the tuple would have been + * invisible and we wouldn't try to update it. As a subtlety, + * this code can also run when traversing an update chain to lock + * future versions of a tuple. But we wouldn't be here either, + * because the add_to_xmax would be different from the original + * updater. 
+ */ + Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)); + + /* acquire the strongest of both */ + if (mode < old_mode) + mode = old_mode; + /* mustn't touch is_update */ + + old_infomask |= HEAP_XMAX_INVALID; + goto l5; + } + + /* otherwise, just fall back to creating a new multixact */ + new_status = get_mxact_status_for_lock(mode, is_update); + new_xmax = MultiXactIdCreate(xmax, old_status, + add_to_xmax, new_status); + GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); + } + else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) && + TransactionIdDidCommit(xmax)) + { + /* + * It's a committed update, so we gotta preserve him as updater of the + * tuple. + */ + MultiXactStatus status; + MultiXactStatus new_status; + + if (old_infomask2 & HEAP_KEYS_UPDATED) + status = MultiXactStatusUpdate; + else + status = MultiXactStatusNoKeyUpdate; + + new_status = get_mxact_status_for_lock(mode, is_update); + + /* + * since it's not running, it's obviously impossible for the old + * updater to be identical to the current one, so we need not check + * for that case as we do in the block above. + */ + new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status); + GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); + } + else + { + /* + * Can get here iff the locking/updating transaction was running when + * the infomask was extracted from the tuple, but finished before + * TransactionIdIsInProgress got to run. Deal with it as if there was + * no locker at all in the first place. + */ + old_infomask |= HEAP_XMAX_INVALID; + goto l5; + } + + *result_infomask = new_infomask; + *result_infomask2 = new_infomask2; + *result_xmax = new_xmax; +} + +/* + * Subroutine for heap_lock_updated_tuple_rec. + * + * Given a hypothetical multixact status held by the transaction identified + * with the given xid, does the current transaction need to wait, fail, or can + * it continue if it wanted to acquire a lock of the given mode? "needwait" + * is set to true if waiting is necessary; if it can continue, then TM_Ok is + * returned. If the lock is already held by the current transaction, return + * TM_SelfModified. In case of a conflict with another transaction, a + * different HeapTupleSatisfiesUpdate return code is returned. + * + * The held status is said to be hypothetical because it might correspond to a + * lock held by a single Xid, i.e. not a real MultiXactId; we express it this + * way for simplicity of API. + */ +static TM_Result +test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid, + LockTupleMode mode, HeapTuple tup, + bool *needwait) +{ + MultiXactStatus wantedstatus; + + *needwait = false; + wantedstatus = get_mxact_status_for_lock(mode, false); + + /* + * Note: we *must* check TransactionIdIsInProgress before + * TransactionIdDidAbort/Commit; see comment at top of heapam_visibility.c + * for an explanation. + */ + if (TransactionIdIsCurrentTransactionId(xid)) + { + /* + * The tuple has already been locked by our own transaction. This is + * very rare but can happen if multiple transactions are trying to + * lock an ancient version of the same tuple. + */ + return TM_SelfModified; + } + else if (TransactionIdIsInProgress(xid)) + { + /* + * If the locking transaction is running, what we do depends on + * whether the lock modes conflict: if they do, then we must wait for + * it to finish; otherwise we can fall through to lock this tuple + * version without waiting. 
+ */ + if (DoLockModesConflict(LOCKMODE_from_mxstatus(status), + LOCKMODE_from_mxstatus(wantedstatus))) + { + *needwait = true; + } + + /* + * If we set needwait above, then this value doesn't matter; + * otherwise, this value signals to caller that it's okay to proceed. + */ + return TM_Ok; + } + else if (TransactionIdDidAbort(xid)) + return TM_Ok; + else if (TransactionIdDidCommit(xid)) + { + /* + * The other transaction committed. If it was only a locker, then the + * lock is completely gone now and we can return success; but if it + * was an update, then what we do depends on whether the two lock + * modes conflict. If they conflict, then we must report error to + * caller. But if they don't, we can fall through to allow the current + * transaction to lock the tuple. + * + * Note: the reason we worry about ISUPDATE here is because as soon as + * a transaction ends, all its locks are gone and meaningless, and + * thus we can ignore them; whereas its updates persist. In the + * TransactionIdIsInProgress case, above, we don't need to check + * because we know the lock is still "alive" and thus a conflict needs + * always be checked. + */ + if (!ISUPDATE_from_mxstatus(status)) + return TM_Ok; + + if (DoLockModesConflict(LOCKMODE_from_mxstatus(status), + LOCKMODE_from_mxstatus(wantedstatus))) + { + /* bummer */ + if (!ItemPointerEquals(&tup->t_self, &tup->t_data->t_ctid)) + return TM_Updated; + else + return TM_Deleted; + } + + return TM_Ok; + } + + /* Not in progress, not aborted, not committed -- must have crashed */ + return TM_Ok; +} + + +/* + * Recursive part of heap_lock_updated_tuple + * + * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given + * xid with the given mode; if this tuple is updated, recurse to lock the new + * version as well. + */ +static TM_Result +heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid, + LockTupleMode mode) +{ + TM_Result result; + ItemPointerData tupid; + HeapTupleData mytup; + Buffer buf; + uint16 new_infomask, + new_infomask2, + old_infomask, + old_infomask2; + TransactionId xmax, + new_xmax; + TransactionId priorXmax = InvalidTransactionId; + bool cleared_all_frozen = false; + bool pinned_desired_page; + Buffer vmbuffer = InvalidBuffer; + BlockNumber block; + + ItemPointerCopy(tid, &tupid); + + for (;;) + { + new_infomask = 0; + new_xmax = InvalidTransactionId; + block = ItemPointerGetBlockNumber(&tupid); + ItemPointerCopy(&tupid, &(mytup.t_self)); + + if (!heap_fetch(rel, SnapshotAny, &mytup, &buf)) + { + /* + * if we fail to find the updated version of the tuple, it's + * because it was vacuumed/pruned away after its creator + * transaction aborted. So behave as if we got to the end of the + * chain, and there's no further tuple to lock: return success to + * caller. + */ + result = TM_Ok; + goto out_unlocked; + } + +l4: + CHECK_FOR_INTERRUPTS(); + + /* + * Before locking the buffer, pin the visibility map page if it + * appears to be necessary. Since we haven't got the lock yet, + * someone else might be in the middle of changing this, so we'll need + * to recheck after we have the lock. 
+ */ + if (PageIsAllVisible(BufferGetPage(buf))) + { + visibilitymap_pin(rel, block, &vmbuffer); + pinned_desired_page = true; + } + else + pinned_desired_page = false; + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* + * If we didn't pin the visibility map page and the page has become + * all visible while we were busy locking the buffer, we'll have to + * unlock and re-lock, to avoid holding the buffer lock across I/O. + * That's a bit unfortunate, but hopefully shouldn't happen often. + * + * Note: in some paths through this function, we will reach here + * holding a pin on a vm page that may or may not be the one matching + * this page. If this page isn't all-visible, we won't use the vm + * page, but we hold onto such a pin till the end of the function. + */ + if (!pinned_desired_page && PageIsAllVisible(BufferGetPage(buf))) + { + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + visibilitymap_pin(rel, block, &vmbuffer); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + } + + /* + * Check the tuple XMIN against prior XMAX, if any. If we reached the + * end of the chain, we're done, so return success. + */ + if (TransactionIdIsValid(priorXmax) && + !TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data), + priorXmax)) + { + result = TM_Ok; + goto out_locked; + } + + /* + * Also check Xmin: if this tuple was created by an aborted + * (sub)transaction, then we already locked the last live one in the + * chain, thus we're done, so return success. + */ + if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data))) + { + result = TM_Ok; + goto out_locked; + } + + old_infomask = mytup.t_data->t_infomask; + old_infomask2 = mytup.t_data->t_infomask2; + xmax = HeapTupleHeaderGetRawXmax(mytup.t_data); + + /* + * If this tuple version has been updated or locked by some concurrent + * transaction(s), what we do depends on whether our lock mode + * conflicts with what those other transactions hold, and also on the + * status of them. + */ + if (!(old_infomask & HEAP_XMAX_INVALID)) + { + TransactionId rawxmax; + bool needwait; + + rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data); + if (old_infomask & HEAP_XMAX_IS_MULTI) + { + int nmembers; + int i; + MultiXactMember *members; + + /* + * We don't need a test for pg_upgrade'd tuples: this is only + * applied to tuples after the first in an update chain. Said + * first tuple in the chain may well be locked-in-9.2-and- + * pg_upgraded, but that one was already locked by our caller, + * not us; and any subsequent ones cannot be because our + * caller must necessarily have obtained a snapshot later than + * the pg_upgrade itself. + */ + Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask)); + + nmembers = GetMultiXactIdMembers(rawxmax, &members, false, + HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)); + for (i = 0; i < nmembers; i++) + { + result = test_lockmode_for_conflict(members[i].status, + members[i].xid, + mode, + &mytup, + &needwait); + + /* + * If the tuple was already locked by ourselves in a + * previous iteration of this (say heap_lock_tuple was + * forced to restart the locking loop because of a change + * in xmax), then we hold the lock already on this tuple + * version and we don't need to do anything; and this is + * not an error condition either. We just need to skip + * this tuple and continue locking the next version in the + * update chain. 
+ */ + if (result == TM_SelfModified) + { + pfree(members); + goto next; + } + + if (needwait) + { + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + XactLockTableWait(members[i].xid, rel, + &mytup.t_self, + XLTW_LockUpdated); + pfree(members); + goto l4; + } + if (result != TM_Ok) + { + pfree(members); + goto out_locked; + } + } + if (members) + pfree(members); + } + else + { + MultiXactStatus status; + + /* + * For a non-multi Xmax, we first need to compute the + * corresponding MultiXactStatus by using the infomask bits. + */ + if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)) + { + if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask)) + status = MultiXactStatusForKeyShare; + else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask)) + status = MultiXactStatusForShare; + else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask)) + { + if (old_infomask2 & HEAP_KEYS_UPDATED) + status = MultiXactStatusForUpdate; + else + status = MultiXactStatusForNoKeyUpdate; + } + else + { + /* + * LOCK_ONLY present alone (a pg_upgraded tuple marked + * as share-locked in the old cluster) shouldn't be + * seen in the middle of an update chain. + */ + elog(ERROR, "invalid lock status in tuple"); + } + } + else + { + /* it's an update, but which kind? */ + if (old_infomask2 & HEAP_KEYS_UPDATED) + status = MultiXactStatusUpdate; + else + status = MultiXactStatusNoKeyUpdate; + } + + result = test_lockmode_for_conflict(status, rawxmax, mode, + &mytup, &needwait); + + /* + * If the tuple was already locked by ourselves in a previous + * iteration of this (say heap_lock_tuple was forced to + * restart the locking loop because of a change in xmax), then + * we hold the lock already on this tuple version and we don't + * need to do anything; and this is not an error condition + * either. We just need to skip this tuple and continue + * locking the next version in the update chain. + */ + if (result == TM_SelfModified) + goto next; + + if (needwait) + { + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + XactLockTableWait(rawxmax, rel, &mytup.t_self, + XLTW_LockUpdated); + goto l4; + } + if (result != TM_Ok) + { + goto out_locked; + } + } + } + + /* compute the new Xmax and infomask values for the tuple ... */ + compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2, + xid, mode, false, + &new_xmax, &new_infomask, &new_infomask2); + + if (PageIsAllVisible(BufferGetPage(buf)) && + visibilitymap_clear(rel, block, vmbuffer, + VISIBILITYMAP_ALL_FROZEN)) + cleared_all_frozen = true; + + START_CRIT_SECTION(); + + /* ... and set them */ + HeapTupleHeaderSetXmax(mytup.t_data, new_xmax); + mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS; + mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; + mytup.t_data->t_infomask |= new_infomask; + mytup.t_data->t_infomask2 |= new_infomask2; + + MarkBufferDirty(buf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_heap_lock_updated xlrec; + XLogRecPtr recptr; + Page page = BufferGetPage(buf); + + XLogBeginInsert(); + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + + xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self); + xlrec.xmax = new_xmax; + xlrec.infobits_set = compute_infobits(new_infomask, new_infomask2); + xlrec.flags = + cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0; + + XLogRegisterData((char *) &xlrec, SizeOfHeapLockUpdated); + + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + +next: + /* if we find the end of update chain, we're done. 
*/ + if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID || + HeapTupleHeaderIndicatesMovedPartitions(mytup.t_data) || + ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) || + HeapTupleHeaderIsOnlyLocked(mytup.t_data)) + { + result = TM_Ok; + goto out_locked; + } + + /* tail recursion */ + priorXmax = HeapTupleHeaderGetUpdateXid(mytup.t_data); + ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid); + UnlockReleaseBuffer(buf); + } + + result = TM_Ok; + +out_locked: + UnlockReleaseBuffer(buf); + +out_unlocked: + if (vmbuffer != InvalidBuffer) + ReleaseBuffer(vmbuffer); + + return result; +} + +/* + * heap_lock_updated_tuple + * Follow update chain when locking an updated tuple, acquiring locks (row + * marks) on the updated versions. + * + * The initial tuple is assumed to be already locked. + * + * This function doesn't check visibility, it just unconditionally marks the + * tuple(s) as locked. If any tuple in the updated chain is being deleted + * concurrently (or updated with the key being modified), sleep until the + * transaction doing it is finished. + * + * Note that we don't acquire heavyweight tuple locks on the tuples we walk + * when we have to wait for other transactions to release them, as opposed to + * what heap_lock_tuple does. The reason is that having more than one + * transaction walking the chain is probably uncommon enough that risk of + * starvation is not likely: one of the preconditions for being here is that + * the snapshot in use predates the update that created this tuple (because we + * started at an earlier version of the tuple), but at the same time such a + * transaction cannot be using repeatable read or serializable isolation + * levels, because that would lead to a serializability failure. + */ +static TM_Result +heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid, + TransactionId xid, LockTupleMode mode) +{ + /* + * If the tuple has not been updated, or has moved into another partition + * (effectively a delete) stop here. + */ + if (!HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data) && + !ItemPointerEquals(&tuple->t_self, ctid)) + { + /* + * If this is the first possibly-multixact-able operation in the + * current transaction, set my per-backend OldestMemberMXactId + * setting. We can be certain that the transaction will never become a + * member of any older MultiXactIds than that. (We have to do this + * even if we end up just using our own TransactionId below, since + * some other backend could incorporate our XID into a MultiXact + * immediately afterwards.) + */ + MultiXactIdSetOldestMember(); + + return heap_lock_updated_tuple_rec(rel, ctid, xid, mode); + } + + /* nothing to lock */ + return TM_Ok; +} + +/* + * heap_finish_speculative - mark speculative insertion as successful + * + * To successfully finish a speculative insertion we have to clear speculative + * token from tuple. To do so the t_ctid field, which will contain a + * speculative token value, is modified in place to point to the tuple itself, + * which is characteristic of a newly inserted ordinary tuple. + * + * NB: It is not ok to commit without either finishing or aborting a + * speculative insertion. We could treat speculative tuples of committed + * transactions implicitly as completed, but then we would have to be prepared + * to deal with speculative tokens on committed tuples. That wouldn't be + * difficult - no-one looks at the ctid field of a tuple with invalid xmax - + * but clearing the token at completion isn't very expensive either. 
+ * An explicit confirmation WAL record also makes logical decoding simpler. + */ +void +heap_finish_speculative(Relation relation, ItemPointer tid) +{ + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId lp = NULL; + HeapTupleHeader htup; + + buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + page = (Page) BufferGetPage(buffer); + + offnum = ItemPointerGetOffsetNumber(tid); + if (PageGetMaxOffsetNumber(page) >= offnum) + lp = PageGetItemId(page, offnum); + + if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + elog(ERROR, "invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + /* SpecTokenOffsetNumber should be distinguishable from any real offset */ + StaticAssertStmt(MaxOffsetNumber < SpecTokenOffsetNumber, + "invalid speculative token constant"); + + /* NO EREPORT(ERROR) from here till changes are logged */ + START_CRIT_SECTION(); + + Assert(HeapTupleHeaderIsSpeculative(htup)); + + MarkBufferDirty(buffer); + + /* + * Replace the speculative insertion token with a real t_ctid, pointing to + * itself like it does on regular tuples. + */ + htup->t_ctid = *tid; + + /* XLOG stuff */ + if (RelationNeedsWAL(relation)) + { + xl_heap_confirm xlrec; + XLogRecPtr recptr; + + xlrec.offnum = ItemPointerGetOffsetNumber(tid); + + XLogBeginInsert(); + + /* We want the same filtering on this as on a plain insert */ + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); + + XLogRegisterData((char *) &xlrec, SizeOfHeapConfirm); + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CONFIRM); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + UnlockReleaseBuffer(buffer); +} + +/* + * heap_abort_speculative - kill a speculatively inserted tuple + * + * Marks a tuple that was speculatively inserted in the same command as dead, + * by setting its xmin as invalid. That makes it immediately appear as dead + * to all transactions, including our own. In particular, it makes + * HeapTupleSatisfiesDirty() regard the tuple as dead, so that another backend + * inserting a duplicate key value won't unnecessarily wait for our whole + * transaction to finish (it'll just wait for our speculative insertion to + * finish). + * + * Killing the tuple prevents "unprincipled deadlocks", which are deadlocks + * that arise due to a mutual dependency that is not user visible. By + * definition, unprincipled deadlocks cannot be prevented by the user + * reordering lock acquisition in client code, because the implementation level + * lock acquisitions are not under the user's direct control. If speculative + * inserters did not take this precaution, then under high concurrency they + * could deadlock with each other, which would not be acceptable. + * + * This is somewhat redundant with heap_delete, but we prefer to have a + * dedicated routine with stripped down requirements. Note that this is also + * used to delete the TOAST tuples created during speculative insertion. + * + * This routine does not affect logical decoding as it only looks at + * confirmation records. 
+ */
+void
+heap_abort_speculative(Relation relation, ItemPointer tid)
+{
+	TransactionId xid = GetCurrentTransactionId();
+	ItemId		lp;
+	HeapTupleData tp;
+	Page		page;
+	BlockNumber block;
+	Buffer		buffer;
+	TransactionId prune_xid;
+
+	Assert(ItemPointerIsValid(tid));
+
+	block = ItemPointerGetBlockNumber(tid);
+	buffer = ReadBuffer(relation, block);
+	page = BufferGetPage(buffer);
+
+	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+	/*
+	 * Page can't be all visible, we just inserted into it, and are still
+	 * running.
+	 */
+	Assert(!PageIsAllVisible(page));
+
+	lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
+	Assert(ItemIdIsNormal(lp));
+
+	tp.t_tableOid = RelationGetRelid(relation);
+	tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
+	tp.t_len = ItemIdGetLength(lp);
+	tp.t_self = *tid;
+
+	/*
+	 * Sanity check that the tuple really is a speculatively inserted tuple,
+	 * inserted by us.
+	 */
+	if (tp.t_data->t_choice.t_heap.t_xmin != xid)
+		elog(ERROR, "attempted to kill a tuple inserted by another transaction");
+	if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data)))
+		elog(ERROR, "attempted to kill a non-speculative tuple");
+	Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data));
+
+	/*
+	 * No need to check for serializable conflicts here.  There is never a
+	 * need for a combo CID, either.  No need to extract replica identity, or
+	 * do anything special with infomask bits.
+	 */
+
+	START_CRIT_SECTION();
+
+	/*
+	 * The tuple will become DEAD immediately.  Flag that this page is a
+	 * candidate for pruning by setting xmin to TransactionXmin. While not
+	 * immediately prunable, it is the oldest xid we can cheaply determine
+	 * that's safe against wraparound / being older than the table's
+	 * relfrozenxid.  To defend against the unlikely case of a new relation
+	 * having a newer relfrozenxid than our TransactionXmin, use relfrozenxid
+	 * if so (vacuum can't subsequently move relfrozenxid to beyond
+	 * TransactionXmin, so there's no race here).
+	 */
+	Assert(TransactionIdIsValid(TransactionXmin));
+	if (TransactionIdPrecedes(TransactionXmin, relation->rd_rel->relfrozenxid))
+		prune_xid = relation->rd_rel->relfrozenxid;
+	else
+		prune_xid = TransactionXmin;
+	PageSetPrunable(page, prune_xid);
+
+	/* store transaction information of xact deleting the tuple */
+	tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
+	tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
+
+	/*
+	 * Set the tuple header xmin to InvalidTransactionId.  This makes the
+	 * tuple immediately invisible to everyone.  (In particular, to any
+	 * transactions waiting on the speculative token, woken up later.)
+	 */
+	HeapTupleHeaderSetXmin(tp.t_data, InvalidTransactionId);
+
+	/* Clear the speculative insertion token too */
+	tp.t_data->t_ctid = tp.t_self;
+
+	MarkBufferDirty(buffer);
+
+	/*
+	 * XLOG stuff
+	 *
+	 * The WAL records generated here match heap_delete().  The same recovery
+	 * routines are used.
+ */ + if (RelationNeedsWAL(relation)) + { + xl_heap_delete xlrec; + XLogRecPtr recptr; + + xlrec.flags = XLH_DELETE_IS_SUPER; + xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask, + tp.t_data->t_infomask2); + xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); + xlrec.xmax = xid; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapDelete); + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + + /* No replica identity & replication origin logged */ + + recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + if (HeapTupleHasExternal(&tp)) + { + Assert(!IsToastRelation(relation)); + heap_toast_delete(relation, &tp, true); + } + + /* + * Never need to mark tuple for invalidation, since catalogs don't support + * speculative insertion + */ + + /* Now we can release the buffer */ + ReleaseBuffer(buffer); + + /* count deletion, as we counted the insertion too */ + pgstat_count_heap_delete(relation); +} + +/* + * heap_inplace_update - update a tuple "in place" (ie, overwrite it) + * + * Overwriting violates both MVCC and transactional safety, so the uses + * of this function in Postgres are extremely limited. Nonetheless we + * find some places to use it. + * + * The tuple cannot change size, and therefore it's reasonable to assume + * that its null bitmap (if any) doesn't change either. So we just + * overwrite the data portion of the tuple without touching the null + * bitmap or any of the header fields. + * + * tuple is an in-memory tuple structure containing the data to be written + * over the target tuple. Also, tuple->t_self identifies the target tuple. + * + * Note that the tuple updated here had better not come directly from the + * syscache if the relation has a toast relation as this tuple could + * include toast values that have been expanded, causing a failure here. + */ +void +heap_inplace_update(Relation relation, HeapTuple tuple) +{ + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId lp = NULL; + HeapTupleHeader htup; + uint32 oldlen; + uint32 newlen; + + /* + * For now, we don't allow parallel updates. Unlike a regular update, + * this should never create a combo CID, so it might be possible to relax + * this restriction, but not without more thought and testing. It's not + * clear that it would be useful, anyway. 
+ */ + if (IsInParallelMode()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot update tuples during a parallel operation"))); + + buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self))); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + page = (Page) BufferGetPage(buffer); + + offnum = ItemPointerGetOffsetNumber(&(tuple->t_self)); + if (PageGetMaxOffsetNumber(page) >= offnum) + lp = PageGetItemId(page, offnum); + + if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + elog(ERROR, "invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + oldlen = ItemIdGetLength(lp) - htup->t_hoff; + newlen = tuple->t_len - tuple->t_data->t_hoff; + if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff) + elog(ERROR, "wrong tuple length"); + + /* NO EREPORT(ERROR) from here till changes are logged */ + START_CRIT_SECTION(); + + memcpy((char *) htup + htup->t_hoff, + (char *) tuple->t_data + tuple->t_data->t_hoff, + newlen); + + MarkBufferDirty(buffer); + + /* XLOG stuff */ + if (RelationNeedsWAL(relation)) + { + xl_heap_inplace xlrec; + XLogRecPtr recptr; + + xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self); + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapInplace); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen); + + /* inplace updates aren't decoded atm, don't log the origin */ + + recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + UnlockReleaseBuffer(buffer); + + /* + * Send out shared cache inval if necessary. Note that because we only + * pass the new version of the tuple, this mustn't be used for any + * operations that could change catcache lookup keys. But we aren't + * bothering with index updates either, so that's true a fortiori. + */ + if (!IsBootstrapProcessingMode()) + CacheInvalidateHeapTuple(relation, tuple, NULL); +} + +#define FRM_NOOP 0x0001 +#define FRM_INVALIDATE_XMAX 0x0002 +#define FRM_RETURN_IS_XID 0x0004 +#define FRM_RETURN_IS_MULTI 0x0008 +#define FRM_MARK_COMMITTED 0x0010 + +/* + * FreezeMultiXactId + * Determine what to do during freezing when a tuple is marked by a + * MultiXactId. + * + * NB -- this might have the side-effect of creating a new MultiXactId! + * + * "flags" is an output value; it's used to tell caller what to do on return. + * Possible flags are: + * FRM_NOOP + * don't do anything -- keep existing Xmax + * FRM_INVALIDATE_XMAX + * mark Xmax as InvalidTransactionId and set XMAX_INVALID flag. + * FRM_RETURN_IS_XID + * The Xid return value is a single update Xid to set as xmax. + * FRM_MARK_COMMITTED + * Xmax can be marked as HEAP_XMAX_COMMITTED + * FRM_RETURN_IS_MULTI + * The return value is a new MultiXactId to set as new Xmax. 
+ * (caller must obtain proper infomask bits using GetMultiXactIdHintBits) + */ +static TransactionId +FreezeMultiXactId(MultiXactId multi, uint16 t_infomask, + TransactionId relfrozenxid, TransactionId relminmxid, + TransactionId cutoff_xid, MultiXactId cutoff_multi, + uint16 *flags) +{ + TransactionId xid = InvalidTransactionId; + int i; + MultiXactMember *members; + int nmembers; + bool need_replace; + int nnewmembers; + MultiXactMember *newmembers; + bool has_lockers; + TransactionId update_xid; + bool update_committed; + + *flags = 0; + + /* We should only be called in Multis */ + Assert(t_infomask & HEAP_XMAX_IS_MULTI); + + if (!MultiXactIdIsValid(multi) || + HEAP_LOCKED_UPGRADED(t_infomask)) + { + /* Ensure infomask bits are appropriately set/reset */ + *flags |= FRM_INVALIDATE_XMAX; + return InvalidTransactionId; + } + else if (MultiXactIdPrecedes(multi, relminmxid)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("found multixact %u from before relminmxid %u", + multi, relminmxid))); + else if (MultiXactIdPrecedes(multi, cutoff_multi)) + { + /* + * This old multi cannot possibly have members still running, but + * verify just in case. If it was a locker only, it can be removed + * without any further consideration; but if it contained an update, + * we might need to preserve it. + */ + if (MultiXactIdIsRunning(multi, + HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("multixact %u from before cutoff %u found to be still running", + multi, cutoff_multi))); + + if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)) + { + *flags |= FRM_INVALIDATE_XMAX; + xid = InvalidTransactionId; /* not strictly necessary */ + } + else + { + /* replace multi by update xid */ + xid = MultiXactIdGetUpdateXid(multi, t_infomask); + + /* wasn't only a lock, xid needs to be valid */ + Assert(TransactionIdIsValid(xid)); + + if (TransactionIdPrecedes(xid, relfrozenxid)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("found update xid %u from before relfrozenxid %u", + xid, relfrozenxid))); + + /* + * If the xid is older than the cutoff, it has to have aborted, + * otherwise the tuple would have gotten pruned away. + */ + if (TransactionIdPrecedes(xid, cutoff_xid)) + { + if (TransactionIdDidCommit(xid)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("cannot freeze committed update xid %u", xid))); + *flags |= FRM_INVALIDATE_XMAX; + xid = InvalidTransactionId; /* not strictly necessary */ + } + else + { + *flags |= FRM_RETURN_IS_XID; + } + } + + return xid; + } + + /* + * This multixact might have or might not have members still running, but + * we know it's valid and is newer than the cutoff point for multis. + * However, some member(s) of it may be below the cutoff for Xids, so we + * need to walk the whole members array to figure out what to do, if + * anything. + */ + + nmembers = + GetMultiXactIdMembers(multi, &members, false, + HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)); + if (nmembers <= 0) + { + /* Nothing worth keeping */ + *flags |= FRM_INVALIDATE_XMAX; + return InvalidTransactionId; + } + + /* is there anything older than the cutoff? */ + need_replace = false; + for (i = 0; i < nmembers; i++) + { + if (TransactionIdPrecedes(members[i].xid, cutoff_xid)) + { + need_replace = true; + break; + } + } + + /* + * In the simplest case, there is no member older than the cutoff; we can + * keep the existing MultiXactId as is. 
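+	 *
+	 * (Illustration with made-up values: if cutoff_xid is 1000 and the
+	 * multi's members are an updater with xid 1005 and a locker with xid
+	 * 1010, no member precedes the cutoff, so we report FRM_NOOP and leave
+	 * the existing multi in place.)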
+ */ + if (!need_replace) + { + *flags |= FRM_NOOP; + pfree(members); + return InvalidTransactionId; + } + + /* + * If the multi needs to be updated, figure out which members do we need + * to keep. + */ + nnewmembers = 0; + newmembers = palloc(sizeof(MultiXactMember) * nmembers); + has_lockers = false; + update_xid = InvalidTransactionId; + update_committed = false; + + for (i = 0; i < nmembers; i++) + { + /* + * Determine whether to keep this member or ignore it. + */ + if (ISUPDATE_from_mxstatus(members[i].status)) + { + TransactionId xid = members[i].xid; + + Assert(TransactionIdIsValid(xid)); + if (TransactionIdPrecedes(xid, relfrozenxid)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("found update xid %u from before relfrozenxid %u", + xid, relfrozenxid))); + + /* + * It's an update; should we keep it? If the transaction is known + * aborted or crashed then it's okay to ignore it, otherwise not. + * Note that an updater older than cutoff_xid cannot possibly be + * committed, because HeapTupleSatisfiesVacuum would have returned + * HEAPTUPLE_DEAD and we would not be trying to freeze the tuple. + * + * As with all tuple visibility routines, it's critical to test + * TransactionIdIsInProgress before TransactionIdDidCommit, + * because of race conditions explained in detail in + * heapam_visibility.c. + */ + if (TransactionIdIsCurrentTransactionId(xid) || + TransactionIdIsInProgress(xid)) + { + Assert(!TransactionIdIsValid(update_xid)); + update_xid = xid; + } + else if (TransactionIdDidCommit(xid)) + { + /* + * The transaction committed, so we can tell caller to set + * HEAP_XMAX_COMMITTED. (We can only do this because we know + * the transaction is not running.) + */ + Assert(!TransactionIdIsValid(update_xid)); + update_committed = true; + update_xid = xid; + } + else + { + /* + * Not in progress, not committed -- must be aborted or + * crashed; we can ignore it. + */ + } + + /* + * Since the tuple wasn't marked HEAPTUPLE_DEAD by vacuum, the + * update Xid cannot possibly be older than the xid cutoff. The + * presence of such a tuple would cause corruption, so be paranoid + * and check. + */ + if (TransactionIdIsValid(update_xid) && + TransactionIdPrecedes(update_xid, cutoff_xid)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("found update xid %u from before xid cutoff %u", + update_xid, cutoff_xid))); + + /* + * If we determined that it's an Xid corresponding to an update + * that must be retained, additionally add it to the list of + * members of the new Multi, in case we end up using that. (We + * might still decide to use only an update Xid and not a multi, + * but it's easier to maintain the list as we walk the old members + * list.) + */ + if (TransactionIdIsValid(update_xid)) + newmembers[nnewmembers++] = members[i]; + } + else + { + /* We only keep lockers if they are still running */ + if (TransactionIdIsCurrentTransactionId(members[i].xid) || + TransactionIdIsInProgress(members[i].xid)) + { + /* running locker cannot possibly be older than the cutoff */ + Assert(!TransactionIdPrecedes(members[i].xid, cutoff_xid)); + newmembers[nnewmembers++] = members[i]; + has_lockers = true; + } + } + } + + pfree(members); + + if (nnewmembers == 0) + { + /* nothing worth keeping!? 
Tell caller to remove the whole thing */ + *flags |= FRM_INVALIDATE_XMAX; + xid = InvalidTransactionId; + } + else if (TransactionIdIsValid(update_xid) && !has_lockers) + { + /* + * If there's a single member and it's an update, pass it back alone + * without creating a new Multi. (XXX we could do this when there's a + * single remaining locker, too, but that would complicate the API too + * much; moreover, the case with the single updater is more + * interesting, because those are longer-lived.) + */ + Assert(nnewmembers == 1); + *flags |= FRM_RETURN_IS_XID; + if (update_committed) + *flags |= FRM_MARK_COMMITTED; + xid = update_xid; + } + else + { + /* + * Create a new multixact with the surviving members of the previous + * one, to set as new Xmax in the tuple. + */ + xid = MultiXactIdCreateFromMembers(nnewmembers, newmembers); + *flags |= FRM_RETURN_IS_MULTI; + } + + pfree(newmembers); + + return xid; +} + +/* + * heap_prepare_freeze_tuple + * + * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac) + * are older than the specified cutoff XID and cutoff MultiXactId. If so, + * setup enough state (in the *frz output argument) to later execute and + * WAL-log what we would need to do, and return true. Return false if nothing + * is to be changed. In addition, set *totally_frozen_p to true if the tuple + * will be totally frozen after these operations are performed and false if + * more freezing will eventually be required. + * + * Caller is responsible for setting the offset field, if appropriate. + * + * It is assumed that the caller has checked the tuple with + * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD + * (else we should be removing the tuple, not freezing it). + * + * NB: cutoff_xid *must* be <= the current global xmin, to ensure that any + * XID older than it could neither be running nor seen as running by any + * open transaction. This ensures that the replacement will not change + * anyone's idea of the tuple state. + * Similarly, cutoff_multi must be less than or equal to the smallest + * MultiXactId used by any transaction currently open. + * + * If the tuple is in a shared buffer, caller must hold an exclusive lock on + * that buffer. + * + * NB: It is not enough to set hint bits to indicate something is + * committed/invalid -- they might not be set on a standby, or after crash + * recovery. We really need to remove old xids. + */ +bool +heap_prepare_freeze_tuple(HeapTupleHeader tuple, + TransactionId relfrozenxid, TransactionId relminmxid, + TransactionId cutoff_xid, TransactionId cutoff_multi, + xl_heap_freeze_tuple *frz, bool *totally_frozen_p) +{ + bool changed = false; + bool xmax_already_frozen = false; + bool xmin_frozen; + bool freeze_xmax; + TransactionId xid; + + frz->frzflags = 0; + frz->t_infomask2 = tuple->t_infomask2; + frz->t_infomask = tuple->t_infomask; + frz->xmax = HeapTupleHeaderGetRawXmax(tuple); + + /* + * Process xmin. xmin_frozen has two slightly different meanings: in the + * !XidIsNormal case, it means "the xmin doesn't need any freezing" (it's + * already a permanent value), while in the block below it is set true to + * mean "xmin won't need freezing after what we do to it here" (false + * otherwise). In both cases we're allowed to set totally_frozen, as far + * as xmin is concerned. 
+ */ + xid = HeapTupleHeaderGetXmin(tuple); + if (!TransactionIdIsNormal(xid)) + xmin_frozen = true; + else + { + if (TransactionIdPrecedes(xid, relfrozenxid)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("found xmin %u from before relfrozenxid %u", + xid, relfrozenxid))); + + xmin_frozen = TransactionIdPrecedes(xid, cutoff_xid); + if (xmin_frozen) + { + if (!TransactionIdDidCommit(xid)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("uncommitted xmin %u from before xid cutoff %u needs to be frozen", + xid, cutoff_xid))); + + frz->t_infomask |= HEAP_XMIN_FROZEN; + changed = true; + } + } + + /* + * Process xmax. To thoroughly examine the current Xmax value we need to + * resolve a MultiXactId to its member Xids, in case some of them are + * below the given cutoff for Xids. In that case, those values might need + * freezing, too. Also, if a multi needs freezing, we cannot simply take + * it out --- if there's a live updater Xid, it needs to be kept. + * + * Make sure to keep heap_tuple_needs_freeze in sync with this. + */ + xid = HeapTupleHeaderGetRawXmax(tuple); + + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId newxmax; + uint16 flags; + + newxmax = FreezeMultiXactId(xid, tuple->t_infomask, + relfrozenxid, relminmxid, + cutoff_xid, cutoff_multi, &flags); + + freeze_xmax = (flags & FRM_INVALIDATE_XMAX); + + if (flags & FRM_RETURN_IS_XID) + { + /* + * NB -- some of these transformations are only valid because we + * know the return Xid is a tuple updater (i.e. not merely a + * locker.) Also note that the only reason we don't explicitly + * worry about HEAP_KEYS_UPDATED is because it lives in + * t_infomask2 rather than t_infomask. + */ + frz->t_infomask &= ~HEAP_XMAX_BITS; + frz->xmax = newxmax; + if (flags & FRM_MARK_COMMITTED) + frz->t_infomask |= HEAP_XMAX_COMMITTED; + changed = true; + } + else if (flags & FRM_RETURN_IS_MULTI) + { + uint16 newbits; + uint16 newbits2; + + /* + * We can't use GetMultiXactIdHintBits directly on the new multi + * here; that routine initializes the masks to all zeroes, which + * would lose other bits we need. Doing it this way ensures all + * unrelated bits remain untouched. + */ + frz->t_infomask &= ~HEAP_XMAX_BITS; + frz->t_infomask2 &= ~HEAP_KEYS_UPDATED; + GetMultiXactIdHintBits(newxmax, &newbits, &newbits2); + frz->t_infomask |= newbits; + frz->t_infomask2 |= newbits2; + + frz->xmax = newxmax; + + changed = true; + } + } + else if (TransactionIdIsNormal(xid)) + { + if (TransactionIdPrecedes(xid, relfrozenxid)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("found xmax %u from before relfrozenxid %u", + xid, relfrozenxid))); + + if (TransactionIdPrecedes(xid, cutoff_xid)) + { + /* + * If we freeze xmax, make absolutely sure that it's not an XID + * that is important. (Note, a lock-only xmax can be removed + * independent of committedness, since a committed lock holder has + * released the lock). 
+			 */
+			if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
+				TransactionIdDidCommit(xid))
+				ereport(ERROR,
+						(errcode(ERRCODE_DATA_CORRUPTED),
+						 errmsg_internal("cannot freeze committed xmax %u",
+										 xid)));
+			freeze_xmax = true;
+		}
+		else
+			freeze_xmax = false;
+	}
+	else if ((tuple->t_infomask & HEAP_XMAX_INVALID) ||
+			 !TransactionIdIsValid(HeapTupleHeaderGetRawXmax(tuple)))
+	{
+		freeze_xmax = false;
+		xmax_already_frozen = true;
+	}
+	else
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg_internal("found xmax %u (infomask 0x%04x) not frozen, not multi, not normal",
+								 xid, tuple->t_infomask)));
+
+	if (freeze_xmax)
+	{
+		Assert(!xmax_already_frozen);
+
+		frz->xmax = InvalidTransactionId;
+
+		/*
+		 * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED +
+		 * LOCKED.  Normalize to INVALID just to be sure no one gets confused.
+		 * Also get rid of the HEAP_KEYS_UPDATED bit.
+		 */
+		frz->t_infomask &= ~HEAP_XMAX_BITS;
+		frz->t_infomask |= HEAP_XMAX_INVALID;
+		frz->t_infomask2 &= ~HEAP_HOT_UPDATED;
+		frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
+		changed = true;
+	}
+
+	/*
+	 * Old-style VACUUM FULL is gone, but we have to keep this code as long as
+	 * we support having MOVED_OFF/MOVED_IN tuples in the database.
+	 */
+	if (tuple->t_infomask & HEAP_MOVED)
+	{
+		xid = HeapTupleHeaderGetXvac(tuple);
+
+		/*
+		 * For Xvac, we ignore the cutoff_xid and just always perform the
+		 * freeze operation.  The oldest release in which such a value can
+		 * actually be set is PostgreSQL 8.4, because old-style VACUUM FULL
+		 * was removed in PostgreSQL 9.0.  Note that if we were to respect
+		 * cutoff_xid here, we'd need to make sure to clear totally_frozen
+		 * when we skipped freezing on that basis.
+		 */
+		if (TransactionIdIsNormal(xid))
+		{
+			/*
+			 * If a MOVED_OFF tuple is not dead, the xvac transaction must
+			 * have failed; whereas a non-dead MOVED_IN tuple must mean the
+			 * xvac transaction succeeded.
+			 */
+			if (tuple->t_infomask & HEAP_MOVED_OFF)
+				frz->frzflags |= XLH_INVALID_XVAC;
+			else
+				frz->frzflags |= XLH_FREEZE_XVAC;
+
+			/*
+			 * Might as well fix the hint bits too; usually XMIN_COMMITTED
+			 * will already be set here, but there's a small chance not.
+			 */
+			Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID));
+			frz->t_infomask |= HEAP_XMIN_COMMITTED;
+			changed = true;
+		}
+	}
+
+	*totally_frozen_p = (xmin_frozen &&
+						 (freeze_xmax || xmax_already_frozen));
+	return changed;
+}
+
+/*
+ * heap_execute_freeze_tuple
+ *		Execute the prepared freezing of a tuple.
+ *
+ * Caller is responsible for ensuring that no other backend can access the
+ * storage underlying this tuple, either by holding an exclusive lock on the
+ * buffer containing it (which is what lazy VACUUM does), or by having it be
+ * in private storage (which is what CLUSTER and friends do).
+ *
+ * Note: it might seem we could make the changes without exclusive lock, since
+ * TransactionId read/write is assumed atomic anyway.  However there is a race
+ * condition: someone who just fetched an old XID that we overwrite here could
+ * conceivably not finish checking the XID against pg_xact before we finish
+ * the VACUUM and perhaps truncate off the part of pg_xact he needs.  Getting
+ * exclusive lock ensures no other backend is in process of checking the
+ * tuple status.  Also, getting exclusive lock makes it safe to adjust the
+ * infomask bits.
+ *
+ * NB: All code in here must be safe to execute during crash recovery!
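+ *
+ * A minimal usage sketch (this simply mirrors heap_freeze_tuple() below,
+ * which pairs the prepare and execute steps; the variable names are only
+ * illustrative):
+ *
+ *		if (heap_prepare_freeze_tuple(tuple, relfrozenxid, relminmxid,
+ *									  cutoff_xid, cutoff_multi,
+ *									  &frz, &totally_frozen))
+ *			heap_execute_freeze_tuple(tuple, &frz);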
+ */ +void +heap_execute_freeze_tuple(HeapTupleHeader tuple, xl_heap_freeze_tuple *frz) +{ + HeapTupleHeaderSetXmax(tuple, frz->xmax); + + if (frz->frzflags & XLH_FREEZE_XVAC) + HeapTupleHeaderSetXvac(tuple, FrozenTransactionId); + + if (frz->frzflags & XLH_INVALID_XVAC) + HeapTupleHeaderSetXvac(tuple, InvalidTransactionId); + + tuple->t_infomask = frz->t_infomask; + tuple->t_infomask2 = frz->t_infomask2; +} + +/* + * heap_freeze_tuple + * Freeze tuple in place, without WAL logging. + * + * Useful for callers like CLUSTER that perform their own WAL logging. + */ +bool +heap_freeze_tuple(HeapTupleHeader tuple, + TransactionId relfrozenxid, TransactionId relminmxid, + TransactionId cutoff_xid, TransactionId cutoff_multi) +{ + xl_heap_freeze_tuple frz; + bool do_freeze; + bool tuple_totally_frozen; + + do_freeze = heap_prepare_freeze_tuple(tuple, + relfrozenxid, relminmxid, + cutoff_xid, cutoff_multi, + &frz, &tuple_totally_frozen); + + /* + * Note that because this is not a WAL-logged operation, we don't need to + * fill in the offset in the freeze record. + */ + + if (do_freeze) + heap_execute_freeze_tuple(tuple, &frz); + return do_freeze; +} + +/* + * For a given MultiXactId, return the hint bits that should be set in the + * tuple's infomask. + * + * Normally this should be called for a multixact that was just created, and + * so is on our local cache, so the GetMembers call is fast. + */ +static void +GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask, + uint16 *new_infomask2) +{ + int nmembers; + MultiXactMember *members; + int i; + uint16 bits = HEAP_XMAX_IS_MULTI; + uint16 bits2 = 0; + bool has_update = false; + LockTupleMode strongest = LockTupleKeyShare; + + /* + * We only use this in multis we just created, so they cannot be values + * pre-pg_upgrade. + */ + nmembers = GetMultiXactIdMembers(multi, &members, false, false); + + for (i = 0; i < nmembers; i++) + { + LockTupleMode mode; + + /* + * Remember the strongest lock mode held by any member of the + * multixact. + */ + mode = TUPLOCK_from_mxstatus(members[i].status); + if (mode > strongest) + strongest = mode; + + /* See what other bits we need */ + switch (members[i].status) + { + case MultiXactStatusForKeyShare: + case MultiXactStatusForShare: + case MultiXactStatusForNoKeyUpdate: + break; + + case MultiXactStatusForUpdate: + bits2 |= HEAP_KEYS_UPDATED; + break; + + case MultiXactStatusNoKeyUpdate: + has_update = true; + break; + + case MultiXactStatusUpdate: + bits2 |= HEAP_KEYS_UPDATED; + has_update = true; + break; + } + } + + if (strongest == LockTupleExclusive || + strongest == LockTupleNoKeyExclusive) + bits |= HEAP_XMAX_EXCL_LOCK; + else if (strongest == LockTupleShare) + bits |= HEAP_XMAX_SHR_LOCK; + else if (strongest == LockTupleKeyShare) + bits |= HEAP_XMAX_KEYSHR_LOCK; + + if (!has_update) + bits |= HEAP_XMAX_LOCK_ONLY; + + if (nmembers > 0) + pfree(members); + + *new_infomask = bits; + *new_infomask2 = bits2; +} + +/* + * MultiXactIdGetUpdateXid + * + * Given a multixact Xmax and corresponding infomask, which does not have the + * HEAP_XMAX_LOCK_ONLY bit set, obtain and return the Xid of the updating + * transaction. + * + * Caller is expected to check the status of the updating transaction, if + * necessary. 
+ */ +static TransactionId +MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask) +{ + TransactionId update_xact = InvalidTransactionId; + MultiXactMember *members; + int nmembers; + + Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY)); + Assert(t_infomask & HEAP_XMAX_IS_MULTI); + + /* + * Since we know the LOCK_ONLY bit is not set, this cannot be a multi from + * pre-pg_upgrade. + */ + nmembers = GetMultiXactIdMembers(xmax, &members, false, false); + + if (nmembers > 0) + { + int i; + + for (i = 0; i < nmembers; i++) + { + /* Ignore lockers */ + if (!ISUPDATE_from_mxstatus(members[i].status)) + continue; + + /* there can be at most one updater */ + Assert(update_xact == InvalidTransactionId); + update_xact = members[i].xid; +#ifndef USE_ASSERT_CHECKING + + /* + * in an assert-enabled build, walk the whole array to ensure + * there's no other updater. + */ + break; +#endif + } + + pfree(members); + } + + return update_xact; +} + +/* + * HeapTupleGetUpdateXid + * As above, but use a HeapTupleHeader + * + * See also HeapTupleHeaderGetUpdateXid, which can be used without previously + * checking the hint bits. + */ +TransactionId +HeapTupleGetUpdateXid(HeapTupleHeader tuple) +{ + return MultiXactIdGetUpdateXid(HeapTupleHeaderGetRawXmax(tuple), + tuple->t_infomask); +} + +/* + * Does the given multixact conflict with the current transaction grabbing a + * tuple lock of the given strength? + * + * The passed infomask pairs up with the given multixact in the tuple header. + * + * If current_is_member is not NULL, it is set to 'true' if the current + * transaction is a member of the given multixact. + */ +static bool +DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask, + LockTupleMode lockmode, bool *current_is_member) +{ + int nmembers; + MultiXactMember *members; + bool result = false; + LOCKMODE wanted = tupleLockExtraInfo[lockmode].hwlock; + + if (HEAP_LOCKED_UPGRADED(infomask)) + return false; + + nmembers = GetMultiXactIdMembers(multi, &members, false, + HEAP_XMAX_IS_LOCKED_ONLY(infomask)); + if (nmembers >= 0) + { + int i; + + for (i = 0; i < nmembers; i++) + { + TransactionId memxid; + LOCKMODE memlockmode; + + if (result && (current_is_member == NULL || *current_is_member)) + break; + + memlockmode = LOCKMODE_from_mxstatus(members[i].status); + + /* ignore members from current xact (but track their presence) */ + memxid = members[i].xid; + if (TransactionIdIsCurrentTransactionId(memxid)) + { + if (current_is_member != NULL) + *current_is_member = true; + continue; + } + else if (result) + continue; + + /* ignore members that don't conflict with the lock we want */ + if (!DoLockModesConflict(memlockmode, wanted)) + continue; + + if (ISUPDATE_from_mxstatus(members[i].status)) + { + /* ignore aborted updaters */ + if (TransactionIdDidAbort(memxid)) + continue; + } + else + { + /* ignore lockers-only that are no longer in progress */ + if (!TransactionIdIsInProgress(memxid)) + continue; + } + + /* + * Whatever remains are either live lockers that conflict with our + * wanted lock, and updaters that are not aborted. Those conflict + * with what we want. Set up to return true, but keep going to + * look for the current transaction among the multixact members, + * if needed. + */ + result = true; + } + pfree(members); + } + + return result; +} + +/* + * Do_MultiXactIdWait + * Actual implementation for the two functions below. 
+ *
+ * 'multi', 'status' and 'infomask' indicate what to sleep on (the status is
+ * needed to ensure we only sleep on conflicting members, and the infomask is
+ * used to optimize multixact access in case it's a lock-only multi); 'nowait'
+ * indicates whether to use conditional lock acquisition, to allow callers to
+ * fail if lock is unavailable.  'rel', 'ctid' and 'oper' are used to set up
+ * context information for error messages.  'remaining', if not NULL, receives
+ * the number of members that are still running, including any (non-aborted)
+ * subtransactions of our own transaction.
+ *
+ * We do this by sleeping on each member using XactLockTableWait.  Any
+ * members that belong to the current backend are *not* waited for, however;
+ * this would not merely be useless but would lead to Assert failure inside
+ * XactLockTableWait.  By the time this returns, it is certain that all
+ * transactions *of other backends* that were members of the MultiXactId
+ * that conflict with the requested status are dead (and no new ones can have
+ * been added, since it is not legal to add members to an existing
+ * MultiXactId).
+ *
+ * But by the time we finish sleeping, someone else may have changed the Xmax
+ * of the containing tuple, so the caller needs to iterate on us somehow.
+ *
+ * Note that in case we return false, the number of remaining members is
+ * not to be trusted.
+ */
+static bool
+Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
+				   uint16 infomask, bool nowait,
+				   Relation rel, ItemPointer ctid, XLTW_Oper oper,
+				   int *remaining)
+{
+	bool		result = true;
+	MultiXactMember *members;
+	int			nmembers;
+	int			remain = 0;
+
+	/* for pre-pg_upgrade tuples, no need to sleep at all */
+	nmembers = HEAP_LOCKED_UPGRADED(infomask) ? -1 :
+		GetMultiXactIdMembers(multi, &members, false,
+							  HEAP_XMAX_IS_LOCKED_ONLY(infomask));
+
+	if (nmembers >= 0)
+	{
+		int			i;
+
+		for (i = 0; i < nmembers; i++)
+		{
+			TransactionId memxid = members[i].xid;
+			MultiXactStatus memstatus = members[i].status;
+
+			if (TransactionIdIsCurrentTransactionId(memxid))
+			{
+				remain++;
+				continue;
+			}
+
+			if (!DoLockModesConflict(LOCKMODE_from_mxstatus(memstatus),
+									 LOCKMODE_from_mxstatus(status)))
+			{
+				if (remaining && TransactionIdIsInProgress(memxid))
+					remain++;
+				continue;
+			}
+
+			/*
+			 * This member conflicts with our multi, so we have to sleep (or
+			 * return failure, if asked to avoid waiting.)
+			 *
+			 * Note that we don't set up an error context callback ourselves,
+			 * but instead we pass the info down to XactLockTableWait.  This
+			 * might seem a bit wasteful because the context is set up and
+			 * torn down for each member of the multixact, but in reality it
+			 * should be barely noticeable, and it avoids duplicate code.
+			 */
+			if (nowait)
+			{
+				result = ConditionalXactLockTableWait(memxid);
+				if (!result)
+					break;
+			}
+			else
+				XactLockTableWait(memxid, rel, ctid, oper);
+		}
+
+		pfree(members);
+	}
+
+	if (remaining)
+		*remaining = remain;
+
+	return result;
+}
+
+/*
+ * MultiXactIdWait
+ *		Sleep on a MultiXactId.
+ *
+ * By the time we finish sleeping, someone else may have changed the Xmax
+ * of the containing tuple, so the caller needs to iterate on us somehow.
+ *
+ * We return (in *remaining, if not NULL) the number of members that are still
+ * running, including any (non-aborted) subtransactions of our own transaction.
+ */ +static void +MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, + Relation rel, ItemPointer ctid, XLTW_Oper oper, + int *remaining) +{ + (void) Do_MultiXactIdWait(multi, status, infomask, false, + rel, ctid, oper, remaining); +} + +/* + * ConditionalMultiXactIdWait + * As above, but only lock if we can get the lock without blocking. + * + * By the time we finish sleeping, someone else may have changed the Xmax + * of the containing tuple, so the caller needs to iterate on us somehow. + * + * If the multixact is now all gone, return true. Returns false if some + * transactions might still be running. + * + * We return (in *remaining, if not NULL) the number of members that are still + * running, including any (non-aborted) subtransactions of our own transaction. + */ +static bool +ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status, + uint16 infomask, Relation rel, int *remaining) +{ + return Do_MultiXactIdWait(multi, status, infomask, true, + rel, NULL, XLTW_None, remaining); +} + +/* + * heap_tuple_needs_eventual_freeze + * + * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac) + * will eventually require freezing. Similar to heap_tuple_needs_freeze, + * but there's no cutoff, since we're trying to figure out whether freezing + * will ever be needed, not whether it's needed now. + */ +bool +heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple) +{ + TransactionId xid; + + /* + * If xmin is a normal transaction ID, this tuple is definitely not + * frozen. + */ + xid = HeapTupleHeaderGetXmin(tuple); + if (TransactionIdIsNormal(xid)) + return true; + + /* + * If xmax is a valid xact or multixact, this tuple is also not frozen. + */ + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + MultiXactId multi; + + multi = HeapTupleHeaderGetRawXmax(tuple); + if (MultiXactIdIsValid(multi)) + return true; + } + else + { + xid = HeapTupleHeaderGetRawXmax(tuple); + if (TransactionIdIsNormal(xid)) + return true; + } + + if (tuple->t_infomask & HEAP_MOVED) + { + xid = HeapTupleHeaderGetXvac(tuple); + if (TransactionIdIsNormal(xid)) + return true; + } + + return false; +} + +/* + * heap_tuple_needs_freeze + * + * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac) + * are older than the specified cutoff XID or MultiXactId. If so, return true. + * + * It doesn't matter whether the tuple is alive or dead, we are checking + * to see if a tuple needs to be removed or frozen to avoid wraparound. + * + * NB: Cannot rely on hint bits here, they might not be set after a crash or + * on a standby. + */ +bool +heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid, + MultiXactId cutoff_multi, Buffer buf) +{ + TransactionId xid; + + xid = HeapTupleHeaderGetXmin(tuple); + if (TransactionIdIsNormal(xid) && + TransactionIdPrecedes(xid, cutoff_xid)) + return true; + + /* + * The considerations for multixacts are complicated; look at + * heap_prepare_freeze_tuple for justifications. This routine had better + * be in sync with that one! 
+ */ + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + MultiXactId multi; + + multi = HeapTupleHeaderGetRawXmax(tuple); + if (!MultiXactIdIsValid(multi)) + { + /* no xmax set, ignore */ + ; + } + else if (HEAP_LOCKED_UPGRADED(tuple->t_infomask)) + return true; + else if (MultiXactIdPrecedes(multi, cutoff_multi)) + return true; + else + { + MultiXactMember *members; + int nmembers; + int i; + + /* need to check whether any member of the mxact is too old */ + + nmembers = GetMultiXactIdMembers(multi, &members, false, + HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)); + + for (i = 0; i < nmembers; i++) + { + if (TransactionIdPrecedes(members[i].xid, cutoff_xid)) + { + pfree(members); + return true; + } + } + if (nmembers > 0) + pfree(members); + } + } + else + { + xid = HeapTupleHeaderGetRawXmax(tuple); + if (TransactionIdIsNormal(xid) && + TransactionIdPrecedes(xid, cutoff_xid)) + return true; + } + + if (tuple->t_infomask & HEAP_MOVED) + { + xid = HeapTupleHeaderGetXvac(tuple); + if (TransactionIdIsNormal(xid) && + TransactionIdPrecedes(xid, cutoff_xid)) + return true; + } + + return false; +} + +/* + * If 'tuple' contains any visible XID greater than latestRemovedXid, + * ratchet forwards latestRemovedXid to the greatest one found. + * This is used as the basis for generating Hot Standby conflicts, so + * if a tuple was never visible then removing it should not conflict + * with queries. + */ +void +HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, + TransactionId *latestRemovedXid) +{ + TransactionId xmin = HeapTupleHeaderGetXmin(tuple); + TransactionId xmax = HeapTupleHeaderGetUpdateXid(tuple); + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (tuple->t_infomask & HEAP_MOVED) + { + if (TransactionIdPrecedes(*latestRemovedXid, xvac)) + *latestRemovedXid = xvac; + } + + /* + * Ignore tuples inserted by an aborted transaction or if the tuple was + * updated/deleted by the inserting transaction. + * + * Look for a committed hint bit, or if no xmin bit is set, check clog. + */ + if (HeapTupleHeaderXminCommitted(tuple) || + (!HeapTupleHeaderXminInvalid(tuple) && TransactionIdDidCommit(xmin))) + { + if (xmax != xmin && + TransactionIdFollows(xmax, *latestRemovedXid)) + *latestRemovedXid = xmax; + } + + /* *latestRemovedXid may still be invalid at end */ +} + +#ifdef USE_PREFETCH +/* + * Helper function for heap_index_delete_tuples. Issues prefetch requests for + * prefetch_count buffers. The prefetch_state keeps track of all the buffers + * we can prefetch, and which have already been prefetched; each call to this + * function picks up where the previous call left off. + * + * Note: we expect the deltids array to be sorted in an order that groups TIDs + * by heap block, with all TIDs for each block appearing together in exactly + * one group. 
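+ *
+ * For example (made-up TIDs), (0,1) (0,7) (0,12) (2,3) (2,9) (5,2) is such
+ * an ordering: every TID for block 0 appears before any TID for block 2,
+ * and so on.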
+ */ +static void +index_delete_prefetch_buffer(Relation rel, + IndexDeletePrefetchState *prefetch_state, + int prefetch_count) +{ + BlockNumber cur_hblkno = prefetch_state->cur_hblkno; + int count = 0; + int i; + int ndeltids = prefetch_state->ndeltids; + TM_IndexDelete *deltids = prefetch_state->deltids; + + for (i = prefetch_state->next_item; + i < ndeltids && count < prefetch_count; + i++) + { + ItemPointer htid = &deltids[i].tid; + + if (cur_hblkno == InvalidBlockNumber || + ItemPointerGetBlockNumber(htid) != cur_hblkno) + { + cur_hblkno = ItemPointerGetBlockNumber(htid); + PrefetchBuffer(rel, MAIN_FORKNUM, cur_hblkno); + count++; + } + } + + /* + * Save the prefetch position so that next time we can continue from that + * position. + */ + prefetch_state->next_item = i; + prefetch_state->cur_hblkno = cur_hblkno; +} +#endif + +/* + * heapam implementation of tableam's index_delete_tuples interface. + * + * This helper function is called by index AMs during index tuple deletion. + * See tableam header comments for an explanation of the interface implemented + * here and a general theory of operation. Note that each call here is either + * a simple index deletion call, or a bottom-up index deletion call. + * + * It's possible for this to generate a fair amount of I/O, since we may be + * deleting hundreds of tuples from a single index block. To amortize that + * cost to some degree, this uses prefetching and combines repeat accesses to + * the same heap block. + */ +TransactionId +heap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate) +{ + /* Initial assumption is that earlier pruning took care of conflict */ + TransactionId latestRemovedXid = InvalidTransactionId; + BlockNumber blkno = InvalidBlockNumber; + Buffer buf = InvalidBuffer; + Page page = NULL; + OffsetNumber maxoff = InvalidOffsetNumber; + TransactionId priorXmax; +#ifdef USE_PREFETCH + IndexDeletePrefetchState prefetch_state; + int prefetch_distance; +#endif + SnapshotData SnapshotNonVacuumable; + int finalndeltids = 0, + nblocksaccessed = 0; + + /* State that's only used in bottom-up index deletion case */ + int nblocksfavorable = 0; + int curtargetfreespace = delstate->bottomupfreespace, + lastfreespace = 0, + actualfreespace = 0; + bool bottomup_final_block = false; + + InitNonVacuumableSnapshot(SnapshotNonVacuumable, GlobalVisTestFor(rel)); + + /* Sort caller's deltids array by TID for further processing */ + index_delete_sort(delstate); + + /* + * Bottom-up case: resort deltids array in an order attuned to where the + * greatest number of promising TIDs are to be found, and determine how + * many blocks from the start of sorted array should be considered + * favorable. This will also shrink the deltids array in order to + * eliminate completely unfavorable blocks up front. + */ + if (delstate->bottomup) + nblocksfavorable = bottomup_sort_and_shrink(delstate); + +#ifdef USE_PREFETCH + /* Initialize prefetch state. */ + prefetch_state.cur_hblkno = InvalidBlockNumber; + prefetch_state.next_item = 0; + prefetch_state.ndeltids = delstate->ndeltids; + prefetch_state.deltids = delstate->deltids; + + /* + * Determine the prefetch distance that we will attempt to maintain. + * + * Since the caller holds a buffer lock somewhere in rel, we'd better make + * sure that isn't a catalog relation before we call code that does + * syscache lookups, to avoid risk of deadlock. 
+ */ + if (IsCatalogRelation(rel)) + prefetch_distance = maintenance_io_concurrency; + else + prefetch_distance = + get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); + + /* Cap initial prefetch distance for bottom-up deletion caller */ + if (delstate->bottomup) + { + Assert(nblocksfavorable >= 1); + Assert(nblocksfavorable <= BOTTOMUP_MAX_NBLOCKS); + prefetch_distance = Min(prefetch_distance, nblocksfavorable); + } + + /* Start prefetching. */ + index_delete_prefetch_buffer(rel, &prefetch_state, prefetch_distance); +#endif + + /* Iterate over deltids, determine which to delete, check their horizon */ + Assert(delstate->ndeltids > 0); + for (int i = 0; i < delstate->ndeltids; i++) + { + TM_IndexDelete *ideltid = &delstate->deltids[i]; + TM_IndexStatus *istatus = delstate->status + ideltid->id; + ItemPointer htid = &ideltid->tid; + OffsetNumber offnum; + + /* + * Read buffer, and perform required extra steps each time a new block + * is encountered. Avoid refetching if it's the same block as the one + * from the last htid. + */ + if (blkno == InvalidBlockNumber || + ItemPointerGetBlockNumber(htid) != blkno) + { + /* + * Consider giving up early for bottom-up index deletion caller + * first. (Only prefetch next-next block afterwards, when it + * becomes clear that we're at least going to access the next + * block in line.) + * + * Sometimes the first block frees so much space for bottom-up + * caller that the deletion process can end without accessing any + * more blocks. It is usually necessary to access 2 or 3 blocks + * per bottom-up deletion operation, though. + */ + if (delstate->bottomup) + { + /* + * We often allow caller to delete a few additional items + * whose entries we reached after the point that space target + * from caller was satisfied. The cost of accessing the page + * was already paid at that point, so it made sense to finish + * it off. When that happened, we finalize everything here + * (by finishing off the whole bottom-up deletion operation + * without needlessly paying the cost of accessing any more + * blocks). + */ + if (bottomup_final_block) + break; + + /* + * Give up when we didn't enable our caller to free any + * additional space as a result of processing the page that we + * just finished up with. This rule is the main way in which + * we keep the cost of bottom-up deletion under control. + */ + if (nblocksaccessed >= 1 && actualfreespace == lastfreespace) + break; + lastfreespace = actualfreespace; /* for next time */ + + /* + * Deletion operation (which is bottom-up) will definitely + * access the next block in line. Prepare for that now. + * + * Decay target free space so that we don't hang on for too + * long with a marginal case. (Space target is only truly + * helpful when it allows us to recognize that we don't need + * to access more than 1 or 2 blocks to satisfy caller due to + * agreeable workload characteristics.) + * + * We are a bit more patient when we encounter contiguous + * blocks, though: these are treated as favorable blocks. The + * decay process is only applied when the next block in line + * is not a favorable/contiguous block. This is not an + * exception to the general rule; we still insist on finding + * at least one deletable item per block accessed. See + * bottomup_nblocksfavorable() for full details of the theory + * behind favorable blocks and heap block locality in general. 
+ * + * Note: The first block in line is always treated as a + * favorable block, so the earliest possible point that the + * decay can be applied is just before we access the second + * block in line. The Assert() verifies this for us. + */ + Assert(nblocksaccessed > 0 || nblocksfavorable > 0); + if (nblocksfavorable > 0) + nblocksfavorable--; + else + curtargetfreespace /= 2; + } + + /* release old buffer */ + if (BufferIsValid(buf)) + UnlockReleaseBuffer(buf); + + blkno = ItemPointerGetBlockNumber(htid); + buf = ReadBuffer(rel, blkno); + nblocksaccessed++; + Assert(!delstate->bottomup || + nblocksaccessed <= BOTTOMUP_MAX_NBLOCKS); + +#ifdef USE_PREFETCH + + /* + * To maintain the prefetch distance, prefetch one more page for + * each page we read. + */ + index_delete_prefetch_buffer(rel, &prefetch_state, 1); +#endif + + LockBuffer(buf, BUFFER_LOCK_SHARE); + + page = BufferGetPage(buf); + maxoff = PageGetMaxOffsetNumber(page); + } + + if (istatus->knowndeletable) + Assert(!delstate->bottomup && !istatus->promising); + else + { + ItemPointerData tmp = *htid; + HeapTupleData heapTuple; + + /* Are any tuples from this HOT chain non-vacuumable? */ + if (heap_hot_search_buffer(&tmp, rel, buf, &SnapshotNonVacuumable, + &heapTuple, NULL, true)) + continue; /* can't delete entry */ + + /* Caller will delete, since whole HOT chain is vacuumable */ + istatus->knowndeletable = true; + + /* Maintain index free space info for bottom-up deletion case */ + if (delstate->bottomup) + { + Assert(istatus->freespace > 0); + actualfreespace += istatus->freespace; + if (actualfreespace >= curtargetfreespace) + bottomup_final_block = true; + } + } + + /* + * Maintain latestRemovedXid value for deletion operation as a whole + * by advancing current value using heap tuple headers. This is + * loosely based on the logic for pruning a HOT chain. + */ + offnum = ItemPointerGetOffsetNumber(htid); + priorXmax = InvalidTransactionId; /* cannot check first XMIN */ + for (;;) + { + ItemId lp; + HeapTupleHeader htup; + + /* Some sanity checks */ + if (offnum < FirstOffsetNumber || offnum > maxoff) + break; + + lp = PageGetItemId(page, offnum); + if (ItemIdIsRedirected(lp)) + { + offnum = ItemIdGetRedirect(lp); + continue; + } + + /* + * We'll often encounter LP_DEAD line pointers (especially with an + * entry marked knowndeletable by our caller up front). No heap + * tuple headers get examined for an htid that leads us to an + * LP_DEAD item. This is okay because the earlier pruning + * operation that made the line pointer LP_DEAD in the first place + * must have considered the original tuple header as part of + * generating its own latestRemovedXid value. + * + * Relying on XLOG_HEAP2_PRUNE records like this is the same + * strategy that index vacuuming uses in all cases. Index VACUUM + * WAL records don't even have a latestRemovedXid field of their + * own for this reason. + */ + if (!ItemIdIsNormal(lp)) + break; + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + /* + * Check the tuple XMIN against prior XMAX, if any + */ + if (TransactionIdIsValid(priorXmax) && + !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax)) + break; + + HeapTupleHeaderAdvanceLatestRemovedXid(htup, &latestRemovedXid); + + /* + * If the tuple is not HOT-updated, then we are at the end of this + * HOT-chain. No need to visit later tuples from the same update + * chain (they get their own index entries) -- just move on to + * next htid from index AM caller. 
+ */ + if (!HeapTupleHeaderIsHotUpdated(htup)) + break; + + /* Advance to next HOT chain member */ + Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == blkno); + offnum = ItemPointerGetOffsetNumber(&htup->t_ctid); + priorXmax = HeapTupleHeaderGetUpdateXid(htup); + } + + /* Enable further/final shrinking of deltids for caller */ + finalndeltids = i + 1; + } + + UnlockReleaseBuffer(buf); + + /* + * Shrink deltids array to exclude non-deletable entries at the end. This + * is not just a minor optimization. Final deltids array size might be + * zero for a bottom-up caller. Index AM is explicitly allowed to rely on + * ndeltids being zero in all cases with zero total deletable entries. + */ + Assert(finalndeltids > 0 || delstate->bottomup); + delstate->ndeltids = finalndeltids; + + return latestRemovedXid; +} + +/* + * Specialized inlineable comparison function for index_delete_sort() + */ +static inline int +index_delete_sort_cmp(TM_IndexDelete *deltid1, TM_IndexDelete *deltid2) +{ + ItemPointer tid1 = &deltid1->tid; + ItemPointer tid2 = &deltid2->tid; + + { + BlockNumber blk1 = ItemPointerGetBlockNumber(tid1); + BlockNumber blk2 = ItemPointerGetBlockNumber(tid2); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(tid1); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(tid2); + + if (pos1 != pos2) + return (pos1 < pos2) ? -1 : 1; + } + + Assert(false); + + return 0; +} + +/* + * Sort deltids array from delstate by TID. This prepares it for further + * processing by heap_index_delete_tuples(). + * + * This operation becomes a noticeable consumer of CPU cycles with some + * workloads, so we go to the trouble of specialization/micro optimization. + * We use shellsort for this because it's easy to specialize, compiles to + * relatively few instructions, and is adaptive to presorted inputs/subsets + * (which are typical here). + */ +static void +index_delete_sort(TM_IndexDeleteOp *delstate) +{ + TM_IndexDelete *deltids = delstate->deltids; + int ndeltids = delstate->ndeltids; + int low = 0; + + /* + * Shellsort gap sequence (taken from Sedgewick-Incerpi paper). + * + * This implementation is fast with array sizes up to ~4500. This covers + * all supported BLCKSZ values. + */ + const int gaps[9] = {1968, 861, 336, 112, 48, 21, 7, 3, 1}; + + /* Think carefully before changing anything here -- keep swaps cheap */ + StaticAssertStmt(sizeof(TM_IndexDelete) <= 8, + "element size exceeds 8 bytes"); + + for (int g = 0; g < lengthof(gaps); g++) + { + for (int hi = gaps[g], i = low + hi; i < ndeltids; i++) + { + TM_IndexDelete d = deltids[i]; + int j = i; + + while (j >= hi && index_delete_sort_cmp(&deltids[j - hi], &d) >= 0) + { + deltids[j] = deltids[j - hi]; + j -= hi; + } + deltids[j] = d; + } + } +} + +/* + * Returns how many blocks should be considered favorable/contiguous for a + * bottom-up index deletion pass. This is a number of heap blocks that starts + * from and includes the first block in line. + * + * There is always at least one favorable block during bottom-up index + * deletion. In the worst case (i.e. with totally random heap blocks) the + * first block in line (the only favorable block) can be thought of as a + * degenerate array of contiguous blocks that consists of a single block. + * heap_index_delete_tuples() will expect this. + * + * Caller passes blockgroups, a description of the final order that deltids + * will be sorted in for heap_index_delete_tuples() bottom-up index deletion + * processing. 
Note that deltids need not actually be sorted just yet (caller + * only passes deltids to us so that we can interpret blockgroups). + * + * You might guess that the existence of contiguous blocks cannot matter much, + * since in general the main factor that determines which blocks we visit is + * the number of promising TIDs, which is a fixed hint from the index AM. + * We're not really targeting the general case, though -- the actual goal is + * to adapt our behavior to a wide variety of naturally occurring conditions. + * The effects of most of the heuristics we apply are only noticeable in the + * aggregate, over time and across many _related_ bottom-up index deletion + * passes. + * + * Deeming certain blocks favorable allows heapam to recognize and adapt to + * workloads where heap blocks visited during bottom-up index deletion can be + * accessed contiguously, in the sense that each newly visited block is the + * neighbor of the block that bottom-up deletion just finished processing (or + * close enough to it). It will likely be cheaper to access more favorable + * blocks sooner rather than later (e.g. in this pass, not across a series of + * related bottom-up passes). Either way it is probably only a matter of time + * (or a matter of further correlated version churn) before all blocks that + * appear together as a single large batch of favorable blocks get accessed by + * _some_ bottom-up pass. Large batches of favorable blocks tend to either + * appear almost constantly or not even once (it all depends on per-index + * workload characteristics). + * + * Note that the blockgroups sort order applies a power-of-two bucketing + * scheme that creates opportunities for contiguous groups of blocks to get + * batched together, at least with workloads that are naturally amenable to + * being driven by heap block locality. This doesn't just enhance the spatial + * locality of bottom-up heap block processing in the obvious way. It also + * enables temporal locality of access, since sorting by heap block number + * naturally tends to make the bottom-up processing order deterministic. + * + * Consider the following example to get a sense of how temporal locality + * might matter: There is a heap relation with several indexes, each of which + * is low to medium cardinality. It is subject to constant non-HOT updates. + * The updates are skewed (in one part of the primary key, perhaps). None of + * the indexes are logically modified by the UPDATE statements (if they were + * then bottom-up index deletion would not be triggered in the first place). + * Naturally, each new round of index tuples (for each heap tuple that gets a + * heap_update() call) will have the same heap TID in each and every index. + * Since these indexes are low cardinality and never get logically modified, + * heapam processing during bottom-up deletion passes will access heap blocks + * in approximately sequential order. Temporal locality of access occurs due + * to bottom-up deletion passes behaving very similarly across each of the + * indexes at any given moment. This keeps the number of buffer misses needed + * to visit heap blocks to a minimum. + */ +static int +bottomup_nblocksfavorable(IndexDeleteCounts *blockgroups, int nblockgroups, + TM_IndexDelete *deltids) +{ + int64 lastblock = -1; + int nblocksfavorable = 0; + + Assert(nblockgroups >= 1); + Assert(nblockgroups <= BOTTOMUP_MAX_NBLOCKS); + + /* + * We tolerate heap blocks that will be accessed only slightly out of + * physical order. 
Small blips occur when a pair of almost-contiguous + * blocks happen to fall into different buckets (perhaps due only to a + * small difference in npromisingtids that the bucketing scheme didn't + * quite manage to ignore). We effectively ignore these blips by applying + * a small tolerance. The precise tolerance we use is a little arbitrary, + * but it works well enough in practice. + */ + for (int b = 0; b < nblockgroups; b++) + { + IndexDeleteCounts *group = blockgroups + b; + TM_IndexDelete *firstdtid = deltids + group->ifirsttid; + BlockNumber block = ItemPointerGetBlockNumber(&firstdtid->tid); + + if (lastblock != -1 && + ((int64) block < lastblock - BOTTOMUP_TOLERANCE_NBLOCKS || + (int64) block > lastblock + BOTTOMUP_TOLERANCE_NBLOCKS)) + break; + + nblocksfavorable++; + lastblock = block; + } + + /* Always indicate that there is at least 1 favorable block */ + Assert(nblocksfavorable >= 1); + + return nblocksfavorable; +} + +/* + * qsort comparison function for bottomup_sort_and_shrink() + */ +static int +bottomup_sort_and_shrink_cmp(const void *arg1, const void *arg2) +{ + const IndexDeleteCounts *group1 = (const IndexDeleteCounts *) arg1; + const IndexDeleteCounts *group2 = (const IndexDeleteCounts *) arg2; + + /* + * Most significant field is npromisingtids (which we invert the order of + * so as to sort in desc order). + * + * Caller should have already normalized npromisingtids fields into + * power-of-two values (buckets). + */ + if (group1->npromisingtids > group2->npromisingtids) + return -1; + if (group1->npromisingtids < group2->npromisingtids) + return 1; + + /* + * Tiebreak: desc ntids sort order. + * + * We cannot expect power-of-two values for ntids fields. We should + * behave as if they were already rounded up for us instead. + */ + if (group1->ntids != group2->ntids) + { + uint32 ntids1 = pg_nextpower2_32((uint32) group1->ntids); + uint32 ntids2 = pg_nextpower2_32((uint32) group2->ntids); + + if (ntids1 > ntids2) + return -1; + if (ntids1 < ntids2) + return 1; + } + + /* + * Tiebreak: asc offset-into-deltids-for-block (offset to first TID for + * block in deltids array) order. + * + * This is equivalent to sorting in ascending heap block number order + * (among otherwise equal subsets of the array). This approach allows us + * to avoid accessing the out-of-line TID. (We rely on the assumption + * that the deltids array was sorted in ascending heap TID order when + * these offsets to the first TID from each heap block group were formed.) + */ + if (group1->ifirsttid > group2->ifirsttid) + return 1; + if (group1->ifirsttid < group2->ifirsttid) + return -1; + + pg_unreachable(); + + return 0; +} + +/* + * heap_index_delete_tuples() helper function for bottom-up deletion callers. + * + * Sorts deltids array in the order needed for useful processing by bottom-up + * deletion. The array should already be sorted in TID order when we're + * called. The sort process groups heap TIDs from deltids into heap block + * groupings. Earlier/more-promising groups/blocks are usually those that are + * known to have the most "promising" TIDs. + * + * Sets new size of deltids array (ndeltids) in state. deltids will only have + * TIDs from the BOTTOMUP_MAX_NBLOCKS most promising heap blocks when we + * return. This often means that deltids will be shrunk to a small fraction + * of its original size (we eliminate many heap blocks from consideration for + * caller up front). + * + * Returns the number of "favorable" blocks. 
See bottomup_nblocksfavorable() + * for a definition and full details. + */ +static int +bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate) +{ + IndexDeleteCounts *blockgroups; + TM_IndexDelete *reordereddeltids; + BlockNumber curblock = InvalidBlockNumber; + int nblockgroups = 0; + int ncopied = 0; + int nblocksfavorable = 0; + + Assert(delstate->bottomup); + Assert(delstate->ndeltids > 0); + + /* Calculate per-heap-block count of TIDs */ + blockgroups = palloc(sizeof(IndexDeleteCounts) * delstate->ndeltids); + for (int i = 0; i < delstate->ndeltids; i++) + { + TM_IndexDelete *ideltid = &delstate->deltids[i]; + TM_IndexStatus *istatus = delstate->status + ideltid->id; + ItemPointer htid = &ideltid->tid; + bool promising = istatus->promising; + + if (curblock != ItemPointerGetBlockNumber(htid)) + { + /* New block group */ + nblockgroups++; + + Assert(curblock < ItemPointerGetBlockNumber(htid) || + !BlockNumberIsValid(curblock)); + + curblock = ItemPointerGetBlockNumber(htid); + blockgroups[nblockgroups - 1].ifirsttid = i; + blockgroups[nblockgroups - 1].ntids = 1; + blockgroups[nblockgroups - 1].npromisingtids = 0; + } + else + { + blockgroups[nblockgroups - 1].ntids++; + } + + if (promising) + blockgroups[nblockgroups - 1].npromisingtids++; + } + + /* + * We're about ready to sort block groups to determine the optimal order + * for visiting heap blocks. But before we do, round the number of + * promising tuples for each block group up to the next power-of-two, + * unless it is very low (less than 4), in which case we round up to 4. + * npromisingtids is far too noisy to trust when choosing between a pair + * of block groups that both have very low values. + * + * This scheme divides heap blocks/block groups into buckets. Each bucket + * contains blocks that have _approximately_ the same number of promising + * TIDs as each other. The goal is to ignore relatively small differences + * in the total number of promising entries, so that the whole process can + * give a little weight to heapam factors (like heap block locality) + * instead. This isn't a trade-off, really -- we have nothing to lose. It + * would be foolish to interpret small differences in npromisingtids + * values as anything more than noise. + * + * We tiebreak on nhtids when sorting block group subsets that have the + * same npromisingtids, but this has the same issues as npromisingtids, + * and so nhtids is subject to the same power-of-two bucketing scheme. The + * only reason that we don't fix nhtids in the same way here too is that + * we'll need accurate nhtids values after the sort. We handle nhtids + * bucketization dynamically instead (in the sort comparator). + * + * See bottomup_nblocksfavorable() for a full explanation of when and how + * heap locality/favorable blocks can significantly influence when and how + * heap blocks are accessed. 
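+	 *
+	 * To illustrate the bucketing scheme: block groups with 5, 6, or 8
+	 * promising TIDs all land in the same bucket (8), while groups with 4
+	 * or fewer promising TIDs all share the minimum bucket value of 4.
+	 * Only differences of roughly a factor of two or more survive the
+	 * rounding and influence the sort order below.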
+ */ + for (int b = 0; b < nblockgroups; b++) + { + IndexDeleteCounts *group = blockgroups + b; + + /* Better off falling back on nhtids with low npromisingtids */ + if (group->npromisingtids <= 4) + group->npromisingtids = 4; + else + group->npromisingtids = + pg_nextpower2_32((uint32) group->npromisingtids); + } + + /* Sort groups and rearrange caller's deltids array */ + qsort(blockgroups, nblockgroups, sizeof(IndexDeleteCounts), + bottomup_sort_and_shrink_cmp); + reordereddeltids = palloc(delstate->ndeltids * sizeof(TM_IndexDelete)); + + nblockgroups = Min(BOTTOMUP_MAX_NBLOCKS, nblockgroups); + /* Determine number of favorable blocks at the start of final deltids */ + nblocksfavorable = bottomup_nblocksfavorable(blockgroups, nblockgroups, + delstate->deltids); + + for (int b = 0; b < nblockgroups; b++) + { + IndexDeleteCounts *group = blockgroups + b; + TM_IndexDelete *firstdtid = delstate->deltids + group->ifirsttid; + + memcpy(reordereddeltids + ncopied, firstdtid, + sizeof(TM_IndexDelete) * group->ntids); + ncopied += group->ntids; + } + + /* Copy final grouped and sorted TIDs back into start of caller's array */ + memcpy(delstate->deltids, reordereddeltids, + sizeof(TM_IndexDelete) * ncopied); + delstate->ndeltids = ncopied; + + pfree(reordereddeltids); + pfree(blockgroups); + + return nblocksfavorable; +} + +/* + * Perform XLogInsert for a heap-freeze operation. Caller must have already + * modified the buffer and marked it dirty. + */ +XLogRecPtr +log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid, + xl_heap_freeze_tuple *tuples, int ntuples) +{ + xl_heap_freeze_page xlrec; + XLogRecPtr recptr; + + /* Caller should not call me on a non-WAL-logged relation */ + Assert(RelationNeedsWAL(reln)); + /* nor when there are no tuples to freeze */ + Assert(ntuples > 0); + + xlrec.cutoff_xid = cutoff_xid; + xlrec.ntuples = ntuples; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapFreezePage); + + /* + * The freeze plan array is not actually in the buffer, but pretend that + * it is. When XLogInsert stores the whole buffer, the freeze plan need + * not be stored too. + */ + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + XLogRegisterBufData(0, (char *) tuples, + ntuples * sizeof(xl_heap_freeze_tuple)); + + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE_PAGE); + + return recptr; +} + +/* + * Perform XLogInsert for a heap-visible operation. 'block' is the block + * being marked all-visible, and vm_buffer is the buffer containing the + * corresponding visibility map block. Both should have already been modified + * and dirtied. + * + * If checksums are enabled, we also generate a full-page image of + * heap_buffer, if necessary. + */ +XLogRecPtr +log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer, + TransactionId cutoff_xid, uint8 vmflags) +{ + xl_heap_visible xlrec; + XLogRecPtr recptr; + uint8 flags; + + Assert(BufferIsValid(heap_buffer)); + Assert(BufferIsValid(vm_buffer)); + + xlrec.cutoff_xid = cutoff_xid; + xlrec.flags = vmflags; + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapVisible); + + XLogRegisterBuffer(0, vm_buffer, 0); + + flags = REGBUF_STANDARD; + if (!XLogHintBitIsNeeded()) + flags |= REGBUF_NO_IMAGE; + XLogRegisterBuffer(1, heap_buffer, flags); + + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE); + + return recptr; +} + +/* + * Perform XLogInsert for a heap-update operation. Caller must already + * have modified the buffer(s) and marked them dirty. 
+ */ +static XLogRecPtr +log_heap_update(Relation reln, Buffer oldbuf, + Buffer newbuf, HeapTuple oldtup, HeapTuple newtup, + HeapTuple old_key_tuple, + bool all_visible_cleared, bool new_all_visible_cleared) +{ + xl_heap_update xlrec; + xl_heap_header xlhdr; + xl_heap_header xlhdr_idx; + uint8 info; + uint16 prefix_suffix[2]; + uint16 prefixlen = 0, + suffixlen = 0; + XLogRecPtr recptr; + Page page = BufferGetPage(newbuf); + bool need_tuple_data = RelationIsLogicallyLogged(reln); + bool init; + int bufflags; + + /* Caller should not call me on a non-WAL-logged relation */ + Assert(RelationNeedsWAL(reln)); + + XLogBeginInsert(); + + if (HeapTupleIsHeapOnly(newtup)) + info = XLOG_HEAP_HOT_UPDATE; + else + info = XLOG_HEAP_UPDATE; + + /* + * If the old and new tuple are on the same page, we only need to log the + * parts of the new tuple that were changed. That saves on the amount of + * WAL we need to write. Currently, we just count any unchanged bytes in + * the beginning and end of the tuple. That's quick to check, and + * perfectly covers the common case that only one field is updated. + * + * We could do this even if the old and new tuple are on different pages, + * but only if we don't make a full-page image of the old page, which is + * difficult to know in advance. Also, if the old tuple is corrupt for + * some reason, it would allow the corruption to propagate the new page, + * so it seems best to avoid. Under the general assumption that most + * updates tend to create the new tuple version on the same page, there + * isn't much to be gained by doing this across pages anyway. + * + * Skip this if we're taking a full-page image of the new page, as we + * don't include the new tuple in the WAL record in that case. Also + * disable if wal_level='logical', as logical decoding needs to be able to + * read the new tuple in whole from the WAL record alone. + */ + if (oldbuf == newbuf && !need_tuple_data && + !XLogCheckBufferNeedsBackup(newbuf)) + { + char *oldp = (char *) oldtup->t_data + oldtup->t_data->t_hoff; + char *newp = (char *) newtup->t_data + newtup->t_data->t_hoff; + int oldlen = oldtup->t_len - oldtup->t_data->t_hoff; + int newlen = newtup->t_len - newtup->t_data->t_hoff; + + /* Check for common prefix between old and new tuple */ + for (prefixlen = 0; prefixlen < Min(oldlen, newlen); prefixlen++) + { + if (newp[prefixlen] != oldp[prefixlen]) + break; + } + + /* + * Storing the length of the prefix takes 2 bytes, so we need to save + * at least 3 bytes or there's no point. + */ + if (prefixlen < 3) + prefixlen = 0; + + /* Same for suffix */ + for (suffixlen = 0; suffixlen < Min(oldlen, newlen) - prefixlen; suffixlen++) + { + if (newp[newlen - suffixlen - 1] != oldp[oldlen - suffixlen - 1]) + break; + } + if (suffixlen < 3) + suffixlen = 0; + } + + /* Prepare main WAL data chain */ + xlrec.flags = 0; + if (all_visible_cleared) + xlrec.flags |= XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED; + if (new_all_visible_cleared) + xlrec.flags |= XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED; + if (prefixlen > 0) + xlrec.flags |= XLH_UPDATE_PREFIX_FROM_OLD; + if (suffixlen > 0) + xlrec.flags |= XLH_UPDATE_SUFFIX_FROM_OLD; + if (need_tuple_data) + { + xlrec.flags |= XLH_UPDATE_CONTAINS_NEW_TUPLE; + if (old_key_tuple) + { + if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL) + xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_TUPLE; + else + xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_KEY; + } + } + + /* If new tuple is the single and first tuple on page... 
*/ + if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber && + PageGetMaxOffsetNumber(page) == FirstOffsetNumber) + { + info |= XLOG_HEAP_INIT_PAGE; + init = true; + } + else + init = false; + + /* Prepare WAL data for the old page */ + xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self); + xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data); + xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask, + oldtup->t_data->t_infomask2); + + /* Prepare WAL data for the new page */ + xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self); + xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data); + + bufflags = REGBUF_STANDARD; + if (init) + bufflags |= REGBUF_WILL_INIT; + if (need_tuple_data) + bufflags |= REGBUF_KEEP_DATA; + + XLogRegisterBuffer(0, newbuf, bufflags); + if (oldbuf != newbuf) + XLogRegisterBuffer(1, oldbuf, REGBUF_STANDARD); + + XLogRegisterData((char *) &xlrec, SizeOfHeapUpdate); + + /* + * Prepare WAL data for the new tuple. + */ + if (prefixlen > 0 || suffixlen > 0) + { + if (prefixlen > 0 && suffixlen > 0) + { + prefix_suffix[0] = prefixlen; + prefix_suffix[1] = suffixlen; + XLogRegisterBufData(0, (char *) &prefix_suffix, sizeof(uint16) * 2); + } + else if (prefixlen > 0) + { + XLogRegisterBufData(0, (char *) &prefixlen, sizeof(uint16)); + } + else + { + XLogRegisterBufData(0, (char *) &suffixlen, sizeof(uint16)); + } + } + + xlhdr.t_infomask2 = newtup->t_data->t_infomask2; + xlhdr.t_infomask = newtup->t_data->t_infomask; + xlhdr.t_hoff = newtup->t_data->t_hoff; + Assert(SizeofHeapTupleHeader + prefixlen + suffixlen <= newtup->t_len); + + /* + * PG73FORMAT: write bitmap [+ padding] [+ oid] + data + * + * The 'data' doesn't include the common prefix or suffix. + */ + XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader); + if (prefixlen == 0) + { + XLogRegisterBufData(0, + ((char *) newtup->t_data) + SizeofHeapTupleHeader, + newtup->t_len - SizeofHeapTupleHeader - suffixlen); + } + else + { + /* + * Have to write the null bitmap and data after the common prefix as + * two separate rdata entries. + */ + /* bitmap [+ padding] [+ oid] */ + if (newtup->t_data->t_hoff - SizeofHeapTupleHeader > 0) + { + XLogRegisterBufData(0, + ((char *) newtup->t_data) + SizeofHeapTupleHeader, + newtup->t_data->t_hoff - SizeofHeapTupleHeader); + } + + /* data after common prefix */ + XLogRegisterBufData(0, + ((char *) newtup->t_data) + newtup->t_data->t_hoff + prefixlen, + newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen); + } + + /* We need to log a tuple identity */ + if (need_tuple_data && old_key_tuple) + { + /* don't really need this, but its more comfy to decode */ + xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2; + xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask; + xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff; + + XLogRegisterData((char *) &xlhdr_idx, SizeOfHeapHeader); + + /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */ + XLogRegisterData((char *) old_key_tuple->t_data + SizeofHeapTupleHeader, + old_key_tuple->t_len - SizeofHeapTupleHeader); + } + + /* filtering by origin on a row level is much more efficient */ + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); + + recptr = XLogInsert(RM_HEAP_ID, info); + + return recptr; +} + +/* + * Perform XLogInsert of an XLOG_HEAP2_NEW_CID record + * + * This is only used in wal_level >= WAL_LEVEL_LOGICAL, and only for catalog + * tuples. 
+ */ +static XLogRecPtr +log_heap_new_cid(Relation relation, HeapTuple tup) +{ + xl_heap_new_cid xlrec; + + XLogRecPtr recptr; + HeapTupleHeader hdr = tup->t_data; + + Assert(ItemPointerIsValid(&tup->t_self)); + Assert(tup->t_tableOid != InvalidOid); + + xlrec.top_xid = GetTopTransactionId(); + xlrec.target_node = relation->rd_node; + xlrec.target_tid = tup->t_self; + + /* + * If the tuple got inserted & deleted in the same TX we definitely have a + * combo CID, set cmin and cmax. + */ + if (hdr->t_infomask & HEAP_COMBOCID) + { + Assert(!(hdr->t_infomask & HEAP_XMAX_INVALID)); + Assert(!HeapTupleHeaderXminInvalid(hdr)); + xlrec.cmin = HeapTupleHeaderGetCmin(hdr); + xlrec.cmax = HeapTupleHeaderGetCmax(hdr); + xlrec.combocid = HeapTupleHeaderGetRawCommandId(hdr); + } + /* No combo CID, so only cmin or cmax can be set by this TX */ + else + { + /* + * Tuple inserted. + * + * We need to check for LOCK ONLY because multixacts might be + * transferred to the new tuple in case of FOR KEY SHARE updates in + * which case there will be an xmax, although the tuple just got + * inserted. + */ + if (hdr->t_infomask & HEAP_XMAX_INVALID || + HEAP_XMAX_IS_LOCKED_ONLY(hdr->t_infomask)) + { + xlrec.cmin = HeapTupleHeaderGetRawCommandId(hdr); + xlrec.cmax = InvalidCommandId; + } + /* Tuple from a different tx updated or deleted. */ + else + { + xlrec.cmin = InvalidCommandId; + xlrec.cmax = HeapTupleHeaderGetRawCommandId(hdr); + + } + xlrec.combocid = InvalidCommandId; + } + + /* + * Note that we don't need to register the buffer here, because this + * operation does not modify the page. The insert/update/delete that + * called us certainly did, but that's WAL-logged separately. + */ + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapNewCid); + + /* will be looked at irrespective of origin */ + + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_NEW_CID); + + return recptr; +} + +/* + * Build a heap tuple representing the configured REPLICA IDENTITY to represent + * the old tuple in a UPDATE or DELETE. + * + * Returns NULL if there's no need to log an identity or if there's no suitable + * key defined. + * + * Pass key_required true if any replica identity columns changed value, or if + * any of them have any external data. Delete must always pass true. + * + * *copy is set to true if the returned tuple is a modified copy rather than + * the same tuple that was passed in. + */ +static HeapTuple +ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required, + bool *copy) +{ + TupleDesc desc = RelationGetDescr(relation); + char replident = relation->rd_rel->relreplident; + Bitmapset *idattrs; + HeapTuple key_tuple; + bool nulls[MaxHeapAttributeNumber]; + Datum values[MaxHeapAttributeNumber]; + + *copy = false; + + if (!RelationIsLogicallyLogged(relation)) + return NULL; + + if (replident == REPLICA_IDENTITY_NOTHING) + return NULL; + + if (replident == REPLICA_IDENTITY_FULL) + { + /* + * When logging the entire old tuple, it very well could contain + * toasted columns. If so, force them to be inlined. + */ + if (HeapTupleHasExternal(tp)) + { + *copy = true; + tp = toast_flatten_tuple(tp, desc); + } + return tp; + } + + /* if the key isn't required and we're only logging the key, we're done */ + if (!key_required) + return NULL; + + /* find out the replica identity columns */ + idattrs = RelationGetIndexAttrBitmap(relation, + INDEX_ATTR_BITMAP_IDENTITY_KEY); + + /* + * If there's no defined replica identity columns, treat as !key_required. 
+ * (This case should not be reachable from heap_update, since that should + * calculate key_required accurately. But heap_delete just passes + * constant true for key_required, so we can hit this case in deletes.) + */ + if (bms_is_empty(idattrs)) + return NULL; + + /* + * Construct a new tuple containing only the replica identity columns, + * with nulls elsewhere. While we're at it, assert that the replica + * identity columns aren't null. + */ + heap_deform_tuple(tp, desc, values, nulls); + + for (int i = 0; i < desc->natts; i++) + { + if (bms_is_member(i + 1 - FirstLowInvalidHeapAttributeNumber, + idattrs)) + Assert(!nulls[i]); + else + nulls[i] = true; + } + + key_tuple = heap_form_tuple(desc, values, nulls); + *copy = true; + + bms_free(idattrs); + + /* + * If the tuple, which by here only contains indexed columns, still has + * toasted columns, force them to be inlined. This is somewhat unlikely + * since there's limits on the size of indexed columns, so we don't + * duplicate toast_flatten_tuple()s functionality in the above loop over + * the indexed columns, even if it would be more efficient. + */ + if (HeapTupleHasExternal(key_tuple)) + { + HeapTuple oldtup = key_tuple; + + key_tuple = toast_flatten_tuple(oldtup, desc); + heap_freetuple(oldtup); + } + + return key_tuple; +} + +/* + * Handles XLOG_HEAP2_PRUNE record type. + * + * Acquires a super-exclusive lock. + */ +static void +heap_xlog_prune(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_prune *xlrec = (xl_heap_prune *) XLogRecGetData(record); + Buffer buffer; + RelFileNode rnode; + BlockNumber blkno; + XLogRedoAction action; + + XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno); + + /* + * We're about to remove tuples. In Hot Standby mode, ensure that there's + * no queries running for which the removed tuples are still visible. + */ + if (InHotStandby) + ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, rnode); + + /* + * If we have a full-page image, restore it (using a cleanup lock) and + * we're done. + */ + action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, + &buffer); + if (action == BLK_NEEDS_REDO) + { + Page page = (Page) BufferGetPage(buffer); + OffsetNumber *end; + OffsetNumber *redirected; + OffsetNumber *nowdead; + OffsetNumber *nowunused; + int nredirected; + int ndead; + int nunused; + Size datalen; + + redirected = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen); + + nredirected = xlrec->nredirected; + ndead = xlrec->ndead; + end = (OffsetNumber *) ((char *) redirected + datalen); + nowdead = redirected + (nredirected * 2); + nowunused = nowdead + ndead; + nunused = (end - nowunused); + Assert(nunused >= 0); + + /* Update all line pointers per the record, and repair fragmentation */ + heap_page_prune_execute(buffer, + redirected, nredirected, + nowdead, ndead, + nowunused, nunused); + + /* + * Note: we don't worry about updating the page's prunability hints. + * At worst this will cause an extra prune cycle to occur soon. + */ + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + + if (BufferIsValid(buffer)) + { + Size freespace = PageGetHeapFreeSpace(BufferGetPage(buffer)); + + UnlockReleaseBuffer(buffer); + + /* + * After pruning records from a page, it's useful to update the FSM + * about it, as it may cause the page become target for insertions + * later even if vacuum decides not to visit it (which is possible if + * gets marked all-visible.) 
+ * + * Do this regardless of a full-page image being applied, since the + * FSM data is not in the page anyway. + */ + XLogRecordPageWithFreeSpace(rnode, blkno, freespace); + } +} + +/* + * Handles XLOG_HEAP2_VACUUM record type. + * + * Acquires an exclusive lock only. + */ +static void +heap_xlog_vacuum(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_vacuum *xlrec = (xl_heap_vacuum *) XLogRecGetData(record); + Buffer buffer; + BlockNumber blkno; + XLogRedoAction action; + + /* + * If we have a full-page image, restore it (without using a cleanup lock) + * and we're done. + */ + action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, false, + &buffer); + if (action == BLK_NEEDS_REDO) + { + Page page = (Page) BufferGetPage(buffer); + OffsetNumber *nowunused; + Size datalen; + OffsetNumber *offnum; + + nowunused = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen); + + /* Shouldn't be a record unless there's something to do */ + Assert(xlrec->nunused > 0); + + /* Update all now-unused line pointers */ + offnum = nowunused; + for (int i = 0; i < xlrec->nunused; i++) + { + OffsetNumber off = *offnum++; + ItemId lp = PageGetItemId(page, off); + + Assert(ItemIdIsDead(lp) && !ItemIdHasStorage(lp)); + ItemIdSetUnused(lp); + } + + /* Attempt to truncate line pointer array now */ + PageTruncateLinePointerArray(page); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + + if (BufferIsValid(buffer)) + { + Size freespace = PageGetHeapFreeSpace(BufferGetPage(buffer)); + RelFileNode rnode; + + XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno); + + UnlockReleaseBuffer(buffer); + + /* + * After vacuuming LP_DEAD items from a page, it's useful to update + * the FSM about it, as it may cause the page become target for + * insertions later even if vacuum decides not to visit it (which is + * possible if gets marked all-visible.) + * + * Do this regardless of a full-page image being applied, since the + * FSM data is not in the page anyway. + */ + XLogRecordPageWithFreeSpace(rnode, blkno, freespace); + } +} + +/* + * Replay XLOG_HEAP2_VISIBLE record. + * + * The critical integrity requirement here is that we must never end up with + * a situation where the visibility map bit is set, and the page-level + * PD_ALL_VISIBLE bit is clear. If that were to occur, then a subsequent + * page modification would fail to clear the visibility map bit. + */ +static void +heap_xlog_visible(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_visible *xlrec = (xl_heap_visible *) XLogRecGetData(record); + Buffer vmbuffer = InvalidBuffer; + Buffer buffer; + Page page; + RelFileNode rnode; + BlockNumber blkno; + XLogRedoAction action; + + XLogRecGetBlockTag(record, 1, &rnode, NULL, &blkno); + + /* + * If there are any Hot Standby transactions running that have an xmin + * horizon old enough that this page isn't all-visible for them, they + * might incorrectly decide that an index-only scan can skip a heap fetch. + * + * NB: It might be better to throw some kind of "soft" conflict here that + * forces any index-only scan that is in flight to perform heap fetches, + * rather than killing the transaction outright. + */ + if (InHotStandby) + ResolveRecoveryConflictWithSnapshot(xlrec->cutoff_xid, rnode); + + /* + * Read the heap page, if it still exists. If the heap file has dropped or + * truncated later in recovery, we don't need to update the page, but we'd + * better still update the visibility map. 
+ */ + action = XLogReadBufferForRedo(record, 1, &buffer); + if (action == BLK_NEEDS_REDO) + { + /* + * We don't bump the LSN of the heap page when setting the visibility + * map bit (unless checksums or wal_hint_bits is enabled, in which + * case we must), because that would generate an unworkable volume of + * full-page writes. This exposes us to torn page hazards, but since + * we're not inspecting the existing page contents in any way, we + * don't care. + * + * However, all operations that clear the visibility map bit *do* bump + * the LSN, and those operations will only be replayed if the XLOG LSN + * follows the page LSN. Thus, if the page LSN has advanced past our + * XLOG record's LSN, we mustn't mark the page all-visible, because + * the subsequent update won't be replayed to clear the flag. + */ + page = BufferGetPage(buffer); + + PageSetAllVisible(page); + + MarkBufferDirty(buffer); + } + else if (action == BLK_RESTORED) + { + /* + * If heap block was backed up, we already restored it and there's + * nothing more to do. (This can only happen with checksums or + * wal_log_hints enabled.) + */ + } + + if (BufferIsValid(buffer)) + { + Size space = PageGetFreeSpace(BufferGetPage(buffer)); + + UnlockReleaseBuffer(buffer); + + /* + * Since FSM is not WAL-logged and only updated heuristically, it + * easily becomes stale in standbys. If the standby is later promoted + * and runs VACUUM, it will skip updating individual free space + * figures for pages that became all-visible (or all-frozen, depending + * on the vacuum mode,) which is troublesome when FreeSpaceMapVacuum + * propagates too optimistic free space values to upper FSM layers; + * later inserters try to use such pages only to find out that they + * are unusable. This can cause long stalls when there are many such + * pages. + * + * Forestall those problems by updating FSM's idea about a page that + * is becoming all-visible or all-frozen. + * + * Do this regardless of a full-page image being applied, since the + * FSM data is not in the page anyway. + */ + if (xlrec->flags & VISIBILITYMAP_VALID_BITS) + XLogRecordPageWithFreeSpace(rnode, blkno, space); + } + + /* + * Even if we skipped the heap page update due to the LSN interlock, it's + * still safe to update the visibility map. Any WAL record that clears + * the visibility map bit does so before checking the page LSN, so any + * bits that need to be cleared will still be cleared. + */ + if (XLogReadBufferForRedoExtended(record, 0, RBM_ZERO_ON_ERROR, false, + &vmbuffer) == BLK_NEEDS_REDO) + { + Page vmpage = BufferGetPage(vmbuffer); + Relation reln; + + /* initialize the page if it was read as zeros */ + if (PageIsNew(vmpage)) + PageInit(vmpage, BLCKSZ, 0); + + /* + * XLogReadBufferForRedoExtended locked the buffer. But + * visibilitymap_set will handle locking itself. + */ + LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK); + + reln = CreateFakeRelcacheEntry(rnode); + visibilitymap_pin(reln, blkno, &vmbuffer); + + /* + * Don't set the bit if replay has already passed this point. + * + * It might be safe to do this unconditionally; if replay has passed + * this point, we'll replay at least as far this time as we did + * before, and if this bit needs to be cleared, the record responsible + * for doing so should be again replayed, and clear it. For right + * now, out of an abundance of conservatism, we use the same test here + * we did for the heap page. If this results in a dropped bit, no + * real harm is done; and the next VACUUM will fix it. 
+ */ + if (lsn > PageGetLSN(vmpage)) + visibilitymap_set(reln, blkno, InvalidBuffer, lsn, vmbuffer, + xlrec->cutoff_xid, xlrec->flags); + + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + else if (BufferIsValid(vmbuffer)) + UnlockReleaseBuffer(vmbuffer); +} + +/* + * Replay XLOG_HEAP2_FREEZE_PAGE records + */ +static void +heap_xlog_freeze_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) XLogRecGetData(record); + TransactionId cutoff_xid = xlrec->cutoff_xid; + Buffer buffer; + int ntup; + + /* + * In Hot Standby mode, ensure that there's no queries running which still + * consider the frozen xids as running. + */ + if (InHotStandby) + { + RelFileNode rnode; + TransactionId latestRemovedXid = cutoff_xid; + + TransactionIdRetreat(latestRemovedXid); + + XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL); + ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode); + } + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + Page page = BufferGetPage(buffer); + xl_heap_freeze_tuple *tuples; + + tuples = (xl_heap_freeze_tuple *) XLogRecGetBlockData(record, 0, NULL); + + /* now execute freeze plan for each frozen tuple */ + for (ntup = 0; ntup < xlrec->ntuples; ntup++) + { + xl_heap_freeze_tuple *xlrec_tp; + ItemId lp; + HeapTupleHeader tuple; + + xlrec_tp = &tuples[ntup]; + lp = PageGetItemId(page, xlrec_tp->offset); /* offsets are one-based */ + tuple = (HeapTupleHeader) PageGetItem(page, lp); + + heap_execute_freeze_tuple(tuple, xlrec_tp); + } + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +/* + * Given an "infobits" field from an XLog record, set the correct bits in the + * given infomask and infomask2 for the tuple touched by the record. + * + * (This is the reverse of compute_infobits). + */ +static void +fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2) +{ + *infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | + HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK); + *infomask2 &= ~HEAP_KEYS_UPDATED; + + if (infobits & XLHL_XMAX_IS_MULTI) + *infomask |= HEAP_XMAX_IS_MULTI; + if (infobits & XLHL_XMAX_LOCK_ONLY) + *infomask |= HEAP_XMAX_LOCK_ONLY; + if (infobits & XLHL_XMAX_EXCL_LOCK) + *infomask |= HEAP_XMAX_EXCL_LOCK; + /* note HEAP_XMAX_SHR_LOCK isn't considered here */ + if (infobits & XLHL_XMAX_KEYSHR_LOCK) + *infomask |= HEAP_XMAX_KEYSHR_LOCK; + + if (infobits & XLHL_KEYS_UPDATED) + *infomask2 |= HEAP_KEYS_UPDATED; +} + +static void +heap_xlog_delete(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_delete *xlrec = (xl_heap_delete *) XLogRecGetData(record); + Buffer buffer; + Page page; + ItemId lp = NULL; + HeapTupleHeader htup; + BlockNumber blkno; + RelFileNode target_node; + ItemPointerData target_tid; + + XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno); + ItemPointerSetBlockNumber(&target_tid, blkno); + ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. 
+ */ + if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED) + { + Relation reln = CreateFakeRelcacheEntry(target_node); + Buffer vmbuffer = InvalidBuffer; + + visibilitymap_pin(reln, blkno, &vmbuffer); + visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = BufferGetPage(buffer); + + if (PageGetMaxOffsetNumber(page) >= xlrec->offnum) + lp = PageGetItemId(page, xlrec->offnum); + + if (PageGetMaxOffsetNumber(page) < xlrec->offnum || !ItemIdIsNormal(lp)) + elog(PANIC, "invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; + HeapTupleHeaderClearHotUpdated(htup); + fix_infomask_from_infobits(xlrec->infobits_set, + &htup->t_infomask, &htup->t_infomask2); + if (!(xlrec->flags & XLH_DELETE_IS_SUPER)) + HeapTupleHeaderSetXmax(htup, xlrec->xmax); + else + HeapTupleHeaderSetXmin(htup, InvalidTransactionId); + HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + + /* Mark the page as a candidate for pruning */ + PageSetPrunable(page, XLogRecGetXid(record)); + + if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED) + PageClearAllVisible(page); + + /* Make sure t_ctid is set correctly */ + if (xlrec->flags & XLH_DELETE_IS_PARTITION_MOVE) + HeapTupleHeaderSetMovedPartitions(htup); + else + htup->t_ctid = target_tid; + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +static void +heap_xlog_insert(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_insert *xlrec = (xl_heap_insert *) XLogRecGetData(record); + Buffer buffer; + Page page; + union + { + HeapTupleHeaderData hdr; + char data[MaxHeapTupleSize]; + } tbuf; + HeapTupleHeader htup; + xl_heap_header xlhdr; + uint32 newlen; + Size freespace = 0; + RelFileNode target_node; + BlockNumber blkno; + ItemPointerData target_tid; + XLogRedoAction action; + + XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno); + ItemPointerSetBlockNumber(&target_tid, blkno); + ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. + */ + if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) + { + Relation reln = CreateFakeRelcacheEntry(target_node); + Buffer vmbuffer = InvalidBuffer; + + visibilitymap_pin(reln, blkno, &vmbuffer); + visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + /* + * If we inserted the first and only tuple on the page, re-initialize the + * page from scratch. 
+ */ + if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) + { + buffer = XLogInitBufferForRedo(record, 0); + page = BufferGetPage(buffer); + PageInit(page, BufferGetPageSize(buffer), 0); + action = BLK_NEEDS_REDO; + } + else + action = XLogReadBufferForRedo(record, 0, &buffer); + if (action == BLK_NEEDS_REDO) + { + Size datalen; + char *data; + + page = BufferGetPage(buffer); + + if (PageGetMaxOffsetNumber(page) + 1 < xlrec->offnum) + elog(PANIC, "invalid max offset number"); + + data = XLogRecGetBlockData(record, 0, &datalen); + + newlen = datalen - SizeOfHeapHeader; + Assert(datalen > SizeOfHeapHeader && newlen <= MaxHeapTupleSize); + memcpy((char *) &xlhdr, data, SizeOfHeapHeader); + data += SizeOfHeapHeader; + + htup = &tbuf.hdr; + MemSet((char *) htup, 0, SizeofHeapTupleHeader); + /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */ + memcpy((char *) htup + SizeofHeapTupleHeader, + data, + newlen); + newlen += SizeofHeapTupleHeader; + htup->t_infomask2 = xlhdr.t_infomask2; + htup->t_infomask = xlhdr.t_infomask; + htup->t_hoff = xlhdr.t_hoff; + HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); + HeapTupleHeaderSetCmin(htup, FirstCommandId); + htup->t_ctid = target_tid; + + if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum, + true, true) == InvalidOffsetNumber) + elog(PANIC, "failed to add tuple"); + + freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ + + PageSetLSN(page, lsn); + + if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) + PageClearAllVisible(page); + + /* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */ + if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET) + PageSetAllVisible(page); + + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* + * If the page is running low on free space, update the FSM as well. + * Arbitrarily, our definition of "low" is less than 20%. We can't do much + * better than that without knowing the fill-factor for the table. + * + * XXX: Don't do this if the page was restored from full page image. We + * don't bother to update the FSM in that case, it doesn't need to be + * totally accurate anyway. + */ + if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5) + XLogRecordPageWithFreeSpace(target_node, blkno, freespace); +} + +/* + * Handles MULTI_INSERT record type. + */ +static void +heap_xlog_multi_insert(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_multi_insert *xlrec; + RelFileNode rnode; + BlockNumber blkno; + Buffer buffer; + Page page; + union + { + HeapTupleHeaderData hdr; + char data[MaxHeapTupleSize]; + } tbuf; + HeapTupleHeader htup; + uint32 newlen; + Size freespace = 0; + int i; + bool isinit = (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) != 0; + XLogRedoAction action; + + /* + * Insertion doesn't overwrite MVCC data, so no conflict processing is + * required. + */ + xlrec = (xl_heap_multi_insert *) XLogRecGetData(record); + + XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno); + + /* check that the mutually exclusive flags are not both set */ + Assert(!((xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) && + (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET))); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. 
+ */ + if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) + { + Relation reln = CreateFakeRelcacheEntry(rnode); + Buffer vmbuffer = InvalidBuffer; + + visibilitymap_pin(reln, blkno, &vmbuffer); + visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + if (isinit) + { + buffer = XLogInitBufferForRedo(record, 0); + page = BufferGetPage(buffer); + PageInit(page, BufferGetPageSize(buffer), 0); + action = BLK_NEEDS_REDO; + } + else + action = XLogReadBufferForRedo(record, 0, &buffer); + if (action == BLK_NEEDS_REDO) + { + char *tupdata; + char *endptr; + Size len; + + /* Tuples are stored as block data */ + tupdata = XLogRecGetBlockData(record, 0, &len); + endptr = tupdata + len; + + page = (Page) BufferGetPage(buffer); + + for (i = 0; i < xlrec->ntuples; i++) + { + OffsetNumber offnum; + xl_multi_insert_tuple *xlhdr; + + /* + * If we're reinitializing the page, the tuples are stored in + * order from FirstOffsetNumber. Otherwise there's an array of + * offsets in the WAL record, and the tuples come after that. + */ + if (isinit) + offnum = FirstOffsetNumber + i; + else + offnum = xlrec->offsets[i]; + if (PageGetMaxOffsetNumber(page) + 1 < offnum) + elog(PANIC, "invalid max offset number"); + + xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(tupdata); + tupdata = ((char *) xlhdr) + SizeOfMultiInsertTuple; + + newlen = xlhdr->datalen; + Assert(newlen <= MaxHeapTupleSize); + htup = &tbuf.hdr; + MemSet((char *) htup, 0, SizeofHeapTupleHeader); + /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */ + memcpy((char *) htup + SizeofHeapTupleHeader, + (char *) tupdata, + newlen); + tupdata += newlen; + + newlen += SizeofHeapTupleHeader; + htup->t_infomask2 = xlhdr->t_infomask2; + htup->t_infomask = xlhdr->t_infomask; + htup->t_hoff = xlhdr->t_hoff; + HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); + HeapTupleHeaderSetCmin(htup, FirstCommandId); + ItemPointerSetBlockNumber(&htup->t_ctid, blkno); + ItemPointerSetOffsetNumber(&htup->t_ctid, offnum); + + offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true); + if (offnum == InvalidOffsetNumber) + elog(PANIC, "failed to add tuple"); + } + if (tupdata != endptr) + elog(PANIC, "total tuple length mismatch"); + + freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ + + PageSetLSN(page, lsn); + + if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) + PageClearAllVisible(page); + + /* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */ + if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET) + PageSetAllVisible(page); + + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* + * If the page is running low on free space, update the FSM as well. + * Arbitrarily, our definition of "low" is less than 20%. We can't do much + * better than that without knowing the fill-factor for the table. + * + * XXX: Don't do this if the page was restored from full page image. We + * don't bother to update the FSM in that case, it doesn't need to be + * totally accurate anyway. 
+ */ + if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5) + XLogRecordPageWithFreeSpace(rnode, blkno, freespace); +} + +/* + * Handles UPDATE and HOT_UPDATE + */ +static void +heap_xlog_update(XLogReaderState *record, bool hot_update) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_update *xlrec = (xl_heap_update *) XLogRecGetData(record); + RelFileNode rnode; + BlockNumber oldblk; + BlockNumber newblk; + ItemPointerData newtid; + Buffer obuffer, + nbuffer; + Page page; + OffsetNumber offnum; + ItemId lp = NULL; + HeapTupleData oldtup; + HeapTupleHeader htup; + uint16 prefixlen = 0, + suffixlen = 0; + char *newp; + union + { + HeapTupleHeaderData hdr; + char data[MaxHeapTupleSize]; + } tbuf; + xl_heap_header xlhdr; + uint32 newlen; + Size freespace = 0; + XLogRedoAction oldaction; + XLogRedoAction newaction; + + /* initialize to keep the compiler quiet */ + oldtup.t_data = NULL; + oldtup.t_len = 0; + + XLogRecGetBlockTag(record, 0, &rnode, NULL, &newblk); + if (XLogRecGetBlockTag(record, 1, NULL, NULL, &oldblk)) + { + /* HOT updates are never done across pages */ + Assert(!hot_update); + } + else + oldblk = newblk; + + ItemPointerSet(&newtid, newblk, xlrec->new_offnum); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. + */ + if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) + { + Relation reln = CreateFakeRelcacheEntry(rnode); + Buffer vmbuffer = InvalidBuffer; + + visibilitymap_pin(reln, oldblk, &vmbuffer); + visibilitymap_clear(reln, oldblk, vmbuffer, VISIBILITYMAP_VALID_BITS); + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + /* + * In normal operation, it is important to lock the two pages in + * page-number order, to avoid possible deadlocks against other update + * operations going the other way. However, during WAL replay there can + * be no other update happening, so we don't need to worry about that. But + * we *do* need to worry that we don't expose an inconsistent state to Hot + * Standby queries --- so the original page can't be unlocked before we've + * added the new tuple to the new page. + */ + + /* Deal with old tuple version */ + oldaction = XLogReadBufferForRedo(record, (oldblk == newblk) ? 0 : 1, + &obuffer); + if (oldaction == BLK_NEEDS_REDO) + { + page = BufferGetPage(obuffer); + offnum = xlrec->old_offnum; + if (PageGetMaxOffsetNumber(page) >= offnum) + lp = PageGetItemId(page, offnum); + + if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + elog(PANIC, "invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + oldtup.t_data = htup; + oldtup.t_len = ItemIdGetLength(lp); + + htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; + if (hot_update) + HeapTupleHeaderSetHotUpdated(htup); + else + HeapTupleHeaderClearHotUpdated(htup); + fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask, + &htup->t_infomask2); + HeapTupleHeaderSetXmax(htup, xlrec->old_xmax); + HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + /* Set forward chain link in t_ctid */ + htup->t_ctid = newtid; + + /* Mark the page as a candidate for pruning */ + PageSetPrunable(page, XLogRecGetXid(record)); + + if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) + PageClearAllVisible(page); + + PageSetLSN(page, lsn); + MarkBufferDirty(obuffer); + } + + /* + * Read the page the new tuple goes into, if different from old. 
+ */ + if (oldblk == newblk) + { + nbuffer = obuffer; + newaction = oldaction; + } + else if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) + { + nbuffer = XLogInitBufferForRedo(record, 0); + page = (Page) BufferGetPage(nbuffer); + PageInit(page, BufferGetPageSize(nbuffer), 0); + newaction = BLK_NEEDS_REDO; + } + else + newaction = XLogReadBufferForRedo(record, 0, &nbuffer); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. + */ + if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) + { + Relation reln = CreateFakeRelcacheEntry(rnode); + Buffer vmbuffer = InvalidBuffer; + + visibilitymap_pin(reln, newblk, &vmbuffer); + visibilitymap_clear(reln, newblk, vmbuffer, VISIBILITYMAP_VALID_BITS); + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + /* Deal with new tuple */ + if (newaction == BLK_NEEDS_REDO) + { + char *recdata; + char *recdata_end; + Size datalen; + Size tuplen; + + recdata = XLogRecGetBlockData(record, 0, &datalen); + recdata_end = recdata + datalen; + + page = BufferGetPage(nbuffer); + + offnum = xlrec->new_offnum; + if (PageGetMaxOffsetNumber(page) + 1 < offnum) + elog(PANIC, "invalid max offset number"); + + if (xlrec->flags & XLH_UPDATE_PREFIX_FROM_OLD) + { + Assert(newblk == oldblk); + memcpy(&prefixlen, recdata, sizeof(uint16)); + recdata += sizeof(uint16); + } + if (xlrec->flags & XLH_UPDATE_SUFFIX_FROM_OLD) + { + Assert(newblk == oldblk); + memcpy(&suffixlen, recdata, sizeof(uint16)); + recdata += sizeof(uint16); + } + + memcpy((char *) &xlhdr, recdata, SizeOfHeapHeader); + recdata += SizeOfHeapHeader; + + tuplen = recdata_end - recdata; + Assert(tuplen <= MaxHeapTupleSize); + + htup = &tbuf.hdr; + MemSet((char *) htup, 0, SizeofHeapTupleHeader); + + /* + * Reconstruct the new tuple using the prefix and/or suffix from the + * old tuple, and the data stored in the WAL record. 
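+		 *
+		 * The reconstructed tuple is laid out as: null bitmap [+ padding]
+		 * [+ oid] taken from the WAL record, then prefixlen bytes copied
+		 * from the old tuple, then the changed middle portion from the
+		 * WAL record, and finally suffixlen bytes copied from the end of
+		 * the old tuple (prefixlen and/or suffixlen may be zero).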
+ */ + newp = (char *) htup + SizeofHeapTupleHeader; + if (prefixlen > 0) + { + int len; + + /* copy bitmap [+ padding] [+ oid] from WAL record */ + len = xlhdr.t_hoff - SizeofHeapTupleHeader; + memcpy(newp, recdata, len); + recdata += len; + newp += len; + + /* copy prefix from old tuple */ + memcpy(newp, (char *) oldtup.t_data + oldtup.t_data->t_hoff, prefixlen); + newp += prefixlen; + + /* copy new tuple data from WAL record */ + len = tuplen - (xlhdr.t_hoff - SizeofHeapTupleHeader); + memcpy(newp, recdata, len); + recdata += len; + newp += len; + } + else + { + /* + * copy bitmap [+ padding] [+ oid] + data from record, all in one + * go + */ + memcpy(newp, recdata, tuplen); + recdata += tuplen; + newp += tuplen; + } + Assert(recdata == recdata_end); + + /* copy suffix from old tuple */ + if (suffixlen > 0) + memcpy(newp, (char *) oldtup.t_data + oldtup.t_len - suffixlen, suffixlen); + + newlen = SizeofHeapTupleHeader + tuplen + prefixlen + suffixlen; + htup->t_infomask2 = xlhdr.t_infomask2; + htup->t_infomask = xlhdr.t_infomask; + htup->t_hoff = xlhdr.t_hoff; + + HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); + HeapTupleHeaderSetCmin(htup, FirstCommandId); + HeapTupleHeaderSetXmax(htup, xlrec->new_xmax); + /* Make sure there is no forward chain link in t_ctid */ + htup->t_ctid = newtid; + + offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true); + if (offnum == InvalidOffsetNumber) + elog(PANIC, "failed to add tuple"); + + if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) + PageClearAllVisible(page); + + freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ + + PageSetLSN(page, lsn); + MarkBufferDirty(nbuffer); + } + + if (BufferIsValid(nbuffer) && nbuffer != obuffer) + UnlockReleaseBuffer(nbuffer); + if (BufferIsValid(obuffer)) + UnlockReleaseBuffer(obuffer); + + /* + * If the new page is running low on free space, update the FSM as well. + * Arbitrarily, our definition of "low" is less than 20%. We can't do much + * better than that without knowing the fill-factor for the table. + * + * However, don't update the FSM on HOT updates, because after crash + * recovery, either the old or the new tuple will certainly be dead and + * prunable. After pruning, the page will have roughly as much free space + * as it did before the update, assuming the new tuple is about the same + * size as the old one. + * + * XXX: Don't do this if the page was restored from full page image. We + * don't bother to update the FSM in that case, it doesn't need to be + * totally accurate anyway. 
+ */ + if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < BLCKSZ / 5) + XLogRecordPageWithFreeSpace(rnode, newblk, freespace); +} + +static void +heap_xlog_confirm(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_confirm *xlrec = (xl_heap_confirm *) XLogRecGetData(record); + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId lp = NULL; + HeapTupleHeader htup; + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = BufferGetPage(buffer); + + offnum = xlrec->offnum; + if (PageGetMaxOffsetNumber(page) >= offnum) + lp = PageGetItemId(page, offnum); + + if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + elog(PANIC, "invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + /* + * Confirm tuple as actually inserted + */ + ItemPointerSet(&htup->t_ctid, BufferGetBlockNumber(buffer), offnum); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +static void +heap_xlog_lock(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_lock *xlrec = (xl_heap_lock *) XLogRecGetData(record); + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId lp = NULL; + HeapTupleHeader htup; + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. + */ + if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED) + { + RelFileNode rnode; + Buffer vmbuffer = InvalidBuffer; + BlockNumber block; + Relation reln; + + XLogRecGetBlockTag(record, 0, &rnode, NULL, &block); + reln = CreateFakeRelcacheEntry(rnode); + + visibilitymap_pin(reln, block, &vmbuffer); + visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN); + + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = (Page) BufferGetPage(buffer); + + offnum = xlrec->offnum; + if (PageGetMaxOffsetNumber(page) >= offnum) + lp = PageGetItemId(page, offnum); + + if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + elog(PANIC, "invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; + fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask, + &htup->t_infomask2); + + /* + * Clear relevant update flags, but only if the modified infomask says + * there's no update. + */ + if (HEAP_XMAX_IS_LOCKED_ONLY(htup->t_infomask)) + { + HeapTupleHeaderClearHotUpdated(htup); + /* Make sure there is no forward chain link in t_ctid */ + ItemPointerSet(&htup->t_ctid, + BufferGetBlockNumber(buffer), + offnum); + } + HeapTupleHeaderSetXmax(htup, xlrec->locking_xid); + HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +static void +heap_xlog_lock_updated(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_lock_updated *xlrec; + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId lp = NULL; + HeapTupleHeader htup; + + xlrec = (xl_heap_lock_updated *) XLogRecGetData(record); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. 
+ */ + if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED) + { + RelFileNode rnode; + Buffer vmbuffer = InvalidBuffer; + BlockNumber block; + Relation reln; + + XLogRecGetBlockTag(record, 0, &rnode, NULL, &block); + reln = CreateFakeRelcacheEntry(rnode); + + visibilitymap_pin(reln, block, &vmbuffer); + visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN); + + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = BufferGetPage(buffer); + + offnum = xlrec->offnum; + if (PageGetMaxOffsetNumber(page) >= offnum) + lp = PageGetItemId(page, offnum); + + if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + elog(PANIC, "invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; + fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask, + &htup->t_infomask2); + HeapTupleHeaderSetXmax(htup, xlrec->xmax); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +static void +heap_xlog_inplace(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_inplace *xlrec = (xl_heap_inplace *) XLogRecGetData(record); + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId lp = NULL; + HeapTupleHeader htup; + uint32 oldlen; + Size newlen; + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + char *newtup = XLogRecGetBlockData(record, 0, &newlen); + + page = BufferGetPage(buffer); + + offnum = xlrec->offnum; + if (PageGetMaxOffsetNumber(page) >= offnum) + lp = PageGetItemId(page, offnum); + + if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + elog(PANIC, "invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + oldlen = ItemIdGetLength(lp) - htup->t_hoff; + if (oldlen != newlen) + elog(PANIC, "wrong tuple length"); + + memcpy((char *) htup + htup->t_hoff, newtup, newlen); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +void +heap_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + /* + * These operations don't overwrite MVCC data so no conflict processing is + * required. The ones in heap2 rmgr do. + */ + + switch (info & XLOG_HEAP_OPMASK) + { + case XLOG_HEAP_INSERT: + heap_xlog_insert(record); + break; + case XLOG_HEAP_DELETE: + heap_xlog_delete(record); + break; + case XLOG_HEAP_UPDATE: + heap_xlog_update(record, false); + break; + case XLOG_HEAP_TRUNCATE: + + /* + * TRUNCATE is a no-op because the actions are already logged as + * SMGR WAL records. TRUNCATE WAL record only exists for logical + * decoding. 
+ */ + break; + case XLOG_HEAP_HOT_UPDATE: + heap_xlog_update(record, true); + break; + case XLOG_HEAP_CONFIRM: + heap_xlog_confirm(record); + break; + case XLOG_HEAP_LOCK: + heap_xlog_lock(record); + break; + case XLOG_HEAP_INPLACE: + heap_xlog_inplace(record); + break; + default: + elog(PANIC, "heap_redo: unknown op code %u", info); + } +} + +void +heap2_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info & XLOG_HEAP_OPMASK) + { + case XLOG_HEAP2_PRUNE: + heap_xlog_prune(record); + break; + case XLOG_HEAP2_VACUUM: + heap_xlog_vacuum(record); + break; + case XLOG_HEAP2_FREEZE_PAGE: + heap_xlog_freeze_page(record); + break; + case XLOG_HEAP2_VISIBLE: + heap_xlog_visible(record); + break; + case XLOG_HEAP2_MULTI_INSERT: + heap_xlog_multi_insert(record); + break; + case XLOG_HEAP2_LOCK_UPDATED: + heap_xlog_lock_updated(record); + break; + case XLOG_HEAP2_NEW_CID: + + /* + * Nothing to do on a real replay, only used during logical + * decoding. + */ + break; + case XLOG_HEAP2_REWRITE: + heap_xlog_logical_rewrite(record); + break; + default: + elog(PANIC, "heap2_redo: unknown op code %u", info); + } +} + +/* + * Mask a heap page before performing consistency checks on it. + */ +void +heap_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + OffsetNumber off; + + mask_page_lsn_and_checksum(page); + + mask_page_hint_bits(page); + mask_unused_space(page); + + for (off = 1; off <= PageGetMaxOffsetNumber(page); off++) + { + ItemId iid = PageGetItemId(page, off); + char *page_item; + + page_item = (char *) (page + ItemIdGetOffset(iid)); + + if (ItemIdIsNormal(iid)) + { + HeapTupleHeader page_htup = (HeapTupleHeader) page_item; + + /* + * If xmin of a tuple is not yet frozen, we should ignore + * differences in hint bits, since they can be set without + * emitting WAL. + */ + if (!HeapTupleHeaderXminFrozen(page_htup)) + page_htup->t_infomask &= ~HEAP_XACT_MASK; + else + { + /* Still we need to mask xmax hint bits. */ + page_htup->t_infomask &= ~HEAP_XMAX_INVALID; + page_htup->t_infomask &= ~HEAP_XMAX_COMMITTED; + } + + /* + * During replay, we set Command Id to FirstCommandId. Hence, mask + * it. See heap_xlog_insert() for details. + */ + page_htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER; + + /* + * For a speculative tuple, heap_insert() does not set ctid in the + * caller-passed heap tuple itself, leaving the ctid field to + * contain a speculative token value - a per-backend monotonically + * increasing identifier. Besides, it does not WAL-log ctid under + * any circumstances. + * + * During redo, heap_xlog_insert() sets t_ctid to current block + * number and self offset number. It doesn't care about any + * speculative insertions on the primary. Hence, we set t_ctid to + * current block number and self offset number to ignore any + * inconsistency. + */ + if (HeapTupleHeaderIsSpeculative(page_htup)) + ItemPointerSet(&page_htup->t_ctid, blkno, off); + + /* + * NB: Not ignoring ctid changes due to the tuple having moved + * (i.e. HeapTupleHeaderIndicatesMovedPartitions), because that's + * important information that needs to be in-sync between primary + * and standby, and thus is WAL logged. + */ + } + + /* + * Ignore any padding bytes after the tuple, when the length of the + * item is not MAXALIGNed. 
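+ *
+ * A worked example, assuming the usual 8-byte MAXALIGN: for an item of
+ * length 61, MAXALIGN(61) is 64, so padlen comes out as 3 and the
+ * three trailing bytes are overwritten, i.e. effectively
+ *
+ *     memset(page_item + 61, MASK_MARKER, 3);
+ *
+ * since those bytes carry no tuple data and need not match between
+ * primary and standby.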
+ */ + if (ItemIdHasStorage(iid)) + { + int len = ItemIdGetLength(iid); + int padlen = MAXALIGN(len) - len; + + if (padlen > 0) + memset(page_item + len, MASK_MARKER, padlen); + } + } +} + +/* + * HeapCheckForSerializableConflictOut + * We are reading a tuple. If it's not visible, there may be a + * rw-conflict out with the inserter. Otherwise, if it is visible to us + * but has been deleted, there may be a rw-conflict out with the deleter. + * + * We will determine the top level xid of the writing transaction with which + * we may be in conflict, and ask CheckForSerializableConflictOut() to check + * for overlap with our own transaction. + * + * This function should be called just about anywhere in heapam.c where a + * tuple has been read. The caller must hold at least a shared lock on the + * buffer, because this function might set hint bits on the tuple. There is + * currently no known reason to call this function from an index AM. + */ +void +HeapCheckForSerializableConflictOut(bool visible, Relation relation, + HeapTuple tuple, Buffer buffer, + Snapshot snapshot) +{ + TransactionId xid; + HTSV_Result htsvResult; + + if (!CheckForSerializableConflictOutNeeded(relation, snapshot)) + return; + + /* + * Check to see whether the tuple has been written to by a concurrent + * transaction, either to create it not visible to us, or to delete it + * while it is visible to us. The "visible" bool indicates whether the + * tuple is visible to us, while HeapTupleSatisfiesVacuum checks what else + * is going on with it. + * + * In the event of a concurrently inserted tuple that also happens to have + * been concurrently updated (by a separate transaction), the xmin of the + * tuple will be used -- not the updater's xid. + */ + htsvResult = HeapTupleSatisfiesVacuum(tuple, TransactionXmin, buffer); + switch (htsvResult) + { + case HEAPTUPLE_LIVE: + if (visible) + return; + xid = HeapTupleHeaderGetXmin(tuple->t_data); + break; + case HEAPTUPLE_RECENTLY_DEAD: + case HEAPTUPLE_DELETE_IN_PROGRESS: + if (visible) + xid = HeapTupleHeaderGetUpdateXid(tuple->t_data); + else + xid = HeapTupleHeaderGetXmin(tuple->t_data); + + if (TransactionIdPrecedes(xid, TransactionXmin)) + { + /* This is like the HEAPTUPLE_DEAD case */ + Assert(!visible); + return; + } + break; + case HEAPTUPLE_INSERT_IN_PROGRESS: + xid = HeapTupleHeaderGetXmin(tuple->t_data); + break; + case HEAPTUPLE_DEAD: + Assert(!visible); + return; + default: + + /* + * The only way to get to this default clause is if a new value is + * added to the enum type without adding it to this switch + * statement. That's a bug, so elog. + */ + elog(ERROR, "unrecognized return value from HeapTupleSatisfiesVacuum: %u", htsvResult); + + /* + * In spite of having all enum values covered and calling elog on + * this default, some compilers think this is a code path which + * allows xid to be used below without initialization. Silence + * that warning. + */ + xid = InvalidTransactionId; + } + + Assert(TransactionIdIsValid(xid)); + Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin)); + + /* + * Find top level xid. Bail out if xid is too early to be a conflict, or + * if it's our own xid. 
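+ *
+ * For example, with hypothetical xids: if the tuple was written by
+ * subtransaction 1205 of top-level transaction 1200, the conflict must
+ * be checked against 1200, which is what SubTransGetTopmostTransaction()
+ * returns; and if 1200 precedes TransactionXmin it is too old to be a
+ * conflict, so we simply return.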
+ */ + if (TransactionIdEquals(xid, GetTopTransactionIdIfAny())) + return; + xid = SubTransGetTopmostTransaction(xid); + if (TransactionIdPrecedes(xid, TransactionXmin)) + return; + + CheckForSerializableConflictOut(relation, xid, snapshot); +} diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c new file mode 100644 index 0000000..6633939 --- /dev/null +++ b/src/backend/access/heap/heapam_handler.c @@ -0,0 +1,2608 @@ +/*------------------------------------------------------------------------- + * + * heapam_handler.c + * heap table access method code + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/heap/heapam_handler.c + * + * + * NOTES + * This files wires up the lower level heapam.c et al routines with the + * tableam abstraction. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/heapam.h" +#include "access/heaptoast.h" +#include "access/multixact.h" +#include "access/rewriteheap.h" +#include "access/syncscan.h" +#include "access/tableam.h" +#include "access/tsmapi.h" +#include "access/xact.h" +#include "catalog/catalog.h" +#include "catalog/index.h" +#include "catalog/storage.h" +#include "catalog/storage_xlog.h" +#include "commands/progress.h" +#include "executor/executor.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/bufmgr.h" +#include "storage/bufpage.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "storage/procarray.h" +#include "storage/smgr.h" +#include "utils/builtins.h" +#include "utils/rel.h" + +static void reform_and_rewrite_tuple(HeapTuple tuple, + Relation OldHeap, Relation NewHeap, + Datum *values, bool *isnull, RewriteState rwstate); + +static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, + HeapTuple tuple, + OffsetNumber tupoffset); + +static BlockNumber heapam_scan_get_blocks_done(HeapScanDesc hscan); + +static const TableAmRoutine heapam_methods; + + +/* ------------------------------------------------------------------------ + * Slot related callbacks for heap AM + * ------------------------------------------------------------------------ + */ + +static const TupleTableSlotOps * +heapam_slot_callbacks(Relation relation) +{ + return &TTSOpsBufferHeapTuple; +} + + +/* ------------------------------------------------------------------------ + * Index Scan Callbacks for heap AM + * ------------------------------------------------------------------------ + */ + +static IndexFetchTableData * +heapam_index_fetch_begin(Relation rel) +{ + IndexFetchHeapData *hscan = palloc0(sizeof(IndexFetchHeapData)); + + hscan->xs_base.rel = rel; + hscan->xs_cbuf = InvalidBuffer; + + return &hscan->xs_base; +} + +static void +heapam_index_fetch_reset(IndexFetchTableData *scan) +{ + IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan; + + if (BufferIsValid(hscan->xs_cbuf)) + { + ReleaseBuffer(hscan->xs_cbuf); + hscan->xs_cbuf = InvalidBuffer; + } +} + +static void +heapam_index_fetch_end(IndexFetchTableData *scan) +{ + IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan; + + heapam_index_fetch_reset(scan); + + pfree(hscan); +} + +static bool +heapam_index_fetch_tuple(struct IndexFetchTableData *scan, + ItemPointer tid, + Snapshot snapshot, + TupleTableSlot *slot, + bool *call_again, bool *all_dead) +{ + IndexFetchHeapData 
*hscan = (IndexFetchHeapData *) scan; + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + bool got_heap_tuple; + + Assert(TTS_IS_BUFFERTUPLE(slot)); + + /* We can skip the buffer-switching logic if we're in mid-HOT chain. */ + if (!*call_again) + { + /* Switch to correct buffer if we don't have it already */ + Buffer prev_buf = hscan->xs_cbuf; + + hscan->xs_cbuf = ReleaseAndReadBuffer(hscan->xs_cbuf, + hscan->xs_base.rel, + ItemPointerGetBlockNumber(tid)); + + /* + * Prune page, but only if we weren't already on this page + */ + if (prev_buf != hscan->xs_cbuf) + heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf); + } + + /* Obtain share-lock on the buffer so we can examine visibility */ + LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_SHARE); + got_heap_tuple = heap_hot_search_buffer(tid, + hscan->xs_base.rel, + hscan->xs_cbuf, + snapshot, + &bslot->base.tupdata, + all_dead, + !*call_again); + bslot->base.tupdata.t_self = *tid; + LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_UNLOCK); + + if (got_heap_tuple) + { + /* + * Only in a non-MVCC snapshot can more than one member of the HOT + * chain be visible. + */ + *call_again = !IsMVCCSnapshot(snapshot); + + slot->tts_tableOid = RelationGetRelid(scan->rel); + ExecStoreBufferHeapTuple(&bslot->base.tupdata, slot, hscan->xs_cbuf); + } + else + { + /* We've reached the end of the HOT chain. */ + *call_again = false; + } + + return got_heap_tuple; +} + + +/* ------------------------------------------------------------------------ + * Callbacks for non-modifying operations on individual tuples for heap AM + * ------------------------------------------------------------------------ + */ + +static bool +heapam_fetch_row_version(Relation relation, + ItemPointer tid, + Snapshot snapshot, + TupleTableSlot *slot) +{ + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + Buffer buffer; + + Assert(TTS_IS_BUFFERTUPLE(slot)); + + bslot->base.tupdata.t_self = *tid; + if (heap_fetch(relation, snapshot, &bslot->base.tupdata, &buffer)) + { + /* store in slot, transferring existing pin */ + ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, slot, buffer); + slot->tts_tableOid = RelationGetRelid(relation); + + return true; + } + + return false; +} + +static bool +heapam_tuple_tid_valid(TableScanDesc scan, ItemPointer tid) +{ + HeapScanDesc hscan = (HeapScanDesc) scan; + + return ItemPointerIsValid(tid) && + ItemPointerGetBlockNumber(tid) < hscan->rs_nblocks; +} + +static bool +heapam_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot, + Snapshot snapshot) +{ + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + bool res; + + Assert(TTS_IS_BUFFERTUPLE(slot)); + Assert(BufferIsValid(bslot->buffer)); + + /* + * We need buffer pin and lock to call HeapTupleSatisfiesVisibility. + * Caller should be holding pin, but not lock. + */ + LockBuffer(bslot->buffer, BUFFER_LOCK_SHARE); + res = HeapTupleSatisfiesVisibility(bslot->base.tuple, snapshot, + bslot->buffer); + LockBuffer(bslot->buffer, BUFFER_LOCK_UNLOCK); + + return res; +} + + +/* ---------------------------------------------------------------------------- + * Functions for manipulations of physical tuples for heap AM. 
+ * ---------------------------------------------------------------------------- + */ + +static void +heapam_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid, + int options, BulkInsertState bistate) +{ + bool shouldFree = true; + HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); + + /* Update the tuple with table oid */ + slot->tts_tableOid = RelationGetRelid(relation); + tuple->t_tableOid = slot->tts_tableOid; + + /* Perform the insertion, and copy the resulting ItemPointer */ + heap_insert(relation, tuple, cid, options, bistate); + ItemPointerCopy(&tuple->t_self, &slot->tts_tid); + + if (shouldFree) + pfree(tuple); +} + +static void +heapam_tuple_insert_speculative(Relation relation, TupleTableSlot *slot, + CommandId cid, int options, + BulkInsertState bistate, uint32 specToken) +{ + bool shouldFree = true; + HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); + + /* Update the tuple with table oid */ + slot->tts_tableOid = RelationGetRelid(relation); + tuple->t_tableOid = slot->tts_tableOid; + + HeapTupleHeaderSetSpeculativeToken(tuple->t_data, specToken); + options |= HEAP_INSERT_SPECULATIVE; + + /* Perform the insertion, and copy the resulting ItemPointer */ + heap_insert(relation, tuple, cid, options, bistate); + ItemPointerCopy(&tuple->t_self, &slot->tts_tid); + + if (shouldFree) + pfree(tuple); +} + +static void +heapam_tuple_complete_speculative(Relation relation, TupleTableSlot *slot, + uint32 specToken, bool succeeded) +{ + bool shouldFree = true; + HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); + + /* adjust the tuple's state accordingly */ + if (succeeded) + heap_finish_speculative(relation, &slot->tts_tid); + else + heap_abort_speculative(relation, &slot->tts_tid); + + if (shouldFree) + pfree(tuple); +} + +static TM_Result +heapam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid, + Snapshot snapshot, Snapshot crosscheck, bool wait, + TM_FailureData *tmfd, bool changingPart) +{ + /* + * Currently Deleting of index tuples are handled at vacuum, in case if + * the storage itself is cleaning the dead tuples by itself, it is the + * time to call the index tuple deletion also. + */ + return heap_delete(relation, tid, cid, crosscheck, wait, tmfd, changingPart); +} + + +static TM_Result +heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, + CommandId cid, Snapshot snapshot, Snapshot crosscheck, + bool wait, TM_FailureData *tmfd, + LockTupleMode *lockmode, bool *update_indexes) +{ + bool shouldFree = true; + HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); + TM_Result result; + + /* Update the tuple with table oid */ + slot->tts_tableOid = RelationGetRelid(relation); + tuple->t_tableOid = slot->tts_tableOid; + + result = heap_update(relation, otid, tuple, cid, crosscheck, wait, + tmfd, lockmode); + ItemPointerCopy(&tuple->t_self, &slot->tts_tid); + + /* + * Decide whether new index entries are needed for the tuple + * + * Note: heap_update returns the tid (location) of the new tuple in the + * t_self field. + * + * If it's a HOT update, we mustn't insert new index entries. 
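+ *
+ * Put differently, as a summary of the expression below: only a
+ * successful (TM_Ok) non-HOT update sets *update_indexes.  After a HOT
+ * update the new version is reachable from the existing index entries
+ * via the HOT chain, and after a failed update there is no new version
+ * at all, so neither case wants index insertions.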
+ */ + *update_indexes = result == TM_Ok && !HeapTupleIsHeapOnly(tuple); + + if (shouldFree) + pfree(tuple); + + return result; +} + +static TM_Result +heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, + TupleTableSlot *slot, CommandId cid, LockTupleMode mode, + LockWaitPolicy wait_policy, uint8 flags, + TM_FailureData *tmfd) +{ + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + TM_Result result; + Buffer buffer; + HeapTuple tuple = &bslot->base.tupdata; + bool follow_updates; + + follow_updates = (flags & TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS) != 0; + tmfd->traversed = false; + + Assert(TTS_IS_BUFFERTUPLE(slot)); + +tuple_lock_retry: + tuple->t_self = *tid; + result = heap_lock_tuple(relation, tuple, cid, mode, wait_policy, + follow_updates, &buffer, tmfd); + + if (result == TM_Updated && + (flags & TUPLE_LOCK_FLAG_FIND_LAST_VERSION)) + { + /* Should not encounter speculative tuple on recheck */ + Assert(!HeapTupleHeaderIsSpeculative(tuple->t_data)); + + ReleaseBuffer(buffer); + + if (!ItemPointerEquals(&tmfd->ctid, &tuple->t_self)) + { + SnapshotData SnapshotDirty; + TransactionId priorXmax; + + /* it was updated, so look at the updated version */ + *tid = tmfd->ctid; + /* updated row should have xmin matching this xmax */ + priorXmax = tmfd->xmax; + + /* signal that a tuple later in the chain is getting locked */ + tmfd->traversed = true; + + /* + * fetch target tuple + * + * Loop here to deal with updated or busy tuples + */ + InitDirtySnapshot(SnapshotDirty); + for (;;) + { + if (ItemPointerIndicatesMovedPartitions(tid)) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("tuple to be locked was already moved to another partition due to concurrent update"))); + + tuple->t_self = *tid; + if (heap_fetch_extended(relation, &SnapshotDirty, tuple, + &buffer, true)) + { + /* + * If xmin isn't what we're expecting, the slot must have + * been recycled and reused for an unrelated tuple. This + * implies that the latest version of the row was deleted, + * so we need do nothing. (Should be safe to examine xmin + * without getting buffer's content lock. We assume + * reading a TransactionId to be atomic, and Xmin never + * changes in an existing tuple, except to invalid or + * frozen, and neither of those can match priorXmax.) + */ + if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data), + priorXmax)) + { + ReleaseBuffer(buffer); + return TM_Deleted; + } + + /* otherwise xmin should not be dirty... */ + if (TransactionIdIsValid(SnapshotDirty.xmin)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("t_xmin %u is uncommitted in tuple (%u,%u) to be updated in table \"%s\"", + SnapshotDirty.xmin, + ItemPointerGetBlockNumber(&tuple->t_self), + ItemPointerGetOffsetNumber(&tuple->t_self), + RelationGetRelationName(relation)))); + + /* + * If tuple is being updated by other transaction then we + * have to wait for its commit/abort, or die trying. 
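+ *
+ * "Die trying" depends on the wait policy handled below: LockWaitBlock
+ * sleeps on the updater's xid, LockWaitSkip gives up and returns
+ * TM_WouldBlock, and LockWaitError raises an error.  (At the SQL level
+ * these correspond, for example, to a plain FOR UPDATE, SKIP LOCKED
+ * and NOWAIT respectively.)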
+ */ + if (TransactionIdIsValid(SnapshotDirty.xmax)) + { + ReleaseBuffer(buffer); + switch (wait_policy) + { + case LockWaitBlock: + XactLockTableWait(SnapshotDirty.xmax, + relation, &tuple->t_self, + XLTW_FetchUpdated); + break; + case LockWaitSkip: + if (!ConditionalXactLockTableWait(SnapshotDirty.xmax)) + /* skip instead of waiting */ + return TM_WouldBlock; + break; + case LockWaitError: + if (!ConditionalXactLockTableWait(SnapshotDirty.xmax)) + ereport(ERROR, + (errcode(ERRCODE_LOCK_NOT_AVAILABLE), + errmsg("could not obtain lock on row in relation \"%s\"", + RelationGetRelationName(relation)))); + break; + } + continue; /* loop back to repeat heap_fetch */ + } + + /* + * If tuple was inserted by our own transaction, we have + * to check cmin against cid: cmin >= current CID means + * our command cannot see the tuple, so we should ignore + * it. Otherwise heap_lock_tuple() will throw an error, + * and so would any later attempt to update or delete the + * tuple. (We need not check cmax because + * HeapTupleSatisfiesDirty will consider a tuple deleted + * by our transaction dead, regardless of cmax.) We just + * checked that priorXmax == xmin, so we can test that + * variable instead of doing HeapTupleHeaderGetXmin again. + */ + if (TransactionIdIsCurrentTransactionId(priorXmax) && + HeapTupleHeaderGetCmin(tuple->t_data) >= cid) + { + tmfd->xmax = priorXmax; + + /* + * Cmin is the problematic value, so store that. See + * above. + */ + tmfd->cmax = HeapTupleHeaderGetCmin(tuple->t_data); + ReleaseBuffer(buffer); + return TM_SelfModified; + } + + /* + * This is a live tuple, so try to lock it again. + */ + ReleaseBuffer(buffer); + goto tuple_lock_retry; + } + + /* + * If the referenced slot was actually empty, the latest + * version of the row must have been deleted, so we need do + * nothing. + */ + if (tuple->t_data == NULL) + { + Assert(!BufferIsValid(buffer)); + return TM_Deleted; + } + + /* + * As above, if xmin isn't what we're expecting, do nothing. + */ + if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data), + priorXmax)) + { + ReleaseBuffer(buffer); + return TM_Deleted; + } + + /* + * If we get here, the tuple was found but failed + * SnapshotDirty. Assuming the xmin is either a committed xact + * or our own xact (as it certainly should be if we're trying + * to modify the tuple), this must mean that the row was + * updated or deleted by either a committed xact or our own + * xact. If it was deleted, we can ignore it; if it was + * updated then chain up to the next version and repeat the + * whole process. + * + * As above, it should be safe to examine xmax and t_ctid + * without the buffer content lock, because they can't be + * changing. We'd better hold a buffer pin though. 
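+ *
+ * A concrete illustration of the test below, with hypothetical TIDs: a
+ * tuple at (5,3) whose t_ctid is also (5,3) was deleted outright, so
+ * there is no newer version and we return TM_Deleted; if t_ctid is
+ * (7,1) instead, the row was updated, so we follow the chain to (7,1)
+ * and expect that version's xmin to match the xmax we just saw.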
+ */ + if (ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid)) + { + /* deleted, so forget about it */ + ReleaseBuffer(buffer); + return TM_Deleted; + } + + /* updated, so look at the updated row */ + *tid = tuple->t_data->t_ctid; + /* updated row should have xmin matching this xmax */ + priorXmax = HeapTupleHeaderGetUpdateXid(tuple->t_data); + ReleaseBuffer(buffer); + /* loop back to fetch next in chain */ + } + } + else + { + /* tuple was deleted, so give up */ + return TM_Deleted; + } + } + + slot->tts_tableOid = RelationGetRelid(relation); + tuple->t_tableOid = slot->tts_tableOid; + + /* store in slot, transferring existing pin */ + ExecStorePinnedBufferHeapTuple(tuple, slot, buffer); + + return result; +} + + +/* ------------------------------------------------------------------------ + * DDL related callbacks for heap AM. + * ------------------------------------------------------------------------ + */ + +static void +heapam_relation_set_new_filenode(Relation rel, + const RelFileNode *newrnode, + char persistence, + TransactionId *freezeXid, + MultiXactId *minmulti) +{ + SMgrRelation srel; + + /* + * Initialize to the minimum XID that could put tuples in the table. We + * know that no xacts older than RecentXmin are still running, so that + * will do. + */ + *freezeXid = RecentXmin; + + /* + * Similarly, initialize the minimum Multixact to the first value that + * could possibly be stored in tuples in the table. Running transactions + * could reuse values from their local cache, so we are careful to + * consider all currently running multis. + * + * XXX this could be refined further, but is it worth the hassle? + */ + *minmulti = GetOldestMultiXactId(); + + srel = RelationCreateStorage(*newrnode, persistence); + + /* + * If required, set up an init fork for an unlogged table so that it can + * be correctly reinitialized on restart. An immediate sync is required + * even if the page has been logged, because the write did not go through + * shared_buffers and therefore a concurrent checkpoint may have moved the + * redo pointer past our xlog record. Recovery may as well remove it + * while replaying, for example, XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE + * record. Therefore, logging is necessary even if wal_level=minimal. + */ + if (persistence == RELPERSISTENCE_UNLOGGED) + { + Assert(rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_MATVIEW || + rel->rd_rel->relkind == RELKIND_TOASTVALUE); + smgrcreate(srel, INIT_FORKNUM, false); + log_smgrcreate(newrnode, INIT_FORKNUM); + smgrimmedsync(srel, INIT_FORKNUM); + } + + smgrclose(srel); +} + +static void +heapam_relation_nontransactional_truncate(Relation rel) +{ + RelationTruncate(rel, 0); +} + +static void +heapam_relation_copy_data(Relation rel, const RelFileNode *newrnode) +{ + SMgrRelation dstrel; + + dstrel = smgropen(*newrnode, rel->rd_backend); + RelationOpenSmgr(rel); + + /* + * Since we copy the file directly without looking at the shared buffers, + * we'd better first flush out any pages of the source relation that are + * in shared buffers. We assume no new changes will be made while we are + * holding exclusive lock on the rel. + */ + FlushRelationBuffers(rel); + + /* + * Create and copy all forks of the relation, and schedule unlinking of + * old physical files. + * + * NOTE: any conflict in relfilenode value will be caught in + * RelationCreateStorage(). 
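+ *
+ * For an ordinary heap the forks copied below are typically the main
+ * fork plus, when present, the free space map and visibility map (and
+ * the init fork of an unlogged table); rather than hard-coding that
+ * list, the loop simply probes each fork number with smgrexists().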
+ */ + RelationCreateStorage(*newrnode, rel->rd_rel->relpersistence); + + /* copy main fork */ + RelationCopyStorage(rel->rd_smgr, dstrel, MAIN_FORKNUM, + rel->rd_rel->relpersistence); + + /* copy those extra forks that exist */ + for (ForkNumber forkNum = MAIN_FORKNUM + 1; + forkNum <= MAX_FORKNUM; forkNum++) + { + if (smgrexists(rel->rd_smgr, forkNum)) + { + smgrcreate(dstrel, forkNum, false); + + /* + * WAL log creation if the relation is persistent, or this is the + * init fork of an unlogged relation. + */ + if (RelationIsPermanent(rel) || + (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED && + forkNum == INIT_FORKNUM)) + log_smgrcreate(newrnode, forkNum); + RelationCopyStorage(rel->rd_smgr, dstrel, forkNum, + rel->rd_rel->relpersistence); + } + } + + + /* drop old relation, and close new one */ + RelationDropStorage(rel); + smgrclose(dstrel); +} + +static void +heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, + Relation OldIndex, bool use_sort, + TransactionId OldestXmin, + TransactionId *xid_cutoff, + MultiXactId *multi_cutoff, + double *num_tuples, + double *tups_vacuumed, + double *tups_recently_dead) +{ + RewriteState rwstate; + IndexScanDesc indexScan; + TableScanDesc tableScan; + HeapScanDesc heapScan; + bool is_system_catalog; + Tuplesortstate *tuplesort; + TupleDesc oldTupDesc = RelationGetDescr(OldHeap); + TupleDesc newTupDesc = RelationGetDescr(NewHeap); + TupleTableSlot *slot; + int natts; + Datum *values; + bool *isnull; + BufferHeapTupleTableSlot *hslot; + BlockNumber prev_cblock = InvalidBlockNumber; + + /* Remember if it's a system catalog */ + is_system_catalog = IsSystemRelation(OldHeap); + + /* + * Valid smgr_targblock implies something already wrote to the relation. + * This may be harmless, but this function hasn't planned for it. + */ + Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber); + + /* Preallocate values/isnull arrays */ + natts = newTupDesc->natts; + values = (Datum *) palloc(natts * sizeof(Datum)); + isnull = (bool *) palloc(natts * sizeof(bool)); + + /* Initialize the rewrite operation */ + rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, *xid_cutoff, + *multi_cutoff); + + + /* Set up sorting if wanted */ + if (use_sort) + tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex, + maintenance_work_mem, + NULL, false); + else + tuplesort = NULL; + + /* + * Prepare to scan the OldHeap. To ensure we see recently-dead tuples + * that still need to be copied, we scan with SnapshotAny and use + * HeapTupleSatisfiesVacuum for the visibility test. 
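+ *
+ * Roughly, per the switch on HeapTupleSatisfiesVacuum further down:
+ * LIVE and RECENTLY_DEAD tuples are copied, DEAD tuples are dropped
+ * after being reported to the rewrite module and counted in
+ * *tups_vacuumed, and the IN_PROGRESS cases are treated as live or
+ * recently dead after a sanity check, since under our exclusive lock
+ * they should normally be our own transaction's doing.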
+ */ + if (OldIndex != NULL && !use_sort) + { + const int ci_index[] = { + PROGRESS_CLUSTER_PHASE, + PROGRESS_CLUSTER_INDEX_RELID + }; + int64 ci_val[2]; + + /* Set phase and OIDOldIndex to columns */ + ci_val[0] = PROGRESS_CLUSTER_PHASE_INDEX_SCAN_HEAP; + ci_val[1] = RelationGetRelid(OldIndex); + pgstat_progress_update_multi_param(2, ci_index, ci_val); + + tableScan = NULL; + heapScan = NULL; + indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0); + index_rescan(indexScan, NULL, 0, NULL, 0); + } + else + { + /* In scan-and-sort mode and also VACUUM FULL, set phase */ + pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, + PROGRESS_CLUSTER_PHASE_SEQ_SCAN_HEAP); + + tableScan = table_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL); + heapScan = (HeapScanDesc) tableScan; + indexScan = NULL; + + /* Set total heap blocks */ + pgstat_progress_update_param(PROGRESS_CLUSTER_TOTAL_HEAP_BLKS, + heapScan->rs_nblocks); + } + + slot = table_slot_create(OldHeap, NULL); + hslot = (BufferHeapTupleTableSlot *) slot; + + /* + * Scan through the OldHeap, either in OldIndex order or sequentially; + * copy each tuple into the NewHeap, or transiently to the tuplesort + * module. Note that we don't bother sorting dead tuples (they won't get + * to the new table anyway). + */ + for (;;) + { + HeapTuple tuple; + Buffer buf; + bool isdead; + + CHECK_FOR_INTERRUPTS(); + + if (indexScan != NULL) + { + if (!index_getnext_slot(indexScan, ForwardScanDirection, slot)) + break; + + /* Since we used no scan keys, should never need to recheck */ + if (indexScan->xs_recheck) + elog(ERROR, "CLUSTER does not support lossy index conditions"); + } + else + { + if (!table_scan_getnextslot(tableScan, ForwardScanDirection, slot)) + { + /* + * If the last pages of the scan were empty, we would go to + * the next phase while heap_blks_scanned != heap_blks_total. + * Instead, to ensure that heap_blks_scanned is equivalent to + * total_heap_blks after the table scan phase, this parameter + * is manually updated to the correct value when the table + * scan finishes. + */ + pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED, + heapScan->rs_nblocks); + break; + } + + /* + * In scan-and-sort mode and also VACUUM FULL, set heap blocks + * scanned + * + * Note that heapScan may start at an offset and wrap around, i.e. + * rs_startblock may be >0, and rs_cblock may end with a number + * below rs_startblock. To prevent showing this wraparound to the + * user, we offset rs_cblock by rs_startblock (modulo rs_nblocks). + */ + if (prev_cblock != heapScan->rs_cblock) + { + pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED, + (heapScan->rs_cblock + + heapScan->rs_nblocks - + heapScan->rs_startblock + ) % heapScan->rs_nblocks + 1); + prev_cblock = heapScan->rs_cblock; + } + } + + tuple = ExecFetchSlotHeapTuple(slot, false, NULL); + buf = hslot->buffer; + + LockBuffer(buf, BUFFER_LOCK_SHARE); + + switch (HeapTupleSatisfiesVacuum(tuple, OldestXmin, buf)) + { + case HEAPTUPLE_DEAD: + /* Definitely dead */ + isdead = true; + break; + case HEAPTUPLE_RECENTLY_DEAD: + *tups_recently_dead += 1; + /* fall through */ + case HEAPTUPLE_LIVE: + /* Live or recently dead, must copy it */ + isdead = false; + break; + case HEAPTUPLE_INSERT_IN_PROGRESS: + + /* + * Since we hold exclusive lock on the relation, normally the + * only way to see this is if it was inserted earlier in our + * own transaction. However, it can happen in system + * catalogs, since we tend to release write lock before commit + * there. 
Give a warning if neither case applies; but in any + * case we had better copy it. + */ + if (!is_system_catalog && + !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data))) + elog(WARNING, "concurrent insert in progress within table \"%s\"", + RelationGetRelationName(OldHeap)); + /* treat as live */ + isdead = false; + break; + case HEAPTUPLE_DELETE_IN_PROGRESS: + + /* + * Similar situation to INSERT_IN_PROGRESS case. + */ + if (!is_system_catalog && + !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data))) + elog(WARNING, "concurrent delete in progress within table \"%s\"", + RelationGetRelationName(OldHeap)); + /* treat as recently dead */ + *tups_recently_dead += 1; + isdead = false; + break; + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + isdead = false; /* keep compiler quiet */ + break; + } + + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + if (isdead) + { + *tups_vacuumed += 1; + /* heap rewrite module still needs to see it... */ + if (rewrite_heap_dead_tuple(rwstate, tuple)) + { + /* A previous recently-dead tuple is now known dead */ + *tups_vacuumed += 1; + *tups_recently_dead -= 1; + } + continue; + } + + *num_tuples += 1; + if (tuplesort != NULL) + { + tuplesort_putheaptuple(tuplesort, tuple); + + /* + * In scan-and-sort mode, report increase in number of tuples + * scanned + */ + pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED, + *num_tuples); + } + else + { + const int ct_index[] = { + PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED, + PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN + }; + int64 ct_val[2]; + + reform_and_rewrite_tuple(tuple, OldHeap, NewHeap, + values, isnull, rwstate); + + /* + * In indexscan mode and also VACUUM FULL, report increase in + * number of tuples scanned and written + */ + ct_val[0] = *num_tuples; + ct_val[1] = *num_tuples; + pgstat_progress_update_multi_param(2, ct_index, ct_val); + } + } + + if (indexScan != NULL) + index_endscan(indexScan); + if (tableScan != NULL) + table_endscan(tableScan); + if (slot) + ExecDropSingleTupleTableSlot(slot); + + /* + * In scan-and-sort mode, complete the sort, then read out all live tuples + * from the tuplestore and write them to the new relation. + */ + if (tuplesort != NULL) + { + double n_tuples = 0; + + /* Report that we are now sorting tuples */ + pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, + PROGRESS_CLUSTER_PHASE_SORT_TUPLES); + + tuplesort_performsort(tuplesort); + + /* Report that we are now writing new heap */ + pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, + PROGRESS_CLUSTER_PHASE_WRITE_NEW_HEAP); + + for (;;) + { + HeapTuple tuple; + + CHECK_FOR_INTERRUPTS(); + + tuple = tuplesort_getheaptuple(tuplesort, true); + if (tuple == NULL) + break; + + n_tuples += 1; + reform_and_rewrite_tuple(tuple, + OldHeap, NewHeap, + values, isnull, + rwstate); + /* Report n_tuples */ + pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN, + n_tuples); + } + + tuplesort_end(tuplesort); + } + + /* Write out any remaining tuples, and fsync if needed */ + end_heap_rewrite(rwstate); + + /* Clean up */ + pfree(values); + pfree(isnull); +} + +static bool +heapam_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno, + BufferAccessStrategy bstrategy) +{ + HeapScanDesc hscan = (HeapScanDesc) scan; + + /* + * We must maintain a pin on the target page's buffer to ensure that + * concurrent activity - e.g. HOT pruning - doesn't delete tuples out from + * under us. Hence, pin the page until we are done looking at it. 
We + * also choose to hold sharelock on the buffer throughout --- we could + * release and re-acquire sharelock for each tuple, but since we aren't + * doing much work per tuple, the extra lock traffic is probably better + * avoided. + */ + hscan->rs_cblock = blockno; + hscan->rs_cindex = FirstOffsetNumber; + hscan->rs_cbuf = ReadBufferExtended(scan->rs_rd, MAIN_FORKNUM, + blockno, RBM_NORMAL, bstrategy); + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); + + /* in heap all blocks can contain tuples, so always return true */ + return true; +} + +static bool +heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin, + double *liverows, double *deadrows, + TupleTableSlot *slot) +{ + HeapScanDesc hscan = (HeapScanDesc) scan; + Page targpage; + OffsetNumber maxoffset; + BufferHeapTupleTableSlot *hslot; + + Assert(TTS_IS_BUFFERTUPLE(slot)); + + hslot = (BufferHeapTupleTableSlot *) slot; + targpage = BufferGetPage(hscan->rs_cbuf); + maxoffset = PageGetMaxOffsetNumber(targpage); + + /* Inner loop over all tuples on the selected page */ + for (; hscan->rs_cindex <= maxoffset; hscan->rs_cindex++) + { + ItemId itemid; + HeapTuple targtuple = &hslot->base.tupdata; + bool sample_it = false; + + itemid = PageGetItemId(targpage, hscan->rs_cindex); + + /* + * We ignore unused and redirect line pointers. DEAD line pointers + * should be counted as dead, because we need vacuum to run to get rid + * of them. Note that this rule agrees with the way that + * heap_page_prune() counts things. + */ + if (!ItemIdIsNormal(itemid)) + { + if (ItemIdIsDead(itemid)) + *deadrows += 1; + continue; + } + + ItemPointerSet(&targtuple->t_self, hscan->rs_cblock, hscan->rs_cindex); + + targtuple->t_tableOid = RelationGetRelid(scan->rs_rd); + targtuple->t_data = (HeapTupleHeader) PageGetItem(targpage, itemid); + targtuple->t_len = ItemIdGetLength(itemid); + + switch (HeapTupleSatisfiesVacuum(targtuple, OldestXmin, + hscan->rs_cbuf)) + { + case HEAPTUPLE_LIVE: + sample_it = true; + *liverows += 1; + break; + + case HEAPTUPLE_DEAD: + case HEAPTUPLE_RECENTLY_DEAD: + /* Count dead and recently-dead rows */ + *deadrows += 1; + break; + + case HEAPTUPLE_INSERT_IN_PROGRESS: + + /* + * Insert-in-progress rows are not counted. We assume that + * when the inserting transaction commits or aborts, it will + * send a stats message to increment the proper count. This + * works right only if that transaction ends after we finish + * analyzing the table; if things happen in the other order, + * its stats update will be overwritten by ours. However, the + * error will be large only if the other transaction runs long + * enough to insert many tuples, so assuming it will finish + * after us is the safer option. + * + * A special case is that the inserting transaction might be + * our own. In this case we should count and sample the row, + * to accommodate users who load a table and analyze it in one + * transaction. (pgstat_report_analyze has to adjust the + * numbers we send to the stats collector to make this come + * out right.) + */ + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(targtuple->t_data))) + { + sample_it = true; + *liverows += 1; + } + break; + + case HEAPTUPLE_DELETE_IN_PROGRESS: + + /* + * We count and sample delete-in-progress rows the same as + * live ones, so that the stats counters come out right if the + * deleting transaction commits after us, per the same + * reasoning given above. 
+ * + * If the delete was done by our own transaction, however, we + * must count the row as dead to make pgstat_report_analyze's + * stats adjustments come out right. (Note: this works out + * properly when the row was both inserted and deleted in our + * xact.) + * + * The net effect of these choices is that we act as though an + * IN_PROGRESS transaction hasn't happened yet, except if it + * is our own transaction, which we assume has happened. + * + * This approach ensures that we behave sanely if we see both + * the pre-image and post-image rows for a row being updated + * by a concurrent transaction: we will sample the pre-image + * but not the post-image. We also get sane results if the + * concurrent transaction never commits. + */ + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(targtuple->t_data))) + *deadrows += 1; + else + { + sample_it = true; + *liverows += 1; + } + break; + + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + break; + } + + if (sample_it) + { + ExecStoreBufferHeapTuple(targtuple, slot, hscan->rs_cbuf); + hscan->rs_cindex++; + + /* note that we leave the buffer locked here! */ + return true; + } + } + + /* Now release the lock and pin on the page */ + UnlockReleaseBuffer(hscan->rs_cbuf); + hscan->rs_cbuf = InvalidBuffer; + + /* also prevent old slot contents from having pin on page */ + ExecClearTuple(slot); + + return false; +} + +static double +heapam_index_build_range_scan(Relation heapRelation, + Relation indexRelation, + IndexInfo *indexInfo, + bool allow_sync, + bool anyvisible, + bool progress, + BlockNumber start_blockno, + BlockNumber numblocks, + IndexBuildCallback callback, + void *callback_state, + TableScanDesc scan) +{ + HeapScanDesc hscan; + bool is_system_catalog; + bool checking_uniqueness; + HeapTuple heapTuple; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + double reltuples; + ExprState *predicate; + TupleTableSlot *slot; + EState *estate; + ExprContext *econtext; + Snapshot snapshot; + bool need_unregister_snapshot = false; + TransactionId OldestXmin; + BlockNumber previous_blkno = InvalidBlockNumber; + BlockNumber root_blkno = InvalidBlockNumber; + OffsetNumber root_offsets[MaxHeapTuplesPerPage]; + + /* + * sanity checks + */ + Assert(OidIsValid(indexRelation->rd_rel->relam)); + + /* Remember if it's a system catalog */ + is_system_catalog = IsSystemRelation(heapRelation); + + /* See whether we're verifying uniqueness/exclusion properties */ + checking_uniqueness = (indexInfo->ii_Unique || + indexInfo->ii_ExclusionOps != NULL); + + /* + * "Any visible" mode is not compatible with uniqueness checks; make sure + * only one of those is requested. + */ + Assert(!(anyvisible && checking_uniqueness)); + + /* + * Need an EState for evaluation of index expressions and partial-index + * predicates. Also a slot to hold the current tuple. + */ + estate = CreateExecutorState(); + econtext = GetPerTupleExprContext(estate); + slot = table_slot_create(heapRelation, NULL); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* Set up execution state for predicate, if any. */ + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + + /* + * Prepare for scan of the base relation. In a normal index build, we use + * SnapshotAny because we must retrieve all tuples and do our own time + * qual checks (because we have to index RECENTLY_DEAD tuples). 
In a + * concurrent build, or during bootstrap, we take a regular MVCC snapshot + * and index whatever's live according to that. + */ + OldestXmin = InvalidTransactionId; + + /* okay to ignore lazy VACUUMs here */ + if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent) + OldestXmin = GetOldestNonRemovableTransactionId(heapRelation); + + if (!scan) + { + /* + * Serial index build. + * + * Must begin our own heap scan in this case. We may also need to + * register a snapshot whose lifetime is under our direct control. + */ + if (!TransactionIdIsValid(OldestXmin)) + { + snapshot = RegisterSnapshot(GetTransactionSnapshot()); + need_unregister_snapshot = true; + } + else + snapshot = SnapshotAny; + + scan = table_beginscan_strat(heapRelation, /* relation */ + snapshot, /* snapshot */ + 0, /* number of keys */ + NULL, /* scan key */ + true, /* buffer access strategy OK */ + allow_sync); /* syncscan OK? */ + } + else + { + /* + * Parallel index build. + * + * Parallel case never registers/unregisters own snapshot. Snapshot + * is taken from parallel heap scan, and is SnapshotAny or an MVCC + * snapshot, based on same criteria as serial case. + */ + Assert(!IsBootstrapProcessingMode()); + Assert(allow_sync); + snapshot = scan->rs_snapshot; + } + + hscan = (HeapScanDesc) scan; + + /* + * Must have called GetOldestNonRemovableTransactionId() if using + * SnapshotAny. Shouldn't have for an MVCC snapshot. (It's especially + * worth checking this for parallel builds, since ambuild routines that + * support parallel builds must work these details out for themselves.) + */ + Assert(snapshot == SnapshotAny || IsMVCCSnapshot(snapshot)); + Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) : + !TransactionIdIsValid(OldestXmin)); + Assert(snapshot == SnapshotAny || !anyvisible); + + /* Publish number of blocks to scan */ + if (progress) + { + BlockNumber nblocks; + + if (hscan->rs_base.rs_parallel != NULL) + { + ParallelBlockTableScanDesc pbscan; + + pbscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel; + nblocks = pbscan->phs_nblocks; + } + else + nblocks = hscan->rs_nblocks; + + pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL, + nblocks); + } + + /* set our scan endpoints */ + if (!allow_sync) + heap_setscanlimits(scan, start_blockno, numblocks); + else + { + /* syncscan can only be requested on whole relation */ + Assert(start_blockno == 0); + Assert(numblocks == InvalidBlockNumber); + } + + reltuples = 0; + + /* + * Scan all tuples in the base relation. + */ + while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL) + { + bool tupleIsAlive; + + CHECK_FOR_INTERRUPTS(); + + /* Report scan progress, if asked to. */ + if (progress) + { + BlockNumber blocks_done = heapam_scan_get_blocks_done(hscan); + + if (blocks_done != previous_blkno) + { + pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, + blocks_done); + previous_blkno = blocks_done; + } + } + + /* + * When dealing with a HOT-chain of updated tuples, we want to index + * the values of the live tuple (if any), but index it under the TID + * of the chain's root tuple. This approach is necessary to preserve + * the HOT-chain structure in the heap. So we need to be able to find + * the root item offset for every tuple that's in a HOT-chain. When + * first reaching a new page of the relation, call + * heap_get_root_tuples() to build a map of root item offsets on the + * page. + * + * It might look unsafe to use this information across buffer + * lock/unlock. 
However, we hold ShareLock on the table so no + * ordinary insert/update/delete should occur; and we hold pin on the + * buffer continuously while visiting the page, so no pruning + * operation can occur either. + * + * In cases with only ShareUpdateExclusiveLock on the table, it's + * possible for some HOT tuples to appear that we didn't know about + * when we first read the page. To handle that case, we re-obtain the + * list of root offsets when a HOT tuple points to a root item that we + * don't know about. + * + * Also, although our opinions about tuple liveness could change while + * we scan the page (due to concurrent transaction commits/aborts), + * the chain root locations won't, so this info doesn't need to be + * rebuilt after waiting for another transaction. + * + * Note the implied assumption that there is no more than one live + * tuple per HOT-chain --- else we could create more than one index + * entry pointing to the same root tuple. + */ + if (hscan->rs_cblock != root_blkno) + { + Page page = BufferGetPage(hscan->rs_cbuf); + + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); + heap_get_root_tuples(page, root_offsets); + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + root_blkno = hscan->rs_cblock; + } + + if (snapshot == SnapshotAny) + { + /* do our own time qual check */ + bool indexIt; + TransactionId xwait; + + recheck: + + /* + * We could possibly get away with not locking the buffer here, + * since caller should hold ShareLock on the relation, but let's + * be conservative about it. (This remark is still correct even + * with HOT-pruning: our pin on the buffer prevents pruning.) + */ + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); + + /* + * The criteria for counting a tuple as live in this block need to + * match what analyze.c's heapam_scan_analyze_next_tuple() does, + * otherwise CREATE INDEX and ANALYZE may produce wildly different + * reltuples values, e.g. when there are many recently-dead + * tuples. + */ + switch (HeapTupleSatisfiesVacuum(heapTuple, OldestXmin, + hscan->rs_cbuf)) + { + case HEAPTUPLE_DEAD: + /* Definitely dead, we can ignore it */ + indexIt = false; + tupleIsAlive = false; + break; + case HEAPTUPLE_LIVE: + /* Normal case, index and unique-check it */ + indexIt = true; + tupleIsAlive = true; + /* Count it as live, too */ + reltuples += 1; + break; + case HEAPTUPLE_RECENTLY_DEAD: + + /* + * If tuple is recently deleted then we must index it + * anyway to preserve MVCC semantics. (Pre-existing + * transactions could try to use the index after we finish + * building it, and may need to see such tuples.) + * + * However, if it was HOT-updated then we must only index + * the live tuple at the end of the HOT-chain. Since this + * breaks semantics for pre-existing snapshots, mark the + * index as unusable for them. + * + * We don't count recently-dead tuples in reltuples, even + * if we index them; see heapam_scan_analyze_next_tuple(). + */ + if (HeapTupleIsHotUpdated(heapTuple)) + { + indexIt = false; + /* mark the index as unsafe for old snapshots */ + indexInfo->ii_BrokenHotChain = true; + } + else + indexIt = true; + /* In any case, exclude the tuple from unique-checking */ + tupleIsAlive = false; + break; + case HEAPTUPLE_INSERT_IN_PROGRESS: + + /* + * In "anyvisible" mode, this tuple is visible and we + * don't need any further checks. 
+ */ + if (anyvisible) + { + indexIt = true; + tupleIsAlive = true; + reltuples += 1; + break; + } + + /* + * Since caller should hold ShareLock or better, normally + * the only way to see this is if it was inserted earlier + * in our own transaction. However, it can happen in + * system catalogs, since we tend to release write lock + * before commit there. Give a warning if neither case + * applies. + */ + xwait = HeapTupleHeaderGetXmin(heapTuple->t_data); + if (!TransactionIdIsCurrentTransactionId(xwait)) + { + if (!is_system_catalog) + elog(WARNING, "concurrent insert in progress within table \"%s\"", + RelationGetRelationName(heapRelation)); + + /* + * If we are performing uniqueness checks, indexing + * such a tuple could lead to a bogus uniqueness + * failure. In that case we wait for the inserting + * transaction to finish and check again. + */ + if (checking_uniqueness) + { + /* + * Must drop the lock on the buffer before we wait + */ + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + XactLockTableWait(xwait, heapRelation, + &heapTuple->t_self, + XLTW_InsertIndexUnique); + CHECK_FOR_INTERRUPTS(); + goto recheck; + } + } + else + { + /* + * For consistency with + * heapam_scan_analyze_next_tuple(), count + * HEAPTUPLE_INSERT_IN_PROGRESS tuples as live only + * when inserted by our own transaction. + */ + reltuples += 1; + } + + /* + * We must index such tuples, since if the index build + * commits then they're good. + */ + indexIt = true; + tupleIsAlive = true; + break; + case HEAPTUPLE_DELETE_IN_PROGRESS: + + /* + * As with INSERT_IN_PROGRESS case, this is unexpected + * unless it's our own deletion or a system catalog; but + * in anyvisible mode, this tuple is visible. + */ + if (anyvisible) + { + indexIt = true; + tupleIsAlive = false; + reltuples += 1; + break; + } + + xwait = HeapTupleHeaderGetUpdateXid(heapTuple->t_data); + if (!TransactionIdIsCurrentTransactionId(xwait)) + { + if (!is_system_catalog) + elog(WARNING, "concurrent delete in progress within table \"%s\"", + RelationGetRelationName(heapRelation)); + + /* + * If we are performing uniqueness checks, assuming + * the tuple is dead could lead to missing a + * uniqueness violation. In that case we wait for the + * deleting transaction to finish and check again. + * + * Also, if it's a HOT-updated tuple, we should not + * index it but rather the live tuple at the end of + * the HOT-chain. However, the deleting transaction + * could abort, possibly leaving this tuple as live + * after all, in which case it has to be indexed. The + * only way to know what to do is to wait for the + * deleting transaction to finish and check again. + */ + if (checking_uniqueness || + HeapTupleIsHotUpdated(heapTuple)) + { + /* + * Must drop the lock on the buffer before we wait + */ + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + XactLockTableWait(xwait, heapRelation, + &heapTuple->t_self, + XLTW_InsertIndexUnique); + CHECK_FOR_INTERRUPTS(); + goto recheck; + } + + /* + * Otherwise index it but don't check for uniqueness, + * the same as a RECENTLY_DEAD tuple. + */ + indexIt = true; + + /* + * Count HEAPTUPLE_DELETE_IN_PROGRESS tuples as live, + * if they were not deleted by the current + * transaction. That's what + * heapam_scan_analyze_next_tuple() does, and we want + * the behavior to be consistent. + */ + reltuples += 1; + } + else if (HeapTupleIsHotUpdated(heapTuple)) + { + /* + * It's a HOT-updated tuple deleted by our own xact. 
+ * We can assume the deletion will commit (else the + * index contents don't matter), so treat the same as + * RECENTLY_DEAD HOT-updated tuples. + */ + indexIt = false; + /* mark the index as unsafe for old snapshots */ + indexInfo->ii_BrokenHotChain = true; + } + else + { + /* + * It's a regular tuple deleted by our own xact. Index + * it, but don't check for uniqueness nor count in + * reltuples, the same as a RECENTLY_DEAD tuple. + */ + indexIt = true; + } + /* In any case, exclude the tuple from unique-checking */ + tupleIsAlive = false; + break; + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + indexIt = tupleIsAlive = false; /* keep compiler quiet */ + break; + } + + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + if (!indexIt) + continue; + } + else + { + /* heap_getnext did the time qual check */ + tupleIsAlive = true; + reltuples += 1; + } + + MemoryContextReset(econtext->ecxt_per_tuple_memory); + + /* Set up for predicate or expression evaluation */ + ExecStoreBufferHeapTuple(heapTuple, slot, hscan->rs_cbuf); + + /* + * In a partial index, discard tuples that don't satisfy the + * predicate. + */ + if (predicate != NULL) + { + if (!ExecQual(predicate, econtext)) + continue; + } + + /* + * For the current heap tuple, extract all the attributes we use in + * this index, and note which are null. This also performs evaluation + * of any expressions needed. + */ + FormIndexDatum(indexInfo, + slot, + estate, + values, + isnull); + + /* + * You'd think we should go ahead and build the index tuple here, but + * some index AMs want to do further processing on the data first. So + * pass the values[] and isnull[] arrays, instead. + */ + + if (HeapTupleIsHeapOnly(heapTuple)) + { + /* + * For a heap-only tuple, pretend its TID is that of the root. See + * src/backend/access/heap/README.HOT for discussion. + */ + ItemPointerData tid; + OffsetNumber offnum; + + offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self); + + /* + * If a HOT tuple points to a root that we don't know about, + * obtain root items afresh. If that still fails, report it as + * corruption. + */ + if (root_offsets[offnum - 1] == InvalidOffsetNumber) + { + Page page = BufferGetPage(hscan->rs_cbuf); + + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); + heap_get_root_tuples(page, root_offsets); + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + } + + if (!OffsetNumberIsValid(root_offsets[offnum - 1])) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"", + ItemPointerGetBlockNumber(&heapTuple->t_self), + offnum, + RelationGetRelationName(heapRelation)))); + + ItemPointerSet(&tid, ItemPointerGetBlockNumber(&heapTuple->t_self), + root_offsets[offnum - 1]); + + /* Call the AM's callback routine to process the tuple */ + callback(indexRelation, &tid, values, isnull, tupleIsAlive, + callback_state); + } + else + { + /* Call the AM's callback routine to process the tuple */ + callback(indexRelation, &heapTuple->t_self, values, isnull, + tupleIsAlive, callback_state); + } + } + + /* Report scan progress one last time. 
*/ + if (progress) + { + BlockNumber blks_done; + + if (hscan->rs_base.rs_parallel != NULL) + { + ParallelBlockTableScanDesc pbscan; + + pbscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel; + blks_done = pbscan->phs_nblocks; + } + else + blks_done = hscan->rs_nblocks; + + pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, + blks_done); + } + + table_endscan(scan); + + /* we can now forget our snapshot, if set and registered by us */ + if (need_unregister_snapshot) + UnregisterSnapshot(snapshot); + + ExecDropSingleTupleTableSlot(slot); + + FreeExecutorState(estate); + + /* These may have been pointing to the now-gone estate */ + indexInfo->ii_ExpressionsState = NIL; + indexInfo->ii_PredicateState = NULL; + + return reltuples; +} + +static void +heapam_index_validate_scan(Relation heapRelation, + Relation indexRelation, + IndexInfo *indexInfo, + Snapshot snapshot, + ValidateIndexState *state) +{ + TableScanDesc scan; + HeapScanDesc hscan; + HeapTuple heapTuple; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + ExprState *predicate; + TupleTableSlot *slot; + EState *estate; + ExprContext *econtext; + BlockNumber root_blkno = InvalidBlockNumber; + OffsetNumber root_offsets[MaxHeapTuplesPerPage]; + bool in_index[MaxHeapTuplesPerPage]; + BlockNumber previous_blkno = InvalidBlockNumber; + + /* state variables for the merge */ + ItemPointer indexcursor = NULL; + ItemPointerData decoded; + bool tuplesort_empty = false; + + /* + * sanity checks + */ + Assert(OidIsValid(indexRelation->rd_rel->relam)); + + /* + * Need an EState for evaluation of index expressions and partial-index + * predicates. Also a slot to hold the current tuple. + */ + estate = CreateExecutorState(); + econtext = GetPerTupleExprContext(estate); + slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRelation), + &TTSOpsHeapTuple); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* Set up execution state for predicate, if any. */ + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + + /* + * Prepare for scan of the base relation. We need just those tuples + * satisfying the passed-in reference snapshot. We must disable syncscan + * here, because it's critical that we read from block zero forward to + * match the sorted TIDs. + */ + scan = table_beginscan_strat(heapRelation, /* relation */ + snapshot, /* snapshot */ + 0, /* number of keys */ + NULL, /* scan key */ + true, /* buffer access strategy OK */ + false); /* syncscan not OK */ + hscan = (HeapScanDesc) scan; + + pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL, + hscan->rs_nblocks); + + /* + * Scan all tuples matching the snapshot. + */ + while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL) + { + ItemPointer heapcursor = &heapTuple->t_self; + ItemPointerData rootTuple; + OffsetNumber root_offnum; + + CHECK_FOR_INTERRUPTS(); + + state->htups += 1; + + if ((previous_blkno == InvalidBlockNumber) || + (hscan->rs_cblock != previous_blkno)) + { + pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, + hscan->rs_cblock); + previous_blkno = hscan->rs_cblock; + } + + /* + * As commented in table_index_build_scan, we should index heap-only + * tuples under the TIDs of their root tuples; so when we advance onto + * a new heap page, build a map of root item offsets on the page. 
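+ *
+ * For example, with hypothetical offsets: if the live member of a HOT
+ * chain sits at line pointer 7 but the chain's root item is at line
+ * pointer 2, root_offsets[7 - 1] is 2 and the TID we compare against
+ * the index (and would insert) is (block, 2), not (block, 7).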
+ * + * This complicates merging against the tuplesort output: we will + * visit the live tuples in order by their offsets, but the root + * offsets that we need to compare against the index contents might be + * ordered differently. So we might have to "look back" within the + * tuplesort output, but only within the current page. We handle that + * by keeping a bool array in_index[] showing all the + * already-passed-over tuplesort output TIDs of the current page. We + * clear that array here, when advancing onto a new heap page. + */ + if (hscan->rs_cblock != root_blkno) + { + Page page = BufferGetPage(hscan->rs_cbuf); + + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); + heap_get_root_tuples(page, root_offsets); + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + memset(in_index, 0, sizeof(in_index)); + + root_blkno = hscan->rs_cblock; + } + + /* Convert actual tuple TID to root TID */ + rootTuple = *heapcursor; + root_offnum = ItemPointerGetOffsetNumber(heapcursor); + + if (HeapTupleIsHeapOnly(heapTuple)) + { + root_offnum = root_offsets[root_offnum - 1]; + if (!OffsetNumberIsValid(root_offnum)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"", + ItemPointerGetBlockNumber(heapcursor), + ItemPointerGetOffsetNumber(heapcursor), + RelationGetRelationName(heapRelation)))); + ItemPointerSetOffsetNumber(&rootTuple, root_offnum); + } + + /* + * "merge" by skipping through the index tuples until we find or pass + * the current root tuple. + */ + while (!tuplesort_empty && + (!indexcursor || + ItemPointerCompare(indexcursor, &rootTuple) < 0)) + { + Datum ts_val; + bool ts_isnull; + + if (indexcursor) + { + /* + * Remember index items seen earlier on the current heap page + */ + if (ItemPointerGetBlockNumber(indexcursor) == root_blkno) + in_index[ItemPointerGetOffsetNumber(indexcursor) - 1] = true; + } + + tuplesort_empty = !tuplesort_getdatum(state->tuplesort, true, + &ts_val, &ts_isnull, NULL); + Assert(tuplesort_empty || !ts_isnull); + if (!tuplesort_empty) + { + itemptr_decode(&decoded, DatumGetInt64(ts_val)); + indexcursor = &decoded; + + /* If int8 is pass-by-ref, free (encoded) TID Datum memory */ +#ifndef USE_FLOAT8_BYVAL + pfree(DatumGetPointer(ts_val)); +#endif + } + else + { + /* Be tidy */ + indexcursor = NULL; + } + } + + /* + * If the tuplesort has overshot *and* we didn't see a match earlier, + * then this tuple is missing from the index, so insert it. + */ + if ((tuplesort_empty || + ItemPointerCompare(indexcursor, &rootTuple) > 0) && + !in_index[root_offnum - 1]) + { + MemoryContextReset(econtext->ecxt_per_tuple_memory); + + /* Set up for predicate or expression evaluation */ + ExecStoreHeapTuple(heapTuple, slot, false); + + /* + * In a partial index, discard tuples that don't satisfy the + * predicate. + */ + if (predicate != NULL) + { + if (!ExecQual(predicate, econtext)) + continue; + } + + /* + * For the current heap tuple, extract all the attributes we use + * in this index, and note which are null. This also performs + * evaluation of any expressions needed. + */ + FormIndexDatum(indexInfo, + slot, + estate, + values, + isnull); + + /* + * You'd think we should go ahead and build the index tuple here, + * but some index AMs want to do further processing on the data + * first. So pass the values[] and isnull[] arrays, instead. 
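/*
 * Editor's illustrative sketch (not part of the upstream patch): the merge
 * above consumes index TIDs that were spooled into a tuplesort as int64
 * datums and turned back into ItemPointers with itemptr_decode().  The idea
 * is simply to pack (block, offset) into one integer so that plain integer
 * ordering equals (block, offset) ordering.  The packing below illustrates
 * that idea with stand-in types; it is not necessarily the exact encoding
 * PostgreSQL uses.
 */
#include <stdint.h>

typedef struct SketchTid
{
    uint32_t block;                     /* stand-in for BlockNumber */
    uint16_t offset;                    /* stand-in for OffsetNumber */
} SketchTid;

static int64_t
sketch_tid_encode(SketchTid tid)
{
    /* block in the high bits, offset in the low 16 bits */
    return (int64_t) (((uint64_t) tid.block << 16) | tid.offset);
}

static SketchTid
sketch_tid_decode(int64_t encoded)
{
    SketchTid tid;

    tid.offset = (uint16_t) (encoded & 0xFFFF);
    tid.block = (uint32_t) ((uint64_t) encoded >> 16);
    return tid;
}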
+ */ + + /* + * If the tuple is already committed dead, you might think we + * could suppress uniqueness checking, but this is no longer true + * in the presence of HOT, because the insert is actually a proxy + * for a uniqueness check on the whole HOT-chain. That is, the + * tuple we have here could be dead because it was already + * HOT-updated, and if so the updating transaction will not have + * thought it should insert index entries. The index AM will + * check the whole HOT-chain and correctly detect a conflict if + * there is one. + */ + + index_insert(indexRelation, + values, + isnull, + &rootTuple, + heapRelation, + indexInfo->ii_Unique ? + UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, + false, + indexInfo); + + state->tups_inserted += 1; + } + } + + table_endscan(scan); + + ExecDropSingleTupleTableSlot(slot); + + FreeExecutorState(estate); + + /* These may have been pointing to the now-gone estate */ + indexInfo->ii_ExpressionsState = NIL; + indexInfo->ii_PredicateState = NULL; +} + +/* + * Return the number of blocks that have been read by this scan since + * starting. This is meant for progress reporting rather than be fully + * accurate: in a parallel scan, workers can be concurrently reading blocks + * further ahead than what we report. + */ +static BlockNumber +heapam_scan_get_blocks_done(HeapScanDesc hscan) +{ + ParallelBlockTableScanDesc bpscan = NULL; + BlockNumber startblock; + BlockNumber blocks_done; + + if (hscan->rs_base.rs_parallel != NULL) + { + bpscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel; + startblock = bpscan->phs_startblock; + } + else + startblock = hscan->rs_startblock; + + /* + * Might have wrapped around the end of the relation, if startblock was + * not zero. + */ + if (hscan->rs_cblock > startblock) + blocks_done = hscan->rs_cblock - startblock; + else + { + BlockNumber nblocks; + + nblocks = bpscan != NULL ? bpscan->phs_nblocks : hscan->rs_nblocks; + blocks_done = nblocks - startblock + + hscan->rs_cblock; + } + + return blocks_done; +} + + +/* ------------------------------------------------------------------------ + * Miscellaneous callbacks for the heap AM + * ------------------------------------------------------------------------ + */ + +/* + * Check to see whether the table needs a TOAST table. It does only if + * (1) there are any toastable attributes, and (2) the maximum length + * of a tuple could exceed TOAST_TUPLE_THRESHOLD. (We don't want to + * create a toast table for something like "f1 varchar(20)".) + */ +static bool +heapam_relation_needs_toast_table(Relation rel) +{ + int32 data_length = 0; + bool maxlength_unknown = false; + bool has_toastable_attrs = false; + TupleDesc tupdesc = rel->rd_att; + int32 tuple_length; + int i; + + for (i = 0; i < tupdesc->natts; i++) + { + Form_pg_attribute att = TupleDescAttr(tupdesc, i); + + if (att->attisdropped) + continue; + data_length = att_align_nominal(data_length, att->attalign); + if (att->attlen > 0) + { + /* Fixed-length types are never toastable */ + data_length += att->attlen; + } + else + { + int32 maxlen = type_maximum_size(att->atttypid, + att->atttypmod); + + if (maxlen < 0) + maxlength_unknown = true; + else + data_length += maxlen; + if (att->attstorage != TYPSTORAGE_PLAIN) + has_toastable_attrs = true; + } + } + if (!has_toastable_attrs) + return false; /* nothing to toast? */ + if (maxlength_unknown) + return true; /* any unlimited-length attrs? 
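/*
 * Editor's illustrative sketch (not part of the upstream patch): the
 * blocks-done arithmetic in heapam_scan_get_blocks_done() above has to cope
 * with a scan that started at a non-zero block and wrapped past the end of
 * the relation.  The same arithmetic, reduced to a pure function over plain
 * integers:
 */
#include <stdint.h>

static uint32_t
sketch_blocks_done(uint32_t startblock, uint32_t curblock, uint32_t nblocks)
{
    if (curblock > startblock)
        return curblock - startblock;   /* no wraparound yet */
    /* wrapped: blocks from startblock to the end, plus those since block 0 */
    return nblocks - startblock + curblock;
}

/*
 * For example, with nblocks = 100 and startblock = 90, being at curblock = 5
 * means 15 blocks have been read (10 before the wrap and 5 after it).
 */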
*/ + tuple_length = MAXALIGN(SizeofHeapTupleHeader + + BITMAPLEN(tupdesc->natts)) + + MAXALIGN(data_length); + return (tuple_length > TOAST_TUPLE_THRESHOLD); +} + +/* + * TOAST tables for heap relations are just heap relations. + */ +static Oid +heapam_relation_toast_am(Relation rel) +{ + return rel->rd_rel->relam; +} + + +/* ------------------------------------------------------------------------ + * Planner related callbacks for the heap AM + * ------------------------------------------------------------------------ + */ + +#define HEAP_OVERHEAD_BYTES_PER_TUPLE \ + (MAXALIGN(SizeofHeapTupleHeader) + sizeof(ItemIdData)) +#define HEAP_USABLE_BYTES_PER_PAGE \ + (BLCKSZ - SizeOfPageHeaderData) + +static void +heapam_estimate_rel_size(Relation rel, int32 *attr_widths, + BlockNumber *pages, double *tuples, + double *allvisfrac) +{ + table_block_relation_estimate_size(rel, attr_widths, pages, + tuples, allvisfrac, + HEAP_OVERHEAD_BYTES_PER_TUPLE, + HEAP_USABLE_BYTES_PER_PAGE); +} + + +/* ------------------------------------------------------------------------ + * Executor related callbacks for the heap AM + * ------------------------------------------------------------------------ + */ + +static bool +heapam_scan_bitmap_next_block(TableScanDesc scan, + TBMIterateResult *tbmres) +{ + HeapScanDesc hscan = (HeapScanDesc) scan; + BlockNumber page = tbmres->blockno; + Buffer buffer; + Snapshot snapshot; + int ntup; + + hscan->rs_cindex = 0; + hscan->rs_ntuples = 0; + + /* + * Ignore any claimed entries past what we think is the end of the + * relation. It may have been extended after the start of our scan (we + * only hold an AccessShareLock, and it could be inserts from this + * backend). + */ + if (page >= hscan->rs_nblocks) + return false; + + /* + * Acquire pin on the target heap page, trading in any pin we held before. + */ + hscan->rs_cbuf = ReleaseAndReadBuffer(hscan->rs_cbuf, + scan->rs_rd, + page); + hscan->rs_cblock = page; + buffer = hscan->rs_cbuf; + snapshot = scan->rs_snapshot; + + ntup = 0; + + /* + * Prune and repair fragmentation for the whole page, if possible. + */ + heap_page_prune_opt(scan->rs_rd, buffer); + + /* + * We must hold share lock on the buffer content while examining tuple + * visibility. Afterwards, however, the tuples we have found to be + * visible are guaranteed good as long as we hold the buffer pin. + */ + LockBuffer(buffer, BUFFER_LOCK_SHARE); + + /* + * We need two separate strategies for lossy and non-lossy cases. + */ + if (tbmres->ntuples >= 0) + { + /* + * Bitmap is non-lossy, so we just look through the offsets listed in + * tbmres; but we have to follow any HOT chain starting at each such + * offset. + */ + int curslot; + + for (curslot = 0; curslot < tbmres->ntuples; curslot++) + { + OffsetNumber offnum = tbmres->offsets[curslot]; + ItemPointerData tid; + HeapTupleData heapTuple; + + ItemPointerSet(&tid, page, offnum); + if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot, + &heapTuple, NULL, true)) + hscan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid); + } + } + else + { + /* + * Bitmap is lossy, so we must examine each line pointer on the page. + * But we can ignore HOT chains, since we'll check each tuple anyway. 
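/*
 * Editor's illustrative sketch (not part of the upstream patch): the two
 * constants above feed the generic block-based size estimator, which in
 * essence divides the usable bytes of a page by the per-tuple overhead plus
 * the average data width to guess how many rows fit per page.  A simplified
 * version of that density estimate; the 8 kB-page figures used in the usage
 * comment (about 28 overhead bytes per tuple and 8168 usable bytes per page)
 * are illustrative inputs, not authoritative values for every build.
 */
static double
sketch_tuples_per_page(double usable_bytes_per_page,
                       double overhead_bytes_per_tuple,
                       double avg_tuple_data_width)
{
    double per_tuple = overhead_bytes_per_tuple + avg_tuple_data_width;

    if (per_tuple <= 0.0)
        return 0.0;
    return usable_bytes_per_page / per_tuple;
}

/* e.g. sketch_tuples_per_page(8168.0, 28.0, 100.0) is roughly 63 rows/page */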
+ */ + Page dp = (Page) BufferGetPage(buffer); + OffsetNumber maxoff = PageGetMaxOffsetNumber(dp); + OffsetNumber offnum; + + for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) + { + ItemId lp; + HeapTupleData loctup; + bool valid; + + lp = PageGetItemId(dp, offnum); + if (!ItemIdIsNormal(lp)) + continue; + loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); + loctup.t_len = ItemIdGetLength(lp); + loctup.t_tableOid = scan->rs_rd->rd_id; + ItemPointerSet(&loctup.t_self, page, offnum); + valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); + if (valid) + { + hscan->rs_vistuples[ntup++] = offnum; + PredicateLockTID(scan->rs_rd, &loctup.t_self, snapshot, + HeapTupleHeaderGetXmin(loctup.t_data)); + } + HeapCheckForSerializableConflictOut(valid, scan->rs_rd, &loctup, + buffer, snapshot); + } + } + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + Assert(ntup <= MaxHeapTuplesPerPage); + hscan->rs_ntuples = ntup; + + return ntup > 0; +} + +static bool +heapam_scan_bitmap_next_tuple(TableScanDesc scan, + TBMIterateResult *tbmres, + TupleTableSlot *slot) +{ + HeapScanDesc hscan = (HeapScanDesc) scan; + OffsetNumber targoffset; + Page dp; + ItemId lp; + + /* + * Out of range? If so, nothing more to look at on this page + */ + if (hscan->rs_cindex < 0 || hscan->rs_cindex >= hscan->rs_ntuples) + return false; + + targoffset = hscan->rs_vistuples[hscan->rs_cindex]; + dp = (Page) BufferGetPage(hscan->rs_cbuf); + lp = PageGetItemId(dp, targoffset); + Assert(ItemIdIsNormal(lp)); + + hscan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); + hscan->rs_ctup.t_len = ItemIdGetLength(lp); + hscan->rs_ctup.t_tableOid = scan->rs_rd->rd_id; + ItemPointerSet(&hscan->rs_ctup.t_self, hscan->rs_cblock, targoffset); + + pgstat_count_heap_fetch(scan->rs_rd); + + /* + * Set up the result slot to point to this tuple. Note that the slot + * acquires a pin on the buffer. + */ + ExecStoreBufferHeapTuple(&hscan->rs_ctup, + slot, + hscan->rs_cbuf); + + hscan->rs_cindex++; + + return true; +} + +static bool +heapam_scan_sample_next_block(TableScanDesc scan, SampleScanState *scanstate) +{ + HeapScanDesc hscan = (HeapScanDesc) scan; + TsmRoutine *tsm = scanstate->tsmroutine; + BlockNumber blockno; + + /* return false immediately if relation is empty */ + if (hscan->rs_nblocks == 0) + return false; + + if (tsm->NextSampleBlock) + { + blockno = tsm->NextSampleBlock(scanstate, hscan->rs_nblocks); + hscan->rs_cblock = blockno; + } + else + { + /* scanning table sequentially */ + + if (hscan->rs_cblock == InvalidBlockNumber) + { + Assert(!hscan->rs_inited); + blockno = hscan->rs_startblock; + } + else + { + Assert(hscan->rs_inited); + + blockno = hscan->rs_cblock + 1; + + if (blockno >= hscan->rs_nblocks) + { + /* wrap to beginning of rel, might not have started at 0 */ + blockno = 0; + } + + /* + * Report our new scan position for synchronization purposes. + * + * Note: we do this before checking for end of scan so that the + * final state of the position hint is back at the start of the + * rel. That's not strictly necessary, but otherwise when you run + * the same query multiple times the starting position would shift + * a little bit backwards on every invocation, which is confusing. + * We don't guarantee any specific ordering in general, though. 
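/*
 * Editor's illustrative sketch (not part of the upstream patch): when the
 * tablesample method has no NextSampleBlock callback, the scan above walks
 * the relation sequentially, wrapping from the last block back to block 0
 * and stopping once it comes back around to the block it started on.  The
 * same advance rule as a small pure function; SKETCH_END is a stand-in for
 * InvalidBlockNumber.
 */
#include <stdint.h>

#define SKETCH_END UINT32_MAX

static uint32_t
sketch_next_block(uint32_t curblock, uint32_t startblock, uint32_t nblocks)
{
    uint32_t next = curblock + 1;

    if (next >= nblocks)
        next = 0;                       /* wrap; scan may not have started at 0 */
    if (next == startblock)
        return SKETCH_END;              /* back where we began: scan is done */
    return next;
}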
+ */ + if (scan->rs_flags & SO_ALLOW_SYNC) + ss_report_location(scan->rs_rd, blockno); + + if (blockno == hscan->rs_startblock) + { + blockno = InvalidBlockNumber; + } + } + } + + if (!BlockNumberIsValid(blockno)) + { + if (BufferIsValid(hscan->rs_cbuf)) + ReleaseBuffer(hscan->rs_cbuf); + hscan->rs_cbuf = InvalidBuffer; + hscan->rs_cblock = InvalidBlockNumber; + hscan->rs_inited = false; + + return false; + } + + heapgetpage(scan, blockno); + hscan->rs_inited = true; + + return true; +} + +static bool +heapam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate, + TupleTableSlot *slot) +{ + HeapScanDesc hscan = (HeapScanDesc) scan; + TsmRoutine *tsm = scanstate->tsmroutine; + BlockNumber blockno = hscan->rs_cblock; + bool pagemode = (scan->rs_flags & SO_ALLOW_PAGEMODE) != 0; + + Page page; + bool all_visible; + OffsetNumber maxoffset; + + /* + * When not using pagemode, we must lock the buffer during tuple + * visibility checks. + */ + if (!pagemode) + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); + + page = (Page) BufferGetPage(hscan->rs_cbuf); + all_visible = PageIsAllVisible(page) && + !scan->rs_snapshot->takenDuringRecovery; + maxoffset = PageGetMaxOffsetNumber(page); + + for (;;) + { + OffsetNumber tupoffset; + + CHECK_FOR_INTERRUPTS(); + + /* Ask the tablesample method which tuples to check on this page. */ + tupoffset = tsm->NextSampleTuple(scanstate, + blockno, + maxoffset); + + if (OffsetNumberIsValid(tupoffset)) + { + ItemId itemid; + bool visible; + HeapTuple tuple = &(hscan->rs_ctup); + + /* Skip invalid tuple pointers. */ + itemid = PageGetItemId(page, tupoffset); + if (!ItemIdIsNormal(itemid)) + continue; + + tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid); + tuple->t_len = ItemIdGetLength(itemid); + ItemPointerSet(&(tuple->t_self), blockno, tupoffset); + + + if (all_visible) + visible = true; + else + visible = SampleHeapTupleVisible(scan, hscan->rs_cbuf, + tuple, tupoffset); + + /* in pagemode, heapgetpage did this for us */ + if (!pagemode) + HeapCheckForSerializableConflictOut(visible, scan->rs_rd, tuple, + hscan->rs_cbuf, scan->rs_snapshot); + + /* Try next tuple from same page. */ + if (!visible) + continue; + + /* Found visible tuple, return it. */ + if (!pagemode) + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + ExecStoreBufferHeapTuple(tuple, slot, hscan->rs_cbuf); + + /* Count successfully-fetched tuples as heap fetches */ + pgstat_count_heap_getnext(scan->rs_rd); + + return true; + } + else + { + /* + * If we get here, it means we've exhausted the items on this page + * and it's time to move to the next. + */ + if (!pagemode) + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + ExecClearTuple(slot); + return false; + } + } + + Assert(0); +} + + +/* ---------------------------------------------------------------------------- + * Helper functions for the above. + * ---------------------------------------------------------------------------- + */ + +/* + * Reconstruct and rewrite the given tuple + * + * We cannot simply copy the tuple as-is, for several reasons: + * + * 1. We'd like to squeeze out the values of any dropped columns, both + * to save space and to ensure we have no corner-case failures. (It's + * possible for example that the new table hasn't got a TOAST table + * and so is unable to store any large values of dropped cols.) + * + * 2. The tuple might not even be legal for the new table; this is + * currently only known to happen as an after-effect of ALTER TABLE + * SET WITHOUT OIDS. 
+ * + * So, we must reconstruct the tuple from component Datums. + */ +static void +reform_and_rewrite_tuple(HeapTuple tuple, + Relation OldHeap, Relation NewHeap, + Datum *values, bool *isnull, RewriteState rwstate) +{ + TupleDesc oldTupDesc = RelationGetDescr(OldHeap); + TupleDesc newTupDesc = RelationGetDescr(NewHeap); + HeapTuple copiedTuple; + int i; + + heap_deform_tuple(tuple, oldTupDesc, values, isnull); + + /* Be sure to null out any dropped columns */ + for (i = 0; i < newTupDesc->natts; i++) + { + if (TupleDescAttr(newTupDesc, i)->attisdropped) + isnull[i] = true; + } + + copiedTuple = heap_form_tuple(newTupDesc, values, isnull); + + /* The heap rewrite module does the rest */ + rewrite_heap_tuple(rwstate, tuple, copiedTuple); + + heap_freetuple(copiedTuple); +} + +/* + * Check visibility of the tuple. + */ +static bool +SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, + HeapTuple tuple, + OffsetNumber tupoffset) +{ + HeapScanDesc hscan = (HeapScanDesc) scan; + + if (scan->rs_flags & SO_ALLOW_PAGEMODE) + { + /* + * In pageatatime mode, heapgetpage() already did visibility checks, + * so just look at the info it left in rs_vistuples[]. + * + * We use a binary search over the known-sorted array. Note: we could + * save some effort if we insisted that NextSampleTuple select tuples + * in increasing order, but it's not clear that there would be enough + * gain to justify the restriction. + */ + int start = 0, + end = hscan->rs_ntuples - 1; + + while (start <= end) + { + int mid = (start + end) / 2; + OffsetNumber curoffset = hscan->rs_vistuples[mid]; + + if (tupoffset == curoffset) + return true; + else if (tupoffset < curoffset) + end = mid - 1; + else + start = mid + 1; + } + + return false; + } + else + { + /* Otherwise, we have to check the tuple individually. */ + return HeapTupleSatisfiesVisibility(tuple, scan->rs_snapshot, + buffer); + } +} + + +/* ------------------------------------------------------------------------ + * Definition of the heap table access method. 
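/*
 * Editor's illustrative sketch (not part of the upstream patch): in pagemode
 * the visibility answer in SampleHeapTupleVisible() above is just membership
 * in the page's sorted rs_vistuples[] array, found with a binary search.
 * The same search over a plain sorted array of offsets:
 */
#include <stdbool.h>
#include <stdint.h>

static bool
sketch_offset_is_visible(const uint16_t *sorted_offsets, int ntuples,
                         uint16_t target)
{
    int start = 0;
    int end = ntuples - 1;

    while (start <= end)
    {
        int      mid = start + (end - start) / 2;
        uint16_t cur = sorted_offsets[mid];

        if (target == cur)
            return true;
        else if (target < cur)
            end = mid - 1;
        else
            start = mid + 1;
    }
    return false;
}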
+ * ------------------------------------------------------------------------ + */ + +static const TableAmRoutine heapam_methods = { + .type = T_TableAmRoutine, + + .slot_callbacks = heapam_slot_callbacks, + + .scan_begin = heap_beginscan, + .scan_end = heap_endscan, + .scan_rescan = heap_rescan, + .scan_getnextslot = heap_getnextslot, + + .scan_set_tidrange = heap_set_tidrange, + .scan_getnextslot_tidrange = heap_getnextslot_tidrange, + + .parallelscan_estimate = table_block_parallelscan_estimate, + .parallelscan_initialize = table_block_parallelscan_initialize, + .parallelscan_reinitialize = table_block_parallelscan_reinitialize, + + .index_fetch_begin = heapam_index_fetch_begin, + .index_fetch_reset = heapam_index_fetch_reset, + .index_fetch_end = heapam_index_fetch_end, + .index_fetch_tuple = heapam_index_fetch_tuple, + + .tuple_insert = heapam_tuple_insert, + .tuple_insert_speculative = heapam_tuple_insert_speculative, + .tuple_complete_speculative = heapam_tuple_complete_speculative, + .multi_insert = heap_multi_insert, + .tuple_delete = heapam_tuple_delete, + .tuple_update = heapam_tuple_update, + .tuple_lock = heapam_tuple_lock, + + .tuple_fetch_row_version = heapam_fetch_row_version, + .tuple_get_latest_tid = heap_get_latest_tid, + .tuple_tid_valid = heapam_tuple_tid_valid, + .tuple_satisfies_snapshot = heapam_tuple_satisfies_snapshot, + .index_delete_tuples = heap_index_delete_tuples, + + .relation_set_new_filenode = heapam_relation_set_new_filenode, + .relation_nontransactional_truncate = heapam_relation_nontransactional_truncate, + .relation_copy_data = heapam_relation_copy_data, + .relation_copy_for_cluster = heapam_relation_copy_for_cluster, + .relation_vacuum = heap_vacuum_rel, + .scan_analyze_next_block = heapam_scan_analyze_next_block, + .scan_analyze_next_tuple = heapam_scan_analyze_next_tuple, + .index_build_range_scan = heapam_index_build_range_scan, + .index_validate_scan = heapam_index_validate_scan, + + .relation_size = table_block_relation_size, + .relation_needs_toast_table = heapam_relation_needs_toast_table, + .relation_toast_am = heapam_relation_toast_am, + .relation_fetch_toast_slice = heap_fetch_toast_slice, + + .relation_estimate_size = heapam_estimate_rel_size, + + .scan_bitmap_next_block = heapam_scan_bitmap_next_block, + .scan_bitmap_next_tuple = heapam_scan_bitmap_next_tuple, + .scan_sample_next_block = heapam_scan_sample_next_block, + .scan_sample_next_tuple = heapam_scan_sample_next_tuple +}; + + +const TableAmRoutine * +GetHeapamTableAmRoutine(void) +{ + return &heapam_methods; +} + +Datum +heap_tableam_handler(PG_FUNCTION_ARGS) +{ + PG_RETURN_POINTER(&heapam_methods); +} diff --git a/src/backend/access/heap/heapam_visibility.c b/src/backend/access/heap/heapam_visibility.c new file mode 100644 index 0000000..20d82ca --- /dev/null +++ b/src/backend/access/heap/heapam_visibility.c @@ -0,0 +1,1794 @@ +/*------------------------------------------------------------------------- + * + * heapam_visibility.c + * Tuple visibility rules for tuples stored in heap. + * + * NOTE: all the HeapTupleSatisfies routines will update the tuple's + * "hint" status bits if we see that the inserting or deleting transaction + * has now committed or aborted (and it is safe to set the hint bits). + * If the hint bits are changed, MarkBufferDirtyHint is called on + * the passed-in buffer. The caller must hold not only a pin, but at least + * shared buffer content lock on the buffer containing the tuple. 
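/*
 * Editor's illustrative sketch (not part of the upstream patch): the access
 * method is exposed as a single statically allocated struct of function
 * pointers, and the handler simply hands out its address.  The same pattern
 * in miniature, with made-up callback names standing in for the real
 * TableAmRoutine members:
 */
typedef struct SketchAmRoutine
{
    const char *name;
    void      (*scan_begin) (void *relation);
    void      (*scan_end) (void *scan);
} SketchAmRoutine;

static void sketch_scan_begin(void *relation) { (void) relation; }
static void sketch_scan_end(void *scan) { (void) scan; }

static const SketchAmRoutine sketch_methods = {
    .name = "sketch",
    .scan_begin = sketch_scan_begin,
    .scan_end = sketch_scan_end,
};

/* Callers dispatch through the returned struct, never through globals. */
static const SketchAmRoutine *
sketch_get_am_routine(void)
{
    return &sketch_methods;
}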
+ * + * NOTE: When using a non-MVCC snapshot, we must check + * TransactionIdIsInProgress (which looks in the PGPROC array) + * before TransactionIdDidCommit/TransactionIdDidAbort (which look in + * pg_xact). Otherwise we have a race condition: we might decide that a + * just-committed transaction crashed, because none of the tests succeed. + * xact.c is careful to record commit/abort in pg_xact before it unsets + * MyProc->xid in the PGPROC array. That fixes that problem, but it + * also means there is a window where TransactionIdIsInProgress and + * TransactionIdDidCommit will both return true. If we check only + * TransactionIdDidCommit, we could consider a tuple committed when a + * later GetSnapshotData call will still think the originating transaction + * is in progress, which leads to application-level inconsistency. The + * upshot is that we gotta check TransactionIdIsInProgress first in all + * code paths, except for a few cases where we are looking at + * subtransactions of our own main transaction and so there can't be any + * race condition. + * + * When using an MVCC snapshot, we rely on XidInMVCCSnapshot rather than + * TransactionIdIsInProgress, but the logic is otherwise the same: do not + * check pg_xact until after deciding that the xact is no longer in progress. + * + * + * Summary of visibility functions: + * + * HeapTupleSatisfiesMVCC() + * visible to supplied snapshot, excludes current command + * HeapTupleSatisfiesUpdate() + * visible to instant snapshot, with user-supplied command + * counter and more complex result + * HeapTupleSatisfiesSelf() + * visible to instant snapshot and current command + * HeapTupleSatisfiesDirty() + * like HeapTupleSatisfiesSelf(), but includes open transactions + * HeapTupleSatisfiesVacuum() + * visible to any running transaction, used by VACUUM + * HeapTupleSatisfiesNonVacuumable() + * Snapshot-style API for HeapTupleSatisfiesVacuum + * HeapTupleSatisfiesToast() + * visible unless part of interrupted vacuum, used for TOAST + * HeapTupleSatisfiesAny() + * all tuples are visible + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/heap/heapam_visibility.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/heapam.h" +#include "access/htup_details.h" +#include "access/multixact.h" +#include "access/subtrans.h" +#include "access/tableam.h" +#include "access/transam.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "storage/bufmgr.h" +#include "storage/procarray.h" +#include "utils/builtins.h" +#include "utils/combocid.h" +#include "utils/snapmgr.h" + + +/* + * SetHintBits() + * + * Set commit/abort hint bits on a tuple, if appropriate at this time. + * + * It is only safe to set a transaction-committed hint bit if we know the + * transaction's commit record is guaranteed to be flushed to disk before the + * buffer, or if the table is temporary or unlogged and will be obliterated by + * a crash anyway. We cannot change the LSN of the page here, because we may + * hold only a share lock on the buffer, so we can only use the LSN to + * interlock this if the buffer's LSN already is newer than the commit LSN; + * otherwise we have to just refrain from setting the hint bit until some + * future re-examination of the tuple. + * + * We can always set hint bits when marking a transaction aborted. 
(Some + * code in heapam.c relies on that!) + * + * Also, if we are cleaning up HEAP_MOVED_IN or HEAP_MOVED_OFF entries, then + * we can always set the hint bits, since pre-9.0 VACUUM FULL always used + * synchronous commits and didn't move tuples that weren't previously + * hinted. (This is not known by this subroutine, but is applied by its + * callers.) Note: old-style VACUUM FULL is gone, but we have to keep this + * module's support for MOVED_OFF/MOVED_IN flag bits for as long as we + * support in-place update from pre-9.0 databases. + * + * Normal commits may be asynchronous, so for those we need to get the LSN + * of the transaction and then check whether this is flushed. + * + * The caller should pass xid as the XID of the transaction to check, or + * InvalidTransactionId if no check is needed. + */ +static inline void +SetHintBits(HeapTupleHeader tuple, Buffer buffer, + uint16 infomask, TransactionId xid) +{ + if (TransactionIdIsValid(xid)) + { + /* NB: xid must be known committed here! */ + XLogRecPtr commitLSN = TransactionIdGetCommitLSN(xid); + + if (BufferIsPermanent(buffer) && XLogNeedsFlush(commitLSN) && + BufferGetLSNAtomic(buffer) < commitLSN) + { + /* not flushed and no LSN interlock, so don't set hint */ + return; + } + } + + tuple->t_infomask |= infomask; + MarkBufferDirtyHint(buffer, true); +} + +/* + * HeapTupleSetHintBits --- exported version of SetHintBits() + * + * This must be separate because of C99's brain-dead notions about how to + * implement inline functions. + */ +void +HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer, + uint16 infomask, TransactionId xid) +{ + SetHintBits(tuple, buffer, infomask, xid); +} + + +/* + * HeapTupleSatisfiesSelf + * True iff heap tuple is valid "for itself". + * + * See SNAPSHOT_MVCC's definition for the intended behaviour. + * + * Note: + * Assumes heap tuple is valid. 
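/*
 * Editor's illustrative sketch (not part of the upstream patch): the guard in
 * SetHintBits() above boils down to one predicate.  Setting an aborted hint
 * (no XID supplied) is always allowed; setting a committed hint is allowed
 * unless the page is permanent, the commit record has not been flushed yet,
 * and the page LSN does not already cover the commit LSN.  As a pure
 * function over plain inputs, with LSNs reduced to integers:
 */
#include <stdbool.h>
#include <stdint.h>

static bool
sketch_can_set_hint(bool xid_supplied, bool buffer_is_permanent,
                    bool commit_lsn_flushed,
                    uint64_t page_lsn, uint64_t commit_lsn)
{
    if (!xid_supplied)
        return true;            /* aborted hints are always safe to set */
    if (!buffer_is_permanent)
        return true;            /* temp/unlogged data dies with a crash anyway */
    if (commit_lsn_flushed)
        return true;            /* commit record is already durable */
    /* otherwise rely on the LSN interlock: page must be newer than commit */
    return page_lsn >= commit_lsn;
}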
+ * + * The satisfaction of "itself" requires the following: + * + * ((Xmin == my-transaction && the row was updated by the current transaction, and + * (Xmax is null it was not deleted + * [|| Xmax != my-transaction)]) [or it was deleted by another transaction] + * || + * + * (Xmin is committed && the row was modified by a committed transaction, and + * (Xmax is null || the row has not been deleted, or + * (Xmax != my-transaction && the row was deleted by another transaction + * Xmax is not committed))) that has not been committed + */ +static bool +HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer) +{ + HeapTupleHeader tuple = htup->t_data; + + Assert(ItemPointerIsValid(&htup->t_self)); + Assert(htup->t_tableOid != InvalidOid); + + if (!HeapTupleHeaderXminCommitted(tuple)) + { + if (HeapTupleHeaderXminInvalid(tuple)) + return false; + + /* Used by pre-9.0 binary upgrades */ + if (tuple->t_infomask & HEAP_MOVED_OFF) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (TransactionIdIsCurrentTransactionId(xvac)) + return false; + if (!TransactionIdIsInProgress(xvac)) + { + if (TransactionIdDidCommit(xvac)) + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + } + } + /* Used by pre-9.0 binary upgrades */ + else if (tuple->t_infomask & HEAP_MOVED_IN) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (!TransactionIdIsCurrentTransactionId(xvac)) + { + if (TransactionIdIsInProgress(xvac)) + return false; + if (TransactionIdDidCommit(xvac)) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + else + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } + } + } + else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) + { + if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ + return true; + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) /* not deleter */ + return true; + + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + xmax = HeapTupleGetUpdateXid(tuple); + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + /* updating subtransaction must have aborted */ + if (!TransactionIdIsCurrentTransactionId(xmax)) + return true; + else + return false; + } + + if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + { + /* deleting subtransaction must have aborted */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return true; + } + + return false; + } + else if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmin(tuple))) + return false; + else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + HeapTupleHeaderGetRawXmin(tuple)); + else + { + /* it must have aborted or crashed */ + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } + } + + /* by here, the inserting transaction has committed */ + + if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid or aborted */ + return true; + + if (tuple->t_infomask & HEAP_XMAX_COMMITTED) + { + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return true; + return false; /* updated by other */ + } + + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return true; + + xmax = HeapTupleGetUpdateXid(tuple); + + 
/* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + if (TransactionIdIsCurrentTransactionId(xmax)) + return false; + if (TransactionIdIsInProgress(xmax)) + return true; + if (TransactionIdDidCommit(xmax)) + return false; + /* it must have aborted or crashed */ + return true; + } + + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + { + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return true; + return false; + } + + if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) + return true; + + if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) + { + /* it must have aborted or crashed */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return true; + } + + /* xmax transaction committed */ + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + { + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return true; + } + + SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, + HeapTupleHeaderGetRawXmax(tuple)); + return false; +} + +/* + * HeapTupleSatisfiesAny + * Dummy "satisfies" routine: any tuple satisfies SnapshotAny. + */ +static bool +HeapTupleSatisfiesAny(HeapTuple htup, Snapshot snapshot, Buffer buffer) +{ + return true; +} + +/* + * HeapTupleSatisfiesToast + * True iff heap tuple is valid as a TOAST row. + * + * See SNAPSHOT_TOAST's definition for the intended behaviour. + * + * This is a simplified version that only checks for VACUUM moving conditions. + * It's appropriate for TOAST usage because TOAST really doesn't want to do + * its own time qual checks; if you can see the main table row that contains + * a TOAST reference, you should be able to see the TOASTed value. However, + * vacuuming a TOAST table is independent of the main table, and in case such + * a vacuum fails partway through, we'd better do this much checking. + * + * Among other things, this means you can't do UPDATEs of rows in a TOAST + * table. + */ +static bool +HeapTupleSatisfiesToast(HeapTuple htup, Snapshot snapshot, + Buffer buffer) +{ + HeapTupleHeader tuple = htup->t_data; + + Assert(ItemPointerIsValid(&htup->t_self)); + Assert(htup->t_tableOid != InvalidOid); + + if (!HeapTupleHeaderXminCommitted(tuple)) + { + if (HeapTupleHeaderXminInvalid(tuple)) + return false; + + /* Used by pre-9.0 binary upgrades */ + if (tuple->t_infomask & HEAP_MOVED_OFF) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (TransactionIdIsCurrentTransactionId(xvac)) + return false; + if (!TransactionIdIsInProgress(xvac)) + { + if (TransactionIdDidCommit(xvac)) + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + } + } + /* Used by pre-9.0 binary upgrades */ + else if (tuple->t_infomask & HEAP_MOVED_IN) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (!TransactionIdIsCurrentTransactionId(xvac)) + { + if (TransactionIdIsInProgress(xvac)) + return false; + if (TransactionIdDidCommit(xvac)) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + else + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } + } + } + + /* + * An invalid Xmin can be left behind by a speculative insertion that + * is canceled by super-deleting the tuple. This also applies to + * TOAST tuples created during speculative insertion. 
+ */ + else if (!TransactionIdIsValid(HeapTupleHeaderGetXmin(tuple))) + return false; + } + + /* otherwise assume the tuple is valid for TOAST. */ + return true; +} + +/* + * HeapTupleSatisfiesUpdate + * + * This function returns a more detailed result code than most of the + * functions in this file, since UPDATE needs to know more than "is it + * visible?". It also allows for user-supplied CommandId rather than + * relying on CurrentCommandId. + * + * The possible return codes are: + * + * TM_Invisible: the tuple didn't exist at all when the scan started, e.g. it + * was created by a later CommandId. + * + * TM_Ok: The tuple is valid and visible, so it may be updated. + * + * TM_SelfModified: The tuple was updated by the current transaction, after + * the current scan started. + * + * TM_Updated: The tuple was updated by a committed transaction (including + * the case where the tuple was moved into a different partition). + * + * TM_Deleted: The tuple was deleted by a committed transaction. + * + * TM_BeingModified: The tuple is being updated by an in-progress transaction + * other than the current transaction. (Note: this includes the case where + * the tuple is share-locked by a MultiXact, even if the MultiXact includes + * the current transaction. Callers that want to distinguish that case must + * test for it themselves.) + */ +TM_Result +HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, + Buffer buffer) +{ + HeapTupleHeader tuple = htup->t_data; + + Assert(ItemPointerIsValid(&htup->t_self)); + Assert(htup->t_tableOid != InvalidOid); + + if (!HeapTupleHeaderXminCommitted(tuple)) + { + if (HeapTupleHeaderXminInvalid(tuple)) + return TM_Invisible; + + /* Used by pre-9.0 binary upgrades */ + if (tuple->t_infomask & HEAP_MOVED_OFF) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (TransactionIdIsCurrentTransactionId(xvac)) + return TM_Invisible; + if (!TransactionIdIsInProgress(xvac)) + { + if (TransactionIdDidCommit(xvac)) + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return TM_Invisible; + } + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + } + } + /* Used by pre-9.0 binary upgrades */ + else if (tuple->t_infomask & HEAP_MOVED_IN) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (!TransactionIdIsCurrentTransactionId(xvac)) + { + if (TransactionIdIsInProgress(xvac)) + return TM_Invisible; + if (TransactionIdDidCommit(xvac)) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + else + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return TM_Invisible; + } + } + } + else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) + { + if (HeapTupleHeaderGetCmin(tuple) >= curcid) + return TM_Invisible; /* inserted after scan started */ + + if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ + return TM_Ok; + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + { + TransactionId xmax; + + xmax = HeapTupleHeaderGetRawXmax(tuple); + + /* + * Careful here: even though this tuple was created by our own + * transaction, it might be locked by other transactions, if + * the original version was key-share locked when we updated + * it. + */ + + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + if (MultiXactIdIsRunning(xmax, true)) + return TM_BeingModified; + else + return TM_Ok; + } + + /* + * If the locker is gone, then there is nothing of interest + * left in this Xmax; otherwise, report the tuple as + * locked/updated. 
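/*
 * Editor's illustrative sketch (not part of the upstream patch): the result
 * codes documented above are meant to drive an UPDATE/DELETE caller's
 * control flow.  Below is one possible caller-side switch over a stand-in
 * enum; the real TM_Result lives in the table AM headers and may carry more
 * members, and real executor code reacts differently depending on isolation
 * level and on whether it is updating, deleting, or merely locking the row.
 */
typedef enum SketchTmResult
{
    SKETCH_TM_OK,
    SKETCH_TM_INVISIBLE,
    SKETCH_TM_SELF_MODIFIED,
    SKETCH_TM_UPDATED,
    SKETCH_TM_DELETED,
    SKETCH_TM_BEING_MODIFIED
} SketchTmResult;

typedef enum SketchAction
{
    SKETCH_PROCEED,             /* go ahead and modify the tuple */
    SKETCH_SKIP,                /* tuple is not ours to touch; do nothing */
    SKETCH_FOLLOW_UPDATE_CHAIN, /* chase ctid to the newer version (or fail) */
    SKETCH_WAIT_FOR_LOCKER      /* block until the in-progress xact finishes */
} SketchAction;

static SketchAction
sketch_react(SketchTmResult res)
{
    switch (res)
    {
        case SKETCH_TM_OK:
            return SKETCH_PROCEED;
        case SKETCH_TM_INVISIBLE:
        case SKETCH_TM_SELF_MODIFIED:
        case SKETCH_TM_DELETED:
            return SKETCH_SKIP;
        case SKETCH_TM_UPDATED:
            return SKETCH_FOLLOW_UPDATE_CHAIN;
        case SKETCH_TM_BEING_MODIFIED:
            return SKETCH_WAIT_FOR_LOCKER;
    }
    return SKETCH_SKIP;
}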
+ */ + if (!TransactionIdIsInProgress(xmax)) + return TM_Ok; + return TM_BeingModified; + } + + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + xmax = HeapTupleGetUpdateXid(tuple); + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + /* deleting subtransaction must have aborted */ + if (!TransactionIdIsCurrentTransactionId(xmax)) + { + if (MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), + false)) + return TM_BeingModified; + return TM_Ok; + } + else + { + if (HeapTupleHeaderGetCmax(tuple) >= curcid) + return TM_SelfModified; /* updated after scan started */ + else + return TM_Invisible; /* updated before scan started */ + } + } + + if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + { + /* deleting subtransaction must have aborted */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return TM_Ok; + } + + if (HeapTupleHeaderGetCmax(tuple) >= curcid) + return TM_SelfModified; /* updated after scan started */ + else + return TM_Invisible; /* updated before scan started */ + } + else if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmin(tuple))) + return TM_Invisible; + else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + HeapTupleHeaderGetRawXmin(tuple)); + else + { + /* it must have aborted or crashed */ + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return TM_Invisible; + } + } + + /* by here, the inserting transaction has committed */ + + if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid or aborted */ + return TM_Ok; + + if (tuple->t_infomask & HEAP_XMAX_COMMITTED) + { + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return TM_Ok; + if (!ItemPointerEquals(&htup->t_self, &tuple->t_ctid)) + return TM_Updated; /* updated by other */ + else + return TM_Deleted; /* deleted by other */ + } + + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + if (HEAP_LOCKED_UPGRADED(tuple->t_infomask)) + return TM_Ok; + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + { + if (MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), true)) + return TM_BeingModified; + + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); + return TM_Ok; + } + + xmax = HeapTupleGetUpdateXid(tuple); + if (!TransactionIdIsValid(xmax)) + { + if (MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), false)) + return TM_BeingModified; + } + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + if (TransactionIdIsCurrentTransactionId(xmax)) + { + if (HeapTupleHeaderGetCmax(tuple) >= curcid) + return TM_SelfModified; /* updated after scan started */ + else + return TM_Invisible; /* updated before scan started */ + } + + if (MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), false)) + return TM_BeingModified; + + if (TransactionIdDidCommit(xmax)) + { + if (!ItemPointerEquals(&htup->t_self, &tuple->t_ctid)) + return TM_Updated; + else + return TM_Deleted; + } + + /* + * By here, the update in the Xmax is either aborted or crashed, but + * what about the other members? + */ + + if (!MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), false)) + { + /* + * There's no member, even just a locker, alive anymore, so we can + * mark the Xmax as invalid. 
+ */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return TM_Ok; + } + else + { + /* There are lockers running */ + return TM_BeingModified; + } + } + + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + { + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return TM_BeingModified; + if (HeapTupleHeaderGetCmax(tuple) >= curcid) + return TM_SelfModified; /* updated after scan started */ + else + return TM_Invisible; /* updated before scan started */ + } + + if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) + return TM_BeingModified; + + if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) + { + /* it must have aborted or crashed */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return TM_Ok; + } + + /* xmax transaction committed */ + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + { + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return TM_Ok; + } + + SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, + HeapTupleHeaderGetRawXmax(tuple)); + if (!ItemPointerEquals(&htup->t_self, &tuple->t_ctid)) + return TM_Updated; /* updated by other */ + else + return TM_Deleted; /* deleted by other */ +} + +/* + * HeapTupleSatisfiesDirty + * True iff heap tuple is valid including effects of open transactions. + * + * See SNAPSHOT_DIRTY's definition for the intended behaviour. + * + * This is essentially like HeapTupleSatisfiesSelf as far as effects of + * the current transaction and committed/aborted xacts are concerned. + * However, we also include the effects of other xacts still in progress. + * + * A special hack is that the passed-in snapshot struct is used as an + * output argument to return the xids of concurrent xacts that affected the + * tuple. snapshot->xmin is set to the tuple's xmin if that is another + * transaction that's still in progress; or to InvalidTransactionId if the + * tuple's xmin is committed good, committed dead, or my own xact. + * Similarly for snapshot->xmax and the tuple's xmax. If the tuple was + * inserted speculatively, meaning that the inserter might still back down + * on the insertion without aborting the whole transaction, the associated + * token is also returned in snapshot->speculativeToken. 
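/*
 * Editor's illustrative sketch (not part of the upstream patch): once the
 * deleting transaction is known committed, the code above tells UPDATE apart
 * from DELETE by whether the tuple's ctid still points at itself (a deleted
 * tuple) or at a newer version elsewhere (an updated one).  The same test on
 * stand-in TIDs; the real code also treats a tuple moved to another
 * partition as "updated".
 */
#include <stdbool.h>
#include <stdint.h>

typedef struct SketchItemPointer
{
    uint32_t block;
    uint16_t offset;
} SketchItemPointer;

static bool
sketch_tid_equals(SketchItemPointer a, SketchItemPointer b)
{
    return a.block == b.block && a.offset == b.offset;
}

/* true => the committed xmax was an UPDATE; false => it was a DELETE */
static bool
sketch_was_updated(SketchItemPointer self, SketchItemPointer ctid)
{
    return !sketch_tid_equals(self, ctid);
}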
+ */ +static bool +HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, + Buffer buffer) +{ + HeapTupleHeader tuple = htup->t_data; + + Assert(ItemPointerIsValid(&htup->t_self)); + Assert(htup->t_tableOid != InvalidOid); + + snapshot->xmin = snapshot->xmax = InvalidTransactionId; + snapshot->speculativeToken = 0; + + if (!HeapTupleHeaderXminCommitted(tuple)) + { + if (HeapTupleHeaderXminInvalid(tuple)) + return false; + + /* Used by pre-9.0 binary upgrades */ + if (tuple->t_infomask & HEAP_MOVED_OFF) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (TransactionIdIsCurrentTransactionId(xvac)) + return false; + if (!TransactionIdIsInProgress(xvac)) + { + if (TransactionIdDidCommit(xvac)) + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + } + } + /* Used by pre-9.0 binary upgrades */ + else if (tuple->t_infomask & HEAP_MOVED_IN) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (!TransactionIdIsCurrentTransactionId(xvac)) + { + if (TransactionIdIsInProgress(xvac)) + return false; + if (TransactionIdDidCommit(xvac)) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + else + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } + } + } + else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) + { + if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ + return true; + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) /* not deleter */ + return true; + + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + xmax = HeapTupleGetUpdateXid(tuple); + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + /* updating subtransaction must have aborted */ + if (!TransactionIdIsCurrentTransactionId(xmax)) + return true; + else + return false; + } + + if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + { + /* deleting subtransaction must have aborted */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return true; + } + + return false; + } + else if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmin(tuple))) + { + /* + * Return the speculative token to caller. Caller can worry about + * xmax, since it requires a conclusively locked row version, and + * a concurrent update to this tuple is a conflict of its + * purposes. + */ + if (HeapTupleHeaderIsSpeculative(tuple)) + { + snapshot->speculativeToken = + HeapTupleHeaderGetSpeculativeToken(tuple); + + Assert(snapshot->speculativeToken != 0); + } + + snapshot->xmin = HeapTupleHeaderGetRawXmin(tuple); + /* XXX shouldn't we fall through to look at xmax? 
*/ + return true; /* in insertion by other */ + } + else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + HeapTupleHeaderGetRawXmin(tuple)); + else + { + /* it must have aborted or crashed */ + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } + } + + /* by here, the inserting transaction has committed */ + + if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid or aborted */ + return true; + + if (tuple->t_infomask & HEAP_XMAX_COMMITTED) + { + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return true; + return false; /* updated by other */ + } + + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return true; + + xmax = HeapTupleGetUpdateXid(tuple); + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + if (TransactionIdIsCurrentTransactionId(xmax)) + return false; + if (TransactionIdIsInProgress(xmax)) + { + snapshot->xmax = xmax; + return true; + } + if (TransactionIdDidCommit(xmax)) + return false; + /* it must have aborted or crashed */ + return true; + } + + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + { + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return true; + return false; + } + + if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) + { + if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + snapshot->xmax = HeapTupleHeaderGetRawXmax(tuple); + return true; + } + + if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) + { + /* it must have aborted or crashed */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return true; + } + + /* xmax transaction committed */ + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + { + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return true; + } + + SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, + HeapTupleHeaderGetRawXmax(tuple)); + return false; /* updated by other */ +} + +/* + * HeapTupleSatisfiesMVCC + * True iff heap tuple is valid for the given MVCC snapshot. + * + * See SNAPSHOT_MVCC's definition for the intended behaviour. + * + * Notice that here, we will not update the tuple status hint bits if the + * inserting/deleting transaction is still running according to our snapshot, + * even if in reality it's committed or aborted by now. This is intentional. + * Checking the true transaction state would require access to high-traffic + * shared data structures, creating contention we'd rather do without, and it + * would not change the result of our visibility check anyway. The hint bits + * will be updated by the first visitor that has a snapshot new enough to see + * the inserting/deleting transaction as done. In the meantime, the cost of + * leaving the hint bits unset is basically that each HeapTupleSatisfiesMVCC + * call will need to run TransactionIdIsCurrentTransactionId in addition to + * XidInMVCCSnapshot (but it would have to do the latter anyway). In the old + * coding where we tried to set the hint bits as soon as possible, we instead + * did TransactionIdIsInProgress in each call --- to no avail, as long as the + * inserting/deleting transaction was still running --- which was more cycles + * and more contention on ProcArrayLock. 
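/*
 * Editor's illustrative sketch (not part of the upstream patch): the MVCC
 * checks lean on XidInMVCCSnapshot(), whose core idea is that an XID below
 * the snapshot's xmin finished before the snapshot was taken, an XID at or
 * above its xmax had not finished yet, and anything in between is looked up
 * in the snapshot's in-progress list.  A deliberately simplified version;
 * the real function also handles subtransactions, overflowed snapshots, and
 * XID wraparound.
 */
#include <stdbool.h>
#include <stdint.h>

static bool
sketch_xid_in_snapshot(uint32_t xid,
                       uint32_t snap_xmin, uint32_t snap_xmax,
                       const uint32_t *in_progress, int n_in_progress)
{
    if (xid < snap_xmin)
        return false;           /* finished before the snapshot was taken */
    if (xid >= snap_xmax)
        return true;            /* started too late; treat as still running */
    for (int i = 0; i < n_in_progress; i++)
    {
        if (in_progress[i] == xid)
            return true;        /* explicitly listed as in progress */
    }
    return false;
}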
+ */ +static bool +HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, + Buffer buffer) +{ + HeapTupleHeader tuple = htup->t_data; + + Assert(ItemPointerIsValid(&htup->t_self)); + Assert(htup->t_tableOid != InvalidOid); + + if (!HeapTupleHeaderXminCommitted(tuple)) + { + if (HeapTupleHeaderXminInvalid(tuple)) + return false; + + /* Used by pre-9.0 binary upgrades */ + if (tuple->t_infomask & HEAP_MOVED_OFF) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (TransactionIdIsCurrentTransactionId(xvac)) + return false; + if (!XidInMVCCSnapshot(xvac, snapshot)) + { + if (TransactionIdDidCommit(xvac)) + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + } + } + /* Used by pre-9.0 binary upgrades */ + else if (tuple->t_infomask & HEAP_MOVED_IN) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (!TransactionIdIsCurrentTransactionId(xvac)) + { + if (XidInMVCCSnapshot(xvac, snapshot)) + return false; + if (TransactionIdDidCommit(xvac)) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + else + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } + } + } + else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) + { + if (HeapTupleHeaderGetCmin(tuple) >= snapshot->curcid) + return false; /* inserted after scan started */ + + if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ + return true; + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) /* not deleter */ + return true; + + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + xmax = HeapTupleGetUpdateXid(tuple); + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + /* updating subtransaction must have aborted */ + if (!TransactionIdIsCurrentTransactionId(xmax)) + return true; + else if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + return true; /* updated after scan started */ + else + return false; /* updated before scan started */ + } + + if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + { + /* deleting subtransaction must have aborted */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return true; + } + + if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + return true; /* deleted after scan started */ + else + return false; /* deleted before scan started */ + } + else if (XidInMVCCSnapshot(HeapTupleHeaderGetRawXmin(tuple), snapshot)) + return false; + else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + HeapTupleHeaderGetRawXmin(tuple)); + else + { + /* it must have aborted or crashed */ + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } + } + else + { + /* xmin is committed, but maybe not according to our snapshot */ + if (!HeapTupleHeaderXminFrozen(tuple) && + XidInMVCCSnapshot(HeapTupleHeaderGetRawXmin(tuple), snapshot)) + return false; /* treat as still in progress */ + } + + /* by here, the inserting transaction has committed */ + + if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid or aborted */ + return true; + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return true; + + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + /* already checked above */ + Assert(!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)); + + xmax = 
HeapTupleGetUpdateXid(tuple); + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + if (TransactionIdIsCurrentTransactionId(xmax)) + { + if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + return true; /* deleted after scan started */ + else + return false; /* deleted before scan started */ + } + if (XidInMVCCSnapshot(xmax, snapshot)) + return true; + if (TransactionIdDidCommit(xmax)) + return false; /* updating transaction committed */ + /* it must have aborted or crashed */ + return true; + } + + if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) + { + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + { + if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + return true; /* deleted after scan started */ + else + return false; /* deleted before scan started */ + } + + if (XidInMVCCSnapshot(HeapTupleHeaderGetRawXmax(tuple), snapshot)) + return true; + + if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) + { + /* it must have aborted or crashed */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return true; + } + + /* xmax transaction committed */ + SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, + HeapTupleHeaderGetRawXmax(tuple)); + } + else + { + /* xmax is committed, but maybe not according to our snapshot */ + if (XidInMVCCSnapshot(HeapTupleHeaderGetRawXmax(tuple), snapshot)) + return true; /* treat as still in progress */ + } + + /* xmax transaction committed */ + + return false; +} + + +/* + * HeapTupleSatisfiesVacuum + * + * Determine the status of tuples for VACUUM purposes. Here, what + * we mainly want to know is if a tuple is potentially visible to *any* + * running transaction. If so, it can't be removed yet by VACUUM. + * + * OldestXmin is a cutoff XID (obtained from + * GetOldestNonRemovableTransactionId()). Tuples deleted by XIDs >= + * OldestXmin are deemed "recently dead"; they might still be visible to some + * open transaction, so we can't remove them, even if we see that the deleting + * transaction has committed. + */ +HTSV_Result +HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, + Buffer buffer) +{ + TransactionId dead_after = InvalidTransactionId; + HTSV_Result res; + + res = HeapTupleSatisfiesVacuumHorizon(htup, buffer, &dead_after); + + if (res == HEAPTUPLE_RECENTLY_DEAD) + { + Assert(TransactionIdIsValid(dead_after)); + + if (TransactionIdPrecedes(dead_after, OldestXmin)) + res = HEAPTUPLE_DEAD; + } + else + Assert(!TransactionIdIsValid(dead_after)); + + return res; +} + +/* + * Work horse for HeapTupleSatisfiesVacuum and similar routines. + * + * In contrast to HeapTupleSatisfiesVacuum this routine, when encountering a + * tuple that could still be visible to some backend, stores the xid that + * needs to be compared with the horizon in *dead_after, and returns + * HEAPTUPLE_RECENTLY_DEAD. The caller then can perform the comparison with + * the horizon. This is e.g. useful when comparing with different horizons. + * + * Note: HEAPTUPLE_DEAD can still be returned here, e.g. if the inserting + * transaction aborted. + */ +HTSV_Result +HeapTupleSatisfiesVacuumHorizon(HeapTuple htup, Buffer buffer, TransactionId *dead_after) +{ + HeapTupleHeader tuple = htup->t_data; + + Assert(ItemPointerIsValid(&htup->t_self)); + Assert(htup->t_tableOid != InvalidOid); + Assert(dead_after != NULL); + + *dead_after = InvalidTransactionId; + + /* + * Has inserting transaction committed? 
+ * + * If the inserting transaction aborted, then the tuple was never visible + * to any other transaction, so we can delete it immediately. + */ + if (!HeapTupleHeaderXminCommitted(tuple)) + { + if (HeapTupleHeaderXminInvalid(tuple)) + return HEAPTUPLE_DEAD; + /* Used by pre-9.0 binary upgrades */ + else if (tuple->t_infomask & HEAP_MOVED_OFF) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (TransactionIdIsCurrentTransactionId(xvac)) + return HEAPTUPLE_DELETE_IN_PROGRESS; + if (TransactionIdIsInProgress(xvac)) + return HEAPTUPLE_DELETE_IN_PROGRESS; + if (TransactionIdDidCommit(xvac)) + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return HEAPTUPLE_DEAD; + } + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + } + /* Used by pre-9.0 binary upgrades */ + else if (tuple->t_infomask & HEAP_MOVED_IN) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (TransactionIdIsCurrentTransactionId(xvac)) + return HEAPTUPLE_INSERT_IN_PROGRESS; + if (TransactionIdIsInProgress(xvac)) + return HEAPTUPLE_INSERT_IN_PROGRESS; + if (TransactionIdDidCommit(xvac)) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + else + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return HEAPTUPLE_DEAD; + } + } + else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) + { + if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ + return HEAPTUPLE_INSERT_IN_PROGRESS; + /* only locked? run infomask-only check first, for performance */ + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) || + HeapTupleHeaderIsOnlyLocked(tuple)) + return HEAPTUPLE_INSERT_IN_PROGRESS; + /* inserted and then deleted by same xact */ + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple))) + return HEAPTUPLE_DELETE_IN_PROGRESS; + /* deleting subtransaction must have aborted */ + return HEAPTUPLE_INSERT_IN_PROGRESS; + } + else if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmin(tuple))) + { + /* + * It'd be possible to discern between INSERT/DELETE in progress + * here by looking at xmax - but that doesn't seem beneficial for + * the majority of callers and even detrimental for some. We'd + * rather have callers look at/wait for xmin than xmax. It's + * always correct to return INSERT_IN_PROGRESS because that's + * what's happening from the view of other backends. + */ + return HEAPTUPLE_INSERT_IN_PROGRESS; + } + else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + HeapTupleHeaderGetRawXmin(tuple)); + else + { + /* + * Not in Progress, Not Committed, so either Aborted or crashed + */ + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return HEAPTUPLE_DEAD; + } + + /* + * At this point the xmin is known committed, but we might not have + * been able to set the hint bit yet; so we can no longer Assert that + * it's set. + */ + } + + /* + * Okay, the inserter committed, so it was good at some point. Now what + * about the deleting transaction? + */ + if (tuple->t_infomask & HEAP_XMAX_INVALID) + return HEAPTUPLE_LIVE; + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + { + /* + * "Deleting" xact really only locked it, so the tuple is live in any + * case. However, we should make sure that either XMAX_COMMITTED or + * XMAX_INVALID gets set once the xact is gone, to reduce the costs of + * examining the tuple for future xacts. 
+ */ + if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) + { + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + /* + * If it's a pre-pg_upgrade tuple, the multixact cannot + * possibly be running; otherwise have to check. + */ + if (!HEAP_LOCKED_UPGRADED(tuple->t_infomask) && + MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), + true)) + return HEAPTUPLE_LIVE; + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); + } + else + { + if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) + return HEAPTUPLE_LIVE; + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + } + } + + /* + * We don't really care whether xmax did commit, abort or crash. We + * know that xmax did lock the tuple, but it did not and will never + * actually update it. + */ + + return HEAPTUPLE_LIVE; + } + + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax = HeapTupleGetUpdateXid(tuple); + + /* already checked above */ + Assert(!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)); + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + if (TransactionIdIsInProgress(xmax)) + return HEAPTUPLE_DELETE_IN_PROGRESS; + else if (TransactionIdDidCommit(xmax)) + { + /* + * The multixact might still be running due to lockers. Need to + * allow for pruning if below the xid horizon regardless -- + * otherwise we could end up with a tuple where the updater has to + * be removed due to the horizon, but is not pruned away. It's + * not a problem to prune that tuple, because any remaining + * lockers will also be present in newer tuple versions. + */ + *dead_after = xmax; + return HEAPTUPLE_RECENTLY_DEAD; + } + else if (!MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), false)) + { + /* + * Not in Progress, Not Committed, so either Aborted or crashed. + * Mark the Xmax as invalid. + */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); + } + + return HEAPTUPLE_LIVE; + } + + if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) + { + if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) + return HEAPTUPLE_DELETE_IN_PROGRESS; + else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) + SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, + HeapTupleHeaderGetRawXmax(tuple)); + else + { + /* + * Not in Progress, Not Committed, so either Aborted or crashed + */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return HEAPTUPLE_LIVE; + } + + /* + * At this point the xmax is known committed, but we might not have + * been able to set the hint bit yet; so we can no longer Assert that + * it's set. + */ + } + + /* + * Deleter committed, allow caller to check if it was recent enough that + * some open transactions could still see the tuple. + */ + *dead_after = HeapTupleHeaderGetRawXmax(tuple); + return HEAPTUPLE_RECENTLY_DEAD; +} + + +/* + * HeapTupleSatisfiesNonVacuumable + * + * True if tuple might be visible to some transaction; false if it's + * surely dead to everyone, ie, vacuumable. + * + * See SNAPSHOT_NON_VACUUMABLE's definition for the intended behaviour. + * + * This is an interface to HeapTupleSatisfiesVacuum that's callable via + * HeapTupleSatisfiesSnapshot, so it can be used through a Snapshot. + * snapshot->vistest must have been set up with the horizon to use. 
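Both HeapTupleSatisfiesVacuum above and the wrapper that follows are thin layers over HeapTupleSatisfiesVacuumHorizon, and a caller with its own horizon can follow the same pattern. A minimal sketch (editorial illustration only, not part of the upstream file; the function name and the my_cutoff parameter are invented):

/* Editorial sketch, not upstream code; assumes postgres.h, access/heapam.h and access/transam.h. */
static HTSV_Result
classify_with_my_horizon(HeapTuple htup, Buffer buffer, TransactionId my_cutoff)
{
	TransactionId dead_after = InvalidTransactionId;
	HTSV_Result res;

	/* The work horse decides everything except the horizon comparison. */
	res = HeapTupleSatisfiesVacuumHorizon(htup, buffer, &dead_after);

	/* The caller supplies the cutoff; other horizons could be tested against the same dead_after. */
	if (res == HEAPTUPLE_RECENTLY_DEAD &&
		TransactionIdPrecedes(dead_after, my_cutoff))
		res = HEAPTUPLE_DEAD;

	return res;
}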
+ */ +static bool +HeapTupleSatisfiesNonVacuumable(HeapTuple htup, Snapshot snapshot, + Buffer buffer) +{ + TransactionId dead_after = InvalidTransactionId; + HTSV_Result res; + + res = HeapTupleSatisfiesVacuumHorizon(htup, buffer, &dead_after); + + if (res == HEAPTUPLE_RECENTLY_DEAD) + { + Assert(TransactionIdIsValid(dead_after)); + + if (GlobalVisTestIsRemovableXid(snapshot->vistest, dead_after)) + res = HEAPTUPLE_DEAD; + } + else + Assert(!TransactionIdIsValid(dead_after)); + + return res != HEAPTUPLE_DEAD; +} + + +/* + * HeapTupleIsSurelyDead + * + * Cheaply determine whether a tuple is surely dead to all onlookers. + * We sometimes use this in lieu of HeapTupleSatisfiesVacuum when the + * tuple has just been tested by another visibility routine (usually + * HeapTupleSatisfiesMVCC) and, therefore, any hint bits that can be set + * should already be set. We assume that if no hint bits are set, the xmin + * or xmax transaction is still running. This is therefore faster than + * HeapTupleSatisfiesVacuum, because we consult neither procarray nor CLOG. + * It's okay to return false when in doubt, but we must return true only + * if the tuple is removable. + */ +bool +HeapTupleIsSurelyDead(HeapTuple htup, GlobalVisState *vistest) +{ + HeapTupleHeader tuple = htup->t_data; + + Assert(ItemPointerIsValid(&htup->t_self)); + Assert(htup->t_tableOid != InvalidOid); + + /* + * If the inserting transaction is marked invalid, then it aborted, and + * the tuple is definitely dead. If it's marked neither committed nor + * invalid, then we assume it's still alive (since the presumption is that + * all relevant hint bits were just set moments ago). + */ + if (!HeapTupleHeaderXminCommitted(tuple)) + return HeapTupleHeaderXminInvalid(tuple) ? true : false; + + /* + * If the inserting transaction committed, but any deleting transaction + * aborted, the tuple is still alive. + */ + if (tuple->t_infomask & HEAP_XMAX_INVALID) + return false; + + /* + * If the XMAX is just a lock, the tuple is still alive. + */ + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return false; + + /* + * If the Xmax is a MultiXact, it might be dead or alive, but we cannot + * know without checking pg_multixact. + */ + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + return false; + + /* If deleter isn't known to have committed, assume it's still running. */ + if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) + return false; + + /* Deleter committed, so tuple is dead if the XID is old enough. */ + return GlobalVisTestIsRemovableXid(vistest, + HeapTupleHeaderGetRawXmax(tuple)); +} + +/* + * Is the tuple really only locked? That is, is it not updated? + * + * It's easy to check just infomask bits if the locker is not a multi; but + * otherwise we need to verify that the updating transaction has not aborted. + * + * This function is here because it follows the same visibility rules laid out + * at the top of this file. + */ +bool +HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple) +{ + TransactionId xmax; + + /* if there's no valid Xmax, then there's obviously no update either */ + if (tuple->t_infomask & HEAP_XMAX_INVALID) + return true; + + if (tuple->t_infomask & HEAP_XMAX_LOCK_ONLY) + return true; + + /* invalid xmax means no update */ + if (!TransactionIdIsValid(HeapTupleHeaderGetRawXmax(tuple))) + return true; + + /* + * if HEAP_XMAX_LOCK_ONLY is not set and not a multi, then this must + * necessarily have been updated + */ + if (!(tuple->t_infomask & HEAP_XMAX_IS_MULTI)) + return false; + + /* ... 
but if it's a multi, then perhaps the updating Xid aborted. */ + xmax = HeapTupleGetUpdateXid(tuple); + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + if (TransactionIdIsCurrentTransactionId(xmax)) + return false; + if (TransactionIdIsInProgress(xmax)) + return false; + if (TransactionIdDidCommit(xmax)) + return false; + + /* + * not current, not in progress, not committed -- must have aborted or + * crashed + */ + return true; +} + +/* + * check whether the transaction id 'xid' is in the pre-sorted array 'xip'. + */ +static bool +TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num) +{ + return num > 0 && + bsearch(&xid, xip, num, sizeof(TransactionId), xidComparator) != NULL; +} + +/* + * See the comments for HeapTupleSatisfiesMVCC for the semantics this function + * obeys. + * + * Only usable on tuples from catalog tables! + * + * We don't need to support HEAP_MOVED_(IN|OFF) for now because we only support + * reading catalog pages which couldn't have been created in an older version. + * + * We don't set any hint bits in here as it seems unlikely to be beneficial as + * those should already be set by normal access and it seems to be too + * dangerous to do so as the semantics of doing so during timetravel are more + * complicated than when dealing "only" with the present. + */ +static bool +HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot, + Buffer buffer) +{ + HeapTupleHeader tuple = htup->t_data; + TransactionId xmin = HeapTupleHeaderGetXmin(tuple); + TransactionId xmax = HeapTupleHeaderGetRawXmax(tuple); + + Assert(ItemPointerIsValid(&htup->t_self)); + Assert(htup->t_tableOid != InvalidOid); + + /* inserting transaction aborted */ + if (HeapTupleHeaderXminInvalid(tuple)) + { + Assert(!TransactionIdDidCommit(xmin)); + return false; + } + /* check if it's one of our txids, toplevel is also in there */ + else if (TransactionIdInArray(xmin, snapshot->subxip, snapshot->subxcnt)) + { + bool resolved; + CommandId cmin = HeapTupleHeaderGetRawCommandId(tuple); + CommandId cmax = InvalidCommandId; + + /* + * another transaction might have (tried to) delete this tuple or + * cmin/cmax was stored in a combo CID. So we need to lookup the + * actual values externally. + */ + resolved = ResolveCminCmaxDuringDecoding(HistoricSnapshotGetTupleCids(), snapshot, + htup, buffer, + &cmin, &cmax); + + /* + * If we haven't resolved the combo CID to cmin/cmax, that means we + * have not decoded the combo CID yet. That means the cmin is + * definitely in the future, and we're not supposed to see the tuple + * yet. + * + * XXX This only applies to decoding of in-progress transactions. In + * regular logical decoding we only execute this code at commit time, + * at which point we should have seen all relevant combo CIDs. So + * ideally, we should error out in this case but in practice, this + * won't happen. If we are too worried about this then we can add an + * elog inside ResolveCminCmaxDuringDecoding. + * + * XXX For the streaming case, we can track the largest combo CID + * assigned, and error out based on this (when unable to resolve combo + * CID below that observed maximum value). + */ + if (!resolved) + return false; + + Assert(cmin != InvalidCommandId); + + if (cmin >= snapshot->curcid) + return false; /* inserted after scan started */ + /* fall through */ + } + /* committed before our xmin horizon. Do a normal visibility check. 
*/ + else if (TransactionIdPrecedes(xmin, snapshot->xmin)) + { + Assert(!(HeapTupleHeaderXminCommitted(tuple) && + !TransactionIdDidCommit(xmin))); + + /* check for hint bit first, consult clog afterwards */ + if (!HeapTupleHeaderXminCommitted(tuple) && + !TransactionIdDidCommit(xmin)) + return false; + /* fall through */ + } + /* beyond our xmax horizon, i.e. invisible */ + else if (TransactionIdFollowsOrEquals(xmin, snapshot->xmax)) + { + return false; + } + /* check if it's a committed transaction in [xmin, xmax) */ + else if (TransactionIdInArray(xmin, snapshot->xip, snapshot->xcnt)) + { + /* fall through */ + } + + /* + * none of the above, i.e. between [xmin, xmax) but hasn't committed. I.e. + * invisible. + */ + else + { + return false; + } + + /* at this point we know xmin is visible, go on to check xmax */ + + /* xid invalid or aborted */ + if (tuple->t_infomask & HEAP_XMAX_INVALID) + return true; + /* locked tuples are always visible */ + else if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return true; + + /* + * We can see multis here if we're looking at user tables or if somebody + * SELECT ... FOR SHARE/UPDATE a system table. + */ + else if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + xmax = HeapTupleGetUpdateXid(tuple); + } + + /* check if it's one of our txids, toplevel is also in there */ + if (TransactionIdInArray(xmax, snapshot->subxip, snapshot->subxcnt)) + { + bool resolved; + CommandId cmin; + CommandId cmax = HeapTupleHeaderGetRawCommandId(tuple); + + /* Lookup actual cmin/cmax values */ + resolved = ResolveCminCmaxDuringDecoding(HistoricSnapshotGetTupleCids(), snapshot, + htup, buffer, + &cmin, &cmax); + + /* + * If we haven't resolved the combo CID to cmin/cmax, that means we + * have not decoded the combo CID yet. That means the cmax is + * definitely in the future, and we're still supposed to see the + * tuple. + * + * XXX This only applies to decoding of in-progress transactions. In + * regular logical decoding we only execute this code at commit time, + * at which point we should have seen all relevant combo CIDs. So + * ideally, we should error out in this case but in practice, this + * won't happen. If we are too worried about this then we can add an + * elog inside ResolveCminCmaxDuringDecoding. + * + * XXX For the streaming case, we can track the largest combo CID + * assigned, and error out based on this (when unable to resolve combo + * CID below that observed maximum value). + */ + if (!resolved || cmax == InvalidCommandId) + return true; + + if (cmax >= snapshot->curcid) + return true; /* deleted after scan started */ + else + return false; /* deleted before scan started */ + } + /* below xmin horizon, normal transaction state is valid */ + else if (TransactionIdPrecedes(xmax, snapshot->xmin)) + { + Assert(!(tuple->t_infomask & HEAP_XMAX_COMMITTED && + !TransactionIdDidCommit(xmax))); + + /* check hint bit first */ + if (tuple->t_infomask & HEAP_XMAX_COMMITTED) + return false; + + /* check clog */ + return !TransactionIdDidCommit(xmax); + } + /* above xmax horizon, we cannot possibly see the deleting transaction */ + else if (TransactionIdFollowsOrEquals(xmax, snapshot->xmax)) + return true; + /* xmax is between [xmin, xmax), check known committed array */ + else if (TransactionIdInArray(xmax, snapshot->xip, snapshot->xcnt)) + return false; + /* xmax is between [xmin, xmax), but known not to have committed yet */ + else + return true; +} + +/* + * HeapTupleSatisfiesVisibility + * True iff heap tuple satisfies a time qual. 
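As a usage illustration (an editorial sketch, not upstream code; the function and variable names are invented), a caller might test every normal line pointer on a heap page against a snapshot like this, holding the share lock required by the notes that follow:

/* Editorial sketch, not upstream code; assumes postgres.h, access/heapam.h, access/htup_details.h, storage/bufmgr.h and storage/bufpage.h. */
static int
count_visible_on_page(Relation relation, Buffer buffer, Snapshot snapshot)
{
	Page		page;
	OffsetNumber offnum,
				maxoff;
	int			nvisible = 0;

	LockBuffer(buffer, BUFFER_LOCK_SHARE);

	page = BufferGetPage(buffer);
	maxoff = PageGetMaxOffsetNumber(page);

	for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemid = PageGetItemId(page, offnum);
		HeapTupleData tuple;

		if (!ItemIdIsNormal(itemid))
			continue;

		/* Build the HeapTupleData the visibility routines expect. */
		tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
		tuple.t_len = ItemIdGetLength(itemid);
		tuple.t_tableOid = RelationGetRelid(relation);
		ItemPointerSet(&tuple.t_self, BufferGetBlockNumber(buffer), offnum);

		if (HeapTupleSatisfiesVisibility(&tuple, snapshot, buffer))
			nvisible++;
	}

	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

	return nvisible;
}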
+ * + * Notes: + * Assumes heap tuple is valid, and buffer at least share locked. + * + * Hint bits in the HeapTuple's t_infomask may be updated as a side effect; + * if so, the indicated buffer is marked dirty. + */ +bool +HeapTupleSatisfiesVisibility(HeapTuple tup, Snapshot snapshot, Buffer buffer) +{ + switch (snapshot->snapshot_type) + { + case SNAPSHOT_MVCC: + return HeapTupleSatisfiesMVCC(tup, snapshot, buffer); + break; + case SNAPSHOT_SELF: + return HeapTupleSatisfiesSelf(tup, snapshot, buffer); + break; + case SNAPSHOT_ANY: + return HeapTupleSatisfiesAny(tup, snapshot, buffer); + break; + case SNAPSHOT_TOAST: + return HeapTupleSatisfiesToast(tup, snapshot, buffer); + break; + case SNAPSHOT_DIRTY: + return HeapTupleSatisfiesDirty(tup, snapshot, buffer); + break; + case SNAPSHOT_HISTORIC_MVCC: + return HeapTupleSatisfiesHistoricMVCC(tup, snapshot, buffer); + break; + case SNAPSHOT_NON_VACUUMABLE: + return HeapTupleSatisfiesNonVacuumable(tup, snapshot, buffer); + break; + } + + return false; /* keep compiler quiet */ +} diff --git a/src/backend/access/heap/heaptoast.c b/src/backend/access/heap/heaptoast.c new file mode 100644 index 0000000..55bbe1d --- /dev/null +++ b/src/backend/access/heap/heaptoast.c @@ -0,0 +1,793 @@ +/*------------------------------------------------------------------------- + * + * heaptoast.c + * Heap-specific definitions for external and compressed storage + * of variable size attributes. + * + * Copyright (c) 2000-2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/access/heap/heaptoast.c + * + * + * INTERFACE ROUTINES + * heap_toast_insert_or_update - + * Try to make a given tuple fit into one page by compressing + * or moving off attributes + * + * heap_toast_delete - + * Reclaim toast storage when a tuple is deleted + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/detoast.h" +#include "access/genam.h" +#include "access/heapam.h" +#include "access/heaptoast.h" +#include "access/toast_helper.h" +#include "access/toast_internals.h" +#include "utils/fmgroids.h" + + +/* ---------- + * heap_toast_delete - + * + * Cascaded delete toast-entries on DELETE + * ---------- + */ +void +heap_toast_delete(Relation rel, HeapTuple oldtup, bool is_speculative) +{ + TupleDesc tupleDesc; + Datum toast_values[MaxHeapAttributeNumber]; + bool toast_isnull[MaxHeapAttributeNumber]; + + /* + * We should only ever be called for tuples of plain relations or + * materialized views --- recursing on a toast rel is bad news. + */ + Assert(rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_MATVIEW); + + /* + * Get the tuple descriptor and break down the tuple into fields. + * + * NOTE: it's debatable whether to use heap_deform_tuple() here or just + * heap_getattr() only the varlena columns. The latter could win if there + * are few varlena columns and many non-varlena ones. However, + * heap_deform_tuple costs only O(N) while the heap_getattr way would cost + * O(N^2) if there are many varlena columns, so it seems better to err on + * the side of linear cost. (We won't even be here unless there's at + * least one varlena column, by the way.) + */ + tupleDesc = rel->rd_att; + + Assert(tupleDesc->natts <= MaxHeapAttributeNumber); + heap_deform_tuple(oldtup, tupleDesc, toast_values, toast_isnull); + + /* Do the real work. 
*/ + toast_delete_external(rel, toast_values, toast_isnull, is_speculative); +} + + +/* ---------- + * heap_toast_insert_or_update - + * + * Delete no-longer-used toast-entries and create new ones to + * make the new tuple fit on INSERT or UPDATE + * + * Inputs: + * newtup: the candidate new tuple to be inserted + * oldtup: the old row version for UPDATE, or NULL for INSERT + * options: options to be passed to heap_insert() for toast rows + * Result: + * either newtup if no toasting is needed, or a palloc'd modified tuple + * that is what should actually get stored + * + * NOTE: neither newtup nor oldtup will be modified. This is a change + * from the pre-8.1 API of this routine. + * ---------- + */ +HeapTuple +heap_toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup, + int options) +{ + HeapTuple result_tuple; + TupleDesc tupleDesc; + int numAttrs; + + Size maxDataLen; + Size hoff; + + bool toast_isnull[MaxHeapAttributeNumber]; + bool toast_oldisnull[MaxHeapAttributeNumber]; + Datum toast_values[MaxHeapAttributeNumber]; + Datum toast_oldvalues[MaxHeapAttributeNumber]; + ToastAttrInfo toast_attr[MaxHeapAttributeNumber]; + ToastTupleContext ttc; + + /* + * Ignore the INSERT_SPECULATIVE option. Speculative insertions/super + * deletions just normally insert/delete the toast values. It seems + * easiest to deal with that here, instead on, potentially, multiple + * callers. + */ + options &= ~HEAP_INSERT_SPECULATIVE; + + /* + * We should only ever be called for tuples of plain relations or + * materialized views --- recursing on a toast rel is bad news. + */ + Assert(rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_MATVIEW); + + /* + * Get the tuple descriptor and break down the tuple(s) into fields. + */ + tupleDesc = rel->rd_att; + numAttrs = tupleDesc->natts; + + Assert(numAttrs <= MaxHeapAttributeNumber); + heap_deform_tuple(newtup, tupleDesc, toast_values, toast_isnull); + if (oldtup != NULL) + heap_deform_tuple(oldtup, tupleDesc, toast_oldvalues, toast_oldisnull); + + /* ---------- + * Prepare for toasting + * ---------- + */ + ttc.ttc_rel = rel; + ttc.ttc_values = toast_values; + ttc.ttc_isnull = toast_isnull; + if (oldtup == NULL) + { + ttc.ttc_oldvalues = NULL; + ttc.ttc_oldisnull = NULL; + } + else + { + ttc.ttc_oldvalues = toast_oldvalues; + ttc.ttc_oldisnull = toast_oldisnull; + } + ttc.ttc_attr = toast_attr; + toast_tuple_init(&ttc); + + /* ---------- + * Compress and/or save external until data fits into target length + * + * 1: Inline compress attributes with attstorage EXTENDED, and store very + * large attributes with attstorage EXTENDED or EXTERNAL external + * immediately + * 2: Store attributes with attstorage EXTENDED or EXTERNAL external + * 3: Inline compress attributes with attstorage MAIN + * 4: Store attributes with attstorage MAIN external + * ---------- + */ + + /* compute header overhead --- this should match heap_form_tuple() */ + hoff = SizeofHeapTupleHeader; + if ((ttc.ttc_flags & TOAST_HAS_NULLS) != 0) + hoff += BITMAPLEN(numAttrs); + hoff = MAXALIGN(hoff); + /* now convert to a limit on the tuple data size */ + maxDataLen = RelationGetToastTupleTarget(rel, TOAST_TUPLE_TARGET) - hoff; + + /* + * Look for attributes with attstorage EXTENDED to compress. Also find + * large attributes with attstorage EXTENDED or EXTERNAL, and store them + * external. 
+ */ + while (heap_compute_data_size(tupleDesc, + toast_values, toast_isnull) > maxDataLen) + { + int biggest_attno; + + biggest_attno = toast_tuple_find_biggest_attribute(&ttc, true, false); + if (biggest_attno < 0) + break; + + /* + * Attempt to compress it inline, if it has attstorage EXTENDED + */ + if (TupleDescAttr(tupleDesc, biggest_attno)->attstorage == TYPSTORAGE_EXTENDED) + toast_tuple_try_compression(&ttc, biggest_attno); + else + { + /* + * has attstorage EXTERNAL, ignore on subsequent compression + * passes + */ + toast_attr[biggest_attno].tai_colflags |= TOASTCOL_INCOMPRESSIBLE; + } + + /* + * If this value is by itself more than maxDataLen (after compression + * if any), push it out to the toast table immediately, if possible. + * This avoids uselessly compressing other fields in the common case + * where we have one long field and several short ones. + * + * XXX maybe the threshold should be less than maxDataLen? + */ + if (toast_attr[biggest_attno].tai_size > maxDataLen && + rel->rd_rel->reltoastrelid != InvalidOid) + toast_tuple_externalize(&ttc, biggest_attno, options); + } + + /* + * Second we look for attributes of attstorage EXTENDED or EXTERNAL that + * are still inline, and make them external. But skip this if there's no + * toast table to push them to. + */ + while (heap_compute_data_size(tupleDesc, + toast_values, toast_isnull) > maxDataLen && + rel->rd_rel->reltoastrelid != InvalidOid) + { + int biggest_attno; + + biggest_attno = toast_tuple_find_biggest_attribute(&ttc, false, false); + if (biggest_attno < 0) + break; + toast_tuple_externalize(&ttc, biggest_attno, options); + } + + /* + * Round 3 - this time we take attributes with storage MAIN into + * compression + */ + while (heap_compute_data_size(tupleDesc, + toast_values, toast_isnull) > maxDataLen) + { + int biggest_attno; + + biggest_attno = toast_tuple_find_biggest_attribute(&ttc, true, true); + if (biggest_attno < 0) + break; + + toast_tuple_try_compression(&ttc, biggest_attno); + } + + /* + * Finally we store attributes of type MAIN externally. At this point we + * increase the target tuple size, so that MAIN attributes aren't stored + * externally unless really necessary. + */ + maxDataLen = TOAST_TUPLE_TARGET_MAIN - hoff; + + while (heap_compute_data_size(tupleDesc, + toast_values, toast_isnull) > maxDataLen && + rel->rd_rel->reltoastrelid != InvalidOid) + { + int biggest_attno; + + biggest_attno = toast_tuple_find_biggest_attribute(&ttc, false, true); + if (biggest_attno < 0) + break; + + toast_tuple_externalize(&ttc, biggest_attno, options); + } + + /* + * In the case we toasted any values, we need to build a new heap tuple + * with the changed values. + */ + if ((ttc.ttc_flags & TOAST_NEEDS_CHANGE) != 0) + { + HeapTupleHeader olddata = newtup->t_data; + HeapTupleHeader new_data; + int32 new_header_len; + int32 new_data_len; + int32 new_tuple_len; + + /* + * Calculate the new size of the tuple. + * + * Note: we used to assume here that the old tuple's t_hoff must equal + * the new_header_len value, but that was incorrect. The old tuple + * might have a smaller-than-current natts, if there's been an ALTER + * TABLE ADD COLUMN since it was stored; and that would lead to a + * different conclusion about the size of the null bitmap, or even + * whether there needs to be one at all. 
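A concrete case (editorial note): a tuple stored when the table had 8 attributes gets a 1-byte null bitmap from BITMAPLEN(8), while the rebuilt tuple, re-formed with today's natts of, say, 9, needs BITMAPLEN(9) = 2 bytes if any value is NULL, and possibly no bitmap at all if none is; either way the old t_hoff cannot simply be reused.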
+ */ + new_header_len = SizeofHeapTupleHeader; + if ((ttc.ttc_flags & TOAST_HAS_NULLS) != 0) + new_header_len += BITMAPLEN(numAttrs); + new_header_len = MAXALIGN(new_header_len); + new_data_len = heap_compute_data_size(tupleDesc, + toast_values, toast_isnull); + new_tuple_len = new_header_len + new_data_len; + + /* + * Allocate and zero the space needed, and fill HeapTupleData fields. + */ + result_tuple = (HeapTuple) palloc0(HEAPTUPLESIZE + new_tuple_len); + result_tuple->t_len = new_tuple_len; + result_tuple->t_self = newtup->t_self; + result_tuple->t_tableOid = newtup->t_tableOid; + new_data = (HeapTupleHeader) ((char *) result_tuple + HEAPTUPLESIZE); + result_tuple->t_data = new_data; + + /* + * Copy the existing tuple header, but adjust natts and t_hoff. + */ + memcpy(new_data, olddata, SizeofHeapTupleHeader); + HeapTupleHeaderSetNatts(new_data, numAttrs); + new_data->t_hoff = new_header_len; + + /* Copy over the data, and fill the null bitmap if needed */ + heap_fill_tuple(tupleDesc, + toast_values, + toast_isnull, + (char *) new_data + new_header_len, + new_data_len, + &(new_data->t_infomask), + ((ttc.ttc_flags & TOAST_HAS_NULLS) != 0) ? + new_data->t_bits : NULL); + } + else + result_tuple = newtup; + + toast_tuple_cleanup(&ttc); + + return result_tuple; +} + + +/* ---------- + * toast_flatten_tuple - + * + * "Flatten" a tuple to contain no out-of-line toasted fields. + * (This does not eliminate compressed or short-header datums.) + * + * Note: we expect the caller already checked HeapTupleHasExternal(tup), + * so there is no need for a short-circuit path. + * ---------- + */ +HeapTuple +toast_flatten_tuple(HeapTuple tup, TupleDesc tupleDesc) +{ + HeapTuple new_tuple; + int numAttrs = tupleDesc->natts; + int i; + Datum toast_values[MaxTupleAttributeNumber]; + bool toast_isnull[MaxTupleAttributeNumber]; + bool toast_free[MaxTupleAttributeNumber]; + + /* + * Break down the tuple into fields. + */ + Assert(numAttrs <= MaxTupleAttributeNumber); + heap_deform_tuple(tup, tupleDesc, toast_values, toast_isnull); + + memset(toast_free, 0, numAttrs * sizeof(bool)); + + for (i = 0; i < numAttrs; i++) + { + /* + * Look at non-null varlena attributes + */ + if (!toast_isnull[i] && TupleDescAttr(tupleDesc, i)->attlen == -1) + { + struct varlena *new_value; + + new_value = (struct varlena *) DatumGetPointer(toast_values[i]); + if (VARATT_IS_EXTERNAL(new_value)) + { + new_value = detoast_external_attr(new_value); + toast_values[i] = PointerGetDatum(new_value); + toast_free[i] = true; + } + } + } + + /* + * Form the reconfigured tuple. + */ + new_tuple = heap_form_tuple(tupleDesc, toast_values, toast_isnull); + + /* + * Be sure to copy the tuple's identity fields. We also make a point of + * copying visibility info, just in case anybody looks at those fields in + * a syscache entry. 
+ */ + new_tuple->t_self = tup->t_self; + new_tuple->t_tableOid = tup->t_tableOid; + + new_tuple->t_data->t_choice = tup->t_data->t_choice; + new_tuple->t_data->t_ctid = tup->t_data->t_ctid; + new_tuple->t_data->t_infomask &= ~HEAP_XACT_MASK; + new_tuple->t_data->t_infomask |= + tup->t_data->t_infomask & HEAP_XACT_MASK; + new_tuple->t_data->t_infomask2 &= ~HEAP2_XACT_MASK; + new_tuple->t_data->t_infomask2 |= + tup->t_data->t_infomask2 & HEAP2_XACT_MASK; + + /* + * Free allocated temp values + */ + for (i = 0; i < numAttrs; i++) + if (toast_free[i]) + pfree(DatumGetPointer(toast_values[i])); + + return new_tuple; +} + + +/* ---------- + * toast_flatten_tuple_to_datum - + * + * "Flatten" a tuple containing out-of-line toasted fields into a Datum. + * The result is always palloc'd in the current memory context. + * + * We have a general rule that Datums of container types (rows, arrays, + * ranges, etc) must not contain any external TOAST pointers. Without + * this rule, we'd have to look inside each Datum when preparing a tuple + * for storage, which would be expensive and would fail to extend cleanly + * to new sorts of container types. + * + * However, we don't want to say that tuples represented as HeapTuples + * can't contain toasted fields, so instead this routine should be called + * when such a HeapTuple is being converted into a Datum. + * + * While we're at it, we decompress any compressed fields too. This is not + * necessary for correctness, but reflects an expectation that compression + * will be more effective if applied to the whole tuple not individual + * fields. We are not so concerned about that that we want to deconstruct + * and reconstruct tuples just to get rid of compressed fields, however. + * So callers typically won't call this unless they see that the tuple has + * at least one external field. + * + * On the other hand, in-line short-header varlena fields are left alone. + * If we "untoasted" them here, they'd just get changed back to short-header + * format anyway within heap_fill_tuple. + * ---------- + */ +Datum +toast_flatten_tuple_to_datum(HeapTupleHeader tup, + uint32 tup_len, + TupleDesc tupleDesc) +{ + HeapTupleHeader new_data; + int32 new_header_len; + int32 new_data_len; + int32 new_tuple_len; + HeapTupleData tmptup; + int numAttrs = tupleDesc->natts; + int i; + bool has_nulls = false; + Datum toast_values[MaxTupleAttributeNumber]; + bool toast_isnull[MaxTupleAttributeNumber]; + bool toast_free[MaxTupleAttributeNumber]; + + /* Build a temporary HeapTuple control structure */ + tmptup.t_len = tup_len; + ItemPointerSetInvalid(&(tmptup.t_self)); + tmptup.t_tableOid = InvalidOid; + tmptup.t_data = tup; + + /* + * Break down the tuple into fields. + */ + Assert(numAttrs <= MaxTupleAttributeNumber); + heap_deform_tuple(&tmptup, tupleDesc, toast_values, toast_isnull); + + memset(toast_free, 0, numAttrs * sizeof(bool)); + + for (i = 0; i < numAttrs; i++) + { + /* + * Look at non-null varlena attributes + */ + if (toast_isnull[i]) + has_nulls = true; + else if (TupleDescAttr(tupleDesc, i)->attlen == -1) + { + struct varlena *new_value; + + new_value = (struct varlena *) DatumGetPointer(toast_values[i]); + if (VARATT_IS_EXTERNAL(new_value) || + VARATT_IS_COMPRESSED(new_value)) + { + new_value = detoast_attr(new_value); + toast_values[i] = PointerGetDatum(new_value); + toast_free[i] = true; + } + } + } + + /* + * Calculate the new size of the tuple. + * + * This should match the reconstruction code in + * heap_toast_insert_or_update. 
+ */ + new_header_len = SizeofHeapTupleHeader; + if (has_nulls) + new_header_len += BITMAPLEN(numAttrs); + new_header_len = MAXALIGN(new_header_len); + new_data_len = heap_compute_data_size(tupleDesc, + toast_values, toast_isnull); + new_tuple_len = new_header_len + new_data_len; + + new_data = (HeapTupleHeader) palloc0(new_tuple_len); + + /* + * Copy the existing tuple header, but adjust natts and t_hoff. + */ + memcpy(new_data, tup, SizeofHeapTupleHeader); + HeapTupleHeaderSetNatts(new_data, numAttrs); + new_data->t_hoff = new_header_len; + + /* Set the composite-Datum header fields correctly */ + HeapTupleHeaderSetDatumLength(new_data, new_tuple_len); + HeapTupleHeaderSetTypeId(new_data, tupleDesc->tdtypeid); + HeapTupleHeaderSetTypMod(new_data, tupleDesc->tdtypmod); + + /* Copy over the data, and fill the null bitmap if needed */ + heap_fill_tuple(tupleDesc, + toast_values, + toast_isnull, + (char *) new_data + new_header_len, + new_data_len, + &(new_data->t_infomask), + has_nulls ? new_data->t_bits : NULL); + + /* + * Free allocated temp values + */ + for (i = 0; i < numAttrs; i++) + if (toast_free[i]) + pfree(DatumGetPointer(toast_values[i])); + + return PointerGetDatum(new_data); +} + + +/* ---------- + * toast_build_flattened_tuple - + * + * Build a tuple containing no out-of-line toasted fields. + * (This does not eliminate compressed or short-header datums.) + * + * This is essentially just like heap_form_tuple, except that it will + * expand any external-data pointers beforehand. + * + * It's not very clear whether it would be preferable to decompress + * in-line compressed datums while at it. For now, we don't. + * ---------- + */ +HeapTuple +toast_build_flattened_tuple(TupleDesc tupleDesc, + Datum *values, + bool *isnull) +{ + HeapTuple new_tuple; + int numAttrs = tupleDesc->natts; + int num_to_free; + int i; + Datum new_values[MaxTupleAttributeNumber]; + Pointer freeable_values[MaxTupleAttributeNumber]; + + /* + * We can pass the caller's isnull array directly to heap_form_tuple, but + * we potentially need to modify the values array. + */ + Assert(numAttrs <= MaxTupleAttributeNumber); + memcpy(new_values, values, numAttrs * sizeof(Datum)); + + num_to_free = 0; + for (i = 0; i < numAttrs; i++) + { + /* + * Look at non-null varlena attributes + */ + if (!isnull[i] && TupleDescAttr(tupleDesc, i)->attlen == -1) + { + struct varlena *new_value; + + new_value = (struct varlena *) DatumGetPointer(new_values[i]); + if (VARATT_IS_EXTERNAL(new_value)) + { + new_value = detoast_external_attr(new_value); + new_values[i] = PointerGetDatum(new_value); + freeable_values[num_to_free++] = (Pointer) new_value; + } + } + } + + /* + * Form the reconfigured tuple. + */ + new_tuple = heap_form_tuple(tupleDesc, new_values, isnull); + + /* + * Free allocated temp values + */ + for (i = 0; i < num_to_free; i++) + pfree(freeable_values[i]); + + return new_tuple; +} + +/* + * Fetch a TOAST slice from a heap table. + * + * toastrel is the relation from which chunks are to be fetched. + * valueid identifies the TOAST value from which chunks are being fetched. + * attrsize is the total size of the TOAST value. + * sliceoffset is the byte offset within the TOAST value from which to fetch. + * slicelength is the number of bytes to be fetched from the TOAST value. + * result is the varlena into which the results should be written. 
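For orientation, a caller-side sketch (editorial illustration, not upstream code; the wrapper name is invented) that allocates the result the way the routine below expects, assuming an uncompressed external value:

/* Editorial sketch, not upstream code; assumes postgres.h and access/heaptoast.h. */
static struct varlena *
fetch_plain_slice(Relation toastrel, Oid valueid, int32 attrsize,
				  int32 sliceoffset, int32 slicelength)
{
	struct varlena *result;

	/* Room for the requested bytes behind a normal 4-byte varlena header. */
	result = (struct varlena *) palloc(slicelength + VARHDRSZ);
	SET_VARSIZE(result, slicelength + VARHDRSZ);

	if (slicelength > 0)
		heap_fetch_toast_slice(toastrel, valueid, attrsize,
							   sliceoffset, slicelength, result);

	/* VARDATA(result) now holds bytes [sliceoffset, sliceoffset + slicelength) of the value. */
	return result;
}

The in-tree caller in detoast.c handles a compressed value analogously, marking the result with SET_VARSIZE_COMPRESSED instead.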
+ */ +void +heap_fetch_toast_slice(Relation toastrel, Oid valueid, int32 attrsize, + int32 sliceoffset, int32 slicelength, + struct varlena *result) +{ + Relation *toastidxs; + ScanKeyData toastkey[3]; + TupleDesc toasttupDesc = toastrel->rd_att; + int nscankeys; + SysScanDesc toastscan; + HeapTuple ttup; + int32 expectedchunk; + int32 totalchunks = ((attrsize - 1) / TOAST_MAX_CHUNK_SIZE) + 1; + int startchunk; + int endchunk; + int num_indexes; + int validIndex; + SnapshotData SnapshotToast; + + /* Look for the valid index of toast relation */ + validIndex = toast_open_indexes(toastrel, + AccessShareLock, + &toastidxs, + &num_indexes); + + startchunk = sliceoffset / TOAST_MAX_CHUNK_SIZE; + endchunk = (sliceoffset + slicelength - 1) / TOAST_MAX_CHUNK_SIZE; + Assert(endchunk <= totalchunks); + + /* Set up a scan key to fetch from the index. */ + ScanKeyInit(&toastkey[0], + (AttrNumber) 1, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(valueid)); + + /* + * No additional condition if fetching all chunks. Otherwise, use an + * equality condition for one chunk, and a range condition otherwise. + */ + if (startchunk == 0 && endchunk == totalchunks - 1) + nscankeys = 1; + else if (startchunk == endchunk) + { + ScanKeyInit(&toastkey[1], + (AttrNumber) 2, + BTEqualStrategyNumber, F_INT4EQ, + Int32GetDatum(startchunk)); + nscankeys = 2; + } + else + { + ScanKeyInit(&toastkey[1], + (AttrNumber) 2, + BTGreaterEqualStrategyNumber, F_INT4GE, + Int32GetDatum(startchunk)); + ScanKeyInit(&toastkey[2], + (AttrNumber) 2, + BTLessEqualStrategyNumber, F_INT4LE, + Int32GetDatum(endchunk)); + nscankeys = 3; + } + + /* Prepare for scan */ + init_toast_snapshot(&SnapshotToast); + toastscan = systable_beginscan_ordered(toastrel, toastidxs[validIndex], + &SnapshotToast, nscankeys, toastkey); + + /* + * Read the chunks by index + * + * The index is on (valueid, chunkidx) so they will come in order + */ + expectedchunk = startchunk; + while ((ttup = systable_getnext_ordered(toastscan, ForwardScanDirection)) != NULL) + { + int32 curchunk; + Pointer chunk; + bool isnull; + char *chunkdata; + int32 chunksize; + int32 expected_size; + int32 chcpystrt; + int32 chcpyend; + + /* + * Have a chunk, extract the sequence number and the data + */ + curchunk = DatumGetInt32(fastgetattr(ttup, 2, toasttupDesc, &isnull)); + Assert(!isnull); + chunk = DatumGetPointer(fastgetattr(ttup, 3, toasttupDesc, &isnull)); + Assert(!isnull); + if (!VARATT_IS_EXTENDED(chunk)) + { + chunksize = VARSIZE(chunk) - VARHDRSZ; + chunkdata = VARDATA(chunk); + } + else if (VARATT_IS_SHORT(chunk)) + { + /* could happen due to heap_form_tuple doing its thing */ + chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT; + chunkdata = VARDATA_SHORT(chunk); + } + else + { + /* should never happen */ + elog(ERROR, "found toasted toast chunk for toast value %u in %s", + valueid, RelationGetRelationName(toastrel)); + chunksize = 0; /* keep compiler quiet */ + chunkdata = NULL; + } + + /* + * Some checks on the data we've found + */ + if (curchunk != expectedchunk) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("unexpected chunk number %d (expected %d) for toast value %u in %s", + curchunk, expectedchunk, valueid, + RelationGetRelationName(toastrel)))); + if (curchunk > endchunk) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("unexpected chunk number %d (out of range %d..%d) for toast value %u in %s", + curchunk, + startchunk, endchunk, valueid, + RelationGetRelationName(toastrel)))); + expected_size = curchunk < 
totalchunks - 1 ? TOAST_MAX_CHUNK_SIZE + : attrsize - ((totalchunks - 1) * TOAST_MAX_CHUNK_SIZE); + if (chunksize != expected_size) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("unexpected chunk size %d (expected %d) in chunk %d of %d for toast value %u in %s", + chunksize, expected_size, + curchunk, totalchunks, valueid, + RelationGetRelationName(toastrel)))); + + /* + * Copy the data into proper place in our result + */ + chcpystrt = 0; + chcpyend = chunksize - 1; + if (curchunk == startchunk) + chcpystrt = sliceoffset % TOAST_MAX_CHUNK_SIZE; + if (curchunk == endchunk) + chcpyend = (sliceoffset + slicelength - 1) % TOAST_MAX_CHUNK_SIZE; + + memcpy(VARDATA(result) + + (curchunk * TOAST_MAX_CHUNK_SIZE - sliceoffset) + chcpystrt, + chunkdata + chcpystrt, + (chcpyend - chcpystrt) + 1); + + expectedchunk++; + } + + /* + * Final checks that we successfully fetched the datum + */ + if (expectedchunk != (endchunk + 1)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("missing chunk number %d for toast value %u in %s", + expectedchunk, valueid, + RelationGetRelationName(toastrel)))); + + /* End scan and close indexes. */ + systable_endscan_ordered(toastscan); + toast_close_indexes(toastidxs, num_indexes, AccessShareLock); +} diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c new file mode 100644 index 0000000..d34edb4 --- /dev/null +++ b/src/backend/access/heap/hio.c @@ -0,0 +1,721 @@ +/*------------------------------------------------------------------------- + * + * hio.c + * POSTGRES heap access method input/output code. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/heap/hio.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/heapam.h" +#include "access/hio.h" +#include "access/htup_details.h" +#include "access/visibilitymap.h" +#include "storage/bufmgr.h" +#include "storage/freespace.h" +#include "storage/lmgr.h" +#include "storage/smgr.h" + + +/* + * RelationPutHeapTuple - place tuple at specified page + * + * !!! EREPORT(ERROR) IS DISALLOWED HERE !!! Must PANIC on failure!!! + * + * Note - caller must hold BUFFER_LOCK_EXCLUSIVE on the buffer. + */ +void +RelationPutHeapTuple(Relation relation, + Buffer buffer, + HeapTuple tuple, + bool token) +{ + Page pageHeader; + OffsetNumber offnum; + + /* + * A tuple that's being inserted speculatively should already have its + * token set. + */ + Assert(!token || HeapTupleHeaderIsSpeculative(tuple->t_data)); + + /* + * Do not allow tuples with invalid combinations of hint bits to be placed + * on a page. This combination is detected as corruption by the + * contrib/amcheck logic, so if you disable this assertion, make + * corresponding changes there. 
+ */ + Assert(!((tuple->t_data->t_infomask & HEAP_XMAX_COMMITTED) && + (tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI))); + + /* Add the tuple to the page */ + pageHeader = BufferGetPage(buffer); + + offnum = PageAddItem(pageHeader, (Item) tuple->t_data, + tuple->t_len, InvalidOffsetNumber, false, true); + + if (offnum == InvalidOffsetNumber) + elog(PANIC, "failed to add tuple to page"); + + /* Update tuple->t_self to the actual position where it was stored */ + ItemPointerSet(&(tuple->t_self), BufferGetBlockNumber(buffer), offnum); + + /* + * Insert the correct position into CTID of the stored tuple, too (unless + * this is a speculative insertion, in which case the token is held in + * CTID field instead) + */ + if (!token) + { + ItemId itemId = PageGetItemId(pageHeader, offnum); + HeapTupleHeader item = (HeapTupleHeader) PageGetItem(pageHeader, itemId); + + item->t_ctid = tuple->t_self; + } +} + +/* + * Read in a buffer in mode, using bulk-insert strategy if bistate isn't NULL. + */ +static Buffer +ReadBufferBI(Relation relation, BlockNumber targetBlock, + ReadBufferMode mode, BulkInsertState bistate) +{ + Buffer buffer; + + /* If not bulk-insert, exactly like ReadBuffer */ + if (!bistate) + return ReadBufferExtended(relation, MAIN_FORKNUM, targetBlock, + mode, NULL); + + /* If we have the desired block already pinned, re-pin and return it */ + if (bistate->current_buf != InvalidBuffer) + { + if (BufferGetBlockNumber(bistate->current_buf) == targetBlock) + { + /* + * Currently the LOCK variants are only used for extending + * relation, which should never reach this branch. + */ + Assert(mode != RBM_ZERO_AND_LOCK && + mode != RBM_ZERO_AND_CLEANUP_LOCK); + + IncrBufferRefCount(bistate->current_buf); + return bistate->current_buf; + } + /* ... else drop the old buffer */ + ReleaseBuffer(bistate->current_buf); + bistate->current_buf = InvalidBuffer; + } + + /* Perform a read using the buffer strategy */ + buffer = ReadBufferExtended(relation, MAIN_FORKNUM, targetBlock, + mode, bistate->strategy); + + /* Save the selected block as target for future inserts */ + IncrBufferRefCount(buffer); + bistate->current_buf = buffer; + + return buffer; +} + +/* + * For each heap page which is all-visible, acquire a pin on the appropriate + * visibility map page, if we haven't already got one. + * + * buffer2 may be InvalidBuffer, if only one buffer is involved. buffer1 + * must not be InvalidBuffer. If both buffers are specified, block1 must + * be less than block2. + */ +static void +GetVisibilityMapPins(Relation relation, Buffer buffer1, Buffer buffer2, + BlockNumber block1, BlockNumber block2, + Buffer *vmbuffer1, Buffer *vmbuffer2) +{ + bool need_to_pin_buffer1; + bool need_to_pin_buffer2; + + Assert(BufferIsValid(buffer1)); + Assert(buffer2 == InvalidBuffer || block1 <= block2); + + while (1) + { + /* Figure out which pins we need but don't have. */ + need_to_pin_buffer1 = PageIsAllVisible(BufferGetPage(buffer1)) + && !visibilitymap_pin_ok(block1, *vmbuffer1); + need_to_pin_buffer2 = buffer2 != InvalidBuffer + && PageIsAllVisible(BufferGetPage(buffer2)) + && !visibilitymap_pin_ok(block2, *vmbuffer2); + if (!need_to_pin_buffer1 && !need_to_pin_buffer2) + return; + + /* We must unlock both buffers before doing any I/O. */ + LockBuffer(buffer1, BUFFER_LOCK_UNLOCK); + if (buffer2 != InvalidBuffer && buffer2 != buffer1) + LockBuffer(buffer2, BUFFER_LOCK_UNLOCK); + + /* Get pins. 
*/ + if (need_to_pin_buffer1) + visibilitymap_pin(relation, block1, vmbuffer1); + if (need_to_pin_buffer2) + visibilitymap_pin(relation, block2, vmbuffer2); + + /* Relock buffers. */ + LockBuffer(buffer1, BUFFER_LOCK_EXCLUSIVE); + if (buffer2 != InvalidBuffer && buffer2 != buffer1) + LockBuffer(buffer2, BUFFER_LOCK_EXCLUSIVE); + + /* + * If there are two buffers involved and we pinned just one of them, + * it's possible that the second one became all-visible while we were + * busy pinning the first one. If it looks like that's a possible + * scenario, we'll need to make a second pass through this loop. + */ + if (buffer2 == InvalidBuffer || buffer1 == buffer2 + || (need_to_pin_buffer1 && need_to_pin_buffer2)) + break; + } +} + +/* + * Extend a relation by multiple blocks to avoid future contention on the + * relation extension lock. Our goal is to pre-extend the relation by an + * amount which ramps up as the degree of contention ramps up, but limiting + * the result to some sane overall value. + */ +static void +RelationAddExtraBlocks(Relation relation, BulkInsertState bistate) +{ + BlockNumber blockNum, + firstBlock = InvalidBlockNumber; + int extraBlocks; + int lockWaiters; + + /* Use the length of the lock wait queue to judge how much to extend. */ + lockWaiters = RelationExtensionLockWaiterCount(relation); + if (lockWaiters <= 0) + return; + + /* + * It might seem like multiplying the number of lock waiters by as much as + * 20 is too aggressive, but benchmarking revealed that smaller numbers + * were insufficient. 512 is just an arbitrary cap to prevent + * pathological results. + */ + extraBlocks = Min(512, lockWaiters * 20); + + do + { + Buffer buffer; + Page page; + Size freespace; + + /* + * Extend by one page. This should generally match the main-line + * extension code in RelationGetBufferForTuple, except that we hold + * the relation extension lock throughout, and we don't immediately + * initialize the page (see below). + */ + buffer = ReadBufferBI(relation, P_NEW, RBM_ZERO_AND_LOCK, bistate); + page = BufferGetPage(buffer); + + if (!PageIsNew(page)) + elog(ERROR, "page %u of relation \"%s\" should be empty but is not", + BufferGetBlockNumber(buffer), + RelationGetRelationName(relation)); + + /* + * Add the page to the FSM without initializing. If we were to + * initialize here, the page would potentially get flushed out to disk + * before we add any useful content. There's no guarantee that that'd + * happen before a potential crash, so we need to deal with + * uninitialized pages anyway, thus avoid the potential for + * unnecessary writes. + */ + + /* we'll need this info below */ + blockNum = BufferGetBlockNumber(buffer); + freespace = BufferGetPageSize(buffer) - SizeOfPageHeaderData; + + UnlockReleaseBuffer(buffer); + + /* Remember first block number thus added. */ + if (firstBlock == InvalidBlockNumber) + firstBlock = blockNum; + + /* + * Immediately update the bottom level of the FSM. This has a good + * chance of making this page visible to other concurrently inserting + * backends, and we want that to happen without delay. + */ + RecordPageWithFreeSpace(relation, blockNum, freespace); + } + while (--extraBlocks > 0); + + /* + * Updating the upper levels of the free space map is too expensive to do + * for every block, but it's worth doing once at the end to make sure that + * subsequent insertion activity sees all of those nifty free pages we + * just inserted. 
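For a sense of scale (editorial note): with the Min(512, lockWaiters * 20) heuristic above, 5 waiters on the extension lock pre-extend the relation by 100 blocks, and 26 or more waiters hit the 512-block cap; the single FreeSpaceMapVacuumRange call that follows then publishes the whole newly added range up through the FSM tree at once.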
+ */ + FreeSpaceMapVacuumRange(relation, firstBlock, blockNum + 1); +} + +/* + * RelationGetBufferForTuple + * + * Returns pinned and exclusive-locked buffer of a page in given relation + * with free space >= given len. + * + * If otherBuffer is not InvalidBuffer, then it references a previously + * pinned buffer of another page in the same relation; on return, this + * buffer will also be exclusive-locked. (This case is used by heap_update; + * the otherBuffer contains the tuple being updated.) + * + * The reason for passing otherBuffer is that if two backends are doing + * concurrent heap_update operations, a deadlock could occur if they try + * to lock the same two buffers in opposite orders. To ensure that this + * can't happen, we impose the rule that buffers of a relation must be + * locked in increasing page number order. This is most conveniently done + * by having RelationGetBufferForTuple lock them both, with suitable care + * for ordering. + * + * NOTE: it is unlikely, but not quite impossible, for otherBuffer to be the + * same buffer we select for insertion of the new tuple (this could only + * happen if space is freed in that page after heap_update finds there's not + * enough there). In that case, the page will be pinned and locked only once. + * + * We also handle the possibility that the all-visible flag will need to be + * cleared on one or both pages. If so, pin on the associated visibility map + * page must be acquired before acquiring buffer lock(s), to avoid possibly + * doing I/O while holding buffer locks. The pins are passed back to the + * caller using the input-output arguments vmbuffer and vmbuffer_other. + * Note that in some cases the caller might have already acquired such pins, + * which is indicated by these arguments not being InvalidBuffer on entry. + * + * We normally use FSM to help us find free space. However, + * if HEAP_INSERT_SKIP_FSM is specified, we just append a new empty page to + * the end of the relation if the tuple won't fit on the current target page. + * This can save some cycles when we know the relation is new and doesn't + * contain useful amounts of free space. + * + * HEAP_INSERT_SKIP_FSM is also useful for non-WAL-logged additions to a + * relation, if the caller holds exclusive lock and is careful to invalidate + * relation's smgr_targblock before the first insertion --- that ensures that + * all insertions will occur into newly added pages and not be intermixed + * with tuples from other transactions. That way, a crash can't risk losing + * any committed data of other transactions. (See heap_insert's comments + * for additional constraints needed for safe usage of this behavior.) + * + * The caller can also provide a BulkInsertState object to optimize many + * insertions into the same relation. This keeps a pin on the current + * insertion target page (to save pin/unpin cycles) and also passes a + * BULKWRITE buffer selection strategy object to the buffer manager. + * Passing NULL for bistate selects the default behavior. + * + * We don't fill existing pages further than the fillfactor, except for large + * tuples in nearly-empty pages. This is OK since this routine is not + * consulted when updating a tuple and keeping it on the same page, which is + * the scenario fillfactor is meant to reserve space for. + * + * ereport(ERROR) is allowed here, so this routine *must* be called + * before any (unlogged) changes are made in buffer pool. 
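Putting the contract above together with RelationPutHeapTuple, the plain-insert pattern looks roughly like this (editorial sketch, not upstream code; the function name is invented, and the WAL logging, critical section, and visibility-map clearing that real callers such as heap_insert perform are omitted):

/* Editorial sketch, not upstream code; assumes postgres.h, access/heapam.h, access/hio.h and storage/bufmgr.h. */
static void
put_one_tuple(Relation relation, HeapTuple heaptup, int options,
			  BulkInsertState bistate)
{
	Buffer		buffer;
	Buffer		vmbuffer = InvalidBuffer;

	/* Pinned, exclusive-locked page with at least t_len bytes of free space. */
	buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
									   InvalidBuffer, options, bistate,
									   &vmbuffer, NULL);

	/* Place the tuple; this fills heaptup->t_self and the stored tuple's ctid. */
	RelationPutHeapTuple(relation, buffer, heaptup, false);

	MarkBufferDirty(buffer);

	UnlockReleaseBuffer(buffer);
	if (vmbuffer != InvalidBuffer)
		ReleaseBuffer(vmbuffer);
}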
+ */ +Buffer +RelationGetBufferForTuple(Relation relation, Size len, + Buffer otherBuffer, int options, + BulkInsertState bistate, + Buffer *vmbuffer, Buffer *vmbuffer_other) +{ + bool use_fsm = !(options & HEAP_INSERT_SKIP_FSM); + Buffer buffer = InvalidBuffer; + Page page; + Size nearlyEmptyFreeSpace, + pageFreeSpace = 0, + saveFreeSpace = 0, + targetFreeSpace = 0; + BlockNumber targetBlock, + otherBlock; + bool needLock; + + len = MAXALIGN(len); /* be conservative */ + + /* Bulk insert is not supported for updates, only inserts. */ + Assert(otherBuffer == InvalidBuffer || !bistate); + + /* + * If we're gonna fail for oversize tuple, do it right away + */ + if (len > MaxHeapTupleSize) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("row is too big: size %zu, maximum size %zu", + len, MaxHeapTupleSize))); + + /* Compute desired extra freespace due to fillfactor option */ + saveFreeSpace = RelationGetTargetPageFreeSpace(relation, + HEAP_DEFAULT_FILLFACTOR); + + /* + * Since pages without tuples can still have line pointers, we consider + * pages "empty" when the unavailable space is slight. This threshold is + * somewhat arbitrary, but it should prevent most unnecessary relation + * extensions while inserting large tuples into low-fillfactor tables. + */ + nearlyEmptyFreeSpace = MaxHeapTupleSize - + (MaxHeapTuplesPerPage / 8 * sizeof(ItemIdData)); + if (len + saveFreeSpace > nearlyEmptyFreeSpace) + targetFreeSpace = Max(len, nearlyEmptyFreeSpace); + else + targetFreeSpace = len + saveFreeSpace; + + if (otherBuffer != InvalidBuffer) + otherBlock = BufferGetBlockNumber(otherBuffer); + else + otherBlock = InvalidBlockNumber; /* just to keep compiler quiet */ + + /* + * We first try to put the tuple on the same page we last inserted a tuple + * on, as cached in the BulkInsertState or relcache entry. If that + * doesn't work, we ask the Free Space Map to locate a suitable page. + * Since the FSM's info might be out of date, we have to be prepared to + * loop around and retry multiple times. (To insure this isn't an infinite + * loop, we must update the FSM with the correct amount of free space on + * each page that proves not to be suitable.) If the FSM has no record of + * a page with enough free space, we give up and extend the relation. + * + * When use_fsm is false, we either put the tuple onto the existing target + * page or extend the relation. + */ + if (bistate && bistate->current_buf != InvalidBuffer) + targetBlock = BufferGetBlockNumber(bistate->current_buf); + else + targetBlock = RelationGetTargetBlock(relation); + + if (targetBlock == InvalidBlockNumber && use_fsm) + { + /* + * We have no cached target page, so ask the FSM for an initial + * target. + */ + targetBlock = GetPageWithFreeSpace(relation, targetFreeSpace); + } + + /* + * If the FSM knows nothing of the rel, try the last page before we give + * up and extend. This avoids one-tuple-per-page syndrome during + * bootstrapping or in a recently-started system. + */ + if (targetBlock == InvalidBlockNumber) + { + BlockNumber nblocks = RelationGetNumberOfBlocks(relation); + + if (nblocks > 0) + targetBlock = nblocks - 1; + } + +loop: + while (targetBlock != InvalidBlockNumber) + { + /* + * Read and exclusive-lock the target block, as well as the other + * block if one was given, taking suitable care with lock ordering and + * the possibility they are the same block. + * + * If the page-level all-visible flag is set, caller will need to + * clear both that and the corresponding visibility map bit. 
However, + * by the time we return, we'll have x-locked the buffer, and we don't + * want to do any I/O while in that state. So we check the bit here + * before taking the lock, and pin the page if it appears necessary. + * Checking without the lock creates a risk of getting the wrong + * answer, so we'll have to recheck after acquiring the lock. + */ + if (otherBuffer == InvalidBuffer) + { + /* easy case */ + buffer = ReadBufferBI(relation, targetBlock, RBM_NORMAL, bistate); + if (PageIsAllVisible(BufferGetPage(buffer))) + visibilitymap_pin(relation, targetBlock, vmbuffer); + + /* + * If the page is empty, pin vmbuffer to set all_frozen bit later. + */ + if ((options & HEAP_INSERT_FROZEN) && + (PageGetMaxOffsetNumber(BufferGetPage(buffer)) == 0)) + visibilitymap_pin(relation, targetBlock, vmbuffer); + + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + } + else if (otherBlock == targetBlock) + { + /* also easy case */ + buffer = otherBuffer; + if (PageIsAllVisible(BufferGetPage(buffer))) + visibilitymap_pin(relation, targetBlock, vmbuffer); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + } + else if (otherBlock < targetBlock) + { + /* lock other buffer first */ + buffer = ReadBuffer(relation, targetBlock); + if (PageIsAllVisible(BufferGetPage(buffer))) + visibilitymap_pin(relation, targetBlock, vmbuffer); + LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + } + else + { + /* lock target buffer first */ + buffer = ReadBuffer(relation, targetBlock); + if (PageIsAllVisible(BufferGetPage(buffer))) + visibilitymap_pin(relation, targetBlock, vmbuffer); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE); + } + + /* + * We now have the target page (and the other buffer, if any) pinned + * and locked. However, since our initial PageIsAllVisible checks + * were performed before acquiring the lock, the results might now be + * out of date, either for the selected victim buffer, or for the + * other buffer passed by the caller. In that case, we'll need to + * give up our locks, go get the pin(s) we failed to get earlier, and + * re-lock. That's pretty painful, but hopefully shouldn't happen + * often. + * + * Note that there's a small possibility that we didn't pin the page + * above but still have the correct page pinned anyway, either because + * we've already made a previous pass through this loop, or because + * caller passed us the right page anyway. + * + * Note also that it's possible that by the time we get the pin and + * retake the buffer locks, the visibility map bit will have been + * cleared by some other backend anyway. In that case, we'll have + * done a bit of extra work for no gain, but there's no real harm + * done. + */ + if (otherBuffer == InvalidBuffer || targetBlock <= otherBlock) + GetVisibilityMapPins(relation, buffer, otherBuffer, + targetBlock, otherBlock, vmbuffer, + vmbuffer_other); + else + GetVisibilityMapPins(relation, otherBuffer, buffer, + otherBlock, targetBlock, vmbuffer_other, + vmbuffer); + + /* + * Now we can check to see if there's enough free space here. If so, + * we're done. + */ + page = BufferGetPage(buffer); + + /* + * If necessary initialize page, it'll be used soon. We could avoid + * dirtying the buffer here, and rely on the caller to do so whenever + * it puts a tuple onto the page, but there seems not much benefit in + * doing so. 
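The four locking branches above all follow the same check-pin-lock-recheck discipline for the visibility map. Below is a condensed, single-buffer sketch of that discipline; it is illustrative only and not part of the patch. The real code (including GetVisibilityMapPins(), not shown in this hunk) also copes with the two-buffer case, with vmbuffer already pointing at a different VM page, and loops until the answer is stable.

    static void
    example_lock_with_vm_pin(Relation rel, BlockNumber blkno,
                             Buffer buf, Buffer *vmbuffer)
    {
        /* Cheap, unlocked peek; may be stale, but costs no I/O under a lock. */
        if (PageIsAllVisible(BufferGetPage(buf)))
            visibilitymap_pin(rel, blkno, vmbuffer);

        LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

        /*
         * Recheck under the lock.  If the bit is set but we hold no VM pin,
         * we must not pin now (that could mean I/O while locked), so drop
         * the lock, pin, and re-lock; the caller then revalidates free
         * space and so on.
         */
        if (PageIsAllVisible(BufferGetPage(buf)) && !BufferIsValid(*vmbuffer))
        {
            LockBuffer(buf, BUFFER_LOCK_UNLOCK);
            visibilitymap_pin(rel, blkno, vmbuffer);
            LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
        }
    }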
+ */ + if (PageIsNew(page)) + { + PageInit(page, BufferGetPageSize(buffer), 0); + MarkBufferDirty(buffer); + } + + pageFreeSpace = PageGetHeapFreeSpace(page); + if (targetFreeSpace <= pageFreeSpace) + { + /* use this page as future insert target, too */ + RelationSetTargetBlock(relation, targetBlock); + return buffer; + } + + /* + * Not enough space, so we must give up our page locks and pin (if + * any) and prepare to look elsewhere. We don't care which order we + * unlock the two buffers in, so this can be slightly simpler than the + * code above. + */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + if (otherBuffer == InvalidBuffer) + ReleaseBuffer(buffer); + else if (otherBlock != targetBlock) + { + LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + } + + /* Without FSM, always fall out of the loop and extend */ + if (!use_fsm) + break; + + /* + * Update FSM as to condition of this page, and ask for another page + * to try. + */ + targetBlock = RecordAndGetPageWithFreeSpace(relation, + targetBlock, + pageFreeSpace, + targetFreeSpace); + } + + /* + * Have to extend the relation. + * + * We have to use a lock to ensure no one else is extending the rel at the + * same time, else we will both try to initialize the same new page. We + * can skip locking for new or temp relations, however, since no one else + * could be accessing them. + */ + needLock = !RELATION_IS_LOCAL(relation); + + /* + * If we need the lock but are not able to acquire it immediately, we'll + * consider extending the relation by multiple blocks at a time to manage + * contention on the relation extension lock. However, this only makes + * sense if we're using the FSM; otherwise, there's no point. + */ + if (needLock) + { + if (!use_fsm) + LockRelationForExtension(relation, ExclusiveLock); + else if (!ConditionalLockRelationForExtension(relation, ExclusiveLock)) + { + /* Couldn't get the lock immediately; wait for it. */ + LockRelationForExtension(relation, ExclusiveLock); + + /* + * Check if some other backend has extended a block for us while + * we were waiting on the lock. + */ + targetBlock = GetPageWithFreeSpace(relation, targetFreeSpace); + + /* + * If some other waiter has already extended the relation, we + * don't need to do so; just use the existing freespace. + */ + if (targetBlock != InvalidBlockNumber) + { + UnlockRelationForExtension(relation, ExclusiveLock); + goto loop; + } + + /* Time to bulk-extend. */ + RelationAddExtraBlocks(relation, bistate); + } + } + + /* + * In addition to whatever extension we performed above, we always add at + * least one block to satisfy our own request. + * + * XXX This does an lseek - rather expensive - but at the moment it is the + * only way to accurately determine how many blocks are in a relation. Is + * it worth keeping an accurate file length in shared memory someplace, + * rather than relying on the kernel to do it for us? + */ + buffer = ReadBufferBI(relation, P_NEW, RBM_ZERO_AND_LOCK, bistate); + + /* + * We need to initialize the empty new page. Double-check that it really + * is empty (this should never happen, but if it does we don't want to + * risk wiping out valid data). + */ + page = BufferGetPage(buffer); + + if (!PageIsNew(page)) + elog(ERROR, "page %u of relation \"%s\" should be empty but is not", + BufferGetBlockNumber(buffer), + RelationGetRelationName(relation)); + + PageInit(page, BufferGetPageSize(buffer), 0); + MarkBufferDirty(buffer); + + /* + * The page is empty, pin vmbuffer to set all_frozen bit. 
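The extension path above can also be read as a small pattern: try the relation extension lock conditionally, and if we had to wait, recheck the FSM before extending, because another waiter may already have bulk-extended the relation for us. A hedged restatement follows (illustrative, not part of the patch; it assumes use_fsm and a non-local relation, and RelationAddExtraBlocks() is the file-local bulk-extension helper used above).

    static BlockNumber
    example_prepare_extension(Relation rel, Size targetFreeSpace,
                              BulkInsertState bistate)
    {
        if (!ConditionalLockRelationForExtension(rel, ExclusiveLock))
        {
            BlockNumber recheck;

            /* Contended: wait for the lock, then look again before extending. */
            LockRelationForExtension(rel, ExclusiveLock);

            recheck = GetPageWithFreeSpace(rel, targetFreeSpace);
            if (recheck != InvalidBlockNumber)
            {
                UnlockRelationForExtension(rel, ExclusiveLock);
                return recheck;     /* reuse space another backend created */
            }

            /* Still nothing: add a batch of empty pages to spread the cost. */
            RelationAddExtraBlocks(rel, bistate);
        }

        /*
         * InvalidBlockNumber means the caller should extend by one page
         * itself; as in the code above, the extension lock is still held at
         * that point.
         */
        return InvalidBlockNumber;
    }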
+ */ + if (options & HEAP_INSERT_FROZEN) + { + Assert(PageGetMaxOffsetNumber(BufferGetPage(buffer)) == 0); + visibilitymap_pin(relation, BufferGetBlockNumber(buffer), vmbuffer); + } + + /* + * Release the file-extension lock; it's now OK for someone else to extend + * the relation some more. + */ + if (needLock) + UnlockRelationForExtension(relation, ExclusiveLock); + + /* + * Lock the other buffer. It's guaranteed to be of a lower page number + * than the new page. To conform with the deadlock prevent rules, we ought + * to lock otherBuffer first, but that would give other backends a chance + * to put tuples on our page. To reduce the likelihood of that, attempt to + * lock the other buffer conditionally, that's very likely to work. + * Otherwise we need to lock buffers in the correct order, and retry if + * the space has been used in the mean time. + * + * Alternatively, we could acquire the lock on otherBuffer before + * extending the relation, but that'd require holding the lock while + * performing IO, which seems worse than an unlikely retry. + */ + if (otherBuffer != InvalidBuffer) + { + Assert(otherBuffer != buffer); + targetBlock = BufferGetBlockNumber(buffer); + Assert(targetBlock > otherBlock); + + if (unlikely(!ConditionalLockBuffer(otherBuffer))) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + /* + * Because the buffers were unlocked for a while, it's possible, + * although unlikely, that an all-visible flag became set or that + * somebody used up the available space in the new page. We can + * use GetVisibilityMapPins to deal with the first case. In the + * second case, just retry from start. + */ + GetVisibilityMapPins(relation, otherBuffer, buffer, + otherBlock, targetBlock, vmbuffer_other, + vmbuffer); + + if (len > PageGetHeapFreeSpace(page)) + { + LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK); + UnlockReleaseBuffer(buffer); + + goto loop; + } + } + } + + if (len > PageGetHeapFreeSpace(page)) + { + /* We should not get here given the test at the top */ + elog(PANIC, "tuple is too big: size %zu", len); + } + + /* + * Remember the new page as our target for future insertions. + * + * XXX should we enter the new page into the free space map immediately, + * or just keep it for this backend's exclusive use in the short run + * (until VACUUM sees it)? Seems to depend on whether you expect the + * current backend to make more insertions or not, which is probably a + * good bet most of the time. So for now, don't add it to FSM yet. 
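To make the free-space targets computed near the top of this function concrete, here is a rough worked example (not part of the patch; the figures assume the default 8 kB block size and a table created with fillfactor = 10):

    saveFreeSpace        = BLCKSZ * (100 - 10) / 100            ~ 7372 bytes
    nearlyEmptyFreeSpace = MaxHeapTupleSize (~ 8160)
                           - (MaxHeapTuplesPerPage / 8) * sizeof(ItemIdData)
                         ~ 8160 - 36 * 4 = 8016 bytes

    For a 2000-byte tuple: len + saveFreeSpace ~ 9372 > 8016, so
    targetFreeSpace = Max(2000, 8016) = 8016.

In other words, for large tuples in low-fillfactor tables the code settles for a nearly-empty page rather than demanding more free space than any page can ever report, which would otherwise force a relation extension for every such insertion.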
+ */ + RelationSetTargetBlock(relation, BufferGetBlockNumber(buffer)); + + return buffer; +} diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c new file mode 100644 index 0000000..f7f8056 --- /dev/null +++ b/src/backend/access/heap/pruneheap.c @@ -0,0 +1,1052 @@ +/*------------------------------------------------------------------------- + * + * pruneheap.c + * heap page pruning and HOT-chain management code + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/heap/pruneheap.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heapam.h" +#include "access/heapam_xlog.h" +#include "access/htup_details.h" +#include "access/transam.h" +#include "access/xlog.h" +#include "catalog/catalog.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/bufmgr.h" +#include "utils/snapmgr.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" + +/* Working data for heap_page_prune and subroutines */ +typedef struct +{ + Relation rel; + + /* tuple visibility test, initialized for the relation */ + GlobalVisState *vistest; + + /* + * Thresholds set by TransactionIdLimitedForOldSnapshots() if they have + * been computed (done on demand, and only if + * OldSnapshotThresholdActive()). The first time a tuple is about to be + * removed based on the limited horizon, old_snap_used is set to true, and + * SetOldSnapshotThresholdTimestamp() is called. See + * heap_prune_satisfies_vacuum(). + */ + TimestampTz old_snap_ts; + TransactionId old_snap_xmin; + bool old_snap_used; + + TransactionId new_prune_xid; /* new prune hint value for page */ + TransactionId latestRemovedXid; /* latest xid to be removed by this prune */ + int nredirected; /* numbers of entries in arrays below */ + int ndead; + int nunused; + /* arrays that accumulate indexes of items to be changed */ + OffsetNumber redirected[MaxHeapTuplesPerPage * 2]; + OffsetNumber nowdead[MaxHeapTuplesPerPage]; + OffsetNumber nowunused[MaxHeapTuplesPerPage]; + + /* + * marked[i] is true if item i is entered in one of the above arrays. + * + * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is + * 1. Otherwise every access would need to subtract 1. + */ + bool marked[MaxHeapTuplesPerPage + 1]; + + /* + * Tuple visibility is only computed once for each tuple, for correctness + * and efficiency reasons; see comment in heap_page_prune() for + * details. This is of type int8[,] intead of HTSV_Result[], so we can use + * -1 to indicate no visibility has been computed, e.g. for LP_DEAD items. + * + * Same indexing as ->marked. + */ + int8 htsv[MaxHeapTuplesPerPage + 1]; +} PruneState; + +/* Local functions */ +static HTSV_Result heap_prune_satisfies_vacuum(PruneState *prstate, + HeapTuple tup, + Buffer buffer); +static int heap_prune_chain(Buffer buffer, + OffsetNumber rootoffnum, + PruneState *prstate); +static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid); +static void heap_prune_record_redirect(PruneState *prstate, + OffsetNumber offnum, OffsetNumber rdoffnum); +static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum); +static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum); + + +/* + * Optionally prune and repair fragmentation in the specified page. + * + * This is an opportunistic function. 
It will perform housekeeping + * only if the page heuristically looks like a candidate for pruning and we + * can acquire buffer cleanup lock without blocking. + * + * Note: this is called quite often. It's important that it fall out quickly + * if there's not any use in pruning. + * + * Caller must have pin on the buffer, and must *not* have a lock on it. + */ +void +heap_page_prune_opt(Relation relation, Buffer buffer) +{ + Page page = BufferGetPage(buffer); + TransactionId prune_xid; + GlobalVisState *vistest; + TransactionId limited_xmin = InvalidTransactionId; + TimestampTz limited_ts = 0; + Size minfree; + + /* + * We can't write WAL in recovery mode, so there's no point trying to + * clean the page. The primary will likely issue a cleaning WAL record + * soon anyway, so this is no particular loss. + */ + if (RecoveryInProgress()) + return; + + /* + * XXX: Magic to keep old_snapshot_threshold tests appear "working". They + * currently are broken, and discussion of what to do about them is + * ongoing. See + * https://www.postgresql.org/message-id/20200403001235.e6jfdll3gh2ygbuc%40alap3.anarazel.de + */ + if (old_snapshot_threshold == 0) + SnapshotTooOldMagicForTest(); + + /* + * First check whether there's any chance there's something to prune, + * determining the appropriate horizon is a waste if there's no prune_xid + * (i.e. no updates/deletes left potentially dead tuples around). + */ + prune_xid = ((PageHeader) page)->pd_prune_xid; + if (!TransactionIdIsValid(prune_xid)) + return; + + /* + * Check whether prune_xid indicates that there may be dead rows that can + * be cleaned up. + * + * It is OK to check the old snapshot limit before acquiring the cleanup + * lock because the worst that can happen is that we are not quite as + * aggressive about the cleanup (by however many transaction IDs are + * consumed between this point and acquiring the lock). This allows us to + * save significant overhead in the case where the page is found not to be + * prunable. + * + * Even if old_snapshot_threshold is set, we first check whether the page + * can be pruned without. Both because + * TransactionIdLimitedForOldSnapshots() is not cheap, and because not + * unnecessarily relying on old_snapshot_threshold avoids causing + * conflicts. + */ + vistest = GlobalVisTestFor(relation); + + if (!GlobalVisTestIsRemovableXid(vistest, prune_xid)) + { + if (!OldSnapshotThresholdActive()) + return; + + if (!TransactionIdLimitedForOldSnapshots(GlobalVisTestNonRemovableHorizon(vistest), + relation, + &limited_xmin, &limited_ts)) + return; + + if (!TransactionIdPrecedes(prune_xid, limited_xmin)) + return; + } + + /* + * We prune when a previous UPDATE failed to find enough space on the page + * for a new tuple version, or when free space falls below the relation's + * fill-factor target (but not less than 10%). + * + * Checking free space here is questionable since we aren't holding any + * lock on the buffer; in the worst case we could get a bogus answer. It's + * unlikely to be *seriously* wrong, though, since reading either pd_lower + * or pd_upper is probably atomic. Avoiding taking a lock seems more + * important than sometimes getting a wrong answer in what is after all + * just a heuristic estimate. 
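For context, here is a schematic caller of heap_page_prune_opt() following the contract just stated: the buffer is pinned but not locked. This is an illustrative sketch only, roughly the shape of what a sequential heap scan does, and not part of the patch.

    static void
    example_visit_page(Relation rel, BlockNumber blkno)
    {
        Buffer      buf = ReadBuffer(rel, blkno);

        /*
         * Opportunistically prune; this returns immediately if there is
         * nothing worth doing or the cleanup lock cannot be had without
         * waiting.
         */
        heap_page_prune_opt(rel, buf);

        LockBuffer(buf, BUFFER_LOCK_SHARE);
        /* ... examine tuples on the page ... */
        UnlockReleaseBuffer(buf);
    }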
+ */ + minfree = RelationGetTargetPageFreeSpace(relation, + HEAP_DEFAULT_FILLFACTOR); + minfree = Max(minfree, BLCKSZ / 10); + + if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree) + { + /* OK, try to get exclusive buffer lock */ + if (!ConditionalLockBufferForCleanup(buffer)) + return; + + /* + * Now that we have buffer lock, get accurate information about the + * page's free space, and recheck the heuristic about whether to + * prune. (We needn't recheck PageIsPrunable, since no one else could + * have pruned while we hold pin.) + */ + if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree) + { + /* OK to prune */ + (void) heap_page_prune(relation, buffer, vistest, + limited_xmin, limited_ts, + true, NULL); + } + + /* And release buffer lock */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + } +} + + +/* + * Prune and repair fragmentation in the specified page. + * + * Caller must have pin and buffer cleanup lock on the page. + * + * vistest is used to distinguish whether tuples are DEAD or RECENTLY_DEAD + * (see heap_prune_satisfies_vacuum and + * HeapTupleSatisfiesVacuum). old_snap_xmin / old_snap_ts need to + * either have been set by TransactionIdLimitedForOldSnapshots, or + * InvalidTransactionId/0 respectively. + * + * If report_stats is true then we send the number of reclaimed heap-only + * tuples to pgstats. (This must be false during vacuum, since vacuum will + * send its own new total to pgstats, and we don't want this delta applied + * on top of that.) + * + * off_loc is the offset location required by the caller to use in error + * callback. + * + * Returns the number of tuples deleted from the page during this call. + */ +int +heap_page_prune(Relation relation, Buffer buffer, + GlobalVisState *vistest, + TransactionId old_snap_xmin, + TimestampTz old_snap_ts, + bool report_stats, + OffsetNumber *off_loc) +{ + int ndeleted = 0; + Page page = BufferGetPage(buffer); + OffsetNumber offnum, + maxoff; + PruneState prstate; + HeapTupleData tup; + + /* + * Our strategy is to scan the page and make lists of items to change, + * then apply the changes within a critical section. This keeps as much + * logic as possible out of the critical section, and also ensures that + * WAL replay will work the same as the normal case. + * + * First, initialize the new pd_prune_xid value to zero (indicating no + * prunable tuples). If we find any tuples which may soon become + * prunable, we will save the lowest relevant XID in new_prune_xid. Also + * initialize the rest of our working state. + */ + prstate.new_prune_xid = InvalidTransactionId; + prstate.rel = relation; + prstate.vistest = vistest; + prstate.old_snap_xmin = old_snap_xmin; + prstate.old_snap_ts = old_snap_ts; + prstate.old_snap_used = false; + prstate.latestRemovedXid = InvalidTransactionId; + prstate.nredirected = prstate.ndead = prstate.nunused = 0; + memset(prstate.marked, 0, sizeof(prstate.marked)); + + maxoff = PageGetMaxOffsetNumber(page); + tup.t_tableOid = RelationGetRelid(prstate.rel); + + /* + * Determine HTSV for all tuples. + * + * This is required for correctness to deal with cases where running HTSV + * twice could result in different results (e.g. RECENTLY_DEAD can turn to + * DEAD if another checked item causes GlobalVisTestIsRemovableFullXid() + * to update the horizon, INSERT_IN_PROGRESS can change to DEAD if the + * inserting transaction aborts, ...). 
That in turn could cause + * heap_prune_chain() to behave incorrectly if a tuple is reached twice, + * once directly via a heap_prune_chain() and once following a HOT chain. + * + * It's also good for performance. Most commonly tuples within a page are + * stored at decreasing offsets (while the items are stored at increasing + * offsets). When processing all tuples on a page this leads to reading + * memory at decreasing offsets within a page, with a variable stride. + * That's hard for CPU prefetchers to deal with. Processing the items in + * reverse order (and thus the tuples in increasing order) increases + * prefetching efficiency significantly / decreases the number of cache + * misses. + */ + for (offnum = maxoff; + offnum >= FirstOffsetNumber; + offnum = OffsetNumberPrev(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + HeapTupleHeader htup; + + /* Nothing to do if slot doesn't contain a tuple */ + if (!ItemIdIsNormal(itemid)) + { + prstate.htsv[offnum] = -1; + continue; + } + + htup = (HeapTupleHeader) PageGetItem(page, itemid); + tup.t_data = htup; + tup.t_len = ItemIdGetLength(itemid); + ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), offnum); + + /* + * Set the offset number so that we can display it along with any + * error that occurred while processing this tuple. + */ + if (off_loc) + *off_loc = offnum; + + prstate.htsv[offnum] = heap_prune_satisfies_vacuum(&prstate, &tup, + buffer); + } + + /* Scan the page */ + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid; + + /* Ignore items already processed as part of an earlier chain */ + if (prstate.marked[offnum]) + continue; + + /* see preceding loop */ + if (off_loc) + *off_loc = offnum; + + /* Nothing to do if slot is empty or already dead */ + itemid = PageGetItemId(page, offnum); + if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid)) + continue; + + /* Process this item or chain of items */ + ndeleted += heap_prune_chain(buffer, offnum, &prstate); + } + + /* Clear the offset information once we have processed the given page. */ + if (off_loc) + *off_loc = InvalidOffsetNumber; + + /* Any error while applying the changes is critical */ + START_CRIT_SECTION(); + + /* Have we found any prunable items? */ + if (prstate.nredirected > 0 || prstate.ndead > 0 || prstate.nunused > 0) + { + /* + * Apply the planned item changes, then repair page fragmentation, and + * update the page's hint bit about whether it has free line pointers. + */ + heap_page_prune_execute(buffer, + prstate.redirected, prstate.nredirected, + prstate.nowdead, prstate.ndead, + prstate.nowunused, prstate.nunused); + + /* + * Update the page's pd_prune_xid field to either zero, or the lowest + * XID of any soon-prunable tuple. + */ + ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid; + + /* + * Also clear the "page is full" flag, since there's no point in + * repeating the prune/defrag process until something else happens to + * the page. 
+ */ + PageClearFull(page); + + MarkBufferDirty(buffer); + + /* + * Emit a WAL XLOG_HEAP2_PRUNE record showing what we did + */ + if (RelationNeedsWAL(relation)) + { + xl_heap_prune xlrec; + XLogRecPtr recptr; + + xlrec.latestRemovedXid = prstate.latestRemovedXid; + xlrec.nredirected = prstate.nredirected; + xlrec.ndead = prstate.ndead; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapPrune); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + + /* + * The OffsetNumber arrays are not actually in the buffer, but we + * pretend that they are. When XLogInsert stores the whole + * buffer, the offset arrays need not be stored too. + */ + if (prstate.nredirected > 0) + XLogRegisterBufData(0, (char *) prstate.redirected, + prstate.nredirected * + sizeof(OffsetNumber) * 2); + + if (prstate.ndead > 0) + XLogRegisterBufData(0, (char *) prstate.nowdead, + prstate.ndead * sizeof(OffsetNumber)); + + if (prstate.nunused > 0) + XLogRegisterBufData(0, (char *) prstate.nowunused, + prstate.nunused * sizeof(OffsetNumber)); + + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_PRUNE); + + PageSetLSN(BufferGetPage(buffer), recptr); + } + } + else + { + /* + * If we didn't prune anything, but have found a new value for the + * pd_prune_xid field, update it and mark the buffer dirty. This is + * treated as a non-WAL-logged hint. + * + * Also clear the "page is full" flag if it is set, since there's no + * point in repeating the prune/defrag process until something else + * happens to the page. + */ + if (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid || + PageIsFull(page)) + { + ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid; + PageClearFull(page); + MarkBufferDirtyHint(buffer, true); + } + } + + END_CRIT_SECTION(); + + /* + * If requested, report the number of tuples reclaimed to pgstats. This is + * ndeleted minus ndead, because we don't want to count a now-DEAD root + * item as a deletion for this purpose. + */ + if (report_stats && ndeleted > prstate.ndead) + pgstat_update_heap_dead_tuples(relation, ndeleted - prstate.ndead); + + /* + * XXX Should we update the FSM information of this page ? + * + * There are two schools of thought here. We may not want to update FSM + * information so that the page is not used for unrelated UPDATEs/INSERTs + * and any free space in this page will remain available for further + * UPDATEs in *this* page, thus improving chances for doing HOT updates. + * + * But for a large table and where a page does not receive further UPDATEs + * for a long time, we might waste this space by not updating the FSM + * information. The relation may get extended and fragmented further. + * + * One possibility is to leave "fillfactor" worth of space in this page + * and update FSM with the remaining space. + */ + + return ndeleted; +} + + +/* + * Perform visibility checks for heap pruning. + * + * This is more complicated than just using GlobalVisTestIsRemovableXid() + * because of old_snapshot_threshold. We only want to increase the threshold + * that triggers errors for old snapshots when we actually decide to remove a + * row based on the limited horizon. + * + * Due to its cost we also only want to call + * TransactionIdLimitedForOldSnapshots() if necessary, i.e. we might not have + * done so in heap_hot_prune_opt() if pd_prune_xid was old enough. But we + * still want to be able to remove rows that are too new to be removed + * according to prstate->vistest, but that can be removed based on + * old_snapshot_threshold. 
So we call TransactionIdLimitedForOldSnapshots() on + * demand in here, if appropriate. + */ +static HTSV_Result +heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer) +{ + HTSV_Result res; + TransactionId dead_after; + + res = HeapTupleSatisfiesVacuumHorizon(tup, buffer, &dead_after); + + if (res != HEAPTUPLE_RECENTLY_DEAD) + return res; + + /* + * If we are already relying on the limited xmin, there is no need to + * delay doing so anymore. + */ + if (prstate->old_snap_used) + { + Assert(TransactionIdIsValid(prstate->old_snap_xmin)); + + if (TransactionIdPrecedes(dead_after, prstate->old_snap_xmin)) + res = HEAPTUPLE_DEAD; + return res; + } + + /* + * First check if GlobalVisTestIsRemovableXid() is sufficient to find the + * row dead. If not, and old_snapshot_threshold is enabled, try to use the + * lowered horizon. + */ + if (GlobalVisTestIsRemovableXid(prstate->vistest, dead_after)) + res = HEAPTUPLE_DEAD; + else if (OldSnapshotThresholdActive()) + { + /* haven't determined limited horizon yet, requests */ + if (!TransactionIdIsValid(prstate->old_snap_xmin)) + { + TransactionId horizon = + GlobalVisTestNonRemovableHorizon(prstate->vistest); + + TransactionIdLimitedForOldSnapshots(horizon, prstate->rel, + &prstate->old_snap_xmin, + &prstate->old_snap_ts); + } + + if (TransactionIdIsValid(prstate->old_snap_xmin) && + TransactionIdPrecedes(dead_after, prstate->old_snap_xmin)) + { + /* + * About to remove row based on snapshot_too_old. Need to raise + * the threshold so problematic accesses would error. + */ + Assert(!prstate->old_snap_used); + SetOldSnapshotThresholdTimestamp(prstate->old_snap_ts, + prstate->old_snap_xmin); + prstate->old_snap_used = true; + res = HEAPTUPLE_DEAD; + } + } + + return res; +} + + +/* + * Prune specified line pointer or a HOT chain originating at line pointer. + * + * If the item is an index-referenced tuple (i.e. not a heap-only tuple), + * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT + * chain. We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple. + * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really + * DEAD, our visibility test is just too coarse to detect it. + * + * The root line pointer is redirected to the tuple immediately after the + * latest DEAD tuple. If all tuples in the chain are DEAD, the root line + * pointer is marked LP_DEAD. (This includes the case of a DEAD simple + * tuple, which we treat as a chain of length 1.) + * + * We don't actually change the page here. We just add entries to the arrays in + * prstate showing the changes to be made. Items to be redirected are added + * to the redirected[] array (two entries per redirection); items to be set to + * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED + * state are added to nowunused[]. + * + * Returns the number of tuples (to be) deleted from the page. + */ +static int +heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) +{ + int ndeleted = 0; + Page dp = (Page) BufferGetPage(buffer); + TransactionId priorXmax = InvalidTransactionId; + ItemId rootlp; + HeapTupleHeader htup; + OffsetNumber latestdead = InvalidOffsetNumber, + maxoff = PageGetMaxOffsetNumber(dp), + offnum; + OffsetNumber chainitems[MaxHeapTuplesPerPage]; + int nchain = 0, + i; + + rootlp = PageGetItemId(dp, rootoffnum); + + /* + * If it's a heap-only tuple, then it is not the start of a HOT chain. 
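A worked example of the output arrays described in the header comment above may help. Suppose the chain rooted at offset 2 is 2 -> 5 -> 7, where the tuples at offsets 2 and 5 are DEAD and the tuple at offset 7 is LIVE (the current version). heap_prune_chain() then records:

    redirected[] = {2, 7}     /* root 2 becomes a redirect to 7 */
    nowdead[]    = {}         /* nothing goes to LP_DEAD */
    nowunused[]  = {5}        /* intermediate dead member reclaimed */

and returns 2 (for the dead tuples at offsets 2 and 5); the storage of both is reclaimed when heap_page_prune_execute() later calls PageRepairFragmentation(). If the tuple at 7 were DEAD as well, the whole chain would be dead: offsets 5 and 7 would go to nowunused[] and the root would instead be recorded in nowdead[].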
+ */ + if (ItemIdIsNormal(rootlp)) + { + Assert(prstate->htsv[rootoffnum] != -1); + htup = (HeapTupleHeader) PageGetItem(dp, rootlp); + + if (HeapTupleHeaderIsHeapOnly(htup)) + { + /* + * If the tuple is DEAD and doesn't chain to anything else, mark + * it unused immediately. (If it does chain, we can only remove + * it as part of pruning its chain.) + * + * We need this primarily to handle aborted HOT updates, that is, + * XMIN_INVALID heap-only tuples. Those might not be linked to by + * any chain, since the parent tuple might be re-updated before + * any pruning occurs. So we have to be able to reap them + * separately from chain-pruning. (Note that + * HeapTupleHeaderIsHotUpdated will never return true for an + * XMIN_INVALID tuple, so this code will work even when there were + * sequential updates within the aborted transaction.) + * + * Note that we might first arrive at a dead heap-only tuple + * either here or while following a chain below. Whichever path + * gets there first will mark the tuple unused. + */ + if (prstate->htsv[rootoffnum] == HEAPTUPLE_DEAD && + !HeapTupleHeaderIsHotUpdated(htup)) + { + heap_prune_record_unused(prstate, rootoffnum); + HeapTupleHeaderAdvanceLatestRemovedXid(htup, + &prstate->latestRemovedXid); + ndeleted++; + } + + /* Nothing more to do */ + return ndeleted; + } + } + + /* Start from the root tuple */ + offnum = rootoffnum; + + /* while not end of the chain */ + for (;;) + { + ItemId lp; + bool tupdead, + recent_dead; + + /* Some sanity checks */ + if (offnum < FirstOffsetNumber || offnum > maxoff) + break; + + /* If item is already processed, stop --- it must not be same chain */ + if (prstate->marked[offnum]) + break; + + lp = PageGetItemId(dp, offnum); + + /* Unused item obviously isn't part of the chain */ + if (!ItemIdIsUsed(lp)) + break; + + /* + * If we are looking at the redirected root line pointer, jump to the + * first normal tuple in the chain. If we find a redirect somewhere + * else, stop --- it must not be same chain. + */ + if (ItemIdIsRedirected(lp)) + { + if (nchain > 0) + break; /* not at start of chain */ + chainitems[nchain++] = offnum; + offnum = ItemIdGetRedirect(rootlp); + continue; + } + + /* + * Likewise, a dead line pointer can't be part of the chain. (We + * already eliminated the case of dead root tuple outside this + * function.) + */ + if (ItemIdIsDead(lp)) + break; + + Assert(ItemIdIsNormal(lp)); + Assert(prstate->htsv[offnum] != -1); + htup = (HeapTupleHeader) PageGetItem(dp, lp); + + /* + * Check the tuple XMIN against prior XMAX, if any + */ + if (TransactionIdIsValid(priorXmax) && + !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax)) + break; + + /* + * OK, this tuple is indeed a member of the chain. + */ + chainitems[nchain++] = offnum; + + /* + * Check tuple's visibility status. + */ + tupdead = recent_dead = false; + + switch ((HTSV_Result) prstate->htsv[offnum]) + { + case HEAPTUPLE_DEAD: + tupdead = true; + break; + + case HEAPTUPLE_RECENTLY_DEAD: + recent_dead = true; + + /* + * This tuple may soon become DEAD. Update the hint field so + * that the page is reconsidered for pruning in future. + */ + heap_prune_record_prunable(prstate, + HeapTupleHeaderGetUpdateXid(htup)); + break; + + case HEAPTUPLE_DELETE_IN_PROGRESS: + + /* + * This tuple may soon become DEAD. Update the hint field so + * that the page is reconsidered for pruning in future. 
+ */ + heap_prune_record_prunable(prstate, + HeapTupleHeaderGetUpdateXid(htup)); + break; + + case HEAPTUPLE_LIVE: + case HEAPTUPLE_INSERT_IN_PROGRESS: + + /* + * If we wanted to optimize for aborts, we might consider + * marking the page prunable when we see INSERT_IN_PROGRESS. + * But we don't. See related decisions about when to mark the + * page prunable in heapam.c. + */ + break; + + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + break; + } + + /* + * Remember the last DEAD tuple seen. We will advance past + * RECENTLY_DEAD tuples just in case there's a DEAD one after them; + * but we can't advance past anything else. (XXX is it really worth + * continuing to scan beyond RECENTLY_DEAD? The case where we will + * find another DEAD tuple is a fairly unusual corner case.) + */ + if (tupdead) + { + latestdead = offnum; + HeapTupleHeaderAdvanceLatestRemovedXid(htup, + &prstate->latestRemovedXid); + } + else if (!recent_dead) + break; + + /* + * If the tuple is not HOT-updated, then we are at the end of this + * HOT-update chain. + */ + if (!HeapTupleHeaderIsHotUpdated(htup)) + break; + + /* HOT implies it can't have moved to different partition */ + Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup)); + + /* + * Advance to next chain member. + */ + Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == + BufferGetBlockNumber(buffer)); + offnum = ItemPointerGetOffsetNumber(&htup->t_ctid); + priorXmax = HeapTupleHeaderGetUpdateXid(htup); + } + + /* + * If we found a DEAD tuple in the chain, adjust the HOT chain so that all + * the DEAD tuples at the start of the chain are removed and the root line + * pointer is appropriately redirected. + */ + if (OffsetNumberIsValid(latestdead)) + { + /* + * Mark as unused each intermediate item that we are able to remove + * from the chain. + * + * When the previous item is the last dead tuple seen, we are at the + * right candidate for redirection. + */ + for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++) + { + heap_prune_record_unused(prstate, chainitems[i]); + ndeleted++; + } + + /* + * If the root entry had been a normal tuple, we are deleting it, so + * count it in the result. But changing a redirect (even to DEAD + * state) doesn't count. + */ + if (ItemIdIsNormal(rootlp)) + ndeleted++; + + /* + * If the DEAD tuple is at the end of the chain, the entire chain is + * dead and the root line pointer can be marked dead. Otherwise just + * redirect the root to the correct chain member. + */ + if (i >= nchain) + heap_prune_record_dead(prstate, rootoffnum); + else + heap_prune_record_redirect(prstate, rootoffnum, chainitems[i]); + } + else if (nchain < 2 && ItemIdIsRedirected(rootlp)) + { + /* + * We found a redirect item that doesn't point to a valid follow-on + * item. This can happen if the loop in heap_page_prune caused us to + * visit the dead successor of a redirect item before visiting the + * redirect item. We can clean up by setting the redirect item to + * DEAD state. + */ + heap_prune_record_dead(prstate, rootoffnum); + } + + return ndeleted; +} + +/* Record lowest soon-prunable XID */ +static void +heap_prune_record_prunable(PruneState *prstate, TransactionId xid) +{ + /* + * This should exactly match the PageSetPrunable macro. We can't store + * directly into the page header yet, so we update working state. 
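For reference, the working-state update below corresponds to roughly the following page-level operation. This is a hedged paraphrase of what PageSetPrunable does when the page header can be touched directly, not the macro's actual text, and it is not part of the patch.

    static inline void
    page_set_prunable_sketch(Page page, TransactionId xid)
    {
        Assert(TransactionIdIsNormal(xid));

        /* Keep the smallest (oldest) prunable XID seen so far for this page. */
        if (!TransactionIdIsValid(((PageHeader) page)->pd_prune_xid) ||
            TransactionIdPrecedes(xid, ((PageHeader) page)->pd_prune_xid))
            ((PageHeader) page)->pd_prune_xid = xid;
    }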
+ */ + Assert(TransactionIdIsNormal(xid)); + if (!TransactionIdIsValid(prstate->new_prune_xid) || + TransactionIdPrecedes(xid, prstate->new_prune_xid)) + prstate->new_prune_xid = xid; +} + +/* Record line pointer to be redirected */ +static void +heap_prune_record_redirect(PruneState *prstate, + OffsetNumber offnum, OffsetNumber rdoffnum) +{ + Assert(prstate->nredirected < MaxHeapTuplesPerPage); + prstate->redirected[prstate->nredirected * 2] = offnum; + prstate->redirected[prstate->nredirected * 2 + 1] = rdoffnum; + prstate->nredirected++; + Assert(!prstate->marked[offnum]); + prstate->marked[offnum] = true; + Assert(!prstate->marked[rdoffnum]); + prstate->marked[rdoffnum] = true; +} + +/* Record line pointer to be marked dead */ +static void +heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum) +{ + Assert(prstate->ndead < MaxHeapTuplesPerPage); + prstate->nowdead[prstate->ndead] = offnum; + prstate->ndead++; + Assert(!prstate->marked[offnum]); + prstate->marked[offnum] = true; +} + +/* Record line pointer to be marked unused */ +static void +heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum) +{ + Assert(prstate->nunused < MaxHeapTuplesPerPage); + prstate->nowunused[prstate->nunused] = offnum; + prstate->nunused++; + Assert(!prstate->marked[offnum]); + prstate->marked[offnum] = true; +} + + +/* + * Perform the actual page changes needed by heap_page_prune. + * It is expected that the caller has a super-exclusive lock on the + * buffer. + */ +void +heap_page_prune_execute(Buffer buffer, + OffsetNumber *redirected, int nredirected, + OffsetNumber *nowdead, int ndead, + OffsetNumber *nowunused, int nunused) +{ + Page page = (Page) BufferGetPage(buffer); + OffsetNumber *offnum; + int i; + + /* Shouldn't be called unless there's something to do */ + Assert(nredirected > 0 || ndead > 0 || nunused > 0); + + /* Update all redirected line pointers */ + offnum = redirected; + for (i = 0; i < nredirected; i++) + { + OffsetNumber fromoff = *offnum++; + OffsetNumber tooff = *offnum++; + ItemId fromlp = PageGetItemId(page, fromoff); + + ItemIdSetRedirect(fromlp, tooff); + } + + /* Update all now-dead line pointers */ + offnum = nowdead; + for (i = 0; i < ndead; i++) + { + OffsetNumber off = *offnum++; + ItemId lp = PageGetItemId(page, off); + + ItemIdSetDead(lp); + } + + /* Update all now-unused line pointers */ + offnum = nowunused; + for (i = 0; i < nunused; i++) + { + OffsetNumber off = *offnum++; + ItemId lp = PageGetItemId(page, off); + + ItemIdSetUnused(lp); + } + + /* + * Finally, repair any fragmentation, and update the page's hint bit about + * whether it has free pointers. + */ + PageRepairFragmentation(page); +} + + +/* + * For all items in this page, find their respective root line pointers. + * If item k is part of a HOT-chain with root at item j, then we set + * root_offsets[k - 1] = j. + * + * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries. + * Unused entries are filled with InvalidOffsetNumber (zero). + * + * The function must be called with at least share lock on the buffer, to + * prevent concurrent prune operations. + * + * Note: The information collected here is valid only as long as the caller + * holds a pin on the buffer. Once pin is released, a tuple might be pruned + * and reused by a completely unrelated tuple. 
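A schematic use of heap_get_root_tuples(), e.g. from code that must point index entries at HOT-chain roots rather than at heap-only members. Illustrative only; not part of the patch.

    static OffsetNumber
    example_root_of(Buffer buf, OffsetNumber offnum)
    {
        OffsetNumber root_offsets[MaxHeapTuplesPerPage];

        /* Caller must hold a pin and at least a share lock on the buffer. */
        heap_get_root_tuples(BufferGetPage(buf), root_offsets);

        /* Zero (InvalidOffsetNumber) means the slot is unused or has no root. */
        return root_offsets[offnum - 1];
    }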
+ */ +void +heap_get_root_tuples(Page page, OffsetNumber *root_offsets) +{ + OffsetNumber offnum, + maxoff; + + MemSet(root_offsets, InvalidOffsetNumber, + MaxHeapTuplesPerPage * sizeof(OffsetNumber)); + + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) + { + ItemId lp = PageGetItemId(page, offnum); + HeapTupleHeader htup; + OffsetNumber nextoffnum; + TransactionId priorXmax; + + /* skip unused and dead items */ + if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp)) + continue; + + if (ItemIdIsNormal(lp)) + { + htup = (HeapTupleHeader) PageGetItem(page, lp); + + /* + * Check if this tuple is part of a HOT-chain rooted at some other + * tuple. If so, skip it for now; we'll process it when we find + * its root. + */ + if (HeapTupleHeaderIsHeapOnly(htup)) + continue; + + /* + * This is either a plain tuple or the root of a HOT-chain. + * Remember it in the mapping. + */ + root_offsets[offnum - 1] = offnum; + + /* If it's not the start of a HOT-chain, we're done with it */ + if (!HeapTupleHeaderIsHotUpdated(htup)) + continue; + + /* Set up to scan the HOT-chain */ + nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid); + priorXmax = HeapTupleHeaderGetUpdateXid(htup); + } + else + { + /* Must be a redirect item. We do not set its root_offsets entry */ + Assert(ItemIdIsRedirected(lp)); + /* Set up to scan the HOT-chain */ + nextoffnum = ItemIdGetRedirect(lp); + priorXmax = InvalidTransactionId; + } + + /* + * Now follow the HOT-chain and collect other tuples in the chain. + * + * Note: Even though this is a nested loop, the complexity of the + * function is O(N) because a tuple in the page should be visited not + * more than twice, once in the outer loop and once in HOT-chain + * chases. + */ + for (;;) + { + /* Sanity check */ + if (nextoffnum < FirstOffsetNumber || nextoffnum > maxoff) + break; + + lp = PageGetItemId(page, nextoffnum); + + /* Check for broken chains */ + if (!ItemIdIsNormal(lp)) + break; + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + if (TransactionIdIsValid(priorXmax) && + !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup))) + break; + + /* Remember the root line pointer for this item */ + root_offsets[nextoffnum - 1] = offnum; + + /* Advance to next chain member, if any */ + if (!HeapTupleHeaderIsHotUpdated(htup)) + break; + + /* HOT implies it can't have moved to different partition */ + Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup)); + + nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid); + priorXmax = HeapTupleHeaderGetUpdateXid(htup); + } + } +} diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c new file mode 100644 index 0000000..15bef9f --- /dev/null +++ b/src/backend/access/heap/rewriteheap.c @@ -0,0 +1,1295 @@ +/*------------------------------------------------------------------------- + * + * rewriteheap.c + * Support functions to rewrite tables. + * + * These functions provide a facility to completely rewrite a heap, while + * preserving visibility information and update chains. + * + * INTERFACE + * + * The caller is responsible for creating the new heap, all catalog + * changes, supplying the tuples to be written to the new heap, and + * rebuilding indexes. The caller must hold AccessExclusiveLock on the + * target table, because we assume no one else is writing into it. 
+ * + * To use the facility: + * + * begin_heap_rewrite + * while (fetch next tuple) + * { + * if (tuple is dead) + * rewrite_heap_dead_tuple + * else + * { + * // do any transformations here if required + * rewrite_heap_tuple + * } + * } + * end_heap_rewrite + * + * The contents of the new relation shouldn't be relied on until after + * end_heap_rewrite is called. + * + * + * IMPLEMENTATION + * + * This would be a fairly trivial affair, except that we need to maintain + * the ctid chains that link versions of an updated tuple together. + * Since the newly stored tuples will have tids different from the original + * ones, if we just copied t_ctid fields to the new table the links would + * be wrong. When we are required to copy a (presumably recently-dead or + * delete-in-progress) tuple whose ctid doesn't point to itself, we have + * to substitute the correct ctid instead. + * + * For each ctid reference from A -> B, we might encounter either A first + * or B first. (Note that a tuple in the middle of a chain is both A and B + * of different pairs.) + * + * If we encounter A first, we'll store the tuple in the unresolved_tups + * hash table. When we later encounter B, we remove A from the hash table, + * fix the ctid to point to the new location of B, and insert both A and B + * to the new heap. + * + * If we encounter B first, we can insert B to the new heap right away. + * We then add an entry to the old_new_tid_map hash table showing B's + * original tid (in the old heap) and new tid (in the new heap). + * When we later encounter A, we get the new location of B from the table, + * and can write A immediately with the correct ctid. + * + * Entries in the hash tables can be removed as soon as the later tuple + * is encountered. That helps to keep the memory usage down. At the end, + * both tables are usually empty; we should have encountered both A and B + * of each pair. However, it's possible for A to be RECENTLY_DEAD and B + * entirely DEAD according to HeapTupleSatisfiesVacuum, because the test + * for deadness using OldestXmin is not exact. In such a case we might + * encounter B first, and skip it, and find A later. Then A would be added + * to unresolved_tups, and stay there until end of the rewrite. Since + * this case is very unusual, we don't worry about the memory usage. + * + * Using in-memory hash tables means that we use some memory for each live + * update chain in the table, from the time we find one end of the + * reference until we find the other end. That shouldn't be a problem in + * practice, but if you do something like an UPDATE without a where-clause + * on a large table, and then run CLUSTER in the same transaction, you + * could run out of memory. It doesn't seem worthwhile to add support for + * spill-to-disk, as there shouldn't be that many RECENTLY_DEAD tuples in a + * table under normal circumstances. Furthermore, in the typical scenario + * of CLUSTERing on an unchanging key column, we'll see all the versions + * of a given tuple together anyway, and so the peak memory usage is only + * proportional to the number of RECENTLY_DEAD versions of a single row, not + * in the whole table. Note that if we do fail halfway through a CLUSTER, + * the old table is still valid, so failure is not catastrophic. + * + * We can't use the normal heap_insert function to insert into the new + * heap, because heap_insert overwrites the visibility information. 
+ * We use a special-purpose raw_heap_insert function instead, which + * is optimized for bulk inserting a lot of tuples, knowing that we have + * exclusive access to the heap. raw_heap_insert builds new pages in + * local storage. When a page is full, or at the end of the process, + * we insert it to WAL as a single record and then write it to disk + * directly through smgr. Note, however, that any data sent to the new + * heap's TOAST table will go through the normal bufmgr. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994-5, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/heap/rewriteheap.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include +#include + +#include "access/heapam.h" +#include "access/heapam_xlog.h" +#include "access/heaptoast.h" +#include "access/rewriteheap.h" +#include "access/transam.h" +#include "access/xact.h" +#include "access/xloginsert.h" +#include "catalog/catalog.h" +#include "lib/ilist.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "replication/logical.h" +#include "replication/slot.h" +#include "storage/bufmgr.h" +#include "storage/fd.h" +#include "storage/procarray.h" +#include "storage/smgr.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +/* + * State associated with a rewrite operation. This is opaque to the user + * of the rewrite facility. + */ +typedef struct RewriteStateData +{ + Relation rs_old_rel; /* source heap */ + Relation rs_new_rel; /* destination heap */ + Page rs_buffer; /* page currently being built */ + BlockNumber rs_blockno; /* block where page will go */ + bool rs_buffer_valid; /* T if any tuples in buffer */ + bool rs_logical_rewrite; /* do we need to do logical rewriting */ + TransactionId rs_oldest_xmin; /* oldest xmin used by caller to determine + * tuple visibility */ + TransactionId rs_freeze_xid; /* Xid that will be used as freeze cutoff + * point */ + TransactionId rs_logical_xmin; /* Xid that will be used as cutoff point + * for logical rewrites */ + MultiXactId rs_cutoff_multi; /* MultiXactId that will be used as cutoff + * point for multixacts */ + MemoryContext rs_cxt; /* for hash tables and entries and tuples in + * them */ + XLogRecPtr rs_begin_lsn; /* XLogInsertLsn when starting the rewrite */ + HTAB *rs_unresolved_tups; /* unmatched A tuples */ + HTAB *rs_old_new_tid_map; /* unmatched B tuples */ + HTAB *rs_logical_mappings; /* logical remapping files */ + uint32 rs_num_rewrite_mappings; /* # in memory mappings */ +} RewriteStateData; + +/* + * The lookup keys for the hash tables are tuple TID and xmin (we must check + * both to avoid false matches from dead tuples). Beware that there is + * probably some padding space in this struct; it must be zeroed out for + * correct hashtable operation. 
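Because of that padding caveat, every lookup or insertion against these hash tables first zeroes the whole key, as the code later in this file does. A condensed sketch of the idiom, using the key and entry types declared just below (illustrative, not part of the patch):

    static OldToNewMapping
    example_lookup(RewriteState state, TransactionId xmin, ItemPointerData tid)
    {
        TidHashKey  hashkey;

        memset(&hashkey, 0, sizeof(hashkey));   /* zero any padding bytes */
        hashkey.xmin = xmin;
        hashkey.tid = tid;

        return (OldToNewMapping) hash_search(state->rs_old_new_tid_map,
                                             &hashkey, HASH_FIND, NULL);
    }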
+ */ +typedef struct +{ + TransactionId xmin; /* tuple xmin */ + ItemPointerData tid; /* tuple location in old heap */ +} TidHashKey; + +/* + * Entry structures for the hash tables + */ +typedef struct +{ + TidHashKey key; /* expected xmin/old location of B tuple */ + ItemPointerData old_tid; /* A's location in the old heap */ + HeapTuple tuple; /* A's tuple contents */ +} UnresolvedTupData; + +typedef UnresolvedTupData *UnresolvedTup; + +typedef struct +{ + TidHashKey key; /* actual xmin/old location of B tuple */ + ItemPointerData new_tid; /* where we put it in the new heap */ +} OldToNewMappingData; + +typedef OldToNewMappingData *OldToNewMapping; + +/* + * In-Memory data for an xid that might need logical remapping entries + * to be logged. + */ +typedef struct RewriteMappingFile +{ + TransactionId xid; /* xid that might need to see the row */ + int vfd; /* fd of mappings file */ + off_t off; /* how far have we written yet */ + uint32 num_mappings; /* number of in-memory mappings */ + dlist_head mappings; /* list of in-memory mappings */ + char path[MAXPGPATH]; /* path, for error messages */ +} RewriteMappingFile; + +/* + * A single In-Memory logical rewrite mapping, hanging off + * RewriteMappingFile->mappings. + */ +typedef struct RewriteMappingDataEntry +{ + LogicalRewriteMappingData map; /* map between old and new location of the + * tuple */ + dlist_node node; +} RewriteMappingDataEntry; + + +/* prototypes for internal functions */ +static void raw_heap_insert(RewriteState state, HeapTuple tup); + +/* internal logical remapping prototypes */ +static void logical_begin_heap_rewrite(RewriteState state); +static void logical_rewrite_heap_tuple(RewriteState state, ItemPointerData old_tid, HeapTuple new_tuple); +static void logical_end_heap_rewrite(RewriteState state); + + +/* + * Begin a rewrite of a table + * + * old_heap old, locked heap relation tuples will be read from + * new_heap new, locked heap relation to insert tuples to + * oldest_xmin xid used by the caller to determine which tuples are dead + * freeze_xid xid before which tuples will be frozen + * cutoff_multi multixact before which multis will be removed + * + * Returns an opaque RewriteState, allocated in current memory context, + * to be used in subsequent calls to the other functions. + */ +RewriteState +begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xmin, + TransactionId freeze_xid, MultiXactId cutoff_multi) +{ + RewriteState state; + MemoryContext rw_cxt; + MemoryContext old_cxt; + HASHCTL hash_ctl; + + /* + * To ease cleanup, make a separate context that will contain the + * RewriteState struct itself plus all subsidiary data. 
+ */ + rw_cxt = AllocSetContextCreate(CurrentMemoryContext, + "Table rewrite", + ALLOCSET_DEFAULT_SIZES); + old_cxt = MemoryContextSwitchTo(rw_cxt); + + /* Create and fill in the state struct */ + state = palloc0(sizeof(RewriteStateData)); + + state->rs_old_rel = old_heap; + state->rs_new_rel = new_heap; + state->rs_buffer = (Page) palloc(BLCKSZ); + /* new_heap needn't be empty, just locked */ + state->rs_blockno = RelationGetNumberOfBlocks(new_heap); + state->rs_buffer_valid = false; + state->rs_oldest_xmin = oldest_xmin; + state->rs_freeze_xid = freeze_xid; + state->rs_cutoff_multi = cutoff_multi; + state->rs_cxt = rw_cxt; + + /* Initialize hash tables used to track update chains */ + hash_ctl.keysize = sizeof(TidHashKey); + hash_ctl.entrysize = sizeof(UnresolvedTupData); + hash_ctl.hcxt = state->rs_cxt; + + state->rs_unresolved_tups = + hash_create("Rewrite / Unresolved ctids", + 128, /* arbitrary initial size */ + &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + hash_ctl.entrysize = sizeof(OldToNewMappingData); + + state->rs_old_new_tid_map = + hash_create("Rewrite / Old to new tid map", + 128, /* arbitrary initial size */ + &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + MemoryContextSwitchTo(old_cxt); + + logical_begin_heap_rewrite(state); + + return state; +} + +/* + * End a rewrite. + * + * state and any other resources are freed. + */ +void +end_heap_rewrite(RewriteState state) +{ + HASH_SEQ_STATUS seq_status; + UnresolvedTup unresolved; + + /* + * Write any remaining tuples in the UnresolvedTups table. If we have any + * left, they should in fact be dead, but let's err on the safe side. + */ + hash_seq_init(&seq_status, state->rs_unresolved_tups); + + while ((unresolved = hash_seq_search(&seq_status)) != NULL) + { + ItemPointerSetInvalid(&unresolved->tuple->t_data->t_ctid); + raw_heap_insert(state, unresolved->tuple); + } + + /* Write the last page, if any */ + if (state->rs_buffer_valid) + { + if (RelationNeedsWAL(state->rs_new_rel)) + log_newpage(&state->rs_new_rel->rd_node, + MAIN_FORKNUM, + state->rs_blockno, + state->rs_buffer, + true); + + PageSetChecksumInplace(state->rs_buffer, state->rs_blockno); + + RelationOpenSmgr(state->rs_new_rel); + smgrextend(state->rs_new_rel->rd_smgr, MAIN_FORKNUM, state->rs_blockno, + (char *) state->rs_buffer, true); + } + + /* + * When we WAL-logged rel pages, we must nonetheless fsync them. The + * reason is the same as in storage.c's RelationCopyStorage(): we're + * writing data that's not in shared buffers, and so a CHECKPOINT + * occurring during the rewriteheap operation won't have fsync'd data we + * wrote before the checkpoint. + */ + if (RelationNeedsWAL(state->rs_new_rel)) + { + /* for an empty table, this could be first smgr access */ + RelationOpenSmgr(state->rs_new_rel); + smgrimmedsync(state->rs_new_rel->rd_smgr, MAIN_FORKNUM); + } + + logical_end_heap_rewrite(state); + + /* Deleting the context frees everything */ + MemoryContextDelete(state->rs_cxt); +} + +/* + * Add a tuple to the new heap. + * + * Visibility information is copied from the original tuple, except that + * we "freeze" very-old tuples. Note that since we scribble on new_tuple, + * it had better be temp storage not a pointer to the original tuple. 
+ * + * state opaque state as returned by begin_heap_rewrite + * old_tuple original tuple in the old heap + * new_tuple new, rewritten tuple to be inserted to new heap + */ +void +rewrite_heap_tuple(RewriteState state, + HeapTuple old_tuple, HeapTuple new_tuple) +{ + MemoryContext old_cxt; + ItemPointerData old_tid; + TidHashKey hashkey; + bool found; + bool free_new; + + old_cxt = MemoryContextSwitchTo(state->rs_cxt); + + /* + * Copy the original tuple's visibility information into new_tuple. + * + * XXX we might later need to copy some t_infomask2 bits, too? Right now, + * we intentionally clear the HOT status bits. + */ + memcpy(&new_tuple->t_data->t_choice.t_heap, + &old_tuple->t_data->t_choice.t_heap, + sizeof(HeapTupleFields)); + + new_tuple->t_data->t_infomask &= ~HEAP_XACT_MASK; + new_tuple->t_data->t_infomask2 &= ~HEAP2_XACT_MASK; + new_tuple->t_data->t_infomask |= + old_tuple->t_data->t_infomask & HEAP_XACT_MASK; + + /* + * While we have our hands on the tuple, we may as well freeze any + * eligible xmin or xmax, so that future VACUUM effort can be saved. + */ + heap_freeze_tuple(new_tuple->t_data, + state->rs_old_rel->rd_rel->relfrozenxid, + state->rs_old_rel->rd_rel->relminmxid, + state->rs_freeze_xid, + state->rs_cutoff_multi); + + /* + * Invalid ctid means that ctid should point to the tuple itself. We'll + * override it later if the tuple is part of an update chain. + */ + ItemPointerSetInvalid(&new_tuple->t_data->t_ctid); + + /* + * If the tuple has been updated, check the old-to-new mapping hash table. + */ + if (!((old_tuple->t_data->t_infomask & HEAP_XMAX_INVALID) || + HeapTupleHeaderIsOnlyLocked(old_tuple->t_data)) && + !HeapTupleHeaderIndicatesMovedPartitions(old_tuple->t_data) && + !(ItemPointerEquals(&(old_tuple->t_self), + &(old_tuple->t_data->t_ctid)))) + { + OldToNewMapping mapping; + + memset(&hashkey, 0, sizeof(hashkey)); + hashkey.xmin = HeapTupleHeaderGetUpdateXid(old_tuple->t_data); + hashkey.tid = old_tuple->t_data->t_ctid; + + mapping = (OldToNewMapping) + hash_search(state->rs_old_new_tid_map, &hashkey, + HASH_FIND, NULL); + + if (mapping != NULL) + { + /* + * We've already copied the tuple that t_ctid points to, so we can + * set the ctid of this tuple to point to the new location, and + * insert it right away. + */ + new_tuple->t_data->t_ctid = mapping->new_tid; + + /* We don't need the mapping entry anymore */ + hash_search(state->rs_old_new_tid_map, &hashkey, + HASH_REMOVE, &found); + Assert(found); + } + else + { + /* + * We haven't seen the tuple t_ctid points to yet. Stash this + * tuple into unresolved_tups to be written later. + */ + UnresolvedTup unresolved; + + unresolved = hash_search(state->rs_unresolved_tups, &hashkey, + HASH_ENTER, &found); + Assert(!found); + + unresolved->old_tid = old_tuple->t_self; + unresolved->tuple = heap_copytuple(new_tuple); + + /* + * We can't do anything more now, since we don't know where the + * tuple will be written. + */ + MemoryContextSwitchTo(old_cxt); + return; + } + } + + /* + * Now we will write the tuple, and then check to see if it is the B tuple + * in any new or known pair. When we resolve a known pair, we will be + * able to write that pair's A tuple, and then we have to check if it + * resolves some other pair. Hence, we need a loop here. 
+ */ + old_tid = old_tuple->t_self; + free_new = false; + + for (;;) + { + ItemPointerData new_tid; + + /* Insert the tuple and find out where it's put in new_heap */ + raw_heap_insert(state, new_tuple); + new_tid = new_tuple->t_self; + + logical_rewrite_heap_tuple(state, old_tid, new_tuple); + + /* + * If the tuple is the updated version of a row, and the prior version + * wouldn't be DEAD yet, then we need to either resolve the prior + * version (if it's waiting in rs_unresolved_tups), or make an entry + * in rs_old_new_tid_map (so we can resolve it when we do see it). The + * previous tuple's xmax would equal this one's xmin, so it's + * RECENTLY_DEAD if and only if the xmin is not before OldestXmin. + */ + if ((new_tuple->t_data->t_infomask & HEAP_UPDATED) && + !TransactionIdPrecedes(HeapTupleHeaderGetXmin(new_tuple->t_data), + state->rs_oldest_xmin)) + { + /* + * Okay, this is B in an update pair. See if we've seen A. + */ + UnresolvedTup unresolved; + + memset(&hashkey, 0, sizeof(hashkey)); + hashkey.xmin = HeapTupleHeaderGetXmin(new_tuple->t_data); + hashkey.tid = old_tid; + + unresolved = hash_search(state->rs_unresolved_tups, &hashkey, + HASH_FIND, NULL); + + if (unresolved != NULL) + { + /* + * We have seen and memorized the previous tuple already. Now + * that we know where we inserted the tuple its t_ctid points + * to, fix its t_ctid and insert it to the new heap. + */ + if (free_new) + heap_freetuple(new_tuple); + new_tuple = unresolved->tuple; + free_new = true; + old_tid = unresolved->old_tid; + new_tuple->t_data->t_ctid = new_tid; + + /* + * We don't need the hash entry anymore, but don't free its + * tuple just yet. + */ + hash_search(state->rs_unresolved_tups, &hashkey, + HASH_REMOVE, &found); + Assert(found); + + /* loop back to insert the previous tuple in the chain */ + continue; + } + else + { + /* + * Remember the new tid of this tuple. We'll use it to set the + * ctid when we find the previous tuple in the chain. + */ + OldToNewMapping mapping; + + mapping = hash_search(state->rs_old_new_tid_map, &hashkey, + HASH_ENTER, &found); + Assert(!found); + + mapping->new_tid = new_tid; + } + } + + /* Done with this (chain of) tuples, for now */ + if (free_new) + heap_freetuple(new_tuple); + break; + } + + MemoryContextSwitchTo(old_cxt); +} + +/* + * Register a dead tuple with an ongoing rewrite. Dead tuples are not + * copied to the new table, but we still make note of them so that we + * can release some resources earlier. + * + * Returns true if a tuple was removed from the unresolved_tups table. + * This indicates that that tuple, previously thought to be "recently dead", + * is now known really dead and won't be written to the output. + */ +bool +rewrite_heap_dead_tuple(RewriteState state, HeapTuple old_tuple) +{ + /* + * If we have already seen an earlier tuple in the update chain that + * points to this tuple, let's forget about that earlier tuple. It's in + * fact dead as well, our simple xmax < OldestXmin test in + * HeapTupleSatisfiesVacuum just wasn't enough to detect it. It happens + * when xmin of a tuple is greater than xmax, which sounds + * counter-intuitive but is perfectly valid. + * + * We don't bother to try to detect the situation the other way round, + * when we encounter the dead tuple first and then the recently dead one + * that points to it. If that happens, we'll have some unmatched entries + * in the UnresolvedTups hash table at the end. That can happen anyway, + * because a vacuum might have removed the dead tuple in the chain before + * us. 
+ */ + UnresolvedTup unresolved; + TidHashKey hashkey; + bool found; + + memset(&hashkey, 0, sizeof(hashkey)); + hashkey.xmin = HeapTupleHeaderGetXmin(old_tuple->t_data); + hashkey.tid = old_tuple->t_self; + + unresolved = hash_search(state->rs_unresolved_tups, &hashkey, + HASH_FIND, NULL); + + if (unresolved != NULL) + { + /* Need to free the contained tuple as well as the hashtable entry */ + heap_freetuple(unresolved->tuple); + hash_search(state->rs_unresolved_tups, &hashkey, + HASH_REMOVE, &found); + Assert(found); + return true; + } + + return false; +} + +/* + * Insert a tuple to the new relation. This has to track heap_insert + * and its subsidiary functions! + * + * t_self of the tuple is set to the new TID of the tuple. If t_ctid of the + * tuple is invalid on entry, it's replaced with the new TID as well (in + * the inserted data only, not in the caller's copy). + */ +static void +raw_heap_insert(RewriteState state, HeapTuple tup) +{ + Page page = state->rs_buffer; + Size pageFreeSpace, + saveFreeSpace; + Size len; + OffsetNumber newoff; + HeapTuple heaptup; + + /* + * If the new tuple is too big for storage or contains already toasted + * out-of-line attributes from some other relation, invoke the toaster. + * + * Note: below this point, heaptup is the data we actually intend to store + * into the relation; tup is the caller's original untoasted data. + */ + if (state->rs_new_rel->rd_rel->relkind == RELKIND_TOASTVALUE) + { + /* toast table entries should never be recursively toasted */ + Assert(!HeapTupleHasExternal(tup)); + heaptup = tup; + } + else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD) + { + int options = HEAP_INSERT_SKIP_FSM; + + /* + * While rewriting the heap for VACUUM FULL / CLUSTER, make sure data + * for the TOAST table are not logically decoded. The main heap is + * WAL-logged as XLOG FPI records, which are not logically decoded. + */ + options |= HEAP_INSERT_NO_LOGICAL; + + heaptup = heap_toast_insert_or_update(state->rs_new_rel, tup, NULL, + options); + } + else + heaptup = tup; + + len = MAXALIGN(heaptup->t_len); /* be conservative */ + + /* + * If we're gonna fail for oversize tuple, do it right away + */ + if (len > MaxHeapTupleSize) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("row is too big: size %zu, maximum size %zu", + len, MaxHeapTupleSize))); + + /* Compute desired extra freespace due to fillfactor option */ + saveFreeSpace = RelationGetTargetPageFreeSpace(state->rs_new_rel, + HEAP_DEFAULT_FILLFACTOR); + + /* Now we can check to see if there's enough free space already. */ + if (state->rs_buffer_valid) + { + pageFreeSpace = PageGetHeapFreeSpace(page); + + if (len + saveFreeSpace > pageFreeSpace) + { + /* + * Doesn't fit, so write out the existing page. It always + * contains a tuple. Hence, unlike RelationGetBufferForTuple(), + * enforce saveFreeSpace unconditionally. + */ + + /* XLOG stuff */ + if (RelationNeedsWAL(state->rs_new_rel)) + log_newpage(&state->rs_new_rel->rd_node, + MAIN_FORKNUM, + state->rs_blockno, + page, + true); + + /* + * Now write the page. We say skipFsync = true because there's no + * need for smgr to schedule an fsync for this write; we'll do it + * ourselves in end_heap_rewrite. 
+ */ + RelationOpenSmgr(state->rs_new_rel); + + PageSetChecksumInplace(page, state->rs_blockno); + + smgrextend(state->rs_new_rel->rd_smgr, MAIN_FORKNUM, + state->rs_blockno, (char *) page, true); + + state->rs_blockno++; + state->rs_buffer_valid = false; + } + } + + if (!state->rs_buffer_valid) + { + /* Initialize a new empty page */ + PageInit(page, BLCKSZ, 0); + state->rs_buffer_valid = true; + } + + /* And now we can insert the tuple into the page */ + newoff = PageAddItem(page, (Item) heaptup->t_data, heaptup->t_len, + InvalidOffsetNumber, false, true); + if (newoff == InvalidOffsetNumber) + elog(ERROR, "failed to add tuple"); + + /* Update caller's t_self to the actual position where it was stored */ + ItemPointerSet(&(tup->t_self), state->rs_blockno, newoff); + + /* + * Insert the correct position into CTID of the stored tuple, too, if the + * caller didn't supply a valid CTID. + */ + if (!ItemPointerIsValid(&tup->t_data->t_ctid)) + { + ItemId newitemid; + HeapTupleHeader onpage_tup; + + newitemid = PageGetItemId(page, newoff); + onpage_tup = (HeapTupleHeader) PageGetItem(page, newitemid); + + onpage_tup->t_ctid = tup->t_self; + } + + /* If heaptup is a private copy, release it. */ + if (heaptup != tup) + heap_freetuple(heaptup); +} + +/* ------------------------------------------------------------------------ + * Logical rewrite support + * + * When doing logical decoding - which relies on using cmin/cmax of catalog + * tuples, via xl_heap_new_cid records - heap rewrites have to log enough + * information to allow the decoding backend to updates its internal mapping + * of (relfilenode,ctid) => (cmin, cmax) to be correct for the rewritten heap. + * + * For that, every time we find a tuple that's been modified in a catalog + * relation within the xmin horizon of any decoding slot, we log a mapping + * from the old to the new location. + * + * To deal with rewrites that abort the filename of a mapping file contains + * the xid of the transaction performing the rewrite, which then can be + * checked before being read in. + * + * For efficiency we don't immediately spill every single map mapping for a + * row to disk but only do so in batches when we've collected several of them + * in memory or when end_heap_rewrite() has been called. + * + * Crash-Safety: This module diverts from the usual patterns of doing WAL + * since it cannot rely on checkpoint flushing out all buffers and thus + * waiting for exclusive locks on buffers. Usually the XLogInsert() covering + * buffer modifications is performed while the buffer(s) that are being + * modified are exclusively locked guaranteeing that both the WAL record and + * the modified heap are on either side of the checkpoint. But since the + * mapping files we log aren't in shared_buffers that interlock doesn't work. + * + * Instead we simply write the mapping files out to disk, *before* the + * XLogInsert() is performed. That guarantees that either the XLogInsert() is + * inserted after the checkpoint's redo pointer or that the checkpoint (via + * CheckPointLogicalRewriteHeap()) has flushed the (partial) mapping file to + * disk. That leaves the tail end that has not yet been flushed open to + * corruption, which is solved by including the current offset in the + * xl_heap_rewrite_mapping records and truncating the mapping file to it + * during replay. Every time a rewrite is finished all generated mapping files + * are synced to disk. 
+ * + * Note that if we were only concerned about crash safety we wouldn't have to + * deal with WAL logging at all - an fsync() at the end of a rewrite would be + * sufficient for crash safety. Any mapping that hasn't been safely flushed to + * disk has to be by an aborted (explicitly or via a crash) transaction and is + * ignored by virtue of the xid in its name being subject to a + * TransactionDidCommit() check. But we want to support having standbys via + * physical replication, both for availability and to do logical decoding + * there. + * ------------------------------------------------------------------------ + */ + +/* + * Do preparations for logging logical mappings during a rewrite if + * necessary. If we detect that we don't need to log anything we'll prevent + * any further action by the various logical rewrite functions. + */ +static void +logical_begin_heap_rewrite(RewriteState state) +{ + HASHCTL hash_ctl; + TransactionId logical_xmin; + + /* + * We only need to persist these mappings if the rewritten table can be + * accessed during logical decoding, if not, we can skip doing any + * additional work. + */ + state->rs_logical_rewrite = + RelationIsAccessibleInLogicalDecoding(state->rs_old_rel); + + if (!state->rs_logical_rewrite) + return; + + ProcArrayGetReplicationSlotXmin(NULL, &logical_xmin); + + /* + * If there are no logical slots in progress we don't need to do anything, + * there cannot be any remappings for relevant rows yet. The relation's + * lock protects us against races. + */ + if (logical_xmin == InvalidTransactionId) + { + state->rs_logical_rewrite = false; + return; + } + + state->rs_logical_xmin = logical_xmin; + state->rs_begin_lsn = GetXLogInsertRecPtr(); + state->rs_num_rewrite_mappings = 0; + + hash_ctl.keysize = sizeof(TransactionId); + hash_ctl.entrysize = sizeof(RewriteMappingFile); + hash_ctl.hcxt = state->rs_cxt; + + state->rs_logical_mappings = + hash_create("Logical rewrite mapping", + 128, /* arbitrary initial size */ + &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); +} + +/* + * Flush all logical in-memory mappings to disk, but don't fsync them yet. 
+ */ +static void +logical_heap_rewrite_flush_mappings(RewriteState state) +{ + HASH_SEQ_STATUS seq_status; + RewriteMappingFile *src; + dlist_mutable_iter iter; + + Assert(state->rs_logical_rewrite); + + /* no logical rewrite in progress, no need to iterate over mappings */ + if (state->rs_num_rewrite_mappings == 0) + return; + + elog(DEBUG1, "flushing %u logical rewrite mapping entries", + state->rs_num_rewrite_mappings); + + hash_seq_init(&seq_status, state->rs_logical_mappings); + while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL) + { + char *waldata; + char *waldata_start; + xl_heap_rewrite_mapping xlrec; + Oid dboid; + uint32 len; + int written; + + /* this file hasn't got any new mappings */ + if (src->num_mappings == 0) + continue; + + if (state->rs_old_rel->rd_rel->relisshared) + dboid = InvalidOid; + else + dboid = MyDatabaseId; + + xlrec.num_mappings = src->num_mappings; + xlrec.mapped_rel = RelationGetRelid(state->rs_old_rel); + xlrec.mapped_xid = src->xid; + xlrec.mapped_db = dboid; + xlrec.offset = src->off; + xlrec.start_lsn = state->rs_begin_lsn; + + /* write all mappings consecutively */ + len = src->num_mappings * sizeof(LogicalRewriteMappingData); + waldata_start = waldata = palloc(len); + + /* + * collect data we need to write out, but don't modify ondisk data yet + */ + dlist_foreach_modify(iter, &src->mappings) + { + RewriteMappingDataEntry *pmap; + + pmap = dlist_container(RewriteMappingDataEntry, node, iter.cur); + + memcpy(waldata, &pmap->map, sizeof(pmap->map)); + waldata += sizeof(pmap->map); + + /* remove from the list and free */ + dlist_delete(&pmap->node); + pfree(pmap); + + /* update bookkeeping */ + state->rs_num_rewrite_mappings--; + src->num_mappings--; + } + + Assert(src->num_mappings == 0); + Assert(waldata == waldata_start + len); + + /* + * Note that we deviate from the usual WAL coding practices here, + * check the above "Logical rewrite support" comment for reasoning. + */ + written = FileWrite(src->vfd, waldata_start, len, src->off, + WAIT_EVENT_LOGICAL_REWRITE_WRITE); + if (written != len) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\", wrote %d of %d: %m", src->path, + written, len))); + src->off += len; + + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), sizeof(xlrec)); + XLogRegisterData(waldata_start, len); + + /* write xlog record */ + XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_REWRITE); + + pfree(waldata_start); + } + Assert(state->rs_num_rewrite_mappings == 0); +} + +/* + * Logical remapping part of end_heap_rewrite(). + */ +static void +logical_end_heap_rewrite(RewriteState state) +{ + HASH_SEQ_STATUS seq_status; + RewriteMappingFile *src; + + /* done, no logical rewrite in progress */ + if (!state->rs_logical_rewrite) + return; + + /* writeout remaining in-memory entries */ + if (state->rs_num_rewrite_mappings > 0) + logical_heap_rewrite_flush_mappings(state); + + /* Iterate over all mappings we have written and fsync the files. */ + hash_seq_init(&seq_status, state->rs_logical_mappings); + while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL) + { + if (FileSync(src->vfd, WAIT_EVENT_LOGICAL_REWRITE_SYNC) != 0) + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", src->path))); + FileClose(src->vfd); + } + /* memory context cleanup will deal with the rest */ +} + +/* + * Log a single (old->new) mapping for 'xid'. 
+ */ +static void +logical_rewrite_log_mapping(RewriteState state, TransactionId xid, + LogicalRewriteMappingData *map) +{ + RewriteMappingFile *src; + RewriteMappingDataEntry *pmap; + Oid relid; + bool found; + + relid = RelationGetRelid(state->rs_old_rel); + + /* look for existing mappings for this 'mapped' xid */ + src = hash_search(state->rs_logical_mappings, &xid, + HASH_ENTER, &found); + + /* + * We haven't yet had the need to map anything for this xid, create + * per-xid data structures. + */ + if (!found) + { + char path[MAXPGPATH]; + Oid dboid; + + if (state->rs_old_rel->rd_rel->relisshared) + dboid = InvalidOid; + else + dboid = MyDatabaseId; + + snprintf(path, MAXPGPATH, + "pg_logical/mappings/" LOGICAL_REWRITE_FORMAT, + dboid, relid, + LSN_FORMAT_ARGS(state->rs_begin_lsn), + xid, GetCurrentTransactionId()); + + dlist_init(&src->mappings); + src->num_mappings = 0; + src->off = 0; + memcpy(src->path, path, sizeof(path)); + src->vfd = PathNameOpenFile(path, + O_CREAT | O_EXCL | O_WRONLY | PG_BINARY); + if (src->vfd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", path))); + } + + pmap = MemoryContextAlloc(state->rs_cxt, + sizeof(RewriteMappingDataEntry)); + memcpy(&pmap->map, map, sizeof(LogicalRewriteMappingData)); + dlist_push_tail(&src->mappings, &pmap->node); + src->num_mappings++; + state->rs_num_rewrite_mappings++; + + /* + * Write out buffer every time we've too many in-memory entries across all + * mapping files. + */ + if (state->rs_num_rewrite_mappings >= 1000 /* arbitrary number */ ) + logical_heap_rewrite_flush_mappings(state); +} + +/* + * Perform logical remapping for a tuple that's mapped from old_tid to + * new_tuple->t_self by rewrite_heap_tuple() if necessary for the tuple. + */ +static void +logical_rewrite_heap_tuple(RewriteState state, ItemPointerData old_tid, + HeapTuple new_tuple) +{ + ItemPointerData new_tid = new_tuple->t_self; + TransactionId cutoff = state->rs_logical_xmin; + TransactionId xmin; + TransactionId xmax; + bool do_log_xmin = false; + bool do_log_xmax = false; + LogicalRewriteMappingData map; + + /* no logical rewrite in progress, we don't need to log anything */ + if (!state->rs_logical_rewrite) + return; + + xmin = HeapTupleHeaderGetXmin(new_tuple->t_data); + /* use *GetUpdateXid to correctly deal with multixacts */ + xmax = HeapTupleHeaderGetUpdateXid(new_tuple->t_data); + + /* + * Log the mapping iff the tuple has been created recently. + */ + if (TransactionIdIsNormal(xmin) && !TransactionIdPrecedes(xmin, cutoff)) + do_log_xmin = true; + + if (!TransactionIdIsNormal(xmax)) + { + /* + * no xmax is set, can't have any permanent ones, so this check is + * sufficient + */ + } + else if (HEAP_XMAX_IS_LOCKED_ONLY(new_tuple->t_data->t_infomask)) + { + /* only locked, we don't care */ + } + else if (!TransactionIdPrecedes(xmax, cutoff)) + { + /* tuple has been deleted recently, log */ + do_log_xmax = true; + } + + /* if neither needs to be logged, we're done */ + if (!do_log_xmin && !do_log_xmax) + return; + + /* fill out mapping information */ + map.old_node = state->rs_old_rel->rd_node; + map.old_tid = old_tid; + map.new_node = state->rs_new_rel->rd_node; + map.new_tid = new_tid; + + /* --- + * Now persist the mapping for the individual xids that are affected. We + * need to log for both xmin and xmax if they aren't the same transaction + * since the mapping files are per "affected" xid. 
+ * We don't muster all that much effort detecting whether xmin and xmax + * are actually the same transaction, we just check whether the xid is the + * same disregarding subtransactions. Logging too much is relatively + * harmless and we could never do the check fully since subtransaction + * data is thrown away during restarts. + * --- + */ + if (do_log_xmin) + logical_rewrite_log_mapping(state, xmin, &map); + /* separately log mapping for xmax unless it'd be redundant */ + if (do_log_xmax && !TransactionIdEquals(xmin, xmax)) + logical_rewrite_log_mapping(state, xmax, &map); +} + +/* + * Replay XLOG_HEAP2_REWRITE records + */ +void +heap_xlog_logical_rewrite(XLogReaderState *r) +{ + char path[MAXPGPATH]; + int fd; + xl_heap_rewrite_mapping *xlrec; + uint32 len; + char *data; + + xlrec = (xl_heap_rewrite_mapping *) XLogRecGetData(r); + + snprintf(path, MAXPGPATH, + "pg_logical/mappings/" LOGICAL_REWRITE_FORMAT, + xlrec->mapped_db, xlrec->mapped_rel, + LSN_FORMAT_ARGS(xlrec->start_lsn), + xlrec->mapped_xid, XLogRecGetXid(r)); + + fd = OpenTransientFile(path, + O_CREAT | O_WRONLY | PG_BINARY); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", path))); + + /* + * Truncate all data that's not guaranteed to have been safely fsynced (by + * previous record or by the last checkpoint). + */ + pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_TRUNCATE); + if (ftruncate(fd, xlrec->offset) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not truncate file \"%s\" to %u: %m", + path, (uint32) xlrec->offset))); + pgstat_report_wait_end(); + + data = XLogRecGetData(r) + sizeof(*xlrec); + + len = xlrec->num_mappings * sizeof(LogicalRewriteMappingData); + + /* write out tail end of mapping file (again) */ + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_MAPPING_WRITE); + if (pg_pwrite(fd, data, len, xlrec->offset) != len) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", path))); + } + pgstat_report_wait_end(); + + /* + * Now fsync all previously written data. We could improve things and only + * do this for the last write to a file, but the required bookkeeping + * doesn't seem worth the trouble. + */ + pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_MAPPING_SYNC); + if (pg_fsync(fd) != 0) + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", path))); + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); +} + +/* --- + * Perform a checkpoint for logical rewrite mappings + * + * This serves two tasks: + * 1) Remove all mappings not needed anymore based on the logical restart LSN + * 2) Flush all remaining mappings to disk, so that replay after a checkpoint + * only has to deal with the parts of a mapping that have been written out + * after the checkpoint started. + * --- + */ +void +CheckPointLogicalRewriteHeap(void) +{ + XLogRecPtr cutoff; + XLogRecPtr redo; + DIR *mappings_dir; + struct dirent *mapping_de; + char path[MAXPGPATH + 20]; + + /* + * We start of with a minimum of the last redo pointer. No new decoding + * slot will start before that, so that's a safe upper bound for removal. 
+ */ + redo = GetRedoRecPtr(); + + /* now check for the restart ptrs from existing slots */ + cutoff = ReplicationSlotsComputeLogicalRestartLSN(); + + /* don't start earlier than the restart lsn */ + if (cutoff != InvalidXLogRecPtr && redo < cutoff) + cutoff = redo; + + mappings_dir = AllocateDir("pg_logical/mappings"); + while ((mapping_de = ReadDir(mappings_dir, "pg_logical/mappings")) != NULL) + { + struct stat statbuf; + Oid dboid; + Oid relid; + XLogRecPtr lsn; + TransactionId rewrite_xid; + TransactionId create_xid; + uint32 hi, + lo; + + if (strcmp(mapping_de->d_name, ".") == 0 || + strcmp(mapping_de->d_name, "..") == 0) + continue; + + snprintf(path, sizeof(path), "pg_logical/mappings/%s", mapping_de->d_name); + if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode)) + continue; + + /* Skip over files that cannot be ours. */ + if (strncmp(mapping_de->d_name, "map-", 4) != 0) + continue; + + if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT, + &dboid, &relid, &hi, &lo, &rewrite_xid, &create_xid) != 6) + elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name); + + lsn = ((uint64) hi) << 32 | lo; + + if (lsn < cutoff || cutoff == InvalidXLogRecPtr) + { + elog(DEBUG1, "removing logical rewrite file \"%s\"", path); + if (unlink(path) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", path))); + } + else + { + /* on some operating systems fsyncing a file requires O_RDWR */ + int fd = OpenTransientFile(path, O_RDWR | PG_BINARY); + + /* + * The file cannot vanish due to concurrency since this function + * is the only one removing logical mappings and only one + * checkpoint can be in progress at a time. + */ + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + + /* + * We could try to avoid fsyncing files that either haven't + * changed or have only been created since the checkpoint's start, + * but it's currently not deemed worth the effort. + */ + pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_CHECKPOINT_SYNC); + if (pg_fsync(fd) != 0) + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", path))); + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); + } + } + FreeDir(mappings_dir); + + /* persist directory entries to disk */ + fsync_fname("pg_logical/mappings", true); +} diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c new file mode 100644 index 0000000..8aab6e3 --- /dev/null +++ b/src/backend/access/heap/vacuumlazy.c @@ -0,0 +1,4353 @@ +/*------------------------------------------------------------------------- + * + * vacuumlazy.c + * Concurrent ("lazy") vacuuming. + * + * + * The major space usage for LAZY VACUUM is storage for the array of dead tuple + * TIDs. We want to ensure we can vacuum even the very largest relations with + * finite memory space usage. To do that, we set upper bounds on the number of + * tuples we will keep track of at once. + * + * We are willing to use at most maintenance_work_mem (or perhaps + * autovacuum_work_mem) memory space to keep track of dead tuples. We + * initially allocate an array of TIDs of that size, with an upper limit that + * depends on table size (this limit ensures we don't allocate a huge area + * uselessly for vacuuming small tables). 
If the array threatens to overflow,
+ * we suspend the heap scan phase and perform a pass of index cleanup and page
+ * compaction, then resume the heap scan with an empty TID array.
+ *
+ * If we're processing a table with no indexes, we can just vacuum each page
+ * as we go; there's no need to save up multiple tuples to minimize the number
+ * of index scans performed. So we don't use maintenance_work_mem memory for
+ * the TID array, just enough to hold as many heap tuples as fit on one page.
+ *
+ * Lazy vacuum supports parallel execution with parallel worker processes. In
+ * a parallel vacuum, we perform both index vacuum and index cleanup with
+ * parallel worker processes. Individual indexes are processed by one vacuum
+ * process. At the beginning of a lazy vacuum (at lazy_scan_heap) we prepare
+ * the parallel context and initialize the DSM segment that contains shared
+ * information as well as the memory space for storing dead tuples. When
+ * starting either index vacuum or index cleanup, we launch parallel worker
+ * processes. Once all indexes are processed the parallel worker processes
+ * exit. After that, the leader process re-initializes the parallel context
+ * so that it can use the same DSM for multiple passes of index vacuum and
+ * for performing index cleanup. For updating the index statistics, we need
+ * to update the system table and since updates are not allowed during
+ * parallel mode we update the index statistics after exiting from the
+ * parallel mode.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/heap/vacuumlazy.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <math.h>
+
+#include "access/amapi.h"
+#include "access/genam.h"
+#include "access/heapam.h"
+#include "access/heapam_xlog.h"
+#include "access/htup_details.h"
+#include "access/multixact.h"
+#include "access/parallel.h"
+#include "access/transam.h"
+#include "access/visibilitymap.h"
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "catalog/index.h"
+#include "catalog/storage.h"
+#include "commands/dbcommands.h"
+#include "commands/progress.h"
+#include "commands/vacuum.h"
+#include "executor/instrument.h"
+#include "miscadmin.h"
+#include "optimizer/paths.h"
+#include "pgstat.h"
+#include "portability/instr_time.h"
+#include "postmaster/autovacuum.h"
+#include "storage/bufmgr.h"
+#include "storage/freespace.h"
+#include "storage/lmgr.h"
+#include "tcop/tcopprot.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/pg_rusage.h"
+#include "utils/timestamp.h"
+
+
+/*
+ * Space/time tradeoff parameters: do these need to be user-tunable?
+ *
+ * To consider truncating the relation, we want there to be at least
+ * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
+ * is less) potentially-freeable pages.
+ */
+#define REL_TRUNCATE_MINIMUM 1000
+#define REL_TRUNCATE_FRACTION 16
+
+/*
+ * Timing parameters for truncate locking heuristics.
+ *
+ * These were not exposed as user tunable GUC values because it didn't seem
+ * that the potential for improvement was great enough to merit the cost of
+ * supporting them.
+ */ +#define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL 20 /* ms */ +#define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL 50 /* ms */ +#define VACUUM_TRUNCATE_LOCK_TIMEOUT 5000 /* ms */ + +/* + * Threshold that controls whether we bypass index vacuuming and heap + * vacuuming as an optimization + */ +#define BYPASS_THRESHOLD_PAGES 0.02 /* i.e. 2% of rel_pages */ + +/* + * Perform a failsafe check every 4GB during the heap scan, approximately + */ +#define FAILSAFE_EVERY_PAGES \ + ((BlockNumber) (((uint64) 4 * 1024 * 1024 * 1024) / BLCKSZ)) + +/* + * When a table has no indexes, vacuum the FSM after every 8GB, approximately + * (it won't be exact because we only vacuum FSM after processing a heap page + * that has some removable tuples). When there are indexes, this is ignored, + * and we vacuum FSM after each index/heap cleaning pass. + */ +#define VACUUM_FSM_EVERY_PAGES \ + ((BlockNumber) (((uint64) 8 * 1024 * 1024 * 1024) / BLCKSZ)) + +/* + * Guesstimation of number of dead tuples per page. This is used to + * provide an upper limit to memory allocated when vacuuming small + * tables. + */ +#define LAZY_ALLOC_TUPLES MaxHeapTuplesPerPage + +/* + * Before we consider skipping a page that's marked as clean in + * visibility map, we must've seen at least this many clean pages. + */ +#define SKIP_PAGES_THRESHOLD ((BlockNumber) 32) + +/* + * Size of the prefetch window for lazy vacuum backwards truncation scan. + * Needs to be a power of 2. + */ +#define PREFETCH_SIZE ((BlockNumber) 32) + +/* + * DSM keys for parallel vacuum. Unlike other parallel execution code, since + * we don't need to worry about DSM keys conflicting with plan_node_id we can + * use small integers. + */ +#define PARALLEL_VACUUM_KEY_SHARED 1 +#define PARALLEL_VACUUM_KEY_DEAD_TUPLES 2 +#define PARALLEL_VACUUM_KEY_QUERY_TEXT 3 +#define PARALLEL_VACUUM_KEY_BUFFER_USAGE 4 +#define PARALLEL_VACUUM_KEY_WAL_USAGE 5 + +/* + * Macro to check if we are in a parallel vacuum. If true, we are in the + * parallel mode and the DSM segment is initialized. + */ +#define ParallelVacuumIsActive(vacrel) ((vacrel)->lps != NULL) + +/* Phases of vacuum during which we report error context. */ +typedef enum +{ + VACUUM_ERRCB_PHASE_UNKNOWN, + VACUUM_ERRCB_PHASE_SCAN_HEAP, + VACUUM_ERRCB_PHASE_VACUUM_INDEX, + VACUUM_ERRCB_PHASE_VACUUM_HEAP, + VACUUM_ERRCB_PHASE_INDEX_CLEANUP, + VACUUM_ERRCB_PHASE_TRUNCATE +} VacErrPhase; + +/* + * LVDeadTuples stores the dead tuple TIDs collected during the heap scan. + * This is allocated in the DSM segment in parallel mode and in local memory + * in non-parallel mode. + */ +typedef struct LVDeadTuples +{ + int max_tuples; /* # slots allocated in array */ + int num_tuples; /* current # of entries */ + /* List of TIDs of tuples we intend to delete */ + /* NB: this list is ordered by TID address */ + ItemPointerData itemptrs[FLEXIBLE_ARRAY_MEMBER]; /* array of + * ItemPointerData */ +} LVDeadTuples; + +/* The dead tuple space consists of LVDeadTuples and dead tuple TIDs */ +#define SizeOfDeadTuples(cnt) \ + add_size(offsetof(LVDeadTuples, itemptrs), \ + mul_size(sizeof(ItemPointerData), cnt)) +#define MAXDEADTUPLES(max_size) \ + (((max_size) - offsetof(LVDeadTuples, itemptrs)) / sizeof(ItemPointerData)) + +/* + * Shared information among parallel workers. So this is allocated in the DSM + * segment. + */ +typedef struct LVShared +{ + /* + * Target table relid and log level. These fields are not modified during + * the lazy vacuum. 
+ */ + Oid relid; + int elevel; + + /* + * An indication for vacuum workers to perform either index vacuum or + * index cleanup. first_time is true only if for_cleanup is true and + * bulk-deletion is not performed yet. + */ + bool for_cleanup; + bool first_time; + + /* + * Fields for both index vacuum and cleanup. + * + * reltuples is the total number of input heap tuples. We set either old + * live tuples in the index vacuum case or the new live tuples in the + * index cleanup case. + * + * estimated_count is true if reltuples is an estimated value. (Note that + * reltuples could be -1 in this case, indicating we have no idea.) + */ + double reltuples; + bool estimated_count; + + /* + * In single process lazy vacuum we could consume more memory during index + * vacuuming or cleanup apart from the memory for heap scanning. In + * parallel vacuum, since individual vacuum workers can consume memory + * equal to maintenance_work_mem, the new maintenance_work_mem for each + * worker is set such that the parallel operation doesn't consume more + * memory than single process lazy vacuum. + */ + int maintenance_work_mem_worker; + + /* + * Shared vacuum cost balance. During parallel vacuum, + * VacuumSharedCostBalance points to this value and it accumulates the + * balance of each parallel vacuum worker. + */ + pg_atomic_uint32 cost_balance; + + /* + * Number of active parallel workers. This is used for computing the + * minimum threshold of the vacuum cost balance before a worker sleeps for + * cost-based delay. + */ + pg_atomic_uint32 active_nworkers; + + /* + * Variables to control parallel vacuum. We have a bitmap to indicate + * which index has stats in shared memory. The set bit in the map + * indicates that the particular index supports a parallel vacuum. + */ + pg_atomic_uint32 idx; /* counter for vacuuming and clean up */ + uint32 offset; /* sizeof header incl. bitmap */ + bits8 bitmap[FLEXIBLE_ARRAY_MEMBER]; /* bit map of NULLs */ + + /* Shared index statistics data follows at end of struct */ +} LVShared; + +#define SizeOfLVShared (offsetof(LVShared, bitmap) + sizeof(bits8)) +#define GetSharedIndStats(s) \ + ((LVSharedIndStats *)((char *)(s) + ((LVShared *)(s))->offset)) +#define IndStatsIsNull(s, i) \ + (!(((LVShared *)(s))->bitmap[(i) >> 3] & (1 << ((i) & 0x07)))) + +/* + * Struct for an index bulk-deletion statistic used for parallel vacuum. This + * is allocated in the DSM segment. + */ +typedef struct LVSharedIndStats +{ + bool updated; /* are the stats updated? */ + IndexBulkDeleteResult istat; +} LVSharedIndStats; + +/* Struct for maintaining a parallel vacuum state. */ +typedef struct LVParallelState +{ + ParallelContext *pcxt; + + /* Shared information among parallel vacuum workers */ + LVShared *lvshared; + + /* Points to buffer usage area in DSM */ + BufferUsage *buffer_usage; + + /* Points to WAL usage area in DSM */ + WalUsage *wal_usage; + + /* + * The number of indexes that support parallel index bulk-deletion and + * parallel index cleanup respectively. + */ + int nindexes_parallel_bulkdel; + int nindexes_parallel_cleanup; + int nindexes_parallel_condcleanup; +} LVParallelState; + +typedef struct LVRelState +{ + /* Target heap relation and its indexes */ + Relation rel; + Relation *indrels; + int nindexes; + + /* Wraparound failsafe has been triggered? */ + bool failsafe_active; + /* Consider index vacuuming bypass optimization? */ + bool consider_bypass_optimization; + + /* Doing index vacuuming, index cleanup, rel truncation? 
*/ + bool do_index_vacuuming; + bool do_index_cleanup; + bool do_rel_truncate; + + /* Buffer access strategy and parallel state */ + BufferAccessStrategy bstrategy; + LVParallelState *lps; + + /* Statistics from pg_class when we start out */ + BlockNumber old_rel_pages; /* previous value of pg_class.relpages */ + double old_live_tuples; /* previous value of pg_class.reltuples */ + /* rel's initial relfrozenxid and relminmxid */ + TransactionId relfrozenxid; + MultiXactId relminmxid; + + /* VACUUM operation's cutoff for pruning */ + TransactionId OldestXmin; + /* VACUUM operation's cutoff for freezing XIDs and MultiXactIds */ + TransactionId FreezeLimit; + MultiXactId MultiXactCutoff; + + /* Error reporting state */ + char *relnamespace; + char *relname; + char *indname; + BlockNumber blkno; /* used only for heap operations */ + OffsetNumber offnum; /* used only for heap operations */ + VacErrPhase phase; + + /* + * State managed by lazy_scan_heap() follows + */ + LVDeadTuples *dead_tuples; /* items to vacuum from indexes */ + BlockNumber rel_pages; /* total number of pages */ + BlockNumber scanned_pages; /* number of pages we examined */ + BlockNumber pinskipped_pages; /* # of pages skipped due to a pin */ + BlockNumber frozenskipped_pages; /* # of frozen pages we skipped */ + BlockNumber tupcount_pages; /* pages whose tuples we counted */ + BlockNumber pages_removed; /* pages remove by truncation */ + BlockNumber lpdead_item_pages; /* # pages with LP_DEAD items */ + BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */ + + /* Statistics output by us, for table */ + double new_rel_tuples; /* new estimated total # of tuples */ + double new_live_tuples; /* new estimated total # of live tuples */ + /* Statistics output by index AMs */ + IndexBulkDeleteResult **indstats; + + /* Instrumentation counters */ + int num_index_scans; + int64 tuples_deleted; /* # deleted from table */ + int64 lpdead_items; /* # deleted from indexes */ + int64 new_dead_tuples; /* new estimated total # of dead items in + * table */ + int64 num_tuples; /* total number of nonremovable tuples */ + int64 live_tuples; /* live tuples (reltuples estimate) */ +} LVRelState; + +/* + * State returned by lazy_scan_prune() + */ +typedef struct LVPagePruneState +{ + bool hastup; /* Page is truncatable? */ + bool has_lpdead_items; /* includes existing LP_DEAD items */ + + /* + * State describes the proper VM bit states to set for the page following + * pruning and freezing. all_visible implies !has_lpdead_items, but don't + * trust all_frozen result unless all_visible is also set to true. + */ + bool all_visible; /* Every item visible to all? */ + bool all_frozen; /* provided all_visible is also true */ + TransactionId visibility_cutoff_xid; /* For recovery conflicts */ +} LVPagePruneState; + +/* Struct for saving and restoring vacuum error information. 
*/ +typedef struct LVSavedErrInfo +{ + BlockNumber blkno; + OffsetNumber offnum; + VacErrPhase phase; +} LVSavedErrInfo; + +/* elevel controls whole VACUUM's verbosity */ +static int elevel = -1; + + +/* non-export function prototypes */ +static void lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, + bool aggressive); +static void lazy_scan_prune(LVRelState *vacrel, Buffer buf, + BlockNumber blkno, Page page, + GlobalVisState *vistest, + LVPagePruneState *prunestate); +static void lazy_vacuum(LVRelState *vacrel); +static bool lazy_vacuum_all_indexes(LVRelState *vacrel); +static void lazy_vacuum_heap_rel(LVRelState *vacrel); +static int lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, + Buffer buffer, int tupindex, Buffer *vmbuffer); +static bool lazy_check_needs_freeze(Buffer buf, bool *hastup, + LVRelState *vacrel); +static bool lazy_check_wraparound_failsafe(LVRelState *vacrel); +static void do_parallel_lazy_vacuum_all_indexes(LVRelState *vacrel); +static void do_parallel_lazy_cleanup_all_indexes(LVRelState *vacrel); +static void do_parallel_vacuum_or_cleanup(LVRelState *vacrel, int nworkers); +static void do_parallel_processing(LVRelState *vacrel, + LVShared *lvshared); +static void do_serial_processing_for_unsafe_indexes(LVRelState *vacrel, + LVShared *lvshared); +static IndexBulkDeleteResult *parallel_process_one_index(Relation indrel, + IndexBulkDeleteResult *istat, + LVShared *lvshared, + LVSharedIndStats *shared_indstats, + LVRelState *vacrel); +static void lazy_cleanup_all_indexes(LVRelState *vacrel); +static IndexBulkDeleteResult *lazy_vacuum_one_index(Relation indrel, + IndexBulkDeleteResult *istat, + double reltuples, + LVRelState *vacrel); +static IndexBulkDeleteResult *lazy_cleanup_one_index(Relation indrel, + IndexBulkDeleteResult *istat, + double reltuples, + bool estimated_count, + LVRelState *vacrel); +static bool should_attempt_truncation(LVRelState *vacrel); +static void lazy_truncate_heap(LVRelState *vacrel); +static BlockNumber count_nondeletable_pages(LVRelState *vacrel, + bool *lock_waiter_detected); +static long compute_max_dead_tuples(BlockNumber relblocks, bool hasindex); +static void lazy_space_alloc(LVRelState *vacrel, int nworkers, + BlockNumber relblocks); +static void lazy_space_free(LVRelState *vacrel); +static bool lazy_tid_reaped(ItemPointer itemptr, void *state); +static int vac_cmp_itemptr(const void *left, const void *right); +static bool heap_page_is_all_visible(LVRelState *vacrel, Buffer buf, + TransactionId *visibility_cutoff_xid, bool *all_frozen); +static int compute_parallel_vacuum_workers(LVRelState *vacrel, + int nrequested, + bool *will_parallel_vacuum); +static void update_index_statistics(LVRelState *vacrel); +static LVParallelState *begin_parallel_vacuum(LVRelState *vacrel, + BlockNumber nblocks, + int nrequested); +static void end_parallel_vacuum(LVRelState *vacrel); +static LVSharedIndStats *parallel_stats_for_idx(LVShared *lvshared, int getidx); +static bool parallel_processing_is_safe(Relation indrel, LVShared *lvshared); +static void vacuum_error_callback(void *arg); +static void update_vacuum_error_info(LVRelState *vacrel, + LVSavedErrInfo *saved_vacrel, + int phase, BlockNumber blkno, + OffsetNumber offnum); +static void restore_vacuum_error_info(LVRelState *vacrel, + const LVSavedErrInfo *saved_vacrel); + + +/* + * heap_vacuum_rel() -- perform VACUUM for one heap relation + * + * This routine vacuums a single heap, cleans out its indexes, and + * updates its relpages and reltuples statistics. 
+ * + * At entry, we have already established a transaction and opened + * and locked the relation. + */ +void +heap_vacuum_rel(Relation rel, VacuumParams *params, + BufferAccessStrategy bstrategy) +{ + LVRelState *vacrel; + PGRUsage ru0; + TimestampTz starttime = 0; + WalUsage walusage_start = pgWalUsage; + WalUsage walusage = {0, 0, 0}; + long secs; + int usecs; + double read_rate, + write_rate; + bool aggressive; /* should we scan all unfrozen pages? */ + bool scanned_all_unfrozen; /* actually scanned all such pages? */ + char **indnames = NULL; + TransactionId xidFullScanLimit; + MultiXactId mxactFullScanLimit; + BlockNumber new_rel_pages; + BlockNumber new_rel_allvisible; + double new_live_tuples; + TransactionId new_frozen_xid; + MultiXactId new_min_multi; + ErrorContextCallback errcallback; + PgStat_Counter startreadtime = 0; + PgStat_Counter startwritetime = 0; + TransactionId OldestXmin; + TransactionId FreezeLimit; + MultiXactId MultiXactCutoff; + + /* measure elapsed time iff autovacuum logging requires it */ + if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0) + { + pg_rusage_init(&ru0); + starttime = GetCurrentTimestamp(); + if (track_io_timing) + { + startreadtime = pgStatBlockReadTime; + startwritetime = pgStatBlockWriteTime; + } + } + + if (params->options & VACOPT_VERBOSE) + elevel = INFO; + else + elevel = DEBUG2; + + pgstat_progress_start_command(PROGRESS_COMMAND_VACUUM, + RelationGetRelid(rel)); + + vacuum_set_xid_limits(rel, + params->freeze_min_age, + params->freeze_table_age, + params->multixact_freeze_min_age, + params->multixact_freeze_table_age, + &OldestXmin, &FreezeLimit, &xidFullScanLimit, + &MultiXactCutoff, &mxactFullScanLimit); + + /* + * We request an aggressive scan if the table's frozen Xid is now older + * than or equal to the requested Xid full-table scan limit; or if the + * table's minimum MultiXactId is older than or equal to the requested + * mxid full-table scan limit; or if DISABLE_PAGE_SKIPPING was specified. + */ + aggressive = TransactionIdPrecedesOrEquals(rel->rd_rel->relfrozenxid, + xidFullScanLimit); + aggressive |= MultiXactIdPrecedesOrEquals(rel->rd_rel->relminmxid, + mxactFullScanLimit); + if (params->options & VACOPT_DISABLE_PAGE_SKIPPING) + aggressive = true; + + vacrel = (LVRelState *) palloc0(sizeof(LVRelState)); + + /* Set up high level stuff about rel */ + vacrel->rel = rel; + vac_open_indexes(vacrel->rel, RowExclusiveLock, &vacrel->nindexes, + &vacrel->indrels); + vacrel->failsafe_active = false; + vacrel->consider_bypass_optimization = true; + + /* + * The index_cleanup param either disables index vacuuming and cleanup or + * forces it to go ahead when we would otherwise apply the index bypass + * optimization. The default is 'auto', which leaves the final decision + * up to lazy_vacuum(). + * + * The truncate param allows user to avoid attempting relation truncation, + * though it can't force truncation to happen. + */ + Assert(params->index_cleanup != VACOPTVALUE_UNSPECIFIED); + Assert(params->truncate != VACOPTVALUE_UNSPECIFIED && + params->truncate != VACOPTVALUE_AUTO); + vacrel->do_index_vacuuming = true; + vacrel->do_index_cleanup = true; + vacrel->do_rel_truncate = (params->truncate != VACOPTVALUE_DISABLED); + if (params->index_cleanup == VACOPTVALUE_DISABLED) + { + /* Force disable index vacuuming up-front */ + vacrel->do_index_vacuuming = false; + vacrel->do_index_cleanup = false; + } + else if (params->index_cleanup == VACOPTVALUE_ENABLED) + { + /* Force index vacuuming. 
Note that failsafe can still bypass. */ + vacrel->consider_bypass_optimization = false; + } + else + { + /* Default/auto, make all decisions dynamically */ + Assert(params->index_cleanup == VACOPTVALUE_AUTO); + } + + vacrel->bstrategy = bstrategy; + vacrel->old_rel_pages = rel->rd_rel->relpages; + vacrel->old_live_tuples = rel->rd_rel->reltuples; + vacrel->relfrozenxid = rel->rd_rel->relfrozenxid; + vacrel->relminmxid = rel->rd_rel->relminmxid; + + /* Set cutoffs for entire VACUUM */ + vacrel->OldestXmin = OldestXmin; + vacrel->FreezeLimit = FreezeLimit; + vacrel->MultiXactCutoff = MultiXactCutoff; + + vacrel->relnamespace = get_namespace_name(RelationGetNamespace(rel)); + vacrel->relname = pstrdup(RelationGetRelationName(rel)); + vacrel->indname = NULL; + vacrel->phase = VACUUM_ERRCB_PHASE_UNKNOWN; + + /* Save index names iff autovacuum logging requires it */ + if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0 && + vacrel->nindexes > 0) + { + indnames = palloc(sizeof(char *) * vacrel->nindexes); + for (int i = 0; i < vacrel->nindexes; i++) + indnames[i] = + pstrdup(RelationGetRelationName(vacrel->indrels[i])); + } + + /* + * Setup error traceback support for ereport(). The idea is to set up an + * error context callback to display additional information on any error + * during a vacuum. During different phases of vacuum (heap scan, heap + * vacuum, index vacuum, index clean up, heap truncate), we update the + * error context callback to display appropriate information. + * + * Note that the index vacuum and heap vacuum phases may be called + * multiple times in the middle of the heap scan phase. So the old phase + * information is restored at the end of those phases. + */ + errcallback.callback = vacuum_error_callback; + errcallback.arg = vacrel; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* Do the vacuuming */ + lazy_scan_heap(vacrel, params, aggressive); + + /* Done with indexes */ + vac_close_indexes(vacrel->nindexes, vacrel->indrels, NoLock); + + /* + * Compute whether we actually scanned the all unfrozen pages. If we did, + * we can adjust relfrozenxid and relminmxid. + * + * NB: We need to check this before truncating the relation, because that + * will change ->rel_pages. + */ + if ((vacrel->scanned_pages + vacrel->frozenskipped_pages) + < vacrel->rel_pages) + { + Assert(!aggressive); + scanned_all_unfrozen = false; + } + else + scanned_all_unfrozen = true; + + /* + * Optionally truncate the relation. + */ + if (should_attempt_truncation(vacrel)) + { + /* + * Update error traceback information. This is the last phase during + * which we add context information to errors, so we don't need to + * revert to the previous phase. + */ + update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_TRUNCATE, + vacrel->nonempty_pages, + InvalidOffsetNumber); + lazy_truncate_heap(vacrel); + } + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; + + /* Report that we are now doing final cleanup */ + pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, + PROGRESS_VACUUM_PHASE_FINAL_CLEANUP); + + /* + * Update statistics in pg_class. + * + * In principle new_live_tuples could be -1 indicating that we (still) + * don't know the tuple count. In practice that probably can't happen, + * since we'd surely have scanned some pages if the table is new and + * nonempty. + * + * For safety, clamp relallvisible to be not more than what we're setting + * relpages to. 
+ * + * Also, don't change relfrozenxid/relminmxid if we skipped any pages, + * since then we don't know for certain that all tuples have a newer xmin. + */ + new_rel_pages = vacrel->rel_pages; + new_live_tuples = vacrel->new_live_tuples; + + visibilitymap_count(rel, &new_rel_allvisible, NULL); + if (new_rel_allvisible > new_rel_pages) + new_rel_allvisible = new_rel_pages; + + new_frozen_xid = scanned_all_unfrozen ? FreezeLimit : InvalidTransactionId; + new_min_multi = scanned_all_unfrozen ? MultiXactCutoff : InvalidMultiXactId; + + vac_update_relstats(rel, + new_rel_pages, + new_live_tuples, + new_rel_allvisible, + vacrel->nindexes > 0, + new_frozen_xid, + new_min_multi, + false); + + /* + * Report results to the stats collector, too. + * + * Deliberately avoid telling the stats collector about LP_DEAD items that + * remain in the table due to VACUUM bypassing index and heap vacuuming. + * ANALYZE will consider the remaining LP_DEAD items to be dead tuples. It + * seems like a good idea to err on the side of not vacuuming again too + * soon in cases where the failsafe prevented significant amounts of heap + * vacuuming. + */ + pgstat_report_vacuum(RelationGetRelid(rel), + rel->rd_rel->relisshared, + Max(new_live_tuples, 0), + vacrel->new_dead_tuples); + pgstat_progress_end_command(); + + /* and log the action if appropriate */ + if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0) + { + TimestampTz endtime = GetCurrentTimestamp(); + + if (params->log_min_duration == 0 || + TimestampDifferenceExceeds(starttime, endtime, + params->log_min_duration)) + { + StringInfoData buf; + char *msgfmt; + BlockNumber orig_rel_pages; + + TimestampDifference(starttime, endtime, &secs, &usecs); + + memset(&walusage, 0, sizeof(WalUsage)); + WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start); + + read_rate = 0; + write_rate = 0; + if ((secs > 0) || (usecs > 0)) + { + read_rate = (double) BLCKSZ * VacuumPageMiss / (1024 * 1024) / + (secs + usecs / 1000000.0); + write_rate = (double) BLCKSZ * VacuumPageDirty / (1024 * 1024) / + (secs + usecs / 1000000.0); + } + + /* + * This is pretty messy, but we split it up so that we can skip + * emitting individual parts of the message when not applicable. + */ + initStringInfo(&buf); + if (params->is_wraparound) + { + /* + * While it's possible for a VACUUM to be both is_wraparound + * and !aggressive, that's just a corner-case -- is_wraparound + * implies aggressive. Produce distinct output for the corner + * case all the same, just in case. 
+ */ + if (aggressive) + msgfmt = _("automatic aggressive vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n"); + else + msgfmt = _("automatic vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n"); + } + else + { + if (aggressive) + msgfmt = _("automatic aggressive vacuum of table \"%s.%s.%s\": index scans: %d\n"); + else + msgfmt = _("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n"); + } + appendStringInfo(&buf, msgfmt, + get_database_name(MyDatabaseId), + vacrel->relnamespace, + vacrel->relname, + vacrel->num_index_scans); + appendStringInfo(&buf, _("pages: %u removed, %u remain, %u skipped due to pins, %u skipped frozen\n"), + vacrel->pages_removed, + vacrel->rel_pages, + vacrel->pinskipped_pages, + vacrel->frozenskipped_pages); + appendStringInfo(&buf, + _("tuples: %lld removed, %lld remain, %lld are dead but not yet removable, oldest xmin: %u\n"), + (long long) vacrel->tuples_deleted, + (long long) vacrel->new_rel_tuples, + (long long) vacrel->new_dead_tuples, + OldestXmin); + orig_rel_pages = vacrel->rel_pages + vacrel->pages_removed; + if (orig_rel_pages > 0) + { + if (vacrel->do_index_vacuuming) + { + if (vacrel->nindexes == 0 || vacrel->num_index_scans == 0) + appendStringInfoString(&buf, _("index scan not needed: ")); + else + appendStringInfoString(&buf, _("index scan needed: ")); + + msgfmt = _("%u pages from table (%.2f%% of total) had %lld dead item identifiers removed\n"); + } + else + { + if (!vacrel->failsafe_active) + appendStringInfoString(&buf, _("index scan bypassed: ")); + else + appendStringInfoString(&buf, _("index scan bypassed by failsafe: ")); + + msgfmt = _("%u pages from table (%.2f%% of total) have %lld dead item identifiers\n"); + } + appendStringInfo(&buf, msgfmt, + vacrel->lpdead_item_pages, + 100.0 * vacrel->lpdead_item_pages / orig_rel_pages, + (long long) vacrel->lpdead_items); + } + for (int i = 0; i < vacrel->nindexes; i++) + { + IndexBulkDeleteResult *istat = vacrel->indstats[i]; + + if (!istat) + continue; + + appendStringInfo(&buf, + _("index \"%s\": pages: %u in total, %u newly deleted, %u currently deleted, %u reusable\n"), + indnames[i], + istat->num_pages, + istat->pages_newly_deleted, + istat->pages_deleted, + istat->pages_free); + } + if (track_io_timing) + { + double read_ms = (double) (pgStatBlockReadTime - startreadtime) / 1000; + double write_ms = (double) (pgStatBlockWriteTime - startwritetime) / 1000; + + appendStringInfo(&buf, _("I/O timings: read: %.3f ms, write: %.3f ms\n"), + read_ms, write_ms); + } + appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"), + read_rate, write_rate); + appendStringInfo(&buf, + _("buffer usage: %lld hits, %lld misses, %lld dirtied\n"), + (long long) VacuumPageHit, + (long long) VacuumPageMiss, + (long long) VacuumPageDirty); + appendStringInfo(&buf, + _("WAL usage: %lld records, %lld full page images, %llu bytes\n"), + (long long) walusage.wal_records, + (long long) walusage.wal_fpi, + (unsigned long long) walusage.wal_bytes); + appendStringInfo(&buf, _("system usage: %s"), pg_rusage_show(&ru0)); + + ereport(LOG, + (errmsg_internal("%s", buf.data))); + pfree(buf.data); + } + } + + /* Cleanup index statistics and index names */ + for (int i = 0; i < vacrel->nindexes; i++) + { + if (vacrel->indstats[i]) + pfree(vacrel->indstats[i]); + + if (indnames && indnames[i]) + pfree(indnames[i]); + } +} + +/* + * lazy_scan_heap() -- scan an open heap relation + * + * This routine prunes each page in the heap, which will among other + * 
things truncate dead tuples to dead line pointers, defragment the + * page, and set commit status bits (see heap_page_prune). It also builds + * lists of dead tuples and pages with free space, calculates statistics + * on the number of live tuples in the heap, and marks pages as + * all-visible if appropriate. When done, or when we run low on space + * for dead-tuple TIDs, invoke lazy_vacuum to vacuum indexes and vacuum + * heap relation during its own second pass over the heap. + * + * If the table has at least two indexes, we execute both index vacuum + * and index cleanup with parallel workers unless parallel vacuum is + * disabled. In a parallel vacuum, we enter parallel mode and then + * create both the parallel context and the DSM segment before starting + * heap scan so that we can record dead tuples to the DSM segment. All + * parallel workers are launched at beginning of index vacuuming and + * index cleanup and they exit once done with all indexes. At the end of + * this function we exit from parallel mode. Index bulk-deletion results + * are stored in the DSM segment and we update index statistics for all + * the indexes after exiting from parallel mode since writes are not + * allowed during parallel mode. + * + * If there are no indexes then we can reclaim line pointers on the fly; + * dead line pointers need only be retained until all index pointers that + * reference them have been killed. + */ +static void +lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) +{ + LVDeadTuples *dead_tuples; + BlockNumber nblocks, + blkno, + next_unskippable_block, + next_failsafe_block, + next_fsm_block_to_vacuum; + PGRUsage ru0; + Buffer vmbuffer = InvalidBuffer; + bool skipping_blocks; + StringInfoData buf; + const int initprog_index[] = { + PROGRESS_VACUUM_PHASE, + PROGRESS_VACUUM_TOTAL_HEAP_BLKS, + PROGRESS_VACUUM_MAX_DEAD_TUPLES + }; + int64 initprog_val[3]; + GlobalVisState *vistest; + + pg_rusage_init(&ru0); + + if (aggressive) + ereport(elevel, + (errmsg("aggressively vacuuming \"%s.%s\"", + vacrel->relnamespace, + vacrel->relname))); + else + ereport(elevel, + (errmsg("vacuuming \"%s.%s\"", + vacrel->relnamespace, + vacrel->relname))); + + nblocks = RelationGetNumberOfBlocks(vacrel->rel); + next_unskippable_block = 0; + next_failsafe_block = 0; + next_fsm_block_to_vacuum = 0; + vacrel->rel_pages = nblocks; + vacrel->scanned_pages = 0; + vacrel->pinskipped_pages = 0; + vacrel->frozenskipped_pages = 0; + vacrel->tupcount_pages = 0; + vacrel->pages_removed = 0; + vacrel->lpdead_item_pages = 0; + vacrel->nonempty_pages = 0; + + /* Initialize instrumentation counters */ + vacrel->num_index_scans = 0; + vacrel->tuples_deleted = 0; + vacrel->lpdead_items = 0; + vacrel->new_dead_tuples = 0; + vacrel->num_tuples = 0; + vacrel->live_tuples = 0; + + vistest = GlobalVisTestFor(vacrel->rel); + + vacrel->indstats = (IndexBulkDeleteResult **) + palloc0(vacrel->nindexes * sizeof(IndexBulkDeleteResult *)); + + /* + * Before beginning scan, check if it's already necessary to apply + * failsafe + */ + lazy_check_wraparound_failsafe(vacrel); + + /* + * Allocate the space for dead tuples. Note that this handles parallel + * VACUUM initialization as part of allocating shared memory space used + * for dead_tuples. 
+ */ + lazy_space_alloc(vacrel, params->nworkers, nblocks); + dead_tuples = vacrel->dead_tuples; + + /* Report that we're scanning the heap, advertising total # of blocks */ + initprog_val[0] = PROGRESS_VACUUM_PHASE_SCAN_HEAP; + initprog_val[1] = nblocks; + initprog_val[2] = dead_tuples->max_tuples; + pgstat_progress_update_multi_param(3, initprog_index, initprog_val); + + /* + * Except when aggressive is set, we want to skip pages that are + * all-visible according to the visibility map, but only when we can skip + * at least SKIP_PAGES_THRESHOLD consecutive pages. Since we're reading + * sequentially, the OS should be doing readahead for us, so there's no + * gain in skipping a page now and then; that's likely to disable + * readahead and so be counterproductive. Also, skipping even a single + * page means that we can't update relfrozenxid, so we only want to do it + * if we can skip a goodly number of pages. + * + * When aggressive is set, we can't skip pages just because they are + * all-visible, but we can still skip pages that are all-frozen, since + * such pages do not need freezing and do not affect the value that we can + * safely set for relfrozenxid or relminmxid. + * + * Before entering the main loop, establish the invariant that + * next_unskippable_block is the next block number >= blkno that we can't + * skip based on the visibility map, either all-visible for a regular scan + * or all-frozen for an aggressive scan. We set it to nblocks if there's + * no such block. We also set up the skipping_blocks flag correctly at + * this stage. + * + * Note: The value returned by visibilitymap_get_status could be slightly + * out-of-date, since we make this test before reading the corresponding + * heap page or locking the buffer. This is OK. If we mistakenly think + * that the page is all-visible or all-frozen when in fact the flag's just + * been cleared, we might fail to vacuum the page. It's easy to see that + * skipping a page when aggressive is not set is not a very big deal; we + * might leave some dead tuples lying around, but the next vacuum will + * find them. But even when aggressive *is* set, it's still OK if we miss + * a page whose all-frozen marking has just been cleared. Any new XIDs + * just added to that page are necessarily newer than the GlobalXmin we + * computed, so they'll have no effect on the value to which we can safely + * set relfrozenxid. A similar argument applies for MXIDs and relminmxid. + * + * We will scan the table's last page, at least to the extent of + * determining whether it has tuples or not, even if it should be skipped + * according to the above rules; except when we've already determined that + * it's not worth trying to truncate the table. This avoids having + * lazy_truncate_heap() take access-exclusive lock on the table to attempt + * a truncation that just fails immediately because there are tuples in + * the last page. This is worth avoiding mainly because such a lock must + * be replayed on any hot standby, where it can be disruptive. 
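+ * Note that pages we do end up skipping are still accounted for: any page
+ * skipped while it was (or may have been) all-frozen is counted in
+ * frozenskipped_pages in the loop below, which is what later allows
+ * relfrozenxid and relminmxid to be advanced despite the skipped pages.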
+ */ + if ((params->options & VACOPT_DISABLE_PAGE_SKIPPING) == 0) + { + while (next_unskippable_block < nblocks) + { + uint8 vmstatus; + + vmstatus = visibilitymap_get_status(vacrel->rel, + next_unskippable_block, + &vmbuffer); + if (aggressive) + { + if ((vmstatus & VISIBILITYMAP_ALL_FROZEN) == 0) + break; + } + else + { + if ((vmstatus & VISIBILITYMAP_ALL_VISIBLE) == 0) + break; + } + vacuum_delay_point(); + next_unskippable_block++; + } + } + + if (next_unskippable_block >= SKIP_PAGES_THRESHOLD) + skipping_blocks = true; + else + skipping_blocks = false; + + for (blkno = 0; blkno < nblocks; blkno++) + { + Buffer buf; + Page page; + bool all_visible_according_to_vm = false; + LVPagePruneState prunestate; + + /* + * Consider need to skip blocks. See note above about forcing + * scanning of last page. + */ +#define FORCE_CHECK_PAGE() \ + (blkno == nblocks - 1 && should_attempt_truncation(vacrel)) + + pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno); + + update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_SCAN_HEAP, + blkno, InvalidOffsetNumber); + + if (blkno == next_unskippable_block) + { + /* Time to advance next_unskippable_block */ + next_unskippable_block++; + if ((params->options & VACOPT_DISABLE_PAGE_SKIPPING) == 0) + { + while (next_unskippable_block < nblocks) + { + uint8 vmskipflags; + + vmskipflags = visibilitymap_get_status(vacrel->rel, + next_unskippable_block, + &vmbuffer); + if (aggressive) + { + if ((vmskipflags & VISIBILITYMAP_ALL_FROZEN) == 0) + break; + } + else + { + if ((vmskipflags & VISIBILITYMAP_ALL_VISIBLE) == 0) + break; + } + vacuum_delay_point(); + next_unskippable_block++; + } + } + + /* + * We know we can't skip the current block. But set up + * skipping_blocks to do the right thing at the following blocks. + */ + if (next_unskippable_block - blkno > SKIP_PAGES_THRESHOLD) + skipping_blocks = true; + else + skipping_blocks = false; + + /* + * Normally, the fact that we can't skip this block must mean that + * it's not all-visible. But in an aggressive vacuum we know only + * that it's not all-frozen, so it might still be all-visible. + */ + if (aggressive && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer)) + all_visible_according_to_vm = true; + } + else + { + /* + * The current block is potentially skippable; if we've seen a + * long enough run of skippable blocks to justify skipping it, and + * we're not forced to check it, then go ahead and skip. + * Otherwise, the page must be at least all-visible if not + * all-frozen, so we can set all_visible_according_to_vm = true. + */ + if (skipping_blocks && !FORCE_CHECK_PAGE()) + { + /* + * Tricky, tricky. If this is in aggressive vacuum, the page + * must have been all-frozen at the time we checked whether it + * was skippable, but it might not be any more. We must be + * careful to count it as a skipped all-frozen page in that + * case, or else we'll think we can't update relfrozenxid and + * relminmxid. If it's not an aggressive vacuum, we don't + * know whether it was all-frozen, so we have to recheck; but + * in this case an approximate answer is OK. + */ + if (aggressive || VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer)) + vacrel->frozenskipped_pages++; + continue; + } + all_visible_according_to_vm = true; + } + + vacuum_delay_point(); + + /* + * Regularly check if wraparound failsafe should trigger. + * + * There is a similar check inside lazy_vacuum_all_indexes(), but + * relfrozenxid might start to look dangerously old before we reach + * that point. 
This check also provides failsafe coverage for the + * one-pass strategy, and the two-pass strategy with the index_cleanup + * param set to 'off'. + */ + if (blkno - next_failsafe_block >= FAILSAFE_EVERY_PAGES) + { + lazy_check_wraparound_failsafe(vacrel); + next_failsafe_block = blkno; + } + + /* + * Consider if we definitely have enough space to process TIDs on page + * already. If we are close to overrunning the available space for + * dead-tuple TIDs, pause and do a cycle of vacuuming before we tackle + * this page. + */ + if ((dead_tuples->max_tuples - dead_tuples->num_tuples) < MaxHeapTuplesPerPage && + dead_tuples->num_tuples > 0) + { + /* + * Before beginning index vacuuming, we release any pin we may + * hold on the visibility map page. This isn't necessary for + * correctness, but we do it anyway to avoid holding the pin + * across a lengthy, unrelated operation. + */ + if (BufferIsValid(vmbuffer)) + { + ReleaseBuffer(vmbuffer); + vmbuffer = InvalidBuffer; + } + + /* Remove the collected garbage tuples from table and indexes */ + vacrel->consider_bypass_optimization = false; + lazy_vacuum(vacrel); + + /* + * Vacuum the Free Space Map to make newly-freed space visible on + * upper-level FSM pages. Note we have not yet processed blkno. + */ + FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, + blkno); + next_fsm_block_to_vacuum = blkno; + + /* Report that we are once again scanning the heap */ + pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, + PROGRESS_VACUUM_PHASE_SCAN_HEAP); + } + + /* + * Set up visibility map page as needed. + * + * Pin the visibility map page in case we need to mark the page + * all-visible. In most cases this will be very cheap, because we'll + * already have the correct page pinned anyway. However, it's + * possible that (a) next_unskippable_block is covered by a different + * VM page than the current block or (b) we released our pin and did a + * cycle of index vacuuming. + */ + visibilitymap_pin(vacrel->rel, blkno, &vmbuffer); + + buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, + RBM_NORMAL, vacrel->bstrategy); + + /* + * We need buffer cleanup lock so that we can prune HOT chains and + * defragment the page. + */ + if (!ConditionalLockBufferForCleanup(buf)) + { + bool hastup; + + /* + * If we're not performing an aggressive scan to guard against XID + * wraparound, and we don't want to forcibly check the page, then + * it's OK to skip vacuuming pages we get a lock conflict on. They + * will be dealt with in some future vacuum. + */ + if (!aggressive && !FORCE_CHECK_PAGE()) + { + ReleaseBuffer(buf); + vacrel->pinskipped_pages++; + continue; + } + + /* + * Read the page with share lock to see if any xids on it need to + * be frozen. If not we just skip the page, after updating our + * scan statistics. If there are some, we wait for cleanup lock. + * + * We could defer the lock request further by remembering the page + * and coming back to it later, or we could even register + * ourselves for multiple buffers and then service whichever one + * is received first. For now, this seems good enough. + * + * If we get here with aggressive false, then we're just forcibly + * checking the page, and so we don't want to insist on getting + * the lock; we only need to know if the page contains tuples, so + * that we can update nonempty_pages correctly. It's convenient + * to use lazy_check_needs_freeze() for both situations, though. 
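+ * (Concretely, in the code below: if nothing on the page needs freezing we
+ * count the page as scanned but pin-skipped and move on; if freezing is
+ * needed but this is not an aggressive vacuum we still skip the page,
+ * counting it only as pin-skipped; only an aggressive vacuum waits for the
+ * cleanup lock.)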
+ */ + LockBuffer(buf, BUFFER_LOCK_SHARE); + if (!lazy_check_needs_freeze(buf, &hastup, vacrel)) + { + UnlockReleaseBuffer(buf); + vacrel->scanned_pages++; + vacrel->pinskipped_pages++; + if (hastup) + vacrel->nonempty_pages = blkno + 1; + continue; + } + if (!aggressive) + { + /* + * Here, we must not advance scanned_pages; that would amount + * to claiming that the page contains no freezable tuples. + */ + UnlockReleaseBuffer(buf); + vacrel->pinskipped_pages++; + if (hastup) + vacrel->nonempty_pages = blkno + 1; + continue; + } + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + LockBufferForCleanup(buf); + /* drop through to normal processing */ + } + + /* + * By here we definitely have enough dead_tuples space for whatever + * LP_DEAD tids are on this page, we have the visibility map page set + * up in case we need to set this page's all_visible/all_frozen bit, + * and we have a super-exclusive lock. Any tuples on this page are + * now sure to be "counted" by this VACUUM. + * + * One last piece of preamble needs to take place before we can prune: + * we need to consider new and empty pages. + */ + vacrel->scanned_pages++; + vacrel->tupcount_pages++; + + page = BufferGetPage(buf); + + if (PageIsNew(page)) + { + /* + * All-zeroes pages can be left over if either a backend extends + * the relation by a single page, but crashes before the newly + * initialized page has been written out, or when bulk-extending + * the relation (which creates a number of empty pages at the tail + * end of the relation, but enters them into the FSM). + * + * Note we do not enter the page into the visibilitymap. That has + * the downside that we repeatedly visit this page in subsequent + * vacuums, but otherwise we'll never not discover the space on a + * promoted standby. The harm of repeated checking ought to + * normally not be too bad - the space usually should be used at + * some point, otherwise there wouldn't be any regular vacuums. + * + * Make sure these pages are in the FSM, to ensure they can be + * reused. Do that by testing if there's any space recorded for + * the page. If not, enter it. We do so after releasing the lock + * on the heap page, the FSM is approximate, after all. + */ + UnlockReleaseBuffer(buf); + + if (GetRecordedFreeSpace(vacrel->rel, blkno) == 0) + { + Size freespace = BLCKSZ - SizeOfPageHeaderData; + + RecordPageWithFreeSpace(vacrel->rel, blkno, freespace); + } + continue; + } + + if (PageIsEmpty(page)) + { + Size freespace = PageGetHeapFreeSpace(page); + + /* + * Empty pages are always all-visible and all-frozen (note that + * the same is currently not true for new pages, see above). + */ + if (!PageIsAllVisible(page)) + { + START_CRIT_SECTION(); + + /* mark buffer dirty before writing a WAL record */ + MarkBufferDirty(buf); + + /* + * It's possible that another backend has extended the heap, + * initialized the page, and then failed to WAL-log the page + * due to an ERROR. Since heap extension is not WAL-logged, + * recovery might try to replay our record setting the page + * all-visible and find that the page isn't initialized, which + * will cause a PANIC. To prevent that, check whether the + * page has been previously WAL-logged, and if not, do that + * now. 
+ */ + if (RelationNeedsWAL(vacrel->rel) && + PageGetLSN(page) == InvalidXLogRecPtr) + log_newpage_buffer(buf, true); + + PageSetAllVisible(page); + visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr, + vmbuffer, InvalidTransactionId, + VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN); + END_CRIT_SECTION(); + } + + UnlockReleaseBuffer(buf); + RecordPageWithFreeSpace(vacrel->rel, blkno, freespace); + continue; + } + + /* + * Prune and freeze tuples. + * + * Accumulates details of remaining LP_DEAD line pointers on page in + * dead tuple list. This includes LP_DEAD line pointers that we + * pruned ourselves, as well as existing LP_DEAD line pointers that + * were pruned some time earlier. Also considers freezing XIDs in the + * tuple headers of remaining items with storage. + */ + lazy_scan_prune(vacrel, buf, blkno, page, vistest, &prunestate); + + Assert(!prunestate.all_visible || !prunestate.has_lpdead_items); + + /* Remember the location of the last page with nonremovable tuples */ + if (prunestate.hastup) + vacrel->nonempty_pages = blkno + 1; + + if (vacrel->nindexes == 0) + { + /* + * Consider the need to do page-at-a-time heap vacuuming when + * using the one-pass strategy now. + * + * The one-pass strategy will never call lazy_vacuum(). The steps + * performed here can be thought of as the one-pass equivalent of + * a call to lazy_vacuum(). + */ + if (prunestate.has_lpdead_items) + { + Size freespace; + + lazy_vacuum_heap_page(vacrel, blkno, buf, 0, &vmbuffer); + + /* Forget the now-vacuumed tuples */ + dead_tuples->num_tuples = 0; + + /* + * Periodically perform FSM vacuuming to make newly-freed + * space visible on upper FSM pages. Note we have not yet + * performed FSM processing for blkno. + */ + if (blkno - next_fsm_block_to_vacuum >= VACUUM_FSM_EVERY_PAGES) + { + FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, + blkno); + next_fsm_block_to_vacuum = blkno; + } + + /* + * Now perform FSM processing for blkno, and move on to next + * page. + * + * Our call to lazy_vacuum_heap_page() will have considered if + * it's possible to set all_visible/all_frozen independently + * of lazy_scan_prune(). Note that prunestate was invalidated + * by lazy_vacuum_heap_page() call. + */ + freespace = PageGetHeapFreeSpace(page); + + UnlockReleaseBuffer(buf); + RecordPageWithFreeSpace(vacrel->rel, blkno, freespace); + continue; + } + + /* + * There was no call to lazy_vacuum_heap_page() because pruning + * didn't encounter/create any LP_DEAD items that needed to be + * vacuumed. Prune state has not been invalidated, so proceed + * with prunestate-driven visibility map and FSM steps (just like + * the two-pass strategy). + */ + Assert(dead_tuples->num_tuples == 0); + } + + /* + * Handle setting visibility map bit based on what the VM said about + * the page before pruning started, and using prunestate + */ + if (!all_visible_according_to_vm && prunestate.all_visible) + { + uint8 flags = VISIBILITYMAP_ALL_VISIBLE; + + if (prunestate.all_frozen) + flags |= VISIBILITYMAP_ALL_FROZEN; + + /* + * It should never be the case that the visibility map page is set + * while the page-level bit is clear, but the reverse is allowed + * (if checksums are not enabled). Regardless, set both bits so + * that we get back in sync. + * + * NB: If the heap page is all-visible but the VM bit is not set, + * we don't need to dirty the heap page. 
However, if checksums + * are enabled, we do need to make sure that the heap page is + * dirtied before passing it to visibilitymap_set(), because it + * may be logged. Given that this situation should only happen in + * rare cases after a crash, it is not worth optimizing. + */ + PageSetAllVisible(page); + MarkBufferDirty(buf); + visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr, + vmbuffer, prunestate.visibility_cutoff_xid, + flags); + } + + /* + * As of PostgreSQL 9.2, the visibility map bit should never be set if + * the page-level bit is clear. However, it's possible that the bit + * got cleared after we checked it and before we took the buffer + * content lock, so we must recheck before jumping to the conclusion + * that something bad has happened. + */ + else if (all_visible_according_to_vm && !PageIsAllVisible(page) + && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer)) + { + elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", + vacrel->relname, blkno); + visibilitymap_clear(vacrel->rel, blkno, vmbuffer, + VISIBILITYMAP_VALID_BITS); + } + + /* + * It's possible for the value returned by + * GetOldestNonRemovableTransactionId() to move backwards, so it's not + * wrong for us to see tuples that appear to not be visible to + * everyone yet, while PD_ALL_VISIBLE is already set. The real safe + * xmin value never moves backwards, but + * GetOldestNonRemovableTransactionId() is conservative and sometimes + * returns a value that's unnecessarily small, so if we see that + * contradiction it just means that the tuples that we think are not + * visible to everyone yet actually are, and the PD_ALL_VISIBLE flag + * is correct. + * + * There should never be dead tuples on a page with PD_ALL_VISIBLE + * set, however. + */ + else if (prunestate.has_lpdead_items && PageIsAllVisible(page)) + { + elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u", + vacrel->relname, blkno); + PageClearAllVisible(page); + MarkBufferDirty(buf); + visibilitymap_clear(vacrel->rel, blkno, vmbuffer, + VISIBILITYMAP_VALID_BITS); + } + + /* + * If the all-visible page is all-frozen but not marked as such yet, + * mark it as all-frozen. Note that all_frozen is only valid if + * all_visible is true, so we must check both. + */ + else if (all_visible_according_to_vm && prunestate.all_visible && + prunestate.all_frozen && + !VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer)) + { + /* + * We can pass InvalidTransactionId as the cutoff XID here, + * because setting the all-frozen bit doesn't cause recovery + * conflicts. + */ + visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr, + vmbuffer, InvalidTransactionId, + VISIBILITYMAP_ALL_FROZEN); + } + + /* + * Final steps for block: drop super-exclusive lock, record free space + * in the FSM + */ + if (prunestate.has_lpdead_items && vacrel->do_index_vacuuming) + { + /* + * Wait until lazy_vacuum_heap_rel() to save free space. This + * doesn't just save us some cycles; it also allows us to record + * any additional free space that lazy_vacuum_heap_page() will + * make available in cases where it's possible to truncate the + * page's line pointer array. + * + * Note: It's not in fact 100% certain that we really will call + * lazy_vacuum_heap_rel() -- lazy_vacuum() might yet opt to skip + * index vacuuming (and so must skip heap vacuuming). This is + * deemed okay because it only happens in emergencies, or when + * there is very little free space anyway. 
(Besides, we start + * recording free space in the FSM once index vacuuming has been + * abandoned.) + * + * Note: The one-pass (no indexes) case is only supposed to make + * it this far when there were no LP_DEAD items during pruning. + */ + Assert(vacrel->nindexes > 0); + UnlockReleaseBuffer(buf); + } + else + { + Size freespace = PageGetHeapFreeSpace(page); + + UnlockReleaseBuffer(buf); + RecordPageWithFreeSpace(vacrel->rel, blkno, freespace); + } + } + + /* report that everything is now scanned */ + pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno); + + /* Clear the block number information */ + vacrel->blkno = InvalidBlockNumber; + + /* now we can compute the new value for pg_class.reltuples */ + vacrel->new_live_tuples = vac_estimate_reltuples(vacrel->rel, nblocks, + vacrel->tupcount_pages, + vacrel->live_tuples); + + /* + * Also compute the total number of surviving heap entries. In the + * (unlikely) scenario that new_live_tuples is -1, take it as zero. + */ + vacrel->new_rel_tuples = + Max(vacrel->new_live_tuples, 0) + vacrel->new_dead_tuples; + + /* + * Release any remaining pin on visibility map page. + */ + if (BufferIsValid(vmbuffer)) + { + ReleaseBuffer(vmbuffer); + vmbuffer = InvalidBuffer; + } + + /* If any tuples need to be deleted, perform final vacuum cycle */ + if (dead_tuples->num_tuples > 0) + lazy_vacuum(vacrel); + + /* + * Vacuum the remainder of the Free Space Map. We must do this whether or + * not there were indexes, and whether or not we bypassed index vacuuming. + */ + if (blkno > next_fsm_block_to_vacuum) + FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, blkno); + + /* report all blocks vacuumed */ + pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno); + + /* Do post-vacuum cleanup */ + if (vacrel->nindexes > 0 && vacrel->do_index_cleanup) + lazy_cleanup_all_indexes(vacrel); + + /* + * Free resources managed by lazy_space_alloc(). (We must end parallel + * mode/free shared memory before updating index statistics. We cannot + * write while in parallel mode.) + */ + lazy_space_free(vacrel); + + /* Update index statistics */ + if (vacrel->nindexes > 0 && vacrel->do_index_cleanup) + update_index_statistics(vacrel); + + /* + * When the table has no indexes (i.e. in the one-pass strategy case), + * make log report that lazy_vacuum_heap_rel would've made had there been + * indexes. (As in the two-pass strategy case, only make this report when + * there were LP_DEAD line pointers vacuumed in lazy_vacuum_heap_page.) + */ + if (vacrel->nindexes == 0 && vacrel->lpdead_item_pages > 0) + ereport(elevel, + (errmsg("table \"%s\": removed %lld dead item identifiers in %u pages", + vacrel->relname, (long long) vacrel->lpdead_items, + vacrel->lpdead_item_pages))); + + /* + * Make a log report summarizing pruning and freezing. + * + * The autovacuum specific logging in heap_vacuum_rel summarizes an entire + * VACUUM operation, whereas each VACUUM VERBOSE log report generally + * summarizes a single round of index/heap vacuuming (or rel truncation). + * It wouldn't make sense to report on pruning or freezing while following + * that convention, though. You can think of this log report as a summary + * of our first pass over the heap. 
+ */ + initStringInfo(&buf); + appendStringInfo(&buf, + _("%lld dead row versions cannot be removed yet, oldest xmin: %u\n"), + (long long) vacrel->new_dead_tuples, vacrel->OldestXmin); + appendStringInfo(&buf, ngettext("Skipped %u page due to buffer pins, ", + "Skipped %u pages due to buffer pins, ", + vacrel->pinskipped_pages), + vacrel->pinskipped_pages); + appendStringInfo(&buf, ngettext("%u frozen page.\n", + "%u frozen pages.\n", + vacrel->frozenskipped_pages), + vacrel->frozenskipped_pages); + appendStringInfo(&buf, _("%s."), pg_rusage_show(&ru0)); + + ereport(elevel, + (errmsg("table \"%s\": found %lld removable, %lld nonremovable row versions in %u out of %u pages", + vacrel->relname, + (long long) vacrel->tuples_deleted, + (long long) vacrel->num_tuples, vacrel->scanned_pages, + nblocks), + errdetail_internal("%s", buf.data))); + pfree(buf.data); +} + +/* + * lazy_scan_prune() -- lazy_scan_heap() pruning and freezing. + * + * Caller must hold pin and buffer cleanup lock on the buffer. + * + * Prior to PostgreSQL 14 there were very rare cases where heap_page_prune() + * was allowed to disagree with our HeapTupleSatisfiesVacuum() call about + * whether or not a tuple should be considered DEAD. This happened when an + * inserting transaction concurrently aborted (after our heap_page_prune() + * call, before our HeapTupleSatisfiesVacuum() call). There was rather a lot + * of complexity just so we could deal with tuples that were DEAD to VACUUM, + * but nevertheless were left with storage after pruning. + * + * The approach we take now is to restart pruning when the race condition is + * detected. This allows heap_page_prune() to prune the tuples inserted by + * the now-aborted transaction. This is a little crude, but it guarantees + * that any items that make it into the dead_tuples array are simple LP_DEAD + * line pointers, and that every remaining item with tuple storage is + * considered as a candidate for freezing. + */ +static void +lazy_scan_prune(LVRelState *vacrel, + Buffer buf, + BlockNumber blkno, + Page page, + GlobalVisState *vistest, + LVPagePruneState *prunestate) +{ + Relation rel = vacrel->rel; + OffsetNumber offnum, + maxoff; + ItemId itemid; + HeapTupleData tuple; + HTSV_Result res; + int tuples_deleted, + lpdead_items, + new_dead_tuples, + num_tuples, + live_tuples; + int nfrozen; + OffsetNumber deadoffsets[MaxHeapTuplesPerPage]; + xl_heap_freeze_tuple frozen[MaxHeapTuplesPerPage]; + + maxoff = PageGetMaxOffsetNumber(page); + +retry: + + /* Initialize (or reset) page-level counters */ + tuples_deleted = 0; + lpdead_items = 0; + new_dead_tuples = 0; + num_tuples = 0; + live_tuples = 0; + + /* + * Prune all HOT-update chains in this page. + * + * We count tuples removed by the pruning step as tuples_deleted. Its + * final value can be thought of as the number of tuples that have been + * deleted from the table. It should not be confused with lpdead_items; + * lpdead_items's final value can be thought of as the number of tuples + * that were deleted from indexes. 
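+ * (LP_DEAD stub line pointers have no tuple storage; they are collected
+ * into deadoffsets[] further down and are what lpdead_items counts.)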
+ */ + tuples_deleted = heap_page_prune(rel, buf, vistest, + InvalidTransactionId, 0, false, + &vacrel->offnum); + + /* + * Now scan the page to collect LP_DEAD items and check for tuples + * requiring freezing among remaining tuples with storage + */ + prunestate->hastup = false; + prunestate->has_lpdead_items = false; + prunestate->all_visible = true; + prunestate->all_frozen = true; + prunestate->visibility_cutoff_xid = InvalidTransactionId; + nfrozen = 0; + + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + bool tuple_totally_frozen; + + /* + * Set the offset number so that we can display it along with any + * error that occurred while processing this tuple. + */ + vacrel->offnum = offnum; + itemid = PageGetItemId(page, offnum); + + if (!ItemIdIsUsed(itemid)) + continue; + + /* Redirect items mustn't be touched */ + if (ItemIdIsRedirected(itemid)) + { + prunestate->hastup = true; /* page won't be truncatable */ + continue; + } + + /* + * LP_DEAD items are processed outside of the loop. + * + * Note that we deliberately don't set hastup=true in the case of an + * LP_DEAD item here, which is not how lazy_check_needs_freeze() or + * count_nondeletable_pages() do it -- they only consider pages empty + * when they only have LP_UNUSED items, which is important for + * correctness. + * + * Our assumption is that any LP_DEAD items we encounter here will + * become LP_UNUSED inside lazy_vacuum_heap_page() before we actually + * call count_nondeletable_pages(). In any case our opinion of + * whether or not a page 'hastup' (which is how our caller sets its + * vacrel->nonempty_pages value) is inherently race-prone. It must be + * treated as advisory/unreliable, so we might as well be slightly + * optimistic. + */ + if (ItemIdIsDead(itemid)) + { + deadoffsets[lpdead_items++] = offnum; + prunestate->all_visible = false; + prunestate->has_lpdead_items = true; + continue; + } + + Assert(ItemIdIsNormal(itemid)); + + ItemPointerSet(&(tuple.t_self), blkno, offnum); + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); + tuple.t_len = ItemIdGetLength(itemid); + tuple.t_tableOid = RelationGetRelid(rel); + + /* + * DEAD tuples are almost always pruned into LP_DEAD line pointers by + * heap_page_prune(), but it's possible that the tuple state changed + * since heap_page_prune() looked. Handle that here by restarting. + * (See comments at the top of function for a full explanation.) + */ + res = HeapTupleSatisfiesVacuum(&tuple, vacrel->OldestXmin, buf); + + if (unlikely(res == HEAPTUPLE_DEAD)) + goto retry; + + /* + * The criteria for counting a tuple as live in this block need to + * match what analyze.c's acquire_sample_rows() does, otherwise VACUUM + * and ANALYZE may produce wildly different reltuples values, e.g. + * when there are many recently-dead tuples. + * + * The logic here is a bit simpler than acquire_sample_rows(), as + * VACUUM can't run inside a transaction block, which makes some cases + * impossible (e.g. in-progress insert from the same transaction). + * + * We treat LP_DEAD items a little differently, too -- we don't count + * them as dead_tuples at all (we only consider new_dead_tuples). The + * outcome is no different because we assume that any LP_DEAD items we + * encounter here will become LP_UNUSED inside lazy_vacuum_heap_page() + * before we report anything to the stats collector. (Cases where we + * bypass index vacuuming will violate our assumption, but the overall + * impact of that should be negligible.) 
+ */ + switch (res) + { + case HEAPTUPLE_LIVE: + + /* + * Count it as live. Not only is this natural, but it's also + * what acquire_sample_rows() does. + */ + live_tuples++; + + /* + * Is the tuple definitely visible to all transactions? + * + * NB: Like with per-tuple hint bits, we can't set the + * PD_ALL_VISIBLE flag if the inserter committed + * asynchronously. See SetHintBits for more info. Check that + * the tuple is hinted xmin-committed because of that. + */ + if (prunestate->all_visible) + { + TransactionId xmin; + + if (!HeapTupleHeaderXminCommitted(tuple.t_data)) + { + prunestate->all_visible = false; + break; + } + + /* + * The inserter definitely committed. But is it old enough + * that everyone sees it as committed? + */ + xmin = HeapTupleHeaderGetXmin(tuple.t_data); + if (!TransactionIdPrecedes(xmin, vacrel->OldestXmin)) + { + prunestate->all_visible = false; + break; + } + + /* Track newest xmin on page. */ + if (TransactionIdFollows(xmin, prunestate->visibility_cutoff_xid)) + prunestate->visibility_cutoff_xid = xmin; + } + break; + case HEAPTUPLE_RECENTLY_DEAD: + + /* + * If tuple is recently deleted then we must not remove it + * from relation. (We only remove items that are LP_DEAD from + * pruning.) + */ + new_dead_tuples++; + prunestate->all_visible = false; + break; + case HEAPTUPLE_INSERT_IN_PROGRESS: + + /* + * We do not count these rows as live, because we expect the + * inserting transaction to update the counters at commit, and + * we assume that will happen only after we report our + * results. This assumption is a bit shaky, but it is what + * acquire_sample_rows() does, so be consistent. + */ + prunestate->all_visible = false; + break; + case HEAPTUPLE_DELETE_IN_PROGRESS: + /* This is an expected case during concurrent vacuum */ + prunestate->all_visible = false; + + /* + * Count such rows as live. As above, we assume the deleting + * transaction will commit and update the counters after we + * report. + */ + live_tuples++; + break; + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + break; + } + + /* + * Non-removable tuple (i.e. tuple with storage). + * + * Check tuple left behind after pruning to see if needs to be frozen + * now. + */ + num_tuples++; + prunestate->hastup = true; + if (heap_prepare_freeze_tuple(tuple.t_data, + vacrel->relfrozenxid, + vacrel->relminmxid, + vacrel->FreezeLimit, + vacrel->MultiXactCutoff, + &frozen[nfrozen], + &tuple_totally_frozen)) + { + /* Will execute freeze below */ + frozen[nfrozen++].offset = offnum; + } + + /* + * If tuple is not frozen (and not about to become frozen) then caller + * had better not go on to set this page's VM bit + */ + if (!tuple_totally_frozen) + prunestate->all_frozen = false; + } + + /* + * We have now divided every item on the page into either an LP_DEAD item + * that will need to be vacuumed in indexes later, or a LP_NORMAL tuple + * that remains and needs to be considered for freezing now (LP_UNUSED and + * LP_REDIRECT items also remain, but are of no further interest to us). + */ + vacrel->offnum = InvalidOffsetNumber; + + /* + * Consider the need to freeze any items with tuple storage from the page + * first (arbitrary) + */ + if (nfrozen > 0) + { + Assert(prunestate->hastup); + + /* + * At least one tuple with storage needs to be frozen -- execute that + * now. + * + * If we need to freeze any tuples we'll mark the buffer dirty, and + * write a WAL record recording the changes. We must log the changes + * to be crash-safe against future truncation of CLOG. 
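+ * (Otherwise, after a crash or on a standby, tuples could be left with
+ * xmins that predate a later CLOG truncation, and looking up their commit
+ * status would then fail.)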
+ */ + START_CRIT_SECTION(); + + MarkBufferDirty(buf); + + /* execute collected freezes */ + for (int i = 0; i < nfrozen; i++) + { + HeapTupleHeader htup; + + itemid = PageGetItemId(page, frozen[i].offset); + htup = (HeapTupleHeader) PageGetItem(page, itemid); + + heap_execute_freeze_tuple(htup, &frozen[i]); + } + + /* Now WAL-log freezing if necessary */ + if (RelationNeedsWAL(vacrel->rel)) + { + XLogRecPtr recptr; + + recptr = log_heap_freeze(vacrel->rel, buf, vacrel->FreezeLimit, + frozen, nfrozen); + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + } + + /* + * The second pass over the heap can also set visibility map bits, using + * the same approach. This is important when the table frequently has a + * few old LP_DEAD items on each page by the time we get to it (typically + * because past opportunistic pruning operations freed some non-HOT + * tuples). + * + * VACUUM will call heap_page_is_all_visible() during the second pass over + * the heap to determine all_visible and all_frozen for the page -- this + * is a specialized version of the logic from this function. Now that + * we've finished pruning and freezing, make sure that we're in total + * agreement with heap_page_is_all_visible() using an assertion. + */ +#ifdef USE_ASSERT_CHECKING + /* Note that all_frozen value does not matter when !all_visible */ + if (prunestate->all_visible) + { + TransactionId cutoff; + bool all_frozen; + + if (!heap_page_is_all_visible(vacrel, buf, &cutoff, &all_frozen)) + Assert(false); + + Assert(lpdead_items == 0); + Assert(prunestate->all_frozen == all_frozen); + + /* + * It's possible that we froze tuples and made the page's XID cutoff + * (for recovery conflict purposes) FrozenTransactionId. This is okay + * because visibility_cutoff_xid will be logged by our caller in a + * moment. + */ + Assert(cutoff == FrozenTransactionId || + cutoff == prunestate->visibility_cutoff_xid); + } +#endif + + /* + * Now save details of the LP_DEAD items from the page in the dead_tuples + * array. Also record that page has dead items in per-page prunestate. + */ + if (lpdead_items > 0) + { + LVDeadTuples *dead_tuples = vacrel->dead_tuples; + ItemPointerData tmp; + + Assert(!prunestate->all_visible); + Assert(prunestate->has_lpdead_items); + + vacrel->lpdead_item_pages++; + + ItemPointerSetBlockNumber(&tmp, blkno); + + for (int i = 0; i < lpdead_items; i++) + { + ItemPointerSetOffsetNumber(&tmp, deadoffsets[i]); + dead_tuples->itemptrs[dead_tuples->num_tuples++] = tmp; + } + + Assert(dead_tuples->num_tuples <= dead_tuples->max_tuples); + pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES, + dead_tuples->num_tuples); + } + + /* Finally, add page-local counts to whole-VACUUM counts */ + vacrel->tuples_deleted += tuples_deleted; + vacrel->lpdead_items += lpdead_items; + vacrel->new_dead_tuples += new_dead_tuples; + vacrel->num_tuples += num_tuples; + vacrel->live_tuples += live_tuples; +} + +/* + * Remove the collected garbage tuples from the table and its indexes. + * + * We may choose to bypass index vacuuming at this point, though only when the + * ongoing VACUUM operation will definitely only have one index scan/round of + * index vacuuming. Caller indicates whether or not this is such a VACUUM + * operation using 'onecall' argument. + * + * In rare emergencies, the ongoing VACUUM operation can be made to skip both + * index vacuuming and index cleanup at the point we're called. This avoids + * having the whole system refuse to allocate further XIDs/MultiXactIds due to + * wraparound. 
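+ * Note: in this version the caller signals such a single-scan VACUUM by
+ * leaving vacrel->consider_bypass_optimization set (there is no separate
+ * 'onecall' argument).  For illustration, assuming BYPASS_THRESHOLD_PAGES
+ * is 2% -- its value at the time of writing -- a 10,000-page table has
+ * index vacuuming bypassed only when fewer than 200 pages contain LP_DEAD
+ * items and the collected TIDs would fit in under 32MB.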
+ */ +static void +lazy_vacuum(LVRelState *vacrel) +{ + bool bypass; + + /* Should not end up here with no indexes */ + Assert(vacrel->nindexes > 0); + Assert(!IsParallelWorker()); + Assert(vacrel->lpdead_item_pages > 0); + + if (!vacrel->do_index_vacuuming) + { + Assert(!vacrel->do_index_cleanup); + vacrel->dead_tuples->num_tuples = 0; + return; + } + + /* + * Consider bypassing index vacuuming (and heap vacuuming) entirely. + * + * We currently only do this in cases where the number of LP_DEAD items + * for the entire VACUUM operation is close to zero. This avoids sharp + * discontinuities in the duration and overhead of successive VACUUM + * operations that run against the same table with a fixed workload. + * Ideally, successive VACUUM operations will behave as if there are + * exactly zero LP_DEAD items in cases where there are close to zero. + * + * This is likely to be helpful with a table that is continually affected + * by UPDATEs that can mostly apply the HOT optimization, but occasionally + * have small aberrations that lead to just a few heap pages retaining + * only one or two LP_DEAD items. This is pretty common; even when the + * DBA goes out of their way to make UPDATEs use HOT, it is practically + * impossible to predict whether HOT will be applied in 100% of cases. + * It's far easier to ensure that 99%+ of all UPDATEs against a table use + * HOT through careful tuning. + */ + bypass = false; + if (vacrel->consider_bypass_optimization && vacrel->rel_pages > 0) + { + BlockNumber threshold; + + Assert(vacrel->num_index_scans == 0); + Assert(vacrel->lpdead_items == vacrel->dead_tuples->num_tuples); + Assert(vacrel->do_index_vacuuming); + Assert(vacrel->do_index_cleanup); + + /* + * This crossover point at which we'll start to do index vacuuming is + * expressed as a percentage of the total number of heap pages in the + * table that are known to have at least one LP_DEAD item. This is + * much more important than the total number of LP_DEAD items, since + * it's a proxy for the number of heap pages whose visibility map bits + * cannot be set on account of bypassing index and heap vacuuming. + * + * We apply one further precautionary test: the space currently used + * to store the TIDs (TIDs that now all point to LP_DEAD items) must + * not exceed 32MB. This limits the risk that we will bypass index + * vacuuming again and again until eventually there is a VACUUM whose + * dead_tuples space is not CPU cache resident. + * + * We don't take any special steps to remember the LP_DEAD items (such + * as counting them in new_dead_tuples report to the stats collector) + * when the optimization is applied. Though the accounting used in + * analyze.c's acquire_sample_rows() will recognize the same LP_DEAD + * items as dead rows in its own stats collector report, that's okay. + * The discrepancy should be negligible. If this optimization is ever + * expanded to cover more cases then this may need to be reconsidered. + */ + threshold = (double) vacrel->rel_pages * BYPASS_THRESHOLD_PAGES; + bypass = (vacrel->lpdead_item_pages < threshold && + vacrel->lpdead_items < MAXDEADTUPLES(32L * 1024L * 1024L)); + } + + if (bypass) + { + /* + * There are almost zero TIDs. Behave as if there were precisely + * zero: bypass index vacuuming, but do index cleanup. + * + * We expect that the ongoing VACUUM operation will finish very + * quickly, so there is no point in considering speeding up as a + * failsafe against wraparound failure. 
(Index cleanup is expected to + * finish very quickly in cases where there were no ambulkdelete() + * calls.) + */ + vacrel->do_index_vacuuming = false; + ereport(elevel, + (errmsg("table \"%s\": index scan bypassed: %u pages from table (%.2f%% of total) have %lld dead item identifiers", + vacrel->relname, vacrel->lpdead_item_pages, + 100.0 * vacrel->lpdead_item_pages / vacrel->rel_pages, + (long long) vacrel->lpdead_items))); + } + else if (lazy_vacuum_all_indexes(vacrel)) + { + /* + * We successfully completed a round of index vacuuming. Do related + * heap vacuuming now. + */ + lazy_vacuum_heap_rel(vacrel); + } + else + { + /* + * Failsafe case. + * + * we attempted index vacuuming, but didn't finish a full round/full + * index scan. This happens when relfrozenxid or relminmxid is too + * far in the past. + * + * From this point on the VACUUM operation will do no further index + * vacuuming or heap vacuuming. This VACUUM operation won't end up + * back here again. + */ + Assert(vacrel->failsafe_active); + } + + /* + * Forget the LP_DEAD items that we just vacuumed (or just decided to not + * vacuum) + */ + vacrel->dead_tuples->num_tuples = 0; +} + +/* + * lazy_vacuum_all_indexes() -- Main entry for index vacuuming + * + * Returns true in the common case when all indexes were successfully + * vacuumed. Returns false in rare cases where we determined that the ongoing + * VACUUM operation is at risk of taking too long to finish, leading to + * wraparound failure. + */ +static bool +lazy_vacuum_all_indexes(LVRelState *vacrel) +{ + bool allindexes = true; + + Assert(!IsParallelWorker()); + Assert(vacrel->nindexes > 0); + Assert(vacrel->do_index_vacuuming); + Assert(vacrel->do_index_cleanup); + Assert(TransactionIdIsNormal(vacrel->relfrozenxid)); + Assert(MultiXactIdIsValid(vacrel->relminmxid)); + + /* Precheck for XID wraparound emergencies */ + if (lazy_check_wraparound_failsafe(vacrel)) + { + /* Wraparound emergency -- don't even start an index scan */ + return false; + } + + /* Report that we are now vacuuming indexes */ + pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, + PROGRESS_VACUUM_PHASE_VACUUM_INDEX); + + if (!ParallelVacuumIsActive(vacrel)) + { + for (int idx = 0; idx < vacrel->nindexes; idx++) + { + Relation indrel = vacrel->indrels[idx]; + IndexBulkDeleteResult *istat = vacrel->indstats[idx]; + + vacrel->indstats[idx] = + lazy_vacuum_one_index(indrel, istat, vacrel->old_live_tuples, + vacrel); + + if (lazy_check_wraparound_failsafe(vacrel)) + { + /* Wraparound emergency -- end current index scan */ + allindexes = false; + break; + } + } + } + else + { + /* Outsource everything to parallel variant */ + do_parallel_lazy_vacuum_all_indexes(vacrel); + + /* + * Do a postcheck to consider applying wraparound failsafe now. Note + * that parallel VACUUM only gets the precheck and this postcheck. + */ + if (lazy_check_wraparound_failsafe(vacrel)) + allindexes = false; + } + + /* + * We delete all LP_DEAD items from the first heap pass in all indexes on + * each call here (except calls where we choose to do the failsafe). This + * makes the next call to lazy_vacuum_heap_rel() safe (except in the event + * of the failsafe triggering, which prevents the next call from taking + * place). + */ + Assert(vacrel->num_index_scans > 0 || + vacrel->dead_tuples->num_tuples == vacrel->lpdead_items); + Assert(allindexes || vacrel->failsafe_active); + + /* + * Increase and report the number of index scans. 
+ * + * We deliberately include the case where we started a round of bulk + * deletes that we weren't able to finish due to the failsafe triggering. + */ + vacrel->num_index_scans++; + pgstat_progress_update_param(PROGRESS_VACUUM_NUM_INDEX_VACUUMS, + vacrel->num_index_scans); + + return allindexes; +} + +/* + * lazy_vacuum_heap_rel() -- second pass over the heap for two pass strategy + * + * This routine marks LP_DEAD items in vacrel->dead_tuples array as LP_UNUSED. + * Pages that never had lazy_scan_prune record LP_DEAD items are not visited + * at all. + * + * We may also be able to truncate the line pointer array of the heap pages we + * visit. If there is a contiguous group of LP_UNUSED items at the end of the + * array, it can be reclaimed as free space. These LP_UNUSED items usually + * start out as LP_DEAD items recorded by lazy_scan_prune (we set items from + * each page to LP_UNUSED, and then consider if it's possible to truncate the + * page's line pointer array). + * + * Note: the reason for doing this as a second pass is we cannot remove the + * tuples until we've removed their index entries, and we want to process + * index entry removal in batches as large as possible. + */ +static void +lazy_vacuum_heap_rel(LVRelState *vacrel) +{ + int tupindex; + BlockNumber vacuumed_pages; + PGRUsage ru0; + Buffer vmbuffer = InvalidBuffer; + LVSavedErrInfo saved_err_info; + + Assert(vacrel->do_index_vacuuming); + Assert(vacrel->do_index_cleanup); + Assert(vacrel->num_index_scans > 0); + + /* Report that we are now vacuuming the heap */ + pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, + PROGRESS_VACUUM_PHASE_VACUUM_HEAP); + + /* Update error traceback information */ + update_vacuum_error_info(vacrel, &saved_err_info, + VACUUM_ERRCB_PHASE_VACUUM_HEAP, + InvalidBlockNumber, InvalidOffsetNumber); + + pg_rusage_init(&ru0); + vacuumed_pages = 0; + + tupindex = 0; + while (tupindex < vacrel->dead_tuples->num_tuples) + { + BlockNumber tblk; + Buffer buf; + Page page; + Size freespace; + + vacuum_delay_point(); + + tblk = ItemPointerGetBlockNumber(&vacrel->dead_tuples->itemptrs[tupindex]); + vacrel->blkno = tblk; + buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, tblk, RBM_NORMAL, + vacrel->bstrategy); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + tupindex = lazy_vacuum_heap_page(vacrel, tblk, buf, tupindex, + &vmbuffer); + + /* Now that we've vacuumed the page, record its available space */ + page = BufferGetPage(buf); + freespace = PageGetHeapFreeSpace(page); + + UnlockReleaseBuffer(buf); + RecordPageWithFreeSpace(vacrel->rel, tblk, freespace); + vacuumed_pages++; + } + + /* Clear the block number information */ + vacrel->blkno = InvalidBlockNumber; + + if (BufferIsValid(vmbuffer)) + { + ReleaseBuffer(vmbuffer); + vmbuffer = InvalidBuffer; + } + + /* + * We set all LP_DEAD items from the first heap pass to LP_UNUSED during + * the second heap pass. No more, no less. + */ + Assert(tupindex > 0); + Assert(vacrel->num_index_scans > 1 || + (tupindex == vacrel->lpdead_items && + vacuumed_pages == vacrel->lpdead_item_pages)); + + ereport(elevel, + (errmsg("table \"%s\": removed %lld dead item identifiers in %u pages", + vacrel->relname, (long long ) tupindex, vacuumed_pages), + errdetail_internal("%s", pg_rusage_show(&ru0)))); + + /* Revert to the previous phase information for error traceback */ + restore_vacuum_error_info(vacrel, &saved_err_info); +} + +/* + * lazy_vacuum_heap_page() -- free page's LP_DEAD items listed in the + * vacrel->dead_tuples array. 
+ * + * Caller must have an exclusive buffer lock on the buffer (though a + * super-exclusive lock is also acceptable). + * + * tupindex is the index in vacrel->dead_tuples of the first dead tuple for + * this page. We assume the rest follow sequentially. The return value is + * the first tupindex after the tuples of this page. + * + * Prior to PostgreSQL 14 there were rare cases where this routine had to set + * tuples with storage to unused. These days it is strictly responsible for + * marking LP_DEAD stub line pointers as unused. This only happens for those + * LP_DEAD items on the page that were determined to be LP_DEAD items back + * when the same page was visited by lazy_scan_prune() (i.e. those whose TID + * was recorded in the dead_tuples array). + */ +static int +lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, + int tupindex, Buffer *vmbuffer) +{ + LVDeadTuples *dead_tuples = vacrel->dead_tuples; + Page page = BufferGetPage(buffer); + OffsetNumber unused[MaxHeapTuplesPerPage]; + int uncnt = 0; + TransactionId visibility_cutoff_xid; + bool all_frozen; + LVSavedErrInfo saved_err_info; + + Assert(vacrel->nindexes == 0 || vacrel->do_index_vacuuming); + + pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno); + + /* Update error traceback information */ + update_vacuum_error_info(vacrel, &saved_err_info, + VACUUM_ERRCB_PHASE_VACUUM_HEAP, blkno, + InvalidOffsetNumber); + + START_CRIT_SECTION(); + + for (; tupindex < dead_tuples->num_tuples; tupindex++) + { + BlockNumber tblk; + OffsetNumber toff; + ItemId itemid; + + tblk = ItemPointerGetBlockNumber(&dead_tuples->itemptrs[tupindex]); + if (tblk != blkno) + break; /* past end of tuples for this block */ + toff = ItemPointerGetOffsetNumber(&dead_tuples->itemptrs[tupindex]); + itemid = PageGetItemId(page, toff); + + Assert(ItemIdIsDead(itemid) && !ItemIdHasStorage(itemid)); + ItemIdSetUnused(itemid); + unused[uncnt++] = toff; + } + + Assert(uncnt > 0); + + /* Attempt to truncate line pointer array now */ + PageTruncateLinePointerArray(page); + + /* + * Mark buffer dirty before we write WAL. + */ + MarkBufferDirty(buffer); + + /* XLOG stuff */ + if (RelationNeedsWAL(vacrel->rel)) + { + xl_heap_vacuum xlrec; + XLogRecPtr recptr; + + xlrec.nunused = uncnt; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapVacuum); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + XLogRegisterBufData(0, (char *) unused, uncnt * sizeof(OffsetNumber)); + + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VACUUM); + + PageSetLSN(page, recptr); + } + + /* + * End critical section, so we safely can do visibility tests (which + * possibly need to perform IO and allocate memory!). If we crash now the + * page (including the corresponding vm bit) might not be marked all + * visible, but that's fine. A later vacuum will fix that. + */ + END_CRIT_SECTION(); + + /* + * Now that we have removed the LD_DEAD items from the page, once again + * check if the page has become all-visible. The page is already marked + * dirty, exclusively locked, and, if needed, a full page image has been + * emitted. + */ + if (heap_page_is_all_visible(vacrel, buffer, &visibility_cutoff_xid, + &all_frozen)) + PageSetAllVisible(page); + + /* + * All the changes to the heap page have been done. If the all-visible + * flag is now set, also set the VM all-visible bit (and, if possible, the + * all-frozen bit) unless this has already been done previously. 
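+ * (Below, 'flags' accumulates only the VM bits that are not already set
+ * according to visibilitymap_get_status(); visibilitymap_set() is then
+ * called only if at least one bit is still missing.)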
+ */ + if (PageIsAllVisible(page)) + { + uint8 flags = 0; + uint8 vm_status = visibilitymap_get_status(vacrel->rel, + blkno, vmbuffer); + + /* Set the VM all-frozen bit to flag, if needed */ + if ((vm_status & VISIBILITYMAP_ALL_VISIBLE) == 0) + flags |= VISIBILITYMAP_ALL_VISIBLE; + if ((vm_status & VISIBILITYMAP_ALL_FROZEN) == 0 && all_frozen) + flags |= VISIBILITYMAP_ALL_FROZEN; + + Assert(BufferIsValid(*vmbuffer)); + if (flags != 0) + visibilitymap_set(vacrel->rel, blkno, buffer, InvalidXLogRecPtr, + *vmbuffer, visibility_cutoff_xid, flags); + } + + /* Revert to the previous phase information for error traceback */ + restore_vacuum_error_info(vacrel, &saved_err_info); + return tupindex; +} + +/* + * lazy_check_needs_freeze() -- scan page to see if any tuples + * need to be cleaned to avoid wraparound + * + * Returns true if the page needs to be vacuumed using cleanup lock. + * Also returns a flag indicating whether page contains any tuples at all. + */ +static bool +lazy_check_needs_freeze(Buffer buf, bool *hastup, LVRelState *vacrel) +{ + Page page = BufferGetPage(buf); + OffsetNumber offnum, + maxoff; + HeapTupleHeader tupleheader; + + *hastup = false; + + /* + * New and empty pages, obviously, don't contain tuples. We could make + * sure that the page is registered in the FSM, but it doesn't seem worth + * waiting for a cleanup lock just for that, especially because it's + * likely that the pin holder will do so. + */ + if (PageIsNew(page) || PageIsEmpty(page)) + return false; + + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid; + + /* + * Set the offset number so that we can display it along with any + * error that occurred while processing this tuple. + */ + vacrel->offnum = offnum; + itemid = PageGetItemId(page, offnum); + + /* this should match hastup test in count_nondeletable_pages() */ + if (ItemIdIsUsed(itemid)) + *hastup = true; + + /* dead and redirect items never need freezing */ + if (!ItemIdIsNormal(itemid)) + continue; + + tupleheader = (HeapTupleHeader) PageGetItem(page, itemid); + + if (heap_tuple_needs_freeze(tupleheader, vacrel->FreezeLimit, + vacrel->MultiXactCutoff, buf)) + break; + } /* scan along page */ + + /* Clear the offset information once we have processed the given page. */ + vacrel->offnum = InvalidOffsetNumber; + + return (offnum <= maxoff); +} + +/* + * Trigger the failsafe to avoid wraparound failure when vacrel table has a + * relfrozenxid and/or relminmxid that is dangerously far in the past. + * Triggering the failsafe makes the ongoing VACUUM bypass any further index + * vacuuming and heap vacuuming. Truncating the heap is also bypassed. + * + * Any remaining work (work that VACUUM cannot just bypass) is typically sped + * up when the failsafe triggers. VACUUM stops applying any cost-based delay + * that it started out with. + * + * Returns true when failsafe has been triggered. 
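+ * The check itself is delegated to vacuum_xid_failsafe_check(), whose
+ * trigger point is governed by the vacuum_failsafe_age and
+ * vacuum_multixact_failsafe_age settings.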
+ */ +static bool +lazy_check_wraparound_failsafe(LVRelState *vacrel) +{ + /* Don't warn more than once per VACUUM */ + if (vacrel->failsafe_active) + return true; + + if (unlikely(vacuum_xid_failsafe_check(vacrel->relfrozenxid, + vacrel->relminmxid))) + { + vacrel->failsafe_active = true; + + /* Disable index vacuuming, index cleanup, and heap rel truncation */ + vacrel->do_index_vacuuming = false; + vacrel->do_index_cleanup = false; + vacrel->do_rel_truncate = false; + + ereport(WARNING, + (errmsg("bypassing nonessential maintenance of table \"%s.%s.%s\" as a failsafe after %d index scans", + get_database_name(MyDatabaseId), + vacrel->relnamespace, + vacrel->relname, + vacrel->num_index_scans), + errdetail("The table's relfrozenxid or relminmxid is too far in the past."), + errhint("Consider increasing configuration parameter \"maintenance_work_mem\" or \"autovacuum_work_mem\".\n" + "You might also need to consider other ways for VACUUM to keep up with the allocation of transaction IDs."))); + + /* Stop applying cost limits from this point on */ + VacuumCostActive = false; + VacuumCostBalance = 0; + + return true; + } + + return false; +} + +/* + * Perform lazy_vacuum_all_indexes() steps in parallel + */ +static void +do_parallel_lazy_vacuum_all_indexes(LVRelState *vacrel) +{ + /* Tell parallel workers to do index vacuuming */ + vacrel->lps->lvshared->for_cleanup = false; + vacrel->lps->lvshared->first_time = false; + + /* + * We can only provide an approximate value of num_heap_tuples, at least + * for now. Matches serial VACUUM case. + */ + vacrel->lps->lvshared->reltuples = vacrel->old_live_tuples; + vacrel->lps->lvshared->estimated_count = true; + + do_parallel_vacuum_or_cleanup(vacrel, + vacrel->lps->nindexes_parallel_bulkdel); +} + +/* + * Perform lazy_cleanup_all_indexes() steps in parallel + */ +static void +do_parallel_lazy_cleanup_all_indexes(LVRelState *vacrel) +{ + int nworkers; + + /* + * If parallel vacuum is active we perform index cleanup with parallel + * workers. + * + * Tell parallel workers to do index cleanup. + */ + vacrel->lps->lvshared->for_cleanup = true; + vacrel->lps->lvshared->first_time = (vacrel->num_index_scans == 0); + + /* + * Now we can provide a better estimate of total number of surviving + * tuples (we assume indexes are more interested in that than in the + * number of nominally live tuples). + */ + vacrel->lps->lvshared->reltuples = vacrel->new_rel_tuples; + vacrel->lps->lvshared->estimated_count = + (vacrel->tupcount_pages < vacrel->rel_pages); + + /* Determine the number of parallel workers to launch */ + if (vacrel->lps->lvshared->first_time) + nworkers = vacrel->lps->nindexes_parallel_cleanup + + vacrel->lps->nindexes_parallel_condcleanup; + else + nworkers = vacrel->lps->nindexes_parallel_cleanup; + + do_parallel_vacuum_or_cleanup(vacrel, nworkers); +} + +/* + * Perform index vacuum or index cleanup with parallel workers. This function + * must be used by the parallel vacuum leader process. The caller must set + * lps->lvshared->for_cleanup to indicate whether to perform vacuum or + * cleanup. 
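+ * 'nworkers' is the number of indexes that can be processed by workers in
+ * the current phase; since the leader also participates, the number of
+ * workers actually launched below is capped at nworkers - 1 and at the
+ * number of workers the parallel context was initialized with.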
+ */ +static void +do_parallel_vacuum_or_cleanup(LVRelState *vacrel, int nworkers) +{ + LVParallelState *lps = vacrel->lps; + + Assert(!IsParallelWorker()); + Assert(ParallelVacuumIsActive(vacrel)); + Assert(vacrel->nindexes > 0); + + /* The leader process will participate */ + nworkers--; + + /* + * It is possible that parallel context is initialized with fewer workers + * than the number of indexes that need a separate worker in the current + * phase, so we need to consider it. See compute_parallel_vacuum_workers. + */ + nworkers = Min(nworkers, lps->pcxt->nworkers); + + /* Setup the shared cost-based vacuum delay and launch workers */ + if (nworkers > 0) + { + if (vacrel->num_index_scans > 0) + { + /* Reset the parallel index processing counter */ + pg_atomic_write_u32(&(lps->lvshared->idx), 0); + + /* Reinitialize the parallel context to relaunch parallel workers */ + ReinitializeParallelDSM(lps->pcxt); + } + + /* + * Set up shared cost balance and the number of active workers for + * vacuum delay. We need to do this before launching workers as + * otherwise, they might not see the updated values for these + * parameters. + */ + pg_atomic_write_u32(&(lps->lvshared->cost_balance), VacuumCostBalance); + pg_atomic_write_u32(&(lps->lvshared->active_nworkers), 0); + + /* + * The number of workers can vary between bulkdelete and cleanup + * phase. + */ + ReinitializeParallelWorkers(lps->pcxt, nworkers); + + LaunchParallelWorkers(lps->pcxt); + + if (lps->pcxt->nworkers_launched > 0) + { + /* + * Reset the local cost values for leader backend as we have + * already accumulated the remaining balance of heap. + */ + VacuumCostBalance = 0; + VacuumCostBalanceLocal = 0; + + /* Enable shared cost balance for leader backend */ + VacuumSharedCostBalance = &(lps->lvshared->cost_balance); + VacuumActiveNWorkers = &(lps->lvshared->active_nworkers); + } + + if (lps->lvshared->for_cleanup) + ereport(elevel, + (errmsg(ngettext("launched %d parallel vacuum worker for index cleanup (planned: %d)", + "launched %d parallel vacuum workers for index cleanup (planned: %d)", + lps->pcxt->nworkers_launched), + lps->pcxt->nworkers_launched, nworkers))); + else + ereport(elevel, + (errmsg(ngettext("launched %d parallel vacuum worker for index vacuuming (planned: %d)", + "launched %d parallel vacuum workers for index vacuuming (planned: %d)", + lps->pcxt->nworkers_launched), + lps->pcxt->nworkers_launched, nworkers))); + } + + /* Process the indexes that can be processed by only leader process */ + do_serial_processing_for_unsafe_indexes(vacrel, lps->lvshared); + + /* + * Join as a parallel worker. The leader process alone processes all the + * indexes in the case where no workers are launched. + */ + do_parallel_processing(vacrel, lps->lvshared); + + /* + * Next, accumulate buffer and WAL usage. (This must wait for the workers + * to finish, or we might get incomplete data.) 
+ */ + if (nworkers > 0) + { + /* Wait for all vacuum workers to finish */ + WaitForParallelWorkersToFinish(lps->pcxt); + + for (int i = 0; i < lps->pcxt->nworkers_launched; i++) + InstrAccumParallelQuery(&lps->buffer_usage[i], &lps->wal_usage[i]); + } + + /* + * Carry the shared balance value to heap scan and disable shared costing + */ + if (VacuumSharedCostBalance) + { + VacuumCostBalance = pg_atomic_read_u32(VacuumSharedCostBalance); + VacuumSharedCostBalance = NULL; + VacuumActiveNWorkers = NULL; + } +} + +/* + * Index vacuum/cleanup routine used by the leader process and parallel + * vacuum worker processes to process the indexes in parallel. + */ +static void +do_parallel_processing(LVRelState *vacrel, LVShared *lvshared) +{ + /* + * Increment the active worker count if we are able to launch any worker. + */ + if (VacuumActiveNWorkers) + pg_atomic_add_fetch_u32(VacuumActiveNWorkers, 1); + + /* Loop until all indexes are vacuumed */ + for (;;) + { + int idx; + LVSharedIndStats *shared_istat; + Relation indrel; + IndexBulkDeleteResult *istat; + + /* Get an index number to process */ + idx = pg_atomic_fetch_add_u32(&(lvshared->idx), 1); + + /* Done for all indexes? */ + if (idx >= vacrel->nindexes) + break; + + /* Get the index statistics space from DSM, if any */ + shared_istat = parallel_stats_for_idx(lvshared, idx); + + /* Skip indexes not participating in parallelism */ + if (shared_istat == NULL) + continue; + + indrel = vacrel->indrels[idx]; + + /* + * Skip processing indexes that are unsafe for workers (these are + * processed in do_serial_processing_for_unsafe_indexes() by leader) + */ + if (!parallel_processing_is_safe(indrel, lvshared)) + continue; + + /* Do vacuum or cleanup of the index */ + istat = (vacrel->indstats[idx]); + vacrel->indstats[idx] = parallel_process_one_index(indrel, istat, + lvshared, + shared_istat, + vacrel); + } + + /* + * We have completed the index vacuum so decrement the active worker + * count. + */ + if (VacuumActiveNWorkers) + pg_atomic_sub_fetch_u32(VacuumActiveNWorkers, 1); +} + +/* + * Perform parallel processing of indexes in leader process. + * + * Handles index vacuuming (or index cleanup) for indexes that are not + * parallel safe. It's possible that this will vary for a given index, based + * on details like whether we're performing for_cleanup processing right now. + * + * Also performs processing of smaller indexes that fell under the size cutoff + * enforced by compute_parallel_vacuum_workers(). These indexes never get a + * slot for statistics in DSM. + */ +static void +do_serial_processing_for_unsafe_indexes(LVRelState *vacrel, LVShared *lvshared) +{ + Assert(!IsParallelWorker()); + + /* + * Increment the active worker count if we are able to launch any worker. + */ + if (VacuumActiveNWorkers) + pg_atomic_add_fetch_u32(VacuumActiveNWorkers, 1); + + for (int idx = 0; idx < vacrel->nindexes; idx++) + { + LVSharedIndStats *shared_istat; + Relation indrel; + IndexBulkDeleteResult *istat; + + shared_istat = parallel_stats_for_idx(lvshared, idx); + indrel = vacrel->indrels[idx]; + + /* + * We're only here for the indexes that parallel workers won't + * process. Note that the shared_istat test ensures that we process + * indexes that fell under initial size cutoff. 
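+		 *
+		 * Put another way, the test that follows distinguishes three cases:
+		 *
+		 *		shared_istat == NULL		small index; process it here
+		 *		parallel-unsafe index		process it here
+		 *		otherwise					skip; a parallel worker handles it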
+ */ + if (shared_istat != NULL && + parallel_processing_is_safe(indrel, lvshared)) + continue; + + /* Do vacuum or cleanup of the index */ + istat = (vacrel->indstats[idx]); + vacrel->indstats[idx] = parallel_process_one_index(indrel, istat, + lvshared, + shared_istat, + vacrel); + } + + /* + * We have completed the index vacuum so decrement the active worker + * count. + */ + if (VacuumActiveNWorkers) + pg_atomic_sub_fetch_u32(VacuumActiveNWorkers, 1); +} + +/* + * Vacuum or cleanup index either by leader process or by one of the worker + * process. After processing the index this function copies the index + * statistics returned from ambulkdelete and amvacuumcleanup to the DSM + * segment. + */ +static IndexBulkDeleteResult * +parallel_process_one_index(Relation indrel, + IndexBulkDeleteResult *istat, + LVShared *lvshared, + LVSharedIndStats *shared_istat, + LVRelState *vacrel) +{ + IndexBulkDeleteResult *istat_res; + + /* + * Update the pointer to the corresponding bulk-deletion result if someone + * has already updated it + */ + if (shared_istat && shared_istat->updated && istat == NULL) + istat = &shared_istat->istat; + + /* Do vacuum or cleanup of the index */ + if (lvshared->for_cleanup) + istat_res = lazy_cleanup_one_index(indrel, istat, lvshared->reltuples, + lvshared->estimated_count, vacrel); + else + istat_res = lazy_vacuum_one_index(indrel, istat, lvshared->reltuples, + vacrel); + + /* + * Copy the index bulk-deletion result returned from ambulkdelete and + * amvacuumcleanup to the DSM segment if it's the first cycle because they + * allocate locally and it's possible that an index will be vacuumed by a + * different vacuum process the next cycle. Copying the result normally + * happens only the first time an index is vacuumed. For any additional + * vacuum pass, we directly point to the result on the DSM segment and + * pass it to vacuum index APIs so that workers can update it directly. + * + * Since all vacuum workers write the bulk-deletion result at different + * slots we can write them without locking. + */ + if (shared_istat && !shared_istat->updated && istat_res != NULL) + { + memcpy(&shared_istat->istat, istat_res, sizeof(IndexBulkDeleteResult)); + shared_istat->updated = true; + + /* Free the locally-allocated bulk-deletion result */ + pfree(istat_res); + + /* return the pointer to the result from shared memory */ + return &shared_istat->istat; + } + + return istat_res; +} + +/* + * lazy_cleanup_all_indexes() -- cleanup all indexes of relation. + */ +static void +lazy_cleanup_all_indexes(LVRelState *vacrel) +{ + Assert(!IsParallelWorker()); + Assert(vacrel->nindexes > 0); + + /* Report that we are now cleaning up indexes */ + pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, + PROGRESS_VACUUM_PHASE_INDEX_CLEANUP); + + if (!ParallelVacuumIsActive(vacrel)) + { + double reltuples = vacrel->new_rel_tuples; + bool estimated_count = + vacrel->tupcount_pages < vacrel->rel_pages; + + for (int idx = 0; idx < vacrel->nindexes; idx++) + { + Relation indrel = vacrel->indrels[idx]; + IndexBulkDeleteResult *istat = vacrel->indstats[idx]; + + vacrel->indstats[idx] = + lazy_cleanup_one_index(indrel, istat, reltuples, + estimated_count, vacrel); + } + } + else + { + /* Outsource everything to parallel variant */ + do_parallel_lazy_cleanup_all_indexes(vacrel); + } +} + +/* + * lazy_vacuum_one_index() -- vacuum index relation. + * + * Delete all the index entries pointing to tuples listed in + * dead_tuples, and update running statistics. 
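+ *		Deletion is driven by index_bulk_delete(), with lazy_tid_reaped()
+ *		passed as the IndexBulkDeleteCallback and the sorted dead_tuples
+ *		array as its callback state.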
+ * + * reltuples is the number of heap tuples to be passed to the + * bulkdelete callback. It's always assumed to be estimated. + * + * Returns bulk delete stats derived from input stats + */ +static IndexBulkDeleteResult * +lazy_vacuum_one_index(Relation indrel, IndexBulkDeleteResult *istat, + double reltuples, LVRelState *vacrel) +{ + IndexVacuumInfo ivinfo; + PGRUsage ru0; + LVSavedErrInfo saved_err_info; + + pg_rusage_init(&ru0); + + ivinfo.index = indrel; + ivinfo.analyze_only = false; + ivinfo.report_progress = false; + ivinfo.estimated_count = true; + ivinfo.message_level = elevel; + ivinfo.num_heap_tuples = reltuples; + ivinfo.strategy = vacrel->bstrategy; + + /* + * Update error traceback information. + * + * The index name is saved during this phase and restored immediately + * after this phase. See vacuum_error_callback. + */ + Assert(vacrel->indname == NULL); + vacrel->indname = pstrdup(RelationGetRelationName(indrel)); + update_vacuum_error_info(vacrel, &saved_err_info, + VACUUM_ERRCB_PHASE_VACUUM_INDEX, + InvalidBlockNumber, InvalidOffsetNumber); + + /* Do bulk deletion */ + istat = index_bulk_delete(&ivinfo, istat, lazy_tid_reaped, + (void *) vacrel->dead_tuples); + + ereport(elevel, + (errmsg("scanned index \"%s\" to remove %d row versions", + vacrel->indname, vacrel->dead_tuples->num_tuples), + errdetail_internal("%s", pg_rusage_show(&ru0)))); + + /* Revert to the previous phase information for error traceback */ + restore_vacuum_error_info(vacrel, &saved_err_info); + pfree(vacrel->indname); + vacrel->indname = NULL; + + return istat; +} + +/* + * lazy_cleanup_one_index() -- do post-vacuum cleanup for index relation. + * + * reltuples is the number of heap tuples and estimated_count is true + * if reltuples is an estimated value. + * + * Returns bulk delete stats derived from input stats + */ +static IndexBulkDeleteResult * +lazy_cleanup_one_index(Relation indrel, IndexBulkDeleteResult *istat, + double reltuples, bool estimated_count, + LVRelState *vacrel) +{ + IndexVacuumInfo ivinfo; + PGRUsage ru0; + LVSavedErrInfo saved_err_info; + + pg_rusage_init(&ru0); + + ivinfo.index = indrel; + ivinfo.analyze_only = false; + ivinfo.report_progress = false; + ivinfo.estimated_count = estimated_count; + ivinfo.message_level = elevel; + + ivinfo.num_heap_tuples = reltuples; + ivinfo.strategy = vacrel->bstrategy; + + /* + * Update error traceback information. + * + * The index name is saved during this phase and restored immediately + * after this phase. See vacuum_error_callback. 
+ */ + Assert(vacrel->indname == NULL); + vacrel->indname = pstrdup(RelationGetRelationName(indrel)); + update_vacuum_error_info(vacrel, &saved_err_info, + VACUUM_ERRCB_PHASE_INDEX_CLEANUP, + InvalidBlockNumber, InvalidOffsetNumber); + + istat = index_vacuum_cleanup(&ivinfo, istat); + + if (istat) + { + ereport(elevel, + (errmsg("index \"%s\" now contains %.0f row versions in %u pages", + RelationGetRelationName(indrel), + (istat)->num_index_tuples, + (istat)->num_pages), + errdetail("%.0f index row versions were removed.\n" + "%u index pages were newly deleted.\n" + "%u index pages are currently deleted, of which %u are currently reusable.\n" + "%s.", + (istat)->tuples_removed, + (istat)->pages_newly_deleted, + (istat)->pages_deleted, (istat)->pages_free, + pg_rusage_show(&ru0)))); + } + + /* Revert to the previous phase information for error traceback */ + restore_vacuum_error_info(vacrel, &saved_err_info); + pfree(vacrel->indname); + vacrel->indname = NULL; + + return istat; +} + +/* + * should_attempt_truncation - should we attempt to truncate the heap? + * + * Don't even think about it unless we have a shot at releasing a goodly + * number of pages. Otherwise, the time taken isn't worth it. + * + * Also don't attempt it if wraparound failsafe is in effect. It's hard to + * predict how long lazy_truncate_heap will take. Don't take any chances. + * There is very little chance of truncation working out when the failsafe is + * in effect in any case. lazy_scan_prune makes the optimistic assumption + * that any LP_DEAD items it encounters will always be LP_UNUSED by the time + * we're called. + * + * Also don't attempt it if we are doing early pruning/vacuuming, because a + * scan which cannot find a truncated heap page cannot determine that the + * snapshot is too old to read that page. + * + * This is split out so that we can test whether truncation is going to be + * called for before we actually do it. If you change the logic here, be + * careful to depend only on fields that lazy_scan_heap updates on-the-fly. + */ +static bool +should_attempt_truncation(LVRelState *vacrel) +{ + BlockNumber possibly_freeable; + + if (!vacrel->do_rel_truncate || vacrel->failsafe_active) + return false; + + possibly_freeable = vacrel->rel_pages - vacrel->nonempty_pages; + if (possibly_freeable > 0 && + (possibly_freeable >= REL_TRUNCATE_MINIMUM || + possibly_freeable >= vacrel->rel_pages / REL_TRUNCATE_FRACTION) && + old_snapshot_threshold < 0) + return true; + else + return false; +} + +/* + * lazy_truncate_heap - try to truncate off any empty pages at the end + */ +static void +lazy_truncate_heap(LVRelState *vacrel) +{ + BlockNumber old_rel_pages = vacrel->rel_pages; + BlockNumber new_rel_pages; + bool lock_waiter_detected; + int lock_retry; + + /* Report that we are now truncating */ + pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, + PROGRESS_VACUUM_PHASE_TRUNCATE); + + /* + * Loop until no more truncating can be done. + */ + do + { + PGRUsage ru0; + + pg_rusage_init(&ru0); + + /* + * We need full exclusive lock on the relation in order to do + * truncation. If we can't get it, give up rather than waiting --- we + * don't want to block other backends, and we don't want to deadlock + * (which is quite possible considering we already hold a lower-grade + * lock). + */ + lock_waiter_detected = false; + lock_retry = 0; + while (true) + { + if (ConditionalLockRelation(vacrel->rel, AccessExclusiveLock)) + break; + + /* + * Check for interrupts while trying to (re-)acquire the exclusive + * lock. 
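+			 *
+			 * Assuming the intervals defined near the top of this file (a
+			 * 50ms wait between attempts and a 5s overall timeout), the
+			 * retry test below gives up after roughly 100 attempts.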
+ */ + CHECK_FOR_INTERRUPTS(); + + if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT / + VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL)) + { + /* + * We failed to establish the lock in the specified number of + * retries. This means we give up truncating. + */ + ereport(elevel, + (errmsg("\"%s\": stopping truncate due to conflicting lock request", + vacrel->relname))); + return; + } + + pg_usleep(VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL * 1000L); + } + + /* + * Now that we have exclusive lock, look to see if the rel has grown + * whilst we were vacuuming with non-exclusive lock. If so, give up; + * the newly added pages presumably contain non-deletable tuples. + */ + new_rel_pages = RelationGetNumberOfBlocks(vacrel->rel); + if (new_rel_pages != old_rel_pages) + { + /* + * Note: we intentionally don't update vacrel->rel_pages with the + * new rel size here. If we did, it would amount to assuming that + * the new pages are empty, which is unlikely. Leaving the numbers + * alone amounts to assuming that the new pages have the same + * tuple density as existing ones, which is less unlikely. + */ + UnlockRelation(vacrel->rel, AccessExclusiveLock); + return; + } + + /* + * Scan backwards from the end to verify that the end pages actually + * contain no tuples. This is *necessary*, not optional, because + * other backends could have added tuples to these pages whilst we + * were vacuuming. + */ + new_rel_pages = count_nondeletable_pages(vacrel, &lock_waiter_detected); + vacrel->blkno = new_rel_pages; + + if (new_rel_pages >= old_rel_pages) + { + /* can't do anything after all */ + UnlockRelation(vacrel->rel, AccessExclusiveLock); + return; + } + + /* + * Okay to truncate. + */ + RelationTruncate(vacrel->rel, new_rel_pages); + + /* + * We can release the exclusive lock as soon as we have truncated. + * Other backends can't safely access the relation until they have + * processed the smgr invalidation that smgrtruncate sent out ... but + * that should happen as part of standard invalidation processing once + * they acquire lock on the relation. + */ + UnlockRelation(vacrel->rel, AccessExclusiveLock); + + /* + * Update statistics. Here, it *is* correct to adjust rel_pages + * without also touching reltuples, since the tuple count wasn't + * changed by the truncation. + */ + vacrel->pages_removed += old_rel_pages - new_rel_pages; + vacrel->rel_pages = new_rel_pages; + + ereport(elevel, + (errmsg("table \"%s\": truncated %u to %u pages", + vacrel->relname, + old_rel_pages, new_rel_pages), + errdetail_internal("%s", + pg_rusage_show(&ru0)))); + old_rel_pages = new_rel_pages; + } while (new_rel_pages > vacrel->nonempty_pages && lock_waiter_detected); +} + +/* + * Rescan end pages to verify that they are (still) empty of tuples. + * + * Returns number of nondeletable pages (last nonempty page + 1). + */ +static BlockNumber +count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected) +{ + BlockNumber blkno; + BlockNumber prefetchedUntil; + instr_time starttime; + + /* Initialize the starttime if we check for conflicting lock requests */ + INSTR_TIME_SET_CURRENT(starttime); + + /* + * Start checking blocks at what we believe relation end to be and move + * backwards. (Strange coding of loop control is needed because blkno is + * unsigned.) To make the scan faster, we prefetch a few blocks at a time + * in forward direction, so that OS-level readahead can kick in. 
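+	 *
+	 * For example, assuming PREFETCH_SIZE is 32: when the scan steps back
+	 * to block 70, blocks 64..70 (70 & ~31 == 64) are prefetched in forward
+	 * order before block 70 itself is read.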
+ */ + blkno = vacrel->rel_pages; + StaticAssertStmt((PREFETCH_SIZE & (PREFETCH_SIZE - 1)) == 0, + "prefetch size must be power of 2"); + prefetchedUntil = InvalidBlockNumber; + while (blkno > vacrel->nonempty_pages) + { + Buffer buf; + Page page; + OffsetNumber offnum, + maxoff; + bool hastup; + + /* + * Check if another process requests a lock on our relation. We are + * holding an AccessExclusiveLock here, so they will be waiting. We + * only do this once per VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL, and we + * only check if that interval has elapsed once every 32 blocks to + * keep the number of system calls and actual shared lock table + * lookups to a minimum. + */ + if ((blkno % 32) == 0) + { + instr_time currenttime; + instr_time elapsed; + + INSTR_TIME_SET_CURRENT(currenttime); + elapsed = currenttime; + INSTR_TIME_SUBTRACT(elapsed, starttime); + if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000) + >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL) + { + if (LockHasWaitersRelation(vacrel->rel, AccessExclusiveLock)) + { + ereport(elevel, + (errmsg("table \"%s\": suspending truncate due to conflicting lock request", + vacrel->relname))); + + *lock_waiter_detected = true; + return blkno; + } + starttime = currenttime; + } + } + + /* + * We don't insert a vacuum delay point here, because we have an + * exclusive lock on the table which we want to hold for as short a + * time as possible. We still need to check for interrupts however. + */ + CHECK_FOR_INTERRUPTS(); + + blkno--; + + /* If we haven't prefetched this lot yet, do so now. */ + if (prefetchedUntil > blkno) + { + BlockNumber prefetchStart; + BlockNumber pblkno; + + prefetchStart = blkno & ~(PREFETCH_SIZE - 1); + for (pblkno = prefetchStart; pblkno <= blkno; pblkno++) + { + PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, pblkno); + CHECK_FOR_INTERRUPTS(); + } + prefetchedUntil = prefetchStart; + } + + buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL, + vacrel->bstrategy); + + /* In this phase we only need shared access to the buffer */ + LockBuffer(buf, BUFFER_LOCK_SHARE); + + page = BufferGetPage(buf); + + if (PageIsNew(page) || PageIsEmpty(page)) + { + UnlockReleaseBuffer(buf); + continue; + } + + hastup = false; + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid; + + itemid = PageGetItemId(page, offnum); + + /* + * Note: any non-unused item should be taken as a reason to keep + * this page. Even an LP_DEAD item makes truncation unsafe, since + * we must not have cleaned out its index entries. + */ + if (ItemIdIsUsed(itemid)) + { + hastup = true; + break; /* can stop scanning */ + } + } /* scan along page */ + + UnlockReleaseBuffer(buf); + + /* Done scanning if we found a tuple here */ + if (hastup) + return blkno + 1; + } + + /* + * If we fall out of the loop, all the previously-thought-to-be-empty + * pages still are; we need not bother to look at the last known-nonempty + * page. + */ + return vacrel->nonempty_pages; +} + +/* + * Return the maximum number of dead tuples we can record. + */ +static long +compute_max_dead_tuples(BlockNumber relblocks, bool hasindex) +{ + long maxtuples; + int vac_work_mem = IsAutoVacuumWorkerProcess() && + autovacuum_work_mem != -1 ? 
+ autovacuum_work_mem : maintenance_work_mem; + + if (hasindex) + { + maxtuples = MAXDEADTUPLES(vac_work_mem * 1024L); + maxtuples = Min(maxtuples, INT_MAX); + maxtuples = Min(maxtuples, MAXDEADTUPLES(MaxAllocSize)); + + /* curious coding here to ensure the multiplication can't overflow */ + if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks) + maxtuples = relblocks * LAZY_ALLOC_TUPLES; + + /* stay sane if small maintenance_work_mem */ + maxtuples = Max(maxtuples, MaxHeapTuplesPerPage); + } + else + maxtuples = MaxHeapTuplesPerPage; + + return maxtuples; +} + +/* + * lazy_space_alloc - space allocation decisions for lazy vacuum + * + * See the comments at the head of this file for rationale. + */ +static void +lazy_space_alloc(LVRelState *vacrel, int nworkers, BlockNumber nblocks) +{ + LVDeadTuples *dead_tuples; + long maxtuples; + + /* + * Initialize state for a parallel vacuum. As of now, only one worker can + * be used for an index, so we invoke parallelism only if there are at + * least two indexes on a table. + */ + if (nworkers >= 0 && vacrel->nindexes > 1 && vacrel->do_index_vacuuming) + { + /* + * Since parallel workers cannot access data in temporary tables, we + * can't perform parallel vacuum on them. + */ + if (RelationUsesLocalBuffers(vacrel->rel)) + { + /* + * Give warning only if the user explicitly tries to perform a + * parallel vacuum on the temporary table. + */ + if (nworkers > 0) + ereport(WARNING, + (errmsg("disabling parallel option of vacuum on \"%s\" --- cannot vacuum temporary tables in parallel", + vacrel->relname))); + } + else + vacrel->lps = begin_parallel_vacuum(vacrel, nblocks, nworkers); + + /* If parallel mode started, we're done */ + if (ParallelVacuumIsActive(vacrel)) + return; + } + + maxtuples = compute_max_dead_tuples(nblocks, vacrel->nindexes > 0); + + dead_tuples = (LVDeadTuples *) palloc(SizeOfDeadTuples(maxtuples)); + dead_tuples->num_tuples = 0; + dead_tuples->max_tuples = (int) maxtuples; + + vacrel->dead_tuples = dead_tuples; +} + +/* + * lazy_space_free - free space allocated in lazy_space_alloc + */ +static void +lazy_space_free(LVRelState *vacrel) +{ + if (!ParallelVacuumIsActive(vacrel)) + return; + + /* + * End parallel mode before updating index statistics as we cannot write + * during parallel mode. + */ + end_parallel_vacuum(vacrel); +} + +/* + * lazy_tid_reaped() -- is a particular tid deletable? + * + * This has the right signature to be an IndexBulkDeleteCallback. + * + * Assumes dead_tuples array is in sorted order. + */ +static bool +lazy_tid_reaped(ItemPointer itemptr, void *state) +{ + LVDeadTuples *dead_tuples = (LVDeadTuples *) state; + int64 litem, + ritem, + item; + ItemPointer res; + + litem = itemptr_encode(&dead_tuples->itemptrs[0]); + ritem = itemptr_encode(&dead_tuples->itemptrs[dead_tuples->num_tuples - 1]); + item = itemptr_encode(itemptr); + + /* + * Doing a simple bound check before bsearch() is useful to avoid the + * extra cost of bsearch(), especially if dead tuples on the heap are + * concentrated in a certain range. Since this function is called for + * every index tuple, it pays to be really fast. + */ + if (item < litem || item > ritem) + return false; + + res = (ItemPointer) bsearch((void *) itemptr, + (void *) dead_tuples->itemptrs, + dead_tuples->num_tuples, + sizeof(ItemPointerData), + vac_cmp_itemptr); + + return (res != NULL); +} + +/* + * Comparator routines for use with qsort() and bsearch(). 
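+ *
+ * The resulting order is (block number, offset number), e.g.
+ * (10,3) < (10,7) < (11,1), matching the order in which dead TIDs are
+ * collected during the forward heap scan. lazy_tid_reaped() relies on
+ * this ordering for both its cheap bound check and its bsearch() call.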
+ */ +static int +vac_cmp_itemptr(const void *left, const void *right) +{ + BlockNumber lblk, + rblk; + OffsetNumber loff, + roff; + + lblk = ItemPointerGetBlockNumber((ItemPointer) left); + rblk = ItemPointerGetBlockNumber((ItemPointer) right); + + if (lblk < rblk) + return -1; + if (lblk > rblk) + return 1; + + loff = ItemPointerGetOffsetNumber((ItemPointer) left); + roff = ItemPointerGetOffsetNumber((ItemPointer) right); + + if (loff < roff) + return -1; + if (loff > roff) + return 1; + + return 0; +} + +/* + * Check if every tuple in the given page is visible to all current and future + * transactions. Also return the visibility_cutoff_xid which is the highest + * xmin amongst the visible tuples. Set *all_frozen to true if every tuple + * on this page is frozen. + */ +static bool +heap_page_is_all_visible(LVRelState *vacrel, Buffer buf, + TransactionId *visibility_cutoff_xid, + bool *all_frozen) +{ + Page page = BufferGetPage(buf); + BlockNumber blockno = BufferGetBlockNumber(buf); + OffsetNumber offnum, + maxoff; + bool all_visible = true; + + *visibility_cutoff_xid = InvalidTransactionId; + *all_frozen = true; + + /* + * This is a stripped down version of the line pointer scan in + * lazy_scan_heap(). So if you change anything here, also check that code. + */ + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; + offnum <= maxoff && all_visible; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid; + HeapTupleData tuple; + + /* + * Set the offset number so that we can display it along with any + * error that occurred while processing this tuple. + */ + vacrel->offnum = offnum; + itemid = PageGetItemId(page, offnum); + + /* Unused or redirect line pointers are of no interest */ + if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid)) + continue; + + ItemPointerSet(&(tuple.t_self), blockno, offnum); + + /* + * Dead line pointers can have index pointers pointing to them. So + * they can't be treated as visible + */ + if (ItemIdIsDead(itemid)) + { + all_visible = false; + *all_frozen = false; + break; + } + + Assert(ItemIdIsNormal(itemid)); + + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); + tuple.t_len = ItemIdGetLength(itemid); + tuple.t_tableOid = RelationGetRelid(vacrel->rel); + + switch (HeapTupleSatisfiesVacuum(&tuple, vacrel->OldestXmin, buf)) + { + case HEAPTUPLE_LIVE: + { + TransactionId xmin; + + /* Check comments in lazy_scan_heap. */ + if (!HeapTupleHeaderXminCommitted(tuple.t_data)) + { + all_visible = false; + *all_frozen = false; + break; + } + + /* + * The inserter definitely committed. But is it old enough + * that everyone sees it as committed? + */ + xmin = HeapTupleHeaderGetXmin(tuple.t_data); + if (!TransactionIdPrecedes(xmin, vacrel->OldestXmin)) + { + all_visible = false; + *all_frozen = false; + break; + } + + /* Track newest xmin on page. */ + if (TransactionIdFollows(xmin, *visibility_cutoff_xid)) + *visibility_cutoff_xid = xmin; + + /* Check whether this tuple is already frozen or not */ + if (all_visible && *all_frozen && + heap_tuple_needs_eventual_freeze(tuple.t_data)) + *all_frozen = false; + } + break; + + case HEAPTUPLE_DEAD: + case HEAPTUPLE_RECENTLY_DEAD: + case HEAPTUPLE_INSERT_IN_PROGRESS: + case HEAPTUPLE_DELETE_IN_PROGRESS: + { + all_visible = false; + *all_frozen = false; + break; + } + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + break; + } + } /* scan along page */ + + /* Clear the offset information once we have processed the given page. 
*/ + vacrel->offnum = InvalidOffsetNumber; + + return all_visible; +} + +/* + * Compute the number of parallel worker processes to request. Both index + * vacuum and index cleanup can be executed with parallel workers. The index + * is eligible for parallel vacuum iff its size is greater than + * min_parallel_index_scan_size as invoking workers for very small indexes + * can hurt performance. + * + * nrequested is the number of parallel workers that user requested. If + * nrequested is 0, we compute the parallel degree based on nindexes, that is + * the number of indexes that support parallel vacuum. This function also + * sets will_parallel_vacuum to remember indexes that participate in parallel + * vacuum. + */ +static int +compute_parallel_vacuum_workers(LVRelState *vacrel, int nrequested, + bool *will_parallel_vacuum) +{ + int nindexes_parallel = 0; + int nindexes_parallel_bulkdel = 0; + int nindexes_parallel_cleanup = 0; + int parallel_workers; + + /* + * We don't allow performing parallel operation in standalone backend or + * when parallelism is disabled. + */ + if (!IsUnderPostmaster || max_parallel_maintenance_workers == 0) + return 0; + + /* + * Compute the number of indexes that can participate in parallel vacuum. + */ + for (int idx = 0; idx < vacrel->nindexes; idx++) + { + Relation indrel = vacrel->indrels[idx]; + uint8 vacoptions = indrel->rd_indam->amparallelvacuumoptions; + + if (vacoptions == VACUUM_OPTION_NO_PARALLEL || + RelationGetNumberOfBlocks(indrel) < min_parallel_index_scan_size) + continue; + + will_parallel_vacuum[idx] = true; + + if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) != 0) + nindexes_parallel_bulkdel++; + if (((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) != 0) || + ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0)) + nindexes_parallel_cleanup++; + } + + nindexes_parallel = Max(nindexes_parallel_bulkdel, + nindexes_parallel_cleanup); + + /* The leader process takes one index */ + nindexes_parallel--; + + /* No index supports parallel vacuum */ + if (nindexes_parallel <= 0) + return 0; + + /* Compute the parallel degree */ + parallel_workers = (nrequested > 0) ? + Min(nrequested, nindexes_parallel) : nindexes_parallel; + + /* Cap by max_parallel_maintenance_workers */ + parallel_workers = Min(parallel_workers, max_parallel_maintenance_workers); + + return parallel_workers; +} + +/* + * Update index statistics in pg_class if the statistics are accurate. + */ +static void +update_index_statistics(LVRelState *vacrel) +{ + Relation *indrels = vacrel->indrels; + int nindexes = vacrel->nindexes; + IndexBulkDeleteResult **indstats = vacrel->indstats; + + Assert(!IsInParallelMode()); + + for (int idx = 0; idx < nindexes; idx++) + { + Relation indrel = indrels[idx]; + IndexBulkDeleteResult *istat = indstats[idx]; + + if (istat == NULL || istat->estimated_count) + continue; + + /* Update index statistics */ + vac_update_relstats(indrel, + istat->num_pages, + istat->num_index_tuples, + 0, + false, + InvalidTransactionId, + InvalidMultiXactId, + false); + } +} + +/* + * This function prepares and returns parallel vacuum state if we can launch + * even one worker. This function is responsible for entering parallel mode, + * create a parallel context, and then initialize the DSM segment. 
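+ *
+ * DSM layout sketch (the toc keys set up below): PARALLEL_VACUUM_KEY_SHARED,
+ * PARALLEL_VACUUM_KEY_DEAD_TUPLES, PARALLEL_VACUUM_KEY_BUFFER_USAGE,
+ * PARALLEL_VACUUM_KEY_WAL_USAGE and, when a query string is available,
+ * PARALLEL_VACUUM_KEY_QUERY_TEXT.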
+ */ +static LVParallelState * +begin_parallel_vacuum(LVRelState *vacrel, BlockNumber nblocks, + int nrequested) +{ + LVParallelState *lps = NULL; + Relation *indrels = vacrel->indrels; + int nindexes = vacrel->nindexes; + ParallelContext *pcxt; + LVShared *shared; + LVDeadTuples *dead_tuples; + BufferUsage *buffer_usage; + WalUsage *wal_usage; + bool *will_parallel_vacuum; + long maxtuples; + Size est_shared; + Size est_deadtuples; + int nindexes_mwm = 0; + int parallel_workers = 0; + int querylen; + + /* + * A parallel vacuum must be requested and there must be indexes on the + * relation + */ + Assert(nrequested >= 0); + Assert(nindexes > 0); + + /* + * Compute the number of parallel vacuum workers to launch + */ + will_parallel_vacuum = (bool *) palloc0(sizeof(bool) * nindexes); + parallel_workers = compute_parallel_vacuum_workers(vacrel, + nrequested, + will_parallel_vacuum); + + /* Can't perform vacuum in parallel */ + if (parallel_workers <= 0) + { + pfree(will_parallel_vacuum); + return lps; + } + + lps = (LVParallelState *) palloc0(sizeof(LVParallelState)); + + EnterParallelMode(); + pcxt = CreateParallelContext("postgres", "parallel_vacuum_main", + parallel_workers); + Assert(pcxt->nworkers > 0); + lps->pcxt = pcxt; + + /* Estimate size for shared information -- PARALLEL_VACUUM_KEY_SHARED */ + est_shared = MAXALIGN(add_size(SizeOfLVShared, BITMAPLEN(nindexes))); + for (int idx = 0; idx < nindexes; idx++) + { + Relation indrel = indrels[idx]; + uint8 vacoptions = indrel->rd_indam->amparallelvacuumoptions; + + /* + * Cleanup option should be either disabled, always performing in + * parallel or conditionally performing in parallel. + */ + Assert(((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) == 0) || + ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) == 0)); + Assert(vacoptions <= VACUUM_OPTION_MAX_VALID_VALUE); + + /* Skip indexes that don't participate in parallel vacuum */ + if (!will_parallel_vacuum[idx]) + continue; + + if (indrel->rd_indam->amusemaintenanceworkmem) + nindexes_mwm++; + + est_shared = add_size(est_shared, sizeof(LVSharedIndStats)); + + /* + * Remember the number of indexes that support parallel operation for + * each phase. + */ + if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) != 0) + lps->nindexes_parallel_bulkdel++; + if ((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) != 0) + lps->nindexes_parallel_cleanup++; + if ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0) + lps->nindexes_parallel_condcleanup++; + } + shm_toc_estimate_chunk(&pcxt->estimator, est_shared); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* Estimate size for dead tuples -- PARALLEL_VACUUM_KEY_DEAD_TUPLES */ + maxtuples = compute_max_dead_tuples(nblocks, true); + est_deadtuples = MAXALIGN(SizeOfDeadTuples(maxtuples)); + shm_toc_estimate_chunk(&pcxt->estimator, est_deadtuples); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* + * Estimate space for BufferUsage and WalUsage -- + * PARALLEL_VACUUM_KEY_BUFFER_USAGE and PARALLEL_VACUUM_KEY_WAL_USAGE. + * + * If there are no extensions loaded that care, we could skip this. We + * have no way of knowing whether anyone's looking at pgBufferUsage or + * pgWalUsage, so do it unconditionally. 
+ */ + shm_toc_estimate_chunk(&pcxt->estimator, + mul_size(sizeof(BufferUsage), pcxt->nworkers)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + shm_toc_estimate_chunk(&pcxt->estimator, + mul_size(sizeof(WalUsage), pcxt->nworkers)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* Finally, estimate PARALLEL_VACUUM_KEY_QUERY_TEXT space */ + if (debug_query_string) + { + querylen = strlen(debug_query_string); + shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1); + shm_toc_estimate_keys(&pcxt->estimator, 1); + } + else + querylen = 0; /* keep compiler quiet */ + + InitializeParallelDSM(pcxt); + + /* Prepare shared information */ + shared = (LVShared *) shm_toc_allocate(pcxt->toc, est_shared); + MemSet(shared, 0, est_shared); + shared->relid = RelationGetRelid(vacrel->rel); + shared->elevel = elevel; + shared->maintenance_work_mem_worker = + (nindexes_mwm > 0) ? + maintenance_work_mem / Min(parallel_workers, nindexes_mwm) : + maintenance_work_mem; + + pg_atomic_init_u32(&(shared->cost_balance), 0); + pg_atomic_init_u32(&(shared->active_nworkers), 0); + pg_atomic_init_u32(&(shared->idx), 0); + shared->offset = MAXALIGN(add_size(SizeOfLVShared, BITMAPLEN(nindexes))); + + /* + * Initialize variables for shared index statistics, set NULL bitmap and + * the size of stats for each index. + */ + memset(shared->bitmap, 0x00, BITMAPLEN(nindexes)); + for (int idx = 0; idx < nindexes; idx++) + { + if (!will_parallel_vacuum[idx]) + continue; + + /* Set NOT NULL as this index does support parallelism */ + shared->bitmap[idx >> 3] |= 1 << (idx & 0x07); + } + + shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_SHARED, shared); + lps->lvshared = shared; + + /* Prepare the dead tuple space */ + dead_tuples = (LVDeadTuples *) shm_toc_allocate(pcxt->toc, est_deadtuples); + dead_tuples->max_tuples = maxtuples; + dead_tuples->num_tuples = 0; + MemSet(dead_tuples->itemptrs, 0, sizeof(ItemPointerData) * maxtuples); + shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_DEAD_TUPLES, dead_tuples); + vacrel->dead_tuples = dead_tuples; + + /* + * Allocate space for each worker's BufferUsage and WalUsage; no need to + * initialize + */ + buffer_usage = shm_toc_allocate(pcxt->toc, + mul_size(sizeof(BufferUsage), pcxt->nworkers)); + shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, buffer_usage); + lps->buffer_usage = buffer_usage; + wal_usage = shm_toc_allocate(pcxt->toc, + mul_size(sizeof(WalUsage), pcxt->nworkers)); + shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_WAL_USAGE, wal_usage); + lps->wal_usage = wal_usage; + + /* Store query string for workers */ + if (debug_query_string) + { + char *sharedquery; + + sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1); + memcpy(sharedquery, debug_query_string, querylen + 1); + sharedquery[querylen] = '\0'; + shm_toc_insert(pcxt->toc, + PARALLEL_VACUUM_KEY_QUERY_TEXT, sharedquery); + } + + pfree(will_parallel_vacuum); + return lps; +} + +/* + * Destroy the parallel context, and end parallel mode. + * + * Since writes are not allowed during parallel mode, copy the + * updated index statistics from DSM into local memory and then later use that + * to update the index statistics. One might think that we can exit from + * parallel mode, update the index statistics and then destroy parallel + * context, but that won't be safe (see ExitParallelMode). 
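+ *
+ * Hence the order below: copy the index statistics out of the DSM segment
+ * first, then DestroyParallelContext() and ExitParallelMode(); the copied
+ * statistics are only written out afterwards, outside parallel mode (see
+ * update_index_statistics()).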
+ */ +static void +end_parallel_vacuum(LVRelState *vacrel) +{ + IndexBulkDeleteResult **indstats = vacrel->indstats; + LVParallelState *lps = vacrel->lps; + int nindexes = vacrel->nindexes; + + Assert(!IsParallelWorker()); + + /* Copy the updated statistics */ + for (int idx = 0; idx < nindexes; idx++) + { + LVSharedIndStats *shared_istat; + + shared_istat = parallel_stats_for_idx(lps->lvshared, idx); + + /* + * Skip index -- it must have been processed by the leader, from + * inside do_serial_processing_for_unsafe_indexes() + */ + if (shared_istat == NULL) + continue; + + if (shared_istat->updated) + { + indstats[idx] = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + memcpy(indstats[idx], &(shared_istat->istat), sizeof(IndexBulkDeleteResult)); + } + else + indstats[idx] = NULL; + } + + DestroyParallelContext(lps->pcxt); + ExitParallelMode(); + + /* Deactivate parallel vacuum */ + pfree(lps); + vacrel->lps = NULL; +} + +/* + * Return shared memory statistics for index at offset 'getidx', if any + * + * Returning NULL indicates that compute_parallel_vacuum_workers() determined + * that the index is a totally unsuitable target for all parallel processing + * up front. For example, the index could be < min_parallel_index_scan_size + * cutoff. + */ +static LVSharedIndStats * +parallel_stats_for_idx(LVShared *lvshared, int getidx) +{ + char *p; + + if (IndStatsIsNull(lvshared, getidx)) + return NULL; + + p = (char *) GetSharedIndStats(lvshared); + for (int idx = 0; idx < getidx; idx++) + { + if (IndStatsIsNull(lvshared, idx)) + continue; + + p += sizeof(LVSharedIndStats); + } + + return (LVSharedIndStats *) p; +} + +/* + * Returns false, if the given index can't participate in parallel index + * vacuum or parallel index cleanup + */ +static bool +parallel_processing_is_safe(Relation indrel, LVShared *lvshared) +{ + uint8 vacoptions = indrel->rd_indam->amparallelvacuumoptions; + + /* first_time must be true only if for_cleanup is true */ + Assert(lvshared->for_cleanup || !lvshared->first_time); + + if (lvshared->for_cleanup) + { + /* Skip, if the index does not support parallel cleanup */ + if (((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) == 0) && + ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) == 0)) + return false; + + /* + * Skip, if the index supports parallel cleanup conditionally, but we + * have already processed the index (for bulkdelete). See the + * comments for option VACUUM_OPTION_PARALLEL_COND_CLEANUP to know + * when indexes support parallel cleanup conditionally. + */ + if (!lvshared->first_time && + ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0)) + return false; + } + else if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) == 0) + { + /* Skip if the index does not support parallel bulk deletion */ + return false; + } + + return true; +} + +/* + * Perform work within a launched parallel process. + * + * Since parallel vacuum workers perform only index vacuum or index cleanup, + * we don't need to report progress information. + */ +void +parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) +{ + Relation rel; + Relation *indrels; + LVShared *lvshared; + LVDeadTuples *dead_tuples; + BufferUsage *buffer_usage; + WalUsage *wal_usage; + int nindexes; + char *sharedquery; + LVRelState vacrel; + ErrorContextCallback errcallback; + + /* + * A parallel vacuum worker must have only PROC_IN_VACUUM flag since we + * don't support parallel vacuum for autovacuum as of now. 
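+	 *
+	 * Overview of the steps below (sketch): look up the shared state and
+	 * the dead-tuple array in the DSM segment, open the heap and its
+	 * indexes with the same lock modes as the leader, run
+	 * do_parallel_processing(), and finally report buffer/WAL usage via
+	 * InstrEndParallelQuery().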
+ */ + Assert(MyProc->statusFlags == PROC_IN_VACUUM); + + lvshared = (LVShared *) shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_SHARED, + false); + elevel = lvshared->elevel; + + if (lvshared->for_cleanup) + elog(DEBUG1, "starting parallel vacuum worker for cleanup"); + else + elog(DEBUG1, "starting parallel vacuum worker for bulk delete"); + + /* Set debug_query_string for individual workers */ + sharedquery = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_QUERY_TEXT, true); + debug_query_string = sharedquery; + pgstat_report_activity(STATE_RUNNING, debug_query_string); + + /* + * Open table. The lock mode is the same as the leader process. It's + * okay because the lock mode does not conflict among the parallel + * workers. + */ + rel = table_open(lvshared->relid, ShareUpdateExclusiveLock); + + /* + * Open all indexes. indrels are sorted in order by OID, which should be + * matched to the leader's one. + */ + vac_open_indexes(rel, RowExclusiveLock, &nindexes, &indrels); + Assert(nindexes > 0); + + /* Set dead tuple space */ + dead_tuples = (LVDeadTuples *) shm_toc_lookup(toc, + PARALLEL_VACUUM_KEY_DEAD_TUPLES, + false); + + /* Set cost-based vacuum delay */ + VacuumCostActive = (VacuumCostDelay > 0); + VacuumCostBalance = 0; + VacuumPageHit = 0; + VacuumPageMiss = 0; + VacuumPageDirty = 0; + VacuumCostBalanceLocal = 0; + VacuumSharedCostBalance = &(lvshared->cost_balance); + VacuumActiveNWorkers = &(lvshared->active_nworkers); + + vacrel.rel = rel; + vacrel.indrels = indrels; + vacrel.nindexes = nindexes; + /* Each parallel VACUUM worker gets its own access strategy */ + vacrel.bstrategy = GetAccessStrategy(BAS_VACUUM); + vacrel.indstats = (IndexBulkDeleteResult **) + palloc0(nindexes * sizeof(IndexBulkDeleteResult *)); + + if (lvshared->maintenance_work_mem_worker > 0) + maintenance_work_mem = lvshared->maintenance_work_mem_worker; + + /* + * Initialize vacrel for use as error callback arg by parallel worker. + */ + vacrel.relnamespace = get_namespace_name(RelationGetNamespace(rel)); + vacrel.relname = pstrdup(RelationGetRelationName(rel)); + vacrel.indname = NULL; + vacrel.phase = VACUUM_ERRCB_PHASE_UNKNOWN; /* Not yet processing */ + vacrel.dead_tuples = dead_tuples; + + /* Setup error traceback support for ereport() */ + errcallback.callback = vacuum_error_callback; + errcallback.arg = &vacrel; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* Prepare to track buffer usage during parallel execution */ + InstrStartParallelQuery(); + + /* Process indexes to perform vacuum/cleanup */ + do_parallel_processing(&vacrel, lvshared); + + /* Report buffer/WAL usage during parallel execution */ + buffer_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, false); + wal_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_WAL_USAGE, false); + InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber], + &wal_usage[ParallelWorkerNumber]); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; + + vac_close_indexes(nindexes, indrels, RowExclusiveLock); + table_close(rel, ShareUpdateExclusiveLock); + FreeAccessStrategy(vacrel.bstrategy); + pfree(vacrel.indstats); +} + +/* + * Error context callback for errors occurring during vacuum. 
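+ *
+ * Registered in the usual way, as in parallel_vacuum_main() above:
+ *
+ *		errcallback.callback = vacuum_error_callback;
+ *		errcallback.arg = &vacrel;
+ *		errcallback.previous = error_context_stack;
+ *		error_context_stack = &errcallback;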
+ */ +static void +vacuum_error_callback(void *arg) +{ + LVRelState *errinfo = arg; + + switch (errinfo->phase) + { + case VACUUM_ERRCB_PHASE_SCAN_HEAP: + if (BlockNumberIsValid(errinfo->blkno)) + { + if (OffsetNumberIsValid(errinfo->offnum)) + errcontext("while scanning block %u offset %u of relation \"%s.%s\"", + errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname); + else + errcontext("while scanning block %u of relation \"%s.%s\"", + errinfo->blkno, errinfo->relnamespace, errinfo->relname); + } + else + errcontext("while scanning relation \"%s.%s\"", + errinfo->relnamespace, errinfo->relname); + break; + + case VACUUM_ERRCB_PHASE_VACUUM_HEAP: + if (BlockNumberIsValid(errinfo->blkno)) + { + if (OffsetNumberIsValid(errinfo->offnum)) + errcontext("while vacuuming block %u offset %u of relation \"%s.%s\"", + errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname); + else + errcontext("while vacuuming block %u of relation \"%s.%s\"", + errinfo->blkno, errinfo->relnamespace, errinfo->relname); + } + else + errcontext("while vacuuming relation \"%s.%s\"", + errinfo->relnamespace, errinfo->relname); + break; + + case VACUUM_ERRCB_PHASE_VACUUM_INDEX: + errcontext("while vacuuming index \"%s\" of relation \"%s.%s\"", + errinfo->indname, errinfo->relnamespace, errinfo->relname); + break; + + case VACUUM_ERRCB_PHASE_INDEX_CLEANUP: + errcontext("while cleaning up index \"%s\" of relation \"%s.%s\"", + errinfo->indname, errinfo->relnamespace, errinfo->relname); + break; + + case VACUUM_ERRCB_PHASE_TRUNCATE: + if (BlockNumberIsValid(errinfo->blkno)) + errcontext("while truncating relation \"%s.%s\" to %u blocks", + errinfo->relnamespace, errinfo->relname, errinfo->blkno); + break; + + case VACUUM_ERRCB_PHASE_UNKNOWN: + default: + return; /* do nothing; the errinfo may not be + * initialized */ + } +} + +/* + * Updates the information required for vacuum error callback. This also saves + * the current information which can be later restored via restore_vacuum_error_info. + */ +static void +update_vacuum_error_info(LVRelState *vacrel, LVSavedErrInfo *saved_vacrel, + int phase, BlockNumber blkno, OffsetNumber offnum) +{ + if (saved_vacrel) + { + saved_vacrel->offnum = vacrel->offnum; + saved_vacrel->blkno = vacrel->blkno; + saved_vacrel->phase = vacrel->phase; + } + + vacrel->blkno = blkno; + vacrel->offnum = offnum; + vacrel->phase = phase; +} + +/* + * Restores the vacuum information saved via a prior call to update_vacuum_error_info. 
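+ *
+ * The two routines are used as a save/restore pair around each phase, e.g.
+ * (sketch, cf. lazy_vacuum_one_index above):
+ *
+ *		update_vacuum_error_info(vacrel, &saved_err_info, phase,
+ *								 InvalidBlockNumber, InvalidOffsetNumber);
+ *		... phase-specific work ...
+ *		restore_vacuum_error_info(vacrel, &saved_err_info);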
+ */ +static void +restore_vacuum_error_info(LVRelState *vacrel, + const LVSavedErrInfo *saved_vacrel) +{ + vacrel->blkno = saved_vacrel->blkno; + vacrel->offnum = saved_vacrel->offnum; + vacrel->phase = saved_vacrel->phase; +} diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c new file mode 100644 index 0000000..e198df6 --- /dev/null +++ b/src/backend/access/heap/visibilitymap.c @@ -0,0 +1,672 @@ +/*------------------------------------------------------------------------- + * + * visibilitymap.c + * bitmap for tracking visibility of heap tuples + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/heap/visibilitymap.c + * + * INTERFACE ROUTINES + * visibilitymap_clear - clear bits for one page in the visibility map + * visibilitymap_pin - pin a map page for setting a bit + * visibilitymap_pin_ok - check whether correct map page is already pinned + * visibilitymap_set - set a bit in a previously pinned page + * visibilitymap_get_status - get status of bits + * visibilitymap_count - count number of bits set in visibility map + * visibilitymap_prepare_truncate - + * prepare for truncation of the visibility map + * + * NOTES + * + * The visibility map is a bitmap with two bits (all-visible and all-frozen) + * per heap page. A set all-visible bit means that all tuples on the page are + * known visible to all transactions, and therefore the page doesn't need to + * be vacuumed. A set all-frozen bit means that all tuples on the page are + * completely frozen, and therefore the page doesn't need to be vacuumed even + * if whole table scanning vacuum is required (e.g. anti-wraparound vacuum). + * The all-frozen bit must be set only when the page is already all-visible. + * + * The map is conservative in the sense that we make sure that whenever a bit + * is set, we know the condition is true, but if a bit is not set, it might or + * might not be true. + * + * Clearing visibility map bits is not separately WAL-logged. The callers + * must make sure that whenever a bit is cleared, the bit is cleared on WAL + * replay of the updating operation as well. + * + * When we *set* a visibility map during VACUUM, we must write WAL. This may + * seem counterintuitive, since the bit is basically a hint: if it is clear, + * it may still be the case that every tuple on the page is visible to all + * transactions; we just don't know that for certain. The difficulty is that + * there are two bits which are typically set together: the PD_ALL_VISIBLE bit + * on the page itself, and the visibility map bit. If a crash occurs after the + * visibility map page makes it to disk and before the updated heap page makes + * it to disk, redo must set the bit on the heap page. Otherwise, the next + * insert, update, or delete on the heap page will fail to realize that the + * visibility map bit must be cleared, possibly causing index-only scans to + * return wrong answers. + * + * VACUUM will normally skip pages for which the visibility map bit is set; + * such pages can't contain any dead tuples and therefore don't need vacuuming. + * + * LOCKING + * + * In heapam.c, whenever a page is modified so that not all tuples on the + * page are visible to everyone anymore, the corresponding bit in the + * visibility map is cleared. 
In order to be crash-safe, we need to do this + * while still holding a lock on the heap page and in the same critical + * section that logs the page modification. However, we don't want to hold + * the buffer lock over any I/O that may be required to read in the visibility + * map page. To avoid this, we examine the heap page before locking it; + * if the page-level PD_ALL_VISIBLE bit is set, we pin the visibility map + * bit. Then, we lock the buffer. But this creates a race condition: there + * is a possibility that in the time it takes to lock the buffer, the + * PD_ALL_VISIBLE bit gets set. If that happens, we have to unlock the + * buffer, pin the visibility map page, and relock the buffer. This shouldn't + * happen often, because only VACUUM currently sets visibility map bits, + * and the race will only occur if VACUUM processes a given page at almost + * exactly the same time that someone tries to further modify it. + * + * To set a bit, you need to hold a lock on the heap page. That prevents + * the race condition where VACUUM sees that all tuples on the page are + * visible to everyone, but another backend modifies the page before VACUUM + * sets the bit in the visibility map. + * + * When a bit is set, the LSN of the visibility map page is updated to make + * sure that the visibility map update doesn't get written to disk before the + * WAL record of the changes that made it possible to set the bit is flushed. + * But when a bit is cleared, we don't have to do that because it's always + * safe to clear a bit in the map from correctness point of view. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heapam_xlog.h" +#include "access/visibilitymap.h" +#include "access/xlog.h" +#include "miscadmin.h" +#include "port/pg_bitutils.h" +#include "storage/bufmgr.h" +#include "storage/lmgr.h" +#include "storage/smgr.h" +#include "utils/inval.h" + + +/*#define TRACE_VISIBILITYMAP */ + +/* + * Size of the bitmap on each visibility map page, in bytes. There's no + * extra headers, so the whole page minus the standard page header is + * used for the bitmap. + */ +#define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData)) + +/* Number of heap blocks we can represent in one byte */ +#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK) + +/* Number of heap blocks we can represent in one visibility map page. */ +#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE) + +/* Mapping from heap block number to the right bit in the visibility map */ +#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE) +#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE) +#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK) + +/* Masks for counting subsets of bits in the visibility map. */ +#define VISIBLE_MASK64 UINT64CONST(0x5555555555555555) /* The lower bit of each + * bit pair */ +#define FROZEN_MASK64 UINT64CONST(0xaaaaaaaaaaaaaaaa) /* The upper bit of each + * bit pair */ + +/* prototypes for internal routines */ +static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend); +static void vm_extend(Relation rel, BlockNumber vm_nblocks); + + +/* + * visibilitymap_clear - clear specified bits for one page in visibility map + * + * You must pass a buffer containing the correct map page to this function. + * Call visibilitymap_pin first to pin the right one. This function doesn't do + * any I/O. 
Returns true if any bits have been cleared and false otherwise. + */ +bool +visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer buf, uint8 flags) +{ + BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); + int mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); + int mapOffset = HEAPBLK_TO_OFFSET(heapBlk); + uint8 mask = flags << mapOffset; + char *map; + bool cleared = false; + + Assert(flags & VISIBILITYMAP_VALID_BITS); + +#ifdef TRACE_VISIBILITYMAP + elog(DEBUG1, "vm_clear %s %d", RelationGetRelationName(rel), heapBlk); +#endif + + if (!BufferIsValid(buf) || BufferGetBlockNumber(buf) != mapBlock) + elog(ERROR, "wrong buffer passed to visibilitymap_clear"); + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + map = PageGetContents(BufferGetPage(buf)); + + if (map[mapByte] & mask) + { + map[mapByte] &= ~mask; + + MarkBufferDirty(buf); + cleared = true; + } + + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + return cleared; +} + +/* + * visibilitymap_pin - pin a map page for setting a bit + * + * Setting a bit in the visibility map is a two-phase operation. First, call + * visibilitymap_pin, to pin the visibility map page containing the bit for + * the heap page. Because that can require I/O to read the map page, you + * shouldn't hold a lock on the heap page while doing that. Then, call + * visibilitymap_set to actually set the bit. + * + * On entry, *buf should be InvalidBuffer or a valid buffer returned by + * an earlier call to visibilitymap_pin or visibilitymap_get_status on the same + * relation. On return, *buf is a valid buffer with the map page containing + * the bit for heapBlk. + * + * If the page doesn't exist in the map file yet, it is extended. + */ +void +visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *buf) +{ + BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); + + /* Reuse the old pinned buffer if possible */ + if (BufferIsValid(*buf)) + { + if (BufferGetBlockNumber(*buf) == mapBlock) + return; + + ReleaseBuffer(*buf); + } + *buf = vm_readbuf(rel, mapBlock, true); +} + +/* + * visibilitymap_pin_ok - do we already have the correct page pinned? + * + * On entry, buf should be InvalidBuffer or a valid buffer returned by + * an earlier call to visibilitymap_pin or visibilitymap_get_status on the same + * relation. The return value indicates whether the buffer covers the + * given heapBlk. + */ +bool +visibilitymap_pin_ok(BlockNumber heapBlk, Buffer buf) +{ + BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); + + return BufferIsValid(buf) && BufferGetBlockNumber(buf) == mapBlock; +} + +/* + * visibilitymap_set - set bit(s) on a previously pinned page + * + * recptr is the LSN of the XLOG record we're replaying, if we're in recovery, + * or InvalidXLogRecPtr in normal running. The page LSN is advanced to the + * one provided; in normal running, we generate a new XLOG record and set the + * page LSN to that value. cutoff_xid is the largest xmin on the page being + * marked all-visible; it is needed for Hot Standby, and can be + * InvalidTransactionId if the page contains no tuples. It can also be set + * to InvalidTransactionId when a page that is already all-visible is being + * marked all-frozen. + * + * Caller is expected to set the heap page's PD_ALL_VISIBLE bit before calling + * this function. Except in recovery, caller should also pass the heap + * buffer. When checksums are enabled and we're not in recovery, we must add + * the heap buffer to the WAL chain to protect it from being torn. + * + * You must pass a buffer containing the correct map page to this function. 
+ * Call visibilitymap_pin first to pin the right one. This function doesn't do + * any I/O. + */ +void +visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, + XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid, + uint8 flags) +{ + BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); + uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); + uint8 mapOffset = HEAPBLK_TO_OFFSET(heapBlk); + Page page; + uint8 *map; + +#ifdef TRACE_VISIBILITYMAP + elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk); +#endif + + Assert(InRecovery || XLogRecPtrIsInvalid(recptr)); + Assert(InRecovery || BufferIsValid(heapBuf)); + Assert(flags & VISIBILITYMAP_VALID_BITS); + + /* Check that we have the right heap page pinned, if present */ + if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk) + elog(ERROR, "wrong heap buffer passed to visibilitymap_set"); + + /* Check that we have the right VM page pinned */ + if (!BufferIsValid(vmBuf) || BufferGetBlockNumber(vmBuf) != mapBlock) + elog(ERROR, "wrong VM buffer passed to visibilitymap_set"); + + page = BufferGetPage(vmBuf); + map = (uint8 *) PageGetContents(page); + LockBuffer(vmBuf, BUFFER_LOCK_EXCLUSIVE); + + if (flags != (map[mapByte] >> mapOffset & VISIBILITYMAP_VALID_BITS)) + { + START_CRIT_SECTION(); + + map[mapByte] |= (flags << mapOffset); + MarkBufferDirty(vmBuf); + + if (RelationNeedsWAL(rel)) + { + if (XLogRecPtrIsInvalid(recptr)) + { + Assert(!InRecovery); + recptr = log_heap_visible(rel->rd_node, heapBuf, vmBuf, + cutoff_xid, flags); + + /* + * If data checksums are enabled (or wal_log_hints=on), we + * need to protect the heap page from being torn. + */ + if (XLogHintBitIsNeeded()) + { + Page heapPage = BufferGetPage(heapBuf); + + /* caller is expected to set PD_ALL_VISIBLE first */ + Assert(PageIsAllVisible(heapPage)); + PageSetLSN(heapPage, recptr); + } + } + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + } + + LockBuffer(vmBuf, BUFFER_LOCK_UNLOCK); +} + +/* + * visibilitymap_get_status - get status of bits + * + * Are all tuples on heapBlk visible to all or are marked frozen, according + * to the visibility map? + * + * On entry, *buf should be InvalidBuffer or a valid buffer returned by an + * earlier call to visibilitymap_pin or visibilitymap_get_status on the same + * relation. On return, *buf is a valid buffer with the map page containing + * the bit for heapBlk, or InvalidBuffer. The caller is responsible for + * releasing *buf after it's done testing and setting bits. + * + * NOTE: This function is typically called without a lock on the heap page, + * so somebody else could change the bit just after we look at it. In fact, + * since we don't lock the visibility map page either, it's even possible that + * someone else could have changed the bit just before we look at it, but yet + * we might see the old value. It is the caller's responsibility to deal with + * all concurrency issues! 
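+ *
+ * For illustration, a hypothetical caller that only cares about
+ * all-visibility might do:
+ *
+ *      Buffer      vmbuffer = InvalidBuffer;
+ *      uint8       status;
+ *
+ *      status = visibilitymap_get_status(rel, blkno, &vmbuffer);
+ *      if ((status & VISIBILITYMAP_ALL_VISIBLE) != 0)
+ *          ... the page appeared all-visible at the instant we looked ...
+ *      if (BufferIsValid(vmbuffer))
+ *          ReleaseBuffer(vmbuffer);
+ *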
+ */ +uint8 +visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *buf) +{ + BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); + uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); + uint8 mapOffset = HEAPBLK_TO_OFFSET(heapBlk); + char *map; + uint8 result; + +#ifdef TRACE_VISIBILITYMAP + elog(DEBUG1, "vm_get_status %s %d", RelationGetRelationName(rel), heapBlk); +#endif + + /* Reuse the old pinned buffer if possible */ + if (BufferIsValid(*buf)) + { + if (BufferGetBlockNumber(*buf) != mapBlock) + { + ReleaseBuffer(*buf); + *buf = InvalidBuffer; + } + } + + if (!BufferIsValid(*buf)) + { + *buf = vm_readbuf(rel, mapBlock, false); + if (!BufferIsValid(*buf)) + return false; + } + + map = PageGetContents(BufferGetPage(*buf)); + + /* + * A single byte read is atomic. There could be memory-ordering effects + * here, but for performance reasons we make it the caller's job to worry + * about that. + */ + result = ((map[mapByte] >> mapOffset) & VISIBILITYMAP_VALID_BITS); + return result; +} + +/* + * visibilitymap_count - count number of bits set in visibility map + * + * Note: we ignore the possibility of race conditions when the table is being + * extended concurrently with the call. New pages added to the table aren't + * going to be marked all-visible or all-frozen, so they won't affect the result. + */ +void +visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_frozen) +{ + BlockNumber mapBlock; + BlockNumber nvisible = 0; + BlockNumber nfrozen = 0; + + /* all_visible must be specified */ + Assert(all_visible); + + for (mapBlock = 0;; mapBlock++) + { + Buffer mapBuffer; + uint64 *map; + int i; + + /* + * Read till we fall off the end of the map. We assume that any extra + * bytes in the last page are zeroed, so we don't bother excluding + * them from the count. + */ + mapBuffer = vm_readbuf(rel, mapBlock, false); + if (!BufferIsValid(mapBuffer)) + break; + + /* + * We choose not to lock the page, since the result is going to be + * immediately stale anyway if anyone is concurrently setting or + * clearing bits, and we only really need an approximate value. + */ + map = (uint64 *) PageGetContents(BufferGetPage(mapBuffer)); + + StaticAssertStmt(MAPSIZE % sizeof(uint64) == 0, + "unsupported MAPSIZE"); + if (all_frozen == NULL) + { + for (i = 0; i < MAPSIZE / sizeof(uint64); i++) + nvisible += pg_popcount64(map[i] & VISIBLE_MASK64); + } + else + { + for (i = 0; i < MAPSIZE / sizeof(uint64); i++) + { + nvisible += pg_popcount64(map[i] & VISIBLE_MASK64); + nfrozen += pg_popcount64(map[i] & FROZEN_MASK64); + } + } + + ReleaseBuffer(mapBuffer); + } + + *all_visible = nvisible; + if (all_frozen) + *all_frozen = nfrozen; +} + +/* + * visibilitymap_prepare_truncate - + * prepare for truncation of the visibility map + * + * nheapblocks is the new size of the heap. + * + * Return the number of blocks of new visibility map. + * If it's InvalidBlockNumber, there is nothing to truncate; + * otherwise the caller is responsible for calling smgrtruncate() + * to truncate the visibility map pages. 
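+ *
+ * A sketch of the intended calling pattern (simplified; the smgrtruncate()
+ * details are elided, and "vm_nblocks" is just an illustrative local name):
+ *
+ *      BlockNumber vm_nblocks = visibilitymap_prepare_truncate(rel, nheapblocks);
+ *
+ *      if (BlockNumberIsValid(vm_nblocks))
+ *          ... truncate the VISIBILITYMAP_FORKNUM fork to vm_nblocks blocks ...
+ *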
+ */ +BlockNumber +visibilitymap_prepare_truncate(Relation rel, BlockNumber nheapblocks) +{ + BlockNumber newnblocks; + + /* last remaining block, byte, and bit */ + BlockNumber truncBlock = HEAPBLK_TO_MAPBLOCK(nheapblocks); + uint32 truncByte = HEAPBLK_TO_MAPBYTE(nheapblocks); + uint8 truncOffset = HEAPBLK_TO_OFFSET(nheapblocks); + +#ifdef TRACE_VISIBILITYMAP + elog(DEBUG1, "vm_truncate %s %d", RelationGetRelationName(rel), nheapblocks); +#endif + + RelationOpenSmgr(rel); + + /* + * If no visibility map has been created yet for this relation, there's + * nothing to truncate. + */ + if (!smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM)) + return InvalidBlockNumber; + + /* + * Unless the new size is exactly at a visibility map page boundary, the + * tail bits in the last remaining map page, representing truncated heap + * blocks, need to be cleared. This is not only tidy, but also necessary + * because we don't get a chance to clear the bits if the heap is extended + * again. + */ + if (truncByte != 0 || truncOffset != 0) + { + Buffer mapBuffer; + Page page; + char *map; + + newnblocks = truncBlock + 1; + + mapBuffer = vm_readbuf(rel, truncBlock, false); + if (!BufferIsValid(mapBuffer)) + { + /* nothing to do, the file was already smaller */ + return InvalidBlockNumber; + } + + page = BufferGetPage(mapBuffer); + map = PageGetContents(page); + + LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE); + + /* NO EREPORT(ERROR) from here till changes are logged */ + START_CRIT_SECTION(); + + /* Clear out the unwanted bytes. */ + MemSet(&map[truncByte + 1], 0, MAPSIZE - (truncByte + 1)); + + /*---- + * Mask out the unwanted bits of the last remaining byte. + * + * ((1 << 0) - 1) = 00000000 + * ((1 << 1) - 1) = 00000001 + * ... + * ((1 << 6) - 1) = 00111111 + * ((1 << 7) - 1) = 01111111 + *---- + */ + map[truncByte] &= (1 << truncOffset) - 1; + + /* + * Truncation of a relation is WAL-logged at a higher-level, and we + * will be called at WAL replay. But if checksums are enabled, we need + * to still write a WAL record to protect against a torn page, if the + * page is flushed to disk before the truncation WAL record. We cannot + * use MarkBufferDirtyHint here, because that will not dirty the page + * during recovery. + */ + MarkBufferDirty(mapBuffer); + if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded()) + log_newpage_buffer(mapBuffer, false); + + END_CRIT_SECTION(); + + UnlockReleaseBuffer(mapBuffer); + } + else + newnblocks = truncBlock; + + if (smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM) <= newnblocks) + { + /* nothing to do, the file was already smaller than requested size */ + return InvalidBlockNumber; + } + + return newnblocks; +} + +/* + * Read a visibility map page. + * + * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is + * true, the visibility map file is extended. + */ +static Buffer +vm_readbuf(Relation rel, BlockNumber blkno, bool extend) +{ + Buffer buf; + + /* + * We might not have opened the relation at the smgr level yet, or we + * might have been forced to close it by a sinval message. The code below + * won't necessarily notice relation extension immediately when extend = + * false, so we rely on sinval messages to ensure that our ideas about the + * size of the map aren't too far out of date. + */ + RelationOpenSmgr(rel); + + /* + * If we haven't cached the size of the visibility map fork yet, check it + * first. 
+ */ + if (rel->rd_smgr->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] == InvalidBlockNumber) + { + if (smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM)) + smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM); + else + rel->rd_smgr->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = 0; + } + + /* Handle requests beyond EOF */ + if (blkno >= rel->rd_smgr->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM]) + { + if (extend) + vm_extend(rel, blkno + 1); + else + return InvalidBuffer; + } + + /* + * Use ZERO_ON_ERROR mode, and initialize the page if necessary. It's + * always safe to clear bits, so it's better to clear corrupt pages than + * error out. + * + * The initialize-the-page part is trickier than it looks, because of the + * possibility of multiple backends doing this concurrently, and our + * desire to not uselessly take the buffer lock in the normal path where + * the page is OK. We must take the lock to initialize the page, so + * recheck page newness after we have the lock, in case someone else + * already did it. Also, because we initially check PageIsNew with no + * lock, it's possible to fall through and return the buffer while someone + * else is still initializing the page (i.e., we might see pd_upper as set + * but other page header fields are still zeroes). This is harmless for + * callers that will take a buffer lock themselves, but some callers + * inspect the page without any lock at all. The latter is OK only so + * long as it doesn't depend on the page header having correct contents. + * Current usage is safe because PageGetContents() does not require that. + */ + buf = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, blkno, + RBM_ZERO_ON_ERROR, NULL); + if (PageIsNew(BufferGetPage(buf))) + { + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + if (PageIsNew(BufferGetPage(buf))) + PageInit(BufferGetPage(buf), BLCKSZ, 0); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + } + return buf; +} + +/* + * Ensure that the visibility map fork is at least vm_nblocks long, extending + * it if necessary with zeroed pages. + */ +static void +vm_extend(Relation rel, BlockNumber vm_nblocks) +{ + BlockNumber vm_nblocks_now; + PGAlignedBlock pg; + + PageInit((Page) pg.data, BLCKSZ, 0); + + /* + * We use the relation extension lock to lock out other backends trying to + * extend the visibility map at the same time. It also locks out extension + * of the main fork, unnecessarily, but extending the visibility map + * happens seldom enough that it doesn't seem worthwhile to have a + * separate lock tag type for it. + * + * Note that another backend might have extended or created the relation + * by the time we get the lock. + */ + LockRelationForExtension(rel, ExclusiveLock); + + /* Might have to re-open if a cache flush happened */ + RelationOpenSmgr(rel); + + /* + * Create the file first if it doesn't exist. If smgr_vm_nblocks is + * positive then it must exist, no need for an smgrexists call. + */ + if ((rel->rd_smgr->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] == 0 || + rel->rd_smgr->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] == InvalidBlockNumber) && + !smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM)) + smgrcreate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, false); + + /* Invalidate cache so that smgrnblocks() asks the kernel. 
*/ + rel->rd_smgr->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = InvalidBlockNumber; + vm_nblocks_now = smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM); + + /* Now extend the file */ + while (vm_nblocks_now < vm_nblocks) + { + PageSetChecksumInplace((Page) pg.data, vm_nblocks_now); + + smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now, + pg.data, false); + vm_nblocks_now++; + } + + /* + * Send a shared-inval message to force other backends to close any smgr + * references they may have for this rel, which we are about to change. + * This is a useful optimization because it means that backends don't have + * to keep checking for creation or extension of the file, which happens + * infrequently. + */ + CacheInvalidateSmgr(rel->rd_smgr->smgr_rnode); + + UnlockRelationForExtension(rel, ExclusiveLock); +} diff --git a/src/backend/access/index/Makefile b/src/backend/access/index/Makefile new file mode 100644 index 0000000..6f2e306 --- /dev/null +++ b/src/backend/access/index/Makefile @@ -0,0 +1,21 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/index +# +# IDENTIFICATION +# src/backend/access/index/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/index +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + amapi.o \ + amvalidate.o \ + genam.o \ + indexam.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/index/amapi.c b/src/backend/access/index/amapi.c new file mode 100644 index 0000000..d30bc43 --- /dev/null +++ b/src/backend/access/index/amapi.c @@ -0,0 +1,143 @@ +/*------------------------------------------------------------------------- + * + * amapi.c + * Support routines for API for Postgres index access methods. + * + * Copyright (c) 2015-2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/access/index/amapi.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/amapi.h" +#include "access/htup_details.h" +#include "catalog/pg_am.h" +#include "catalog/pg_opclass.h" +#include "utils/builtins.h" +#include "utils/syscache.h" + + +/* + * GetIndexAmRoutine - call the specified access method handler routine to get + * its IndexAmRoutine struct, which will be palloc'd in the caller's context. + * + * Note that if the amhandler function is built-in, this will not involve + * any catalog access. It's therefore safe to use this while bootstrapping + * indexes for the system catalogs. relcache.c relies on that. + */ +IndexAmRoutine * +GetIndexAmRoutine(Oid amhandler) +{ + Datum datum; + IndexAmRoutine *routine; + + datum = OidFunctionCall0(amhandler); + routine = (IndexAmRoutine *) DatumGetPointer(datum); + + if (routine == NULL || !IsA(routine, IndexAmRoutine)) + elog(ERROR, "index access method handler function %u did not return an IndexAmRoutine struct", + amhandler); + + return routine; +} + +/* + * GetIndexAmRoutineByAmId - look up the handler of the index access method + * with the given OID, and get its IndexAmRoutine struct. + * + * If the given OID isn't a valid index access method, returns NULL if + * noerror is true, else throws error. 
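+ *
+ * For example, a hypothetical caller that only wants to know whether an
+ * access method supports ordered scans might do:
+ *
+ *      IndexAmRoutine *amroutine = GetIndexAmRoutineByAmId(amoid, false);
+ *      bool        canorder = amroutine->amcanorder;
+ *
+ *      pfree(amroutine);
+ *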
+ */ +IndexAmRoutine * +GetIndexAmRoutineByAmId(Oid amoid, bool noerror) +{ + HeapTuple tuple; + Form_pg_am amform; + regproc amhandler; + + /* Get handler function OID for the access method */ + tuple = SearchSysCache1(AMOID, ObjectIdGetDatum(amoid)); + if (!HeapTupleIsValid(tuple)) + { + if (noerror) + return NULL; + elog(ERROR, "cache lookup failed for access method %u", + amoid); + } + amform = (Form_pg_am) GETSTRUCT(tuple); + + /* Check if it's an index access method as opposed to some other AM */ + if (amform->amtype != AMTYPE_INDEX) + { + if (noerror) + { + ReleaseSysCache(tuple); + return NULL; + } + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("access method \"%s\" is not of type %s", + NameStr(amform->amname), "INDEX"))); + } + + amhandler = amform->amhandler; + + /* Complain if handler OID is invalid */ + if (!RegProcedureIsValid(amhandler)) + { + if (noerror) + { + ReleaseSysCache(tuple); + return NULL; + } + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("index access method \"%s\" does not have a handler", + NameStr(amform->amname)))); + } + + ReleaseSysCache(tuple); + + /* And finally, call the handler function to get the API struct. */ + return GetIndexAmRoutine(amhandler); +} + + +/* + * Ask appropriate access method to validate the specified opclass. + */ +Datum +amvalidate(PG_FUNCTION_ARGS) +{ + Oid opclassoid = PG_GETARG_OID(0); + bool result; + HeapTuple classtup; + Form_pg_opclass classform; + Oid amoid; + IndexAmRoutine *amroutine; + + classtup = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclassoid)); + if (!HeapTupleIsValid(classtup)) + elog(ERROR, "cache lookup failed for operator class %u", opclassoid); + classform = (Form_pg_opclass) GETSTRUCT(classtup); + + amoid = classform->opcmethod; + + ReleaseSysCache(classtup); + + amroutine = GetIndexAmRoutineByAmId(amoid, false); + + if (amroutine->amvalidate == NULL) + elog(ERROR, "function amvalidate is not defined for index access method %u", + amoid); + + result = amroutine->amvalidate(opclassoid); + + pfree(amroutine); + + PG_RETURN_BOOL(result); +} diff --git a/src/backend/access/index/amvalidate.c b/src/backend/access/index/amvalidate.c new file mode 100644 index 0000000..9dd0ae6 --- /dev/null +++ b/src/backend/access/index/amvalidate.c @@ -0,0 +1,276 @@ +/*------------------------------------------------------------------------- + * + * amvalidate.c + * Support routines for index access methods' amvalidate and + * amadjustmembers functions. + * + * Copyright (c) 2016-2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/access/index/amvalidate.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/amvalidate.h" +#include "access/htup_details.h" +#include "catalog/pg_am.h" +#include "catalog/pg_amop.h" +#include "catalog/pg_amproc.h" +#include "catalog/pg_opclass.h" +#include "catalog/pg_operator.h" +#include "catalog/pg_proc.h" +#include "catalog/pg_type.h" +#include "parser/parse_coerce.h" +#include "utils/syscache.h" + + +/* + * identify_opfamily_groups() returns a List of OpFamilyOpFuncGroup structs, + * one for each combination of lefttype/righttype present in the family's + * operator and support function lists. If amopstrategy K is present for + * this datatype combination, we set bit 1 << K in operatorset, and similarly + * for the support functions. 
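+ * For instance, a minimal btree datatype pair providing strategies 1..5 and
+ * only support function 1 would end up with operatorset = 0x3E (bits 1..5)
+ * and functionset = 0x02 (bit 1).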
With uint64 fields we can handle operator and + * function numbers up to 63, which is plenty for the foreseeable future. + * + * The given CatCLists are expected to represent a single opfamily fetched + * from the AMOPSTRATEGY and AMPROCNUM caches, so that they will be in + * order by those caches' second and third cache keys, namely the datatypes. + */ +List * +identify_opfamily_groups(CatCList *oprlist, CatCList *proclist) +{ + List *result = NIL; + OpFamilyOpFuncGroup *thisgroup; + Form_pg_amop oprform; + Form_pg_amproc procform; + int io, + ip; + + /* We need the lists to be ordered; should be true in normal operation */ + if (!oprlist->ordered || !proclist->ordered) + elog(ERROR, "cannot validate operator family without ordered data"); + + /* + * Advance through the lists concurrently. Thanks to the ordering, we + * should see all operators and functions of a given datatype pair + * consecutively. + */ + thisgroup = NULL; + io = ip = 0; + if (io < oprlist->n_members) + { + oprform = (Form_pg_amop) GETSTRUCT(&oprlist->members[io]->tuple); + io++; + } + else + oprform = NULL; + if (ip < proclist->n_members) + { + procform = (Form_pg_amproc) GETSTRUCT(&proclist->members[ip]->tuple); + ip++; + } + else + procform = NULL; + + while (oprform || procform) + { + if (oprform && thisgroup && + oprform->amoplefttype == thisgroup->lefttype && + oprform->amoprighttype == thisgroup->righttype) + { + /* Operator belongs to current group; include it and advance */ + + /* Ignore strategy numbers outside supported range */ + if (oprform->amopstrategy > 0 && oprform->amopstrategy < 64) + thisgroup->operatorset |= ((uint64) 1) << oprform->amopstrategy; + + if (io < oprlist->n_members) + { + oprform = (Form_pg_amop) GETSTRUCT(&oprlist->members[io]->tuple); + io++; + } + else + oprform = NULL; + continue; + } + + if (procform && thisgroup && + procform->amproclefttype == thisgroup->lefttype && + procform->amprocrighttype == thisgroup->righttype) + { + /* Procedure belongs to current group; include it and advance */ + + /* Ignore function numbers outside supported range */ + if (procform->amprocnum > 0 && procform->amprocnum < 64) + thisgroup->functionset |= ((uint64) 1) << procform->amprocnum; + + if (ip < proclist->n_members) + { + procform = (Form_pg_amproc) GETSTRUCT(&proclist->members[ip]->tuple); + ip++; + } + else + procform = NULL; + continue; + } + + /* Time for a new group */ + thisgroup = (OpFamilyOpFuncGroup *) palloc(sizeof(OpFamilyOpFuncGroup)); + if (oprform && + (!procform || + (oprform->amoplefttype < procform->amproclefttype || + (oprform->amoplefttype == procform->amproclefttype && + oprform->amoprighttype < procform->amprocrighttype)))) + { + thisgroup->lefttype = oprform->amoplefttype; + thisgroup->righttype = oprform->amoprighttype; + } + else + { + thisgroup->lefttype = procform->amproclefttype; + thisgroup->righttype = procform->amprocrighttype; + } + thisgroup->operatorset = thisgroup->functionset = 0; + result = lappend(result, thisgroup); + } + + return result; +} + +/* + * Validate the signature (argument and result types) of an opclass support + * function. Return true if OK, false if not. + * + * The "..." represents maxargs argument-type OIDs. If "exact" is true, they + * must match the function arg types exactly, else only binary-coercibly. + * In any case the function result type must match restype exactly. + */ +bool +check_amproc_signature(Oid funcid, Oid restype, bool exact, + int minargs, int maxargs,...) 
+{ + bool result = true; + HeapTuple tp; + Form_pg_proc procform; + va_list ap; + int i; + + tp = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for function %u", funcid); + procform = (Form_pg_proc) GETSTRUCT(tp); + + if (procform->prorettype != restype || procform->proretset || + procform->pronargs < minargs || procform->pronargs > maxargs) + result = false; + + va_start(ap, maxargs); + for (i = 0; i < maxargs; i++) + { + Oid argtype = va_arg(ap, Oid); + + if (i >= procform->pronargs) + continue; + if (exact ? (argtype != procform->proargtypes.values[i]) : + !IsBinaryCoercible(argtype, procform->proargtypes.values[i])) + result = false; + } + va_end(ap); + + ReleaseSysCache(tp); + return result; +} + +/* + * Validate the signature of an opclass options support function, that should + * be 'void(internal)'. + */ +bool +check_amoptsproc_signature(Oid funcid) +{ + return check_amproc_signature(funcid, VOIDOID, true, 1, 1, INTERNALOID); +} + +/* + * Validate the signature (argument and result types) of an opclass operator. + * Return true if OK, false if not. + * + * Currently, we can hard-wire this as accepting only binary operators. Also, + * we can insist on exact type matches, since the given lefttype/righttype + * come from pg_amop and should always match the operator exactly. + */ +bool +check_amop_signature(Oid opno, Oid restype, Oid lefttype, Oid righttype) +{ + bool result = true; + HeapTuple tp; + Form_pg_operator opform; + + tp = SearchSysCache1(OPEROID, ObjectIdGetDatum(opno)); + if (!HeapTupleIsValid(tp)) /* shouldn't happen */ + elog(ERROR, "cache lookup failed for operator %u", opno); + opform = (Form_pg_operator) GETSTRUCT(tp); + + if (opform->oprresult != restype || opform->oprkind != 'b' || + opform->oprleft != lefttype || opform->oprright != righttype) + result = false; + + ReleaseSysCache(tp); + return result; +} + +/* + * Get the OID of the opclass belonging to an opfamily and accepting + * the specified type as input type. Returns InvalidOid if no such opclass. + * + * If there is more than one such opclass, you get a random one of them. + * Since that shouldn't happen, we don't waste cycles checking. + * + * We could look up the AM's OID from the opfamily, but all existing callers + * know that or can get it without an extra lookup, so we make them pass it. + */ +Oid +opclass_for_family_datatype(Oid amoid, Oid opfamilyoid, Oid datatypeoid) +{ + Oid result = InvalidOid; + CatCList *opclist; + int i; + + /* + * We search through all the AM's opclasses to see if one matches. This + * is a bit inefficient but there is no better index available. It also + * saves making an explicit check that the opfamily belongs to the AM. + */ + opclist = SearchSysCacheList1(CLAAMNAMENSP, ObjectIdGetDatum(amoid)); + + for (i = 0; i < opclist->n_members; i++) + { + HeapTuple classtup = &opclist->members[i]->tuple; + Form_pg_opclass classform = (Form_pg_opclass) GETSTRUCT(classtup); + + if (classform->opcfamily == opfamilyoid && + classform->opcintype == datatypeoid) + { + result = classform->oid; + break; + } + } + + ReleaseCatCacheList(opclist); + + return result; +} + +/* + * Is the datatype a legitimate input type for the btree opfamily? 
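+ *
+ * For illustration, an index AM validator needing a btree-sortable type
+ * might check (sortfamilyoid and opcintype being hypothetical locals):
+ *
+ *      if (!opfamily_can_sort_type(sortfamilyoid, opcintype))
+ *          ... report that the datatype cannot be sorted by that opfamily ...
+ *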
+ */ +bool +opfamily_can_sort_type(Oid opfamilyoid, Oid datatypeoid) +{ + return OidIsValid(opclass_for_family_datatype(BTREE_AM_OID, + opfamilyoid, + datatypeoid)); +} diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c new file mode 100644 index 0000000..b93288a --- /dev/null +++ b/src/backend/access/index/genam.c @@ -0,0 +1,745 @@ +/*------------------------------------------------------------------------- + * + * genam.c + * general index access method routines + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/index/genam.c + * + * NOTES + * many of the old access method routines have been turned into + * macros and moved to genam.h -cim 4/30/91 + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/genam.h" +#include "access/heapam.h" +#include "access/relscan.h" +#include "access/tableam.h" +#include "access/transam.h" +#include "catalog/index.h" +#include "lib/stringinfo.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "storage/procarray.h" +#include "utils/acl.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/rel.h" +#include "utils/rls.h" +#include "utils/ruleutils.h" +#include "utils/snapmgr.h" +#include "utils/syscache.h" + + +/* ---------------------------------------------------------------- + * general access method routines + * + * All indexed access methods use an identical scan structure. + * We don't know how the various AMs do locking, however, so we don't + * do anything about that here. + * + * The intent is that an AM implementor will define a beginscan routine + * that calls RelationGetIndexScan, to fill in the scan, and then does + * whatever kind of locking he wants. + * + * At the end of a scan, the AM's endscan routine undoes the locking, + * but does *not* call IndexScanEnd --- the higher-level index_endscan + * routine does that. (We can't do it in the AM because index_endscan + * still needs to touch the IndexScanDesc after calling the AM.) + * + * Because of this, the AM does not have a choice whether to call + * RelationGetIndexScan or not; its beginscan routine must return an + * object made by RelationGetIndexScan. This is kinda ugly but not + * worth cleaning up now. + * ---------------------------------------------------------------- + */ + +/* ---------------- + * RelationGetIndexScan -- Create and fill an IndexScanDesc. + * + * This routine creates an index scan structure and sets up initial + * contents for it. + * + * Parameters: + * indexRelation -- index relation for scan. + * nkeys -- count of scan keys (index qual conditions). + * norderbys -- count of index order-by operators. + * + * Returns: + * An initialized IndexScanDesc. + * ---------------- + */ +IndexScanDesc +RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) +{ + IndexScanDesc scan; + + scan = (IndexScanDesc) palloc(sizeof(IndexScanDescData)); + + scan->heapRelation = NULL; /* may be set later */ + scan->xs_heapfetch = NULL; + scan->indexRelation = indexRelation; + scan->xs_snapshot = InvalidSnapshot; /* caller must initialize this */ + scan->numberOfKeys = nkeys; + scan->numberOfOrderBys = norderbys; + + /* + * We allocate key workspace here, but it won't get filled until amrescan. 
+ */ + if (nkeys > 0) + scan->keyData = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys); + else + scan->keyData = NULL; + if (norderbys > 0) + scan->orderByData = (ScanKey) palloc(sizeof(ScanKeyData) * norderbys); + else + scan->orderByData = NULL; + + scan->xs_want_itup = false; /* may be set later */ + + /* + * During recovery we ignore killed tuples and don't bother to kill them + * either. We do this because the xmin on the primary node could easily be + * later than the xmin on the standby node, so that what the primary + * thinks is killed is supposed to be visible on standby. So for correct + * MVCC for queries during recovery we must ignore these hints and check + * all tuples. Do *not* set ignore_killed_tuples to true when running in a + * transaction that was started during recovery. xactStartedInRecovery + * should not be altered by index AMs. + */ + scan->kill_prior_tuple = false; + scan->xactStartedInRecovery = TransactionStartedDuringRecovery(); + scan->ignore_killed_tuples = !scan->xactStartedInRecovery; + + scan->opaque = NULL; + + scan->xs_itup = NULL; + scan->xs_itupdesc = NULL; + scan->xs_hitup = NULL; + scan->xs_hitupdesc = NULL; + + return scan; +} + +/* ---------------- + * IndexScanEnd -- End an index scan. + * + * This routine just releases the storage acquired by + * RelationGetIndexScan(). Any AM-level resources are + * assumed to already have been released by the AM's + * endscan routine. + * + * Returns: + * None. + * ---------------- + */ +void +IndexScanEnd(IndexScanDesc scan) +{ + if (scan->keyData != NULL) + pfree(scan->keyData); + if (scan->orderByData != NULL) + pfree(scan->orderByData); + + pfree(scan); +} + +/* + * BuildIndexValueDescription + * + * Construct a string describing the contents of an index entry, in the + * form "(key_name, ...)=(key_value, ...)". This is currently used + * for building unique-constraint and exclusion-constraint error messages, + * so only key columns of the index are checked and printed. + * + * Note that if the user does not have permissions to view all of the + * columns involved then a NULL is returned. Returning a partial key seems + * unlikely to be useful and we have no way to know which of the columns the + * user provided (unlike in ExecBuildSlotValueDescription). + * + * The passed-in values/nulls arrays are the "raw" input to the index AM, + * e.g. results of FormIndexDatum --- this is not necessarily what is stored + * in the index, but it's what the user perceives to be stored. + * + * Note: if you change anything here, check whether + * ExecBuildSlotPartitionKeyDescription() in execMain.c needs a similar + * change. + */ +char * +BuildIndexValueDescription(Relation indexRelation, + Datum *values, bool *isnull) +{ + StringInfoData buf; + Form_pg_index idxrec; + int indnkeyatts; + int i; + int keyno; + Oid indexrelid = RelationGetRelid(indexRelation); + Oid indrelid; + AclResult aclresult; + + indnkeyatts = IndexRelationGetNumberOfKeyAttributes(indexRelation); + + /* + * Check permissions- if the user does not have access to view all of the + * key columns then return NULL to avoid leaking data. + * + * First check if RLS is enabled for the relation. If so, return NULL to + * avoid leaking data. + * + * Next we need to check table-level SELECT access and then, if there is + * no access there, check column-level permissions. + */ + idxrec = indexRelation->rd_index; + indrelid = idxrec->indrelid; + Assert(indexrelid == idxrec->indexrelid); + + /* RLS check- if RLS is enabled then we don't return anything. 
*/ + if (check_enable_rls(indrelid, InvalidOid, true) == RLS_ENABLED) + return NULL; + + /* Table-level SELECT is enough, if the user has it */ + aclresult = pg_class_aclcheck(indrelid, GetUserId(), ACL_SELECT); + if (aclresult != ACLCHECK_OK) + { + /* + * No table-level access, so step through the columns in the index and + * make sure the user has SELECT rights on all of them. + */ + for (keyno = 0; keyno < indnkeyatts; keyno++) + { + AttrNumber attnum = idxrec->indkey.values[keyno]; + + /* + * Note that if attnum == InvalidAttrNumber, then this is an index + * based on an expression and we return no detail rather than try + * to figure out what column(s) the expression includes and if the + * user has SELECT rights on them. + */ + if (attnum == InvalidAttrNumber || + pg_attribute_aclcheck(indrelid, attnum, GetUserId(), + ACL_SELECT) != ACLCHECK_OK) + { + /* No access, so clean up and return */ + return NULL; + } + } + } + + initStringInfo(&buf); + appendStringInfo(&buf, "(%s)=(", + pg_get_indexdef_columns(indexrelid, true)); + + for (i = 0; i < indnkeyatts; i++) + { + char *val; + + if (isnull[i]) + val = "null"; + else + { + Oid foutoid; + bool typisvarlena; + + /* + * The provided data is not necessarily of the type stored in the + * index; rather it is of the index opclass's input type. So look + * at rd_opcintype not the index tupdesc. + * + * Note: this is a bit shaky for opclasses that have pseudotype + * input types such as ANYARRAY or RECORD. Currently, the + * typoutput functions associated with the pseudotypes will work + * okay, but we might have to try harder in future. + */ + getTypeOutputInfo(indexRelation->rd_opcintype[i], + &foutoid, &typisvarlena); + val = OidOutputFunctionCall(foutoid, values[i]); + } + + if (i > 0) + appendStringInfoString(&buf, ", "); + appendStringInfoString(&buf, val); + } + + appendStringInfoChar(&buf, ')'); + + return buf.data; +} + +/* + * Get the latestRemovedXid from the table entries pointed at by the index + * tuples being deleted using an AM-generic approach. + * + * This is a table_index_delete_tuples() shim used by index AMs that have + * simple requirements. These callers only need to consult the tableam to get + * a latestRemovedXid value, and only expect to delete tuples that are already + * known deletable. When a latestRemovedXid value isn't needed in index AM's + * deletion WAL record, it is safe for it to skip calling here entirely. + * + * We assume that caller index AM uses the standard IndexTuple representation, + * with table TIDs stored in the t_tid field. We also expect (and assert) + * that the line pointers on page for 'itemnos' offsets are already marked + * LP_DEAD. 
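+ *
+ * A simple index AM's page-level deletion path might use this roughly as
+ * follows (hypothetical sketch, with deletable[] holding the LP_DEAD item
+ * offsets; a horizon is only needed when it will go into the AM's deletion
+ * WAL record):
+ *
+ *      TransactionId latestRemovedXid = InvalidTransactionId;
+ *
+ *      if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
+ *          latestRemovedXid =
+ *              index_compute_xid_horizon_for_tuples(rel, heapRel, buf,
+ *                                                   deletable, ndeletable);
+ *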
+ */ +TransactionId +index_compute_xid_horizon_for_tuples(Relation irel, + Relation hrel, + Buffer ibuf, + OffsetNumber *itemnos, + int nitems) +{ + TM_IndexDeleteOp delstate; + TransactionId latestRemovedXid = InvalidTransactionId; + Page ipage = BufferGetPage(ibuf); + IndexTuple itup; + + Assert(nitems > 0); + + delstate.bottomup = false; + delstate.bottomupfreespace = 0; + delstate.ndeltids = 0; + delstate.deltids = palloc(nitems * sizeof(TM_IndexDelete)); + delstate.status = palloc(nitems * sizeof(TM_IndexStatus)); + + /* identify what the index tuples about to be deleted point to */ + for (int i = 0; i < nitems; i++) + { + ItemId iitemid; + + iitemid = PageGetItemId(ipage, itemnos[i]); + itup = (IndexTuple) PageGetItem(ipage, iitemid); + + Assert(ItemIdIsDead(iitemid)); + + ItemPointerCopy(&itup->t_tid, &delstate.deltids[i].tid); + delstate.deltids[i].id = delstate.ndeltids; + delstate.status[i].idxoffnum = InvalidOffsetNumber; /* unused */ + delstate.status[i].knowndeletable = true; /* LP_DEAD-marked */ + delstate.status[i].promising = false; /* unused */ + delstate.status[i].freespace = 0; /* unused */ + + delstate.ndeltids++; + } + + /* determine the actual xid horizon */ + latestRemovedXid = table_index_delete_tuples(hrel, &delstate); + + /* assert tableam agrees that all items are deletable */ + Assert(delstate.ndeltids == nitems); + + pfree(delstate.deltids); + pfree(delstate.status); + + return latestRemovedXid; +} + + +/* ---------------------------------------------------------------- + * heap-or-index-scan access to system catalogs + * + * These functions support system catalog accesses that normally use + * an index but need to be capable of being switched to heap scans + * if the system indexes are unavailable. + * + * The specified scan keys must be compatible with the named index. + * Generally this means that they must constrain either all columns + * of the index, or the first K columns of an N-column index. + * + * These routines could work with non-system tables, actually, + * but they're only useful when there is a known index to use with + * the given scan keys; so in practice they're only good for + * predetermined types of scans of system catalogs. + * ---------------------------------------------------------------- + */ + +/* + * systable_beginscan --- set up for heap-or-index scan + * + * rel: catalog to scan, already opened and suitably locked + * indexId: OID of index to conditionally use + * indexOK: if false, forces a heap scan (see notes below) + * snapshot: time qual to use (NULL for a recent catalog snapshot) + * nkeys, key: scan keys + * + * The attribute numbers in the scan key should be set for the heap case. + * If we choose to index, we reset them to 1..n to reference the index + * columns. Note this means there must be one scankey qualification per + * index column! This is checked by the Asserts in the normal, index-using + * case, but won't be checked if the heapscan path is taken. + * + * The routine checks the normal cases for whether an indexscan is safe, + * but caller can make additional checks and pass indexOK=false if needed. + * In standard case indexOK can simply be constant TRUE. 
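+ *
+ * Typical usage looks roughly like this (hypothetical example fetching a
+ * pg_class row by OID; error handling omitted):
+ *
+ *      ScanKeyData key[1];
+ *      SysScanDesc scan;
+ *      HeapTuple   tuple;
+ *
+ *      ScanKeyInit(&key[0],
+ *                  Anum_pg_class_oid,
+ *                  BTEqualStrategyNumber, F_OIDEQ,
+ *                  ObjectIdGetDatum(relid));
+ *      scan = systable_beginscan(rel, ClassOidIndexId, true, NULL, 1, key);
+ *      while (HeapTupleIsValid(tuple = systable_getnext(scan)))
+ *          ... process tuple ...
+ *      systable_endscan(scan);
+ *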
+ */ +SysScanDesc +systable_beginscan(Relation heapRelation, + Oid indexId, + bool indexOK, + Snapshot snapshot, + int nkeys, ScanKey key) +{ + SysScanDesc sysscan; + Relation irel; + + if (indexOK && + !IgnoreSystemIndexes && + !ReindexIsProcessingIndex(indexId)) + irel = index_open(indexId, AccessShareLock); + else + irel = NULL; + + sysscan = (SysScanDesc) palloc(sizeof(SysScanDescData)); + + sysscan->heap_rel = heapRelation; + sysscan->irel = irel; + sysscan->slot = table_slot_create(heapRelation, NULL); + + if (snapshot == NULL) + { + Oid relid = RelationGetRelid(heapRelation); + + snapshot = RegisterSnapshot(GetCatalogSnapshot(relid)); + sysscan->snapshot = snapshot; + } + else + { + /* Caller is responsible for any snapshot. */ + sysscan->snapshot = NULL; + } + + if (irel) + { + int i; + + /* Change attribute numbers to be index column numbers. */ + for (i = 0; i < nkeys; i++) + { + int j; + + for (j = 0; j < IndexRelationGetNumberOfAttributes(irel); j++) + { + if (key[i].sk_attno == irel->rd_index->indkey.values[j]) + { + key[i].sk_attno = j + 1; + break; + } + } + if (j == IndexRelationGetNumberOfAttributes(irel)) + elog(ERROR, "column is not in index"); + } + + sysscan->iscan = index_beginscan(heapRelation, irel, + snapshot, nkeys, 0); + index_rescan(sysscan->iscan, key, nkeys, NULL, 0); + sysscan->scan = NULL; + } + else + { + /* + * We disallow synchronized scans when forced to use a heapscan on a + * catalog. In most cases the desired rows are near the front, so + * that the unpredictable start point of a syncscan is a serious + * disadvantage; and there are no compensating advantages, because + * it's unlikely that such scans will occur in parallel. + */ + sysscan->scan = table_beginscan_strat(heapRelation, snapshot, + nkeys, key, + true, false); + sysscan->iscan = NULL; + } + + /* + * If CheckXidAlive is set then set a flag to indicate that system table + * scan is in-progress. See detailed comments in xact.c where these + * variables are declared. + */ + if (TransactionIdIsValid(CheckXidAlive)) + bsysscan = true; + + return sysscan; +} + +/* + * HandleConcurrentAbort - Handle concurrent abort of the CheckXidAlive. + * + * Error out, if CheckXidAlive is aborted. We can't directly use + * TransactionIdDidAbort as after crash such transaction might not have been + * marked as aborted. See detailed comments in xact.c where the variable + * is declared. + */ +static inline void +HandleConcurrentAbort() +{ + if (TransactionIdIsValid(CheckXidAlive) && + !TransactionIdIsInProgress(CheckXidAlive) && + !TransactionIdDidCommit(CheckXidAlive)) + ereport(ERROR, + (errcode(ERRCODE_TRANSACTION_ROLLBACK), + errmsg("transaction aborted during system catalog scan"))); +} + +/* + * systable_getnext --- get next tuple in a heap-or-index scan + * + * Returns NULL if no more tuples available. + * + * Note that returned tuple is a reference to data in a disk buffer; + * it must not be modified, and should be presumed inaccessible after + * next getnext() or endscan() call. + * + * XXX: It'd probably make sense to offer a slot based interface, at least + * optionally. + */ +HeapTuple +systable_getnext(SysScanDesc sysscan) +{ + HeapTuple htup = NULL; + + if (sysscan->irel) + { + if (index_getnext_slot(sysscan->iscan, ForwardScanDirection, sysscan->slot)) + { + bool shouldFree; + + htup = ExecFetchSlotHeapTuple(sysscan->slot, false, &shouldFree); + Assert(!shouldFree); + + /* + * We currently don't need to support lossy index operators for + * any system catalog scan. 
It could be done here, using the scan + * keys to drive the operator calls, if we arranged to save the + * heap attnums during systable_beginscan(); this is practical + * because we still wouldn't need to support indexes on + * expressions. + */ + if (sysscan->iscan->xs_recheck) + elog(ERROR, "system catalog scans with lossy index conditions are not implemented"); + } + } + else + { + if (table_scan_getnextslot(sysscan->scan, ForwardScanDirection, sysscan->slot)) + { + bool shouldFree; + + htup = ExecFetchSlotHeapTuple(sysscan->slot, false, &shouldFree); + Assert(!shouldFree); + } + } + + /* + * Handle the concurrent abort while fetching the catalog tuple during + * logical streaming of a transaction. + */ + HandleConcurrentAbort(); + + return htup; +} + +/* + * systable_recheck_tuple --- recheck visibility of most-recently-fetched tuple + * + * In particular, determine if this tuple would be visible to a catalog scan + * that started now. We don't handle the case of a non-MVCC scan snapshot, + * because no caller needs that yet. + * + * This is useful to test whether an object was deleted while we waited to + * acquire lock on it. + * + * Note: we don't actually *need* the tuple to be passed in, but it's a + * good crosscheck that the caller is interested in the right tuple. + */ +bool +systable_recheck_tuple(SysScanDesc sysscan, HeapTuple tup) +{ + Snapshot freshsnap; + bool result; + + Assert(tup == ExecFetchSlotHeapTuple(sysscan->slot, false, NULL)); + + /* + * Trust that table_tuple_satisfies_snapshot() and its subsidiaries + * (commonly LockBuffer() and HeapTupleSatisfiesMVCC()) do not themselves + * acquire snapshots, so we need not register the snapshot. Those + * facilities are too low-level to have any business scanning tables. + */ + freshsnap = GetCatalogSnapshot(RelationGetRelid(sysscan->heap_rel)); + + result = table_tuple_satisfies_snapshot(sysscan->heap_rel, + sysscan->slot, + freshsnap); + + /* + * Handle the concurrent abort while fetching the catalog tuple during + * logical streaming of a transaction. + */ + HandleConcurrentAbort(); + + return result; +} + +/* + * systable_endscan --- close scan, release resources + * + * Note that it's still up to the caller to close the heap relation. + */ +void +systable_endscan(SysScanDesc sysscan) +{ + if (sysscan->slot) + { + ExecDropSingleTupleTableSlot(sysscan->slot); + sysscan->slot = NULL; + } + + if (sysscan->irel) + { + index_endscan(sysscan->iscan); + index_close(sysscan->irel, AccessShareLock); + } + else + table_endscan(sysscan->scan); + + if (sysscan->snapshot) + UnregisterSnapshot(sysscan->snapshot); + + /* + * Reset the bsysscan flag at the end of the systable scan. See detailed + * comments in xact.c where these variables are declared. + */ + if (TransactionIdIsValid(CheckXidAlive)) + bsysscan = false; + + pfree(sysscan); +} + + +/* + * systable_beginscan_ordered --- set up for ordered catalog scan + * + * These routines have essentially the same API as systable_beginscan etc, + * except that they guarantee to return multiple matching tuples in + * index order. Also, for largely historical reasons, the index to use + * is opened and locked by the caller, not here. + * + * Currently we do not support non-index-based scans here. (In principle + * we could do a heapscan and sort, but the uses are in places that + * probably don't need to still work with corrupted catalog indexes.) + * For the moment, therefore, these functions are merely the thinest of + * wrappers around index_beginscan/index_getnext_slot. 
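+ * In outline, a caller does something like this (hypothetical sketch; note
+ * that, unlike plain systable_beginscan, the index is opened and closed by
+ * the caller):
+ *
+ *      Relation    idxrel = index_open(indexoid, AccessShareLock);
+ *      SysScanDesc scan;
+ *      HeapTuple   tuple;
+ *
+ *      scan = systable_beginscan_ordered(rel, idxrel, NULL, nkeys, key);
+ *      while (HeapTupleIsValid(tuple = systable_getnext_ordered(scan,
+ *                                          ForwardScanDirection)))
+ *          ... tuples arrive in index order ...
+ *      systable_endscan_ordered(scan);
+ *      index_close(idxrel, AccessShareLock);
+ *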
The main reason for + * their existence is to centralize possible future support of lossy operators + * in catalog scans. + */ +SysScanDesc +systable_beginscan_ordered(Relation heapRelation, + Relation indexRelation, + Snapshot snapshot, + int nkeys, ScanKey key) +{ + SysScanDesc sysscan; + int i; + + /* REINDEX can probably be a hard error here ... */ + if (ReindexIsProcessingIndex(RelationGetRelid(indexRelation))) + elog(ERROR, "cannot do ordered scan on index \"%s\", because it is being reindexed", + RelationGetRelationName(indexRelation)); + /* ... but we only throw a warning about violating IgnoreSystemIndexes */ + if (IgnoreSystemIndexes) + elog(WARNING, "using index \"%s\" despite IgnoreSystemIndexes", + RelationGetRelationName(indexRelation)); + + sysscan = (SysScanDesc) palloc(sizeof(SysScanDescData)); + + sysscan->heap_rel = heapRelation; + sysscan->irel = indexRelation; + sysscan->slot = table_slot_create(heapRelation, NULL); + + if (snapshot == NULL) + { + Oid relid = RelationGetRelid(heapRelation); + + snapshot = RegisterSnapshot(GetCatalogSnapshot(relid)); + sysscan->snapshot = snapshot; + } + else + { + /* Caller is responsible for any snapshot. */ + sysscan->snapshot = NULL; + } + + /* Change attribute numbers to be index column numbers. */ + for (i = 0; i < nkeys; i++) + { + int j; + + for (j = 0; j < IndexRelationGetNumberOfAttributes(indexRelation); j++) + { + if (key[i].sk_attno == indexRelation->rd_index->indkey.values[j]) + { + key[i].sk_attno = j + 1; + break; + } + } + if (j == IndexRelationGetNumberOfAttributes(indexRelation)) + elog(ERROR, "column is not in index"); + } + + sysscan->iscan = index_beginscan(heapRelation, indexRelation, + snapshot, nkeys, 0); + index_rescan(sysscan->iscan, key, nkeys, NULL, 0); + sysscan->scan = NULL; + + return sysscan; +} + +/* + * systable_getnext_ordered --- get next tuple in an ordered catalog scan + */ +HeapTuple +systable_getnext_ordered(SysScanDesc sysscan, ScanDirection direction) +{ + HeapTuple htup = NULL; + + Assert(sysscan->irel); + if (index_getnext_slot(sysscan->iscan, direction, sysscan->slot)) + htup = ExecFetchSlotHeapTuple(sysscan->slot, false, NULL); + + /* See notes in systable_getnext */ + if (htup && sysscan->iscan->xs_recheck) + elog(ERROR, "system catalog scans with lossy index conditions are not implemented"); + + /* + * Handle the concurrent abort while fetching the catalog tuple during + * logical streaming of a transaction. 
+ */ + HandleConcurrentAbort(); + + return htup; +} + +/* + * systable_endscan_ordered --- close scan, release resources + */ +void +systable_endscan_ordered(SysScanDesc sysscan) +{ + if (sysscan->slot) + { + ExecDropSingleTupleTableSlot(sysscan->slot); + sysscan->slot = NULL; + } + + Assert(sysscan->irel); + index_endscan(sysscan->iscan); + if (sysscan->snapshot) + UnregisterSnapshot(sysscan->snapshot); + pfree(sysscan); +} diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c new file mode 100644 index 0000000..5e22479 --- /dev/null +++ b/src/backend/access/index/indexam.c @@ -0,0 +1,984 @@ +/*------------------------------------------------------------------------- + * + * indexam.c + * general index access method routines + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/index/indexam.c + * + * INTERFACE ROUTINES + * index_open - open an index relation by relation OID + * index_close - close an index relation + * index_beginscan - start a scan of an index with amgettuple + * index_beginscan_bitmap - start a scan of an index with amgetbitmap + * index_rescan - restart a scan of an index + * index_endscan - end a scan + * index_insert - insert an index tuple into a relation + * index_markpos - mark a scan position + * index_restrpos - restore a scan position + * index_parallelscan_estimate - estimate shared memory for parallel scan + * index_parallelscan_initialize - initialize parallel scan + * index_parallelrescan - (re)start a parallel scan of an index + * index_beginscan_parallel - join parallel index scan + * index_getnext_tid - get the next TID from a scan + * index_fetch_heap - get the scan's next heap tuple + * index_getnext_slot - get the next tuple from a scan + * index_getbitmap - get all tuples from a scan + * index_bulk_delete - bulk deletion of index tuples + * index_vacuum_cleanup - post-deletion cleanup of an index + * index_can_return - does index support index-only scans? + * index_getprocid - get a support procedure OID + * index_getprocinfo - get a support procedure's lookup info + * + * NOTES + * This file contains the index_ routines which used + * to be a scattered collection of stuff in access/genam. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/amapi.h" +#include "access/heapam.h" +#include "access/reloptions.h" +#include "access/relscan.h" +#include "access/tableam.h" +#include "access/transam.h" +#include "access/xlog.h" +#include "catalog/index.h" +#include "catalog/pg_amproc.h" +#include "catalog/pg_type.h" +#include "commands/defrem.h" +#include "nodes/makefuncs.h" +#include "pgstat.h" +#include "storage/bufmgr.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "utils/ruleutils.h" +#include "utils/snapmgr.h" +#include "utils/syscache.h" + + +/* ---------------------------------------------------------------- + * macros used in index_ routines + * + * Note: the ReindexIsProcessingIndex() check in RELATION_CHECKS is there + * to check that we don't try to scan or do retail insertions into an index + * that is currently being rebuilt or pending rebuild. This helps to catch + * things that don't work when reindexing system catalogs. 
The assertion + * doesn't prevent the actual rebuild because we don't use RELATION_CHECKS + * when calling the index AM's ambuild routine, and there is no reason for + * ambuild to call its subsidiary routines through this file. + * ---------------------------------------------------------------- + */ +#define RELATION_CHECKS \ +( \ + AssertMacro(RelationIsValid(indexRelation)), \ + AssertMacro(PointerIsValid(indexRelation->rd_indam)), \ + AssertMacro(!ReindexIsProcessingIndex(RelationGetRelid(indexRelation))) \ +) + +#define SCAN_CHECKS \ +( \ + AssertMacro(IndexScanIsValid(scan)), \ + AssertMacro(RelationIsValid(scan->indexRelation)), \ + AssertMacro(PointerIsValid(scan->indexRelation->rd_indam)) \ +) + +#define CHECK_REL_PROCEDURE(pname) \ +do { \ + if (indexRelation->rd_indam->pname == NULL) \ + elog(ERROR, "function \"%s\" is not defined for index \"%s\"", \ + CppAsString(pname), RelationGetRelationName(indexRelation)); \ +} while(0) + +#define CHECK_SCAN_PROCEDURE(pname) \ +do { \ + if (scan->indexRelation->rd_indam->pname == NULL) \ + elog(ERROR, "function \"%s\" is not defined for index \"%s\"", \ + CppAsString(pname), RelationGetRelationName(scan->indexRelation)); \ +} while(0) + +static IndexScanDesc index_beginscan_internal(Relation indexRelation, + int nkeys, int norderbys, Snapshot snapshot, + ParallelIndexScanDesc pscan, bool temp_snap); + + +/* ---------------------------------------------------------------- + * index_ interface functions + * ---------------------------------------------------------------- + */ + +/* ---------------- + * index_open - open an index relation by relation OID + * + * If lockmode is not "NoLock", the specified kind of lock is + * obtained on the index. (Generally, NoLock should only be + * used if the caller knows it has some appropriate lock on the + * index already.) + * + * An error is raised if the index does not exist. + * + * This is a convenience routine adapted for indexscan use. + * Some callers may prefer to use relation_open directly. + * ---------------- + */ +Relation +index_open(Oid relationId, LOCKMODE lockmode) +{ + Relation r; + + r = relation_open(relationId, lockmode); + + if (r->rd_rel->relkind != RELKIND_INDEX && + r->rd_rel->relkind != RELKIND_PARTITIONED_INDEX) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is not an index", + RelationGetRelationName(r)))); + + return r; +} + +/* ---------------- + * index_close - close an index relation + * + * If lockmode is not "NoLock", we then release the specified lock. + * + * Note that it is often sensible to hold a lock beyond index_close; + * in that case, the lock is released automatically at xact end. + * ---------------- + */ +void +index_close(Relation relation, LOCKMODE lockmode) +{ + LockRelId relid = relation->rd_lockInfo.lockRelId; + + Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES); + + /* The relcache does the real work... 
*/ + RelationClose(relation); + + if (lockmode != NoLock) + UnlockRelationId(&relid, lockmode); +} + +/* ---------------- + * index_insert - insert an index tuple into a relation + * ---------------- + */ +bool +index_insert(Relation indexRelation, + Datum *values, + bool *isnull, + ItemPointer heap_t_ctid, + Relation heapRelation, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + IndexInfo *indexInfo) +{ + RELATION_CHECKS; + CHECK_REL_PROCEDURE(aminsert); + + if (!(indexRelation->rd_indam->ampredlocks)) + CheckForSerializableConflictIn(indexRelation, + (ItemPointer) NULL, + InvalidBlockNumber); + + return indexRelation->rd_indam->aminsert(indexRelation, values, isnull, + heap_t_ctid, heapRelation, + checkUnique, indexUnchanged, + indexInfo); +} + +/* + * index_beginscan - start a scan of an index with amgettuple + * + * Caller must be holding suitable locks on the heap and the index. + */ +IndexScanDesc +index_beginscan(Relation heapRelation, + Relation indexRelation, + Snapshot snapshot, + int nkeys, int norderbys) +{ + IndexScanDesc scan; + + scan = index_beginscan_internal(indexRelation, nkeys, norderbys, snapshot, NULL, false); + + /* + * Save additional parameters into the scandesc. Everything else was set + * up by RelationGetIndexScan. + */ + scan->heapRelation = heapRelation; + scan->xs_snapshot = snapshot; + + /* prepare to fetch index matches from table */ + scan->xs_heapfetch = table_index_fetch_begin(heapRelation); + + return scan; +} + +/* + * index_beginscan_bitmap - start a scan of an index with amgetbitmap + * + * As above, caller had better be holding some lock on the parent heap + * relation, even though it's not explicitly mentioned here. + */ +IndexScanDesc +index_beginscan_bitmap(Relation indexRelation, + Snapshot snapshot, + int nkeys) +{ + IndexScanDesc scan; + + scan = index_beginscan_internal(indexRelation, nkeys, 0, snapshot, NULL, false); + + /* + * Save additional parameters into the scandesc. Everything else was set + * up by RelationGetIndexScan. + */ + scan->xs_snapshot = snapshot; + + return scan; +} + +/* + * index_beginscan_internal --- common code for index_beginscan variants + */ +static IndexScanDesc +index_beginscan_internal(Relation indexRelation, + int nkeys, int norderbys, Snapshot snapshot, + ParallelIndexScanDesc pscan, bool temp_snap) +{ + IndexScanDesc scan; + + RELATION_CHECKS; + CHECK_REL_PROCEDURE(ambeginscan); + + if (!(indexRelation->rd_indam->ampredlocks)) + PredicateLockRelation(indexRelation, snapshot); + + /* + * We hold a reference count to the relcache entry throughout the scan. + */ + RelationIncrementReferenceCount(indexRelation); + + /* + * Tell the AM to open a scan. + */ + scan = indexRelation->rd_indam->ambeginscan(indexRelation, nkeys, + norderbys); + /* Initialize information for parallel scan. */ + scan->parallel_scan = pscan; + scan->xs_temp_snap = temp_snap; + + return scan; +} + +/* ---------------- + * index_rescan - (re)start a scan of an index + * + * During a restart, the caller may specify a new set of scankeys and/or + * orderbykeys; but the number of keys cannot differ from what index_beginscan + * was told. (Later we might relax that to "must not exceed", but currently + * the index AMs tend to assume that scan->numberOfKeys is what to believe.) + * To restart the scan without changing keys, pass NULL for the key arrays. + * (Of course, keys *must* be passed on the first call, unless + * scan->numberOfKeys is zero.) 
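+ *
+ * For example, a caller re-parameterizing an existing scan might do
+ * (hypothetical sketch; the key counts must match those given to
+ * index_beginscan):
+ *
+ *      index_rescan(scan, newkeys, scan->numberOfKeys,
+ *                   neworderbys, scan->numberOfOrderBys);
+ *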
+ * ---------------- + */ +void +index_rescan(IndexScanDesc scan, + ScanKey keys, int nkeys, + ScanKey orderbys, int norderbys) +{ + SCAN_CHECKS; + CHECK_SCAN_PROCEDURE(amrescan); + + Assert(nkeys == scan->numberOfKeys); + Assert(norderbys == scan->numberOfOrderBys); + + /* Release resources (like buffer pins) from table accesses */ + if (scan->xs_heapfetch) + table_index_fetch_reset(scan->xs_heapfetch); + + scan->kill_prior_tuple = false; /* for safety */ + scan->xs_heap_continue = false; + + scan->indexRelation->rd_indam->amrescan(scan, keys, nkeys, + orderbys, norderbys); +} + +/* ---------------- + * index_endscan - end a scan + * ---------------- + */ +void +index_endscan(IndexScanDesc scan) +{ + SCAN_CHECKS; + CHECK_SCAN_PROCEDURE(amendscan); + + /* Release resources (like buffer pins) from table accesses */ + if (scan->xs_heapfetch) + { + table_index_fetch_end(scan->xs_heapfetch); + scan->xs_heapfetch = NULL; + } + + /* End the AM's scan */ + scan->indexRelation->rd_indam->amendscan(scan); + + /* Release index refcount acquired by index_beginscan */ + RelationDecrementReferenceCount(scan->indexRelation); + + if (scan->xs_temp_snap) + UnregisterSnapshot(scan->xs_snapshot); + + /* Release the scan data structure itself */ + IndexScanEnd(scan); +} + +/* ---------------- + * index_markpos - mark a scan position + * ---------------- + */ +void +index_markpos(IndexScanDesc scan) +{ + SCAN_CHECKS; + CHECK_SCAN_PROCEDURE(ammarkpos); + + scan->indexRelation->rd_indam->ammarkpos(scan); +} + +/* ---------------- + * index_restrpos - restore a scan position + * + * NOTE: this only restores the internal scan state of the index AM. See + * comments for ExecRestrPos(). + * + * NOTE: For heap, in the presence of HOT chains, mark/restore only works + * correctly if the scan's snapshot is MVCC-safe; that ensures that there's at + * most one returnable tuple in each HOT chain, and so restoring the prior + * state at the granularity of the index AM is sufficient. Since the only + * current user of mark/restore functionality is nodeMergejoin.c, this + * effectively means that merge-join plans only work for MVCC snapshots. This + * could be fixed if necessary, but for now it seems unimportant. + * ---------------- + */ +void +index_restrpos(IndexScanDesc scan) +{ + Assert(IsMVCCSnapshot(scan->xs_snapshot)); + + SCAN_CHECKS; + CHECK_SCAN_PROCEDURE(amrestrpos); + + /* release resources (like buffer pins) from table accesses */ + if (scan->xs_heapfetch) + table_index_fetch_reset(scan->xs_heapfetch); + + scan->kill_prior_tuple = false; /* for safety */ + scan->xs_heap_continue = false; + + scan->indexRelation->rd_indam->amrestrpos(scan); +} + +/* + * index_parallelscan_estimate - estimate shared memory for parallel scan + * + * Currently, we don't pass any information to the AM-specific estimator, + * so it can probably only return a constant. In the future, we might need + * to pass more information. + */ +Size +index_parallelscan_estimate(Relation indexRelation, Snapshot snapshot) +{ + Size nbytes; + + RELATION_CHECKS; + + nbytes = offsetof(ParallelIndexScanDescData, ps_snapshot_data); + nbytes = add_size(nbytes, EstimateSnapshotSpace(snapshot)); + nbytes = MAXALIGN(nbytes); + + /* + * If amestimateparallelscan is not provided, assume there is no + * AM-specific data needed. (It's hard to believe that could work, but + * it's easy enough to cater to it here.) 
+ */ + if (indexRelation->rd_indam->amestimateparallelscan != NULL) + nbytes = add_size(nbytes, + indexRelation->rd_indam->amestimateparallelscan()); + + return nbytes; +} + +/* + * index_parallelscan_initialize - initialize parallel scan + * + * We initialize both the ParallelIndexScanDesc proper and the AM-specific + * information which follows it. + * + * This function calls access method specific initialization routine to + * initialize am specific information. Call this just once in the leader + * process; then, individual workers attach via index_beginscan_parallel. + */ +void +index_parallelscan_initialize(Relation heapRelation, Relation indexRelation, + Snapshot snapshot, ParallelIndexScanDesc target) +{ + Size offset; + + RELATION_CHECKS; + + offset = add_size(offsetof(ParallelIndexScanDescData, ps_snapshot_data), + EstimateSnapshotSpace(snapshot)); + offset = MAXALIGN(offset); + + target->ps_relid = RelationGetRelid(heapRelation); + target->ps_indexid = RelationGetRelid(indexRelation); + target->ps_offset = offset; + SerializeSnapshot(snapshot, target->ps_snapshot_data); + + /* aminitparallelscan is optional; assume no-op if not provided by AM */ + if (indexRelation->rd_indam->aminitparallelscan != NULL) + { + void *amtarget; + + amtarget = OffsetToPointer(target, offset); + indexRelation->rd_indam->aminitparallelscan(amtarget); + } +} + +/* ---------------- + * index_parallelrescan - (re)start a parallel scan of an index + * ---------------- + */ +void +index_parallelrescan(IndexScanDesc scan) +{ + SCAN_CHECKS; + + if (scan->xs_heapfetch) + table_index_fetch_reset(scan->xs_heapfetch); + + /* amparallelrescan is optional; assume no-op if not provided by AM */ + if (scan->indexRelation->rd_indam->amparallelrescan != NULL) + scan->indexRelation->rd_indam->amparallelrescan(scan); +} + +/* + * index_beginscan_parallel - join parallel index scan + * + * Caller must be holding suitable locks on the heap and the index. + */ +IndexScanDesc +index_beginscan_parallel(Relation heaprel, Relation indexrel, int nkeys, + int norderbys, ParallelIndexScanDesc pscan) +{ + Snapshot snapshot; + IndexScanDesc scan; + + Assert(RelationGetRelid(heaprel) == pscan->ps_relid); + snapshot = RestoreSnapshot(pscan->ps_snapshot_data); + RegisterSnapshot(snapshot); + scan = index_beginscan_internal(indexrel, nkeys, norderbys, snapshot, + pscan, true); + + /* + * Save additional parameters into the scandesc. Everything else was set + * up by index_beginscan_internal. + */ + scan->heapRelation = heaprel; + scan->xs_snapshot = snapshot; + + /* prepare to fetch index matches from table */ + scan->xs_heapfetch = table_index_fetch_begin(heaprel); + + return scan; +} + +/* ---------------- + * index_getnext_tid - get the next TID from a scan + * + * The result is the next TID satisfying the scan keys, + * or NULL if no more matching tuples exist. + * ---------------- + */ +ItemPointer +index_getnext_tid(IndexScanDesc scan, ScanDirection direction) +{ + bool found; + + SCAN_CHECKS; + CHECK_SCAN_PROCEDURE(amgettuple); + + /* XXX: we should assert that a snapshot is pushed or registered */ + Assert(TransactionIdIsValid(RecentXmin)); + + /* + * The AM's amgettuple proc finds the next index entry matching the scan + * keys, and puts the TID into scan->xs_heaptid. It should also set + * scan->xs_recheck and possibly scan->xs_itup/scan->xs_hitup, though we + * pay no attention to those fields here. 
+ */ + found = scan->indexRelation->rd_indam->amgettuple(scan, direction); + + /* Reset kill flag immediately for safety */ + scan->kill_prior_tuple = false; + scan->xs_heap_continue = false; + + /* If we're out of index entries, we're done */ + if (!found) + { + /* release resources (like buffer pins) from table accesses */ + if (scan->xs_heapfetch) + table_index_fetch_reset(scan->xs_heapfetch); + + return NULL; + } + Assert(ItemPointerIsValid(&scan->xs_heaptid)); + + pgstat_count_index_tuples(scan->indexRelation, 1); + + /* Return the TID of the tuple we found. */ + return &scan->xs_heaptid; +} + +/* ---------------- + * index_fetch_heap - get the scan's next heap tuple + * + * The result is a visible heap tuple associated with the index TID most + * recently fetched by index_getnext_tid, or NULL if no more matching tuples + * exist. (There can be more than one matching tuple because of HOT chains, + * although when using an MVCC snapshot it should be impossible for more than + * one such tuple to exist.) + * + * On success, the buffer containing the heap tup is pinned (the pin will be + * dropped in a future index_getnext_tid, index_fetch_heap or index_endscan + * call). + * + * Note: caller must check scan->xs_recheck, and perform rechecking of the + * scan keys if required. We do not do that here because we don't have + * enough information to do it efficiently in the general case. + * ---------------- + */ +bool +index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot) +{ + bool all_dead = false; + bool found; + + found = table_index_fetch_tuple(scan->xs_heapfetch, &scan->xs_heaptid, + scan->xs_snapshot, slot, + &scan->xs_heap_continue, &all_dead); + + if (found) + pgstat_count_heap_fetch(scan->indexRelation); + + /* + * If we scanned a whole HOT chain and found only dead tuples, tell index + * AM to kill its entry for that TID (this will take effect in the next + * amgettuple call, in index_getnext_tid). We do not do this when in + * recovery because it may violate MVCC to do so. See comments in + * RelationGetIndexScan(). + */ + if (!scan->xactStartedInRecovery) + scan->kill_prior_tuple = all_dead; + + return found; +} + +/* ---------------- + * index_getnext_slot - get the next tuple from a scan + * + * The result is true if a tuple satisfying the scan keys and the snapshot was + * found, false otherwise. The tuple is stored in the specified slot. + * + * On success, resources (like buffer pins) are likely to be held, and will be + * dropped by a future index_getnext_tid, index_fetch_heap or index_endscan + * call). + * + * Note: caller must check scan->xs_recheck, and perform rechecking of the + * scan keys if required. We do not do that here because we don't have + * enough information to do it efficiently in the general case. + * ---------------- + */ +bool +index_getnext_slot(IndexScanDesc scan, ScanDirection direction, TupleTableSlot *slot) +{ + for (;;) + { + if (!scan->xs_heap_continue) + { + ItemPointer tid; + + /* Time to fetch the next TID from the index */ + tid = index_getnext_tid(scan, direction); + + /* If we're out of index entries, we're done */ + if (tid == NULL) + break; + + Assert(ItemPointerEquals(tid, &scan->xs_heaptid)); + } + + /* + * Fetch the next (or only) visible heap tuple for this index entry. + * If we don't find anything, loop around and grab the next TID from + * the index. 
+ */ + Assert(ItemPointerIsValid(&scan->xs_heaptid)); + if (index_fetch_heap(scan, slot)) + return true; + } + + return false; +} + +/* ---------------- + * index_getbitmap - get all tuples at once from an index scan + * + * Adds the TIDs of all heap tuples satisfying the scan keys to a bitmap. + * Since there's no interlock between the index scan and the eventual heap + * access, this is only safe to use with MVCC-based snapshots: the heap + * item slot could have been replaced by a newer tuple by the time we get + * to it. + * + * Returns the number of matching tuples found. (Note: this might be only + * approximate, so it should only be used for statistical purposes.) + * ---------------- + */ +int64 +index_getbitmap(IndexScanDesc scan, TIDBitmap *bitmap) +{ + int64 ntids; + + SCAN_CHECKS; + CHECK_SCAN_PROCEDURE(amgetbitmap); + + /* just make sure this is false... */ + scan->kill_prior_tuple = false; + + /* + * have the am's getbitmap proc do all the work. + */ + ntids = scan->indexRelation->rd_indam->amgetbitmap(scan, bitmap); + + pgstat_count_index_tuples(scan->indexRelation, ntids); + + return ntids; +} + +/* ---------------- + * index_bulk_delete - do mass deletion of index entries + * + * callback routine tells whether a given main-heap tuple is + * to be deleted + * + * return value is an optional palloc'd struct of statistics + * ---------------- + */ +IndexBulkDeleteResult * +index_bulk_delete(IndexVacuumInfo *info, + IndexBulkDeleteResult *istat, + IndexBulkDeleteCallback callback, + void *callback_state) +{ + Relation indexRelation = info->index; + + RELATION_CHECKS; + CHECK_REL_PROCEDURE(ambulkdelete); + + return indexRelation->rd_indam->ambulkdelete(info, istat, + callback, callback_state); +} + +/* ---------------- + * index_vacuum_cleanup - do post-deletion cleanup of an index + * + * return value is an optional palloc'd struct of statistics + * ---------------- + */ +IndexBulkDeleteResult * +index_vacuum_cleanup(IndexVacuumInfo *info, + IndexBulkDeleteResult *istat) +{ + Relation indexRelation = info->index; + + RELATION_CHECKS; + CHECK_REL_PROCEDURE(amvacuumcleanup); + + return indexRelation->rd_indam->amvacuumcleanup(info, istat); +} + +/* ---------------- + * index_can_return + * + * Does the index access method support index-only scans for the given + * column? + * ---------------- + */ +bool +index_can_return(Relation indexRelation, int attno) +{ + RELATION_CHECKS; + + /* amcanreturn is optional; assume false if not provided by AM */ + if (indexRelation->rd_indam->amcanreturn == NULL) + return false; + + return indexRelation->rd_indam->amcanreturn(indexRelation, attno); +} + +/* ---------------- + * index_getprocid + * + * Index access methods typically require support routines that are + * not directly the implementation of any WHERE-clause query operator + * and so cannot be kept in pg_amop. Instead, such routines are kept + * in pg_amproc. These registered procedure OIDs are assigned numbers + * according to a convention established by the access method. + * The general index code doesn't know anything about the routines + * involved; it just builds an ordered list of them for + * each attribute on which an index is defined. + * + * As of Postgres 8.3, support routines within an operator family + * are further subdivided by the "left type" and "right type" of the + * query operator(s) that they support. 
The "default" functions for a + * particular indexed attribute are those with both types equal to + * the index opclass' opcintype (note that this is subtly different + * from the indexed attribute's own type: it may be a binary-compatible + * type instead). Only the default functions are stored in relcache + * entries --- access methods can use the syscache to look up non-default + * functions. + * + * This routine returns the requested default procedure OID for a + * particular indexed attribute. + * ---------------- + */ +RegProcedure +index_getprocid(Relation irel, + AttrNumber attnum, + uint16 procnum) +{ + RegProcedure *loc; + int nproc; + int procindex; + + nproc = irel->rd_indam->amsupport; + + Assert(procnum > 0 && procnum <= (uint16) nproc); + + procindex = (nproc * (attnum - 1)) + (procnum - 1); + + loc = irel->rd_support; + + Assert(loc != NULL); + + return loc[procindex]; +} + +/* ---------------- + * index_getprocinfo + * + * This routine allows index AMs to keep fmgr lookup info for + * support procs in the relcache. As above, only the "default" + * functions for any particular indexed attribute are cached. + * + * Note: the return value points into cached data that will be lost during + * any relcache rebuild! Therefore, either use the callinfo right away, + * or save it only after having acquired some type of lock on the index rel. + * ---------------- + */ +FmgrInfo * +index_getprocinfo(Relation irel, + AttrNumber attnum, + uint16 procnum) +{ + FmgrInfo *locinfo; + int nproc; + int optsproc; + int procindex; + + nproc = irel->rd_indam->amsupport; + optsproc = irel->rd_indam->amoptsprocnum; + + Assert(procnum > 0 && procnum <= (uint16) nproc); + + procindex = (nproc * (attnum - 1)) + (procnum - 1); + + locinfo = irel->rd_supportinfo; + + Assert(locinfo != NULL); + + locinfo += procindex; + + /* Initialize the lookup info if first time through */ + if (locinfo->fn_oid == InvalidOid) + { + RegProcedure *loc = irel->rd_support; + RegProcedure procId; + + Assert(loc != NULL); + + procId = loc[procindex]; + + /* + * Complain if function was not found during IndexSupportInitialize. + * This should not happen unless the system tables contain bogus + * entries for the index opclass. (If an AM wants to allow a support + * function to be optional, it can use index_getprocid.) + */ + if (!RegProcedureIsValid(procId)) + elog(ERROR, "missing support function %d for attribute %d of index \"%s\"", + procnum, attnum, RelationGetRelationName(irel)); + + fmgr_info_cxt(procId, locinfo, irel->rd_indexcxt); + + if (procnum != optsproc) + { + /* Initialize locinfo->fn_expr with opclass options Const */ + bytea **attoptions = RelationGetIndexAttOptions(irel, false); + MemoryContext oldcxt = MemoryContextSwitchTo(irel->rd_indexcxt); + + set_fn_opclass_options(locinfo, attoptions[attnum - 1]); + + MemoryContextSwitchTo(oldcxt); + } + } + + return locinfo; +} + +/* ---------------- + * index_store_float8_orderby_distances + * + * Convert AM distance function's results (that can be inexact) + * to ORDER BY types and save them into xs_orderbyvals/xs_orderbynulls + * for a possible recheck. 
+ * ---------------- + */ +void +index_store_float8_orderby_distances(IndexScanDesc scan, Oid *orderByTypes, + IndexOrderByDistance *distances, + bool recheckOrderBy) +{ + int i; + + Assert(distances || !recheckOrderBy); + + scan->xs_recheckorderby = recheckOrderBy; + + for (i = 0; i < scan->numberOfOrderBys; i++) + { + if (orderByTypes[i] == FLOAT8OID) + { +#ifndef USE_FLOAT8_BYVAL + /* must free any old value to avoid memory leakage */ + if (!scan->xs_orderbynulls[i]) + pfree(DatumGetPointer(scan->xs_orderbyvals[i])); +#endif + if (distances && !distances[i].isnull) + { + scan->xs_orderbyvals[i] = Float8GetDatum(distances[i].value); + scan->xs_orderbynulls[i] = false; + } + else + { + scan->xs_orderbyvals[i] = (Datum) 0; + scan->xs_orderbynulls[i] = true; + } + } + else if (orderByTypes[i] == FLOAT4OID) + { + /* convert distance function's result to ORDER BY type */ + if (distances && !distances[i].isnull) + { + scan->xs_orderbyvals[i] = Float4GetDatum((float4) distances[i].value); + scan->xs_orderbynulls[i] = false; + } + else + { + scan->xs_orderbyvals[i] = (Datum) 0; + scan->xs_orderbynulls[i] = true; + } + } + else + { + /* + * If the ordering operator's return value is anything else, we + * don't know how to convert the float8 bound calculated by the + * distance function to that. The executor won't actually need + * the order by values we return here, if there are no lossy + * results, so only insist on converting if the *recheck flag is + * set. + */ + if (scan->xs_recheckorderby) + elog(ERROR, "ORDER BY operator must return float8 or float4 if the distance function is lossy"); + scan->xs_orderbynulls[i] = true; + } + } +} + +/* ---------------- + * index_opclass_options + * + * Parse opclass-specific options for index column. + * ---------------- + */ +bytea * +index_opclass_options(Relation indrel, AttrNumber attnum, Datum attoptions, + bool validate) +{ + int amoptsprocnum = indrel->rd_indam->amoptsprocnum; + Oid procid = InvalidOid; + FmgrInfo *procinfo; + local_relopts relopts; + + /* fetch options support procedure if specified */ + if (amoptsprocnum != 0) + procid = index_getprocid(indrel, attnum, amoptsprocnum); + + if (!OidIsValid(procid)) + { + Oid opclass; + Datum indclassDatum; + oidvector *indclass; + bool isnull; + + if (!DatumGetPointer(attoptions)) + return NULL; /* ok, no options, no procedure */ + + /* + * Report an error if the opclass's options-parsing procedure does not + * exist but the opclass options are specified. 
+ */ + indclassDatum = SysCacheGetAttr(INDEXRELID, indrel->rd_indextuple, + Anum_pg_index_indclass, &isnull); + Assert(!isnull); + indclass = (oidvector *) DatumGetPointer(indclassDatum); + opclass = indclass->values[attnum - 1]; + + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("operator class %s has no options", + generate_opclass_name(opclass)))); + } + + init_local_reloptions(&relopts, 0); + + procinfo = index_getprocinfo(indrel, attnum, amoptsprocnum); + + (void) FunctionCall1(procinfo, PointerGetDatum(&relopts)); + + return build_local_reloptions(&relopts, attoptions, validate); +} diff --git a/src/backend/access/nbtree/Makefile b/src/backend/access/nbtree/Makefile new file mode 100644 index 0000000..d69808e --- /dev/null +++ b/src/backend/access/nbtree/Makefile @@ -0,0 +1,28 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/nbtree +# +# IDENTIFICATION +# src/backend/access/nbtree/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/nbtree +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + nbtcompare.o \ + nbtdedup.o \ + nbtinsert.o \ + nbtpage.o \ + nbtree.o \ + nbtsearch.o \ + nbtsort.o \ + nbtsplitloc.o \ + nbtutils.o \ + nbtvalidate.o \ + nbtxlog.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README new file mode 100644 index 0000000..bfe33b6 --- /dev/null +++ b/src/backend/access/nbtree/README @@ -0,0 +1,1056 @@ +src/backend/access/nbtree/README + +Btree Indexing +============== + +This directory contains a correct implementation of Lehman and Yao's +high-concurrency B-tree management algorithm (P. Lehman and S. Yao, +Efficient Locking for Concurrent Operations on B-Trees, ACM Transactions +on Database Systems, Vol 6, No. 4, December 1981, pp 650-670). We also +use a simplified version of the deletion logic described in Lanin and +Shasha (V. Lanin and D. Shasha, A Symmetric Concurrent B-Tree Algorithm, +Proceedings of 1986 Fall Joint Computer Conference, pp 380-389). + +The basic Lehman & Yao Algorithm +-------------------------------- + +Compared to a classic B-tree, L&Y adds a right-link pointer to each page, +to the page's right sibling. It also adds a "high key" to each page, which +is an upper bound on the keys that are allowed on that page. These two +additions make it possible detect a concurrent page split, which allows the +tree to be searched without holding any read locks (except to keep a single +page from being modified while reading it). + +When a search follows a downlink to a child page, it compares the page's +high key with the search key. If the search key is greater than the high +key, the page must've been split concurrently, and you must follow the +right-link to find the new page containing the key range you're looking +for. This might need to be repeated, if the page has been split more than +once. + +Lehman and Yao talk about alternating "separator" keys and downlinks in +internal pages rather than tuples or records. We use the term "pivot" +tuple to refer to tuples which don't point to heap tuples, that are used +only for tree navigation. All tuples on non-leaf pages and high keys on +leaf pages are pivot tuples. 
Since pivot tuples are only used to represent +which part of the key space belongs on each page, they can have attribute +values copied from non-pivot tuples that were deleted and killed by VACUUM +some time ago. A pivot tuple may contain a "separator" key and downlink, +just a separator key (i.e. the downlink value is implicitly undefined), or +just a downlink (i.e. all attributes are truncated away). + +The requirement that all btree keys be unique is satisfied by treating heap +TID as a tiebreaker attribute. Logical duplicates are sorted in heap TID +order. This is necessary because Lehman and Yao also require that the key +range for a subtree S is described by Ki < v <= Ki+1 where Ki and Ki+1 are +the adjacent keys in the parent page (Ki must be _strictly_ less than v, +which is assured by having reliably unique keys). Keys are always unique +on their level, with the exception of a leaf page's high key, which can be +fully equal to the last item on the page. + +The Postgres implementation of suffix truncation must make sure that the +Lehman and Yao invariants hold, and represents that absent/truncated +attributes in pivot tuples have the sentinel value "minus infinity". The +later section on suffix truncation will be helpful if it's unclear how the +Lehman & Yao invariants work with a real world example. + +Differences to the Lehman & Yao algorithm +----------------------------------------- + +We have made the following changes in order to incorporate the L&Y algorithm +into Postgres: + +Lehman and Yao don't require read locks, but assume that in-memory +copies of tree pages are unshared. Postgres shares in-memory buffers +among backends. As a result, we do page-level read locking on btree +pages in order to guarantee that no record is modified while we are +examining it. This reduces concurrency but guarantees correct +behavior. + +We support the notion of an ordered "scan" of an index as well as +insertions, deletions, and simple lookups. A scan in the forward +direction is no problem, we just use the right-sibling pointers that +L&Y require anyway. (Thus, once we have descended the tree to the +correct start point for the scan, the scan looks only at leaf pages +and never at higher tree levels.) To support scans in the backward +direction, we also store a "left sibling" link much like the "right +sibling". (This adds an extra step to the L&Y split algorithm: while +holding the write lock on the page being split, we also lock its former +right sibling to update that page's left-link. This is safe since no +writer of that page can be interested in acquiring a write lock on our +page.) A backwards scan has one additional bit of complexity: after +following the left-link we must account for the possibility that the +left sibling page got split before we could read it. So, we have to +move right until we find a page whose right-link matches the page we +came from. (Actually, it's even harder than that; see page deletion +discussion below.) + +Page read locks are held only for as long as a scan is examining a page. +To minimize lock/unlock traffic, an index scan always searches a leaf page +to identify all the matching items at once, copying their heap tuple IDs +into backend-local storage. The heap tuple IDs are then processed while +not holding any page lock within the index. We do continue to hold a pin +on the leaf page in some circumstances, to protect against concurrent +deletions (see below). 
In this state the scan is effectively stopped +"between" pages, either before or after the page it has pinned. This is +safe in the presence of concurrent insertions and even page splits, because +items are never moved across pre-existing page boundaries --- so the scan +cannot miss any items it should have seen, nor accidentally return the same +item twice. The scan must remember the page's right-link at the time it +was scanned, since that is the page to move right to; if we move right to +the current right-link then we'd re-scan any items moved by a page split. +We don't similarly remember the left-link, since it's best to use the most +up-to-date left-link when trying to move left (see detailed move-left +algorithm below). + +In most cases we release our lock and pin on a page before attempting +to acquire pin and lock on the page we are moving to. In a few places +it is necessary to lock the next page before releasing the current one. +This is safe when moving right or up, but not when moving left or down +(else we'd create the possibility of deadlocks). + +Lehman and Yao fail to discuss what must happen when the root page +becomes full and must be split. Our implementation is to split the +root in the same way that any other page would be split, then construct +a new root page holding pointers to both of the resulting pages (which +now become siblings on the next level of the tree). The new root page +is then installed by altering the root pointer in the meta-data page (see +below). This works because the root is not treated specially in any +other way --- in particular, searches will move right using its link +pointer if the link is set. Therefore, searches will find the data +that's been moved into the right sibling even if they read the meta-data +page before it got updated. This is the same reasoning that makes a +split of a non-root page safe. The locking considerations are similar too. + +When an inserter recurses up the tree, splitting internal pages to insert +links to pages inserted on the level below, it is possible that it will +need to access a page above the level that was the root when it began its +descent (or more accurately, the level that was the root when it read the +meta-data page). In this case the stack it made while descending does not +help for finding the correct page. When this happens, we find the correct +place by re-descending the tree until we reach the level one above the +level we need to insert a link to, and then moving right as necessary. +(Typically this will take only two fetches, the meta-data page and the new +root, but in principle there could have been more than one root split +since we saw the root. We can identify the correct tree level by means of +the level numbers stored in each page. The situation is rare enough that +we do not need a more efficient solution.) + +Lehman and Yao must couple/chain locks as part of moving right when +relocating a child page's downlink during an ascent of the tree. This is +the only point where Lehman and Yao have to simultaneously hold three +locks (a lock on the child, the original parent, and the original parent's +right sibling). We don't need to couple internal page locks for pages on +the same level, though. We match a child's block number to a downlink +from a pivot tuple one level up, whereas Lehman and Yao match on the +separator key associated with the downlink that was followed during the +initial descent. 
We can release the lock on the original parent page +before acquiring a lock on its right sibling, since there is never any +need to deal with the case where the separator key that we must relocate +becomes the original parent's high key. Lanin and Shasha don't couple +locks here either, though they also don't couple locks between levels +during ascents. They are willing to "wait and try again" to avoid races. +Their algorithm is optimistic, which means that "an insertion holds no +more than one write lock at a time during its ascent". We more or less +stick with Lehman and Yao's approach of conservatively coupling parent and +child locks when ascending the tree, since it's far simpler. + +Lehman and Yao assume fixed-size keys, but we must deal with +variable-size keys. Therefore there is not a fixed maximum number of +keys per page; we just stuff in as many as will fit. When we split a +page, we try to equalize the number of bytes, not items, assigned to +pages (though suffix truncation is also considered). Note we must include +the incoming item in this calculation, otherwise it is possible to find +that the incoming item doesn't fit on the split page where it needs to go! + +Deleting index tuples during VACUUM +----------------------------------- + +Before deleting a leaf item, we get a super-exclusive lock on the target +page, so that no other backend has a pin on the page when the deletion +starts. This is not necessary for correctness in terms of the btree index +operations themselves; as explained above, index scans logically stop +"between" pages and so can't lose their place. The reason we do it is to +provide an interlock between VACUUM and indexscans. Since VACUUM deletes +index entries before reclaiming heap tuple line pointers, the +super-exclusive lock guarantees that VACUUM can't reclaim for re-use a +line pointer that an indexscanning process might be about to visit. This +guarantee works only for simple indexscans that visit the heap in sync +with the index scan, not for bitmap scans. We only need the guarantee +when using non-MVCC snapshot rules; when using an MVCC snapshot, it +doesn't matter if the heap tuple is replaced with an unrelated tuple at +the same TID, because the new tuple won't be visible to our scan anyway. +Therefore, a scan using an MVCC snapshot which has no other confounding +factors will not hold the pin after the page contents are read. The +current reasons for exceptions, where a pin is still needed, are if the +index is not WAL-logged or if the scan is an index-only scan. If later +work allows the pin to be dropped for all cases we will be able to +simplify the vacuum code, since the concept of a super-exclusive lock +for btree indexes will no longer be needed. + +Because a pin is not always held, and a page can be split even while +someone does hold a pin on it, it is possible that an indexscan will +return items that are no longer stored on the page it has a pin on, but +rather somewhere to the right of that page. To ensure that VACUUM can't +prematurely remove such heap tuples, we require btbulkdelete to obtain a +super-exclusive lock on every leaf page in the index, even pages that +don't contain any deletable tuples. Any scan which could yield incorrect +results if the tuple at a TID matching the scan's range and filter +conditions were replaced by a different tuple while the scan is in +progress must hold the pin on each index page until all index entries read +from the page have been processed. 
This guarantees that the btbulkdelete +call cannot return while any indexscan is still holding a copy of a +deleted index tuple if the scan could be confused by that. Note that this +requirement does not say that btbulkdelete must visit the pages in any +particular order. (See also simple deletion and bottom-up deletion, +below.) + +There is no such interlocking for deletion of items in internal pages, +since backends keep no lock nor pin on a page they have descended past. +Hence, when a backend is ascending the tree using its stack, it must +be prepared for the possibility that the item it wants is to the left of +the recorded position (but it can't have moved left out of the recorded +page). Since we hold a lock on the lower page (per L&Y) until we have +re-found the parent item that links to it, we can be assured that the +parent item does still exist and can't have been deleted. + +VACUUM's linear scan, concurrent page splits +-------------------------------------------- + +VACUUM accesses the index by doing a linear scan to search for deletable +TIDs, while considering the possibility of deleting empty pages in +passing. This is in physical/block order, not logical/keyspace order. +The tricky part of this is avoiding missing any deletable tuples in the +presence of concurrent page splits: a page split could easily move some +tuples from a page not yet passed over by the sequential scan to a +lower-numbered page already passed over. + +To implement this, we provide a "vacuum cycle ID" mechanism that makes it +possible to determine whether a page has been split since the current +btbulkdelete cycle started. If btbulkdelete finds a page that has been +split since it started, and has a right-link pointing to a lower page +number, then it temporarily suspends its sequential scan and visits that +page instead. It must continue to follow right-links and vacuum dead +tuples until reaching a page that either hasn't been split since +btbulkdelete started, or is above the location of the outer sequential +scan. Then it can resume the sequential scan. This ensures that all +tuples are visited. It may be that some tuples are visited twice, but +that has no worse effect than an inaccurate index tuple count (and we +can't guarantee an accurate count anyway in the face of concurrent +activity). Note that this still works if the has-been-recently-split test +has a small probability of false positives, so long as it never gives a +false negative. This makes it possible to implement the test with a small +counter value stored on each index page. + +Deleting entire pages during VACUUM +----------------------------------- + +We consider deleting an entire page from the btree only when it's become +completely empty of items. (Merging partly-full pages would allow better +space reuse, but it seems impractical to move existing data items left or +right to make this happen --- a scan moving in the opposite direction +might miss the items if so.) Also, we *never* delete the rightmost page +on a tree level (this restriction simplifies the traversal algorithms, as +explained below). Page deletion always begins from an empty leaf page. An +internal page can only be deleted as part of deleting an entire subtree. +This is always a "skinny" subtree consisting of a "chain" of internal pages +plus a single leaf page. There is one page on each level of the subtree, +and each level/page covers the same key space. + +Deleting a leaf page is a two-stage process. 
In the first stage, the page +is unlinked from its parent, and marked as half-dead. The parent page must +be found using the same type of search as used to find the parent during an +insertion split. We lock the target and the parent pages, change the +target's downlink to point to the right sibling, and remove its old +downlink. This causes the target page's key space to effectively belong to +its right sibling. (Neither the left nor right sibling pages need to +change their "high key" if any; so there is no problem with possibly not +having enough space to replace a high key.) At the same time, we mark the +target page as half-dead, which causes any subsequent searches to ignore it +and move right (or left, in a backwards scan). This leaves the tree in a +similar state as during a page split: the page has no downlink pointing to +it, but it's still linked to its siblings. + +(Note: Lanin and Shasha prefer to make the key space move left, but their +argument for doing so hinges on not having left-links, which we have +anyway. So we simplify the algorithm by moving the key space right. This +is only possible because we don't match on a separator key when ascending +the tree during a page split, unlike Lehman and Yao/Lanin and Shasha -- it +doesn't matter if the downlink is re-found in a pivot tuple whose separator +key does not match the one encountered when inserter initially descended +the tree.) + +To preserve consistency on the parent level, we cannot merge the key space +of a page into its right sibling unless the right sibling is a child of +the same parent --- otherwise, the parent's key space assignment changes +too, meaning we'd have to make bounding-key updates in its parent, and +perhaps all the way up the tree. Since we can't possibly do that +atomically, we forbid this case. That means that the rightmost child of a +parent node can't be deleted unless it's the only remaining child, in which +case we will delete the parent too (see below). + +In the second-stage, the half-dead leaf page is unlinked from its siblings. +We first lock the left sibling (if any) of the target, the target page +itself, and its right sibling (there must be one) in that order. Then we +update the side-links in the siblings, and mark the target page deleted. + +When we're about to delete the last remaining child of a parent page, things +are slightly more complicated. In the first stage, we leave the immediate +parent of the leaf page alone, and remove the downlink to the parent page +instead, from the grandparent. If it's the last child of the grandparent +too, we recurse up until we find a parent with more than one child, and +remove the downlink of that page. The leaf page is marked as half-dead, and +the block number of the page whose downlink was removed is stashed in the +half-dead leaf page. This leaves us with a chain of internal pages, with +one downlink each, leading to the half-dead leaf page, and no downlink +pointing to the topmost page in the chain. + +While we recurse up to find the topmost parent in the chain, we keep the +leaf page locked, but don't need to hold locks on the intermediate pages +between the leaf and the topmost parent -- insertions into upper tree levels +happen only as a result of splits of child pages, and that can't happen as +long as we're keeping the leaf locked. The internal pages in the chain +cannot acquire new children afterwards either, because the leaf page is +marked as half-dead and won't be split. 
+ +Removing the downlink to the top of the to-be-deleted subtree/chain +effectively transfers the key space to the right sibling for all the +intermediate levels too, in one atomic operation. A concurrent search might +still visit the intermediate pages, but it will move right when it reaches +the half-dead page at the leaf level. In particular, the search will move to +the subtree to the right of the half-dead leaf page/to-be-deleted subtree, +since the half-dead leaf page's right sibling must be a "cousin" page, not a +"true" sibling page (or a second cousin page when the to-be-deleted chain +starts at leaf page's grandparent page, and so on). + +In the second stage, the topmost page in the chain is unlinked from its +siblings, and the half-dead leaf page is updated to point to the next page +down in the chain. This is repeated until there are no internal pages left +in the chain. Finally, the half-dead leaf page itself is unlinked from its +siblings. + +A deleted page cannot be recycled immediately, since there may be other +processes waiting to reference it (ie, search processes that just left the +parent, or scans moving right or left from one of the siblings). These +processes must be able to observe a deleted page for some time after the +deletion operation, in order to be able to at least recover from it (they +recover by moving right, as with concurrent page splits). Searchers never +have to worry about concurrent page recycling. + +See "Placing deleted pages in the FSM" section below for a description of +when and how deleted pages become safe for VACUUM to make recyclable. + +Page deletion and backwards scans +--------------------------------- + +Moving left in a backward scan is complicated because we must consider +the possibility that the left sibling was just split (meaning we must find +the rightmost page derived from the left sibling), plus the possibility +that the page we were just on has now been deleted and hence isn't in the +sibling chain at all anymore. So the move-left algorithm becomes: + +0. Remember the page we are on as the "original page". +1. Follow the original page's left-link (we're done if this is zero). +2. If the current page is live and its right-link matches the "original + page", we are done. +3. Otherwise, move right one or more times looking for a live page whose + right-link matches the "original page". If found, we are done. (In + principle we could scan all the way to the right end of the index, but + in practice it seems better to give up after a small number of tries. + It's unlikely the original page's sibling split more than a few times + while we were in flight to it; if we do not find a matching link in a + few tries, then most likely the original page is deleted.) +4. Return to the "original page". If it is still live, return to step 1 + (we guessed wrong about it being deleted, and should restart with its + current left-link). If it is dead, move right until a non-dead page + is found (there must be one, since rightmost pages are never deleted), + mark that as the new "original page", and return to step 1. + +This algorithm is correct because the live page found by step 4 will have +the same left keyspace boundary as the page we started from. Therefore, +when we ultimately exit, it must be on a page whose right keyspace +boundary matches the left boundary of where we started --- which is what +we need to be sure we don't miss or re-scan any items. 
+ +Page deletion and tree height +----------------------------- + +Because we never delete the rightmost page of any level (and in particular +never delete the root), it's impossible for the height of the tree to +decrease. After massive deletions we might have a scenario in which the +tree is "skinny", with several single-page levels below the root. +Operations will still be correct in this case, but we'd waste cycles +descending through the single-page levels. To handle this we use an idea +from Lanin and Shasha: we keep track of the "fast root" level, which is +the lowest single-page level. The meta-data page keeps a pointer to this +level as well as the true root. All ordinary operations initiate their +searches at the fast root not the true root. When we split a page that is +alone on its level or delete the next-to-last page on a level (both cases +are easily detected), we have to make sure that the fast root pointer is +adjusted appropriately. In the split case, we do this work as part of the +atomic update for the insertion into the parent level; in the delete case +as part of the atomic update for the delete (either way, the metapage has +to be the last page locked in the update to avoid deadlock risks). This +avoids race conditions if two such operations are executing concurrently. + +Placing deleted pages in the FSM +-------------------------------- + +Recycling a page is decoupled from page deletion. A deleted page can only +be put in the FSM to be recycled once there is no possible scan or search +that has a reference to it; until then, it must stay in place with its +sibling links undisturbed, as a tombstone that allows concurrent searches +to detect and then recover from concurrent deletions (which are rather +like concurrent page splits to searchers). This design is an +implementation of what Lanin and Shasha call "the drain technique". + +We implement the technique by waiting until all active snapshots and +registered snapshots as of the page deletion are gone; which is overly +strong, but is simple to implement within Postgres. When marked fully +dead, a deleted page is labeled with the next-transaction counter value. +VACUUM can reclaim the page for re-use when the stored XID is guaranteed +to be "visible to everyone". As collateral damage, we wait for snapshots +taken until the next transaction to allocate an XID commits. We also wait +for running XIDs with no snapshots. + +Prior to PostgreSQL 14, VACUUM would only place _old_ deleted pages that +it encounters during its linear scan (pages deleted by a previous VACUUM +operation) in the FSM. Newly deleted pages were never placed in the FSM, +because that was assumed to _always_ be unsafe. That assumption was +unnecessarily pessimistic in practice, though -- it often doesn't take +very long for newly deleted pages to become safe to place in the FSM. +There is no truly principled way to predict when deleted pages will become +safe to place in the FSM for recycling -- it might become safe almost +immediately (long before the current VACUUM completes), or it might not +even be safe by the time the next VACUUM takes place. Recycle safety is +purely a question of maintaining the consistency (or at least the apparent +consistency) of a physical data structure. The state within the backend +running VACUUM is simply not relevant. + +PostgreSQL 14 added the ability for VACUUM to consider if it's possible to +recycle newly deleted pages at the end of the full index scan where the +page deletion took place. 
It is convenient to check if it's safe at that +point. This does require that VACUUM keep around a little bookkeeping +information about newly deleted pages, but that's very cheap. Using +in-memory state for this avoids the need to revisit newly deleted pages a +second time later on -- we can just use safexid values from the local +bookkeeping state to determine recycle safety in a deferred fashion. + +The need for additional FSM indirection after a page deletion operation +takes place is a natural consequence of the highly permissive rules for +index scans with Lehman and Yao's design. In general an index scan +doesn't have to hold a lock or even a pin on any page when it descends the +tree (nothing that you'd usually think of as an interlock is held "between +levels"). At the same time, index scans cannot be allowed to land on a +truly unrelated page due to concurrent recycling (not to be confused with +concurrent deletion), because that results in wrong answers to queries. +Simpler approaches to page deletion that don't need to defer recycling are +possible, but none seem compatible with Lehman and Yao's design. + +Placing an already-deleted page in the FSM to be recycled when needed +doesn't actually change the state of the page. The page will be changed +whenever it is subsequently taken from the FSM for reuse. The deleted +page's contents will be overwritten by the split operation (it will become +the new right sibling page). + +Fastpath For Index Insertion +---------------------------- + +We optimize for a common case of insertion of increasing index key +values by caching the last page to which this backend inserted the last +value, if this page was the rightmost leaf page. For the next insert, we +can then quickly check if the cached page is still the rightmost leaf +page and also the correct place to hold the current value. We can avoid +the cost of walking down the tree in such common cases. + +The optimization works on the assumption that there can only be one +non-ignorable leaf rightmost page, and so not even a visible-to-everyone +style interlock is required. We cannot fail to detect that our hint was +invalidated, because there can only be one such page in the B-Tree at +any time. It's possible that the page will be deleted and recycled +without a backend's cached page also being detected as invalidated, but +only when we happen to recycle a block that once again gets recycled as the +rightmost leaf page. + +Simple deletion +--------------- + +If a process visits a heap tuple and finds that it's dead and removable +(ie, dead to all open transactions, not only that process), then we can +return to the index and mark the corresponding index entry "known dead", +allowing subsequent index scans to skip visiting the heap tuple. The +"known dead" marking works by setting the index item's lp_flags state +to LP_DEAD. This is currently only done in plain indexscans, not bitmap +scans, because only plain scans visit the heap and index "in sync" and so +there's not a convenient way to do it for bitmap scans. Note also that +LP_DEAD bits are often set when checking a unique index for conflicts on +insert (this is simpler because it takes place when we hold an exclusive +lock on the leaf page). + +Once an index tuple has been marked LP_DEAD it can actually be deleted +from the index immediately; since index scans only stop "between" pages, +no scan can lose its place from such a deletion. 
We separate the steps +because we allow LP_DEAD to be set with only a share lock (it's exactly +like a hint bit for a heap tuple), but physically removing tuples requires +exclusive lock. Also, delaying the deletion often allows us to pick up +extra index tuples that weren't initially safe for index scans to mark +LP_DEAD. We do this with index tuples whose TIDs point to the same table +blocks as an LP_DEAD-marked tuple. They're practically free to check in +passing, and have a pretty good chance of being safe to delete due to +various locality effects. + +We only try to delete LP_DEAD tuples (and nearby tuples) when we are +otherwise faced with having to split a page to do an insertion (and hence +have exclusive lock on it already). Deduplication and bottom-up index +deletion can also prevent a page split, but simple deletion is always our +preferred approach. (Note that posting list tuples can only have their +LP_DEAD bit set when every table TID within the posting list is known +dead. This isn't much of a problem in practice because LP_DEAD bits are +just a starting point for simple deletion -- we still manage to perform +granular deletes of posting list TIDs quite often.) + +It's sufficient to have an exclusive lock on the index page, not a +super-exclusive lock, to do deletion of LP_DEAD items. It might seem +that this breaks the interlock between VACUUM and indexscans, but that is +not so: as long as an indexscanning process has a pin on the page where +the index item used to be, VACUUM cannot complete its btbulkdelete scan +and so cannot remove the heap tuple. This is another reason why +btbulkdelete has to get a super-exclusive lock on every leaf page, not only +the ones where it actually sees items to delete. + +LP_DEAD setting by index scans cannot be sure that a TID whose index tuple +it had planned on LP_DEAD-setting has not been recycled by VACUUM if it +drops its pin in the meantime. It must conservatively also remember the +LSN of the page, and only act to set LP_DEAD bits when the LSN has not +changed at all. (Avoiding dropping the pin entirely also makes it safe, of +course.) + +Bottom-Up deletion +------------------ + +We attempt to delete whatever duplicates happen to be present on the page +when the duplicates are suspected to be caused by version churn from +successive UPDATEs. This only happens when we receive an executor hint +indicating that optimizations like heapam's HOT have not worked out for +the index -- the incoming tuple must be a logically unchanged duplicate +which is needed for MVCC purposes, suggesting that that might well be the +dominant source of new index tuples on the leaf page in question. (Also, +bottom-up deletion is triggered within unique indexes in cases with +continual INSERT and DELETE related churn, since that is easy to detect +without any external hint.) + +Simple deletion will already have failed to prevent a page split when a +bottom-up deletion pass takes place (often because no LP_DEAD bits were +ever set on the page). The two mechanisms have closely related +implementations. The same WAL records are used for each operation, and +the same tableam infrastructure is used to determine what TIDs/tuples are +actually safe to delete. The implementations only differ in how they pick +TIDs to consider for deletion, and whether or not the tableam will give up +before accessing all table blocks (bottom-up deletion lives with the +uncertainty of its success by keeping the cost of failure low). 
Even +still, the two mechanisms are clearly distinct at the conceptual level. + +Bottom-up index deletion is driven entirely by heuristics (whereas simple +deletion is guaranteed to delete at least those index tuples that are +already LP_DEAD marked -- there must be at least one). We have no +certainty that we'll find even one index tuple to delete. That's why we +closely cooperate with the tableam to keep the costs it pays in balance +with the benefits we receive. The interface that we use for this is +described in detail in access/tableam.h. + +Bottom-up index deletion can be thought of as a backstop mechanism against +unnecessary version-driven page splits. It is based in part on an idea +from generational garbage collection: the "generational hypothesis". This +is the empirical observation that "most objects die young". Within +nbtree, new index tuples often quickly appear in the same place, and then +quickly become garbage. There can be intense concentrations of garbage in +relatively few leaf pages with certain workloads (or there could be in +earlier versions of PostgreSQL without bottom-up index deletion, at +least). See doc/src/sgml/btree.sgml for a high-level description of the +design principles behind bottom-up index deletion in nbtree, including +details of how it complements VACUUM. + +We expect to find a reasonably large number of tuples that are safe to +delete within each bottom-up pass. If we don't then we won't need to +consider the question of bottom-up deletion for the same leaf page for +quite a while (usually because the page splits, which resolves the +situation for the time being). We expect to perform regular bottom-up +deletion operations against pages that are at constant risk of unnecessary +page splits caused only by version churn. When the mechanism works well +we'll constantly be "on the verge" of having version-churn-driven page +splits, but never actually have even one. + +Our duplicate heuristics work well despite being fairly simple. +Unnecessary page splits only occur when there are truly pathological +levels of version churn (in theory a small amount of version churn could +make a page split occur earlier than strictly necessary, but that's pretty +harmless). We don't have to understand the underlying workload; we only +have to understand the general nature of the pathology that we target. +Version churn is easy to spot when it is truly pathological. Affected +leaf pages are fairly homogeneous. + +WAL Considerations +------------------ + +The insertion and deletion algorithms in themselves don't guarantee btree +consistency after a crash. To provide robustness, we depend on WAL +replay. A single WAL entry is effectively an atomic action --- we can +redo it from the log if it fails to complete. + +Ordinary item insertions (that don't force a page split) are of course +single WAL entries, since they only affect one page. The same for +leaf-item deletions (if the deletion brings the leaf page to zero items, +it is now a candidate to be deleted, but that is a separate action). + +An insertion that causes a page split is logged as a single WAL entry for +the changes occurring on the insertion's level --- including update of the +right sibling's left-link --- followed by a second WAL entry for the +insertion on the parent level (which might itself be a page split, requiring +an additional insertion above that, etc). + +For a root split, the follow-on WAL entry is a "new root" entry rather than +an "insertion" entry, but details are otherwise much the same. 
+ +Because splitting involves multiple atomic actions, it's possible that the +system crashes between splitting a page and inserting the downlink for the +new half to the parent. After recovery, the downlink for the new page will +be missing. The search algorithm works correctly, as the page will be found +by following the right-link from its left sibling, although if a lot of +downlinks in the tree are missing, performance will suffer. A more serious +consequence is that if the page without a downlink gets split again, the +insertion algorithm will fail to find the location in the parent level to +insert the downlink. + +Our approach is to create any missing downlinks on-the-fly, when searching +the tree for a new insertion. It could be done during searches, too, but +it seems best not to put any extra updates in what would otherwise be a +read-only operation (updating is not possible in hot standby mode anyway). +It would seem natural to add the missing downlinks in VACUUM, but since +inserting a downlink might require splitting a page, it might fail if you +run out of disk space. That would be bad during VACUUM - the reason for +running VACUUM in the first place might be that you run out of disk space, +and now VACUUM won't finish because you're out of disk space. In contrast, +an insertion can require enlarging the physical file anyway. There is one +minor exception: VACUUM finishes interrupted splits of internal pages when +deleting their children. This allows the code for re-finding parent items +to be used by both page splits and page deletion. + +To identify missing downlinks, when a page is split, the left page is +flagged to indicate that the split is not yet complete (INCOMPLETE_SPLIT). +When the downlink is inserted to the parent, the flag is cleared atomically +with the insertion. The child page is kept locked until the insertion in +the parent is finished and the flag in the child cleared, but can be +released immediately after that, before recursing up the tree if the parent +also needs to be split. This ensures that incompletely split pages should +not be seen under normal circumstances; only if insertion to the parent +has failed for some reason. (It's also possible for a reader to observe +a page with the incomplete split flag set during recovery; see later +section on "Scans during Recovery" for details.) + +We flag the left page, even though it's the right page that's missing the +downlink, because it's more convenient to know already when following the +right-link from the left page to the right page that it will need to have +its downlink inserted to the parent. + +When splitting a non-root page that is alone on its level, the required +metapage update (of the "fast root" link) is performed and logged as part +of the insertion into the parent level. When splitting the root page, the +metapage update is handled as part of the "new root" action. + +Each step in page deletion is logged as a separate WAL entry: marking the +leaf as half-dead and removing the downlink is one record, and unlinking a +page is a second record. If vacuum is interrupted for some reason, or the +system crashes, the tree is consistent for searches and insertions. The +next VACUUM will find the half-dead leaf page and continue the deletion. + +Before 9.4, we used to keep track of incomplete splits and page deletions +during recovery and finish them immediately at end of recovery, instead of +doing it lazily at the next insertion or vacuum. 
However, that made the +recovery much more complicated, and only fixed the problem when crash +recovery was performed. An incomplete split can also occur if an otherwise +recoverable error, like out-of-memory or out-of-disk-space, happens while +inserting the downlink to the parent. + +Scans during Recovery +--------------------- + +nbtree indexes support read queries in Hot Standby mode. Every atomic +action/WAL record makes isolated changes that leave the tree in a +consistent state for readers. Readers lock pages according to the same +rules that readers follow on the primary. (Readers may have to move +right to recover from a "concurrent" page split or page deletion, just +like on the primary.) + +However, there are a couple of differences in how pages are locked by +replay/the startup process as compared to the original write operation +on the primary. The exceptions involve page splits and page deletions. +The first phase and second phase of a page split are processed +independently during replay, since they are independent atomic actions. +We do not attempt to recreate the coupling of parent and child page +write locks that took place on the primary. This is safe because readers +never care about the incomplete split flag anyway. Holding on to an +extra write lock on the primary is only necessary so that a second +writer cannot observe the incomplete split flag before the first writer +finishes the split. If we let concurrent writers on the primary observe +an incomplete split flag on the same page, each writer would attempt to +complete the unfinished split, corrupting the parent page. (Similarly, +replay of page deletion records does not hold a write lock on the target +leaf page throughout; only the primary needs to block out concurrent +writers that insert on to the page being deleted.) + +WAL replay holds same-level locks in a way that matches the approach +taken during original execution, though. This prevents readers from +observing same-level inconsistencies. It's probably possible to be more +lax about how same-level locks are acquired during recovery (most kinds +of readers could still move right to recover if we didn't couple +same-level locks), but we prefer to be conservative here. + +During recovery all index scans start with ignore_killed_tuples = false +and we never set kill_prior_tuple. We do this because the oldest xmin +on the standby server can be older than the oldest xmin on the primary +server, which means tuples can be marked LP_DEAD even when they are +still visible on the standby. We don't WAL log tuple LP_DEAD bits, but +they can still appear in the standby because of full page writes. So +we must always ignore them in standby, and that means it's not worth +setting them either. (When LP_DEAD-marked tuples are eventually deleted +on the primary, the deletion is WAL-logged. Queries that run on a +standby therefore get much of the benefit of any LP_DEAD setting that +takes place on the primary.) + +Note that we talk about scans that are started during recovery. We go to +a little trouble to allow a scan to start during recovery and end during +normal running after recovery has completed. This is a key capability +because it allows running applications to continue while the standby +changes state into a normally running server. + +The interlocking required to avoid returning incorrect results from +non-MVCC scans is not required on standby nodes. 
We still get a +super-exclusive lock ("cleanup lock") when replaying VACUUM records +during recovery, but recovery does not need to lock every leaf page +(only those leaf pages that have items to delete). That is safe because +HeapTupleSatisfiesUpdate(), HeapTupleSatisfiesSelf(), +HeapTupleSatisfiesDirty() and HeapTupleSatisfiesVacuum() are only ever +used during write transactions, which cannot exist on the standby. MVCC +scans are already protected by definition, so HeapTupleSatisfiesMVCC() +is not a problem. The optimizer looks at the boundaries of value ranges +using HeapTupleSatisfiesNonVacuumable() with an index-only scan, which +is also safe. That leaves concern only for HeapTupleSatisfiesToast(). + +HeapTupleSatisfiesToast() doesn't use MVCC semantics, though that's +because it doesn't need to - if the main heap row is visible then the +toast rows will also be visible. So as long as we follow a toast +pointer from a visible (live) tuple the corresponding toast rows +will also be visible, so we do not need to recheck MVCC on them. + +Other Things That Are Handy to Know +----------------------------------- + +Page zero of every btree is a meta-data page. This page stores the +location of the root page --- both the true root and the current effective +root ("fast" root). To avoid fetching the metapage for every single index +search, we cache a copy of the meta-data information in the index's +relcache entry (rd_amcache). This is a bit ticklish since using the cache +implies following a root page pointer that could be stale. However, a +backend following a cached pointer can sufficiently verify whether it +reached the intended page; either by checking the is-root flag when it +is going to the true root, or by checking that the page has no siblings +when going to the fast root. At worst, this could result in descending +some extra tree levels if we have a cached pointer to a fast root that is +now above the real fast root. Such cases shouldn't arise often enough to +be worth optimizing; and in any case we can expect a relcache flush will +discard the cached metapage before long, since a VACUUM that's moved the +fast root pointer can be expected to issue a statistics update for the +index. + +The algorithm assumes we can fit at least three items per page +(a "high key" and two real data items). Therefore it's unsafe +to accept items larger than 1/3rd page size. Larger items would +work sometimes, but could cause failures later on depending on +what else gets put on their page. + +"ScanKey" data structures are used in two fundamentally different ways +in this code, which we describe as "search" scankeys and "insertion" +scankeys. A search scankey is the kind passed to btbeginscan() or +btrescan() from outside the btree code. The sk_func pointers in a search +scankey point to comparison functions that return boolean, such as int4lt. +There might be more than one scankey entry for a given index column, or +none at all. (We require the keys to appear in index column order, but +the order of multiple keys for a given column is unspecified.) An +insertion scankey ("BTScanInsert" data structure) uses a similar +array-of-ScanKey data structure, but the sk_func pointers point to btree +comparison support functions (ie, 3-way comparators that return int4 values +interpreted as <0, =0, >0). In an insertion scankey there is at most one +entry per index column. There is also other data about the rules used to +locate where to begin the scan, such as whether or not the scan is a +"nextkey" scan. 
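For illustration, a search scankey for a hypothetical "column 1 < 42" qualifier on an int4 index column might be initialized roughly like this (a sketch only; the attribute number and constant are invented):

    ScanKeyData skey;

    ScanKeyInit(&skey,
                1,                      /* index attribute number */
                BTLessStrategyNumber,   /* "<" strategy */
                F_INT4LT,               /* boolean comparison proc int4lt */
                Int32GetDatum(42));     /* comparison argument */

Here sk_func points at the boolean routine int4lt, in contrast to the 3-way support function that an insertion scankey carries.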
Insertion scankeys are built within the btree code (eg, by +_bt_mkscankey()) and are used to locate the starting point of a scan, as +well as for locating the place to insert a new index tuple. (Note: in the +case of an insertion scankey built from a search scankey or built from a +truncated pivot tuple, there might be fewer keys than index columns, +indicating that we have no constraints for the remaining index columns.) +After we have located the starting point of a scan, the original search +scankey is consulted as each index entry is sequentially scanned to decide +whether to return the entry and whether the scan can stop (see +_bt_checkkeys()). + +Notes about suffix truncation +----------------------------- + +We truncate away suffix key attributes that are not needed for a page high +key during a leaf page split. The remaining attributes must distinguish +the last index tuple on the post-split left page as belonging on the left +page, and the first index tuple on the post-split right page as belonging +on the right page. Tuples logically retain truncated key attributes, +though they implicitly have "negative infinity" as their value, and have no +storage overhead. Since the high key is subsequently reused as the +downlink in the parent page for the new right page, suffix truncation makes +pivot tuples short. INCLUDE indexes are guaranteed to have non-key +attributes truncated at the time of a leaf page split, but may also have +some key attributes truncated away, based on the usual criteria for key +attributes. They are not a special case, since non-key attributes are +merely payload to B-Tree searches. + +The goal of suffix truncation of key attributes is to improve index +fan-out. The technique was first described by Bayer and Unterauer (R.Bayer +and K.Unterauer, Prefix B-Trees, ACM Transactions on Database Systems, Vol +2, No. 1, March 1977, pp 11-26). The Postgres implementation is loosely +based on their paper. Note that Postgres only implements what the paper +refers to as simple prefix B-Trees. Note also that the paper assumes that +the tree has keys that consist of single strings that maintain the "prefix +property", much like strings that are stored in a suffix tree (comparisons +of earlier bytes must always be more significant than comparisons of later +bytes, and, in general, the strings must compare in a way that doesn't +break transitive consistency as they're split into pieces). Suffix +truncation in Postgres currently only works at the whole-attribute +granularity, but it would be straightforward to invent opclass +infrastructure that manufactures a smaller attribute value in the case of +variable-length types, such as text. An opclass support function could +manufacture the shortest possible key value that still correctly separates +each half of a leaf page split. + +There is sophisticated criteria for choosing a leaf page split point. The +general idea is to make suffix truncation effective without unduly +influencing the balance of space for each half of the page split. The +choice of leaf split point can be thought of as a choice among points +*between* items on the page to be split, at least if you pretend that the +incoming tuple was placed on the page already (you have to pretend because +there won't actually be enough space for it on the page). Choosing the +split point between two index tuples where the first non-equal attribute +appears as early as possible results in truncating away as many suffix +attributes as possible. 
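As a toy model of that whole-attribute granularity (ignoring the opclass comparators, NULLs and heap TID tiebreaking that the real _bt_keep_natts() must handle): on an index over (a, b, c), if the last tuple on the post-split left page is (5, 3, 9) and the first tuple on the post-split right page is (5, 4, 1), the new high key only needs to keep (5, 4), with c truncated to negative infinity. The number of attributes kept is one more than the length of the common prefix:

    /* Toy model only -- plain integer keys, no NULLs, no opclass machinery */
    static int
    toy_keep_natts(const int *lastleft, const int *firstright, int nkeyatts)
    {
        int         keep = 1;

        for (int i = 0; i < nkeyatts; i++)
        {
            if (lastleft[i] != firstright[i])
                break;          /* first distinguishing attribute found */
            keep++;
        }

        return keep;            /* attributes beyond this are truncated */
    }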
Evenly balancing space among each half of the +split is usually the first concern, but even small adjustments in the +precise split point can allow truncation to be far more effective. + +Suffix truncation is primarily valuable because it makes pivot tuples +smaller, which delays splits of internal pages, but that isn't the only +reason why it's effective. Even truncation that doesn't make pivot tuples +smaller due to alignment still prevents pivot tuples from being more +restrictive than truly necessary in how they describe which values belong +on which pages. + +While it's not possible to correctly perform suffix truncation during +internal page splits, it's still useful to be discriminating when splitting +an internal page. The split point that implies a downlink be inserted in +the parent that's the smallest one available within an acceptable range of +the fillfactor-wise optimal split point is chosen. This idea also comes +from the Prefix B-Tree paper. This process has much in common with what +happens at the leaf level to make suffix truncation effective. The overall +effect is that suffix truncation tends to produce smaller, more +discriminating pivot tuples, especially early in the lifetime of the index, +while biasing internal page splits makes the earlier, smaller pivot tuples +end up in the root page, delaying root page splits. + +Logical duplicates are given special consideration. The logic for +selecting a split point goes to great lengths to avoid having duplicates +span more than one page, and almost always manages to pick a split point +between two user-key-distinct tuples, accepting a completely lopsided split +if it must. When a page that's already full of duplicates must be split, +the fallback strategy assumes that duplicates are mostly inserted in +ascending heap TID order. The page is split in a way that leaves the left +half of the page mostly full, and the right half of the page mostly empty. +The overall effect is that leaf page splits gracefully adapt to inserts of +large groups of duplicates, maximizing space utilization. Note also that +"trapping" large groups of duplicates on the same leaf page like this makes +deduplication more efficient. Deduplication can be performed infrequently, +without merging together existing posting list tuples too often. + +Notes about deduplication +------------------------- + +We deduplicate non-pivot tuples in non-unique indexes to reduce storage +overhead, and to avoid (or at least delay) page splits. Note that the +goals for deduplication in unique indexes are rather different; see later +section for details. Deduplication alters the physical representation of +tuples without changing the logical contents of the index, and without +adding overhead to read queries. Non-pivot tuples are merged together +into a single physical tuple with a posting list (a simple array of heap +TIDs with the standard item pointer format). Deduplication is always +applied lazily, at the point where it would otherwise be necessary to +perform a page split. It occurs only when LP_DEAD items have been +removed, as our last line of defense against splitting a leaf page +(bottom-up index deletion may be attempted first, as our second last line +of defense). We can set the LP_DEAD bit with posting list tuples, though +only when all TIDs are known dead. + +Our lazy approach to deduplication allows the page space accounting used +during page splits to have absolutely minimal special case logic for +posting lists. 
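As a rough illustration of the savings (figures are illustrative only, assuming 8-byte MAXALIGN, a 16-byte key portion, the 6-byte ItemPointerData TID format, and 4-byte line pointers):

    before:  100 duplicates * (16-byte tuple + 4-byte line pointer) = 2000 bytes
    after:   MAXALIGN(16 + 100 * 6) + 4 = 616 + 4                   =  620 bytes

The same arithmetic appears in _bt_dedup_save_htid()'s mergedtupsz calculation later in this patch.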
Posting lists can be thought of as extra payload that +suffix truncation will reliably truncate away as needed during page +splits, just like non-key columns from an INCLUDE index tuple. +Incoming/new tuples can generally be treated as non-overlapping plain +items (though see section on posting list splits for information about how +overlapping new/incoming items are really handled). + +The representation of posting lists is almost identical to the posting +lists used by GIN, so it would be straightforward to apply GIN's varbyte +encoding compression scheme to individual posting lists. Posting list +compression would break the assumptions made by posting list splits about +page space accounting (see later section), so it's not clear how +compression could be integrated with nbtree. Besides, posting list +compression does not offer a compelling trade-off for nbtree, since in +general nbtree is optimized for consistent performance with many +concurrent readers and writers. Compression would also make the deletion +of a subset of TIDs from a posting list slow and complicated, which would +be a big problem for workloads that depend heavily on bottom-up index +deletion. + +A major goal of our lazy approach to deduplication is to limit the +performance impact of deduplication with random updates. Even concurrent +append-only inserts of the same key value will tend to have inserts of +individual index tuples in an order that doesn't quite match heap TID +order. Delaying deduplication minimizes page level fragmentation. + +Deduplication in unique indexes +------------------------------- + +Very often, the number of distinct values that can ever be placed on +almost any given leaf page in a unique index is fixed and permanent. For +example, a primary key on an identity column will usually only have leaf +page splits caused by the insertion of new logical rows within the +rightmost leaf page. If there is a split of a non-rightmost leaf page, +then the split must have been triggered by inserts associated with UPDATEs +of existing logical rows. Splitting a leaf page purely to store multiple +versions is a false economy. In effect, we're permanently degrading the +index structure just to absorb a temporary burst of duplicates. + +Deduplication in unique indexes helps to prevent these pathological page +splits. Storing duplicates in a space efficient manner is not the goal, +since in the long run there won't be any duplicates anyway. Rather, we're +buying time for standard garbage collection mechanisms to run before a +page split is needed. + +Unique index leaf pages only get a deduplication pass when an insertion +(that might have to split the page) observed an existing duplicate on the +page in passing. This is based on the assumption that deduplication will +only work out when _all_ new insertions are duplicates from UPDATEs. This +may mean that we miss an opportunity to delay a page split, but that's +okay because our ultimate goal is to delay leaf page splits _indefinitely_ +(i.e. to prevent them altogether). There is little point in trying to +delay a split that is probably inevitable anyway. This allows us to avoid +the overhead of attempting to deduplicate with unique indexes that always +have few or no duplicates. + +Note: Avoiding "unnecessary" page splits driven by version churn is also +the goal of bottom-up index deletion, which was added to PostgreSQL 14. 
+Bottom-up index deletion is now the preferred way to deal with this +problem (with all kinds of indexes, though especially with unique +indexes). Still, deduplication can sometimes augment bottom-up index +deletion. When deletion cannot free tuples (due to an old snapshot +holding up cleanup), falling back on deduplication provides additional +capacity. Delaying the page split by deduplicating can allow a future +bottom-up deletion pass of the same page to succeed. + +Posting list splits +------------------- + +When the incoming tuple happens to overlap with an existing posting list, +a posting list split is performed. Like a page split, a posting list +split resolves a situation where a new/incoming item "won't fit", while +inserting the incoming item in passing (i.e. as part of the same atomic +action). It's possible (though not particularly likely) that an insert of +a new item on to an almost-full page will overlap with a posting list, +resulting in both a posting list split and a page split. Even then, the +atomic action that splits the posting list also inserts the new item +(since page splits always insert the new item in passing). Including the +posting list split in the same atomic action as the insert avoids problems +caused by concurrent inserts into the same posting list -- the exact +details of how we change the posting list depend upon the new item, and +vice-versa. A single atomic action also minimizes the volume of extra +WAL required for a posting list split, since we don't have to explicitly +WAL-log the original posting list tuple. + +Despite piggy-backing on the same atomic action that inserts a new tuple, +posting list splits can be thought of as a separate, extra action to the +insert itself (or to the page split itself). Posting list splits +conceptually "rewrite" an insert that overlaps with an existing posting +list into an insert that adds its final new item just to the right of the +posting list instead. The size of the posting list won't change, and so +page space accounting code does not need to care about posting list splits +at all. This is an important upside of our design; the page split point +choice logic is very subtle even without it needing to deal with posting +list splits. + +Only a few isolated extra steps are required to preserve the illusion that +the new item never overlapped with an existing posting list in the first +place: the heap TID of the incoming tuple has its TID replaced with the +rightmost/max heap TID from the existing/originally overlapping posting +list. Similarly, the original incoming item's TID is relocated to the +appropriate offset in the posting list (we usually shift TIDs out of the +way to make a hole for it). Finally, the posting-split-with-page-split +case must generate a new high key based on an imaginary version of the +original page that has both the final new item and the after-list-split +posting tuple (page splits usually just operate against an imaginary +version that contains the new item/item that won't fit). + +This approach avoids inventing an "eager" atomic posting split operation +that splits the posting list without simultaneously finishing the insert +of the incoming item. This alternative design might seem cleaner, but it +creates subtle problems for page space accounting. 
In general, there +might not be enough free space on the page to split a posting list such +that the incoming/new item no longer overlaps with either posting list +half --- the operation could fail before the actual retail insert of the +new item even begins. We'd end up having to handle posting list splits +that need a page split anyway. Besides, supporting variable "split points" +while splitting posting lists won't actually improve overall space +utilization. + +Notes About Data Representation +------------------------------- + +The right-sibling link required by L&Y is kept in the page "opaque +data" area, as is the left-sibling link, the page level, and some flags. +The page level counts upwards from zero at the leaf level, to the tree +depth minus 1 at the root. (Counting up from the leaves ensures that we +don't need to renumber any existing pages when splitting the root.) + +The Postgres disk block data format (an array of items) doesn't fit +Lehman and Yao's alternating-keys-and-pointers notion of a disk page, +so we have to play some games. (The alternating-keys-and-pointers +notion is important for internal page splits, which conceptually split +at the middle of an existing pivot tuple -- the tuple's "separator" key +goes on the left side of the split as the left side's new high key, +while the tuple's pointer/downlink goes on the right side as the +first/minus infinity downlink.) + +On a page that is not rightmost in its tree level, the "high key" is +kept in the page's first item, and real data items start at item 2. +The link portion of the "high key" item goes unused. A page that is +rightmost has no "high key" (it's implicitly positive infinity), so +data items start with the first item. Putting the high key at the +left, rather than the right, may seem odd, but it avoids moving the +high key as we add data items. + +On a leaf page, the data items are simply links to (TIDs of) tuples +in the relation being indexed, with the associated key values. + +On a non-leaf page, the data items are down-links to child pages with +bounding keys. The key in each data item is a strict lower bound for +keys on that child page, so logically the key is to the left of that +downlink. The high key (if present) is the upper bound for the last +downlink. The first data item on each such page has no lower bound +--- or lower bound of minus infinity, if you prefer. The comparison +routines must treat it accordingly. The actual key stored in the +item is irrelevant, and need not be stored at all. This arrangement +corresponds to the fact that an L&Y non-leaf page has one more pointer +than key. Suffix truncation's negative infinity attributes behave in +the same way. diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c new file mode 100644 index 0000000..7ac73cb --- /dev/null +++ b/src/backend/access/nbtree/nbtcompare.c @@ -0,0 +1,335 @@ +/*------------------------------------------------------------------------- + * + * nbtcompare.c + * Comparison functions for btree access method. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtcompare.c + * + * NOTES + * + * These functions are stored in pg_amproc. For each operator class + * defined on btrees, they compute + * + * compare(a, b): + * < 0 if a < b, + * = 0 if a == b, + * > 0 if a > b. 
+ * + * The result is always an int32 regardless of the input datatype. + * + * Although any negative int32 is acceptable for reporting "<", + * and any positive int32 is acceptable for reporting ">", routines + * that work on 32-bit or wider datatypes can't just return "a - b". + * That could overflow and give the wrong answer. + * + * NOTE: it is critical that the comparison function impose a total order + * on all non-NULL values of the data type, and that the datatype's + * boolean comparison operators (= < >= etc) yield results consistent + * with the comparison routine. Otherwise bad behavior may ensue. + * (For example, the comparison operators must NOT punt when faced with + * NAN or other funny values; you must devise some collation sequence for + * all such values.) If the datatype is not trivial, this is most + * reliably done by having the boolean operators invoke the same + * three-way comparison code that the btree function does. Therefore, + * this file contains only btree support for "trivial" datatypes --- + * all others are in the /utils/adt/ files that implement their datatypes. + * + * NOTE: these routines must not leak memory, since memory allocated + * during an index access won't be recovered till end of query. This + * primarily affects comparison routines for toastable datatypes; + * they have to be careful to free any detoasted copy of an input datum. + * + * NOTE: we used to forbid comparison functions from returning INT_MIN, + * but that proves to be too error-prone because some platforms' versions + * of memcmp() etc can return INT_MIN. As a means of stress-testing + * callers, this file can be compiled with STRESS_SORT_INT_MIN defined + * to cause many of these functions to return INT_MIN or INT_MAX instead of + * their customary -1/+1. For production, though, that's not a good idea + * since users or third-party code might expect the traditional results. 
+ *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <limits.h> + +#include "utils/builtins.h" +#include "utils/sortsupport.h" + +#ifdef STRESS_SORT_INT_MIN +#define A_LESS_THAN_B INT_MIN +#define A_GREATER_THAN_B INT_MAX +#else +#define A_LESS_THAN_B (-1) +#define A_GREATER_THAN_B 1 +#endif + + +Datum +btboolcmp(PG_FUNCTION_ARGS) +{ + bool a = PG_GETARG_BOOL(0); + bool b = PG_GETARG_BOOL(1); + + PG_RETURN_INT32((int32) a - (int32) b); +} + +Datum +btint2cmp(PG_FUNCTION_ARGS) +{ + int16 a = PG_GETARG_INT16(0); + int16 b = PG_GETARG_INT16(1); + + PG_RETURN_INT32((int32) a - (int32) b); +} + +static int +btint2fastcmp(Datum x, Datum y, SortSupport ssup) +{ + int16 a = DatumGetInt16(x); + int16 b = DatumGetInt16(y); + + return (int) a - (int) b; +} + +Datum +btint2sortsupport(PG_FUNCTION_ARGS) +{ + SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); + + ssup->comparator = btint2fastcmp; + PG_RETURN_VOID(); +} + +Datum +btint4cmp(PG_FUNCTION_ARGS) +{ + int32 a = PG_GETARG_INT32(0); + int32 b = PG_GETARG_INT32(1); + + if (a > b) + PG_RETURN_INT32(A_GREATER_THAN_B); + else if (a == b) + PG_RETURN_INT32(0); + else + PG_RETURN_INT32(A_LESS_THAN_B); +} + +static int +btint4fastcmp(Datum x, Datum y, SortSupport ssup) +{ + int32 a = DatumGetInt32(x); + int32 b = DatumGetInt32(y); + + if (a > b) + return A_GREATER_THAN_B; + else if (a == b) + return 0; + else + return A_LESS_THAN_B; +} + +Datum +btint4sortsupport(PG_FUNCTION_ARGS) +{ + SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); + + ssup->comparator = btint4fastcmp; + PG_RETURN_VOID(); +} + +Datum +btint8cmp(PG_FUNCTION_ARGS) +{ + int64 a = PG_GETARG_INT64(0); + int64 b = PG_GETARG_INT64(1); + + if (a > b) + PG_RETURN_INT32(A_GREATER_THAN_B); + else if (a == b) + PG_RETURN_INT32(0); + else + PG_RETURN_INT32(A_LESS_THAN_B); +} + +static int +btint8fastcmp(Datum x, Datum y, SortSupport ssup) +{ + int64 a = DatumGetInt64(x); + int64 b = DatumGetInt64(y); + + if (a > b) + return A_GREATER_THAN_B; + else if (a == b) + return 0; + else + return A_LESS_THAN_B; +} + +Datum +btint8sortsupport(PG_FUNCTION_ARGS) +{ + SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); + + ssup->comparator = btint8fastcmp; + PG_RETURN_VOID(); +} + +Datum +btint48cmp(PG_FUNCTION_ARGS) +{ + int32 a = PG_GETARG_INT32(0); + int64 b = PG_GETARG_INT64(1); + + if (a > b) + PG_RETURN_INT32(A_GREATER_THAN_B); + else if (a == b) + PG_RETURN_INT32(0); + else + PG_RETURN_INT32(A_LESS_THAN_B); +} + +Datum +btint84cmp(PG_FUNCTION_ARGS) +{ + int64 a = PG_GETARG_INT64(0); + int32 b = PG_GETARG_INT32(1); + + if (a > b) + PG_RETURN_INT32(A_GREATER_THAN_B); + else if (a == b) + PG_RETURN_INT32(0); + else + PG_RETURN_INT32(A_LESS_THAN_B); +} + +Datum +btint24cmp(PG_FUNCTION_ARGS) +{ + int16 a = PG_GETARG_INT16(0); + int32 b = PG_GETARG_INT32(1); + + if (a > b) + PG_RETURN_INT32(A_GREATER_THAN_B); + else if (a == b) + PG_RETURN_INT32(0); + else + PG_RETURN_INT32(A_LESS_THAN_B); +} + +Datum +btint42cmp(PG_FUNCTION_ARGS) +{ + int32 a = PG_GETARG_INT32(0); + int16 b = PG_GETARG_INT16(1); + + if (a > b) + PG_RETURN_INT32(A_GREATER_THAN_B); + else if (a == b) + PG_RETURN_INT32(0); + else + PG_RETURN_INT32(A_LESS_THAN_B); +} + +Datum +btint28cmp(PG_FUNCTION_ARGS) +{ + int16 a = PG_GETARG_INT16(0); + int64 b = PG_GETARG_INT64(1); + + if (a > b) + PG_RETURN_INT32(A_GREATER_THAN_B); + else if (a == b) + PG_RETURN_INT32(0); + else + PG_RETURN_INT32(A_LESS_THAN_B); +} + +Datum +btint82cmp(PG_FUNCTION_ARGS) +{ + int64 a = PG_GETARG_INT64(0); +
int16 b = PG_GETARG_INT16(1); + + if (a > b) + PG_RETURN_INT32(A_GREATER_THAN_B); + else if (a == b) + PG_RETURN_INT32(0); + else + PG_RETURN_INT32(A_LESS_THAN_B); +} + +Datum +btoidcmp(PG_FUNCTION_ARGS) +{ + Oid a = PG_GETARG_OID(0); + Oid b = PG_GETARG_OID(1); + + if (a > b) + PG_RETURN_INT32(A_GREATER_THAN_B); + else if (a == b) + PG_RETURN_INT32(0); + else + PG_RETURN_INT32(A_LESS_THAN_B); +} + +static int +btoidfastcmp(Datum x, Datum y, SortSupport ssup) +{ + Oid a = DatumGetObjectId(x); + Oid b = DatumGetObjectId(y); + + if (a > b) + return A_GREATER_THAN_B; + else if (a == b) + return 0; + else + return A_LESS_THAN_B; +} + +Datum +btoidsortsupport(PG_FUNCTION_ARGS) +{ + SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); + + ssup->comparator = btoidfastcmp; + PG_RETURN_VOID(); +} + +Datum +btoidvectorcmp(PG_FUNCTION_ARGS) +{ + oidvector *a = (oidvector *) PG_GETARG_POINTER(0); + oidvector *b = (oidvector *) PG_GETARG_POINTER(1); + int i; + + /* We arbitrarily choose to sort first by vector length */ + if (a->dim1 != b->dim1) + PG_RETURN_INT32(a->dim1 - b->dim1); + + for (i = 0; i < a->dim1; i++) + { + if (a->values[i] != b->values[i]) + { + if (a->values[i] > b->values[i]) + PG_RETURN_INT32(A_GREATER_THAN_B); + else + PG_RETURN_INT32(A_LESS_THAN_B); + } + } + PG_RETURN_INT32(0); +} + +Datum +btcharcmp(PG_FUNCTION_ARGS) +{ + char a = PG_GETARG_CHAR(0); + char b = PG_GETARG_CHAR(1); + + /* Be careful to compare chars as unsigned */ + PG_RETURN_INT32((int32) ((uint8) a) - (int32) ((uint8) b)); +} diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c new file mode 100644 index 0000000..1cd1b59 --- /dev/null +++ b/src/backend/access/nbtree/nbtdedup.c @@ -0,0 +1,1098 @@ +/*------------------------------------------------------------------------- + * + * nbtdedup.c + * Deduplicate or bottom-up delete items in Postgres btrees. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtdedup.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/nbtxlog.h" +#include "miscadmin.h" +#include "utils/rel.h" + +static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state, + TM_IndexDeleteOp *delstate); +static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state, + OffsetNumber minoff, IndexTuple newitem); +static void _bt_singleval_fillfactor(Page page, BTDedupState state, + Size newitemsz); +#ifdef USE_ASSERT_CHECKING +static bool _bt_posting_valid(IndexTuple posting); +#endif + +/* + * Perform a deduplication pass. + * + * The general approach taken here is to perform as much deduplication as + * possible to free as much space as possible. Note, however, that "single + * value" strategy is used for !bottomupdedup callers when the page is full of + * tuples of a single value. Deduplication passes that apply the strategy + * will leave behind a few untouched tuples at the end of the page, preparing + * the page for an anticipated page split that uses nbtsplitloc.c's own single + * value strategy. Our high level goal is to delay merging the untouched + * tuples until after the page splits. + * + * When a call to _bt_bottomupdel_pass() just took place (and failed), our + * high level goal is to prevent a page split entirely by buying more time. 
+ * We still hope that a page split can be avoided altogether. That's why + * single value strategy is not even considered for bottomupdedup callers. + * + * The page will have to be split if we cannot successfully free at least + * newitemsz (we also need space for newitem's line pointer, which isn't + * included in caller's newitemsz). + * + * Note: Caller should have already deleted all existing items with their + * LP_DEAD bits set. + */ +void +_bt_dedup_pass(Relation rel, Buffer buf, Relation heapRel, IndexTuple newitem, + Size newitemsz, bool bottomupdedup) +{ + OffsetNumber offnum, + minoff, + maxoff; + Page page = BufferGetPage(buf); + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + Page newpage; + BTDedupState state; + Size pagesaving PG_USED_FOR_ASSERTS_ONLY = 0; + bool singlevalstrat = false; + int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + + /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ + newitemsz += sizeof(ItemIdData); + + /* + * Initialize deduplication state. + * + * It would be possible for maxpostingsize (limit on posting list tuple + * size) to be set to one third of the page. However, it seems like a + * good idea to limit the size of posting lists to one sixth of a page. + * That ought to leave us with a good split point when pages full of + * duplicates can be split several times. + */ + state = (BTDedupState) palloc(sizeof(BTDedupStateData)); + state->deduplicate = true; + state->nmaxitems = 0; + state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK); + /* Metadata about base tuple of current pending posting list */ + state->base = NULL; + state->baseoff = InvalidOffsetNumber; + state->basetupsize = 0; + /* Metadata about current pending posting list TIDs */ + state->htids = palloc(state->maxpostingsize); + state->nhtids = 0; + state->nitems = 0; + /* Size of all physical tuples to be replaced by pending posting list */ + state->phystupsize = 0; + /* nintervals should be initialized to zero */ + state->nintervals = 0; + + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + + /* + * Consider applying "single value" strategy, though only if the page + * seems likely to be split in the near future + */ + if (!bottomupdedup) + singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem); + + /* + * Deduplicate items from page, and write them to newpage. + * + * Copy the original page's LSN into newpage copy. This will become the + * updated version of the page. We need this because XLogInsert will + * examine the LSN and possibly dump it in a page image. 
+ */ + newpage = PageGetTempPageCopySpecial(page); + PageSetLSN(newpage, PageGetLSN(page)); + + /* Copy high key, if any */ + if (!P_RIGHTMOST(opaque)) + { + ItemId hitemid = PageGetItemId(page, P_HIKEY); + Size hitemsz = ItemIdGetLength(hitemid); + IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid); + + if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY, + false, false) == InvalidOffsetNumber) + elog(ERROR, "deduplication failed to add highkey"); + } + + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + + Assert(!ItemIdIsDead(itemid)); + + if (offnum == minoff) + { + /* + * No previous/base tuple for the data item -- use the data item + * as base tuple of pending posting list + */ + _bt_dedup_start_pending(state, itup, offnum); + } + else if (state->deduplicate && + _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && + _bt_dedup_save_htid(state, itup)) + { + /* + * Tuple is equal to base tuple of pending posting list. Heap + * TID(s) for itup have been saved in state. + */ + } + else + { + /* + * Tuple is not equal to pending posting list tuple, or + * _bt_dedup_save_htid() opted to not merge current item into + * pending posting list for some other reason (e.g., adding more + * TIDs would have caused posting list to exceed current + * maxpostingsize). + * + * If state contains pending posting list with more than one item, + * form new posting tuple, and actually update the page. Else + * reset the state and move on without modifying the page. + */ + pagesaving += _bt_dedup_finish_pending(newpage, state); + + if (singlevalstrat) + { + /* + * Single value strategy's extra steps. + * + * Lower maxpostingsize for sixth and final large posting list + * tuple at the point where 5 maxpostingsize-capped tuples + * have either been formed or observed. + * + * When a sixth maxpostingsize-capped item is formed/observed, + * stop merging together tuples altogether. The few tuples + * that remain at the end of the page won't be merged together + * at all (at least not until after a future page split takes + * place). + */ + if (state->nmaxitems == 5) + _bt_singleval_fillfactor(page, state, newitemsz); + else if (state->nmaxitems == 6) + { + state->deduplicate = false; + singlevalstrat = false; /* won't be back here */ + } + } + + /* itup starts new pending posting list */ + _bt_dedup_start_pending(state, itup, offnum); + } + } + + /* Handle the last item */ + pagesaving += _bt_dedup_finish_pending(newpage, state); + + /* + * If no items suitable for deduplication were found, newpage must be + * exactly the same as the original page, so just return from function. + * + * We could determine whether or not to proceed on the basis the space + * savings being sufficient to avoid an immediate page split instead. We + * don't do that because there is some small value in nbtsplitloc.c always + * operating against a page that is fully deduplicated (apart from + * newitem). Besides, most of the cost has already been paid. + */ + if (state->nintervals == 0) + { + /* cannot leak memory here */ + pfree(newpage); + pfree(state->htids); + pfree(state); + return; + } + + /* + * By here, it's clear that deduplication will definitely go ahead. + * + * Clear the BTP_HAS_GARBAGE page flag. The index must be a heapkeyspace + * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway. + * But keep things tidy. 
+ */ + if (P_HAS_GARBAGE(opaque)) + { + BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(newpage); + + nopaque->btpo_flags &= ~BTP_HAS_GARBAGE; + } + + START_CRIT_SECTION(); + + PageRestoreTempPage(newpage, page); + MarkBufferDirty(buf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + xl_btree_dedup xlrec_dedup; + + xlrec_dedup.nintervals = state->nintervals; + + XLogBeginInsert(); + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup); + + /* + * The intervals array is not in the buffer, but pretend that it is. + * When XLogInsert stores the whole buffer, the array need not be + * stored too. + */ + XLogRegisterBufData(0, (char *) state->intervals, + state->nintervals * sizeof(BTDedupInterval)); + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + /* Local space accounting should agree with page accounting */ + Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz); + + /* cannot leak memory here */ + pfree(state->htids); + pfree(state); +} + +/* + * Perform bottom-up index deletion pass. + * + * See if duplicate index tuples (plus certain nearby tuples) are eligible to + * be deleted via bottom-up index deletion. The high level goal here is to + * entirely prevent "unnecessary" page splits caused by MVCC version churn + * from UPDATEs (when the UPDATEs don't logically modify any of the columns + * covered by the 'rel' index). This is qualitative, not quantitative -- we + * do not particularly care about once-off opportunities to delete many index + * tuples together. + * + * See nbtree/README for details on the design of nbtree bottom-up deletion. + * See access/tableam.h for a description of how we're expected to cooperate + * with the tableam. + * + * Returns true on success, in which case caller can assume page split will be + * avoided for a reasonable amount of time. Returns false when caller should + * deduplicate the page (if possible at all). + * + * Note: Occasionally we return true despite failing to delete enough items to + * avoid a split. This makes caller skip deduplication and go split the page + * right away. Our return value is always just advisory information. + * + * Note: Caller should have already deleted all existing items with their + * LP_DEAD bits set. + */ +bool +_bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, + Size newitemsz) +{ + OffsetNumber offnum, + minoff, + maxoff; + Page page = BufferGetPage(buf); + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + BTDedupState state; + TM_IndexDeleteOp delstate; + bool neverdedup; + int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + + /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ + newitemsz += sizeof(ItemIdData); + + /* Initialize deduplication state */ + state = (BTDedupState) palloc(sizeof(BTDedupStateData)); + state->deduplicate = true; + state->nmaxitems = 0; + state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */ + state->base = NULL; + state->baseoff = InvalidOffsetNumber; + state->basetupsize = 0; + state->htids = palloc(state->maxpostingsize); + state->nhtids = 0; + state->nitems = 0; + state->phystupsize = 0; + state->nintervals = 0; + + /* + * Initialize tableam state that describes bottom-up index deletion + * operation. + * + * We'll go on to ask the tableam to search for TIDs whose index tuples we + * can safely delete. 
The tableam will search until our leaf page space + * target is satisfied, or until the cost of continuing with the tableam + * operation seems too high. It focuses its efforts on TIDs associated + * with duplicate index tuples that we mark "promising". + * + * This space target is a little arbitrary. The tableam must be able to + * keep the costs and benefits in balance. We provide the tableam with + * exhaustive information about what might work, without directly + * concerning ourselves with avoiding work during the tableam call. Our + * role in costing the bottom-up deletion process is strictly advisory. + */ + delstate.bottomup = true; + delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz); + delstate.ndeltids = 0; + delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete)); + delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus)); + + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + + Assert(!ItemIdIsDead(itemid)); + + if (offnum == minoff) + { + /* itup starts first pending interval */ + _bt_dedup_start_pending(state, itup, offnum); + } + else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && + _bt_dedup_save_htid(state, itup)) + { + /* Tuple is equal; just added its TIDs to pending interval */ + } + else + { + /* Finalize interval -- move its TIDs to delete state */ + _bt_bottomupdel_finish_pending(page, state, &delstate); + + /* itup starts new pending interval */ + _bt_dedup_start_pending(state, itup, offnum); + } + } + /* Finalize final interval -- move its TIDs to delete state */ + _bt_bottomupdel_finish_pending(page, state, &delstate); + + /* + * We don't give up now in the event of having few (or even zero) + * promising tuples for the tableam because it's not up to us as the index + * AM to manage costs (note that the tableam might have heuristics of its + * own that work out what to do). We should at least avoid having our + * caller do a useless deduplication pass after we return in the event of + * zero promising tuples, though. + */ + neverdedup = false; + if (state->nintervals == 0) + neverdedup = true; + + pfree(state->htids); + pfree(state); + + /* Ask tableam which TIDs are deletable, then physically delete them */ + _bt_delitems_delete_check(rel, buf, heapRel, &delstate); + + pfree(delstate.deltids); + pfree(delstate.status); + + /* Report "success" to caller unconditionally to avoid deduplication */ + if (neverdedup) + return true; + + /* Don't dedup when we won't end up back here any time soon anyway */ + return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz); +} + +/* + * Create a new pending posting list tuple based on caller's base tuple. + * + * Every tuple processed by deduplication either becomes the base tuple for a + * posting list, or gets its heap TID(s) accepted into a pending posting list. + * A tuple that starts out as the base tuple for a posting list will only + * actually be rewritten within _bt_dedup_finish_pending() when it turns out + * that there are duplicates that can be merged into the base tuple. 
+ */ +void +_bt_dedup_start_pending(BTDedupState state, IndexTuple base, + OffsetNumber baseoff) +{ + Assert(state->nhtids == 0); + Assert(state->nitems == 0); + Assert(!BTreeTupleIsPivot(base)); + + /* + * Copy heap TID(s) from new base tuple for new candidate posting list + * into working state's array + */ + if (!BTreeTupleIsPosting(base)) + { + memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData)); + state->nhtids = 1; + state->basetupsize = IndexTupleSize(base); + } + else + { + int nposting; + + nposting = BTreeTupleGetNPosting(base); + memcpy(state->htids, BTreeTupleGetPosting(base), + sizeof(ItemPointerData) * nposting); + state->nhtids = nposting; + /* basetupsize should not include existing posting list */ + state->basetupsize = BTreeTupleGetPostingOffset(base); + } + + /* + * Save new base tuple itself -- it'll be needed if we actually create a + * new posting list from new pending posting list. + * + * Must maintain physical size of all existing tuples (including line + * pointer overhead) so that we can calculate space savings on page. + */ + state->nitems = 1; + state->base = base; + state->baseoff = baseoff; + state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData); + /* Also save baseoff in pending state for interval */ + state->intervals[state->nintervals].baseoff = state->baseoff; +} + +/* + * Save itup heap TID(s) into pending posting list where possible. + * + * Returns bool indicating if the pending posting list managed by state now + * includes itup's heap TID(s). + */ +bool +_bt_dedup_save_htid(BTDedupState state, IndexTuple itup) +{ + int nhtids; + ItemPointer htids; + Size mergedtupsz; + + Assert(!BTreeTupleIsPivot(itup)); + + if (!BTreeTupleIsPosting(itup)) + { + nhtids = 1; + htids = &itup->t_tid; + } + else + { + nhtids = BTreeTupleGetNPosting(itup); + htids = BTreeTupleGetPosting(itup); + } + + /* + * Don't append (have caller finish pending posting list as-is) if + * appending heap TID(s) from itup would put us over maxpostingsize limit. + * + * This calculation needs to match the code used within _bt_form_posting() + * for new posting list tuples. + */ + mergedtupsz = MAXALIGN(state->basetupsize + + (state->nhtids + nhtids) * sizeof(ItemPointerData)); + + if (mergedtupsz > state->maxpostingsize) + { + /* + * Count this as an oversized item for single value strategy, though + * only when there are 50 TIDs in the final posting list tuple. This + * limit (which is fairly arbitrary) avoids confusion about how many + * 1/6 of a page tuples have been encountered/created by the current + * deduplication pass. + * + * Note: We deliberately don't consider which deduplication pass + * merged together tuples to create this item (could be a previous + * deduplication pass, or current pass). See _bt_do_singleval() + * comments. + */ + if (state->nhtids > 50) + state->nmaxitems++; + + return false; + } + + /* + * Save heap TIDs to pending posting list tuple -- itup can be merged into + * pending posting list + */ + state->nitems++; + memcpy(state->htids + state->nhtids, htids, + sizeof(ItemPointerData) * nhtids); + state->nhtids += nhtids; + state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData); + + return true; +} + +/* + * Finalize pending posting list tuple, and add it to the page. Final tuple + * is based on saved base tuple, and saved list of heap TIDs. + * + * Returns space saving from deduplicating to make a new posting list tuple. + * Note that this includes line pointer overhead. 
This is zero in the case + * where no deduplication was possible. + */ +Size +_bt_dedup_finish_pending(Page newpage, BTDedupState state) +{ + OffsetNumber tupoff; + Size tuplesz; + Size spacesaving; + + Assert(state->nitems > 0); + Assert(state->nitems <= state->nhtids); + Assert(state->intervals[state->nintervals].baseoff == state->baseoff); + + tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage)); + if (state->nitems == 1) + { + /* Use original, unchanged base tuple */ + tuplesz = IndexTupleSize(state->base); + if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff, + false, false) == InvalidOffsetNumber) + elog(ERROR, "deduplication failed to add tuple to page"); + + spacesaving = 0; + } + else + { + IndexTuple final; + + /* Form a tuple with a posting list */ + final = _bt_form_posting(state->base, state->htids, state->nhtids); + tuplesz = IndexTupleSize(final); + Assert(tuplesz <= state->maxpostingsize); + + /* Save final number of items for posting list */ + state->intervals[state->nintervals].nitems = state->nitems; + + Assert(tuplesz == MAXALIGN(IndexTupleSize(final))); + if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false, + false) == InvalidOffsetNumber) + elog(ERROR, "deduplication failed to add tuple to page"); + + pfree(final); + spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData)); + /* Increment nintervals, since we wrote a new posting list tuple */ + state->nintervals++; + Assert(spacesaving > 0 && spacesaving < BLCKSZ); + } + + /* Reset state for next pending posting list */ + state->nhtids = 0; + state->nitems = 0; + state->phystupsize = 0; + + return spacesaving; +} + +/* + * Finalize interval during bottom-up index deletion. + * + * During a bottom-up pass we expect that TIDs will be recorded in dedup state + * first, and then get moved over to delstate (in variable-sized batches) by + * calling here. Call here happens when the number of TIDs in a dedup + * interval is known, and interval gets finalized (i.e. when caller sees next + * tuple on the page is not a duplicate, or when caller runs out of tuples to + * process from leaf page). + * + * This is where bottom-up deletion determines and remembers which entries are + * duplicates. This will be important information to the tableam delete + * infrastructure later on. Plain index tuple duplicates are marked + * "promising" here, per tableam contract. + * + * Our approach to marking entries whose TIDs come from posting lists is more + * complicated. Posting lists can only be formed by a deduplication pass (or + * during an index build), so recent version churn affecting the pointed-to + * logical rows is not particularly likely. We may still give a weak signal + * about posting list tuples' entries (by marking just one of its TIDs/entries + * promising), though this is only a possibility in the event of further + * duplicate index tuples in final interval that covers posting list tuple (as + * in the plain tuple case). A weak signal/hint will be useful to the tableam + * when it has no stronger signal to go with for the deletion operation as a + * whole. + * + * The heuristics we use work well in practice because we only need to give + * the tableam the right _general_ idea about where to look. Garbage tends to + * naturally get concentrated in relatively few table blocks with workloads + * that bottom-up deletion targets. The tableam cannot possibly rank all + * available table blocks sensibly based on the hints we provide, but that's + * okay -- only the extremes matter. 
The tableam just needs to be able to + * predict which few table blocks will have the most tuples that are safe to + * delete for each deletion operation, with low variance across related + * deletion operations. + */ +static void +_bt_bottomupdel_finish_pending(Page page, BTDedupState state, + TM_IndexDeleteOp *delstate) +{ + bool dupinterval = (state->nitems > 1); + + Assert(state->nitems > 0); + Assert(state->nitems <= state->nhtids); + Assert(state->intervals[state->nintervals].baseoff == state->baseoff); + + for (int i = 0; i < state->nitems; i++) + { + OffsetNumber offnum = state->baseoff + i; + ItemId itemid = PageGetItemId(page, offnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + TM_IndexDelete *ideltid = &delstate->deltids[delstate->ndeltids]; + TM_IndexStatus *istatus = &delstate->status[delstate->ndeltids]; + + if (!BTreeTupleIsPosting(itup)) + { + /* Simple case: A plain non-pivot tuple */ + ideltid->tid = itup->t_tid; + ideltid->id = delstate->ndeltids; + istatus->idxoffnum = offnum; + istatus->knowndeletable = false; /* for now */ + istatus->promising = dupinterval; /* simple rule */ + istatus->freespace = ItemIdGetLength(itemid) + sizeof(ItemIdData); + + delstate->ndeltids++; + } + else + { + /* + * Complicated case: A posting list tuple. + * + * We make the conservative assumption that there can only be at + * most one affected logical row per posting list tuple. There + * will be at most one promising entry in deltids to represent + * this presumed lone logical row. Note that this isn't even + * considered unless the posting list tuple is also in an interval + * of duplicates -- this complicated rule is just a variant of the + * simple rule used to decide if plain index tuples are promising. + */ + int nitem = BTreeTupleGetNPosting(itup); + bool firstpromising = false; + bool lastpromising = false; + + Assert(_bt_posting_valid(itup)); + + if (dupinterval) + { + /* + * Complicated rule: either the first or last TID in the + * posting list gets marked promising (if any at all) + */ + BlockNumber minblocklist, + midblocklist, + maxblocklist; + ItemPointer mintid, + midtid, + maxtid; + + mintid = BTreeTupleGetHeapTID(itup); + midtid = BTreeTupleGetPostingN(itup, nitem / 2); + maxtid = BTreeTupleGetMaxHeapTID(itup); + minblocklist = ItemPointerGetBlockNumber(mintid); + midblocklist = ItemPointerGetBlockNumber(midtid); + maxblocklist = ItemPointerGetBlockNumber(maxtid); + + /* Only entry with predominant table block can be promising */ + firstpromising = (minblocklist == midblocklist); + lastpromising = (!firstpromising && + midblocklist == maxblocklist); + } + + for (int p = 0; p < nitem; p++) + { + ItemPointer htid = BTreeTupleGetPostingN(itup, p); + + ideltid->tid = *htid; + ideltid->id = delstate->ndeltids; + istatus->idxoffnum = offnum; + istatus->knowndeletable = false; /* for now */ + istatus->promising = false; + if ((firstpromising && p == 0) || + (lastpromising && p == nitem - 1)) + istatus->promising = true; + istatus->freespace = sizeof(ItemPointerData); /* at worst */ + + ideltid++; + istatus++; + delstate->ndeltids++; + } + } + } + + if (dupinterval) + { + state->intervals[state->nintervals].nitems = state->nitems; + state->nintervals++; + } + + /* Reset state for next interval */ + state->nhtids = 0; + state->nitems = 0; + state->phystupsize = 0; +} + +/* + * Determine if page non-pivot tuples (data items) are all duplicates of the + * same value -- if they are, deduplication's "single value" strategy should + * be applied. 
The general goal of this strategy is to ensure that + * nbtsplitloc.c (which uses its own single value strategy) will find a useful + * split point as further duplicates are inserted, and successive rightmost + * page splits occur among pages that store the same duplicate value. When + * the page finally splits, it should end up BTREE_SINGLEVAL_FILLFACTOR% full, + * just like it would if deduplication were disabled. + * + * We expect that affected workloads will require _several_ single value + * strategy deduplication passes (over a page that only stores duplicates) + * before the page is finally split. The first deduplication pass should only + * find regular non-pivot tuples. Later deduplication passes will find + * existing maxpostingsize-capped posting list tuples, which must be skipped + * over. The penultimate pass is generally the first pass that actually + * reaches _bt_singleval_fillfactor(), and so will deliberately leave behind a + * few untouched non-pivot tuples. The final deduplication pass won't free + * any space -- it will skip over everything without merging anything (it + * retraces the steps of the penultimate pass). + * + * Fortunately, having several passes isn't too expensive. Each pass (after + * the first pass) won't spend many cycles on the large posting list tuples + * left by previous passes. Each pass will find a large contiguous group of + * smaller duplicate tuples to merge together at the end of the page. + */ +static bool +_bt_do_singleval(Relation rel, Page page, BTDedupState state, + OffsetNumber minoff, IndexTuple newitem) +{ + int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + ItemId itemid; + IndexTuple itup; + + itemid = PageGetItemId(page, minoff); + itup = (IndexTuple) PageGetItem(page, itemid); + + if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) + { + itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page)); + itup = (IndexTuple) PageGetItem(page, itemid); + + if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) + return true; + } + + return false; +} + +/* + * Lower maxpostingsize when using "single value" strategy, to avoid a sixth + * and final maxpostingsize-capped tuple. The sixth and final posting list + * tuple will end up somewhat smaller than the first five. (Note: The first + * five tuples could actually just be very large duplicate tuples that + * couldn't be merged together at all. Deduplication will simply not modify + * the page when that happens.) + * + * When there are six posting lists on the page (after current deduplication + * pass goes on to create/observe a sixth very large tuple), caller should end + * its deduplication pass. It isn't useful to try to deduplicate items that + * are supposed to end up on the new right sibling page following the + * anticipated page split. A future deduplication pass of future right + * sibling page might take care of it. (This is why the first single value + * strategy deduplication pass for a given leaf page will generally find only + * plain non-pivot tuples -- see _bt_do_singleval() comments.) 
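+ * As a rough illustration (not exact figures): with the default 8KB block
+ * size, the reduction applied below works out to about
+ * (100 - BTREE_SINGLEVAL_FILLFACTOR)% of the usable page space -- a few
+ * hundred bytes shaved off maxpostingsize.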
+ */ +static void +_bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz) +{ + Size leftfree; + int reduction; + + /* This calculation needs to match nbtsplitloc.c */ + leftfree = PageGetPageSize(page) - SizeOfPageHeaderData - + MAXALIGN(sizeof(BTPageOpaqueData)); + /* Subtract size of new high key (includes pivot heap TID space) */ + leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData)); + + /* + * Reduce maxpostingsize by an amount equal to target free space on left + * half of page + */ + reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0); + if (state->maxpostingsize > reduction) + state->maxpostingsize -= reduction; + else + state->maxpostingsize = 0; +} + +/* + * Build a posting list tuple based on caller's "base" index tuple and list of + * heap TIDs. When nhtids == 1, builds a standard non-pivot tuple without a + * posting list. (Posting list tuples can never have a single heap TID, partly + * because that ensures that deduplication always reduces final MAXALIGN()'d + * size of entire tuple.) + * + * Convention is that posting list starts at a MAXALIGN()'d offset (rather + * than a SHORTALIGN()'d offset), in line with the approach taken when + * appending a heap TID to new pivot tuple/high key during suffix truncation. + * This sometimes wastes a little space that was only needed as alignment + * padding in the original tuple. Following this convention simplifies the + * space accounting used when deduplicating a page (the same convention + * simplifies the accounting for choosing a point to split a page at). + * + * Note: Caller's "htids" array must be unique and already in ascending TID + * order. Any existing heap TIDs from "base" won't automatically appear in + * returned posting list tuple (they must be included in htids array.) + */ +IndexTuple +_bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids) +{ + uint32 keysize, + newsize; + IndexTuple itup; + + if (BTreeTupleIsPosting(base)) + keysize = BTreeTupleGetPostingOffset(base); + else + keysize = IndexTupleSize(base); + + Assert(!BTreeTupleIsPivot(base)); + Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX); + Assert(keysize == MAXALIGN(keysize)); + + /* Determine final size of new tuple */ + if (nhtids > 1) + newsize = MAXALIGN(keysize + + nhtids * sizeof(ItemPointerData)); + else + newsize = keysize; + + Assert(newsize <= INDEX_SIZE_MASK); + Assert(newsize == MAXALIGN(newsize)); + + /* Allocate memory using palloc0() (matches index_form_tuple()) */ + itup = palloc0(newsize); + memcpy(itup, base, keysize); + itup->t_info &= ~INDEX_SIZE_MASK; + itup->t_info |= newsize; + if (nhtids > 1) + { + /* Form posting list tuple */ + BTreeTupleSetPosting(itup, nhtids, keysize); + memcpy(BTreeTupleGetPosting(itup), htids, + sizeof(ItemPointerData) * nhtids); + Assert(_bt_posting_valid(itup)); + } + else + { + /* Form standard non-pivot tuple */ + itup->t_info &= ~INDEX_ALT_TID_MASK; + ItemPointerCopy(htids, &itup->t_tid); + Assert(ItemPointerIsValid(&itup->t_tid)); + } + + return itup; +} + +/* + * Generate a replacement tuple by "updating" a posting list tuple so that it + * no longer has TIDs that need to be deleted. + * + * Used by both VACUUM and index deletion. Caller's vacposting argument + * points to the existing posting list tuple to be updated. + * + * On return, caller's vacposting argument will point to final "updated" + * tuple, which will be palloc()'d in caller's memory context. 
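+ * For example, if the original posting list holds heap TIDs at positions
+ * 0..3 and vacposting->deletetids[] contains {1, 3}, the replacement tuple
+ * keeps only the TIDs from positions 0 and 2 (its nhtids becomes 2, so it
+ * is still a posting list tuple).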
+ */ +void +_bt_update_posting(BTVacuumPosting vacposting) +{ + IndexTuple origtuple = vacposting->itup; + uint32 keysize, + newsize; + IndexTuple itup; + int nhtids; + int ui, + d; + ItemPointer htids; + + nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids; + + Assert(_bt_posting_valid(origtuple)); + Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple)); + + /* + * Determine final size of new tuple. + * + * This calculation needs to match the code used within _bt_form_posting() + * for new posting list tuples. We avoid calling _bt_form_posting() here + * to save ourselves a second memory allocation for a htids workspace. + */ + keysize = BTreeTupleGetPostingOffset(origtuple); + if (nhtids > 1) + newsize = MAXALIGN(keysize + + nhtids * sizeof(ItemPointerData)); + else + newsize = keysize; + + Assert(newsize <= INDEX_SIZE_MASK); + Assert(newsize == MAXALIGN(newsize)); + + /* Allocate memory using palloc0() (matches index_form_tuple()) */ + itup = palloc0(newsize); + memcpy(itup, origtuple, keysize); + itup->t_info &= ~INDEX_SIZE_MASK; + itup->t_info |= newsize; + + if (nhtids > 1) + { + /* Form posting list tuple */ + BTreeTupleSetPosting(itup, nhtids, keysize); + htids = BTreeTupleGetPosting(itup); + } + else + { + /* Form standard non-pivot tuple */ + itup->t_info &= ~INDEX_ALT_TID_MASK; + htids = &itup->t_tid; + } + + ui = 0; + d = 0; + for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++) + { + if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i) + { + d++; + continue; + } + htids[ui++] = *BTreeTupleGetPostingN(origtuple, i); + } + Assert(ui == nhtids); + Assert(d == vacposting->ndeletedtids); + Assert(nhtids == 1 || _bt_posting_valid(itup)); + Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid)); + + /* vacposting arg's itup will now point to updated version */ + vacposting->itup = itup; +} + +/* + * Prepare for a posting list split by swapping heap TID in newitem with heap + * TID from original posting list (the 'oposting' heap TID located at offset + * 'postingoff'). Modifies newitem, so caller should pass their own private + * copy that can safely be modified. + * + * Returns new posting list tuple, which is palloc()'d in caller's context. + * This is guaranteed to be the same size as 'oposting'. Modified newitem is + * what caller actually inserts. (This happens inside the same critical + * section that performs an in-place update of old posting list using new + * posting list returned here.) + * + * While the keys from newitem and oposting must be opclass equal, and must + * generate identical output when run through the underlying type's output + * function, it doesn't follow that their representations match exactly. + * Caller must avoid assuming that there can't be representational differences + * that make datums from oposting bigger or smaller than the corresponding + * datums from newitem. For example, differences in TOAST input state might + * break a faulty assumption about tuple size (the executor is entitled to + * apply TOAST compression based on its own criteria). It also seems possible + * that further representational variation will be introduced in the future, + * in order to support nbtree features like page-level prefix compression. + * + * See nbtree/README for details on the design of posting list splits. 
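+ * For example, if oposting holds heap TIDs (10,1) (10,3) (10,5) and newitem
+ * arrives with heap TID (10,4), postingoff will be 2.  The returned posting
+ * list holds (10,1) (10,3) (10,4), while the modified newitem leaves here
+ * carrying (10,5), the old rightmost TID.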
+ */ +IndexTuple +_bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff) +{ + int nhtids; + char *replacepos; + char *replaceposright; + Size nmovebytes; + IndexTuple nposting; + + nhtids = BTreeTupleGetNPosting(oposting); + Assert(_bt_posting_valid(oposting)); + + /* + * The postingoff argument originated as a _bt_binsrch_posting() return + * value. It will be 0 in the event of corruption that makes a leaf page + * contain a non-pivot tuple that's somehow identical to newitem (no two + * non-pivot tuples should ever have the same TID). This has been known + * to happen in the field from time to time. + * + * Perform a basic sanity check to catch this case now. + */ + if (!(postingoff > 0 && postingoff < nhtids)) + elog(ERROR, "posting list tuple with %d items cannot be split at offset %d", + nhtids, postingoff); + + /* + * Move item pointers in posting list to make a gap for the new item's + * heap TID. We shift TIDs one place to the right, losing original + * rightmost TID. (nmovebytes must not include TIDs to the left of + * postingoff, nor the existing rightmost/max TID that gets overwritten.) + */ + nposting = CopyIndexTuple(oposting); + replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff); + replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1); + nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData); + memmove(replaceposright, replacepos, nmovebytes); + + /* Fill the gap at postingoff with TID of new item (original new TID) */ + Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem)); + ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos); + + /* Now copy oposting's rightmost/max TID into new item (final new TID) */ + ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid); + + Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting), + BTreeTupleGetHeapTID(newitem)) < 0); + Assert(_bt_posting_valid(nposting)); + + return nposting; +} + +/* + * Verify posting list invariants for "posting", which must be a posting list + * tuple. Used within assertions. + */ +#ifdef USE_ASSERT_CHECKING +static bool +_bt_posting_valid(IndexTuple posting) +{ + ItemPointerData last; + ItemPointer htid; + + if (!BTreeTupleIsPosting(posting) || BTreeTupleGetNPosting(posting) < 2) + return false; + + /* Remember first heap TID for loop */ + ItemPointerCopy(BTreeTupleGetHeapTID(posting), &last); + if (!ItemPointerIsValid(&last)) + return false; + + /* Iterate, starting from second TID */ + for (int i = 1; i < BTreeTupleGetNPosting(posting); i++) + { + htid = BTreeTupleGetPostingN(posting, i); + + if (!ItemPointerIsValid(htid)) + return false; + if (ItemPointerCompare(htid, &last) <= 0) + return false; + ItemPointerCopy(htid, &last); + } + + return true; +} +#endif diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c new file mode 100644 index 0000000..1241c56 --- /dev/null +++ b/src/backend/access/nbtree/nbtinsert.c @@ -0,0 +1,3009 @@ +/*------------------------------------------------------------------------- + * + * nbtinsert.c + * Item insertion in Lehman and Yao btrees for Postgres. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtinsert.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/nbtxlog.h" +#include "access/transam.h" +#include "access/xloginsert.h" +#include "lib/qunique.h" +#include "miscadmin.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "storage/smgr.h" + +/* Minimum tree height for application of fastpath optimization */ +#define BTREE_FASTPATH_MIN_LEVEL 2 + + +static BTStack _bt_search_insert(Relation rel, BTInsertState insertstate); +static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, + Relation heapRel, + IndexUniqueCheck checkUnique, bool *is_unique, + uint32 *speculativeToken); +static OffsetNumber _bt_findinsertloc(Relation rel, + BTInsertState insertstate, + bool checkingunique, + bool indexUnchanged, + BTStack stack, + Relation heapRel); +static void _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack); +static void _bt_insertonpg(Relation rel, BTScanInsert itup_key, + Buffer buf, + Buffer cbuf, + BTStack stack, + IndexTuple itup, + Size itemsz, + OffsetNumber newitemoff, + int postingoff, + bool split_only_page); +static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, + Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, + IndexTuple newitem, IndexTuple orignewitem, + IndexTuple nposting, uint16 postingoff); +static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, + BTStack stack, bool isroot, bool isonly); +static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); +static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, + OffsetNumber itup_off, bool newfirstdataitem); +static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel, + BTInsertState insertstate, + bool simpleonly, bool checkingunique, + bool uniquedup, bool indexUnchanged); +static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel, + OffsetNumber *deletable, int ndeletable, + IndexTuple newitem, OffsetNumber minoff, + OffsetNumber maxoff); +static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable, + int ndeletable, IndexTuple newitem, + int *nblocks); +static inline int _bt_blk_cmp(const void *arg1, const void *arg2); + +/* + * _bt_doinsert() -- Handle insertion of a single index tuple in the tree. + * + * This routine is called by the public interface routine, btinsert. + * By here, itup is filled in, including the TID. + * + * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this + * will allow duplicates. Otherwise (UNIQUE_CHECK_YES or + * UNIQUE_CHECK_EXISTING) it will throw error for a duplicate. + * For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and + * don't actually insert. + * + * indexUnchanged executor hint indicates if itup is from an + * UPDATE that didn't logically change the indexed value, but + * must nevertheless have a new entry to point to a successor + * version. + * + * The result value is only significant for UNIQUE_CHECK_PARTIAL: + * it must be true if the entry is known unique, else false. + * (In the current implementation we'll also return true after a + * successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but + * that's just a coding artifact.) 
+ */ +bool +_bt_doinsert(Relation rel, IndexTuple itup, + IndexUniqueCheck checkUnique, bool indexUnchanged, + Relation heapRel) +{ + bool is_unique = false; + BTInsertStateData insertstate; + BTScanInsert itup_key; + BTStack stack; + bool checkingunique = (checkUnique != UNIQUE_CHECK_NO); + + /* we need an insertion scan key to do our search, so build one */ + itup_key = _bt_mkscankey(rel, itup); + + if (checkingunique) + { + if (!itup_key->anynullkeys) + { + /* No (heapkeyspace) scantid until uniqueness established */ + itup_key->scantid = NULL; + } + else + { + /* + * Scan key for new tuple contains NULL key values. Bypass + * checkingunique steps. They are unnecessary because core code + * considers NULL unequal to every value, including NULL. + * + * This optimization avoids O(N^2) behavior within the + * _bt_findinsertloc() heapkeyspace path when a unique index has a + * large number of "duplicates" with NULL key values. + */ + checkingunique = false; + /* Tuple is unique in the sense that core code cares about */ + Assert(checkUnique != UNIQUE_CHECK_EXISTING); + is_unique = true; + } + } + + /* + * Fill in the BTInsertState working area, to track the current page and + * position within the page to insert on. + * + * Note that itemsz is passed down to lower level code that deals with + * inserting the item. It must be MAXALIGN()'d. This ensures that space + * accounting code consistently considers the alignment overhead that we + * expect PageAddItem() will add later. (Actually, index_form_tuple() is + * already conservative about alignment, but we don't rely on that from + * this distance. Besides, preserving the "true" tuple size in index + * tuple headers for the benefit of nbtsplitloc.c might happen someday. + * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.) + */ + insertstate.itup = itup; + insertstate.itemsz = MAXALIGN(IndexTupleSize(itup)); + insertstate.itup_key = itup_key; + insertstate.bounds_valid = false; + insertstate.buf = InvalidBuffer; + insertstate.postingoff = 0; + +search: + + /* + * Find and lock the leaf page that the tuple should be added to by + * searching from the root page. insertstate.buf will hold a buffer that + * is locked in exclusive mode afterwards. + */ + stack = _bt_search_insert(rel, &insertstate); + + /* + * checkingunique inserts are not allowed to go ahead when two tuples with + * equal key attribute values would be visible to new MVCC snapshots once + * the xact commits. Check for conflicts in the locked page/buffer (if + * needed) here. + * + * It might be necessary to check a page to the right in _bt_check_unique, + * though that should be very rare. In practice the first page the value + * could be on (with scantid omitted) is almost always also the only page + * that a matching tuple might be found on. This is due to the behavior + * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can + * only be allowed to cross a page boundary when there is no candidate + * leaf page split point that avoids it. Also, _bt_check_unique can use + * the leaf page high key to determine that there will be no duplicates on + * the right sibling without actually visiting it (it uses the high key in + * cases where the new item happens to belong at the far right of the leaf + * page). + * + * NOTE: obviously, _bt_check_unique can only detect keys that are already + * in the index; so it cannot defend against concurrent insertions of the + * same key. 
We protect against that by means of holding a write lock on + * the first page the value could be on, with omitted/-inf value for the + * implicit heap TID tiebreaker attribute. Any other would-be inserter of + * the same key must acquire a write lock on the same page, so only one + * would-be inserter can be making the check at one time. Furthermore, + * once we are past the check we hold write locks continuously until we + * have performed our insertion, so no later inserter can fail to see our + * insertion. (This requires some care in _bt_findinsertloc.) + * + * If we must wait for another xact, we release the lock while waiting, + * and then must perform a new search. + * + * For a partial uniqueness check, we don't wait for the other xact. Just + * let the tuple in and return false for possibly non-unique, or true for + * definitely unique. + */ + if (checkingunique) + { + TransactionId xwait; + uint32 speculativeToken; + + xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique, + &is_unique, &speculativeToken); + + if (unlikely(TransactionIdIsValid(xwait))) + { + /* Have to wait for the other guy ... */ + _bt_relbuf(rel, insertstate.buf); + insertstate.buf = InvalidBuffer; + + /* + * If it's a speculative insertion, wait for it to finish (ie. to + * go ahead with the insertion, or kill the tuple). Otherwise + * wait for the transaction to finish as usual. + */ + if (speculativeToken) + SpeculativeInsertionWait(xwait, speculativeToken); + else + XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex); + + /* start over... */ + if (stack) + _bt_freestack(stack); + goto search; + } + + /* Uniqueness is established -- restore heap tid as scantid */ + if (itup_key->heapkeyspace) + itup_key->scantid = &itup->t_tid; + } + + if (checkUnique != UNIQUE_CHECK_EXISTING) + { + OffsetNumber newitemoff; + + /* + * The only conflict predicate locking cares about for indexes is when + * an index tuple insert conflicts with an existing lock. We don't + * know the actual page we're going to insert on for sure just yet in + * checkingunique and !heapkeyspace cases, but it's okay to use the + * first page the value could be on (with scantid omitted) instead. + */ + CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf)); + + /* + * Do the insertion. Note that insertstate contains cached binary + * search bounds established within _bt_check_unique when insertion is + * checkingunique. + */ + newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique, + indexUnchanged, stack, heapRel); + _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack, + itup, insertstate.itemsz, newitemoff, + insertstate.postingoff, false); + } + else + { + /* just release the buffer */ + _bt_relbuf(rel, insertstate.buf); + } + + /* be tidy */ + if (stack) + _bt_freestack(stack); + pfree(itup_key); + + return is_unique; +} + +/* + * _bt_search_insert() -- _bt_search() wrapper for inserts + * + * Search the tree for a particular scankey, or more precisely for the first + * leaf page it could be on. Try to make use of the fastpath optimization's + * rightmost leaf page cache before actually searching the tree from the root + * page, though. + * + * Return value is a stack of parent-page pointers (though see notes about + * fastpath optimization and page splits below). insertstate->buf is set to + * the address of the leaf-page buffer, which is write-locked and pinned in + * all cases (if necessary by creating a new empty root page for caller). 
+ * + * The fastpath optimization avoids most of the work of searching the tree + * repeatedly when a single backend inserts successive new tuples on the + * rightmost leaf page of an index. A backend cache of the rightmost leaf + * page is maintained within _bt_insertonpg(), and used here. The cache is + * invalidated here when an insert of a non-pivot tuple must take place on a + * non-rightmost leaf page. + * + * The optimization helps with indexes on an auto-incremented field. It also + * helps with indexes on datetime columns, as well as indexes with lots of + * NULL values. (NULLs usually get inserted in the rightmost page for single + * column indexes, since they usually get treated as coming after everything + * else in the key space. Individual NULL tuples will generally be placed on + * the rightmost leaf page due to the influence of the heap TID column.) + * + * Note that we avoid applying the optimization when there is insufficient + * space on the rightmost page to fit caller's new item. This is necessary + * because we'll need to return a real descent stack when a page split is + * expected (actually, caller can cope with a leaf page split that uses a NULL + * stack, but that's very slow and so must be avoided). Note also that the + * fastpath optimization acquires the lock on the page conditionally as a way + * of reducing extra contention when there are concurrent insertions into the + * rightmost page (we give up if we'd have to wait for the lock). We assume + * that it isn't useful to apply the optimization when there is contention, + * since each per-backend cache won't stay valid for long. + */ +static BTStack +_bt_search_insert(Relation rel, BTInsertState insertstate) +{ + Assert(insertstate->buf == InvalidBuffer); + Assert(!insertstate->bounds_valid); + Assert(insertstate->postingoff == 0); + + if (RelationGetTargetBlock(rel) != InvalidBlockNumber) + { + /* Simulate a _bt_getbuf() call with conditional locking */ + insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel)); + if (_bt_conditionallockbuf(rel, insertstate->buf)) + { + Page page; + BTPageOpaque opaque; + + _bt_checkpage(rel, insertstate->buf); + page = BufferGetPage(insertstate->buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* + * Check if the page is still the rightmost leaf page and has + * enough free space to accommodate the new tuple. Also check + * that the insertion scan key is strictly greater than the first + * non-pivot tuple on the page. (Note that we expect itup_key's + * scantid to be unset when our caller is a checkingunique + * inserter.) + */ + if (P_RIGHTMOST(opaque) && + P_ISLEAF(opaque) && + !P_IGNORE(opaque) && + PageGetFreeSpace(page) > insertstate->itemsz && + PageGetMaxOffsetNumber(page) >= P_HIKEY && + _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0) + { + /* + * Caller can use the fastpath optimization because cached + * block is still rightmost leaf page, which can fit caller's + * new tuple without splitting. Keep block in local cache for + * next insert, and have caller use NULL stack. + * + * Note that _bt_insert_parent() has an assertion that catches + * leaf page splits that somehow follow from a fastpath insert + * (it should only be passed a NULL stack when it must deal + * with a concurrent root page split, and never because a NULL + * stack was returned here). 
+ */ + return NULL; + } + + /* Page unsuitable for caller, drop lock and pin */ + _bt_relbuf(rel, insertstate->buf); + } + else + { + /* Lock unavailable, drop pin */ + ReleaseBuffer(insertstate->buf); + } + + /* Forget block, since cache doesn't appear to be useful */ + RelationSetTargetBlock(rel, InvalidBlockNumber); + } + + /* Cannot use optimization -- descend tree, return proper descent stack */ + return _bt_search(rel, insertstate->itup_key, &insertstate->buf, BT_WRITE, + NULL); +} + +/* + * _bt_check_unique() -- Check for violation of unique index constraint + * + * Returns InvalidTransactionId if there is no conflict, else an xact ID + * we must wait for to see if it commits a conflicting tuple. If an actual + * conflict is detected, no return --- just ereport(). If an xact ID is + * returned, and the conflicting tuple still has a speculative insertion in + * progress, *speculativeToken is set to non-zero, and the caller can wait for + * the verdict on the insertion using SpeculativeInsertionWait(). + * + * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return + * InvalidTransactionId because we don't want to wait. In this case we + * set *is_unique to false if there is a potential conflict, and the + * core code must redo the uniqueness check later. + * + * As a side-effect, sets state in insertstate that can later be used by + * _bt_findinsertloc() to reuse most of the binary search work we do + * here. + * + * Do not call here when there are NULL values in scan key. NULL should be + * considered unequal to NULL when checking for duplicates, but we are not + * prepared to handle that correctly. + */ +static TransactionId +_bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, + IndexUniqueCheck checkUnique, bool *is_unique, + uint32 *speculativeToken) +{ + IndexTuple itup = insertstate->itup; + IndexTuple curitup = NULL; + ItemId curitemid = NULL; + BTScanInsert itup_key = insertstate->itup_key; + SnapshotData SnapshotDirty; + OffsetNumber offset; + OffsetNumber maxoff; + Page page; + BTPageOpaque opaque; + Buffer nbuf = InvalidBuffer; + bool found = false; + bool inposting = false; + bool prevalldead = true; + int curposti = 0; + + /* Assume unique until we find a duplicate */ + *is_unique = true; + + InitDirtySnapshot(SnapshotDirty); + + page = BufferGetPage(insertstate->buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + maxoff = PageGetMaxOffsetNumber(page); + + /* + * Find the first tuple with the same key. + * + * This also saves the binary search bounds in insertstate. We use them + * in the fastpath below, but also in the _bt_findinsertloc() call later. + */ + Assert(!insertstate->bounds_valid); + offset = _bt_binsrch_insert(rel, insertstate); + + /* + * Scan over all equal tuples, looking for live conflicts. + */ + Assert(!insertstate->bounds_valid || insertstate->low == offset); + Assert(!itup_key->anynullkeys); + Assert(itup_key->scantid == NULL); + for (;;) + { + /* + * Each iteration of the loop processes one heap TID, not one index + * tuple. Current offset number for page isn't usually advanced on + * iterations that process heap TIDs from posting list tuples. + * + * "inposting" state is set when _inside_ a posting list --- not when + * we're at the start (or end) of a posting list. We advance curposti + * at the end of the iteration when inside a posting list tuple. 
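+ * For example, a posting list tuple with three TIDs is handled across
+ * three consecutive iterations: the first sets "inposting" and examines
+ * the TID at position 0, the next two examine positions 1 and 2, and only
+ * the last of them advances the page offset (clearing "inposting" again).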
In + * general, every loop iteration either advances the page offset or + * advances curposti --- an iteration that handles the rightmost/max + * heap TID in a posting list finally advances the page offset (and + * unsets "inposting"). + * + * Make sure the offset points to an actual index tuple before trying + * to examine it... + */ + if (offset <= maxoff) + { + /* + * Fastpath: In most cases, we can use cached search bounds to + * limit our consideration to items that are definitely + * duplicates. This fastpath doesn't apply when the original page + * is empty, or when initial offset is past the end of the + * original page, which may indicate that we need to examine a + * second or subsequent page. + * + * Note that this optimization allows us to avoid calling + * _bt_compare() directly when there are no duplicates, as long as + * the offset where the key will go is not at the end of the page. + */ + if (nbuf == InvalidBuffer && offset == insertstate->stricthigh) + { + Assert(insertstate->bounds_valid); + Assert(insertstate->low >= P_FIRSTDATAKEY(opaque)); + Assert(insertstate->low <= insertstate->stricthigh); + Assert(_bt_compare(rel, itup_key, page, offset) < 0); + break; + } + + /* + * We can skip items that are already marked killed. + * + * In the presence of heavy update activity an index may contain + * many killed items with the same key; running _bt_compare() on + * each killed item gets expensive. Just advance over killed + * items as quickly as we can. We only apply _bt_compare() when + * we get to a non-killed item. We could reuse the bounds to + * avoid _bt_compare() calls for known equal tuples, but it + * doesn't seem worth it. + */ + if (!inposting) + curitemid = PageGetItemId(page, offset); + if (inposting || !ItemIdIsDead(curitemid)) + { + ItemPointerData htid; + bool all_dead = false; + + if (!inposting) + { + /* Plain tuple, or first TID in posting list tuple */ + if (_bt_compare(rel, itup_key, page, offset) != 0) + break; /* we're past all the equal tuples */ + + /* Advanced curitup */ + curitup = (IndexTuple) PageGetItem(page, curitemid); + Assert(!BTreeTupleIsPivot(curitup)); + } + + /* okay, we gotta fetch the heap tuple using htid ... */ + if (!BTreeTupleIsPosting(curitup)) + { + /* ... htid is from simple non-pivot tuple */ + Assert(!inposting); + htid = curitup->t_tid; + } + else if (!inposting) + { + /* ... htid is first TID in new posting list */ + inposting = true; + prevalldead = true; + curposti = 0; + htid = *BTreeTupleGetPostingN(curitup, 0); + } + else + { + /* ... htid is second or subsequent TID in posting list */ + Assert(curposti > 0); + htid = *BTreeTupleGetPostingN(curitup, curposti); + } + + /* + * If we are doing a recheck, we expect to find the tuple we + * are rechecking. It's not a duplicate, but we have to keep + * scanning. + */ + if (checkUnique == UNIQUE_CHECK_EXISTING && + ItemPointerCompare(&htid, &itup->t_tid) == 0) + { + found = true; + } + + /* + * Check if there's any table tuples for this index entry + * satisfying SnapshotDirty. This is necessary because for AMs + * with optimizations like heap's HOT, we have just a single + * index entry for the entire chain. + */ + else if (table_index_fetch_tuple_check(heapRel, &htid, + &SnapshotDirty, + &all_dead)) + { + TransactionId xwait; + + /* + * It is a duplicate. If we are only doing a partial + * check, then don't bother checking if the tuple is being + * updated in another transaction. 
Just return the fact + * that it is a potential conflict and leave the full + * check till later. Don't invalidate binary search + * bounds. + */ + if (checkUnique == UNIQUE_CHECK_PARTIAL) + { + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + *is_unique = false; + return InvalidTransactionId; + } + + /* + * If this tuple is being updated by other transaction + * then we have to wait for its commit/abort. + */ + xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ? + SnapshotDirty.xmin : SnapshotDirty.xmax; + + if (TransactionIdIsValid(xwait)) + { + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + /* Tell _bt_doinsert to wait... */ + *speculativeToken = SnapshotDirty.speculativeToken; + /* Caller releases lock on buf immediately */ + insertstate->bounds_valid = false; + return xwait; + } + + /* + * Otherwise we have a definite conflict. But before + * complaining, look to see if the tuple we want to insert + * is itself now committed dead --- if so, don't complain. + * This is a waste of time in normal scenarios but we must + * do it to support CREATE INDEX CONCURRENTLY. + * + * We must follow HOT-chains here because during + * concurrent index build, we insert the root TID though + * the actual tuple may be somewhere in the HOT-chain. + * While following the chain we might not stop at the + * exact tuple which triggered the insert, but that's OK + * because if we find a live tuple anywhere in this chain, + * we have a unique key conflict. The other live tuple is + * not part of this chain because it had a different index + * entry. + */ + htid = itup->t_tid; + if (table_index_fetch_tuple_check(heapRel, &htid, + SnapshotSelf, NULL)) + { + /* Normal case --- it's still live */ + } + else + { + /* + * It's been deleted, so no error, and no need to + * continue searching + */ + break; + } + + /* + * Check for a conflict-in as we would if we were going to + * write to this page. We aren't actually going to write, + * but we want a chance to report SSI conflicts that would + * otherwise be masked by this unique constraint + * violation. + */ + CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf)); + + /* + * This is a definite conflict. Break the tuple down into + * datums and report the error. But first, make sure we + * release the buffer locks we're holding --- + * BuildIndexValueDescription could make catalog accesses, + * which in the worst case might touch this same index and + * cause deadlocks. + */ + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + _bt_relbuf(rel, insertstate->buf); + insertstate->buf = InvalidBuffer; + insertstate->bounds_valid = false; + + { + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + char *key_desc; + + index_deform_tuple(itup, RelationGetDescr(rel), + values, isnull); + + key_desc = BuildIndexValueDescription(rel, values, + isnull); + + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("duplicate key value violates unique constraint \"%s\"", + RelationGetRelationName(rel)), + key_desc ? errdetail("Key %s already exists.", + key_desc) : 0, + errtableconstraint(heapRel, + RelationGetRelationName(rel)))); + } + } + else if (all_dead && (!inposting || + (prevalldead && + curposti == BTreeTupleGetNPosting(curitup) - 1))) + { + /* + * The conflicting tuple (or all HOT chains pointed to by + * all posting list TIDs) is dead to everyone, so mark the + * index entry killed. 
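+ * (For a posting list tuple this can only happen on its final TID,
+ * and only when every earlier TID's HOT chain was also found to be
+ * all dead -- see the prevalldead bookkeeping below.)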
+ */ + ItemIdMarkDead(curitemid); + opaque->btpo_flags |= BTP_HAS_GARBAGE; + + /* + * Mark buffer with a dirty hint, since state is not + * crucial. Be sure to mark the proper buffer dirty. + */ + if (nbuf != InvalidBuffer) + MarkBufferDirtyHint(nbuf, true); + else + MarkBufferDirtyHint(insertstate->buf, true); + } + + /* + * Remember if posting list tuple has even a single HOT chain + * whose members are not all dead + */ + if (!all_dead && inposting) + prevalldead = false; + } + } + + if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1) + { + /* Advance to next TID in same posting list */ + curposti++; + continue; + } + else if (offset < maxoff) + { + /* Advance to next tuple */ + curposti = 0; + inposting = false; + offset = OffsetNumberNext(offset); + } + else + { + int highkeycmp; + + /* If scankey == hikey we gotta check the next page too */ + if (P_RIGHTMOST(opaque)) + break; + highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY); + Assert(highkeycmp <= 0); + if (highkeycmp != 0) + break; + /* Advance to next non-dead page --- there must be one */ + for (;;) + { + BlockNumber nblkno = opaque->btpo_next; + + nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ); + page = BufferGetPage(nbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (!P_IGNORE(opaque)) + break; + if (P_RIGHTMOST(opaque)) + elog(ERROR, "fell off the end of index \"%s\"", + RelationGetRelationName(rel)); + } + /* Will also advance to next tuple */ + curposti = 0; + inposting = false; + maxoff = PageGetMaxOffsetNumber(page); + offset = P_FIRSTDATAKEY(opaque); + /* Don't invalidate binary search bounds */ + } + } + + /* + * If we are doing a recheck then we should have found the tuple we are + * checking. Otherwise there's something very wrong --- probably, the + * index is on a non-immutable expression. + */ + if (checkUnique == UNIQUE_CHECK_EXISTING && !found) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to re-find tuple within index \"%s\"", + RelationGetRelationName(rel)), + errhint("This may be because of a non-immutable index expression."), + errtableconstraint(heapRel, + RelationGetRelationName(rel)))); + + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + + return InvalidTransactionId; +} + + +/* + * _bt_findinsertloc() -- Finds an insert location for a tuple + * + * On entry, insertstate buffer contains the page the new tuple belongs + * on. It is exclusive-locked and pinned by the caller. + * + * If 'checkingunique' is true, the buffer on entry is the first page + * that contains duplicates of the new key. If there are duplicates on + * multiple pages, the correct insertion position might be some page to + * the right, rather than the first page. In that case, this function + * moves right to the correct target page. + * + * (In a !heapkeyspace index, there can be multiple pages with the same + * high key, where the new tuple could legitimately be placed on. In + * that case, the caller passes the first page containing duplicates, + * just like when checkingunique=true. If that page doesn't have enough + * room for the new tuple, this function moves right, trying to find a + * legal page that does.) + * + * If 'indexUnchanged' is true, this is for an UPDATE that didn't + * logically change the indexed value, but must nevertheless have a new + * entry to point to a successor version. This hint from the executor + * will influence our behavior when the page might have to be split and + * we must consider our options. 
Bottom-up index deletion can avoid + * pathological version-driven page splits, but we only want to go to the + * trouble of trying it when we already have moderate confidence that + * it's appropriate. The hint should not significantly affect our + * behavior over time unless practically all inserts on to the leaf page + * get the hint. + * + * On exit, insertstate buffer contains the chosen insertion page, and + * the offset within that page is returned. If _bt_findinsertloc needed + * to move right, the lock and pin on the original page are released, and + * the new buffer is exclusively locked and pinned instead. + * + * If insertstate contains cached binary search bounds, we will take + * advantage of them. This avoids repeating comparisons that we made in + * _bt_check_unique() already. + * + * If there is not enough room on the page for the new tuple, we try to + * make room by removing any LP_DEAD tuples. + */ +static OffsetNumber +_bt_findinsertloc(Relation rel, + BTInsertState insertstate, + bool checkingunique, + bool indexUnchanged, + BTStack stack, + Relation heapRel) +{ + BTScanInsert itup_key = insertstate->itup_key; + Page page = BufferGetPage(insertstate->buf); + BTPageOpaque opaque; + OffsetNumber newitemoff; + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* Check 1/3 of a page restriction */ + if (unlikely(insertstate->itemsz > BTMaxItemSize(page))) + _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page, + insertstate->itup); + + Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque)); + Assert(!insertstate->bounds_valid || checkingunique); + Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL); + Assert(itup_key->heapkeyspace || itup_key->scantid == NULL); + Assert(!itup_key->allequalimage || itup_key->heapkeyspace); + + if (itup_key->heapkeyspace) + { + /* Keep track of whether checkingunique duplicate seen */ + bool uniquedup = indexUnchanged; + + /* + * If we're inserting into a unique index, we may have to walk right + * through leaf pages to find the one leaf page that we must insert on + * to. + * + * This is needed for checkingunique callers because a scantid was not + * used when we called _bt_search(). scantid can only be set after + * _bt_check_unique() has checked for duplicates. The buffer + * initially stored in insertstate->buf has the page where the first + * duplicate key might be found, which isn't always the page that new + * tuple belongs on. The heap TID attribute for new tuple (scantid) + * could force us to insert on a sibling page, though that should be + * very rare in practice. + */ + if (checkingunique) + { + if (insertstate->low < insertstate->stricthigh) + { + /* Encountered a duplicate in _bt_check_unique() */ + Assert(insertstate->bounds_valid); + uniquedup = true; + } + + for (;;) + { + /* + * Does the new tuple belong on this page? + * + * The earlier _bt_check_unique() call may well have + * established a strict upper bound on the offset for the new + * item. If it's not the last item of the page (i.e. if there + * is at least one tuple on the page that goes after the tuple + * we're inserting) then we know that the tuple belongs on + * this page. We can skip the high key check. 
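+ * For example, with stricthigh pointing at the 70th item of a page
+ * holding 100 items, the new tuple is known to sort before an existing
+ * item on this page; only a stricthigh past the last item leaves open
+ * the possibility that the tuple belongs on a page to the right.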
+ */ + if (insertstate->bounds_valid && + insertstate->low <= insertstate->stricthigh && + insertstate->stricthigh <= PageGetMaxOffsetNumber(page)) + break; + + /* Test '<=', not '!=', since scantid is set now */ + if (P_RIGHTMOST(opaque) || + _bt_compare(rel, itup_key, page, P_HIKEY) <= 0) + break; + + _bt_stepright(rel, insertstate, stack); + /* Update local state after stepping right */ + page = BufferGetPage(insertstate->buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + /* Assume duplicates (if checkingunique) */ + uniquedup = true; + } + } + + /* + * If the target page cannot fit newitem, try to avoid splitting the + * page on insert by performing deletion or deduplication now + */ + if (PageGetFreeSpace(page) < insertstate->itemsz) + _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false, + checkingunique, uniquedup, + indexUnchanged); + } + else + { + /*---------- + * This is a !heapkeyspace (version 2 or 3) index. The current page + * is the first page that we could insert the new tuple to, but there + * may be other pages to the right that we could opt to use instead. + * + * If the new key is equal to one or more existing keys, we can + * legitimately place it anywhere in the series of equal keys. In + * fact, if the new key is equal to the page's "high key" we can place + * it on the next page. If it is equal to the high key, and there's + * not room to insert the new tuple on the current page without + * splitting, then we move right hoping to find more free space and + * avoid a split. + * + * Keep scanning right until we + * (a) find a page with enough free space, + * (b) reach the last page where the tuple can legally go, or + * (c) get tired of searching. + * (c) is not flippant; it is important because if there are many + * pages' worth of equal keys, it's better to split one of the early + * pages than to scan all the way to the end of the run of equal keys + * on every insert. We implement "get tired" as a random choice, + * since stopping after scanning a fixed number of pages wouldn't work + * well (we'd never reach the right-hand side of previously split + * pages). The probability of moving right is set at 0.99, which may + * seem too high to change the behavior much, but it does an excellent + * job of preventing O(N^2) behavior with many equal keys. + *---------- + */ + while (PageGetFreeSpace(page) < insertstate->itemsz) + { + /* + * Before considering moving right, see if we can obtain enough + * space by erasing LP_DEAD items + */ + if (P_HAS_GARBAGE(opaque)) + { + /* Perform simple deletion */ + _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true, + false, false, false); + + if (PageGetFreeSpace(page) >= insertstate->itemsz) + break; /* OK, now we have enough space */ + } + + /* + * Nope, so check conditions (b) and (c) enumerated above + * + * The earlier _bt_check_unique() call may well have established a + * strict upper bound on the offset for the new item. If it's not + * the last item of the page (i.e. if there is at least one tuple + * on the page that's greater than the tuple we're inserting to) + * then we know that the tuple belongs on this page. We can skip + * the high key check. 
+ */ + if (insertstate->bounds_valid && + insertstate->low <= insertstate->stricthigh && + insertstate->stricthigh <= PageGetMaxOffsetNumber(page)) + break; + + if (P_RIGHTMOST(opaque) || + _bt_compare(rel, itup_key, page, P_HIKEY) != 0 || + random() <= (MAX_RANDOM_VALUE / 100)) + break; + + _bt_stepright(rel, insertstate, stack); + /* Update local state after stepping right */ + page = BufferGetPage(insertstate->buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + } + } + + /* + * We should now be on the correct page. Find the offset within the page + * for the new tuple. (Possibly reusing earlier search bounds.) + */ + Assert(P_RIGHTMOST(opaque) || + _bt_compare(rel, itup_key, page, P_HIKEY) <= 0); + + newitemoff = _bt_binsrch_insert(rel, insertstate); + + if (insertstate->postingoff == -1) + { + /* + * There is an overlapping posting list tuple with its LP_DEAD bit + * set. We don't want to unnecessarily unset its LP_DEAD bit while + * performing a posting list split, so perform simple index tuple + * deletion early. + */ + _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true, + false, false, false); + + /* + * Do new binary search. New insert location cannot overlap with any + * posting list now. + */ + Assert(!insertstate->bounds_valid); + insertstate->postingoff = 0; + newitemoff = _bt_binsrch_insert(rel, insertstate); + Assert(insertstate->postingoff == 0); + } + + return newitemoff; +} + +/* + * Step right to next non-dead page, during insertion. + * + * This is a bit more complicated than moving right in a search. We must + * write-lock the target page before releasing write lock on current page; + * else someone else's _bt_check_unique scan could fail to see our insertion. + * Write locks on intermediate dead pages won't do because we don't know when + * they will get de-linked from the tree. + * + * This is more aggressive than it needs to be for non-unique !heapkeyspace + * indexes. + */ +static void +_bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack) +{ + Page page; + BTPageOpaque opaque; + Buffer rbuf; + BlockNumber rblkno; + + page = BufferGetPage(insertstate->buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + rbuf = InvalidBuffer; + rblkno = opaque->btpo_next; + for (;;) + { + rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE); + page = BufferGetPage(rbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* + * If this page was incompletely split, finish the split now. We do + * this while holding a lock on the left sibling, which is not good + * because finishing the split could be a fairly lengthy operation. + * But this should happen very seldom. + */ + if (P_INCOMPLETE_SPLIT(opaque)) + { + _bt_finish_split(rel, rbuf, stack); + rbuf = InvalidBuffer; + continue; + } + + if (!P_IGNORE(opaque)) + break; + if (P_RIGHTMOST(opaque)) + elog(ERROR, "fell off the end of index \"%s\"", + RelationGetRelationName(rel)); + + rblkno = opaque->btpo_next; + } + /* rbuf locked; unlock buf, update state for caller */ + _bt_relbuf(rel, insertstate->buf); + insertstate->buf = rbuf; + insertstate->bounds_valid = false; +} + +/*---------- + * _bt_insertonpg() -- Insert a tuple on a particular page in the index. + * + * This recursive procedure does the following things: + * + * + if postingoff != 0, splits existing posting list tuple + * (since it overlaps with new 'itup' tuple). + * + if necessary, splits the target page, using 'itup_key' for + * suffix truncation on leaf pages (caller passes NULL for + * non-leaf pages). 
+ * + inserts the new tuple (might be split from posting list). + * + if the page was split, pops the parent stack, and finds the + * right place to insert the new child pointer (by walking + * right using information stored in the parent stack). + * + invokes itself with the appropriate tuple for the right + * child page on the parent. + * + updates the metapage if a true root or fast root is split. + * + * On entry, we must have the correct buffer in which to do the + * insertion, and the buffer must be pinned and write-locked. On return, + * we will have dropped both the pin and the lock on the buffer. + * + * This routine only performs retail tuple insertions. 'itup' should + * always be either a non-highkey leaf item, or a downlink (new high + * key items are created indirectly, when a page is split). When + * inserting to a non-leaf page, 'cbuf' is the left-sibling of the page + * we're inserting the downlink for. This function will clear the + * INCOMPLETE_SPLIT flag on it, and release the buffer. + *---------- + */ +static void +_bt_insertonpg(Relation rel, + BTScanInsert itup_key, + Buffer buf, + Buffer cbuf, + BTStack stack, + IndexTuple itup, + Size itemsz, + OffsetNumber newitemoff, + int postingoff, + bool split_only_page) +{ + Page page; + BTPageOpaque opaque; + bool isleaf, + isroot, + isrightmost, + isonly; + IndexTuple oposting = NULL; + IndexTuple origitup = NULL; + IndexTuple nposting = NULL; + + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + isleaf = P_ISLEAF(opaque); + isroot = P_ISROOT(opaque); + isrightmost = P_RIGHTMOST(opaque); + isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque); + + /* child buffer must be given iff inserting on an internal page */ + Assert(isleaf == !BufferIsValid(cbuf)); + /* tuple must have appropriate number of attributes */ + Assert(!isleaf || + BTreeTupleGetNAtts(itup, rel) == + IndexRelationGetNumberOfAttributes(rel)); + Assert(isleaf || + BTreeTupleGetNAtts(itup, rel) <= + IndexRelationGetNumberOfKeyAttributes(rel)); + Assert(!BTreeTupleIsPosting(itup)); + Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz); + /* Caller must always finish incomplete split for us */ + Assert(!P_INCOMPLETE_SPLIT(opaque)); + + /* + * Every internal page should have exactly one negative infinity item at + * all times. Only _bt_split() and _bt_newroot() should add items that + * become negative infinity items through truncation, since they're the + * only routines that allocate new internal pages. + */ + Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque)); + + /* + * Do we need to split an existing posting list item? + */ + if (postingoff != 0) + { + ItemId itemid = PageGetItemId(page, newitemoff); + + /* + * The new tuple is a duplicate with a heap TID that falls inside the + * range of an existing posting list tuple on a leaf page. Prepare to + * split an existing posting list. Overwriting the posting list with + * its post-split version is treated as an extra step in either the + * insert or page split critical section. + */ + Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage); + oposting = (IndexTuple) PageGetItem(page, itemid); + + /* + * postingoff value comes from earlier call to _bt_binsrch_posting(). + * Its binary search might think that a plain tuple must be a posting + * list tuple that needs to be split. This can happen with corruption + * involving an existing plain tuple that is a duplicate of the new + * item, up to and including its table TID. Check for that here in + * passing. 
+ * + * Also verify that our caller has made sure that the existing posting + * list tuple does not have its LP_DEAD bit set. + */ + if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"", + ItemPointerGetBlockNumber(&itup->t_tid), + ItemPointerGetOffsetNumber(&itup->t_tid), + newitemoff, BufferGetBlockNumber(buf), + RelationGetRelationName(rel)))); + + /* use a mutable copy of itup as our itup from here on */ + origitup = itup; + itup = CopyIndexTuple(origitup); + nposting = _bt_swap_posting(itup, oposting, postingoff); + /* itup now contains rightmost/max TID from oposting */ + + /* Alter offset so that newitem goes after posting list */ + newitemoff = OffsetNumberNext(newitemoff); + } + + /* + * Do we need to split the page to fit the item on it? + * + * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result, + * so this comparison is correct even though we appear to be accounting + * only for the item and not for its line pointer. + */ + if (PageGetFreeSpace(page) < itemsz) + { + Buffer rbuf; + + Assert(!split_only_page); + + /* split the buffer into left and right halves */ + rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup, + origitup, nposting, postingoff); + PredicateLockPageSplit(rel, + BufferGetBlockNumber(buf), + BufferGetBlockNumber(rbuf)); + + /*---------- + * By here, + * + * + our target page has been split; + * + the original tuple has been inserted; + * + we have write locks on both the old (left half) + * and new (right half) buffers, after the split; and + * + we know the key we want to insert into the parent + * (it's the "high key" on the left child page). + * + * We're ready to do the parent insertion. We need to hold onto the + * locks for the child pages until we locate the parent, but we can + * at least release the lock on the right child before doing the + * actual insertion. The lock on the left child will be released + * last of all by parent insertion, where it is the 'cbuf' of parent + * page. + *---------- + */ + _bt_insert_parent(rel, buf, rbuf, stack, isroot, isonly); + } + else + { + Buffer metabuf = InvalidBuffer; + Page metapg = NULL; + BTMetaPageData *metad = NULL; + BlockNumber blockcache; + + /* + * If we are doing this insert because we split a page that was the + * only one on its tree level, but was not the root, it may have been + * the "fast root". We need to ensure that the fast root link points + * at or above the current page. We can safely acquire a lock on the + * metapage here --- see comments for _bt_newroot(). + */ + if (unlikely(split_only_page)) + { + Assert(!isleaf); + Assert(BufferIsValid(cbuf)); + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + metapg = BufferGetPage(metabuf); + metad = BTPageGetMeta(metapg); + + if (metad->btm_fastlevel >= opaque->btpo_level) + { + /* no update wanted */ + _bt_relbuf(rel, metabuf); + metabuf = InvalidBuffer; + } + } + + /* Do the update. 
No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + if (postingoff != 0) + memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting))); + + if (PageAddItem(page, (Item) itup, itemsz, newitemoff, false, + false) == InvalidOffsetNumber) + elog(PANIC, "failed to add new item to block %u in index \"%s\"", + BufferGetBlockNumber(buf), RelationGetRelationName(rel)); + + MarkBufferDirty(buf); + + if (BufferIsValid(metabuf)) + { + /* upgrade meta-page if needed */ + if (metad->btm_version < BTREE_NOVAC_VERSION) + _bt_upgrademetapage(metapg); + metad->btm_fastroot = BufferGetBlockNumber(buf); + metad->btm_fastlevel = opaque->btpo_level; + MarkBufferDirty(metabuf); + } + + /* + * Clear INCOMPLETE_SPLIT flag on child if inserting the new item + * finishes a split + */ + if (!isleaf) + { + Page cpage = BufferGetPage(cbuf); + BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage); + + Assert(P_INCOMPLETE_SPLIT(cpageop)); + cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; + MarkBufferDirty(cbuf); + } + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_btree_insert xlrec; + xl_btree_metadata xlmeta; + uint8 xlinfo; + XLogRecPtr recptr; + uint16 upostingoff; + + xlrec.offnum = newitemoff; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert); + + if (isleaf && postingoff == 0) + { + /* Simple leaf insert */ + xlinfo = XLOG_BTREE_INSERT_LEAF; + } + else if (postingoff != 0) + { + /* + * Leaf insert with posting list split. Must include + * postingoff field before newitem/orignewitem. + */ + Assert(isleaf); + xlinfo = XLOG_BTREE_INSERT_POST; + } + else + { + /* Internal page insert, which finishes a split on cbuf */ + xlinfo = XLOG_BTREE_INSERT_UPPER; + XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD); + + if (BufferIsValid(metabuf)) + { + /* Actually, it's an internal page insert + meta update */ + xlinfo = XLOG_BTREE_INSERT_META; + + Assert(metad->btm_version >= BTREE_NOVAC_VERSION); + xlmeta.version = metad->btm_version; + xlmeta.root = metad->btm_root; + xlmeta.level = metad->btm_level; + xlmeta.fastroot = metad->btm_fastroot; + xlmeta.fastlevel = metad->btm_fastlevel; + xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages; + xlmeta.allequalimage = metad->btm_allequalimage; + + XLogRegisterBuffer(2, metabuf, + REGBUF_WILL_INIT | REGBUF_STANDARD); + XLogRegisterBufData(2, (char *) &xlmeta, + sizeof(xl_btree_metadata)); + } + } + + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + if (postingoff == 0) + { + /* Just log itup from caller */ + XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup)); + } + else + { + /* + * Insert with posting list split (XLOG_BTREE_INSERT_POST + * record) case. + * + * Log postingoff. Also log origitup, not itup. REDO routine + * must reconstruct final itup (as well as nposting) using + * _bt_swap_posting(). + */ + upostingoff = postingoff; + + XLogRegisterBufData(0, (char *) &upostingoff, sizeof(uint16)); + XLogRegisterBufData(0, (char *) origitup, + IndexTupleSize(origitup)); + } + + recptr = XLogInsert(RM_BTREE_ID, xlinfo); + + if (BufferIsValid(metabuf)) + PageSetLSN(metapg, recptr); + if (!isleaf) + PageSetLSN(BufferGetPage(cbuf), recptr); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + /* Release subsidiary buffers */ + if (BufferIsValid(metabuf)) + _bt_relbuf(rel, metabuf); + if (!isleaf) + _bt_relbuf(rel, cbuf); + + /* + * Cache the block number if this is the rightmost leaf page. Cache + * may be used by a future inserter within _bt_search_insert(). 
+ */ + blockcache = InvalidBlockNumber; + if (isrightmost && isleaf && !isroot) + blockcache = BufferGetBlockNumber(buf); + + /* Release buffer for insertion target block */ + _bt_relbuf(rel, buf); + + /* + * If we decided to cache the insertion target block before releasing + * its buffer lock, then cache it now. Check the height of the tree + * first, though. We don't go for the optimization with small + * indexes. Defer final check to this point to ensure that we don't + * call _bt_getrootheight while holding a buffer lock. + */ + if (BlockNumberIsValid(blockcache) && + _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL) + RelationSetTargetBlock(rel, blockcache); + } + + /* be tidy */ + if (postingoff != 0) + { + /* itup is actually a modified copy of caller's original */ + pfree(nposting); + pfree(itup); + } +} + +/* + * _bt_split() -- split a page in the btree. + * + * On entry, buf is the page to split, and is pinned and write-locked. + * newitemoff etc. tell us about the new item that must be inserted + * along with the data from the original page. + * + * itup_key is used for suffix truncation on leaf pages (internal + * page callers pass NULL). When splitting a non-leaf page, 'cbuf' + * is the left-sibling of the page we're inserting the downlink for. + * This function will clear the INCOMPLETE_SPLIT flag on it, and + * release the buffer. + * + * orignewitem, nposting, and postingoff are needed when an insert of + * orignewitem results in both a posting list split and a page split. + * These extra posting list split details are used here in the same + * way as they are used in the more common case where a posting list + * split does not coincide with a page split. We need to deal with + * posting list splits directly in order to ensure that everything + * that follows from the insert of orignewitem is handled as a single + * atomic operation (though caller's insert of a new pivot/downlink + * into parent page will still be a separate operation). See + * nbtree/README for details on the design of posting list splits. + * + * Returns the new right sibling of buf, pinned and write-locked. + * The pin and lock on buf are maintained. + */ +static Buffer +_bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, + OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, + IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff) +{ + Buffer rbuf; + Page origpage; + Page leftpage, + rightpage; + BlockNumber origpagenumber, + rightpagenumber; + BTPageOpaque ropaque, + lopaque, + oopaque; + Buffer sbuf = InvalidBuffer; + Page spage = NULL; + BTPageOpaque sopaque = NULL; + Size itemsz; + ItemId itemid; + IndexTuple firstright, + lefthighkey; + OffsetNumber firstrightoff; + OffsetNumber afterleftoff, + afterrightoff, + minusinfoff; + OffsetNumber origpagepostingoff; + OffsetNumber maxoff; + OffsetNumber i; + bool newitemonleft, + isleaf, + isrightmost; + + /* + * origpage is the original page to be split. leftpage is a temporary + * buffer that receives the left-sibling data, which will be copied back + * into origpage on success. rightpage is the new page that will receive + * the right-sibling data. + * + * leftpage is allocated after choosing a split point. rightpage's new + * buffer isn't acquired until after leftpage is initialized and has new + * high key, the last point where splitting the page may fail (barring + * corruption). 
Failing before acquiring new buffer won't have lasting + * consequences, since origpage won't have been modified and leftpage is + * only workspace. + */ + origpage = BufferGetPage(buf); + oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage); + isleaf = P_ISLEAF(oopaque); + isrightmost = P_RIGHTMOST(oopaque); + maxoff = PageGetMaxOffsetNumber(origpage); + origpagenumber = BufferGetBlockNumber(buf); + + /* + * Choose a point to split origpage at. + * + * A split point can be thought of as a point _between_ two existing data + * items on origpage (the lastleft and firstright tuples), provided you + * pretend that the new item that didn't fit is already on origpage. + * + * Since origpage does not actually contain newitem, the representation of + * split points needs to work with two boundary cases: splits where + * newitem is lastleft, and splits where newitem is firstright. + * newitemonleft resolves the ambiguity that would otherwise exist when + * newitemoff == firstrightoff. In all other cases it's clear which side + * of the split every tuple goes on from context. newitemonleft is + * usually (but not always) redundant information. + * + * firstrightoff is supposed to be an origpage offset number, but it's + * possible that its value will be maxoff+1, which is "past the end" of + * origpage. This happens in the rare case where newitem goes after all + * existing items (i.e. newitemoff is maxoff+1) and we end up splitting + * origpage at the point that leaves newitem alone on new right page. Any + * "!newitemonleft && newitemoff == firstrightoff" split point makes + * newitem the firstright tuple, though, so this case isn't a special + * case. + */ + firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz, + newitem, &newitemonleft); + + /* Allocate temp buffer for leftpage */ + leftpage = PageGetTempPage(origpage); + _bt_pageinit(leftpage, BufferGetPageSize(buf)); + lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage); + + /* + * leftpage won't be the root when we're done. Also, clear the SPLIT_END + * and HAS_GARBAGE flags. + */ + lopaque->btpo_flags = oopaque->btpo_flags; + lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE); + /* set flag in leftpage indicating that rightpage has no downlink yet */ + lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT; + lopaque->btpo_prev = oopaque->btpo_prev; + /* handle btpo_next after rightpage buffer acquired */ + lopaque->btpo_level = oopaque->btpo_level; + /* handle btpo_cycleid after rightpage buffer acquired */ + + /* + * Copy the original page's LSN into leftpage, which will become the + * updated version of the page. We need this because XLogInsert will + * examine the LSN and possibly dump it in a page image. + */ + PageSetLSN(leftpage, PageGetLSN(origpage)); + + /* + * Determine page offset number of existing overlapped-with-orignewitem + * posting list when it is necessary to perform a posting list split in + * passing. Note that newitem was already changed by caller (newitem no + * longer has the orignewitem TID). + * + * This page offset number (origpagepostingoff) will be used to pretend + * that the posting split has already taken place, even though the + * required modifications to origpage won't occur until we reach the + * critical section. The lastleft and firstright tuples of our page split + * point should, in effect, come from an imaginary version of origpage + * that has the nposting tuple instead of the original posting list tuple. 
+ * + * Note: _bt_findsplitloc() should have compensated for coinciding posting + * list splits in just the same way, at least in theory. It doesn't + * bother with that, though. In practice it won't affect its choice of + * split point. + */ + origpagepostingoff = InvalidOffsetNumber; + if (postingoff != 0) + { + Assert(isleaf); + Assert(ItemPointerCompare(&orignewitem->t_tid, + &newitem->t_tid) < 0); + Assert(BTreeTupleIsPosting(nposting)); + origpagepostingoff = OffsetNumberPrev(newitemoff); + } + + /* + * The high key for the new left page is a possibly-truncated copy of + * firstright on the leaf level (it's "firstright itself" on internal + * pages; see !isleaf comments below). This may seem to be contrary to + * Lehman & Yao's approach of using a copy of lastleft as the new high key + * when splitting on the leaf level. It isn't, though. + * + * Suffix truncation will leave the left page's high key fully equal to + * lastleft when lastleft and firstright are equal prior to heap TID (that + * is, the tiebreaker TID value comes from lastleft). It isn't actually + * necessary for a new leaf high key to be a copy of lastleft for the L&Y + * "subtree" invariant to hold. It's sufficient to make sure that the new + * leaf high key is strictly less than firstright, and greater than or + * equal to (not necessarily equal to) lastleft. In other words, when + * suffix truncation isn't possible during a leaf page split, we take + * L&Y's exact approach to generating a new high key for the left page. + * (Actually, that is slightly inaccurate. We don't just use a copy of + * lastleft. A tuple with all the keys from firstright but the max heap + * TID from lastleft is used, to avoid introducing a special case.) + */ + if (!newitemonleft && newitemoff == firstrightoff) + { + /* incoming tuple becomes firstright */ + itemsz = newitemsz; + firstright = newitem; + } + else + { + /* existing item at firstrightoff becomes firstright */ + itemid = PageGetItemId(origpage, firstrightoff); + itemsz = ItemIdGetLength(itemid); + firstright = (IndexTuple) PageGetItem(origpage, itemid); + if (firstrightoff == origpagepostingoff) + firstright = nposting; + } + + if (isleaf) + { + IndexTuple lastleft; + + /* Attempt suffix truncation for leaf page splits */ + if (newitemonleft && newitemoff == firstrightoff) + { + /* incoming tuple becomes lastleft */ + lastleft = newitem; + } + else + { + OffsetNumber lastleftoff; + + /* existing item before firstrightoff becomes lastleft */ + lastleftoff = OffsetNumberPrev(firstrightoff); + Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque)); + itemid = PageGetItemId(origpage, lastleftoff); + lastleft = (IndexTuple) PageGetItem(origpage, itemid); + if (lastleftoff == origpagepostingoff) + lastleft = nposting; + } + + lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key); + itemsz = IndexTupleSize(lefthighkey); + } + else + { + /* + * Don't perform suffix truncation on a copy of firstright to make + * left page high key for internal page splits. Must use firstright + * as new high key directly. + * + * Each distinct separator key value originates as a leaf level high + * key; all other separator keys/pivot tuples are copied from one + * level down. A separator key in a grandparent page must be + * identical to high key in rightmost parent page of the subtree to + * its left, which must itself be identical to high key in rightmost + * child page of that same subtree (this even applies to separator + * from grandparent's high key). 
There must always be an unbroken + * "seam" of identical separator keys that guide index scans at every + * level, starting from the grandparent. That's why suffix truncation + * is unsafe here. + * + * Internal page splits will truncate firstright into a "negative + * infinity" data item when it gets inserted on the new right page + * below, though. This happens during the call to _bt_pgaddtup() for + * the new first data item for right page. Do not confuse this + * mechanism with suffix truncation. It is just a convenient way of + * implementing page splits that split the internal page "inside" + * firstright. The lefthighkey separator key cannot appear a second + * time in the right page (only firstright's downlink goes in right + * page). + */ + lefthighkey = firstright; + } + + /* + * Add new high key to leftpage + */ + afterleftoff = P_HIKEY; + + Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0); + Assert(BTreeTupleGetNAtts(lefthighkey, rel) <= + IndexRelationGetNumberOfKeyAttributes(rel)); + Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey))); + if (PageAddItem(leftpage, (Item) lefthighkey, itemsz, afterleftoff, false, + false) == InvalidOffsetNumber) + elog(ERROR, "failed to add high key to the left sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + afterleftoff = OffsetNumberNext(afterleftoff); + + /* + * Acquire a new right page to split into, now that left page has a new + * high key. From here on, it's not okay to throw an error without + * zeroing rightpage first. This coding rule ensures that we won't + * confuse future VACUUM operations, which might otherwise try to re-find + * a downlink to a leftover junk page as the page undergoes deletion. + * + * It would be reasonable to start the critical section just after the new + * rightpage buffer is acquired instead; that would allow us to avoid + * leftover junk pages without bothering to zero rightpage. We do it this + * way because it avoids an unnecessary PANIC when either origpage or its + * existing sibling page are corrupt. + */ + rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); + rightpage = BufferGetPage(rbuf); + rightpagenumber = BufferGetBlockNumber(rbuf); + /* rightpage was initialized by _bt_getbuf */ + ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage); + + /* + * Finish off remaining leftpage special area fields. They cannot be set + * before both origpage (leftpage) and rightpage buffers are acquired and + * locked. + * + * btpo_cycleid is only used with leaf pages, though we set it here in all + * cases just to be consistent. + */ + lopaque->btpo_next = rightpagenumber; + lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel); + + /* + * rightpage won't be the root when we're done. Also, clear the SPLIT_END + * and HAS_GARBAGE flags. + */ + ropaque->btpo_flags = oopaque->btpo_flags; + ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE); + ropaque->btpo_prev = origpagenumber; + ropaque->btpo_next = oopaque->btpo_next; + ropaque->btpo_level = oopaque->btpo_level; + ropaque->btpo_cycleid = lopaque->btpo_cycleid; + + /* + * Add new high key to rightpage where necessary. + * + * If the page we're splitting is not the rightmost page at its level in + * the tree, then the first entry on the page is the high key from + * origpage. 
+ */ + afterrightoff = P_HIKEY; + + if (!isrightmost) + { + IndexTuple righthighkey; + + itemid = PageGetItemId(origpage, P_HIKEY); + itemsz = ItemIdGetLength(itemid); + righthighkey = (IndexTuple) PageGetItem(origpage, itemid); + Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0); + Assert(BTreeTupleGetNAtts(righthighkey, rel) <= + IndexRelationGetNumberOfKeyAttributes(rel)); + if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff, + false, false) == InvalidOffsetNumber) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add high key to the right sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + afterrightoff = OffsetNumberNext(afterrightoff); + } + + /* + * Internal page splits truncate first data item on right page -- it + * becomes "minus infinity" item for the page. Set this up here. + */ + minusinfoff = InvalidOffsetNumber; + if (!isleaf) + minusinfoff = afterrightoff; + + /* + * Now transfer all the data items (non-pivot tuples in isleaf case, or + * additional pivot tuples in !isleaf case) to the appropriate page. + * + * Note: we *must* insert at least the right page's items in item-number + * order, for the benefit of _bt_restore_page(). + */ + for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i)) + { + IndexTuple dataitem; + + itemid = PageGetItemId(origpage, i); + itemsz = ItemIdGetLength(itemid); + dataitem = (IndexTuple) PageGetItem(origpage, itemid); + + /* replace original item with nposting due to posting split? */ + if (i == origpagepostingoff) + { + Assert(BTreeTupleIsPosting(dataitem)); + Assert(itemsz == MAXALIGN(IndexTupleSize(nposting))); + dataitem = nposting; + } + + /* does new item belong before this one? 
*/ + else if (i == newitemoff) + { + if (newitemonleft) + { + Assert(newitemoff <= firstrightoff); + if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff, + false)) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add new item to the left sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + afterleftoff = OffsetNumberNext(afterleftoff); + } + else + { + Assert(newitemoff >= firstrightoff); + if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff, + afterrightoff == minusinfoff)) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add new item to the right sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + afterrightoff = OffsetNumberNext(afterrightoff); + } + } + + /* decide which page to put it on */ + if (i < firstrightoff) + { + if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false)) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add old item to the left sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + afterleftoff = OffsetNumberNext(afterleftoff); + } + else + { + if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff, + afterrightoff == minusinfoff)) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add old item to the right sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + afterrightoff = OffsetNumberNext(afterrightoff); + } + } + + /* Handle case where newitem goes at the end of rightpage */ + if (i <= newitemoff) + { + /* + * Can't have newitemonleft here; that would imply we were told to put + * *everything* on the left page, which cannot fit (if it could, we'd + * not be splitting the page). + */ + Assert(!newitemonleft && newitemoff == maxoff + 1); + if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff, + afterrightoff == minusinfoff)) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add new item to the right sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + afterrightoff = OffsetNumberNext(afterrightoff); + } + + /* + * We have to grab the original right sibling (if any) and update its prev + * link. We are guaranteed that this is deadlock-free, since we couple + * the locks in the standard order: left to right. + */ + if (!isrightmost) + { + sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE); + spage = BufferGetPage(sbuf); + sopaque = (BTPageOpaque) PageGetSpecialPointer(spage); + if (sopaque->btpo_prev != origpagenumber) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("right sibling's left-link doesn't match: " + "block %u links to %u instead of expected %u in index \"%s\"", + oopaque->btpo_next, sopaque->btpo_prev, origpagenumber, + RelationGetRelationName(rel)))); + } + + /* + * Check to see if we can set the SPLIT_END flag in the right-hand + * split page; this can save some I/O for vacuum since it need not + * proceed to the right sibling. We can set the flag if the right + * sibling has a different cycleid: that means it could not be part of + * a group of pages that were all split off from the same ancestor + * page. 
If you're confused, imagine that page A splits to A B and + * then again, yielding A C B, while vacuum is in progress. Tuples + * originally in A could now be in either B or C, hence vacuum must + * examine both pages. But if D, our right sibling, has a different + * cycleid then it could not contain any tuples that were in A when + * the vacuum started. + */ + if (sopaque->btpo_cycleid != ropaque->btpo_cycleid) + ropaque->btpo_flags |= BTP_SPLIT_END; + } + + /* + * Right sibling is locked, new siblings are prepared, but original page + * is not updated yet. + * + * NO EREPORT(ERROR) till right sibling is updated. We can get away with + * not starting the critical section till here because we haven't been + * scribbling on the original page yet; see comments above. + */ + START_CRIT_SECTION(); + + /* + * By here, the original data page has been split into two new halves, and + * these are correct. The algorithm requires that the left page never + * move during a split, so we copy the new left page back on top of the + * original. We need to do this before writing the WAL record, so that + * XLogInsert can WAL log an image of the page if necessary. + */ + PageRestoreTempPage(leftpage, origpage); + /* leftpage, lopaque must not be used below here */ + + MarkBufferDirty(buf); + MarkBufferDirty(rbuf); + + if (!isrightmost) + { + sopaque->btpo_prev = rightpagenumber; + MarkBufferDirty(sbuf); + } + + /* + * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes + * a split + */ + if (!isleaf) + { + Page cpage = BufferGetPage(cbuf); + BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage); + + cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; + MarkBufferDirty(cbuf); + } + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_btree_split xlrec; + uint8 xlinfo; + XLogRecPtr recptr; + + xlrec.level = ropaque->btpo_level; + /* See comments below on newitem, orignewitem, and posting lists */ + xlrec.firstrightoff = firstrightoff; + xlrec.newitemoff = newitemoff; + xlrec.postingoff = 0; + if (postingoff != 0 && origpagepostingoff < firstrightoff) + xlrec.postingoff = postingoff; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit); + + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT); + /* Log original right sibling, since we've changed its prev-pointer */ + if (!isrightmost) + XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD); + if (!isleaf) + XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD); + + /* + * Log the new item, if it was inserted on the left page. (If it was + * put on the right page, we don't need to explicitly WAL log it + * because it's included with all the other items on the right page.) + * Show the new item as belonging to the left page buffer, so that it + * is not stored if XLogInsert decides it needs a full-page image of + * the left page. We always store newitemoff in the record, though. + * + * The details are sometimes slightly different for page splits that + * coincide with a posting list split. If both the replacement + * posting list and newitem go on the right page, then we don't need + * to log anything extra, just like the simple !newitemonleft + * no-posting-split case (postingoff is set to zero in the WAL record, + * so recovery doesn't need to process a posting list split at all). + * Otherwise, we set postingoff and log orignewitem instead of + * newitem, despite having actually inserted newitem. REDO routine + * must reconstruct nposting and newitem using _bt_swap_posting(). 
+ * + * Note: It's possible that our page split point is the point that + * makes the posting list lastleft and newitem firstright. This is + * the only case where we log orignewitem/newitem despite newitem + * going on the right page. If XLogInsert decides that it can omit + * orignewitem due to logging a full-page image of the left page, + * everything still works out, since recovery only needs to log + * orignewitem for items on the left page (just like the regular + * newitem-logged case). + */ + if (newitemonleft && xlrec.postingoff == 0) + XLogRegisterBufData(0, (char *) newitem, newitemsz); + else if (xlrec.postingoff != 0) + { + Assert(isleaf); + Assert(newitemonleft || firstrightoff == newitemoff); + Assert(newitemsz == IndexTupleSize(orignewitem)); + XLogRegisterBufData(0, (char *) orignewitem, newitemsz); + } + + /* Log the left page's new high key */ + if (!isleaf) + { + /* lefthighkey isn't local copy, get current pointer */ + itemid = PageGetItemId(origpage, P_HIKEY); + lefthighkey = (IndexTuple) PageGetItem(origpage, itemid); + } + XLogRegisterBufData(0, (char *) lefthighkey, + MAXALIGN(IndexTupleSize(lefthighkey))); + + /* + * Log the contents of the right page in the format understood by + * _bt_restore_page(). The whole right page will be recreated. + * + * Direct access to page is not good but faster - we should implement + * some new func in page API. Note we only store the tuples + * themselves, knowing that they were inserted in item-number order + * and so the line pointers can be reconstructed. See comments for + * _bt_restore_page(). + */ + XLogRegisterBufData(1, + (char *) rightpage + ((PageHeader) rightpage)->pd_upper, + ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper); + + xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R; + recptr = XLogInsert(RM_BTREE_ID, xlinfo); + + PageSetLSN(origpage, recptr); + PageSetLSN(rightpage, recptr); + if (!isrightmost) + PageSetLSN(spage, recptr); + if (!isleaf) + PageSetLSN(BufferGetPage(cbuf), recptr); + } + + END_CRIT_SECTION(); + + /* release the old right sibling */ + if (!isrightmost) + _bt_relbuf(rel, sbuf); + + /* release the child */ + if (!isleaf) + _bt_relbuf(rel, cbuf); + + /* be tidy */ + if (isleaf) + pfree(lefthighkey); + + /* split's done */ + return rbuf; +} + +/* + * _bt_insert_parent() -- Insert downlink into parent, completing split. + * + * On entry, buf and rbuf are the left and right split pages, which we + * still hold write locks on. Both locks will be released here. We + * release the rbuf lock once we have a write lock on the page that we + * intend to insert a downlink to rbuf on (i.e. buf's current parent page). + * The lock on buf is released at the same point as the lock on the parent + * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same + * atomic operation that completes the split by inserting a new downlink. + * + * stack - stack showing how we got here. Will be NULL when splitting true + * root, or during concurrent root split, where we can be inefficient + * isroot - we split the true root + * isonly - we split a page alone on its level (might have been fast root) + */ +static void +_bt_insert_parent(Relation rel, + Buffer buf, + Buffer rbuf, + BTStack stack, + bool isroot, + bool isonly) +{ + /* + * Here we have to do something Lehman and Yao don't talk about: deal with + * a root split and construction of a new root. 
If our stack is empty + * then we have just split a node on what had been the root level when we + * descended the tree. If it was still the root then we perform a + * new-root construction. If it *wasn't* the root anymore, search to find + * the next higher level that someone constructed meanwhile, and find the + * right place to insert as for the normal case. + * + * If we have to search for the parent level, we do so by re-descending + * from the root. This is not super-efficient, but it's rare enough not + * to matter. + */ + if (isroot) + { + Buffer rootbuf; + + Assert(stack == NULL); + Assert(isonly); + /* create a new root node and update the metapage */ + rootbuf = _bt_newroot(rel, buf, rbuf); + /* release the split buffers */ + _bt_relbuf(rel, rootbuf); + _bt_relbuf(rel, rbuf); + _bt_relbuf(rel, buf); + } + else + { + BlockNumber bknum = BufferGetBlockNumber(buf); + BlockNumber rbknum = BufferGetBlockNumber(rbuf); + Page page = BufferGetPage(buf); + IndexTuple new_item; + BTStackData fakestack; + IndexTuple ritem; + Buffer pbuf; + + if (stack == NULL) + { + BTPageOpaque opaque; + + elog(DEBUG2, "concurrent ROOT page split"); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* + * We should never reach here when a leaf page split takes place + * despite the insert of newitem being able to apply the fastpath + * optimization. Make sure of that with an assertion. + * + * This is more of a performance issue than a correctness issue. + * The fastpath won't have a descent stack. Using a phony stack + * here works, but never rely on that. The fastpath should be + * rejected within _bt_search_insert() when the rightmost leaf + * page will split, since it's faster to go through _bt_search() + * and get a stack in the usual way. + */ + Assert(!(P_ISLEAF(opaque) && + BlockNumberIsValid(RelationGetTargetBlock(rel)))); + + /* Find the leftmost page at the next level up */ + pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL); + /* Set up a phony stack entry pointing there */ + stack = &fakestack; + stack->bts_blkno = BufferGetBlockNumber(pbuf); + stack->bts_offset = InvalidOffsetNumber; + stack->bts_parent = NULL; + _bt_relbuf(rel, pbuf); + } + + /* get high key from left, a strict lower bound for new right page */ + ritem = (IndexTuple) PageGetItem(page, + PageGetItemId(page, P_HIKEY)); + + /* form an index tuple that points at the new right page */ + new_item = CopyIndexTuple(ritem); + BTreeTupleSetDownLink(new_item, rbknum); + + /* + * Re-find and write lock the parent of buf. + * + * It's possible that the location of buf's downlink has changed since + * our initial _bt_search() descent. _bt_getstackbuf() will detect + * and recover from this, updating the stack, which ensures that the + * new downlink will be inserted at the correct offset. Even buf's + * parent may have changed. + */ + pbuf = _bt_getstackbuf(rel, stack, bknum); + + /* + * Unlock the right child. The left child will be unlocked in + * _bt_insertonpg(). + * + * Unlocking the right child must be delayed until here to ensure that + * no concurrent VACUUM operation can become confused. Page deletion + * cannot be allowed to fail to re-find a downlink for the rbuf page. + * (Actually, this is just a vestige of how things used to work. The + * page deletion code is expected to check for the INCOMPLETE_SPLIT + * flag on the left child. It won't attempt deletion of the right + * child until the split is complete. Despite all this, we opt to + * conservatively delay unlocking the right child until here.) 
+ */ + _bt_relbuf(rel, rbuf); + + if (pbuf == InvalidBuffer) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u", + RelationGetRelationName(rel), bknum, rbknum))); + + /* Recursively insert into the parent */ + _bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent, + new_item, MAXALIGN(IndexTupleSize(new_item)), + stack->bts_offset + 1, 0, isonly); + + /* be tidy */ + pfree(new_item); + } +} + +/* + * _bt_finish_split() -- Finish an incomplete split + * + * A crash or other failure can leave a split incomplete. The insertion + * routines won't allow to insert on a page that is incompletely split. + * Before inserting on such a page, call _bt_finish_split(). + * + * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked + * and unpinned. + */ +void +_bt_finish_split(Relation rel, Buffer lbuf, BTStack stack) +{ + Page lpage = BufferGetPage(lbuf); + BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage); + Buffer rbuf; + Page rpage; + BTPageOpaque rpageop; + bool wasroot; + bool wasonly; + + Assert(P_INCOMPLETE_SPLIT(lpageop)); + + /* Lock right sibling, the one missing the downlink */ + rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE); + rpage = BufferGetPage(rbuf); + rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage); + + /* Could this be a root split? */ + if (!stack) + { + Buffer metabuf; + Page metapg; + BTMetaPageData *metad; + + /* acquire lock on the metapage */ + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + metapg = BufferGetPage(metabuf); + metad = BTPageGetMeta(metapg); + + wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf)); + + _bt_relbuf(rel, metabuf); + } + else + wasroot = false; + + /* Was this the only page on the level before split? */ + wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop)); + + elog(DEBUG1, "finishing incomplete split of %u/%u", + BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf)); + + _bt_insert_parent(rel, lbuf, rbuf, stack, wasroot, wasonly); +} + +/* + * _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot + * tuple whose downlink points to child page. + * + * Caller passes child's block number, which is used to identify + * associated pivot tuple in parent page using a linear search that + * matches on pivot's downlink/block number. The expected location of + * the pivot tuple is taken from the stack one level above the child + * page. This is used as a starting point. Insertions into the + * parent level could cause the pivot tuple to move right; deletions + * could cause it to move left, but not left of the page we previously + * found it on. + * + * Caller can use its stack to relocate the pivot tuple/downlink for + * any same-level page to the right of the page found by its initial + * descent. This is necessary because of the possibility that caller + * moved right to recover from a concurrent page split. It's also + * convenient for certain callers to be able to step right when there + * wasn't a concurrent page split, while still using their original + * stack. For example, the checkingunique _bt_doinsert() case may + * have to step right when there are many physical duplicates, and its + * scantid forces an insertion to the right of the "first page the + * value could be on". (This is also relied on by all of our callers + * when dealing with !heapkeyspace indexes.) + * + * Returns write-locked parent page buffer, or InvalidBuffer if pivot + * tuple not found (should not happen). 
Adjusts bts_blkno & + * bts_offset if changed. Page split caller should insert its new + * pivot tuple for its new right sibling page on parent page, at the + * offset number bts_offset + 1. + */ +Buffer +_bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child) +{ + BlockNumber blkno; + OffsetNumber start; + + blkno = stack->bts_blkno; + start = stack->bts_offset; + + for (;;) + { + Buffer buf; + Page page; + BTPageOpaque opaque; + + buf = _bt_getbuf(rel, blkno, BT_WRITE); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + if (P_INCOMPLETE_SPLIT(opaque)) + { + _bt_finish_split(rel, buf, stack->bts_parent); + continue; + } + + if (!P_IGNORE(opaque)) + { + OffsetNumber offnum, + minoff, + maxoff; + ItemId itemid; + IndexTuple item; + + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + + /* + * start = InvalidOffsetNumber means "search the whole page". We + * need this test anyway due to possibility that page has a high + * key now when it didn't before. + */ + if (start < minoff) + start = minoff; + + /* + * Need this check too, to guard against possibility that page + * split since we visited it originally. + */ + if (start > maxoff) + start = OffsetNumberNext(maxoff); + + /* + * These loops will check every item on the page --- but in an + * order that's attuned to the probability of where it actually + * is. Scan to the right first, then to the left. + */ + for (offnum = start; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + itemid = PageGetItemId(page, offnum); + item = (IndexTuple) PageGetItem(page, itemid); + + if (BTreeTupleGetDownLink(item) == child) + { + /* Return accurate pointer to where link is now */ + stack->bts_blkno = blkno; + stack->bts_offset = offnum; + return buf; + } + } + + for (offnum = OffsetNumberPrev(start); + offnum >= minoff; + offnum = OffsetNumberPrev(offnum)) + { + itemid = PageGetItemId(page, offnum); + item = (IndexTuple) PageGetItem(page, itemid); + + if (BTreeTupleGetDownLink(item) == child) + { + /* Return accurate pointer to where link is now */ + stack->bts_blkno = blkno; + stack->bts_offset = offnum; + return buf; + } + } + } + + /* + * The item we're looking for moved right at least one page. + * + * Lehman and Yao couple/chain locks when moving right here, which we + * can avoid. See nbtree/README. + */ + if (P_RIGHTMOST(opaque)) + { + _bt_relbuf(rel, buf); + return InvalidBuffer; + } + blkno = opaque->btpo_next; + start = InvalidOffsetNumber; + _bt_relbuf(rel, buf); + } +} + +/* + * _bt_newroot() -- Create a new root page for the index. + * + * We've just split the old root page and need to create a new one. + * In order to do this, we add a new root page to the file, then lock + * the metadata page and update it. This is guaranteed to be deadlock- + * free, because all readers release their locks on the metadata page + * before trying to lock the root, and all writers lock the root before + * trying to lock the metadata page. We have a write lock on the old + * root page, so we have not introduced any cycles into the waits-for + * graph. + * + * On entry, lbuf (the old root) and rbuf (its new peer) are write- + * locked. On exit, a new root page exists with entries for the + * two new children, metapage is updated and unlocked/unpinned. + * The new root buffer is returned to caller which has to unlock/unpin + * lbuf, rbuf & rootbuf. 
+ */ +static Buffer +_bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) +{ + Buffer rootbuf; + Page lpage, + rootpage; + BlockNumber lbkno, + rbkno; + BlockNumber rootblknum; + BTPageOpaque rootopaque; + BTPageOpaque lopaque; + ItemId itemid; + IndexTuple item; + IndexTuple left_item; + Size left_item_sz; + IndexTuple right_item; + Size right_item_sz; + Buffer metabuf; + Page metapg; + BTMetaPageData *metad; + + lbkno = BufferGetBlockNumber(lbuf); + rbkno = BufferGetBlockNumber(rbuf); + lpage = BufferGetPage(lbuf); + lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage); + + /* get a new root page */ + rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); + rootpage = BufferGetPage(rootbuf); + rootblknum = BufferGetBlockNumber(rootbuf); + + /* acquire lock on the metapage */ + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + metapg = BufferGetPage(metabuf); + metad = BTPageGetMeta(metapg); + + /* + * Create downlink item for left page (old root). The key value used is + * "minus infinity", a sentinel value that's reliably less than any real + * key value that could appear in the left page. + */ + left_item_sz = sizeof(IndexTupleData); + left_item = (IndexTuple) palloc(left_item_sz); + left_item->t_info = left_item_sz; + BTreeTupleSetDownLink(left_item, lbkno); + BTreeTupleSetNAtts(left_item, 0, false); + + /* + * Create downlink item for right page. The key for it is obtained from + * the "high key" position in the left page. + */ + itemid = PageGetItemId(lpage, P_HIKEY); + right_item_sz = ItemIdGetLength(itemid); + item = (IndexTuple) PageGetItem(lpage, itemid); + right_item = CopyIndexTuple(item); + BTreeTupleSetDownLink(right_item, rbkno); + + /* NO EREPORT(ERROR) from here till newroot op is logged */ + START_CRIT_SECTION(); + + /* upgrade metapage if needed */ + if (metad->btm_version < BTREE_NOVAC_VERSION) + _bt_upgrademetapage(metapg); + + /* set btree special data */ + rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); + rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE; + rootopaque->btpo_flags = BTP_ROOT; + rootopaque->btpo_level = + ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo_level + 1; + rootopaque->btpo_cycleid = 0; + + /* update metapage data */ + metad->btm_root = rootblknum; + metad->btm_level = rootopaque->btpo_level; + metad->btm_fastroot = rootblknum; + metad->btm_fastlevel = rootopaque->btpo_level; + + /* + * Insert the left page pointer into the new root page. The root page is + * the rightmost page on its level so there is no "high key" in it; the + * two items will go into positions P_HIKEY and P_FIRSTKEY. + * + * Note: we *must* insert the two items in item-number order, for the + * benefit of _bt_restore_page(). + */ + Assert(BTreeTupleGetNAtts(left_item, rel) == 0); + if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY, + false, false) == InvalidOffsetNumber) + elog(PANIC, "failed to add leftkey to new root page" + " while splitting block %u of index \"%s\"", + BufferGetBlockNumber(lbuf), RelationGetRelationName(rel)); + + /* + * insert the right page pointer into the new root page. 
+ */ + Assert(BTreeTupleGetNAtts(right_item, rel) > 0); + Assert(BTreeTupleGetNAtts(right_item, rel) <= + IndexRelationGetNumberOfKeyAttributes(rel)); + if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY, + false, false) == InvalidOffsetNumber) + elog(PANIC, "failed to add rightkey to new root page" + " while splitting block %u of index \"%s\"", + BufferGetBlockNumber(lbuf), RelationGetRelationName(rel)); + + /* Clear the incomplete-split flag in the left child */ + Assert(P_INCOMPLETE_SPLIT(lopaque)); + lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; + MarkBufferDirty(lbuf); + + MarkBufferDirty(rootbuf); + MarkBufferDirty(metabuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_btree_newroot xlrec; + XLogRecPtr recptr; + xl_btree_metadata md; + + xlrec.rootblk = rootblknum; + xlrec.level = metad->btm_level; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot); + + XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT); + XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD); + XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); + + Assert(metad->btm_version >= BTREE_NOVAC_VERSION); + md.version = metad->btm_version; + md.root = rootblknum; + md.level = metad->btm_level; + md.fastroot = rootblknum; + md.fastlevel = metad->btm_level; + md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages; + md.allequalimage = metad->btm_allequalimage; + + XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); + + /* + * Direct access to page is not good but faster - we should implement + * some new func in page API. + */ + XLogRegisterBufData(0, + (char *) rootpage + ((PageHeader) rootpage)->pd_upper, + ((PageHeader) rootpage)->pd_special - + ((PageHeader) rootpage)->pd_upper); + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT); + + PageSetLSN(lpage, recptr); + PageSetLSN(rootpage, recptr); + PageSetLSN(metapg, recptr); + } + + END_CRIT_SECTION(); + + /* done with metapage */ + _bt_relbuf(rel, metabuf); + + pfree(left_item); + pfree(right_item); + + return rootbuf; +} + +/* + * _bt_pgaddtup() -- add a data item to a particular page during split. + * + * The difference between this routine and a bare PageAddItem call is + * that this code can deal with the first data item on an internal btree + * page in passing. This data item (which is called "firstright" within + * _bt_split()) has a key that must be treated as minus infinity after + * the split. Therefore, we truncate away all attributes when caller + * specifies it's the first data item on page (downlink is not changed, + * though). This extra step is only needed for the right page of an + * internal page split. There is no need to do this for the first data + * item on the existing/left page, since that will already have been + * truncated during an earlier page split. + * + * See _bt_split() for a high level explanation of why we truncate here. + * Note that this routine has nothing to do with suffix truncation, + * despite using some of the same infrastructure. 
+ */ +static inline bool +_bt_pgaddtup(Page page, + Size itemsize, + IndexTuple itup, + OffsetNumber itup_off, + bool newfirstdataitem) +{ + IndexTupleData trunctuple; + + if (newfirstdataitem) + { + trunctuple = *itup; + trunctuple.t_info = sizeof(IndexTupleData); + BTreeTupleSetNAtts(&trunctuple, 0, false); + itup = &trunctuple; + itemsize = sizeof(IndexTupleData); + } + + if (unlikely(PageAddItem(page, (Item) itup, itemsize, itup_off, false, + false) == InvalidOffsetNumber)) + return false; + + return true; +} + +/* + * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split. + * + * There are three operations performed here: simple index deletion, bottom-up + * index deletion, and deduplication. If all three operations fail to free + * enough space for the incoming item then caller will go on to split the + * page. We always consider simple deletion first. If that doesn't work out + * we consider alternatives. Callers that only want us to consider simple + * deletion (without any fallback) ask for that using the 'simpleonly' + * argument. + * + * We usually pick only one alternative "complex" operation when simple + * deletion alone won't prevent a page split. The 'checkingunique', + * 'uniquedup', and 'indexUnchanged' arguments are used for that. + * + * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page + * level flag was found set. The flag was useful back when there wasn't + * necessarily one single page for a duplicate tuple to go on (before heap TID + * became a part of the key space in version 4 indexes). But we don't + * actually look at the flag anymore (it's not a gating condition for our + * caller). That would cause us to miss tuples that are safe to delete, + * without getting any benefit in return. We know that the alternative is to + * split the page; scanning the line pointer array in passing won't have + * noticeable overhead. (We still maintain the BTP_HAS_GARBAGE flag despite + * all this because !heapkeyspace indexes must still do a "getting tired" + * linear search, and so are likely to get some benefit from using it as a + * gating condition.) + */ +static void +_bt_delete_or_dedup_one_page(Relation rel, Relation heapRel, + BTInsertState insertstate, + bool simpleonly, bool checkingunique, + bool uniquedup, bool indexUnchanged) +{ + OffsetNumber deletable[MaxIndexTuplesPerPage]; + int ndeletable = 0; + OffsetNumber offnum, + minoff, + maxoff; + Buffer buffer = insertstate->buf; + BTScanInsert itup_key = insertstate->itup_key; + Page page = BufferGetPage(buffer); + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + Assert(P_ISLEAF(opaque)); + Assert(simpleonly || itup_key->heapkeyspace); + Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged)); + + /* + * Scan over all items to see which ones need to be deleted according to + * LP_DEAD flags. We'll usually manage to delete a few extra items that + * are not marked LP_DEAD in passing. Often the extra items that actually + * end up getting deleted are items that would have had their LP_DEAD bit + * set before long anyway (if we opted not to include them as extras). 
+ */ + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemId = PageGetItemId(page, offnum); + + if (ItemIdIsDead(itemId)) + deletable[ndeletable++] = offnum; + } + + if (ndeletable > 0) + { + _bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable, + insertstate->itup, minoff, maxoff); + insertstate->bounds_valid = false; + + /* Return when a page split has already been avoided */ + if (PageGetFreeSpace(page) >= insertstate->itemsz) + return; + + /* Might as well assume duplicates (if checkingunique) */ + uniquedup = true; + } + + /* + * We're done with simple deletion. Return early with callers that only + * call here so that simple deletion can be considered. This includes + * callers that explicitly ask for this and checkingunique callers that + * probably don't have any version churn duplicates on the page. + * + * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we + * return at this point (or when we go on the try either or both of our + * other strategies and they also fail). We do not bother expending a + * separate write to clear it, however. Caller will definitely clear it + * when it goes on to split the page (note also that the deduplication + * process will clear the flag in passing, just to keep things tidy). + */ + if (simpleonly || (checkingunique && !uniquedup)) + { + Assert(!indexUnchanged); + return; + } + + /* Assume bounds about to be invalidated (this is almost certain now) */ + insertstate->bounds_valid = false; + + /* + * Perform bottom-up index deletion pass when executor hint indicated that + * incoming item is logically unchanged, or for a unique index that is + * known to have physical duplicates for some other reason. (There is a + * large overlap between these two cases for a unique index. It's worth + * having both triggering conditions in order to apply the optimization in + * the event of successive related INSERT and DELETE statements.) + * + * We'll go on to do a deduplication pass when a bottom-up pass fails to + * delete an acceptable amount of free space (a significant fraction of + * the page, or space for the new item, whichever is greater). + * + * Note: Bottom-up index deletion uses the same equality/equivalence + * routines as deduplication internally. However, it does not merge + * together index tuples, so the same correctness considerations do not + * apply. We deliberately omit an index-is-allequalimage test here. + */ + if ((indexUnchanged || uniquedup) && + _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz)) + return; + + /* Perform deduplication pass (when enabled and index-is-allequalimage) */ + if (BTGetDeduplicateItems(rel) && itup_key->allequalimage) + _bt_dedup_pass(rel, buffer, heapRel, insertstate->itup, + insertstate->itemsz, (indexUnchanged || uniquedup)); +} + +/* + * _bt_simpledel_pass - Simple index tuple deletion pass. + * + * We delete all LP_DEAD-set index tuples on a leaf page. The offset numbers + * of all such tuples are determined by caller (caller passes these to us as + * its 'deletable' argument). + * + * We might also delete extra index tuples that turn out to be safe to delete + * in passing (though they must be cheap to check in passing to begin with). + * There is no certainty that any extra tuples will be deleted, though. 
The + * high level goal of the approach we take is to get the most out of each call + * here (without noticeably increasing the per-call overhead compared to what + * we need to do just to be able to delete the page's LP_DEAD-marked index + * tuples). + * + * The number of extra index tuples that turn out to be deletable might + * greatly exceed the number of LP_DEAD-marked index tuples due to various + * locality related effects. For example, it's possible that the total number + * of table blocks (pointed to by all TIDs on the leaf page) is naturally + * quite low, in which case we might end up checking if it's possible to + * delete _most_ index tuples on the page (without the tableam needing to + * access additional table blocks). The tableam will sometimes stumble upon + * _many_ extra deletable index tuples in indexes where this pattern is + * common. + * + * See nbtree/README for further details on simple index tuple deletion. + */ +static void +_bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel, + OffsetNumber *deletable, int ndeletable, IndexTuple newitem, + OffsetNumber minoff, OffsetNumber maxoff) +{ + Page page = BufferGetPage(buffer); + BlockNumber *deadblocks; + int ndeadblocks; + TM_IndexDeleteOp delstate; + OffsetNumber offnum; + + /* Get array of table blocks pointed to by LP_DEAD-set tuples */ + deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem, + &ndeadblocks); + + /* Initialize tableam state that describes index deletion operation */ + delstate.bottomup = false; + delstate.bottomupfreespace = 0; + delstate.ndeltids = 0; + delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete)); + delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus)); + + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids]; + TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids]; + BlockNumber tidblock; + void *match; + + if (!BTreeTupleIsPosting(itup)) + { + tidblock = ItemPointerGetBlockNumber(&itup->t_tid); + match = bsearch(&tidblock, deadblocks, ndeadblocks, + sizeof(BlockNumber), _bt_blk_cmp); + + if (!match) + { + Assert(!ItemIdIsDead(itemid)); + continue; + } + + /* + * TID's table block is among those pointed to by the TIDs from + * LP_DEAD-bit set tuples on page -- add TID to deltids + */ + odeltid->tid = itup->t_tid; + odeltid->id = delstate.ndeltids; + ostatus->idxoffnum = offnum; + ostatus->knowndeletable = ItemIdIsDead(itemid); + ostatus->promising = false; /* unused */ + ostatus->freespace = 0; /* unused */ + + delstate.ndeltids++; + } + else + { + int nitem = BTreeTupleGetNPosting(itup); + + for (int p = 0; p < nitem; p++) + { + ItemPointer tid = BTreeTupleGetPostingN(itup, p); + + tidblock = ItemPointerGetBlockNumber(tid); + match = bsearch(&tidblock, deadblocks, ndeadblocks, + sizeof(BlockNumber), _bt_blk_cmp); + + if (!match) + { + Assert(!ItemIdIsDead(itemid)); + continue; + } + + /* + * TID's table block is among those pointed to by the TIDs + * from LP_DEAD-bit set tuples on page -- add TID to deltids + */ + odeltid->tid = *tid; + odeltid->id = delstate.ndeltids; + ostatus->idxoffnum = offnum; + ostatus->knowndeletable = ItemIdIsDead(itemid); + ostatus->promising = false; /* unused */ + ostatus->freespace = 0; /* unused */ + + odeltid++; + ostatus++; + delstate.ndeltids++; + } + } + } + + pfree(deadblocks); + + 
Assert(delstate.ndeltids >= ndeletable); + + /* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */ + _bt_delitems_delete_check(rel, buffer, heapRel, &delstate); + + pfree(delstate.deltids); + pfree(delstate.status); +} + +/* + * _bt_deadblocks() -- Get LP_DEAD related table blocks. + * + * Builds sorted and unique-ified array of table block numbers from index + * tuple TIDs whose line pointers are marked LP_DEAD. Also adds the table + * block from incoming newitem just in case it isn't among the LP_DEAD-related + * table blocks. + * + * Always counting the newitem's table block as an LP_DEAD related block makes + * sense because the cost is consistently low; it is practically certain that + * the table block will not incur a buffer miss in tableam. On the other hand + * the benefit is often quite high. There is a decent chance that there will + * be some deletable items from this block, since in general most garbage + * tuples became garbage in the recent past (in many cases this won't be the + * first logical row that core code added to/modified in table block + * recently). + * + * Returns final array, and sets *nblocks to its final size for caller. + */ +static BlockNumber * +_bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable, + IndexTuple newitem, int *nblocks) +{ + int spacentids, + ntids; + BlockNumber *tidblocks; + + /* + * Accumulate each TID's block in array whose initial size has space for + * one table block per LP_DEAD-set tuple (plus space for the newitem table + * block). Array will only need to grow when there are LP_DEAD-marked + * posting list tuples (which is not that common). + */ + spacentids = ndeletable + 1; + ntids = 0; + tidblocks = (BlockNumber *) palloc(sizeof(BlockNumber) * spacentids); + + /* + * First add the table block for the incoming newitem. This is the one + * case where simple deletion can visit a table block that doesn't have + * any known deletable items. 
+ */ + Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem)); + tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid); + + for (int i = 0; i < ndeletable; i++) + { + ItemId itemid = PageGetItemId(page, deletable[i]); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + + Assert(ItemIdIsDead(itemid)); + + if (!BTreeTupleIsPosting(itup)) + { + if (ntids + 1 > spacentids) + { + spacentids *= 2; + tidblocks = (BlockNumber *) + repalloc(tidblocks, sizeof(BlockNumber) * spacentids); + } + + tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid); + } + else + { + int nposting = BTreeTupleGetNPosting(itup); + + if (ntids + nposting > spacentids) + { + spacentids = Max(spacentids * 2, ntids + nposting); + tidblocks = (BlockNumber *) + repalloc(tidblocks, sizeof(BlockNumber) * spacentids); + } + + for (int j = 0; j < nposting; j++) + { + ItemPointer tid = BTreeTupleGetPostingN(itup, j); + + tidblocks[ntids++] = ItemPointerGetBlockNumber(tid); + } + } + } + + qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp); + *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp); + + return tidblocks; +} + +/* + * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass + */ +static inline int +_bt_blk_cmp(const void *arg1, const void *arg2) +{ + BlockNumber b1 = *((BlockNumber *) arg1); + BlockNumber b2 = *((BlockNumber *) arg2); + + if (b1 < b2) + return -1; + else if (b1 > b2) + return 1; + + return 0; +} diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c new file mode 100644 index 0000000..ebec8fa --- /dev/null +++ b/src/backend/access/nbtree/nbtpage.c @@ -0,0 +1,3073 @@ +/*------------------------------------------------------------------------- + * + * nbtpage.c + * BTree-specific page management code for the Postgres btree access + * method. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtpage.c + * + * NOTES + * Postgres btree pages look like ordinary relation pages. The opaque + * data at high addresses includes pointers to left and right siblings + * and flag data describing page state. The first page in a btree, page + * zero, is special -- it stores meta-information describing the tree. + * Pages one and higher store the actual tree data. 
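The qsort()/qunique() step at the end of _bt_deadblocks() reduces the accumulated block numbers to a sorted, duplicate-free array. A minimal standalone sketch of that reduction, using only the C standard library and an illustrative unique_blocks() helper in place of qunique():

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef uint32_t blkno_t;       /* stand-in for BlockNumber */

static int
blk_cmp(const void *arg1, const void *arg2)
{
    blkno_t b1 = *(const blkno_t *) arg1;
    blkno_t b2 = *(const blkno_t *) arg2;

    return (b1 > b2) - (b1 < b2);
}

/* Collapse adjacent duplicates in a sorted array; returns new length */
static size_t
unique_blocks(blkno_t *a, size_t n)
{
    size_t out = 0;

    for (size_t i = 0; i < n; i++)
    {
        if (out == 0 || a[out - 1] != a[i])
            a[out++] = a[i];
    }
    return out;
}

int
main(void)
{
    blkno_t blocks[] = {42, 7, 42, 7, 13};  /* one entry per accumulated TID */
    size_t  n = sizeof(blocks) / sizeof(blocks[0]);

    qsort(blocks, n, sizeof(blkno_t), blk_cmp);
    n = unique_blocks(blocks, n);

    for (size_t i = 0; i < n; i++)
        printf("%u ", (unsigned) blocks[i]);
    printf("\n");                           /* prints: 7 13 42 */
    return 0;
}

The spacentids doubling earlier in the function is the usual amortized array-growth idiom; the resulting array is what _bt_simpledel_pass() probes with bsearch().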
+ * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/nbtxlog.h" +#include "access/tableam.h" +#include "access/transam.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "miscadmin.h" +#include "storage/indexfsm.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "storage/procarray.h" +#include "utils/memdebug.h" +#include "utils/memutils.h" +#include "utils/snapmgr.h" + +static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf); +static void _bt_log_reuse_page(Relation rel, BlockNumber blkno, + FullTransactionId safexid); +static void _bt_delitems_delete(Relation rel, Buffer buf, + TransactionId latestRemovedXid, + OffsetNumber *deletable, int ndeletable, + BTVacuumPosting *updatable, int nupdatable); +static char *_bt_delitems_update(BTVacuumPosting *updatable, int nupdatable, + OffsetNumber *updatedoffsets, + Size *updatedbuflen, bool needswal); +static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, + BTStack stack); +static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, + BlockNumber scanblkno, + bool *rightsib_empty, + BTVacState *vstate); +static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child, + BTStack stack, + Buffer *subtreeparent, + OffsetNumber *poffset, + BlockNumber *topparent, + BlockNumber *topparentrightsib); +static void _bt_pendingfsm_add(BTVacState *vstate, BlockNumber target, + FullTransactionId safexid); + +/* + * _bt_initmetapage() -- Fill a page buffer with a correct metapage image + */ +void +_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, + bool allequalimage) +{ + BTMetaPageData *metad; + BTPageOpaque metaopaque; + + _bt_pageinit(page, BLCKSZ); + + metad = BTPageGetMeta(page); + metad->btm_magic = BTREE_MAGIC; + metad->btm_version = BTREE_VERSION; + metad->btm_root = rootbknum; + metad->btm_level = level; + metad->btm_fastroot = rootbknum; + metad->btm_fastlevel = level; + metad->btm_last_cleanup_num_delpages = 0; + metad->btm_last_cleanup_num_heap_tuples = -1.0; + metad->btm_allequalimage = allequalimage; + + metaopaque = (BTPageOpaque) PageGetSpecialPointer(page); + metaopaque->btpo_flags = BTP_META; + + /* + * Set pd_lower just past the end of the metadata. This is essential, + * because without doing so, metadata will be lost if xlog.c compresses + * the page. + */ + ((PageHeader) page)->pd_lower = + ((char *) metad + sizeof(BTMetaPageData)) - (char *) page; +} + +/* + * _bt_upgrademetapage() -- Upgrade a meta-page from an old format to version + * 3, the last version that can be updated without broadly affecting + * on-disk compatibility. (A REINDEX is required to upgrade to v4.) + * + * This routine does purely in-memory image upgrade. Caller is + * responsible for locking, WAL-logging etc. 
+ */ +void +_bt_upgrademetapage(Page page) +{ + BTMetaPageData *metad; + BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY; + + metad = BTPageGetMeta(page); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* It must be really a meta page of upgradable version */ + Assert(metaopaque->btpo_flags & BTP_META); + Assert(metad->btm_version < BTREE_NOVAC_VERSION); + Assert(metad->btm_version >= BTREE_MIN_VERSION); + + /* Set version number and fill extra fields added into version 3 */ + metad->btm_version = BTREE_NOVAC_VERSION; + metad->btm_last_cleanup_num_delpages = 0; + metad->btm_last_cleanup_num_heap_tuples = -1.0; + /* Only a REINDEX can set this field */ + Assert(!metad->btm_allequalimage); + metad->btm_allequalimage = false; + + /* Adjust pd_lower (see _bt_initmetapage() for details) */ + ((PageHeader) page)->pd_lower = + ((char *) metad + sizeof(BTMetaPageData)) - (char *) page; +} + +/* + * Get metadata from share-locked buffer containing metapage, while performing + * standard sanity checks. + * + * Callers that cache data returned here in local cache should note that an + * on-the-fly upgrade using _bt_upgrademetapage() can change the version field + * and BTREE_NOVAC_VERSION specific fields without invalidating local cache. + */ +static BTMetaPageData * +_bt_getmeta(Relation rel, Buffer metabuf) +{ + Page metapg; + BTPageOpaque metaopaque; + BTMetaPageData *metad; + + metapg = BufferGetPage(metabuf); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); + metad = BTPageGetMeta(metapg); + + /* sanity-check the metapage */ + if (!P_ISMETA(metaopaque) || + metad->btm_magic != BTREE_MAGIC) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" is not a btree", + RelationGetRelationName(rel)))); + + if (metad->btm_version < BTREE_MIN_VERSION || + metad->btm_version > BTREE_VERSION) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("version mismatch in index \"%s\": file version %d, " + "current version %d, minimal supported version %d", + RelationGetRelationName(rel), + metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION))); + + return metad; +} + +/* + * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup + * + * Called by btvacuumcleanup when btbulkdelete was never called because no + * index tuples needed to be deleted. + */ +bool +_bt_vacuum_needs_cleanup(Relation rel) +{ + Buffer metabuf; + Page metapg; + BTMetaPageData *metad; + uint32 btm_version; + BlockNumber prev_num_delpages; + + /* + * Copy details from metapage to local variables quickly. + * + * Note that we deliberately avoid using cached version of metapage here. + */ + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metapg = BufferGetPage(metabuf); + metad = BTPageGetMeta(metapg); + btm_version = metad->btm_version; + + if (btm_version < BTREE_NOVAC_VERSION) + { + /* + * Metapage needs to be dynamically upgraded to store fields that are + * only present when btm_version >= BTREE_NOVAC_VERSION + */ + _bt_relbuf(rel, metabuf); + return true; + } + + prev_num_delpages = metad->btm_last_cleanup_num_delpages; + _bt_relbuf(rel, metabuf); + + /* + * Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the + * total size of the index. We can reasonably expect (though are not + * guaranteed) to be able to recycle this many pages if we decide to do a + * btvacuumscan call during the ongoing btvacuumcleanup. For further + * details see the nbtree/README section on placing deleted pages in the + * FSM. 
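The cleanup trigger described above (and applied just below) is plain integer arithmetic: recycle-eligible pages must exceed one twentieth of the index. A tiny sketch with illustrative names, assuming nothing beyond the C standard library:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t blkno_t;

/* Mirror of the threshold test: cleanup when deleted pages exceed 5% */
static bool
needs_cleanup(blkno_t prev_num_delpages, blkno_t total_index_blocks)
{
    return prev_num_delpages > 0 &&
           prev_num_delpages > total_index_blocks / 20;
}

int
main(void)
{
    printf("%d\n", needs_cleanup(10, 1000));    /* 0: 10 <= 50, skip */
    printf("%d\n", needs_cleanup(60, 1000));    /* 1: 60 > 50, trigger */
    return 0;
}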
+ */ + if (prev_num_delpages > 0 && + prev_num_delpages > RelationGetNumberOfBlocks(rel) / 20) + return true; + + return false; +} + +/* + * _bt_set_cleanup_info() -- Update metapage for btvacuumcleanup. + * + * Called at the end of btvacuumcleanup, when num_delpages value has been + * finalized. + */ +void +_bt_set_cleanup_info(Relation rel, BlockNumber num_delpages) +{ + Buffer metabuf; + Page metapg; + BTMetaPageData *metad; + + /* + * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage + * field started out as a TransactionId field called btm_oldest_btpo_xact. + * Both "versions" are just uint32 fields. It was convenient to repurpose + * the field when we began to use 64-bit XIDs in deleted pages. + * + * It's possible that a pg_upgrade'd database will contain an XID value in + * what is now recognized as the metapage's btm_last_cleanup_num_delpages + * field. _bt_vacuum_needs_cleanup() may even believe that this value + * indicates that there are lots of pages that it needs to recycle, when + * in reality there are only one or two. The worst that can happen is + * that there will be a call to btvacuumscan a little earlier, which will + * set btm_last_cleanup_num_delpages to a sane value when we're called. + * + * Note also that the metapage's btm_last_cleanup_num_heap_tuples field is + * no longer used as of PostgreSQL 14. We set it to -1.0 on rewrite, just + * to be consistent. + */ + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metapg = BufferGetPage(metabuf); + metad = BTPageGetMeta(metapg); + + /* Don't miss chance to upgrade index/metapage when BTREE_MIN_VERSION */ + if (metad->btm_version >= BTREE_NOVAC_VERSION && + metad->btm_last_cleanup_num_delpages == num_delpages) + { + /* Usually means index continues to have num_delpages of 0 */ + _bt_relbuf(rel, metabuf); + return; + } + + /* trade in our read lock for a write lock */ + _bt_unlockbuf(rel, metabuf); + _bt_lockbuf(rel, metabuf, BT_WRITE); + + START_CRIT_SECTION(); + + /* upgrade meta-page if needed */ + if (metad->btm_version < BTREE_NOVAC_VERSION) + _bt_upgrademetapage(metapg); + + /* update cleanup-related information */ + metad->btm_last_cleanup_num_delpages = num_delpages; + metad->btm_last_cleanup_num_heap_tuples = -1.0; + MarkBufferDirty(metabuf); + + /* write wal record if needed */ + if (RelationNeedsWAL(rel)) + { + xl_btree_metadata md; + XLogRecPtr recptr; + + XLogBeginInsert(); + XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); + + Assert(metad->btm_version >= BTREE_NOVAC_VERSION); + md.version = metad->btm_version; + md.root = metad->btm_root; + md.level = metad->btm_level; + md.fastroot = metad->btm_fastroot; + md.fastlevel = metad->btm_fastlevel; + md.last_cleanup_num_delpages = num_delpages; + md.allequalimage = metad->btm_allequalimage; + + XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata)); + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP); + + PageSetLSN(metapg, recptr); + } + + END_CRIT_SECTION(); + + _bt_relbuf(rel, metabuf); +} + +/* + * _bt_getroot() -- Get the root page of the btree. + * + * Since the root page can move around the btree file, we have to read + * its location from the metadata page, and then read the root page + * itself. If no root page exists yet, we have to create one. + * + * The access type parameter (BT_READ or BT_WRITE) controls whether + * a new root page will be created or not. If access = BT_READ, + * and no root page exists, we just return InvalidBuffer. 
For + * BT_WRITE, we try to create the root page if it doesn't exist. + * NOTE that the returned root page will have only a read lock set + * on it even if access = BT_WRITE! + * + * The returned page is not necessarily the true root --- it could be + * a "fast root" (a page that is alone in its level due to deletions). + * Also, if the root page is split while we are "in flight" to it, + * what we will return is the old root, which is now just the leftmost + * page on a probably-not-very-wide level. For most purposes this is + * as good as or better than the true root, so we do not bother to + * insist on finding the true root. We do, however, guarantee to + * return a live (not deleted or half-dead) page. + * + * On successful return, the root page is pinned and read-locked. + * The metadata page is not locked or pinned on exit. + */ +Buffer +_bt_getroot(Relation rel, int access) +{ + Buffer metabuf; + Buffer rootbuf; + Page rootpage; + BTPageOpaque rootopaque; + BlockNumber rootblkno; + uint32 rootlevel; + BTMetaPageData *metad; + + /* + * Try to use previously-cached metapage data to find the root. This + * normally saves one buffer access per index search, which is a very + * helpful savings in bufmgr traffic and hence contention. + */ + if (rel->rd_amcache != NULL) + { + metad = (BTMetaPageData *) rel->rd_amcache; + /* We shouldn't have cached it if any of these fail */ + Assert(metad->btm_magic == BTREE_MAGIC); + Assert(metad->btm_version >= BTREE_MIN_VERSION); + Assert(metad->btm_version <= BTREE_VERSION); + Assert(!metad->btm_allequalimage || + metad->btm_version > BTREE_NOVAC_VERSION); + Assert(metad->btm_root != P_NONE); + + rootblkno = metad->btm_fastroot; + Assert(rootblkno != P_NONE); + rootlevel = metad->btm_fastlevel; + + rootbuf = _bt_getbuf(rel, rootblkno, BT_READ); + rootpage = BufferGetPage(rootbuf); + rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); + + /* + * Since the cache might be stale, we check the page more carefully + * here than normal. We *must* check that it's not deleted. If it's + * not alone on its level, then we reject too --- this may be overly + * paranoid but better safe than sorry. Note we don't check P_ISROOT, + * because that's not set in a "fast root". + */ + if (!P_IGNORE(rootopaque) && + rootopaque->btpo_level == rootlevel && + P_LEFTMOST(rootopaque) && + P_RIGHTMOST(rootopaque)) + { + /* OK, accept cached page as the root */ + return rootbuf; + } + _bt_relbuf(rel, rootbuf); + /* Cache is stale, throw it away */ + if (rel->rd_amcache) + pfree(rel->rd_amcache); + rel->rd_amcache = NULL; + } + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metad = _bt_getmeta(rel, metabuf); + + /* if no root page initialized yet, do it */ + if (metad->btm_root == P_NONE) + { + Page metapg; + + /* If access = BT_READ, caller doesn't want us to create root yet */ + if (access == BT_READ) + { + _bt_relbuf(rel, metabuf); + return InvalidBuffer; + } + + /* trade in our read lock for a write lock */ + _bt_unlockbuf(rel, metabuf); + _bt_lockbuf(rel, metabuf, BT_WRITE); + + /* + * Race condition: if someone else initialized the metadata between + * the time we released the read lock and acquired the write lock, we + * must avoid doing it again. + */ + if (metad->btm_root != P_NONE) + { + /* + * Metadata initialized by someone else. In order to guarantee no + * deadlocks, we have to release the metadata page and start all + * over again. (Is that really true? But it's hardly worth trying + * to optimize this case.) 
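The metapage race handling above follows a common shape: check under a shared lock, release, reacquire exclusively, and re-check before doing the one-time work (the real code then releases the metapage and restarts via recursion rather than proceeding in place). A reduced sketch of that shape using a POSIX rwlock, with ensure_root() and root_initialized as illustrative stand-ins:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t meta_lock = PTHREAD_RWLOCK_INITIALIZER;
static bool root_initialized = false;   /* stands in for btm_root != P_NONE */

static void
ensure_root(void)
{
    pthread_rwlock_rdlock(&meta_lock);
    if (root_initialized)
    {
        pthread_rwlock_unlock(&meta_lock);
        return;                         /* common, cheap path */
    }
    pthread_rwlock_unlock(&meta_lock);  /* cannot upgrade in place ... */

    pthread_rwlock_wrlock(&meta_lock);  /* ... so reacquire exclusively */
    if (!root_initialized)
    {
        /* we won the race: do the one-time initialization */
        root_initialized = true;
    }
    /* else: someone else initialized it between our two lock acquisitions */
    pthread_rwlock_unlock(&meta_lock);
}

int
main(void)
{
    ensure_root();
    printf("%d\n", root_initialized);   /* 1 */
    return 0;
}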
+ */ + _bt_relbuf(rel, metabuf); + return _bt_getroot(rel, access); + } + + /* + * Get, initialize, write, and leave a lock of the appropriate type on + * the new root page. Since this is the first page in the tree, it's + * a leaf as well as the root. + */ + rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); + rootblkno = BufferGetBlockNumber(rootbuf); + rootpage = BufferGetPage(rootbuf); + rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); + rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE; + rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT); + rootopaque->btpo_level = 0; + rootopaque->btpo_cycleid = 0; + /* Get raw page pointer for metapage */ + metapg = BufferGetPage(metabuf); + + /* NO ELOG(ERROR) till meta is updated */ + START_CRIT_SECTION(); + + /* upgrade metapage if needed */ + if (metad->btm_version < BTREE_NOVAC_VERSION) + _bt_upgrademetapage(metapg); + + metad->btm_root = rootblkno; + metad->btm_level = 0; + metad->btm_fastroot = rootblkno; + metad->btm_fastlevel = 0; + metad->btm_last_cleanup_num_delpages = 0; + metad->btm_last_cleanup_num_heap_tuples = -1.0; + + MarkBufferDirty(rootbuf); + MarkBufferDirty(metabuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_btree_newroot xlrec; + XLogRecPtr recptr; + xl_btree_metadata md; + + XLogBeginInsert(); + XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT); + XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); + + Assert(metad->btm_version >= BTREE_NOVAC_VERSION); + md.version = metad->btm_version; + md.root = rootblkno; + md.level = 0; + md.fastroot = rootblkno; + md.fastlevel = 0; + md.last_cleanup_num_delpages = 0; + md.allequalimage = metad->btm_allequalimage; + + XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); + + xlrec.rootblk = rootblkno; + xlrec.level = 0; + + XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot); + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT); + + PageSetLSN(rootpage, recptr); + PageSetLSN(metapg, recptr); + } + + END_CRIT_SECTION(); + + /* + * swap root write lock for read lock. There is no danger of anyone + * else accessing the new root page while it's unlocked, since no one + * else knows where it is yet. + */ + _bt_unlockbuf(rel, rootbuf); + _bt_lockbuf(rel, rootbuf, BT_READ); + + /* okay, metadata is correct, release lock on it without caching */ + _bt_relbuf(rel, metabuf); + } + else + { + rootblkno = metad->btm_fastroot; + Assert(rootblkno != P_NONE); + rootlevel = metad->btm_fastlevel; + + /* + * Cache the metapage data for next time + */ + rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt, + sizeof(BTMetaPageData)); + memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData)); + + /* + * We are done with the metapage; arrange to release it via first + * _bt_relandgetbuf call + */ + rootbuf = metabuf; + + for (;;) + { + rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ); + rootpage = BufferGetPage(rootbuf); + rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); + + if (!P_IGNORE(rootopaque)) + break; + + /* it's dead, Jim. step right one page */ + if (P_RIGHTMOST(rootopaque)) + elog(ERROR, "no live root page found in index \"%s\"", + RelationGetRelationName(rel)); + rootblkno = rootopaque->btpo_next; + } + + if (rootopaque->btpo_level != rootlevel) + elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u", + rootblkno, RelationGetRelationName(rel), + rootopaque->btpo_level, rootlevel); + } + + /* + * By here, we have a pin and read lock on the root page, and no lock set + * on the metadata page. 
Return the root page's buffer. + */ + return rootbuf; +} + +/* + * _bt_gettrueroot() -- Get the true root page of the btree. + * + * This is the same as the BT_READ case of _bt_getroot(), except + * we follow the true-root link not the fast-root link. + * + * By the time we acquire lock on the root page, it might have been split and + * not be the true root anymore. This is okay for the present uses of this + * routine; we only really need to be able to move up at least one tree level + * from whatever non-root page we were at. If we ever do need to lock the + * one true root page, we could loop here, re-reading the metapage on each + * failure. (Note that it wouldn't do to hold the lock on the metapage while + * moving to the root --- that'd deadlock against any concurrent root split.) + */ +Buffer +_bt_gettrueroot(Relation rel) +{ + Buffer metabuf; + Page metapg; + BTPageOpaque metaopaque; + Buffer rootbuf; + Page rootpage; + BTPageOpaque rootopaque; + BlockNumber rootblkno; + uint32 rootlevel; + BTMetaPageData *metad; + + /* + * We don't try to use cached metapage data here, since (a) this path is + * not performance-critical, and (b) if we are here it suggests our cache + * is out-of-date anyway. In light of point (b), it's probably safest to + * actively flush any cached metapage info. + */ + if (rel->rd_amcache) + pfree(rel->rd_amcache); + rel->rd_amcache = NULL; + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metapg = BufferGetPage(metabuf); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); + metad = BTPageGetMeta(metapg); + + if (!P_ISMETA(metaopaque) || + metad->btm_magic != BTREE_MAGIC) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" is not a btree", + RelationGetRelationName(rel)))); + + if (metad->btm_version < BTREE_MIN_VERSION || + metad->btm_version > BTREE_VERSION) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("version mismatch in index \"%s\": file version %d, " + "current version %d, minimal supported version %d", + RelationGetRelationName(rel), + metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION))); + + /* if no root page initialized yet, fail */ + if (metad->btm_root == P_NONE) + { + _bt_relbuf(rel, metabuf); + return InvalidBuffer; + } + + rootblkno = metad->btm_root; + rootlevel = metad->btm_level; + + /* + * We are done with the metapage; arrange to release it via first + * _bt_relandgetbuf call + */ + rootbuf = metabuf; + + for (;;) + { + rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ); + rootpage = BufferGetPage(rootbuf); + rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); + + if (!P_IGNORE(rootopaque)) + break; + + /* it's dead, Jim. step right one page */ + if (P_RIGHTMOST(rootopaque)) + elog(ERROR, "no live root page found in index \"%s\"", + RelationGetRelationName(rel)); + rootblkno = rootopaque->btpo_next; + } + + if (rootopaque->btpo_level != rootlevel) + elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u", + rootblkno, RelationGetRelationName(rel), + rootopaque->btpo_level, rootlevel); + + return rootbuf; +} + +/* + * _bt_getrootheight() -- Get the height of the btree search tree. + * + * We return the level (counting from zero) of the current fast root. + * This represents the number of tree levels we'd have to descend through + * to start any btree index search. + * + * This is used by the planner for cost-estimation purposes. 
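Both _bt_getroot() and _bt_gettrueroot() finish with the same loop: keep following btpo_next while the current page is ignorable, and fail if that walk runs off the rightmost page. A toy sketch of the walk over an in-memory stand-in (struct fakepage, find_live_page and NO_NEXT are illustrative, not patch code):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define NO_NEXT (-1)                    /* stands in for P_RIGHTMOST */

struct fakepage
{
    bool ignore;                        /* stands in for P_IGNORE() */
    int  next;                          /* stands in for btpo_next */
};

/* Walk right-sibling links until a live page is found */
static int
find_live_page(const struct fakepage *pages, int blkno)
{
    for (;;)
    {
        if (!pages[blkno].ignore)
            return blkno;
        if (pages[blkno].next == NO_NEXT)
        {
            fprintf(stderr, "no live page found\n");
            exit(1);
        }
        blkno = pages[blkno].next;      /* it's dead, Jim: step right */
    }
}

int
main(void)
{
    struct fakepage pages[] = {
        {true, 1},                      /* block 0: half-dead, points right */
        {true, 2},                      /* block 1: deleted, points right */
        {false, NO_NEXT},               /* block 2: live rightmost page */
    };

    printf("%d\n", find_live_page(pages, 0));   /* prints 2 */
    return 0;
}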
Since it's + * only an estimate, slightly-stale data is fine, hence we don't worry + * about updating previously cached data. + */ +int +_bt_getrootheight(Relation rel) +{ + BTMetaPageData *metad; + + if (rel->rd_amcache == NULL) + { + Buffer metabuf; + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metad = _bt_getmeta(rel, metabuf); + + /* + * If there's no root page yet, _bt_getroot() doesn't expect a cache + * to be made, so just stop here and report the index height is zero. + * (XXX perhaps _bt_getroot() should be changed to allow this case.) + */ + if (metad->btm_root == P_NONE) + { + _bt_relbuf(rel, metabuf); + return 0; + } + + /* + * Cache the metapage data for next time + */ + rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt, + sizeof(BTMetaPageData)); + memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData)); + _bt_relbuf(rel, metabuf); + } + + /* Get cached page */ + metad = (BTMetaPageData *) rel->rd_amcache; + /* We shouldn't have cached it if any of these fail */ + Assert(metad->btm_magic == BTREE_MAGIC); + Assert(metad->btm_version >= BTREE_MIN_VERSION); + Assert(metad->btm_version <= BTREE_VERSION); + Assert(!metad->btm_allequalimage || + metad->btm_version > BTREE_NOVAC_VERSION); + Assert(metad->btm_fastroot != P_NONE); + + return metad->btm_fastlevel; +} + +/* + * _bt_metaversion() -- Get version/status info from metapage. + * + * Sets caller's *heapkeyspace and *allequalimage arguments using data + * from the B-Tree metapage (could be locally-cached version). This + * information needs to be stashed in insertion scankey, so we provide a + * single function that fetches both at once. + * + * This is used to determine the rules that must be used to descend a + * btree. Version 4 indexes treat heap TID as a tiebreaker attribute. + * pg_upgrade'd version 3 indexes need extra steps to preserve reasonable + * performance when inserting a new BTScanInsert-wise duplicate tuple + * among many leaf pages already full of such duplicates. + * + * Also sets allequalimage field, which indicates whether or not it is + * safe to apply deduplication. We rely on the assumption that + * btm_allequalimage will be zero'ed on heapkeyspace indexes that were + * pg_upgrade'd from Postgres 12. + */ +void +_bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage) +{ + BTMetaPageData *metad; + + if (rel->rd_amcache == NULL) + { + Buffer metabuf; + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metad = _bt_getmeta(rel, metabuf); + + /* + * If there's no root page yet, _bt_getroot() doesn't expect a cache + * to be made, so just stop here. (XXX perhaps _bt_getroot() should + * be changed to allow this case.) + */ + if (metad->btm_root == P_NONE) + { + *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION; + *allequalimage = metad->btm_allequalimage; + + _bt_relbuf(rel, metabuf); + return; + } + + /* + * Cache the metapage data for next time + * + * An on-the-fly version upgrade performed by _bt_upgrademetapage() + * can change the nbtree version for an index without invalidating any + * local cache. This is okay because it can only happen when moving + * from version 2 to version 3, both of which are !heapkeyspace + * versions. 
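The version-to-capability mapping that _bt_metaversion() returns is a one-liner per flag: heapkeyspace indexes are those newer than the no-vacuum format, and allequalimage is taken straight from the metapage. A standalone sketch, with the version constants copied in for illustration only (the real values live in the nbtree headers):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative copies: versions 2 and 3 are !heapkeyspace, 4 is current */
#define MIN_VERSION    2
#define NOVAC_VERSION  3
#define CUR_VERSION    4

static void
metaversion(int btm_version, bool btm_allequalimage,
            bool *heapkeyspace, bool *allequalimage)
{
    *heapkeyspace = btm_version > NOVAC_VERSION;
    *allequalimage = btm_allequalimage;
}

int
main(void)
{
    bool hks, aei;

    metaversion(3, false, &hks, &aei);  /* pg_upgrade'd v3 index */
    printf("v3: heapkeyspace=%d allequalimage=%d\n", hks, aei);
    metaversion(4, true, &hks, &aei);   /* current-format index */
    printf("v4: heapkeyspace=%d allequalimage=%d\n", hks, aei);
    return 0;
}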
+ */ + rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt, + sizeof(BTMetaPageData)); + memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData)); + _bt_relbuf(rel, metabuf); + } + + /* Get cached page */ + metad = (BTMetaPageData *) rel->rd_amcache; + /* We shouldn't have cached it if any of these fail */ + Assert(metad->btm_magic == BTREE_MAGIC); + Assert(metad->btm_version >= BTREE_MIN_VERSION); + Assert(metad->btm_version <= BTREE_VERSION); + Assert(!metad->btm_allequalimage || + metad->btm_version > BTREE_NOVAC_VERSION); + Assert(metad->btm_fastroot != P_NONE); + + *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION; + *allequalimage = metad->btm_allequalimage; +} + +/* + * _bt_checkpage() -- Verify that a freshly-read page looks sane. + */ +void +_bt_checkpage(Relation rel, Buffer buf) +{ + Page page = BufferGetPage(buf); + + /* + * ReadBuffer verifies that every newly-read page passes + * PageHeaderIsValid, which means it either contains a reasonably sane + * page header or is all-zero. We have to defend against the all-zero + * case, however. + */ + if (PageIsNew(page)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" contains unexpected zero page at block %u", + RelationGetRelationName(rel), + BufferGetBlockNumber(buf)), + errhint("Please REINDEX it."))); + + /* + * Additionally check that the special area looks sane. + */ + if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData))) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" contains corrupted page at block %u", + RelationGetRelationName(rel), + BufferGetBlockNumber(buf)), + errhint("Please REINDEX it."))); +} + +/* + * Log the reuse of a page from the FSM. + */ +static void +_bt_log_reuse_page(Relation rel, BlockNumber blkno, FullTransactionId safexid) +{ + xl_btree_reuse_page xlrec_reuse; + + /* + * Note that we don't register the buffer with the record, because this + * operation doesn't modify the page. This record only exists to provide a + * conflict point for Hot Standby. + */ + + /* XLOG stuff */ + xlrec_reuse.node = rel->rd_node; + xlrec_reuse.block = blkno; + xlrec_reuse.latestRemovedFullXid = safexid; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage); + + XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE); +} + +/* + * _bt_getbuf() -- Get a buffer by block number for read or write. + * + * blkno == P_NEW means to get an unallocated index page. The page + * will be initialized before returning it. + * + * The general rule in nbtree is that it's never okay to access a + * page without holding both a buffer pin and a buffer lock on + * the page's buffer. + * + * When this routine returns, the appropriate lock is set on the + * requested buffer and its reference count has been incremented + * (ie, the buffer is "locked and pinned"). Also, we apply + * _bt_checkpage to sanity-check the page (except in P_NEW case), + * and perform Valgrind client requests that help Valgrind detect + * unsafe page accesses. + * + * Note: raw LockBuffer() calls are disallowed in nbtree; all + * buffer lock requests need to go through wrapper functions such + * as _bt_lockbuf(). 
+ */ +Buffer +_bt_getbuf(Relation rel, BlockNumber blkno, int access) +{ + Buffer buf; + + if (blkno != P_NEW) + { + /* Read an existing block of the relation */ + buf = ReadBuffer(rel, blkno); + _bt_lockbuf(rel, buf, access); + _bt_checkpage(rel, buf); + } + else + { + bool needLock; + Page page; + + Assert(access == BT_WRITE); + + /* + * First see if the FSM knows of any free pages. + * + * We can't trust the FSM's report unreservedly; we have to check that + * the page is still free. (For example, an already-free page could + * have been re-used between the time the last VACUUM scanned it and + * the time the VACUUM made its FSM updates.) + * + * In fact, it's worse than that: we can't even assume that it's safe + * to take a lock on the reported page. If somebody else has a lock + * on it, or even worse our own caller does, we could deadlock. (The + * own-caller scenario is actually not improbable. Consider an index + * on a serial or timestamp column. Nearly all splits will be at the + * rightmost page, so it's entirely likely that _bt_split will call us + * while holding a lock on the page most recently acquired from FSM. A + * VACUUM running concurrently with the previous split could well have + * placed that page back in FSM.) + * + * To get around that, we ask for only a conditional lock on the + * reported page. If we fail, then someone else is using the page, + * and we may reasonably assume it's not free. (If we happen to be + * wrong, the worst consequence is the page will be lost to use till + * the next VACUUM, which is no big problem.) + */ + for (;;) + { + blkno = GetFreeIndexPage(rel); + if (blkno == InvalidBlockNumber) + break; + buf = ReadBuffer(rel, blkno); + if (_bt_conditionallockbuf(rel, buf)) + { + page = BufferGetPage(buf); + + /* + * It's possible to find an all-zeroes page in an index. For + * example, a backend might successfully extend the relation + * one page and then crash before it is able to make a WAL + * entry for adding the page. If we find a zeroed page then + * reclaim it immediately. + */ + if (PageIsNew(page)) + { + /* Okay to use page. Initialize and return it. */ + _bt_pageinit(page, BufferGetPageSize(buf)); + return buf; + } + + if (BTPageIsRecyclable(page)) + { + /* + * If we are generating WAL for Hot Standby then create a + * WAL record that will allow us to conflict with queries + * running on standby, in case they have snapshots older + * than safexid value + */ + if (XLogStandbyInfoActive() && RelationNeedsWAL(rel)) + _bt_log_reuse_page(rel, blkno, + BTPageGetDeleteXid(page)); + + /* Okay to use page. Re-initialize and return it. */ + _bt_pageinit(page, BufferGetPageSize(buf)); + return buf; + } + elog(DEBUG2, "FSM returned nonrecyclable page"); + _bt_relbuf(rel, buf); + } + else + { + elog(DEBUG2, "FSM returned nonlockable page"); + /* couldn't get lock, so just drop pin */ + ReleaseBuffer(buf); + } + } + + /* + * Extend the relation by one page. + * + * We have to use a lock to ensure no one else is extending the rel at + * the same time, else we will both try to initialize the same new + * page. We can skip locking for new or temp relations, however, + * since no one else could be accessing them. + */ + needLock = !RELATION_IS_LOCAL(rel); + + if (needLock) + LockRelationForExtension(rel, ExclusiveLock); + + buf = ReadBuffer(rel, P_NEW); + + /* Acquire buffer lock on new page */ + _bt_lockbuf(rel, buf, BT_WRITE); + + /* + * Release the file-extension lock; it's now OK for someone else to + * extend the relation some more. 
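The FSM reuse loop in _bt_getbuf() leans on a conditional lock: if the lock cannot be taken immediately, the page is assumed to be in use and the search simply moves on, which is what removes the deadlock risk described above. A reduced sketch of that pattern with POSIX try-locks (claim_free_page() and the per-page mutexes are illustrative stand-ins):

#include <pthread.h>
#include <stdio.h>

/* One lock per candidate page in this toy model */
static pthread_mutex_t page_locks[3] = {
    PTHREAD_MUTEX_INITIALIZER,
    PTHREAD_MUTEX_INITIALIZER,
    PTHREAD_MUTEX_INITIALIZER,
};

/* Try each candidate; skip any page whose lock we cannot get immediately,
 * so that we never wait on (and possibly deadlock against) a page someone
 * else, possibly our own caller, already holds. */
static int
claim_free_page(const int *candidates, int ncandidates)
{
    for (int i = 0; i < ncandidates; i++)
    {
        int blkno = candidates[i];

        if (pthread_mutex_trylock(&page_locks[blkno]) == 0)
            return blkno;               /* caller now owns this page's lock */
        /* busy: someone is using it; assume it is not actually free */
    }
    return -1;                          /* fall back to extending the file */
}

int
main(void)
{
    int candidates[] = {0, 1, 2};

    pthread_mutex_lock(&page_locks[0]); /* pretend another session holds 0 */
    printf("claimed page %d\n", claim_free_page(candidates, 3));    /* 1 */
    return 0;
}

Losing a genuinely free page this way is acceptable; it just stays unused until a later VACUUM reports it to the FSM again.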
Note that we cannot release this + * lock before we have buffer lock on the new page, or we risk a race + * condition against btvacuumscan --- see comments therein. + */ + if (needLock) + UnlockRelationForExtension(rel, ExclusiveLock); + + /* Initialize the new page before returning it */ + page = BufferGetPage(buf); + Assert(PageIsNew(page)); + _bt_pageinit(page, BufferGetPageSize(buf)); + } + + /* ref count and lock type are correct */ + return buf; +} + +/* + * _bt_relandgetbuf() -- release a locked buffer and get another one. + * + * This is equivalent to _bt_relbuf followed by _bt_getbuf, with the + * exception that blkno may not be P_NEW. Also, if obuf is InvalidBuffer + * then it reduces to just _bt_getbuf; allowing this case simplifies some + * callers. + * + * The original motivation for using this was to avoid two entries to the + * bufmgr when one would do. However, now it's mainly just a notational + * convenience. The only case where it saves work over _bt_relbuf/_bt_getbuf + * is when the target page is the same one already in the buffer. + */ +Buffer +_bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access) +{ + Buffer buf; + + Assert(blkno != P_NEW); + if (BufferIsValid(obuf)) + _bt_unlockbuf(rel, obuf); + buf = ReleaseAndReadBuffer(obuf, rel, blkno); + _bt_lockbuf(rel, buf, access); + + _bt_checkpage(rel, buf); + return buf; +} + +/* + * _bt_relbuf() -- release a locked buffer. + * + * Lock and pin (refcount) are both dropped. + */ +void +_bt_relbuf(Relation rel, Buffer buf) +{ + _bt_unlockbuf(rel, buf); + ReleaseBuffer(buf); +} + +/* + * _bt_lockbuf() -- lock a pinned buffer. + * + * Lock is acquired without acquiring another pin. This is like a raw + * LockBuffer() call, but performs extra steps needed by Valgrind. + * + * Note: Caller may need to call _bt_checkpage() with buf when pin on buf + * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf(). + */ +void +_bt_lockbuf(Relation rel, Buffer buf, int access) +{ + /* LockBuffer() asserts that pin is held by this backend */ + LockBuffer(buf, access); + + /* + * It doesn't matter that _bt_unlockbuf() won't get called in the event of + * an nbtree error (e.g. a unique violation error). That won't cause + * Valgrind false positives. + * + * The nbtree client requests are superimposed on top of the bufmgr.c + * buffer pin client requests. In the event of an nbtree error the buffer + * will certainly get marked as defined when the backend once again + * acquires its first pin on the buffer. (Of course, if the backend never + * touches the buffer again then it doesn't matter that it remains + * non-accessible to Valgrind.) + * + * Note: When an IndexTuple C pointer gets computed using an ItemId read + * from a page while a lock was held, the C pointer becomes unsafe to + * dereference forever as soon as the lock is released. Valgrind can only + * detect cases where the pointer gets dereferenced with no _current_ + * lock/pin held, though. + */ + if (!RelationUsesLocalBuffers(rel)) + VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ); +} + +/* + * _bt_unlockbuf() -- unlock a pinned buffer. + */ +void +_bt_unlockbuf(Relation rel, Buffer buf) +{ + /* + * Buffer is pinned and locked, which means that it is expected to be + * defined and addressable. Check that proactively. 
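The Valgrind discipline described here can be seen in isolation: mark the buffer defined while a lock is held and inaccessible once it is released, so that memcheck reports any access made without a current lock. A small sketch assuming the standard valgrind/memcheck.h client-request macros (which behave as no-ops when not running under Valgrind); lock_page() and unlock_page() are illustrative wrappers, not the patch's functions:

#include <pthread.h>
#include <stdio.h>
#include <valgrind/memcheck.h>

#define PAGE_SIZE 8192

static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
static char page[PAGE_SIZE];

/* Make the page addressable only while the lock is held */
static void
lock_page(void)
{
    pthread_mutex_lock(&page_lock);
    VALGRIND_MAKE_MEM_DEFINED(page, PAGE_SIZE);
}

static void
unlock_page(void)
{
    VALGRIND_CHECK_MEM_IS_DEFINED(page, PAGE_SIZE);
    pthread_mutex_unlock(&page_lock);
    VALGRIND_MAKE_MEM_NOACCESS(page, PAGE_SIZE);
}

int
main(void)
{
    lock_page();
    page[0] = 'x';          /* fine: lock held, memory marked defined */
    unlock_page();
    /* page[1] = 'y'; */    /* would be reported by memcheck: no lock held */
    printf("ok\n");
    return 0;
}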
+ */ + VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ); + + /* LockBuffer() asserts that pin is held by this backend */ + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + if (!RelationUsesLocalBuffers(rel)) + VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ); +} + +/* + * _bt_conditionallockbuf() -- conditionally BT_WRITE lock pinned + * buffer. + * + * Note: Caller may need to call _bt_checkpage() with buf when pin on buf + * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf(). + */ +bool +_bt_conditionallockbuf(Relation rel, Buffer buf) +{ + /* ConditionalLockBuffer() asserts that pin is held by this backend */ + if (!ConditionalLockBuffer(buf)) + return false; + + if (!RelationUsesLocalBuffers(rel)) + VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ); + + return true; +} + +/* + * _bt_upgradelockbufcleanup() -- upgrade lock to super-exclusive/cleanup + * lock. + */ +void +_bt_upgradelockbufcleanup(Relation rel, Buffer buf) +{ + /* + * Buffer is pinned and locked, which means that it is expected to be + * defined and addressable. Check that proactively. + */ + VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ); + + /* LockBuffer() asserts that pin is held by this backend */ + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + LockBufferForCleanup(buf); +} + +/* + * _bt_pageinit() -- Initialize a new page. + * + * On return, the page header is initialized; data space is empty; + * special space is zeroed out. + */ +void +_bt_pageinit(Page page, Size size) +{ + PageInit(page, size, sizeof(BTPageOpaqueData)); +} + +/* + * Delete item(s) from a btree leaf page during VACUUM. + * + * This routine assumes that the caller has a super-exclusive write lock on + * the buffer. Also, the given deletable and updatable arrays *must* be + * sorted in ascending order. + * + * Routine deals with deleting TIDs when some (but not all) of the heap TIDs + * in an existing posting list item are to be removed. This works by + * updating/overwriting an existing item with caller's new version of the item + * (a version that lacks the TIDs that are to be deleted). + * + * We record VACUUMs and b-tree deletes differently in WAL. Deletes must + * generate their own latestRemovedXid by accessing the table directly, + * whereas VACUUMs rely on the initial VACUUM table scan performing + * WAL-logging that takes care of the issue for the table's indexes + * indirectly. Also, we remove the VACUUM cycle ID from pages, which b-tree + * deletes don't do. + */ +void +_bt_delitems_vacuum(Relation rel, Buffer buf, + OffsetNumber *deletable, int ndeletable, + BTVacuumPosting *updatable, int nupdatable) +{ + Page page = BufferGetPage(buf); + BTPageOpaque opaque; + bool needswal = RelationNeedsWAL(rel); + char *updatedbuf = NULL; + Size updatedbuflen = 0; + OffsetNumber updatedoffsets[MaxIndexTuplesPerPage]; + + /* Shouldn't be called unless there's something to do */ + Assert(ndeletable > 0 || nupdatable > 0); + + /* Generate new version of posting lists without deleted TIDs */ + if (nupdatable > 0) + updatedbuf = _bt_delitems_update(updatable, nupdatable, + updatedoffsets, &updatedbuflen, + needswal); + + /* No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + /* + * Handle posting tuple updates. + * + * Deliberately do this before handling simple deletes. If we did it the + * other way around (i.e. WAL record order -- simple deletes before + * updates) then we'd have to make compensating changes to the 'updatable' + * array of offset numbers. 
+ * + * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it + * happens to already be set. It's important that we not interfere with + * _bt_delitems_delete(). + */ + for (int i = 0; i < nupdatable; i++) + { + OffsetNumber updatedoffset = updatedoffsets[i]; + IndexTuple itup; + Size itemsz; + + itup = updatable[i]->itup; + itemsz = MAXALIGN(IndexTupleSize(itup)); + if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup, + itemsz)) + elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"", + BufferGetBlockNumber(buf), RelationGetRelationName(rel)); + } + + /* Now handle simple deletes of entire tuples */ + if (ndeletable > 0) + PageIndexMultiDelete(page, deletable, ndeletable); + + /* + * We can clear the vacuum cycle ID since this page has certainly been + * processed by the current vacuum scan. + */ + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + opaque->btpo_cycleid = 0; + + /* + * Clear the BTP_HAS_GARBAGE page flag. + * + * This flag indicates the presence of LP_DEAD items on the page (though + * not reliably). Note that we only rely on it with pg_upgrade'd + * !heapkeyspace indexes. That's why clearing it here won't usually + * interfere with _bt_delitems_delete(). + */ + opaque->btpo_flags &= ~BTP_HAS_GARBAGE; + + MarkBufferDirty(buf); + + /* XLOG stuff */ + if (needswal) + { + XLogRecPtr recptr; + xl_btree_vacuum xlrec_vacuum; + + xlrec_vacuum.ndeleted = ndeletable; + xlrec_vacuum.nupdated = nupdatable; + + XLogBeginInsert(); + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum); + + if (ndeletable > 0) + XLogRegisterBufData(0, (char *) deletable, + ndeletable * sizeof(OffsetNumber)); + + if (nupdatable > 0) + { + XLogRegisterBufData(0, (char *) updatedoffsets, + nupdatable * sizeof(OffsetNumber)); + XLogRegisterBufData(0, updatedbuf, updatedbuflen); + } + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + /* can't leak memory here */ + if (updatedbuf != NULL) + pfree(updatedbuf); + /* free tuples allocated within _bt_delitems_update() */ + for (int i = 0; i < nupdatable; i++) + pfree(updatable[i]->itup); +} + +/* + * Delete item(s) from a btree leaf page during single-page cleanup. + * + * This routine assumes that the caller has pinned and write locked the + * buffer. Also, the given deletable and updatable arrays *must* be sorted in + * ascending order. + * + * Routine deals with deleting TIDs when some (but not all) of the heap TIDs + * in an existing posting list item are to be removed. This works by + * updating/overwriting an existing item with caller's new version of the item + * (a version that lacks the TIDs that are to be deleted). + * + * This is nearly the same as _bt_delitems_vacuum as far as what it does to + * the page, but it needs its own latestRemovedXid from caller (caller gets + * this from tableam). This is used by the REDO routine to generate recovery + * conflicts. The other difference is that only _bt_delitems_vacuum will + * clear page's VACUUM cycle ID. 
+ */ +static void +_bt_delitems_delete(Relation rel, Buffer buf, TransactionId latestRemovedXid, + OffsetNumber *deletable, int ndeletable, + BTVacuumPosting *updatable, int nupdatable) +{ + Page page = BufferGetPage(buf); + BTPageOpaque opaque; + bool needswal = RelationNeedsWAL(rel); + char *updatedbuf = NULL; + Size updatedbuflen = 0; + OffsetNumber updatedoffsets[MaxIndexTuplesPerPage]; + + /* Shouldn't be called unless there's something to do */ + Assert(ndeletable > 0 || nupdatable > 0); + + /* Generate new versions of posting lists without deleted TIDs */ + if (nupdatable > 0) + updatedbuf = _bt_delitems_update(updatable, nupdatable, + updatedoffsets, &updatedbuflen, + needswal); + + /* No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + /* Handle updates and deletes just like _bt_delitems_vacuum */ + for (int i = 0; i < nupdatable; i++) + { + OffsetNumber updatedoffset = updatedoffsets[i]; + IndexTuple itup; + Size itemsz; + + itup = updatable[i]->itup; + itemsz = MAXALIGN(IndexTupleSize(itup)); + if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup, + itemsz)) + elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"", + BufferGetBlockNumber(buf), RelationGetRelationName(rel)); + } + + if (ndeletable > 0) + PageIndexMultiDelete(page, deletable, ndeletable); + + /* + * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID at + * this point. The VACUUM command alone controls vacuum cycle IDs. + */ + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* + * Clear the BTP_HAS_GARBAGE page flag. + * + * This flag indicates the presence of LP_DEAD items on the page (though + * not reliably). Note that we only rely on it with pg_upgrade'd + * !heapkeyspace indexes. + */ + opaque->btpo_flags &= ~BTP_HAS_GARBAGE; + + MarkBufferDirty(buf); + + /* XLOG stuff */ + if (needswal) + { + XLogRecPtr recptr; + xl_btree_delete xlrec_delete; + + xlrec_delete.latestRemovedXid = latestRemovedXid; + xlrec_delete.ndeleted = ndeletable; + xlrec_delete.nupdated = nupdatable; + + XLogBeginInsert(); + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete); + + if (ndeletable > 0) + XLogRegisterBufData(0, (char *) deletable, + ndeletable * sizeof(OffsetNumber)); + + if (nupdatable > 0) + { + XLogRegisterBufData(0, (char *) updatedoffsets, + nupdatable * sizeof(OffsetNumber)); + XLogRegisterBufData(0, updatedbuf, updatedbuflen); + } + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + /* can't leak memory here */ + if (updatedbuf != NULL) + pfree(updatedbuf); + /* free tuples allocated within _bt_delitems_update() */ + for (int i = 0; i < nupdatable; i++) + pfree(updatable[i]->itup); +} + +/* + * Set up state needed to delete TIDs from posting list tuples via "updating" + * the tuple. Performs steps common to both _bt_delitems_vacuum and + * _bt_delitems_delete. These steps must take place before each function's + * critical section begins. + * + * updatable and nupdatable are inputs, though note that we will use + * _bt_update_posting() to replace the original itup with a pointer to a final + * version in palloc()'d memory. Caller should free the tuples when its done. + * + * The first nupdatable entries from updatedoffsets are set to the page offset + * number for posting list tuples that caller updates. 
This is mostly useful + * because caller may need to WAL-log the page offsets (though we always do + * this for caller out of convenience). + * + * Returns buffer consisting of an array of xl_btree_update structs that + * describe the steps we perform here for caller (though only when needswal is + * true). Also sets *updatedbuflen to the final size of the buffer. This + * buffer is used by caller when WAL logging is required. + */ +static char * +_bt_delitems_update(BTVacuumPosting *updatable, int nupdatable, + OffsetNumber *updatedoffsets, Size *updatedbuflen, + bool needswal) +{ + char *updatedbuf = NULL; + Size buflen = 0; + + /* Shouldn't be called unless there's something to do */ + Assert(nupdatable > 0); + + for (int i = 0; i < nupdatable; i++) + { + BTVacuumPosting vacposting = updatable[i]; + Size itemsz; + + /* Replace work area IndexTuple with updated version */ + _bt_update_posting(vacposting); + + /* Keep track of size of xl_btree_update for updatedbuf in passing */ + itemsz = SizeOfBtreeUpdate + vacposting->ndeletedtids * sizeof(uint16); + buflen += itemsz; + + /* Build updatedoffsets buffer in passing */ + updatedoffsets[i] = vacposting->updatedoffset; + } + + /* XLOG stuff */ + if (needswal) + { + Size offset = 0; + + /* Allocate, set final size for caller */ + updatedbuf = palloc(buflen); + *updatedbuflen = buflen; + for (int i = 0; i < nupdatable; i++) + { + BTVacuumPosting vacposting = updatable[i]; + Size itemsz; + xl_btree_update update; + + update.ndeletedtids = vacposting->ndeletedtids; + memcpy(updatedbuf + offset, &update.ndeletedtids, + SizeOfBtreeUpdate); + offset += SizeOfBtreeUpdate; + + itemsz = update.ndeletedtids * sizeof(uint16); + memcpy(updatedbuf + offset, vacposting->deletetids, itemsz); + offset += itemsz; + } + } + + return updatedbuf; +} + +/* + * Comparator used by _bt_delitems_delete_check() to restore deltids array + * back to its original leaf-page-wise sort order + */ +static int +_bt_delitems_cmp(const void *a, const void *b) +{ + TM_IndexDelete *indexdelete1 = (TM_IndexDelete *) a; + TM_IndexDelete *indexdelete2 = (TM_IndexDelete *) b; + + if (indexdelete1->id > indexdelete2->id) + return 1; + if (indexdelete1->id < indexdelete2->id) + return -1; + + Assert(false); + + return 0; +} + +/* + * Try to delete item(s) from a btree leaf page during single-page cleanup. + * + * nbtree interface to table_index_delete_tuples(). Deletes a subset of index + * tuples from caller's deltids array: those whose TIDs are found safe to + * delete by the tableam (or already marked LP_DEAD in index, and so already + * known to be deletable by our simple index deletion caller). We physically + * delete index tuples from buf leaf page last of all (for index tuples where + * that is known to be safe following our table_index_delete_tuples() call). + * + * Simple index deletion caller only includes TIDs from index tuples marked + * LP_DEAD, as well as extra TIDs it found on the same leaf page that can be + * included without increasing the total number of distinct table blocks for + * the deletion operation as a whole. This approach often allows us to delete + * some extra index tuples that were practically free for tableam to check in + * passing (when they actually turn out to be safe to delete). It probably + * only makes sense for the tableam to go ahead with these extra checks when + * it is block-oriented (otherwise the checks probably won't be practically + * free, which we rely on). 
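The WAL payload assembled by _bt_delitems_update() is a flat buffer of variable-length records: each record is a small fixed header followed by that record's array of uint16 values. A standalone sketch of that packing (update_hdr and pack_updates() are illustrative; the real record layout is xl_btree_update with SizeOfBtreeUpdate as the header size):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Illustrative stand-in for xl_btree_update: a count, then that many
 * uint16 "delete this TID" indexes follow in the buffer. */
typedef struct
{
    uint16_t ndeletedtids;
} update_hdr;

/* Pack an array of variable-length updates into one flat buffer, the way
 * the WAL payload is assembled above.  Caller frees the result. */
static char *
pack_updates(uint16_t **deletetids, const uint16_t *ntids, int nupdates,
             size_t *buflen)
{
    size_t  len = 0;
    size_t  off = 0;
    char   *buf;

    for (int i = 0; i < nupdates; i++)
        len += sizeof(update_hdr) + ntids[i] * sizeof(uint16_t);

    buf = malloc(len);
    for (int i = 0; i < nupdates; i++)
    {
        update_hdr hdr = {ntids[i]};

        memcpy(buf + off, &hdr, sizeof(hdr));
        off += sizeof(hdr);
        memcpy(buf + off, deletetids[i], ntids[i] * sizeof(uint16_t));
        off += ntids[i] * sizeof(uint16_t);
    }
    *buflen = len;
    return buf;
}

int
main(void)
{
    uint16_t d0[] = {1, 4};
    uint16_t d1[] = {0};
    uint16_t *dels[] = {d0, d1};
    uint16_t nd[] = {2, 1};
    size_t   len;
    char    *buf = pack_updates(dels, nd, 2, &len);

    printf("packed %zu bytes\n", len);  /* 2+4 + 2+2 = 10 bytes here */
    free(buf);
    return 0;
}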
The tableam interface requires the tableam side + * to handle the problem, though, so this is okay (we as an index AM are free + * to make the simplifying assumption that all tableams must be block-based). + * + * Bottom-up index deletion caller provides all the TIDs from the leaf page, + * without expecting that tableam will check most of them. The tableam has + * considerable discretion around which entries/blocks it checks. Our role in + * costing the bottom-up deletion operation is strictly advisory. + * + * Note: Caller must have added deltids entries (i.e. entries that go in + * delstate's main array) in leaf-page-wise order: page offset number order, + * TID order among entries taken from the same posting list tuple (tiebreak on + * TID). This order is convenient to work with here. + * + * Note: We also rely on the id field of each deltids element "capturing" this + * original leaf-page-wise order. That is, we expect to be able to get back + * to the original leaf-page-wise order just by sorting deltids on the id + * field (tableam will sort deltids for its own reasons, so we'll need to put + * it back in leaf-page-wise order afterwards). + */ +void +_bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, + TM_IndexDeleteOp *delstate) +{ + Page page = BufferGetPage(buf); + TransactionId latestRemovedXid; + OffsetNumber postingidxoffnum = InvalidOffsetNumber; + int ndeletable = 0, + nupdatable = 0; + OffsetNumber deletable[MaxIndexTuplesPerPage]; + BTVacuumPosting updatable[MaxIndexTuplesPerPage]; + + /* Use tableam interface to determine which tuples to delete first */ + latestRemovedXid = table_index_delete_tuples(heapRel, delstate); + + /* Should not WAL-log latestRemovedXid unless it's required */ + if (!XLogStandbyInfoActive() || !RelationNeedsWAL(rel)) + latestRemovedXid = InvalidTransactionId; + + /* + * Construct a leaf-page-wise description of what _bt_delitems_delete() + * needs to do to physically delete index tuples from the page. + * + * Must sort deltids array to restore leaf-page-wise order (original order + * before call to tableam). This is the order that the loop expects. + * + * Note that deltids array might be a lot smaller now. It might even have + * no entries at all (with bottom-up deletion caller), in which case there + * is nothing left to do. 
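Restoring leaf-page-wise order, as done just below, relies only on the id assigned when the deltids array was built: sort on any key you like for processing, then sort on id to get the original order back. A minimal sketch with an illustrative delentry struct in place of TM_IndexDelete:

#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-in for TM_IndexDelete: id records build order */
typedef struct
{
    int id;                 /* position when the array was built */
    int tableblock;         /* key some other component sorts on */
} delentry;

static int
cmp_by_block(const void *a, const void *b)
{
    const delentry *d1 = a, *d2 = b;

    return (d1->tableblock > d2->tableblock) - (d1->tableblock < d2->tableblock);
}

static int
cmp_by_id(const void *a, const void *b)
{
    const delentry *d1 = a, *d2 = b;

    return (d1->id > d2->id) - (d1->id < d2->id);
}

int
main(void)
{
    /* Built in leaf-page-wise order; id captures that order */
    delentry deltids[] = {{0, 42}, {1, 7}, {2, 13}};
    int      n = 3;

    qsort(deltids, n, sizeof(delentry), cmp_by_block);  /* "tableam" order */
    qsort(deltids, n, sizeof(delentry), cmp_by_id);     /* restore original */

    for (int i = 0; i < n; i++)
        printf("%d:%d ", deltids[i].id, deltids[i].tableblock);
    printf("\n");           /* prints: 0:42 1:7 2:13 */
    return 0;
}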
+ */ + qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete), + _bt_delitems_cmp); + if (delstate->ndeltids == 0) + { + Assert(delstate->bottomup); + return; + } + + /* We definitely have to delete at least one index tuple (or one TID) */ + for (int i = 0; i < delstate->ndeltids; i++) + { + TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id; + OffsetNumber idxoffnum = dstatus->idxoffnum; + ItemId itemid = PageGetItemId(page, idxoffnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + int nestedi, + nitem; + BTVacuumPosting vacposting; + + Assert(OffsetNumberIsValid(idxoffnum)); + + if (idxoffnum == postingidxoffnum) + { + /* + * This deltid entry is a TID from a posting list tuple that has + * already been completely processed + */ + Assert(BTreeTupleIsPosting(itup)); + Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup), + &delstate->deltids[i].tid) < 0); + Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(itup), + &delstate->deltids[i].tid) >= 0); + continue; + } + + if (!BTreeTupleIsPosting(itup)) + { + /* Plain non-pivot tuple */ + Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid)); + if (dstatus->knowndeletable) + deletable[ndeletable++] = idxoffnum; + continue; + } + + /* + * itup is a posting list tuple whose lowest deltids entry (which may + * or may not be for the first TID from itup) is considered here now. + * We should process all of the deltids entries for the posting list + * together now, though (not just the lowest). Remember to skip over + * later itup-related entries during later iterations of outermost + * loop. + */ + postingidxoffnum = idxoffnum; /* Remember work in outermost loop */ + nestedi = i; /* Initialize for first itup deltids entry */ + vacposting = NULL; /* Describes final action for itup */ + nitem = BTreeTupleGetNPosting(itup); + for (int p = 0; p < nitem; p++) + { + ItemPointer ptid = BTreeTupleGetPostingN(itup, p); + int ptidcmp = -1; + + /* + * This nested loop reuses work across ptid TIDs taken from itup. + * We take advantage of the fact that both itup's TIDs and deltids + * entries (within a single itup/posting list grouping) must both + * be in ascending TID order. + */ + for (; nestedi < delstate->ndeltids; nestedi++) + { + TM_IndexDelete *tcdeltid = &delstate->deltids[nestedi]; + TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id); + + /* Stop once we get past all itup related deltids entries */ + Assert(tdstatus->idxoffnum >= idxoffnum); + if (tdstatus->idxoffnum != idxoffnum) + break; + + /* Skip past non-deletable itup related entries up front */ + if (!tdstatus->knowndeletable) + continue; + + /* Entry is first partial ptid match (or an exact match)? */ + ptidcmp = ItemPointerCompare(&tcdeltid->tid, ptid); + if (ptidcmp >= 0) + { + /* Greater than or equal (partial or exact) match... */ + break; + } + } + + /* ...exact ptid match to a deletable deltids entry? 
*/ + if (ptidcmp != 0) + continue; + + /* Exact match for deletable deltids entry -- ptid gets deleted */ + if (vacposting == NULL) + { + vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) + + nitem * sizeof(uint16)); + vacposting->itup = itup; + vacposting->updatedoffset = idxoffnum; + vacposting->ndeletedtids = 0; + } + vacposting->deletetids[vacposting->ndeletedtids++] = p; + } + + /* Final decision on itup, a posting list tuple */ + + if (vacposting == NULL) + { + /* No TIDs to delete from itup -- do nothing */ + } + else if (vacposting->ndeletedtids == nitem) + { + /* Straight delete of itup (to delete all TIDs) */ + deletable[ndeletable++] = idxoffnum; + /* Turns out we won't need granular information */ + pfree(vacposting); + } + else + { + /* Delete some (but not all) TIDs from itup */ + Assert(vacposting->ndeletedtids > 0 && + vacposting->ndeletedtids < nitem); + updatable[nupdatable++] = vacposting; + } + } + + /* Physically delete tuples (or TIDs) using deletable (or updatable) */ + _bt_delitems_delete(rel, buf, latestRemovedXid, deletable, ndeletable, + updatable, nupdatable); + + /* be tidy */ + for (int i = 0; i < nupdatable; i++) + pfree(updatable[i]); +} + +/* + * Check that leftsib page (the btpo_prev of target page) is not marked with + * INCOMPLETE_SPLIT flag. Used during page deletion. + * + * Returning true indicates that page flag is set in leftsib (which is + * definitely still the left sibling of target). When that happens, the + * target doesn't have a downlink in parent, and the page deletion algorithm + * isn't prepared to handle that. Deletion of the target page (or the whole + * subtree that contains the target page) cannot take place. + * + * Caller should not have a lock on the target page itself, since pages on the + * same level must always be locked left to right to avoid deadlocks. + */ +static bool +_bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target) +{ + Buffer buf; + Page page; + BTPageOpaque opaque; + bool result; + + /* Easy case: No left sibling */ + if (leftsib == P_NONE) + return false; + + buf = _bt_getbuf(rel, leftsib, BT_READ); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* + * If the left sibling was concurrently split, so that its next-pointer + * doesn't point to the current page anymore, the split that created + * target must be completed. Caller can reasonably expect that there will + * be a downlink to the target page that it can relocate using its stack. + * (We don't allow splitting an incompletely split page again until the + * previous split has been completed.) + */ + result = (opaque->btpo_next == target && P_INCOMPLETE_SPLIT(opaque)); + _bt_relbuf(rel, buf); + + return result; +} + +/* + * Check that leafrightsib page (the btpo_next of target leaf page) is not + * marked with ISHALFDEAD flag. Used during page deletion. + * + * Returning true indicates that page flag is set in leafrightsib, so page + * deletion cannot go ahead. Our caller is not prepared to deal with the case + * where the parent page does not have a pivot tuples whose downlink points to + * leafrightsib (due to an earlier interrupted VACUUM operation). It doesn't + * seem worth going to the trouble of teaching our caller to deal with it. + * The situation will be resolved after VACUUM finishes the deletion of the + * half-dead page (when a future VACUUM operation reaches the target page + * again). + * + * _bt_leftsib_splitflag() is called for both leaf pages and internal pages. 
+ * _bt_rightsib_halfdeadflag() is only called for leaf pages, though. This is + * okay because of the restriction on deleting pages that are the rightmost + * page of their parent (i.e. that such deletions can only take place when the + * entire subtree must be deleted). The leaf level check made here will apply + * to a right "cousin" leaf page rather than a simple right sibling leaf page + * in cases where caller actually goes on to attempt deleting pages that are + * above the leaf page. The right cousin leaf page is representative of the + * left edge of the subtree to the right of the to-be-deleted subtree as a + * whole, which is exactly the condition that our caller cares about. + * (Besides, internal pages are never marked half-dead, so it isn't even + * possible to _directly_ assess if an internal page is part of some other + * to-be-deleted subtree.) + */ +static bool +_bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib) +{ + Buffer buf; + Page page; + BTPageOpaque opaque; + bool result; + + Assert(leafrightsib != P_NONE); + + buf = _bt_getbuf(rel, leafrightsib, BT_READ); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque)); + result = P_ISHALFDEAD(opaque); + _bt_relbuf(rel, buf); + + return result; +} + +/* + * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so. + * + * This action unlinks the leaf page from the b-tree structure, removing all + * pointers leading to it --- but not touching its own left and right links. + * The page cannot be physically reclaimed right away, since other processes + * may currently be trying to follow links leading to the page; they have to + * be allowed to use its right-link to recover. See nbtree/README. + * + * On entry, the target buffer must be pinned and locked (either read or write + * lock is OK). The page must be an empty leaf page, which may be half-dead + * already (a half-dead page should only be passed to us when an earlier + * VACUUM operation was interrupted, though). Note in particular that caller + * should never pass a buffer containing an existing deleted page here. The + * lock and pin on caller's buffer will be dropped before we return. + * + * Maintains bulk delete stats for caller, which are taken from vstate. We + * need to cooperate closely with caller here so that whole VACUUM operation + * reliably avoids any double counting of subsidiary-to-leafbuf pages that we + * delete in passing. If such pages happen to be from a block number that is + * ahead of the current scanblkno position, then caller is expected to count + * them directly later on. It's simpler for us to understand caller's + * requirements than it would be for caller to understand when or how a + * deleted page became deleted after the fact. + * + * NOTE: this leaks memory. Rather than trying to clean up everything + * carefully, it's better to run it in a temp context that can be reset + * frequently. + */ +void +_bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate) +{ + BlockNumber rightsib; + bool rightsib_empty; + Page page; + BTPageOpaque opaque; + + /* + * Save original leafbuf block number from caller. Only deleted blocks + * that are <= scanblkno are added to bulk delete stat's pages_deleted + * count. + */ + BlockNumber scanblkno = BufferGetBlockNumber(leafbuf); + + /* + * "stack" is a search stack leading (approximately) to the target page. 
+ * It is initially NULL, but when iterating, we keep it to avoid + * duplicated search effort. + * + * Also, when "stack" is not NULL, we have already checked that the + * current page is not the right half of an incomplete split, i.e. the + * left sibling does not have its INCOMPLETE_SPLIT flag set, including + * when the current target page is to the right of caller's initial page + * (the scanblkno page). + */ + BTStack stack = NULL; + + for (;;) + { + page = BufferGetPage(leafbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* + * Internal pages are never deleted directly, only as part of deleting + * the whole subtree all the way down to leaf level. + * + * Also check for deleted pages here. Caller never passes us a fully + * deleted page. Only VACUUM can delete pages, so there can't have + * been a concurrent deletion. Assume that we reached any deleted + * page encountered here by following a sibling link, and that the + * index is corrupt. + */ + Assert(!P_ISDELETED(opaque)); + if (!P_ISLEAF(opaque) || P_ISDELETED(opaque)) + { + /* + * Pre-9.4 page deletion only marked internal pages as half-dead, + * but now we only use that flag on leaf pages. The old algorithm + * was never supposed to leave half-dead pages in the tree, it was + * just a transient state, but it was nevertheless possible in + * error scenarios. We don't know how to deal with them here. They + * are harmless as far as searches are considered, but inserts + * into the deleted keyspace could add out-of-order downlinks in + * the upper levels. Log a notice, hopefully the admin will notice + * and reindex. + */ + if (P_ISHALFDEAD(opaque)) + ereport(LOG, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" contains a half-dead internal page", + RelationGetRelationName(rel)), + errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it."))); + + if (P_ISDELETED(opaque)) + ereport(LOG, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"", + BufferGetBlockNumber(leafbuf), + scanblkno, + RelationGetRelationName(rel)))); + + _bt_relbuf(rel, leafbuf); + return; + } + + /* + * We can never delete rightmost pages nor root pages. While at it, + * check that page is empty, since it's possible that the leafbuf page + * was empty a moment ago, but has since had some inserts. + * + * To keep the algorithm simple, we also never delete an incompletely + * split page (they should be rare enough that this doesn't make any + * meaningful difference to disk usage): + * + * The INCOMPLETE_SPLIT flag on the page tells us if the page is the + * left half of an incomplete split, but ensuring that it's not the + * right half is more complicated. For that, we have to check that + * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using + * _bt_leftsib_splitflag(). On the first iteration, we temporarily + * release the lock on scanblkno/leafbuf, check the left sibling, and + * construct a search stack to scanblkno. On subsequent iterations, + * we know we stepped right from a page that passed these tests, so + * it's OK. 
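+ * (A page that is still the right half of an incomplete split has no
+ * downlink in its parent yet, which the first stage of deletion is not
+ * prepared to handle -- see _bt_leftsib_splitflag() above.)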
+ */ + if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || + P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) || + P_INCOMPLETE_SPLIT(opaque)) + { + /* Should never fail to delete a half-dead page */ + Assert(!P_ISHALFDEAD(opaque)); + + _bt_relbuf(rel, leafbuf); + return; + } + + /* + * First, remove downlink pointing to the page (or a parent of the + * page, if we are going to delete a taller subtree), and mark the + * leafbuf page half-dead + */ + if (!P_ISHALFDEAD(opaque)) + { + /* + * We need an approximate pointer to the page's parent page. We + * use a variant of the standard search mechanism to search for + * the page's high key; this will give us a link to either the + * current parent or someplace to its left (if there are multiple + * equal high keys, which is possible with !heapkeyspace indexes). + * + * Also check if this is the right-half of an incomplete split + * (see comment above). + */ + if (!stack) + { + BTScanInsert itup_key; + ItemId itemid; + IndexTuple targetkey; + BlockNumber leftsib, + leafblkno; + Buffer sleafbuf; + + itemid = PageGetItemId(page, P_HIKEY); + targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid)); + + leftsib = opaque->btpo_prev; + leafblkno = BufferGetBlockNumber(leafbuf); + + /* + * To avoid deadlocks, we'd better drop the leaf page lock + * before going further. + */ + _bt_unlockbuf(rel, leafbuf); + + /* + * Check that the left sibling of leafbuf (if any) is not + * marked with INCOMPLETE_SPLIT flag before proceeding + */ + Assert(leafblkno == scanblkno); + if (_bt_leftsib_splitflag(rel, leftsib, leafblkno)) + { + ReleaseBuffer(leafbuf); + return; + } + + /* we need an insertion scan key for the search, so build one */ + itup_key = _bt_mkscankey(rel, targetkey); + /* find the leftmost leaf page with matching pivot/high key */ + itup_key->pivotsearch = true; + stack = _bt_search(rel, itup_key, &sleafbuf, BT_READ, NULL); + /* won't need a second lock or pin on leafbuf */ + _bt_relbuf(rel, sleafbuf); + + /* + * Re-lock the leaf page, and start over to use our stack + * within _bt_mark_page_halfdead. We must do it that way + * because it's possible that leafbuf can no longer be + * deleted. We need to recheck. + * + * Note: We can't simply hold on to the sleafbuf lock instead, + * because it's barely possible that sleafbuf is not the same + * page as leafbuf. This happens when leafbuf split after our + * original lock was dropped, but before _bt_search finished + * its descent. We rely on the assumption that we'll find + * leafbuf isn't safe to delete anymore in this scenario. + * (Page deletion can cope with the stack being to the left of + * leafbuf, but not to the right of leafbuf.) + */ + _bt_lockbuf(rel, leafbuf, BT_WRITE); + continue; + } + + /* + * See if it's safe to delete the leaf page, and determine how + * many parent/internal pages above the leaf level will be + * deleted. If it's safe then _bt_mark_page_halfdead will also + * perform the first phase of deletion, which includes marking the + * leafbuf page half-dead. + */ + Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque)); + if (!_bt_mark_page_halfdead(rel, leafbuf, stack)) + { + _bt_relbuf(rel, leafbuf); + return; + } + } + + /* + * Then unlink it from its siblings. Each call to + * _bt_unlink_halfdead_page unlinks the topmost page from the subtree, + * making it shallower. Iterate until the leafbuf page is deleted. 
+ */ + rightsib_empty = false; + Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque)); + while (P_ISHALFDEAD(opaque)) + { + /* Check for interrupts in _bt_unlink_halfdead_page */ + if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno, + &rightsib_empty, vstate)) + { + /* + * _bt_unlink_halfdead_page should never fail, since we + * established that deletion is generally safe in + * _bt_mark_page_halfdead -- index must be corrupt. + * + * Note that _bt_unlink_halfdead_page already released the + * lock and pin on leafbuf for us. + */ + Assert(false); + return; + } + } + + Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque)); + + rightsib = opaque->btpo_next; + + _bt_relbuf(rel, leafbuf); + + /* + * Check here, as calling loops will have locks held, preventing + * interrupts from being processed. + */ + CHECK_FOR_INTERRUPTS(); + + /* + * The page has now been deleted. If its right sibling is completely + * empty, it's possible that the reason we haven't deleted it earlier + * is that it was the rightmost child of the parent. Now that we + * removed the downlink for this page, the right sibling might now be + * the only child of the parent, and could be removed. It would be + * picked up by the next vacuum anyway, but might as well try to + * remove it now, so loop back to process the right sibling. + * + * Note: This relies on the assumption that _bt_getstackbuf() will be + * able to reuse our original descent stack with a different child + * block (provided that the child block is to the right of the + * original leaf page reached by _bt_search()). It will even update + * the descent stack each time we loop around, avoiding repeated work. + */ + if (!rightsib_empty) + break; + + leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE); + } +} + +/* + * First stage of page deletion. + * + * Establish the height of the to-be-deleted subtree with leafbuf at its + * lowest level, remove the downlink to the subtree, and mark leafbuf + * half-dead. The final to-be-deleted subtree is usually just leafbuf itself, + * but may include additional internal pages (at most one per level of the + * tree below the root). + * + * Returns 'false' if leafbuf is unsafe to delete, usually because leafbuf is + * the rightmost child of its parent (and parent has more than one downlink). + * Returns 'true' when the first stage of page deletion completed + * successfully. + */ +static bool +_bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack) +{ + BlockNumber leafblkno; + BlockNumber leafrightsib; + BlockNumber topparent; + BlockNumber topparentrightsib; + ItemId itemid; + Page page; + BTPageOpaque opaque; + Buffer subtreeparent; + OffsetNumber poffset; + OffsetNumber nextoffset; + IndexTuple itup; + IndexTupleData trunctuple; + + page = BufferGetPage(leafbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) && + P_ISLEAF(opaque) && !P_IGNORE(opaque) && + P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page)); + + /* + * Save info about the leaf page. + */ + leafblkno = BufferGetBlockNumber(leafbuf); + leafrightsib = opaque->btpo_next; + + /* + * Before attempting to lock the parent page, check that the right sibling + * is not in half-dead state. A half-dead right sibling would have no + * downlink in the parent, which would be highly confusing later when we + * delete the downlink. It would fail the "right sibling of target page + * is also the next child in parent page" cross-check below. 
+ */ + if (_bt_rightsib_halfdeadflag(rel, leafrightsib)) + { + elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead", + leafblkno, leafrightsib); + return false; + } + + /* + * We cannot delete a page that is the rightmost child of its immediate + * parent, unless it is the only child --- in which case the parent has to + * be deleted too, and the same condition applies recursively to it. We + * have to check this condition all the way up before trying to delete, + * and lock the parent of the root of the to-be-deleted subtree (the + * "subtree parent"). _bt_lock_subtree_parent() locks the subtree parent + * for us. We remove the downlink to the "top parent" page (subtree root + * page) from the subtree parent page below. + * + * Initialize topparent to be leafbuf page now. The final to-be-deleted + * subtree is often a degenerate one page subtree consisting only of the + * leafbuf page. When that happens, the leafbuf page is the final subtree + * root page/top parent page. + */ + topparent = leafblkno; + topparentrightsib = leafrightsib; + if (!_bt_lock_subtree_parent(rel, leafblkno, stack, + &subtreeparent, &poffset, + &topparent, &topparentrightsib)) + return false; + + /* + * Check that the parent-page index items we're about to delete/overwrite + * in subtree parent page contain what we expect. This can fail if the + * index has become corrupt for some reason. We want to throw any error + * before entering the critical section --- otherwise it'd be a PANIC. + */ + page = BufferGetPage(subtreeparent); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + +#ifdef USE_ASSERT_CHECKING + + /* + * This is just an assertion because _bt_lock_subtree_parent should have + * guaranteed tuple has the expected contents + */ + itemid = PageGetItemId(page, poffset); + itup = (IndexTuple) PageGetItem(page, itemid); + Assert(BTreeTupleGetDownLink(itup) == topparent); +#endif + + nextoffset = OffsetNumberNext(poffset); + itemid = PageGetItemId(page, nextoffset); + itup = (IndexTuple) PageGetItem(page, itemid); + if (BTreeTupleGetDownLink(itup) != topparentrightsib) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"", + topparentrightsib, topparent, + BTreeTupleGetDownLink(itup), + BufferGetBlockNumber(subtreeparent), + RelationGetRelationName(rel)))); + + /* + * Any insert which would have gone on the leaf block will now go to its + * right sibling. In other words, the key space moves right. + */ + PredicateLockPageCombine(rel, leafblkno, leafrightsib); + + /* No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + /* + * Update parent of subtree. We want to delete the downlink to the top + * parent page/root of the subtree, and the *following* key. Easiest way + * is to copy the right sibling's downlink over the downlink that points + * to top parent page, and then delete the right sibling's original pivot + * tuple. + * + * Lanin and Shasha make the key space move left when deleting a page, + * whereas the key space moves right here. That's why we cannot simply + * delete the pivot tuple with the downlink to the top parent page. See + * nbtree/README. 
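+ * For example, if the pivot tuple being updated is (K1, downlink ->
+ * topparent) and the following pivot tuple is (K2, downlink ->
+ * topparentrightsib), then afterwards the parent keeps a single pivot
+ * (K1, downlink -> topparentrightsib) covering the key space of both:
+ * searches that previously descended to the top parent page now
+ * descend to its right sibling instead.  (K1 and K2 are purely
+ * illustrative key values.)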
+ */ + page = BufferGetPage(subtreeparent); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + itemid = PageGetItemId(page, poffset); + itup = (IndexTuple) PageGetItem(page, itemid); + BTreeTupleSetDownLink(itup, topparentrightsib); + + nextoffset = OffsetNumberNext(poffset); + PageIndexTupleDelete(page, nextoffset); + + /* + * Mark the leaf page as half-dead, and stamp it with a link to the top + * parent page. When the leaf page is also the top parent page, the link + * is set to InvalidBlockNumber. + */ + page = BufferGetPage(leafbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + opaque->btpo_flags |= BTP_HALF_DEAD; + + Assert(PageGetMaxOffsetNumber(page) == P_HIKEY); + MemSet(&trunctuple, 0, sizeof(IndexTupleData)); + trunctuple.t_info = sizeof(IndexTupleData); + if (topparent != leafblkno) + BTreeTupleSetTopParent(&trunctuple, topparent); + else + BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber); + + if (!PageIndexTupleOverwrite(page, P_HIKEY, (Item) &trunctuple, + IndexTupleSize(&trunctuple))) + elog(ERROR, "could not overwrite high key in half-dead page"); + + /* Must mark buffers dirty before XLogInsert */ + MarkBufferDirty(subtreeparent); + MarkBufferDirty(leafbuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_btree_mark_page_halfdead xlrec; + XLogRecPtr recptr; + + xlrec.poffset = poffset; + xlrec.leafblk = leafblkno; + if (topparent != leafblkno) + xlrec.topparent = topparent; + else + xlrec.topparent = InvalidBlockNumber; + + XLogBeginInsert(); + XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT); + XLogRegisterBuffer(1, subtreeparent, REGBUF_STANDARD); + + page = BufferGetPage(leafbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + xlrec.leftblk = opaque->btpo_prev; + xlrec.rightblk = opaque->btpo_next; + + XLogRegisterData((char *) &xlrec, SizeOfBtreeMarkPageHalfDead); + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD); + + page = BufferGetPage(subtreeparent); + PageSetLSN(page, recptr); + page = BufferGetPage(leafbuf); + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + _bt_relbuf(rel, subtreeparent); + return true; +} + +/* + * Second stage of page deletion. + * + * Unlinks a single page (in the subtree undergoing deletion) from its + * siblings. Also marks the page deleted. + * + * To get rid of the whole subtree, including the leaf page itself, call here + * until the leaf page is deleted. The original "top parent" established in + * the first stage of deletion is deleted in the first call here, while the + * leaf page is deleted in the last call here. Note that the leaf page itself + * is often the initial top parent page. + * + * Returns 'false' if the page could not be unlinked (shouldn't happen). If + * the right sibling of the current target page is empty, *rightsib_empty is + * set to true, allowing caller to delete the target's right sibling page in + * passing. Note that *rightsib_empty is only actually used by caller when + * target page is leafbuf, following last call here for leafbuf/the subtree + * containing leafbuf. (We always set *rightsib_empty for caller, just to be + * consistent.) + * + * Must hold pin and lock on leafbuf at entry (read or write doesn't matter). + * On success exit, we'll be holding pin and write lock. On failure exit, + * we'll release both pin and lock before returning (we define it that way + * to avoid having to reacquire a lock we already released). 
+ */ +static bool +_bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, + bool *rightsib_empty, BTVacState *vstate) +{ + BlockNumber leafblkno = BufferGetBlockNumber(leafbuf); + IndexBulkDeleteResult *stats = vstate->stats; + BlockNumber leafleftsib; + BlockNumber leafrightsib; + BlockNumber target; + BlockNumber leftsib; + BlockNumber rightsib; + Buffer lbuf = InvalidBuffer; + Buffer buf; + Buffer rbuf; + Buffer metabuf = InvalidBuffer; + Page metapg = NULL; + BTMetaPageData *metad = NULL; + ItemId itemid; + Page page; + BTPageOpaque opaque; + FullTransactionId safexid; + bool rightsib_is_rightmost; + uint32 targetlevel; + IndexTuple leafhikey; + BlockNumber leaftopparent; + + page = BufferGetPage(leafbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque) && P_ISHALFDEAD(opaque)); + + /* + * Remember some information about the leaf page. + */ + itemid = PageGetItemId(page, P_HIKEY); + leafhikey = (IndexTuple) PageGetItem(page, itemid); + target = BTreeTupleGetTopParent(leafhikey); + leafleftsib = opaque->btpo_prev; + leafrightsib = opaque->btpo_next; + + _bt_unlockbuf(rel, leafbuf); + + /* + * Check here, as calling loops will have locks held, preventing + * interrupts from being processed. + */ + CHECK_FOR_INTERRUPTS(); + + /* Unlink the current top parent of the subtree */ + if (!BlockNumberIsValid(target)) + { + /* Target is leaf page (or leaf page is top parent, if you prefer) */ + target = leafblkno; + + buf = leafbuf; + leftsib = leafleftsib; + targetlevel = 0; + } + else + { + /* Target is the internal page taken from leaf's top parent link */ + Assert(target != leafblkno); + + /* Fetch the block number of the target's left sibling */ + buf = _bt_getbuf(rel, target, BT_READ); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + leftsib = opaque->btpo_prev; + targetlevel = opaque->btpo_level; + Assert(targetlevel > 0); + + /* + * To avoid deadlocks, we'd better drop the target page lock before + * going further. + */ + _bt_unlockbuf(rel, buf); + } + + /* + * We have to lock the pages we need to modify in the standard order: + * moving right, then up. Else we will deadlock against other writers. + * + * So, first lock the leaf page, if it's not the target. Then find and + * write-lock the current left sibling of the target page. The sibling + * that was current a moment ago could have split, so we may have to move + * right. + */ + if (target != leafblkno) + _bt_lockbuf(rel, leafbuf, BT_WRITE); + if (leftsib != P_NONE) + { + lbuf = _bt_getbuf(rel, leftsib, BT_WRITE); + page = BufferGetPage(lbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + while (P_ISDELETED(opaque) || opaque->btpo_next != target) + { + bool leftsibvalid = true; + + /* + * Before we follow the link from the page that was the left + * sibling mere moments ago, validate its right link. This + * reduces the opportunities for loop to fail to ever make any + * progress in the presence of index corruption. + * + * Note: we rely on the assumption that there can only be one + * vacuum process running at a time (against the same index). 
+ */ + if (P_RIGHTMOST(opaque) || P_ISDELETED(opaque) || + leftsib == opaque->btpo_next) + leftsibvalid = false; + + leftsib = opaque->btpo_next; + _bt_relbuf(rel, lbuf); + + if (!leftsibvalid) + { + if (target != leafblkno) + { + /* we have only a pin on target, but pin+lock on leafbuf */ + ReleaseBuffer(buf); + _bt_relbuf(rel, leafbuf); + } + else + { + /* we have only a pin on leafbuf */ + ReleaseBuffer(leafbuf); + } + + ereport(LOG, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("valid left sibling for deletion target could not be located: " + "left sibling %u of target %u with leafblkno %u and scanblkno %u in index \"%s\"", + leftsib, target, leafblkno, scanblkno, + RelationGetRelationName(rel)))); + + return false; + } + + CHECK_FOR_INTERRUPTS(); + + /* step right one page */ + lbuf = _bt_getbuf(rel, leftsib, BT_WRITE); + page = BufferGetPage(lbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + } + } + else + lbuf = InvalidBuffer; + + /* Next write-lock the target page itself */ + _bt_lockbuf(rel, buf, BT_WRITE); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* + * Check page is still empty etc, else abandon deletion. This is just for + * paranoia's sake; a half-dead page cannot resurrect because there can be + * only one vacuum process running at a time. + */ + if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque)) + elog(ERROR, "target page changed status unexpectedly in block %u of index \"%s\"", + target, RelationGetRelationName(rel)); + + if (opaque->btpo_prev != leftsib) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("target page left link unexpectedly changed from %u to %u in block %u of index \"%s\"", + leftsib, opaque->btpo_prev, target, + RelationGetRelationName(rel)))); + + if (target == leafblkno) + { + if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) || + !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque)) + elog(ERROR, "target leaf page changed status unexpectedly in block %u of index \"%s\"", + target, RelationGetRelationName(rel)); + + /* Leaf page is also target page: don't set leaftopparent */ + leaftopparent = InvalidBlockNumber; + } + else + { + IndexTuple finaldataitem; + + if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) || + P_ISLEAF(opaque)) + elog(ERROR, "target internal page on level %u changed status unexpectedly in block %u of index \"%s\"", + targetlevel, target, RelationGetRelationName(rel)); + + /* Target is internal: set leaftopparent for next call here... */ + itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque)); + finaldataitem = (IndexTuple) PageGetItem(page, itemid); + leaftopparent = BTreeTupleGetDownLink(finaldataitem); + /* ...except when it would be a redundant pointer-to-self */ + if (leaftopparent == leafblkno) + leaftopparent = InvalidBlockNumber; + } + + /* No leaftopparent for level 0 (leaf page) or level 1 target */ + Assert(!BlockNumberIsValid(leaftopparent) || targetlevel > 1); + + /* + * And next write-lock the (current) right sibling. 
+ */ + rightsib = opaque->btpo_next; + rbuf = _bt_getbuf(rel, rightsib, BT_WRITE); + page = BufferGetPage(rbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (opaque->btpo_prev != target) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("right sibling's left-link doesn't match: " + "block %u links to %u instead of expected %u in index \"%s\"", + rightsib, opaque->btpo_prev, target, + RelationGetRelationName(rel)))); + rightsib_is_rightmost = P_RIGHTMOST(opaque); + *rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page)); + + /* + * If we are deleting the next-to-last page on the target's level, then + * the rightsib is a candidate to become the new fast root. (In theory, it + * might be possible to push the fast root even further down, but the odds + * of doing so are slim, and the locking considerations daunting.) + * + * We can safely acquire a lock on the metapage here --- see comments for + * _bt_newroot(). + */ + if (leftsib == P_NONE && rightsib_is_rightmost) + { + page = BufferGetPage(rbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (P_RIGHTMOST(opaque)) + { + /* rightsib will be the only one left on the level */ + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + metapg = BufferGetPage(metabuf); + metad = BTPageGetMeta(metapg); + + /* + * The expected case here is btm_fastlevel == targetlevel+1; if + * the fastlevel is <= targetlevel, something is wrong, and we + * choose to overwrite it to fix it. + */ + if (metad->btm_fastlevel > targetlevel + 1) + { + /* no update wanted */ + _bt_relbuf(rel, metabuf); + metabuf = InvalidBuffer; + } + } + } + + /* + * Here we begin doing the deletion. + */ + + /* No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + /* + * Update siblings' side-links. Note the target page's side-links will + * continue to point to the siblings. Asserts here are just rechecking + * things we already verified above. + */ + if (BufferIsValid(lbuf)) + { + page = BufferGetPage(lbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + Assert(opaque->btpo_next == target); + opaque->btpo_next = rightsib; + } + page = BufferGetPage(rbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + Assert(opaque->btpo_prev == target); + opaque->btpo_prev = leftsib; + + /* + * If we deleted a parent of the targeted leaf page, instead of the leaf + * itself, update the leaf to point to the next remaining child in the + * subtree. + * + * Note: We rely on the fact that a buffer pin on the leaf page has been + * held since leafhikey was initialized. This is safe, though only + * because the page was already half-dead at that point. The leaf page + * cannot have been modified by any other backend during the period when + * no lock was held. + */ + if (target != leafblkno) + BTreeTupleSetTopParent(leafhikey, leaftopparent); + + /* + * Mark the page itself deleted. It can be recycled when all current + * transactions are gone. Storing GetTopTransactionId() would work, but + * we're in VACUUM and would not otherwise have an XID. Having already + * updated links to the target, ReadNextFullTransactionId() suffices as an + * upper bound. Any scan having retained a now-stale link is advertising + * in its PGPROC an xmin less than or equal to the value we read here. It + * will continue to do so, holding back the xmin horizon, for the duration + * of that scan. 
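+ * (The value read here is stored in the deleted page as its safexid,
+ * just below.  It is what's later compared against the xmin horizon to
+ * decide whether the page can finally be recycled -- see
+ * BTPageIsRecyclable() and _bt_pendingfsm_finalize().)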
+ */ + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque)); + + /* + * Store upper bound XID that's used to determine when deleted page is no + * longer needed as a tombstone + */ + safexid = ReadNextFullTransactionId(); + BTPageSetDeleted(page, safexid); + opaque->btpo_cycleid = 0; + + /* And update the metapage, if needed */ + if (BufferIsValid(metabuf)) + { + /* upgrade metapage if needed */ + if (metad->btm_version < BTREE_NOVAC_VERSION) + _bt_upgrademetapage(metapg); + metad->btm_fastroot = rightsib; + metad->btm_fastlevel = targetlevel; + MarkBufferDirty(metabuf); + } + + /* Must mark buffers dirty before XLogInsert */ + MarkBufferDirty(rbuf); + MarkBufferDirty(buf); + if (BufferIsValid(lbuf)) + MarkBufferDirty(lbuf); + if (target != leafblkno) + MarkBufferDirty(leafbuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_btree_unlink_page xlrec; + xl_btree_metadata xlmeta; + uint8 xlinfo; + XLogRecPtr recptr; + + XLogBeginInsert(); + + XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT); + if (BufferIsValid(lbuf)) + XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD); + XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD); + if (target != leafblkno) + XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT); + + /* information stored on the target/to-be-unlinked block */ + xlrec.leftsib = leftsib; + xlrec.rightsib = rightsib; + xlrec.level = targetlevel; + xlrec.safexid = safexid; + + /* information needed to recreate the leaf block (if not the target) */ + xlrec.leafleftsib = leafleftsib; + xlrec.leafrightsib = leafrightsib; + xlrec.leaftopparent = leaftopparent; + + XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage); + + if (BufferIsValid(metabuf)) + { + XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); + + Assert(metad->btm_version >= BTREE_NOVAC_VERSION); + xlmeta.version = metad->btm_version; + xlmeta.root = metad->btm_root; + xlmeta.level = metad->btm_level; + xlmeta.fastroot = metad->btm_fastroot; + xlmeta.fastlevel = metad->btm_fastlevel; + xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages; + xlmeta.allequalimage = metad->btm_allequalimage; + + XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata)); + xlinfo = XLOG_BTREE_UNLINK_PAGE_META; + } + else + xlinfo = XLOG_BTREE_UNLINK_PAGE; + + recptr = XLogInsert(RM_BTREE_ID, xlinfo); + + if (BufferIsValid(metabuf)) + { + PageSetLSN(metapg, recptr); + } + page = BufferGetPage(rbuf); + PageSetLSN(page, recptr); + page = BufferGetPage(buf); + PageSetLSN(page, recptr); + if (BufferIsValid(lbuf)) + { + page = BufferGetPage(lbuf); + PageSetLSN(page, recptr); + } + if (target != leafblkno) + { + page = BufferGetPage(leafbuf); + PageSetLSN(page, recptr); + } + } + + END_CRIT_SECTION(); + + /* release metapage */ + if (BufferIsValid(metabuf)) + _bt_relbuf(rel, metabuf); + + /* release siblings */ + if (BufferIsValid(lbuf)) + _bt_relbuf(rel, lbuf); + _bt_relbuf(rel, rbuf); + + /* If the target is not leafbuf, we're done with it now -- release it */ + if (target != leafblkno) + _bt_relbuf(rel, buf); + + /* + * Maintain pages_newly_deleted, which is simply the number of pages + * deleted by the ongoing VACUUM operation. + * + * Maintain pages_deleted in a way that takes into account how + * btvacuumpage() will count deleted pages that have yet to become + * scanblkno -- only count page when it's not going to get that treatment + * later on. 
+ */ + stats->pages_newly_deleted++; + if (target <= scanblkno) + stats->pages_deleted++; + + /* + * Remember information about the target page (now a newly deleted page) + * in dedicated vstate space for later. The page will be considered as a + * candidate to place in the FSM at the end of the current btvacuumscan() + * call. + */ + _bt_pendingfsm_add(vstate, target, safexid); + + return true; +} + +/* + * Establish how tall the to-be-deleted subtree will be during the first stage + * of page deletion. + * + * Caller's child argument is the block number of the page caller wants to + * delete (this is leafbuf's block number, except when we're called + * recursively). stack is a search stack leading to it. Note that we will + * update the stack entry(s) to reflect current downlink positions --- this is + * similar to the corresponding point in page split handling. + * + * If "first stage" caller cannot go ahead with deleting _any_ pages, returns + * false. Returns true on success, in which case caller can use certain + * details established here to perform the first stage of deletion. This + * function is the last point at which page deletion may be deemed unsafe + * (barring index corruption, or unexpected concurrent page deletions). + * + * We write lock the parent of the root of the to-be-deleted subtree for + * caller on success (i.e. we leave our lock on the *subtreeparent buffer for + * caller). Caller will have to remove a downlink from *subtreeparent. We + * also set a *subtreeparent offset number in *poffset, to indicate the + * location of the pivot tuple that contains the relevant downlink. + * + * The root of the to-be-deleted subtree is called the "top parent". Note + * that the leafbuf page is often the final "top parent" page (you can think + * of the leafbuf page as a degenerate single page subtree when that happens). + * Caller should initialize *topparent to the target leafbuf page block number + * (while *topparentrightsib should be set to leafbuf's right sibling block + * number). We will update *topparent (and *topparentrightsib) for caller + * here, though only when it turns out that caller will delete at least one + * internal page (i.e. only when caller needs to store a valid link to the top + * parent block in the leafbuf page using BTreeTupleSetTopParent()). + */ +static bool +_bt_lock_subtree_parent(Relation rel, BlockNumber child, BTStack stack, + Buffer *subtreeparent, OffsetNumber *poffset, + BlockNumber *topparent, BlockNumber *topparentrightsib) +{ + BlockNumber parent, + leftsibparent; + OffsetNumber parentoffset, + maxoff; + Buffer pbuf; + Page page; + BTPageOpaque opaque; + + /* + * Locate the pivot tuple whose downlink points to "child". Write lock + * the parent page itself. + */ + pbuf = _bt_getstackbuf(rel, stack, child); + if (pbuf == InvalidBuffer) + { + /* + * Failed to "re-find" a pivot tuple whose downlink matched our child + * block number on the parent level -- the index must be corrupt. + * Don't even try to delete the leafbuf subtree. Just report the + * issue and press on with vacuuming the index. + * + * Note: _bt_getstackbuf() recovers from concurrent page splits that + * take place on the parent level. Its approach is a near-exhaustive + * linear search. This also gives it a surprisingly good chance of + * recovering in the event of a buggy or inconsistent opclass. But we + * don't rely on that here. 
+ */ + ereport(LOG, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u", + RelationGetRelationName(rel), child))); + return false; + } + + parent = stack->bts_blkno; + parentoffset = stack->bts_offset; + + page = BufferGetPage(pbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + maxoff = PageGetMaxOffsetNumber(page); + leftsibparent = opaque->btpo_prev; + + /* + * _bt_getstackbuf() completes page splits on returned parent buffer when + * required. + * + * In general it's a bad idea for VACUUM to use up more disk space, which + * is why page deletion does not finish incomplete page splits most of the + * time. We allow this limited exception because the risk is much lower, + * and the potential downside of not proceeding is much higher: A single + * internal page with the INCOMPLETE_SPLIT flag set might otherwise + * prevent us from deleting hundreds of empty leaf pages from one level + * down. + */ + Assert(!P_INCOMPLETE_SPLIT(opaque)); + + if (parentoffset < maxoff) + { + /* + * Child is not the rightmost child in parent, so it's safe to delete + * the subtree whose root/topparent is child page + */ + *subtreeparent = pbuf; + *poffset = parentoffset; + return true; + } + + /* + * Child is the rightmost child of parent. + * + * Since it's the rightmost child of parent, deleting the child (or + * deleting the subtree whose root/topparent is the child page) is only + * safe when it's also possible to delete the parent. + */ + Assert(parentoffset == maxoff); + if (parentoffset != P_FIRSTDATAKEY(opaque) || P_RIGHTMOST(opaque)) + { + /* + * Child isn't parent's only child, or parent is rightmost on its + * entire level. Definitely cannot delete any pages. + */ + _bt_relbuf(rel, pbuf); + return false; + } + + /* + * Now make sure that the parent deletion is itself safe by examining the + * child's grandparent page. Recurse, passing the parent page as the + * child page (child's grandparent is the parent on the next level up). If + * parent deletion is unsafe, then child deletion must also be unsafe (in + * which case caller cannot delete any pages at all). + */ + *topparent = parent; + *topparentrightsib = opaque->btpo_next; + + /* + * Release lock on parent before recursing. + * + * It's OK to release page locks on parent before recursive call locks + * grandparent. An internal page can only acquire an entry if the child + * is split, but that cannot happen as long as we still hold a lock on the + * leafbuf page. + */ + _bt_relbuf(rel, pbuf); + + /* + * Before recursing, check that the left sibling of parent (if any) is not + * marked with INCOMPLETE_SPLIT flag first (must do so after we drop the + * parent lock). + * + * Note: We deliberately avoid completing incomplete splits here. + */ + if (_bt_leftsib_splitflag(rel, leftsibparent, parent)) + return false; + + /* Recurse to examine child page's grandparent page */ + return _bt_lock_subtree_parent(rel, parent, stack->bts_parent, + subtreeparent, poffset, + topparent, topparentrightsib); +} + +/* + * Initialize local memory state used by VACUUM for _bt_pendingfsm_finalize + * optimization. + * + * Called at the start of a btvacuumscan(). Caller's cleanuponly argument + * indicates if ongoing VACUUM has not (and will not) call btbulkdelete(). + * + * We expect to allocate memory inside VACUUM's top-level memory context here. + * The working buffer is subject to a limit based on work_mem. 
Our strategy + * when the array can no longer grow within the bounds of that limit is to + * stop saving additional newly deleted pages, while proceeding as usual with + * the pages that we can fit. + */ +void +_bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly) +{ + int64 maxbufsize; + + /* + * Don't bother with optimization in cleanup-only case -- we don't expect + * any newly deleted pages. Besides, cleanup-only calls to btvacuumscan() + * can only take place because this optimization didn't work out during + * the last VACUUM. + */ + if (cleanuponly) + return; + + /* + * Cap maximum size of array so that we always respect work_mem. Avoid + * int overflow here. + */ + vstate->bufsize = 256; + maxbufsize = (work_mem * 1024L) / sizeof(BTPendingFSM); + maxbufsize = Min(maxbufsize, INT_MAX); + maxbufsize = Min(maxbufsize, MaxAllocSize / sizeof(BTPendingFSM)); + /* Stay sane with small work_mem */ + maxbufsize = Max(maxbufsize, vstate->bufsize); + vstate->maxbufsize = maxbufsize; + + /* Allocate buffer, indicate that there are currently 0 pending pages */ + vstate->pendingpages = palloc(sizeof(BTPendingFSM) * vstate->bufsize); + vstate->npendingpages = 0; +} + +/* + * Place any newly deleted pages (i.e. pages that _bt_pagedel() deleted during + * the ongoing VACUUM operation) into the free space map -- though only when + * it is actually safe to do so by now. + * + * Called at the end of a btvacuumscan(), just before free space map vacuuming + * takes place. + * + * Frees memory allocated by _bt_pendingfsm_init(), if any. + */ +void +_bt_pendingfsm_finalize(Relation rel, BTVacState *vstate) +{ + IndexBulkDeleteResult *stats = vstate->stats; + + Assert(stats->pages_newly_deleted >= vstate->npendingpages); + + if (vstate->npendingpages == 0) + { + /* Just free memory when nothing to do */ + if (vstate->pendingpages) + pfree(vstate->pendingpages); + + return; + } + +#ifdef DEBUG_BTREE_PENDING_FSM + + /* + * Debugging aid: Sleep for 5 seconds to greatly increase the chances of + * placing pending pages in the FSM. Note that the optimization will + * never be effective without some other backend concurrently consuming an + * XID. + */ + pg_usleep(5000000L); +#endif + + /* + * Recompute VACUUM XID boundaries. + * + * We don't actually care about the oldest non-removable XID. Computing + * the oldest such XID has a useful side-effect that we rely on: it + * forcibly updates the XID horizon state for this backend. This step is + * essential; GlobalVisCheckRemovableFullXid() will not reliably recognize + * that it is now safe to recycle newly deleted pages without this step. + */ + GetOldestNonRemovableTransactionId(NULL); + + for (int i = 0; i < vstate->npendingpages; i++) + { + BlockNumber target = vstate->pendingpages[i].target; + FullTransactionId safexid = vstate->pendingpages[i].safexid; + + /* + * Do the equivalent of checking BTPageIsRecyclable(), but without + * accessing the page again a second time. + * + * Give up on finding the first non-recyclable page -- all later pages + * must be non-recyclable too, since _bt_pendingfsm_add() adds pages + * to the array in safexid order. 
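+ * For example (with purely illustrative numbers): if the array holds
+ * pages whose safexids are 100, 105 and 110, and only XIDs older than
+ * 107 are removable at this point, then the first two pages are
+ * recorded in the FSM and the loop stops at the third.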
+ */ + if (!GlobalVisCheckRemovableFullXid(NULL, safexid)) + break; + + RecordFreeIndexPage(rel, target); + stats->pages_free++; + } + + pfree(vstate->pendingpages); +} + +/* + * Maintain array of pages that were deleted during current btvacuumscan() + * call, for use in _bt_pendingfsm_finalize() + */ +static void +_bt_pendingfsm_add(BTVacState *vstate, + BlockNumber target, + FullTransactionId safexid) +{ + Assert(vstate->npendingpages <= vstate->bufsize); + Assert(vstate->bufsize <= vstate->maxbufsize); + +#ifdef USE_ASSERT_CHECKING + + /* + * Verify an assumption made by _bt_pendingfsm_finalize(): pages from the + * array will always be in safexid order (since that is the order that we + * save them in here) + */ + if (vstate->npendingpages > 0) + { + FullTransactionId lastsafexid = + vstate->pendingpages[vstate->npendingpages - 1].safexid; + + Assert(FullTransactionIdFollowsOrEquals(safexid, lastsafexid)); + } +#endif + + /* + * If temp buffer reaches maxbufsize/work_mem capacity then we discard + * information about this page. + * + * Note that this also covers the case where we opted to not use the + * optimization in _bt_pendingfsm_init(). + */ + if (vstate->npendingpages == vstate->maxbufsize) + return; + + /* Consider enlarging buffer */ + if (vstate->npendingpages == vstate->bufsize) + { + int newbufsize = vstate->bufsize * 2; + + /* Respect work_mem */ + if (newbufsize > vstate->maxbufsize) + newbufsize = vstate->maxbufsize; + + vstate->bufsize = newbufsize; + vstate->pendingpages = + repalloc(vstate->pendingpages, + sizeof(BTPendingFSM) * vstate->bufsize); + } + + /* Save metadata for newly deleted page */ + vstate->pendingpages[vstate->npendingpages].target = target; + vstate->pendingpages[vstate->npendingpages].safexid = safexid; + vstate->npendingpages++; +} diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c new file mode 100644 index 0000000..1360ab8 --- /dev/null +++ b/src/backend/access/nbtree/nbtree.c @@ -0,0 +1,1446 @@ +/*------------------------------------------------------------------------- + * + * nbtree.c + * Implementation of Lehman and Yao's btree management algorithm for + * Postgres. + * + * NOTES + * This file contains only the public interface routines. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtree.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/nbtxlog.h" +#include "access/relscan.h" +#include "access/xlog.h" +#include "commands/progress.h" +#include "commands/vacuum.h" +#include "miscadmin.h" +#include "nodes/execnodes.h" +#include "pgstat.h" +#include "postmaster/autovacuum.h" +#include "storage/condition_variable.h" +#include "storage/indexfsm.h" +#include "storage/ipc.h" +#include "storage/lmgr.h" +#include "storage/smgr.h" +#include "utils/builtins.h" +#include "utils/index_selfuncs.h" +#include "utils/memutils.h" + + +/* + * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started. + * + * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to + * a new page; others must wait. + * + * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan + * to a new page; some process can start doing that. + * + * BTPARALLEL_DONE indicates that the scan is complete (including error exit). 
+ * We reach this state once for every distinct combination of array keys. + */ +typedef enum +{ + BTPARALLEL_NOT_INITIALIZED, + BTPARALLEL_ADVANCING, + BTPARALLEL_IDLE, + BTPARALLEL_DONE +} BTPS_State; + +/* + * BTParallelScanDescData contains btree specific shared information required + * for parallel scan. + */ +typedef struct BTParallelScanDescData +{ + BlockNumber btps_scanPage; /* latest or next page to be scanned */ + BTPS_State btps_pageStatus; /* indicates whether next page is + * available for scan. see above for + * possible states of parallel scan. */ + int btps_arrayKeyCount; /* count indicating number of array scan + * keys processed by parallel scan */ + slock_t btps_mutex; /* protects above variables */ + ConditionVariable btps_cv; /* used to synchronize parallel scan */ +} BTParallelScanDescData; + +typedef struct BTParallelScanDescData *BTParallelScanDesc; + + +static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, void *callback_state, + BTCycleId cycleid); +static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno); +static BTVacuumPosting btreevacuumposting(BTVacState *vstate, + IndexTuple posting, + OffsetNumber updatedoffset, + int *nremaining); + + +/* + * Btree handler function: return IndexAmRoutine with access method parameters + * and callbacks. + */ +Datum +bthandler(PG_FUNCTION_ARGS) +{ + IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); + + amroutine->amstrategies = BTMaxStrategyNumber; + amroutine->amsupport = BTNProcs; + amroutine->amoptsprocnum = BTOPTIONS_PROC; + amroutine->amcanorder = true; + amroutine->amcanorderbyop = false; + amroutine->amcanbackward = true; + amroutine->amcanunique = true; + amroutine->amcanmulticol = true; + amroutine->amoptionalkey = true; + amroutine->amsearcharray = true; + amroutine->amsearchnulls = true; + amroutine->amstorage = false; + amroutine->amclusterable = true; + amroutine->ampredlocks = true; + amroutine->amcanparallel = true; + amroutine->amcaninclude = true; + amroutine->amusemaintenanceworkmem = false; + amroutine->amparallelvacuumoptions = + VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP; + amroutine->amkeytype = InvalidOid; + + amroutine->ambuild = btbuild; + amroutine->ambuildempty = btbuildempty; + amroutine->aminsert = btinsert; + amroutine->ambulkdelete = btbulkdelete; + amroutine->amvacuumcleanup = btvacuumcleanup; + amroutine->amcanreturn = btcanreturn; + amroutine->amcostestimate = btcostestimate; + amroutine->amoptions = btoptions; + amroutine->amproperty = btproperty; + amroutine->ambuildphasename = btbuildphasename; + amroutine->amvalidate = btvalidate; + amroutine->amadjustmembers = btadjustmembers; + amroutine->ambeginscan = btbeginscan; + amroutine->amrescan = btrescan; + amroutine->amgettuple = btgettuple; + amroutine->amgetbitmap = btgetbitmap; + amroutine->amendscan = btendscan; + amroutine->ammarkpos = btmarkpos; + amroutine->amrestrpos = btrestrpos; + amroutine->amestimateparallelscan = btestimateparallelscan; + amroutine->aminitparallelscan = btinitparallelscan; + amroutine->amparallelrescan = btparallelrescan; + + PG_RETURN_POINTER(amroutine); +} + +/* + * btbuildempty() -- build an empty btree index in the initialization fork + */ +void +btbuildempty(Relation index) +{ + Page metapage; + + /* Construct metapage. */ + metapage = (Page) palloc(BLCKSZ); + _bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false)); + + /* + * Write the page and log it. 
It might seem that an immediate sync would + * be sufficient to guarantee that the file exists on disk, but recovery + * itself might remove it while replaying, for example, an + * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record. Therefore, we need + * this even when wal_level=minimal. + */ + PageSetChecksumInplace(metapage, BTREE_METAPAGE); + smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE, + (char *) metapage, true); + log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, + BTREE_METAPAGE, metapage, true); + + /* + * An immediate sync is required even if we xlog'd the page, because the + * write did not go through shared_buffers and therefore a concurrent + * checkpoint may have moved the redo pointer past our xlog record. + */ + smgrimmedsync(index->rd_smgr, INIT_FORKNUM); +} + +/* + * btinsert() -- insert an index tuple into a btree. + * + * Descend the tree recursively, find the appropriate location for our + * new tuple, and put it there. + */ +bool +btinsert(Relation rel, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + IndexInfo *indexInfo) +{ + bool result; + IndexTuple itup; + + /* generate an index tuple */ + itup = index_form_tuple(RelationGetDescr(rel), values, isnull); + itup->t_tid = *ht_ctid; + + result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel); + + pfree(itup); + + return result; +} + +/* + * btgettuple() -- Get the next tuple in the scan. + */ +bool +btgettuple(IndexScanDesc scan, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + bool res; + + /* btree indexes are never lossy */ + scan->xs_recheck = false; + + /* + * If we have any array keys, initialize them during first call for a + * scan. We can't do this in btrescan because we don't know the scan + * direction at that time. + */ + if (so->numArrayKeys && !BTScanPosIsValid(so->currPos)) + { + /* punt if we have any unsatisfiable array keys */ + if (so->numArrayKeys < 0) + return false; + + _bt_start_array_keys(scan, dir); + } + + /* This loop handles advancing to the next array elements, if any */ + do + { + /* + * If we've already initialized this scan, we can just advance it in + * the appropriate direction. If we haven't done so yet, we call + * _bt_first() to get the first item in the scan. + */ + if (!BTScanPosIsValid(so->currPos)) + res = _bt_first(scan, dir); + else + { + /* + * Check to see if we should kill the previously-fetched tuple. + */ + if (scan->kill_prior_tuple) + { + /* + * Yes, remember it for later. (We'll deal with all such + * tuples at once right before leaving the index page.) The + * test for numKilled overrun is not just paranoia: if the + * caller reverses direction in the indexscan then the same + * item might get entered multiple times. It's not worth + * trying to optimize that, so we don't detect it, but instead + * just forget any excess entries. + */ + if (so->killedItems == NULL) + so->killedItems = (int *) + palloc(MaxTIDsPerBTreePage * sizeof(int)); + if (so->numKilled < MaxTIDsPerBTreePage) + so->killedItems[so->numKilled++] = so->currPos.itemIndex; + } + + /* + * Now continue the scan. + */ + res = _bt_next(scan, dir); + } + + /* If we have a tuple, return it ... */ + if (res) + break; + /* ... 
otherwise see if we have more array keys to deal with */ + } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir)); + + return res; +} + +/* + * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap + */ +int64 +btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + int64 ntids = 0; + ItemPointer heapTid; + + /* + * If we have any array keys, initialize them. + */ + if (so->numArrayKeys) + { + /* punt if we have any unsatisfiable array keys */ + if (so->numArrayKeys < 0) + return ntids; + + _bt_start_array_keys(scan, ForwardScanDirection); + } + + /* This loop handles advancing to the next array elements, if any */ + do + { + /* Fetch the first page & tuple */ + if (_bt_first(scan, ForwardScanDirection)) + { + /* Save tuple ID, and continue scanning */ + heapTid = &scan->xs_heaptid; + tbm_add_tuples(tbm, heapTid, 1, false); + ntids++; + + for (;;) + { + /* + * Advance to next tuple within page. This is the same as the + * easy case in _bt_next(). + */ + if (++so->currPos.itemIndex > so->currPos.lastItem) + { + /* let _bt_next do the heavy lifting */ + if (!_bt_next(scan, ForwardScanDirection)) + break; + } + + /* Save tuple ID, and continue scanning */ + heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid; + tbm_add_tuples(tbm, heapTid, 1, false); + ntids++; + } + } + /* Now see if we have more array keys to deal with */ + } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection)); + + return ntids; +} + +/* + * btbeginscan() -- start a scan on a btree index + */ +IndexScanDesc +btbeginscan(Relation rel, int nkeys, int norderbys) +{ + IndexScanDesc scan; + BTScanOpaque so; + + /* no order by operators allowed */ + Assert(norderbys == 0); + + /* get the scan */ + scan = RelationGetIndexScan(rel, nkeys, norderbys); + + /* allocate private workspace */ + so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData)); + BTScanPosInvalidate(so->currPos); + BTScanPosInvalidate(so->markPos); + if (scan->numberOfKeys > 0) + so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData)); + else + so->keyData = NULL; + + so->arrayKeyData = NULL; /* assume no array keys for now */ + so->numArrayKeys = 0; + so->arrayKeys = NULL; + so->arrayContext = NULL; + + so->killedItems = NULL; /* until needed */ + so->numKilled = 0; + + /* + * We don't know yet whether the scan will be index-only, so we do not + * allocate the tuple workspace arrays until btrescan. However, we set up + * scan->xs_itupdesc whether we'll need it or not, since that's so cheap. + */ + so->currTuples = so->markTuples = NULL; + + scan->xs_itupdesc = RelationGetDescr(rel); + + scan->opaque = so; + + return scan; +} + +/* + * btrescan() -- rescan an index relation + */ +void +btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, + ScanKey orderbys, int norderbys) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + /* we aren't holding any read locks, but gotta drop the pins */ + if (BTScanPosIsValid(so->currPos)) + { + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0) + _bt_killitems(scan); + BTScanPosUnpinIfPinned(so->currPos); + BTScanPosInvalidate(so->currPos); + } + + so->markItemIndex = -1; + so->arrayKeyCount = 0; + BTScanPosUnpinIfPinned(so->markPos); + BTScanPosInvalidate(so->markPos); + + /* + * Allocate tuple workspace arrays, if needed for an index-only scan and + * not already done in a previous rescan call. 
To save on palloc + * overhead, both workspaces are allocated as one palloc block; only this + * function and btendscan know that. + * + * NOTE: this data structure also makes it safe to return data from a + * "name" column, even though btree name_ops uses an underlying storage + * datatype of cstring. The risk there is that "name" is supposed to be + * padded to NAMEDATALEN, but the actual index tuple is probably shorter. + * However, since we only return data out of tuples sitting in the + * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some + * data out of the markTuples array --- running off the end of memory for + * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats + * adding special-case treatment for name_ops elsewhere. + */ + if (scan->xs_want_itup && so->currTuples == NULL) + { + so->currTuples = (char *) palloc(BLCKSZ * 2); + so->markTuples = so->currTuples + BLCKSZ; + } + + /* + * Reset the scan keys + */ + if (scankey && scan->numberOfKeys > 0) + memmove(scan->keyData, + scankey, + scan->numberOfKeys * sizeof(ScanKeyData)); + so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */ + + /* If any keys are SK_SEARCHARRAY type, set up array-key info */ + _bt_preprocess_array_keys(scan); +} + +/* + * btendscan() -- close down a scan + */ +void +btendscan(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + /* we aren't holding any read locks, but gotta drop the pins */ + if (BTScanPosIsValid(so->currPos)) + { + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0) + _bt_killitems(scan); + BTScanPosUnpinIfPinned(so->currPos); + } + + so->markItemIndex = -1; + BTScanPosUnpinIfPinned(so->markPos); + + /* No need to invalidate positions, the RAM is about to be freed. */ + + /* Release storage */ + if (so->keyData != NULL) + pfree(so->keyData); + /* so->arrayKeyData and so->arrayKeys are in arrayContext */ + if (so->arrayContext != NULL) + MemoryContextDelete(so->arrayContext); + if (so->killedItems != NULL) + pfree(so->killedItems); + if (so->currTuples != NULL) + pfree(so->currTuples); + /* so->markTuples should not be pfree'd, see btrescan */ + pfree(so); +} + +/* + * btmarkpos() -- save current scan position + */ +void +btmarkpos(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + /* There may be an old mark with a pin (but no lock). */ + BTScanPosUnpinIfPinned(so->markPos); + + /* + * Just record the current itemIndex. If we later step to next page + * before releasing the marked position, _bt_steppage makes a full copy of + * the currPos struct in markPos. If (as often happens) the mark is moved + * before we leave the page, we don't have to do that work. + */ + if (BTScanPosIsValid(so->currPos)) + so->markItemIndex = so->currPos.itemIndex; + else + { + BTScanPosInvalidate(so->markPos); + so->markItemIndex = -1; + } + + /* Also record the current positions of any array keys */ + if (so->numArrayKeys) + _bt_mark_array_keys(scan); +} + +/* + * btrestrpos() -- restore scan to last saved position + */ +void +btrestrpos(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + /* Restore the marked positions of any array keys */ + if (so->numArrayKeys) + _bt_restore_array_keys(scan); + + if (so->markItemIndex >= 0) + { + /* + * The scan has never moved to a new page since the last mark. Just + * restore the itemIndex. + * + * NB: In this case we can't count on anything in so->markPos to be + * accurate. 
+ */ + so->currPos.itemIndex = so->markItemIndex; + } + else + { + /* + * The scan moved to a new page after last mark or restore, and we are + * now restoring to the marked page. We aren't holding any read + * locks, but if we're still holding the pin for the current position, + * we must drop it. + */ + if (BTScanPosIsValid(so->currPos)) + { + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0) + _bt_killitems(scan); + BTScanPosUnpinIfPinned(so->currPos); + } + + if (BTScanPosIsValid(so->markPos)) + { + /* bump pin on mark buffer for assignment to current buffer */ + if (BTScanPosIsPinned(so->markPos)) + IncrBufferRefCount(so->markPos.buf); + memcpy(&so->currPos, &so->markPos, + offsetof(BTScanPosData, items[1]) + + so->markPos.lastItem * sizeof(BTScanPosItem)); + if (so->currTuples) + memcpy(so->currTuples, so->markTuples, + so->markPos.nextTupleOffset); + } + else + BTScanPosInvalidate(so->currPos); + } +} + +/* + * btestimateparallelscan -- estimate storage for BTParallelScanDescData + */ +Size +btestimateparallelscan(void) +{ + return sizeof(BTParallelScanDescData); +} + +/* + * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan + */ +void +btinitparallelscan(void *target) +{ + BTParallelScanDesc bt_target = (BTParallelScanDesc) target; + + SpinLockInit(&bt_target->btps_mutex); + bt_target->btps_scanPage = InvalidBlockNumber; + bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; + bt_target->btps_arrayKeyCount = 0; + ConditionVariableInit(&bt_target->btps_cv); +} + +/* + * btparallelrescan() -- reset parallel scan + */ +void +btparallelrescan(IndexScanDesc scan) +{ + BTParallelScanDesc btscan; + ParallelIndexScanDesc parallel_scan = scan->parallel_scan; + + Assert(parallel_scan); + + btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, + parallel_scan->ps_offset); + + /* + * In theory, we don't need to acquire the spinlock here, because there + * shouldn't be any other workers running at this point, but we do so for + * consistency. + */ + SpinLockAcquire(&btscan->btps_mutex); + btscan->btps_scanPage = InvalidBlockNumber; + btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; + btscan->btps_arrayKeyCount = 0; + SpinLockRelease(&btscan->btps_mutex); +} + +/* + * _bt_parallel_seize() -- Begin the process of advancing the scan to a new + * page. Other scans must wait until we call _bt_parallel_release() + * or _bt_parallel_done(). + * + * The return value is true if we successfully seized the scan and false + * if we did not. The latter case occurs if no pages remain for the current + * set of scankeys. + * + * If the return value is true, *pageno returns the next or current page + * of the scan (depending on the scan direction). An invalid block number + * means the scan hasn't yet started, and P_NONE means we've reached the end. + * The first time a participating process reaches the last page, it will return + * true and set *pageno to P_NONE; after that, further attempts to seize the + * scan will return false. + * + * Callers should ignore the value of pageno if the return value is false. 
+ */ +bool +_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTPS_State pageStatus; + bool exit_loop = false; + bool status = true; + ParallelIndexScanDesc parallel_scan = scan->parallel_scan; + BTParallelScanDesc btscan; + + *pageno = P_NONE; + + btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, + parallel_scan->ps_offset); + + while (1) + { + SpinLockAcquire(&btscan->btps_mutex); + pageStatus = btscan->btps_pageStatus; + + if (so->arrayKeyCount < btscan->btps_arrayKeyCount) + { + /* Parallel scan has already advanced to a new set of scankeys. */ + status = false; + } + else if (pageStatus == BTPARALLEL_DONE) + { + /* + * We're done with this set of scankeys. This may be the end, or + * there could be more sets to try. + */ + status = false; + } + else if (pageStatus != BTPARALLEL_ADVANCING) + { + /* + * We have successfully seized control of the scan for the purpose + * of advancing it to a new page! + */ + btscan->btps_pageStatus = BTPARALLEL_ADVANCING; + *pageno = btscan->btps_scanPage; + exit_loop = true; + } + SpinLockRelease(&btscan->btps_mutex); + if (exit_loop || !status) + break; + ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE); + } + ConditionVariableCancelSleep(); + + return status; +} + +/* + * _bt_parallel_release() -- Complete the process of advancing the scan to a + * new page. We now have the new value btps_scanPage; some other backend + * can now begin advancing the scan. + */ +void +_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page) +{ + ParallelIndexScanDesc parallel_scan = scan->parallel_scan; + BTParallelScanDesc btscan; + + btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, + parallel_scan->ps_offset); + + SpinLockAcquire(&btscan->btps_mutex); + btscan->btps_scanPage = scan_page; + btscan->btps_pageStatus = BTPARALLEL_IDLE; + SpinLockRelease(&btscan->btps_mutex); + ConditionVariableSignal(&btscan->btps_cv); +} + +/* + * _bt_parallel_done() -- Mark the parallel scan as complete. + * + * When there are no pages left to scan, this function should be called to + * notify other workers. Otherwise, they might wait forever for the scan to + * advance to the next page. + */ +void +_bt_parallel_done(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + ParallelIndexScanDesc parallel_scan = scan->parallel_scan; + BTParallelScanDesc btscan; + bool status_changed = false; + + /* Do nothing, for non-parallel scans */ + if (parallel_scan == NULL) + return; + + btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, + parallel_scan->ps_offset); + + /* + * Mark the parallel scan as done for this combination of scan keys, + * unless some other process already did so. See also + * _bt_advance_array_keys. + */ + SpinLockAcquire(&btscan->btps_mutex); + if (so->arrayKeyCount >= btscan->btps_arrayKeyCount && + btscan->btps_pageStatus != BTPARALLEL_DONE) + { + btscan->btps_pageStatus = BTPARALLEL_DONE; + status_changed = true; + } + SpinLockRelease(&btscan->btps_mutex); + + /* wake up all the workers associated with this parallel scan */ + if (status_changed) + ConditionVariableBroadcast(&btscan->btps_cv); +} + +/* + * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array + * keys. + * + * Updates the count of array keys processed for both local and parallel + * scans. 
+ */ +void +_bt_parallel_advance_array_keys(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + ParallelIndexScanDesc parallel_scan = scan->parallel_scan; + BTParallelScanDesc btscan; + + btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, + parallel_scan->ps_offset); + + so->arrayKeyCount++; + SpinLockAcquire(&btscan->btps_mutex); + if (btscan->btps_pageStatus == BTPARALLEL_DONE) + { + btscan->btps_scanPage = InvalidBlockNumber; + btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; + btscan->btps_arrayKeyCount++; + } + SpinLockRelease(&btscan->btps_mutex); +} + +/* + * Bulk deletion of all index entries pointing to a set of heap tuples. + * The set of target tuples is specified via a callback routine that tells + * whether any given heap tuple (identified by ItemPointer) is being deleted. + * + * Result: a palloc'd struct containing statistical info for VACUUM displays. + */ +IndexBulkDeleteResult * +btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, void *callback_state) +{ + Relation rel = info->index; + BTCycleId cycleid; + + /* allocate stats if first time through, else re-use existing struct */ + if (stats == NULL) + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + + /* Establish the vacuum cycle ID to use for this scan */ + /* The ENSURE stuff ensures we clean up shared memory on failure */ + PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel)); + { + cycleid = _bt_start_vacuum(rel); + + btvacuumscan(info, stats, callback, callback_state, cycleid); + } + PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel)); + _bt_end_vacuum(rel); + + return stats; +} + +/* + * Post-VACUUM cleanup. + * + * Result: a palloc'd struct containing statistical info for VACUUM displays. + */ +IndexBulkDeleteResult * +btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) +{ + BlockNumber num_delpages; + + /* No-op in ANALYZE ONLY mode */ + if (info->analyze_only) + return stats; + + /* + * If btbulkdelete was called, we need not do anything (we just maintain + * the information used within _bt_vacuum_needs_cleanup() by calling + * _bt_set_cleanup_info() below). + * + * If btbulkdelete was _not_ called, then we have a choice to make: we + * must decide whether or not a btvacuumscan() call is needed now (i.e. + * whether the ongoing VACUUM operation can entirely avoid a physical scan + * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us + * now. + */ + if (stats == NULL) + { + /* Check if VACUUM operation can entirely avoid btvacuumscan() call */ + if (!_bt_vacuum_needs_cleanup(info->index)) + return NULL; + + /* + * Since we aren't going to actually delete any leaf items, there's no + * need to go through all the vacuum-cycle-ID pushups here. + * + * Posting list tuples are a source of inaccuracy for cleanup-only + * scans. btvacuumscan() will assume that the number of index tuples + * from each page can be used as num_index_tuples, even though + * num_index_tuples is supposed to represent the number of TIDs in the + * index. This naive approach can underestimate the number of tuples + * in the index significantly. + * + * We handle the problem by making num_index_tuples an estimate in + * cleanup-only case. 
+ */ + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + btvacuumscan(info, stats, NULL, NULL, 0); + stats->estimated_count = true; + } + + /* + * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup(). + * + * num_delpages is the number of deleted pages now in the index that were + * not safe to place in the FSM to be recycled just yet. num_delpages is + * greater than 0 only when _bt_pagedel() actually deleted pages during + * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must + * have failed to place any newly deleted pages in the FSM just moments + * ago. (Actually, there are edge cases where recycling of the current + * VACUUM's newly deleted pages does not even become safe by the time the + * next VACUUM comes around. See nbtree/README.) + */ + Assert(stats->pages_deleted >= stats->pages_free); + num_delpages = stats->pages_deleted - stats->pages_free; + _bt_set_cleanup_info(info->index, num_delpages); + + /* + * It's quite possible for us to be fooled by concurrent page splits into + * double-counting some index tuples, so disbelieve any total that exceeds + * the underlying heap's count ... if we know that accurately. Otherwise + * this might just make matters worse. + */ + if (!info->estimated_count) + { + if (stats->num_index_tuples > info->num_heap_tuples) + stats->num_index_tuples = info->num_heap_tuples; + } + + return stats; +} + +/* + * btvacuumscan --- scan the index for VACUUMing purposes + * + * This combines the functions of looking for leaf tuples that are deletable + * according to the vacuum callback, looking for empty pages that can be + * deleted, and looking for old deleted pages that can be recycled. Both + * btbulkdelete and btvacuumcleanup invoke this (the latter only if no + * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true). + * + * The caller is responsible for initially allocating/zeroing a stats struct + * and for obtaining a vacuum cycle ID if necessary. + */ +static void +btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, void *callback_state, + BTCycleId cycleid) +{ + Relation rel = info->index; + BTVacState vstate; + BlockNumber num_pages; + BlockNumber scanblkno; + bool needLock; + + /* + * Reset fields that track information about the entire index now. This + * avoids double-counting in the case where a single VACUUM command + * requires multiple scans of the index. + * + * Avoid resetting the tuples_removed and pages_newly_deleted fields here, + * since they track information about the VACUUM command, and so must last + * across each call to btvacuumscan(). + * + * (Note that pages_free is treated as state about the whole index, not + * the current VACUUM. This is appropriate because RecordFreeIndexPage() + * calls are idempotent, and get repeated for the same deleted pages in + * some scenarios. The point for us is to track the number of recyclable + * pages in the index at the end of the VACUUM command.) 
+ */ + stats->num_pages = 0; + stats->num_index_tuples = 0; + stats->pages_deleted = 0; + stats->pages_free = 0; + + /* Set up info to pass down to btvacuumpage */ + vstate.info = info; + vstate.stats = stats; + vstate.callback = callback; + vstate.callback_state = callback_state; + vstate.cycleid = cycleid; + + /* Create a temporary memory context to run _bt_pagedel in */ + vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext, + "_bt_pagedel", + ALLOCSET_DEFAULT_SIZES); + + /* Initialize vstate fields used by _bt_pendingfsm_finalize */ + vstate.bufsize = 0; + vstate.maxbufsize = 0; + vstate.pendingpages = NULL; + vstate.npendingpages = 0; + /* Consider applying _bt_pendingfsm_finalize optimization */ + _bt_pendingfsm_init(rel, &vstate, (callback == NULL)); + + /* + * The outer loop iterates over all index pages except the metapage, in + * physical order (we hope the kernel will cooperate in providing + * read-ahead for speed). It is critical that we visit all leaf pages, + * including ones added after we start the scan, else we might fail to + * delete some deletable tuples. Hence, we must repeatedly check the + * relation length. We must acquire the relation-extension lock while + * doing so to avoid a race condition: if someone else is extending the + * relation, there is a window where bufmgr/smgr have created a new + * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If + * we manage to scan such a page here, we'll improperly assume it can be + * recycled. Taking the lock synchronizes things enough to prevent a + * problem: either num_pages won't include the new page, or _bt_getbuf + * already has write lock on the buffer and it will be fully initialized + * before we can examine it. (See also vacuumlazy.c, which has the same + * issue.) Also, we need not worry if a page is added immediately after + * we look; the page splitting code already has write-lock on the left + * page before it adds a right page, so we must already have processed any + * tuples due to be moved into such a page. + * + * We can skip locking for new or temp relations, however, since no one + * else could be accessing them. + */ + needLock = !RELATION_IS_LOCAL(rel); + + scanblkno = BTREE_METAPAGE + 1; + for (;;) + { + /* Get the current relation length */ + if (needLock) + LockRelationForExtension(rel, ExclusiveLock); + num_pages = RelationGetNumberOfBlocks(rel); + if (needLock) + UnlockRelationForExtension(rel, ExclusiveLock); + + if (info->report_progress) + pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL, + num_pages); + + /* Quit if we've scanned the whole relation */ + if (scanblkno >= num_pages) + break; + /* Iterate over pages, then loop back to recheck length */ + for (; scanblkno < num_pages; scanblkno++) + { + btvacuumpage(&vstate, scanblkno); + if (info->report_progress) + pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, + scanblkno); + } + } + + /* Set statistics num_pages field to final size of index */ + stats->num_pages = num_pages; + + MemoryContextDelete(vstate.pagedelcontext); + + /* + * If there were any calls to _bt_pagedel() during scan of the index then + * see if any of the resulting pages can be placed in the FSM now. When + * it's not safe we'll have to leave it up to a future VACUUM operation. + * + * Finally, if we placed any pages in the FSM (either just now or during + * the scan), forcibly update the upper-level FSM pages to ensure that + * searchers can find them. 
+ */ + _bt_pendingfsm_finalize(rel, &vstate); + if (stats->pages_free > 0) + IndexFreeSpaceMapVacuum(rel); +} + +/* + * btvacuumpage --- VACUUM one page + * + * This processes a single page for btvacuumscan(). In some cases we must + * backtrack to re-examine and VACUUM pages that were the scanblkno during + * a previous call here. This is how we handle page splits (that happened + * after our cycleid was acquired) whose right half page happened to reuse + * a block that we might have processed at some point before it was + * recycled (i.e. before the page split). + */ +static void +btvacuumpage(BTVacState *vstate, BlockNumber scanblkno) +{ + IndexVacuumInfo *info = vstate->info; + IndexBulkDeleteResult *stats = vstate->stats; + IndexBulkDeleteCallback callback = vstate->callback; + void *callback_state = vstate->callback_state; + Relation rel = info->index; + bool attempt_pagedel; + BlockNumber blkno, + backtrack_to; + Buffer buf; + Page page; + BTPageOpaque opaque; + + blkno = scanblkno; + +backtrack: + + attempt_pagedel = false; + backtrack_to = P_NONE; + + /* call vacuum_delay_point while not holding any buffer lock */ + vacuum_delay_point(); + + /* + * We can't use _bt_getbuf() here because it always applies + * _bt_checkpage(), which will barf on an all-zero page. We want to + * recycle all-zero pages, not fail. Also, we want to use a nondefault + * buffer access strategy. + */ + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, + info->strategy); + _bt_lockbuf(rel, buf, BT_READ); + page = BufferGetPage(buf); + opaque = NULL; + if (!PageIsNew(page)) + { + _bt_checkpage(rel, buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + } + + Assert(blkno <= scanblkno); + if (blkno != scanblkno) + { + /* + * We're backtracking. + * + * We followed a right link to a sibling leaf page (a page that + * happens to be from a block located before scanblkno). The only + * case we want to do anything with is a live leaf page having the + * current vacuum cycle ID. + * + * The page had better be in a state that's consistent with what we + * expect. Check for conditions that imply corruption in passing. It + * can't be half-dead because only an interrupted VACUUM process can + * leave pages in that state, so we'd definitely have dealt with it + * back when the page was the scanblkno page (half-dead pages are + * always marked fully deleted by _bt_pagedel()). This assumes that + * there can be only one vacuum process running at a time. + */ + if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque)) + { + Assert(false); + ereport(LOG, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"", + blkno, scanblkno, RelationGetRelationName(rel)))); + _bt_relbuf(rel, buf); + return; + } + + /* + * We may have already processed the page in an earlier call, when the + * page was scanblkno. This happens when the leaf page split occurred + * after the scan began, but before the right sibling page became the + * scanblkno. + * + * Page may also have been deleted by current btvacuumpage() call, + * since _bt_pagedel() sometimes deletes the right sibling page of + * scanblkno in passing (it does so after we decided where to + * backtrack to). We don't need to process this page as a deleted + * page a second time now (in fact, it would be wrong to count it as a + * deleted page in the bulk delete statistics a second time). 
+ */ + if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque)) + { + /* Done with current scanblkno (and all lower split pages) */ + _bt_relbuf(rel, buf); + return; + } + } + + if (!opaque || BTPageIsRecyclable(page)) + { + /* Okay to recycle this page (which could be leaf or internal) */ + RecordFreeIndexPage(rel, blkno); + stats->pages_deleted++; + stats->pages_free++; + } + else if (P_ISDELETED(opaque)) + { + /* + * Already deleted page (which could be leaf or internal). Can't + * recycle yet. + */ + stats->pages_deleted++; + } + else if (P_ISHALFDEAD(opaque)) + { + /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */ + attempt_pagedel = true; + + /* + * _bt_pagedel() will increment both pages_newly_deleted and + * pages_deleted stats in all cases (barring corruption) + */ + } + else if (P_ISLEAF(opaque)) + { + OffsetNumber deletable[MaxIndexTuplesPerPage]; + int ndeletable; + BTVacuumPosting updatable[MaxIndexTuplesPerPage]; + int nupdatable; + OffsetNumber offnum, + minoff, + maxoff; + int nhtidsdead, + nhtidslive; + + /* + * Trade in the initial read lock for a super-exclusive write lock on + * this page. We must get such a lock on every leaf page over the + * course of the vacuum scan, whether or not it actually contains any + * deletable tuples --- see nbtree/README. + */ + _bt_upgradelockbufcleanup(rel, buf); + + /* + * Check whether we need to backtrack to earlier pages. What we are + * concerned about is a page split that happened since we started the + * vacuum scan. If the split moved tuples on the right half of the + * split (i.e. the tuples that sort high) to a block that we already + * passed over, then we might have missed the tuples. We need to + * backtrack now. (Must do this before possibly clearing btpo_cycleid + * or deleting scanblkno page below!) + */ + if (vstate->cycleid != 0 && + opaque->btpo_cycleid == vstate->cycleid && + !(opaque->btpo_flags & BTP_SPLIT_END) && + !P_RIGHTMOST(opaque) && + opaque->btpo_next < scanblkno) + backtrack_to = opaque->btpo_next; + + /* + * When each VACUUM begins, it determines an OldestXmin cutoff value. + * Tuples before the cutoff are removed by VACUUM. Scan over all + * items to see which ones need to be deleted according to cutoff + * point using callback. + */ + ndeletable = 0; + nupdatable = 0; + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + nhtidsdead = 0; + nhtidslive = 0; + if (callback) + { + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + IndexTuple itup; + + itup = (IndexTuple) PageGetItem(page, + PageGetItemId(page, offnum)); + + /* + * Hot Standby assumes that it's okay that XLOG_BTREE_VACUUM + * records do not produce their own conflicts. This is safe + * as long as the callback function only considers whether the + * index tuple refers to pre-cutoff heap tuples that were + * certainly already pruned away during VACUUM's initial heap + * scan by the time we get here. (heapam's XLOG_HEAP2_PRUNE + * records produce conflicts using a latestRemovedXid value + * for the pointed-to heap tuples, so there is no need to + * produce our own conflict now.) + * + * Backends with snapshots acquired after a VACUUM starts but + * before it finishes could have visibility cutoff with a + * later xid than VACUUM's OldestXmin cutoff. These backends + * might happen to opportunistically mark some index tuples + * LP_DEAD before we reach them, even though they may be after + * our cutoff. 
We don't try to kill these "extra" index + * tuples in _bt_delitems_vacuum(). This keeps things simple, + * and allows us to always avoid generating our own conflicts. + */ + Assert(!BTreeTupleIsPivot(itup)); + if (!BTreeTupleIsPosting(itup)) + { + /* Regular tuple, standard table TID representation */ + if (callback(&itup->t_tid, callback_state)) + { + deletable[ndeletable++] = offnum; + nhtidsdead++; + } + else + nhtidslive++; + } + else + { + BTVacuumPosting vacposting; + int nremaining; + + /* Posting list tuple */ + vacposting = btreevacuumposting(vstate, itup, offnum, + &nremaining); + if (vacposting == NULL) + { + /* + * All table TIDs from the posting tuple remain, so no + * delete or update required + */ + Assert(nremaining == BTreeTupleGetNPosting(itup)); + } + else if (nremaining > 0) + { + + /* + * Store metadata about posting list tuple in + * updatable array for entire page. Existing tuple + * will be updated during the later call to + * _bt_delitems_vacuum(). + */ + Assert(nremaining < BTreeTupleGetNPosting(itup)); + updatable[nupdatable++] = vacposting; + nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining; + } + else + { + /* + * All table TIDs from the posting list must be + * deleted. We'll delete the index tuple completely + * (no update required). + */ + Assert(nremaining == 0); + deletable[ndeletable++] = offnum; + nhtidsdead += BTreeTupleGetNPosting(itup); + pfree(vacposting); + } + + nhtidslive += nremaining; + } + } + } + + /* + * Apply any needed deletes or updates. We issue just one + * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic. + */ + if (ndeletable > 0 || nupdatable > 0) + { + Assert(nhtidsdead >= ndeletable + nupdatable); + _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable, + nupdatable); + + stats->tuples_removed += nhtidsdead; + /* must recompute maxoff */ + maxoff = PageGetMaxOffsetNumber(page); + + /* can't leak memory here */ + for (int i = 0; i < nupdatable; i++) + pfree(updatable[i]); + } + else + { + /* + * If the leaf page has been split during this vacuum cycle, it + * seems worth expending a write to clear btpo_cycleid even if we + * don't have any deletions to do. (If we do, _bt_delitems_vacuum + * takes care of this.) This ensures we won't process the page + * again. + * + * We treat this like a hint-bit update because there's no need to + * WAL-log it. + */ + Assert(nhtidsdead == 0); + if (vstate->cycleid != 0 && + opaque->btpo_cycleid == vstate->cycleid) + { + opaque->btpo_cycleid = 0; + MarkBufferDirtyHint(buf, true); + } + } + + /* + * If the leaf page is now empty, try to delete it; else count the + * live tuples (live table TIDs in posting lists are counted as + * separate live tuples). We don't delete when backtracking, though, + * since that would require teaching _bt_pagedel() about backtracking + * (doesn't seem worth adding more complexity to deal with that). + * + * We don't count the number of live TIDs during cleanup-only calls to + * btvacuumscan (i.e. when callback is not set). We count the number + * of index tuples directly instead. This avoids the expense of + * directly examining all of the tuples on each page. VACUUM will + * treat num_index_tuples as an estimate in cleanup-only case, so it + * doesn't matter that this underestimates num_index_tuples + * significantly in some cases. 
+ */ + if (minoff > maxoff) + attempt_pagedel = (blkno == scanblkno); + else if (callback) + stats->num_index_tuples += nhtidslive; + else + stats->num_index_tuples += maxoff - minoff + 1; + + Assert(!attempt_pagedel || nhtidslive == 0); + } + + if (attempt_pagedel) + { + MemoryContext oldcontext; + + /* Run pagedel in a temp context to avoid memory leakage */ + MemoryContextReset(vstate->pagedelcontext); + oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext); + + /* + * _bt_pagedel maintains the bulk delete stats on our behalf; + * pages_newly_deleted and pages_deleted are likely to be incremented + * during call + */ + Assert(blkno == scanblkno); + _bt_pagedel(rel, buf, vstate); + + MemoryContextSwitchTo(oldcontext); + /* pagedel released buffer, so we shouldn't */ + } + else + _bt_relbuf(rel, buf); + + if (backtrack_to != P_NONE) + { + blkno = backtrack_to; + goto backtrack; + } +} + +/* + * btreevacuumposting --- determine TIDs still needed in posting list + * + * Returns metadata describing how to build replacement tuple without the TIDs + * that VACUUM needs to delete. Returned value is NULL in the common case + * where no changes are needed to caller's posting list tuple (we avoid + * allocating memory here as an optimization). + * + * The number of TIDs that should remain in the posting list tuple is set for + * caller in *nremaining. + */ +static BTVacuumPosting +btreevacuumposting(BTVacState *vstate, IndexTuple posting, + OffsetNumber updatedoffset, int *nremaining) +{ + int live = 0; + int nitem = BTreeTupleGetNPosting(posting); + ItemPointer items = BTreeTupleGetPosting(posting); + BTVacuumPosting vacposting = NULL; + + for (int i = 0; i < nitem; i++) + { + if (!vstate->callback(items + i, vstate->callback_state)) + { + /* Live table TID */ + live++; + } + else if (vacposting == NULL) + { + /* + * First dead table TID encountered. + * + * It's now clear that we need to delete one or more dead table + * TIDs, so start maintaining metadata describing how to update + * existing posting list tuple. + */ + vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) + + nitem * sizeof(uint16)); + + vacposting->itup = posting; + vacposting->updatedoffset = updatedoffset; + vacposting->ndeletedtids = 0; + vacposting->deletetids[vacposting->ndeletedtids++] = i; + } + else + { + /* Second or subsequent dead table TID */ + vacposting->deletetids[vacposting->ndeletedtids++] = i; + } + } + + *nremaining = live; + return vacposting; +} + +/* + * btcanreturn() -- Check whether btree indexes support index-only scans. + * + * btrees always do, so this is trivial. + */ +bool +btcanreturn(Relation index, int attno) +{ + return true; +} diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c new file mode 100644 index 0000000..fdf0e56 --- /dev/null +++ b/src/backend/access/nbtree/nbtsearch.c @@ -0,0 +1,2501 @@ +/*------------------------------------------------------------------------- + * + * nbtsearch.c + * Search code for postgres btrees. 
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtsearch.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/relscan.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/predicate.h" +#include "utils/lsyscache.h" +#include "utils/rel.h" + + +static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); +static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf); +static int _bt_binsrch_posting(BTScanInsert key, Page page, + OffsetNumber offnum); +static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, + OffsetNumber offnum); +static void _bt_saveitem(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, IndexTuple itup); +static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, ItemPointer heapTid, + IndexTuple itup); +static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, + ItemPointer heapTid, int tupleOffset); +static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); +static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir); +static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, + ScanDirection dir); +static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot); +static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); +static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir); + + +/* + * _bt_drop_lock_and_maybe_pin() + * + * Unlock the buffer; and if it is safe to release the pin, do that, too. It + * is safe if the scan is using an MVCC snapshot and the index is WAL-logged. + * This will prevent vacuum from stalling in a blocked state trying to read a + * page when a cursor is sitting on it -- at least in many important cases. + * + * Set the buffer to invalid if the pin is released, since the buffer may be + * re-used. If we need to go back to this block (for example, to apply + * LP_DEAD hints) we must get a fresh reference to the buffer. Hopefully it + * will remain in shared memory for as long as it takes to scan the index + * buffer page. + */ +static void +_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp) +{ + _bt_unlockbuf(scan->indexRelation, sp->buf); + + if (IsMVCCSnapshot(scan->xs_snapshot) && + RelationNeedsWAL(scan->indexRelation) && + !scan->xs_want_itup) + { + ReleaseBuffer(sp->buf); + sp->buf = InvalidBuffer; + } +} + +/* + * _bt_search() -- Search the tree for a particular scankey, + * or more precisely for the first leaf page it could be on. + * + * The passed scankey is an insertion-type scankey (see nbtree/README), + * but it can omit the rightmost column(s) of the index. + * + * Return value is a stack of parent-page pointers (i.e. there is no entry for + * the leaf level/page). *bufP is set to the address of the leaf-page buffer, + * which is locked and pinned. No locks are held on the parent pages, + * however! + * + * If the snapshot parameter is not NULL, "old snapshot" checking will take + * place during the descent through the tree. This is not needed when + * positioning for an insert or delete, so NULL is used for those cases. + * + * The returned buffer is locked according to access parameter. 
Additionally, + * access = BT_WRITE will allow an empty root page to be created and returned. + * When access = BT_READ, an empty index will result in *bufP being set to + * InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered + * during the search will be finished. + */ +BTStack +_bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, + Snapshot snapshot) +{ + BTStack stack_in = NULL; + int page_access = BT_READ; + + /* Get the root page to start with */ + *bufP = _bt_getroot(rel, access); + + /* If index is empty and access = BT_READ, no root page is created. */ + if (!BufferIsValid(*bufP)) + return (BTStack) NULL; + + /* Loop iterates once per level descended in the tree */ + for (;;) + { + Page page; + BTPageOpaque opaque; + OffsetNumber offnum; + ItemId itemid; + IndexTuple itup; + BlockNumber child; + BTStack new_stack; + + /* + * Race -- the page we just grabbed may have split since we read its + * downlink in its parent page (or the metapage). If it has, we may + * need to move right to its new sibling. Do that. + * + * In write-mode, allow _bt_moveright to finish any incomplete splits + * along the way. Strictly speaking, we'd only need to finish an + * incomplete split on the leaf page we're about to insert to, not on + * any of the upper levels (internal pages with incomplete splits are + * also taken care of in _bt_getstackbuf). But this is a good + * opportunity to finish splits of internal pages too. + */ + *bufP = _bt_moveright(rel, key, *bufP, (access == BT_WRITE), stack_in, + page_access, snapshot); + + /* if this is a leaf page, we're done */ + page = BufferGetPage(*bufP); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (P_ISLEAF(opaque)) + break; + + /* + * Find the appropriate pivot tuple on this page. Its downlink points + * to the child page that we're about to descend to. + */ + offnum = _bt_binsrch(rel, key, *bufP); + itemid = PageGetItemId(page, offnum); + itup = (IndexTuple) PageGetItem(page, itemid); + Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace); + child = BTreeTupleGetDownLink(itup); + + /* + * We need to save the location of the pivot tuple we chose in a new + * stack entry for this page/level. If caller ends up splitting a + * page one level down, it usually ends up inserting a new pivot + * tuple/downlink immediately after the location recorded here. + */ + new_stack = (BTStack) palloc(sizeof(BTStackData)); + new_stack->bts_blkno = BufferGetBlockNumber(*bufP); + new_stack->bts_offset = offnum; + new_stack->bts_parent = stack_in; + + /* + * Page level 1 is lowest non-leaf page level prior to leaves. So, if + * we're on the level 1 and asked to lock leaf page in write mode, + * then lock next page in write mode, because it must be a leaf. + */ + if (opaque->btpo_level == 1 && access == BT_WRITE) + page_access = BT_WRITE; + + /* drop the read lock on the page, then acquire one on its child */ + *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access); + + /* okay, all set to move down a level */ + stack_in = new_stack; + } + + /* + * If we're asked to lock leaf in write mode, but didn't manage to, then + * relock. This should only happen when the root page is a leaf page (and + * the only page in the index other than the metapage). 
+ */ + if (access == BT_WRITE && page_access == BT_READ) + { + /* trade in our read lock for a write lock */ + _bt_unlockbuf(rel, *bufP); + _bt_lockbuf(rel, *bufP, BT_WRITE); + + /* + * Race -- the leaf page may have split after we dropped the read lock + * but before we acquired a write lock. If it has, we may need to + * move right to its new sibling. Do that. + */ + *bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE, + snapshot); + } + + return stack_in; +} + +/* + * _bt_moveright() -- move right in the btree if necessary. + * + * When we follow a pointer to reach a page, it is possible that + * the page has changed in the meanwhile. If this happens, we're + * guaranteed that the page has "split right" -- that is, that any + * data that appeared on the page originally is either on the page + * or strictly to the right of it. + * + * This routine decides whether or not we need to move right in the + * tree by examining the high key entry on the page. If that entry is + * strictly less than the scankey, or <= the scankey in the + * key.nextkey=true case, then we followed the wrong link and we need + * to move right. + * + * The passed insertion-type scankey can omit the rightmost column(s) of the + * index. (see nbtree/README) + * + * When key.nextkey is false (the usual case), we are looking for the first + * item >= key. When key.nextkey is true, we are looking for the first item + * strictly greater than key. + * + * If forupdate is true, we will attempt to finish any incomplete splits + * that we encounter. This is required when locking a target page for an + * insertion, because we don't allow inserting on a page before the split + * is completed. 'stack' is only used if forupdate is true. + * + * On entry, we have the buffer pinned and a lock of the type specified by + * 'access'. If we move right, we release the buffer and lock and acquire + * the same on the right sibling. Return value is the buffer we stop at. + * + * If the snapshot parameter is not NULL, "old snapshot" checking will take + * place during the descent through the tree. This is not needed when + * positioning for an insert or delete, so NULL is used for those cases. + */ +Buffer +_bt_moveright(Relation rel, + BTScanInsert key, + Buffer buf, + bool forupdate, + BTStack stack, + int access, + Snapshot snapshot) +{ + Page page; + BTPageOpaque opaque; + int32 cmpval; + + /* + * When nextkey = false (normal case): if the scan key that brought us to + * this page is > the high key stored on the page, then the page has split + * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could + * have some duplicates to the right as well as the left, but that's + * something that's only ever dealt with on the leaf level, after + * _bt_search has found an initial leaf page.) + * + * When nextkey = true: move right if the scan key is >= page's high key. + * (Note that key.scantid cannot be set in this case.) + * + * The page could even have split more than once, so scan as far as + * needed. + * + * We also have to move right if we followed a link that brought us to a + * dead page. + */ + cmpval = key->nextkey ? 0 : 1; + + for (;;) + { + page = BufferGetPage(buf); + TestForOldSnapshot(snapshot, rel, page); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + if (P_RIGHTMOST(opaque)) + break; + + /* + * Finish any incomplete splits we encounter along the way. 
+ */ + if (forupdate && P_INCOMPLETE_SPLIT(opaque)) + { + BlockNumber blkno = BufferGetBlockNumber(buf); + + /* upgrade our lock if necessary */ + if (access == BT_READ) + { + _bt_unlockbuf(rel, buf); + _bt_lockbuf(rel, buf, BT_WRITE); + } + + if (P_INCOMPLETE_SPLIT(opaque)) + _bt_finish_split(rel, buf, stack); + else + _bt_relbuf(rel, buf); + + /* re-acquire the lock in the right mode, and re-check */ + buf = _bt_getbuf(rel, blkno, access); + continue; + } + + if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval) + { + /* step right one page */ + buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access); + continue; + } + else + break; + } + + if (P_IGNORE(opaque)) + elog(ERROR, "fell off the end of index \"%s\"", + RelationGetRelationName(rel)); + + return buf; +} + +/* + * _bt_binsrch() -- Do a binary search for a key on a particular page. + * + * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first + * key >= given scankey, or > scankey if nextkey is true. (NOTE: in + * particular, this means it is possible to return a value 1 greater than the + * number of keys on the page, if the scankey is > all keys on the page.) + * + * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber + * of the last key < given scankey, or last key <= given scankey if nextkey + * is true. (Since _bt_compare treats the first data key of such a page as + * minus infinity, there will be at least one key < scankey, so the result + * always points at one of the keys on the page.) This key indicates the + * right place to descend to be sure we find all leaf keys >= given scankey + * (or leaf keys > given scankey when nextkey is true). + * + * This procedure is not responsible for walking right, it just examines + * the given page. _bt_binsrch() has no lock or refcount side effects + * on the buffer. + */ +static OffsetNumber +_bt_binsrch(Relation rel, + BTScanInsert key, + Buffer buf) +{ + Page page; + BTPageOpaque opaque; + OffsetNumber low, + high; + int32 result, + cmpval; + + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* Requesting nextkey semantics while using scantid seems nonsensical */ + Assert(!key->nextkey || key->scantid == NULL); + /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */ + Assert(!P_ISLEAF(opaque) || key->scantid == NULL); + + low = P_FIRSTDATAKEY(opaque); + high = PageGetMaxOffsetNumber(page); + + /* + * If there are no keys on the page, return the first available slot. Note + * this covers two cases: the page is really empty (no keys), or it + * contains only a high key. The latter case is possible after vacuuming. + * This can never happen on an internal page, however, since they are + * never empty (an internal page must have children). + */ + if (unlikely(high < low)) + return low; + + /* + * Binary search to find the first key on the page >= scan key, or first + * key > scankey when nextkey is true. + * + * For nextkey=false (cmpval=1), the loop invariant is: all slots before + * 'low' are < scan key, all slots at or after 'high' are >= scan key. + * + * For nextkey=true (cmpval=0), the loop invariant is: all slots before + * 'low' are <= scan key, all slots at or after 'high' are > scan key. + * + * We can fall out when high == low. + */ + high++; /* establish the loop invariant for high */ + + cmpval = key->nextkey ? 
0 : 1; /* select comparison value */ + + while (high > low) + { + OffsetNumber mid = low + ((high - low) / 2); + + /* We have low <= mid < high, so mid points at a real slot */ + + result = _bt_compare(rel, key, page, mid); + + if (result >= cmpval) + low = mid + 1; + else + high = mid; + } + + /* + * At this point we have high == low, but be careful: they could point + * past the last slot on the page. + * + * On a leaf page, we always return the first key >= scan key (resp. > + * scan key), which could be the last slot + 1. + */ + if (P_ISLEAF(opaque)) + return low; + + /* + * On a non-leaf page, return the last key < scan key (resp. <= scan key). + * There must be one if _bt_compare() is playing by the rules. + */ + Assert(low > P_FIRSTDATAKEY(opaque)); + + return OffsetNumberPrev(low); +} + +/* + * + * _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search. + * + * Like _bt_binsrch(), but with support for caching the binary search + * bounds. Only used during insertion, and only on the leaf page that it + * looks like caller will insert tuple on. Exclusive-locked and pinned + * leaf page is contained within insertstate. + * + * Caches the bounds fields in insertstate so that a subsequent call can + * reuse the low and strict high bounds of original binary search. Callers + * that use these fields directly must be prepared for the case where low + * and/or stricthigh are not on the same page (one or both exceed maxoff + * for the page). The case where there are no items on the page (high < + * low) makes bounds invalid. + * + * Caller is responsible for invalidating bounds when it modifies the page + * before calling here a second time, and for dealing with posting list + * tuple matches (callers can use insertstate's postingoff field to + * determine which existing heap TID will need to be replaced by a posting + * list split). + */ +OffsetNumber +_bt_binsrch_insert(Relation rel, BTInsertState insertstate) +{ + BTScanInsert key = insertstate->itup_key; + Page page; + BTPageOpaque opaque; + OffsetNumber low, + high, + stricthigh; + int32 result, + cmpval; + + page = BufferGetPage(insertstate->buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + Assert(P_ISLEAF(opaque)); + Assert(!key->nextkey); + Assert(insertstate->postingoff == 0); + + if (!insertstate->bounds_valid) + { + /* Start new binary search */ + low = P_FIRSTDATAKEY(opaque); + high = PageGetMaxOffsetNumber(page); + } + else + { + /* Restore result of previous binary search against same page */ + low = insertstate->low; + high = insertstate->stricthigh; + } + + /* If there are no keys on the page, return the first available slot */ + if (unlikely(high < low)) + { + /* Caller can't reuse bounds */ + insertstate->low = InvalidOffsetNumber; + insertstate->stricthigh = InvalidOffsetNumber; + insertstate->bounds_valid = false; + return low; + } + + /* + * Binary search to find the first key on the page >= scan key. (nextkey + * is always false when inserting). + * + * The loop invariant is: all slots before 'low' are < scan key, all slots + * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is + * maintained to save additional search effort for caller. + * + * We can fall out when high == low. 
+ */ + if (!insertstate->bounds_valid) + high++; /* establish the loop invariant for high */ + stricthigh = high; /* high initially strictly higher */ + + cmpval = 1; /* !nextkey comparison value */ + + while (high > low) + { + OffsetNumber mid = low + ((high - low) / 2); + + /* We have low <= mid < high, so mid points at a real slot */ + + result = _bt_compare(rel, key, page, mid); + + if (result >= cmpval) + low = mid + 1; + else + { + high = mid; + if (result != 0) + stricthigh = high; + } + + /* + * If tuple at offset located by binary search is a posting list whose + * TID range overlaps with caller's scantid, perform posting list + * binary search to set postingoff for caller. Caller must split the + * posting list when postingoff is set. This should happen + * infrequently. + */ + if (unlikely(result == 0 && key->scantid != NULL)) + { + /* + * postingoff should never be set more than once per leaf page + * binary search. That would mean that there are duplicate table + * TIDs in the index, which is never okay. Check for that here. + */ + if (insertstate->postingoff != 0) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"", + ItemPointerGetBlockNumber(key->scantid), + ItemPointerGetOffsetNumber(key->scantid), + low, stricthigh, + BufferGetBlockNumber(insertstate->buf), + RelationGetRelationName(rel)))); + + insertstate->postingoff = _bt_binsrch_posting(key, page, mid); + } + } + + /* + * On a leaf page, a binary search always returns the first key >= scan + * key (at least in !nextkey case), which could be the last slot + 1. This + * is also the lower bound of cached search. + * + * stricthigh may also be the last slot + 1, which prevents caller from + * using bounds directly, but is still useful to us if we're called a + * second time with cached bounds (cached low will be < stricthigh when + * that happens). + */ + insertstate->low = low; + insertstate->stricthigh = stricthigh; + insertstate->bounds_valid = true; + + return low; +} + +/*---------- + * _bt_binsrch_posting() -- posting list binary search. + * + * Helper routine for _bt_binsrch_insert(). + * + * Returns offset into posting list where caller's scantid belongs. + *---------- + */ +static int +_bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum) +{ + IndexTuple itup; + ItemId itemid; + int low, + high, + mid, + res; + + /* + * If this isn't a posting tuple, then the index must be corrupt (if it is + * an ordinary non-pivot tuple then there must be an existing tuple with a + * heap TID that equals inserter's new heap TID/scantid). Defensively + * check that tuple is a posting list tuple whose posting list range + * includes caller's scantid. + * + * (This is also needed because contrib/amcheck's rootdescend option needs + * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().) + */ + itemid = PageGetItemId(page, offnum); + itup = (IndexTuple) PageGetItem(page, itemid); + if (!BTreeTupleIsPosting(itup)) + return 0; + + Assert(key->heapkeyspace && key->allequalimage); + + /* + * In the event that posting list tuple has LP_DEAD bit set, indicate this + * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A + * second call to _bt_binsrch_insert() can take place when its caller has + * removed the dead item. 
+ */ + if (ItemIdIsDead(itemid)) + return -1; + + /* "high" is past end of posting list for loop invariant */ + low = 0; + high = BTreeTupleGetNPosting(itup); + Assert(high >= 2); + + while (high > low) + { + mid = low + ((high - low) / 2); + res = ItemPointerCompare(key->scantid, + BTreeTupleGetPostingN(itup, mid)); + + if (res > 0) + low = mid + 1; + else if (res < 0) + high = mid; + else + return mid; + } + + /* Exact match not found */ + return low; +} + +/*---------- + * _bt_compare() -- Compare insertion-type scankey to tuple on a page. + * + * page/offnum: location of btree item to be compared to. + * + * This routine returns: + * <0 if scankey < tuple at offnum; + * 0 if scankey == tuple at offnum; + * >0 if scankey > tuple at offnum. + * + * NULLs in the keys are treated as sortable values. Therefore + * "equality" does not necessarily mean that the item should be returned + * to the caller as a matching key. Similarly, an insertion scankey + * with its scantid set is treated as equal to a posting tuple whose TID + * range overlaps with their scantid. There generally won't be a + * matching TID in the posting tuple, which caller must handle + * themselves (e.g., by splitting the posting list tuple). + * + * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be + * "minus infinity": this routine will always claim it is less than the + * scankey. The actual key value stored is explicitly truncated to 0 + * attributes (explicitly minus infinity) with version 3+ indexes, but + * that isn't relied upon. This allows us to implement the Lehman and + * Yao convention that the first down-link pointer is before the first + * key. See backend/access/nbtree/README for details. + *---------- + */ +int32 +_bt_compare(Relation rel, + BTScanInsert key, + Page page, + OffsetNumber offnum) +{ + TupleDesc itupdesc = RelationGetDescr(rel); + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + IndexTuple itup; + ItemPointer heapTid; + ScanKey scankey; + int ncmpkey; + int ntupatts; + int32 result; + + Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum)); + Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel)); + Assert(key->heapkeyspace || key->scantid == NULL); + + /* + * Force result ">" if target item is first data item on an internal page + * --- see NOTE above. + */ + if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque)) + return 1; + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + ntupatts = BTreeTupleGetNAtts(itup, rel); + + /* + * The scan key is set up with the attribute number associated with each + * term in the key. It is important that, if the index is multi-key, the + * scan contain the first k key attributes, and that they be in order. If + * you think about how multi-key ordering works, you'll understand why + * this is. + * + * We don't test for violation of this condition here, however. The + * initial setup for the index scan had better have gotten it right (see + * _bt_first). 
+ */ + + ncmpkey = Min(ntupatts, key->keysz); + Assert(key->heapkeyspace || ncmpkey == key->keysz); + Assert(!BTreeTupleIsPosting(itup) || key->allequalimage); + scankey = key->scankeys; + for (int i = 1; i <= ncmpkey; i++) + { + Datum datum; + bool isNull; + + datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull); + + if (scankey->sk_flags & SK_ISNULL) /* key is NULL */ + { + if (isNull) + result = 0; /* NULL "=" NULL */ + else if (scankey->sk_flags & SK_BT_NULLS_FIRST) + result = -1; /* NULL "<" NOT_NULL */ + else + result = 1; /* NULL ">" NOT_NULL */ + } + else if (isNull) /* key is NOT_NULL and item is NULL */ + { + if (scankey->sk_flags & SK_BT_NULLS_FIRST) + result = 1; /* NOT_NULL ">" NULL */ + else + result = -1; /* NOT_NULL "<" NULL */ + } + else + { + /* + * The sk_func needs to be passed the index value as left arg and + * the sk_argument as right arg (they might be of different + * types). Since it is convenient for callers to think of + * _bt_compare as comparing the scankey to the index item, we have + * to flip the sign of the comparison result. (Unless it's a DESC + * column, in which case we *don't* flip the sign.) + */ + result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func, + scankey->sk_collation, + datum, + scankey->sk_argument)); + + if (!(scankey->sk_flags & SK_BT_DESC)) + INVERT_COMPARE_RESULT(result); + } + + /* if the keys are unequal, return the difference */ + if (result != 0) + return result; + + scankey++; + } + + /* + * All non-truncated attributes (other than heap TID) were found to be + * equal. Treat truncated attributes as minus infinity when scankey has a + * key attribute value that would otherwise be compared directly. + * + * Note: it doesn't matter if ntupatts includes non-key attributes; + * scankey won't, so explicitly excluding non-key attributes isn't + * necessary. + */ + if (key->keysz > ntupatts) + return 1; + + /* + * Use the heap TID attribute and scantid to try to break the tie. The + * rules are the same as any other key attribute -- only the + * representation differs. + */ + heapTid = BTreeTupleGetHeapTID(itup); + if (key->scantid == NULL) + { + /* + * Most searches have a scankey that is considered greater than a + * truncated pivot tuple if and when the scankey has equal values for + * attributes up to and including the least significant untruncated + * attribute in tuple. + * + * For example, if an index has the minimum two attributes (single + * user key attribute, plus heap TID attribute), and a page's high key + * is ('foo', -inf), and scankey is ('foo', <omitted>), the search + * will not descend to the page to the left. The search will descend + * right instead. The truncated attribute in pivot tuple means that + * all non-pivot tuples on the page to the left are strictly < 'foo', + * so it isn't necessary to descend left. In other words, search + * doesn't have to descend left because it isn't interested in a match + * that has a heap TID value of -inf. + * + * However, some searches (pivotsearch searches) actually require that + * we descend left when this happens. -inf is treated as a possible + * match for omitted scankey attribute(s). This is needed by page + * deletion, which must re-find leaf pages that are targets for + * deletion using their high keys. + * + * Note: the heap TID part of the test ensures that scankey is being + * compared to a pivot tuple with one or more truncated key + * attributes. 
+ * + * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the + * left here, since they have no heap TID attribute (and cannot have + * any -inf key values in any case, since truncation can only remove + * non-key attributes). !heapkeyspace searches must always be + * prepared to deal with matches on both sides of the pivot once the + * leaf level is reached. + */ + if (key->heapkeyspace && !key->pivotsearch && + key->keysz == ntupatts && heapTid == NULL) + return 1; + + /* All provided scankey arguments found to be equal */ + return 0; + } + + /* + * Treat truncated heap TID as minus infinity, since scankey has a key + * attribute value (scantid) that would otherwise be compared directly + */ + Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel)); + if (heapTid == NULL) + return 1; + + /* + * Scankey must be treated as equal to a posting list tuple if its scantid + * value falls within the range of the posting list. In all other cases + * there can only be a single heap TID value, which is compared directly + * with scantid. + */ + Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel)); + result = ItemPointerCompare(key->scantid, heapTid); + if (result <= 0 || !BTreeTupleIsPosting(itup)) + return result; + else + { + result = ItemPointerCompare(key->scantid, + BTreeTupleGetMaxHeapTID(itup)); + if (result > 0) + return 1; + } + + return 0; +} + +/* + * _bt_first() -- Find the first item in a scan. + * + * We need to be clever about the direction of scan, the search + * conditions, and the tree ordering. We find the first item (or, + * if backwards scan, the last item) in the tree that satisfies the + * qualifications in the scan key. On success exit, the page containing + * the current index tuple is pinned but not locked, and data about + * the matching tuple(s) on the page has been loaded into so->currPos. + * scan->xs_ctup.t_self is set to the heap TID of the current tuple, + * and if requested, scan->xs_itup points to a copy of the index tuple. + * + * If there are no matching items in the index, we return false, with no + * pins or locks held. + * + * Note that scan->keyData[], and the so->keyData[] scankey built from it, + * are both search-type scankeys (see nbtree/README for more about this). + * Within this routine, we build a temporary insertion-type scankey to use + * in locating the scan start position. + */ +bool +_bt_first(IndexScanDesc scan, ScanDirection dir) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Buffer buf; + BTStack stack; + OffsetNumber offnum; + StrategyNumber strat; + bool nextkey; + bool goback; + BTScanInsertData inskey; + ScanKey startKeys[INDEX_MAX_KEYS]; + ScanKeyData notnullkeys[INDEX_MAX_KEYS]; + int keysCount = 0; + int i; + bool status; + StrategyNumber strat_total; + BTScanPosItem *currItem; + BlockNumber blkno; + + Assert(!BTScanPosIsValid(so->currPos)); + + pgstat_count_index_scan(rel); + + /* + * Examine the scan keys and eliminate any redundant keys; also mark the + * keys that must be matched to continue the scan. + */ + _bt_preprocess_keys(scan); + + /* + * Quit now if _bt_preprocess_keys() discovered that the scan keys can + * never be satisfied (eg, x == 1 AND x > 2). + */ + if (!so->qual_ok) + { + /* Notify any other workers that we're done with this scan key. */ + _bt_parallel_done(scan); + return false; + } + + /* + * For parallel scans, get the starting page from shared state. 
If the + * scan has not started, proceed to find out first leaf page in the usual + * way while keeping other participating processes waiting. If the scan + * has already begun, use the page number from the shared structure. + */ + if (scan->parallel_scan != NULL) + { + status = _bt_parallel_seize(scan, &blkno); + if (!status) + return false; + else if (blkno == P_NONE) + { + _bt_parallel_done(scan); + return false; + } + else if (blkno != InvalidBlockNumber) + { + if (!_bt_parallel_readpage(scan, blkno, dir)) + return false; + goto readcomplete; + } + } + + /*---------- + * Examine the scan keys to discover where we need to start the scan. + * + * We want to identify the keys that can be used as starting boundaries; + * these are =, >, or >= keys for a forward scan or =, <, <= keys for + * a backwards scan. We can use keys for multiple attributes so long as + * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept + * a > or < boundary or find an attribute with no boundary (which can be + * thought of as the same as "> -infinity"), we can't use keys for any + * attributes to its right, because it would break our simplistic notion + * of what initial positioning strategy to use. + * + * When the scan keys include cross-type operators, _bt_preprocess_keys + * may not be able to eliminate redundant keys; in such cases we will + * arbitrarily pick a usable one for each attribute. This is correct + * but possibly not optimal behavior. (For example, with keys like + * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when + * x=5 would be more efficient.) Since the situation only arises given + * a poorly-worded query plus an incomplete opfamily, live with it. + * + * When both equality and inequality keys appear for a single attribute + * (again, only possible when cross-type operators appear), we *must* + * select one of the equality keys for the starting point, because + * _bt_checkkeys() will stop the scan as soon as an equality qual fails. + * For example, if we have keys like "x >= 4 AND x = 10" and we elect to + * start at x=4, we will fail and stop before reaching x=10. If multiple + * equality quals survive preprocessing, however, it doesn't matter which + * one we use --- by definition, they are either redundant or + * contradictory. + * + * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier. + * If the index stores nulls at the end of the index we'll be starting + * from, and we have no boundary key for the column (which means the key + * we deduced NOT NULL from is an inequality key that constrains the other + * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to + * use as a boundary key. If we didn't do this, we might find ourselves + * traversing a lot of null entries at the start of the scan. + * + * In this loop, row-comparison keys are treated the same as keys on their + * first (leftmost) columns. We'll add on lower-order columns of the row + * comparison below, if possible. + * + * The selected scan keys (at most one per index column) are remembered by + * storing their addresses into the local startKeys[] array. + *---------- + */ + strat_total = BTEqualStrategyNumber; + if (so->numberOfKeys > 0) + { + AttrNumber curattr; + ScanKey chosen; + ScanKey impliesNN; + ScanKey cur; + + /* + * chosen is the so-far-chosen key for the current attribute, if any. + * We don't cast the decision in stone until we reach keys for the + * next attribute. 
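+ *
+ * (Editor's illustrative note, not part of the original source: for a
+ * forward scan with quals "a = 1 AND b >= 2 AND b < 5", the loop below
+ * keeps "a = 1" and "b >= 2" in startKeys[] and leaves strat_total at
+ * BTGreaterEqualStrategyNumber; the "b < 5" key cannot help position
+ * the start of a forward scan, only its end.)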
+ */
+ curattr = 1;
+ chosen = NULL;
+ /* Also remember any scankey that implies a NOT NULL constraint */
+ impliesNN = NULL;
+
+ /*
+ * Loop iterates from 0 to numberOfKeys inclusive; we use the last
+ * pass to handle after-last-key processing. Actual exit from the
+ * loop is at one of the "break" statements below.
+ */
+ for (cur = so->keyData, i = 0;; cur++, i++)
+ {
+ if (i >= so->numberOfKeys || cur->sk_attno != curattr)
+ {
+ /*
+ * Done looking at keys for curattr. If we didn't find a
+ * usable boundary key, see if we can deduce a NOT NULL key.
+ */
+ if (chosen == NULL && impliesNN != NULL &&
+ ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
+ ScanDirectionIsForward(dir) :
+ ScanDirectionIsBackward(dir)))
+ {
+ /* Yes, so build the key in notnullkeys[keysCount] */
+ chosen = &notnullkeys[keysCount];
+ ScanKeyEntryInitialize(chosen,
+ (SK_SEARCHNOTNULL | SK_ISNULL |
+ (impliesNN->sk_flags &
+ (SK_BT_DESC | SK_BT_NULLS_FIRST))),
+ curattr,
+ ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
+ BTGreaterStrategyNumber :
+ BTLessStrategyNumber),
+ InvalidOid,
+ InvalidOid,
+ InvalidOid,
+ (Datum) 0);
+ }
+
+ /*
+ * If we still didn't find a usable boundary key, quit; else
+ * save the boundary key pointer in startKeys.
+ */
+ if (chosen == NULL)
+ break;
+ startKeys[keysCount++] = chosen;
+
+ /*
+ * Adjust strat_total, and quit if we have stored a > or <
+ * key.
+ */
+ strat = chosen->sk_strategy;
+ if (strat != BTEqualStrategyNumber)
+ {
+ strat_total = strat;
+ if (strat == BTGreaterStrategyNumber ||
+ strat == BTLessStrategyNumber)
+ break;
+ }
+
+ /*
+ * Done if that was the last attribute, or if next key is not
+ * in sequence (implying no boundary key is available for the
+ * next attribute).
+ */
+ if (i >= so->numberOfKeys ||
+ cur->sk_attno != curattr + 1)
+ break;
+
+ /*
+ * Reset for next attr.
+ */
+ curattr = cur->sk_attno;
+ chosen = NULL;
+ impliesNN = NULL;
+ }
+
+ /*
+ * Can we use this key as a starting boundary for this attr?
+ *
+ * If not, does it imply a NOT NULL constraint? (Because
+ * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
+ * *any* inequality key works for that; we need not test.)
+ */
+ switch (cur->sk_strategy)
+ {
+ case BTLessStrategyNumber:
+ case BTLessEqualStrategyNumber:
+ if (chosen == NULL)
+ {
+ if (ScanDirectionIsBackward(dir))
+ chosen = cur;
+ else
+ impliesNN = cur;
+ }
+ break;
+ case BTEqualStrategyNumber:
+ /* override any non-equality choice */
+ chosen = cur;
+ break;
+ case BTGreaterEqualStrategyNumber:
+ case BTGreaterStrategyNumber:
+ if (chosen == NULL)
+ {
+ if (ScanDirectionIsForward(dir))
+ chosen = cur;
+ else
+ impliesNN = cur;
+ }
+ break;
+ }
+ }
+ }
+
+ /*
+ * If we found no usable boundary keys, we have to start from one end of
+ * the tree. Walk down that edge to the first or last key, and scan from
+ * there.
+ */
+ if (keysCount == 0)
+ {
+ bool match;
+
+ match = _bt_endpoint(scan, dir);
+
+ if (!match)
+ {
+ /* No match, so mark (parallel) scan finished */
+ _bt_parallel_done(scan);
+ }
+
+ return match;
+ }
+
+ /*
+ * We want to start the scan somewhere within the index. Set up an
+ * insertion scankey we can use to search for the boundary point we
+ * identified above. The insertion scankey is built using the keys
+ * identified by startKeys[]. (Remaining insertion scankey fields are
+ * initialized after initial-positioning strategy is finalized.) 
+ */ + Assert(keysCount <= INDEX_MAX_KEYS); + for (i = 0; i < keysCount; i++) + { + ScanKey cur = startKeys[i]; + + Assert(cur->sk_attno == i + 1); + + if (cur->sk_flags & SK_ROW_HEADER) + { + /* + * Row comparison header: look to the first row member instead. + * + * The member scankeys are already in insertion format (ie, they + * have sk_func = 3-way-comparison function), but we have to watch + * out for nulls, which _bt_preprocess_keys didn't check. A null + * in the first row member makes the condition unmatchable, just + * like qual_ok = false. + */ + ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument); + + Assert(subkey->sk_flags & SK_ROW_MEMBER); + if (subkey->sk_flags & SK_ISNULL) + { + _bt_parallel_done(scan); + return false; + } + memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData)); + + /* + * If the row comparison is the last positioning key we accepted, + * try to add additional keys from the lower-order row members. + * (If we accepted independent conditions on additional index + * columns, we use those instead --- doesn't seem worth trying to + * determine which is more restrictive.) Note that this is OK + * even if the row comparison is of ">" or "<" type, because the + * condition applied to all but the last row member is effectively + * ">=" or "<=", and so the extra keys don't break the positioning + * scheme. But, by the same token, if we aren't able to use all + * the row members, then the part of the row comparison that we + * did use has to be treated as just a ">=" or "<=" condition, and + * so we'd better adjust strat_total accordingly. + */ + if (i == keysCount - 1) + { + bool used_all_subkeys = false; + + Assert(!(subkey->sk_flags & SK_ROW_END)); + for (;;) + { + subkey++; + Assert(subkey->sk_flags & SK_ROW_MEMBER); + if (subkey->sk_attno != keysCount + 1) + break; /* out-of-sequence, can't use it */ + if (subkey->sk_strategy != cur->sk_strategy) + break; /* wrong direction, can't use it */ + if (subkey->sk_flags & SK_ISNULL) + break; /* can't use null keys */ + Assert(keysCount < INDEX_MAX_KEYS); + memcpy(inskey.scankeys + keysCount, subkey, + sizeof(ScanKeyData)); + keysCount++; + if (subkey->sk_flags & SK_ROW_END) + { + used_all_subkeys = true; + break; + } + } + if (!used_all_subkeys) + { + switch (strat_total) + { + case BTLessStrategyNumber: + strat_total = BTLessEqualStrategyNumber; + break; + case BTGreaterStrategyNumber: + strat_total = BTGreaterEqualStrategyNumber; + break; + } + } + break; /* done with outer loop */ + } + } + else + { + /* + * Ordinary comparison key. Transform the search-style scan key + * to an insertion scan key by replacing the sk_func with the + * appropriate btree comparison function. + * + * If scankey operator is not a cross-type comparison, we can use + * the cached comparison function; otherwise gotta look it up in + * the catalogs. (That can't lead to infinite recursion, since no + * indexscan initiated by syscache lookup will use cross-data-type + * operators.) + * + * We support the convention that sk_subtype == InvalidOid means + * the opclass input type; this is a hack to simplify life for + * ScanKeyInit(). 
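+ *
+ * (Editor's illustrative note, not part of the original source: for a
+ * qual such as "x = 42" on an int4 column using the default opclass,
+ * the search-style key's equality function is replaced here by the
+ * opclass ordering support function btint4cmp(), whose int32 result is
+ * what _bt_compare() expects from an insertion scankey.)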
+ */ + if (cur->sk_subtype == rel->rd_opcintype[i] || + cur->sk_subtype == InvalidOid) + { + FmgrInfo *procinfo; + + procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC); + ScanKeyEntryInitializeWithInfo(inskey.scankeys + i, + cur->sk_flags, + cur->sk_attno, + InvalidStrategy, + cur->sk_subtype, + cur->sk_collation, + procinfo, + cur->sk_argument); + } + else + { + RegProcedure cmp_proc; + + cmp_proc = get_opfamily_proc(rel->rd_opfamily[i], + rel->rd_opcintype[i], + cur->sk_subtype, + BTORDER_PROC); + if (!RegProcedureIsValid(cmp_proc)) + elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"", + BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype, + cur->sk_attno, RelationGetRelationName(rel)); + ScanKeyEntryInitialize(inskey.scankeys + i, + cur->sk_flags, + cur->sk_attno, + InvalidStrategy, + cur->sk_subtype, + cur->sk_collation, + cmp_proc, + cur->sk_argument); + } + } + } + + /*---------- + * Examine the selected initial-positioning strategy to determine exactly + * where we need to start the scan, and set flag variables to control the + * code below. + * + * If nextkey = false, _bt_search and _bt_binsrch will locate the first + * item >= scan key. If nextkey = true, they will locate the first + * item > scan key. + * + * If goback = true, we will then step back one item, while if + * goback = false, we will start the scan on the located item. + *---------- + */ + switch (strat_total) + { + case BTLessStrategyNumber: + + /* + * Find first item >= scankey, then back up one to arrive at last + * item < scankey. (Note: this positioning strategy is only used + * for a backward scan, so that is always the correct starting + * position.) + */ + nextkey = false; + goback = true; + break; + + case BTLessEqualStrategyNumber: + + /* + * Find first item > scankey, then back up one to arrive at last + * item <= scankey. (Note: this positioning strategy is only used + * for a backward scan, so that is always the correct starting + * position.) + */ + nextkey = true; + goback = true; + break; + + case BTEqualStrategyNumber: + + /* + * If a backward scan was specified, need to start with last equal + * item not first one. + */ + if (ScanDirectionIsBackward(dir)) + { + /* + * This is the same as the <= strategy. We will check at the + * end whether the found item is actually =. + */ + nextkey = true; + goback = true; + } + else + { + /* + * This is the same as the >= strategy. We will check at the + * end whether the found item is actually =. + */ + nextkey = false; + goback = false; + } + break; + + case BTGreaterEqualStrategyNumber: + + /* + * Find first item >= scankey. (This is only used for forward + * scans.) + */ + nextkey = false; + goback = false; + break; + + case BTGreaterStrategyNumber: + + /* + * Find first item > scankey. (This is only used for forward + * scans.) + */ + nextkey = true; + goback = false; + break; + + default: + /* can't get here, but keep compiler quiet */ + elog(ERROR, "unrecognized strat_total: %d", (int) strat_total); + return false; + } + + /* Initialize remaining insertion scan key fields */ + _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage); + inskey.anynullkeys = false; /* unused */ + inskey.nextkey = nextkey; + inskey.pivotsearch = false; + inskey.scantid = NULL; + inskey.keysz = keysCount; + + /* + * Use the manufactured insertion scan key to descend the tree and + * position ourselves on the target leaf page. 
+ */ + stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot); + + /* don't need to keep the stack around... */ + _bt_freestack(stack); + + if (!BufferIsValid(buf)) + { + /* + * We only get here if the index is completely empty. Lock relation + * because nothing finer to lock exists. + */ + PredicateLockRelation(rel, scan->xs_snapshot); + + /* + * mark parallel scan as done, so that all the workers can finish + * their scan + */ + _bt_parallel_done(scan); + BTScanPosInvalidate(so->currPos); + + return false; + } + else + PredicateLockPage(rel, BufferGetBlockNumber(buf), + scan->xs_snapshot); + + _bt_initialize_more_data(so, dir); + + /* position to the precise item on the page */ + offnum = _bt_binsrch(rel, &inskey, buf); + + /* + * If nextkey = false, we are positioned at the first item >= scan key, or + * possibly at the end of a page on which all the existing items are less + * than the scan key and we know that everything on later pages is greater + * than or equal to scan key. + * + * If nextkey = true, we are positioned at the first item > scan key, or + * possibly at the end of a page on which all the existing items are less + * than or equal to the scan key and we know that everything on later + * pages is greater than scan key. + * + * The actually desired starting point is either this item or the prior + * one, or in the end-of-page case it's the first item on the next page or + * the last item on this page. Adjust the starting offset if needed. (If + * this results in an offset before the first item or after the last one, + * _bt_readpage will report no items found, and then we'll step to the + * next page as needed.) + */ + if (goback) + offnum = OffsetNumberPrev(offnum); + + /* remember which buffer we have pinned, if any */ + Assert(!BTScanPosIsValid(so->currPos)); + so->currPos.buf = buf; + + /* + * Now load data from the first page of the scan. + */ + if (!_bt_readpage(scan, dir, offnum)) + { + /* + * There's no actually-matching data on this page. Try to advance to + * the next page. Return false if there's no matching data at all. + */ + _bt_unlockbuf(scan->indexRelation, so->currPos.buf); + if (!_bt_steppage(scan, dir)) + return false; + } + else + { + /* Drop the lock, and maybe the pin, on the current page */ + _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + } + +readcomplete: + /* OK, itemIndex says what to return */ + currItem = &so->currPos.items[so->currPos.itemIndex]; + scan->xs_heaptid = currItem->heapTid; + if (scan->xs_want_itup) + scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + + return true; +} + +/* + * _bt_next() -- Get the next item in a scan. + * + * On entry, so->currPos describes the current page, which may be pinned + * but is not locked, and so->currPos.itemIndex identifies which item was + * previously returned. + * + * On successful exit, scan->xs_ctup.t_self is set to the TID of the + * next heap tuple, and if requested, scan->xs_itup points to a copy of + * the index tuple. so->currPos is updated as needed. + * + * On failure exit (no more tuples), we release pin and set + * so->currPos.buf to InvalidBuffer. + */ +bool +_bt_next(IndexScanDesc scan, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTScanPosItem *currItem; + + /* + * Advance to next tuple on current page; or if there's no more, try to + * step to the next page with data. 
+ */ + if (ScanDirectionIsForward(dir)) + { + if (++so->currPos.itemIndex > so->currPos.lastItem) + { + if (!_bt_steppage(scan, dir)) + return false; + } + } + else + { + if (--so->currPos.itemIndex < so->currPos.firstItem) + { + if (!_bt_steppage(scan, dir)) + return false; + } + } + + /* OK, itemIndex says what to return */ + currItem = &so->currPos.items[so->currPos.itemIndex]; + scan->xs_heaptid = currItem->heapTid; + if (scan->xs_want_itup) + scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + + return true; +} + +/* + * _bt_readpage() -- Load data from current index page into so->currPos + * + * Caller must have pinned and read-locked so->currPos.buf; the buffer's state + * is not changed here. Also, currPos.moreLeft and moreRight must be valid; + * they are updated as appropriate. All other fields of so->currPos are + * initialized from scratch here. + * + * We scan the current page starting at offnum and moving in the indicated + * direction. All items matching the scan keys are loaded into currPos.items. + * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports + * that there can be no more matching tuples in the current scan direction. + * + * In the case of a parallel scan, caller must have called _bt_parallel_seize + * prior to calling this function; this function will invoke + * _bt_parallel_release before returning. + * + * Returns true if any matching items found on the page, false if none. + */ +static bool +_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Page page; + BTPageOpaque opaque; + OffsetNumber minoff; + OffsetNumber maxoff; + int itemIndex; + bool continuescan; + int indnatts; + + /* + * We must have the buffer pinned and locked, but the usual macro can't be + * used here; this function is what makes it good for currPos. + */ + Assert(BufferIsValid(so->currPos.buf)); + + page = BufferGetPage(so->currPos.buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* allow next page be processed by parallel worker */ + if (scan->parallel_scan) + { + if (ScanDirectionIsForward(dir)) + _bt_parallel_release(scan, opaque->btpo_next); + else + _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf)); + } + + continuescan = true; /* default assumption */ + indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation); + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + + /* + * We note the buffer's block number so that we can release the pin later. + * This allows us to re-read the buffer if it is needed again for hinting. + */ + so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf); + + /* + * We save the LSN of the page as we read it, so that we know whether it + * safe to apply LP_DEAD hints to the page later. This allows us to drop + * the pin for MVCC scans, which allows vacuum to avoid blocking. + */ + so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf); + + /* + * we must save the page's right-link while scanning it; this tells us + * where to step right to after we're done with these items. There is no + * corresponding need for the left-link, since splits always go right. + */ + so->currPos.nextPage = opaque->btpo_next; + + /* initialize tuple workspace to empty */ + so->currPos.nextTupleOffset = 0; + + /* + * Now that the current page has been made consistent, the macro should be + * good. 
+ */ + Assert(BTScanPosIsPinned(so->currPos)); + + if (ScanDirectionIsForward(dir)) + { + /* load items[] in ascending order */ + itemIndex = 0; + + offnum = Max(offnum, minoff); + + while (offnum <= maxoff) + { + ItemId iid = PageGetItemId(page, offnum); + IndexTuple itup; + + /* + * If the scan specifies not to return killed tuples, then we + * treat a killed tuple as not passing the qual + */ + if (scan->ignore_killed_tuples && ItemIdIsDead(iid)) + { + offnum = OffsetNumberNext(offnum); + continue; + } + + itup = (IndexTuple) PageGetItem(page, iid); + + if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan)) + { + /* tuple passes all scan key conditions */ + if (!BTreeTupleIsPosting(itup)) + { + /* Remember it */ + _bt_saveitem(so, itemIndex, offnum, itup); + itemIndex++; + } + else + { + int tupleOffset; + + /* + * Set up state to return posting list, and remember first + * TID + */ + tupleOffset = + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, 0), + itup); + itemIndex++; + /* Remember additional TIDs */ + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i), + tupleOffset); + itemIndex++; + } + } + } + /* When !continuescan, there can't be any more matches, so stop */ + if (!continuescan) + break; + + offnum = OffsetNumberNext(offnum); + } + + /* + * We don't need to visit page to the right when the high key + * indicates that no more matches will be found there. + * + * Checking the high key like this works out more often than you might + * think. Leaf page splits pick a split point between the two most + * dissimilar tuples (this is weighed against the need to evenly share + * free space). Leaf pages with high key attribute values that can + * only appear on non-pivot tuples on the right sibling page are + * common. + */ + if (continuescan && !P_RIGHTMOST(opaque)) + { + ItemId iid = PageGetItemId(page, P_HIKEY); + IndexTuple itup = (IndexTuple) PageGetItem(page, iid); + int truncatt; + + truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation); + _bt_checkkeys(scan, itup, truncatt, dir, &continuescan); + } + + if (!continuescan) + so->currPos.moreRight = false; + + Assert(itemIndex <= MaxTIDsPerBTreePage); + so->currPos.firstItem = 0; + so->currPos.lastItem = itemIndex - 1; + so->currPos.itemIndex = 0; + } + else + { + /* load items[] in descending order */ + itemIndex = MaxTIDsPerBTreePage; + + offnum = Min(offnum, maxoff); + + while (offnum >= minoff) + { + ItemId iid = PageGetItemId(page, offnum); + IndexTuple itup; + bool tuple_alive; + bool passes_quals; + + /* + * If the scan specifies not to return killed tuples, then we + * treat a killed tuple as not passing the qual. Most of the + * time, it's a win to not bother examining the tuple's index + * keys, but just skip to the next tuple (previous, actually, + * since we're scanning backwards). However, if this is the first + * tuple on the page, we do check the index keys, to prevent + * uselessly advancing to the page to the left. This is similar + * to the high key optimization used by forward scans. 
+ */ + if (scan->ignore_killed_tuples && ItemIdIsDead(iid)) + { + Assert(offnum >= P_FIRSTDATAKEY(opaque)); + if (offnum > P_FIRSTDATAKEY(opaque)) + { + offnum = OffsetNumberPrev(offnum); + continue; + } + + tuple_alive = false; + } + else + tuple_alive = true; + + itup = (IndexTuple) PageGetItem(page, iid); + + passes_quals = _bt_checkkeys(scan, itup, indnatts, dir, + &continuescan); + if (passes_quals && tuple_alive) + { + /* tuple passes all scan key conditions */ + if (!BTreeTupleIsPosting(itup)) + { + /* Remember it */ + itemIndex--; + _bt_saveitem(so, itemIndex, offnum, itup); + } + else + { + int tupleOffset; + + /* + * Set up state to return posting list, and remember first + * TID. + * + * Note that we deliberately save/return items from + * posting lists in ascending heap TID order for backwards + * scans. This allows _bt_killitems() to make a + * consistent assumption about the order of items + * associated with the same posting list tuple. + */ + itemIndex--; + tupleOffset = + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, 0), + itup); + /* Remember additional TIDs */ + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + itemIndex--; + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i), + tupleOffset); + } + } + } + if (!continuescan) + { + /* there can't be any more matches, so stop */ + so->currPos.moreLeft = false; + break; + } + + offnum = OffsetNumberPrev(offnum); + } + + Assert(itemIndex >= 0); + so->currPos.firstItem = itemIndex; + so->currPos.lastItem = MaxTIDsPerBTreePage - 1; + so->currPos.itemIndex = MaxTIDsPerBTreePage - 1; + } + + return (so->currPos.firstItem <= so->currPos.lastItem); +} + +/* Save an index item into so->currPos.items[itemIndex] */ +static void +_bt_saveitem(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, IndexTuple itup) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup)); + + currItem->heapTid = itup->t_tid; + currItem->indexOffset = offnum; + if (so->currTuples) + { + Size itupsz = IndexTupleSize(itup); + + currItem->tupleOffset = so->currPos.nextTupleOffset; + memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz); + so->currPos.nextTupleOffset += MAXALIGN(itupsz); + } +} + +/* + * Setup state to save TIDs/items from a single posting list tuple. + * + * Saves an index item into so->currPos.items[itemIndex] for TID that is + * returned to scan first. Second or subsequent TIDs for posting list should + * be saved by calling _bt_savepostingitem(). + * + * Returns an offset into tuple storage space that main tuple is stored at if + * needed. 
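+ *
+ * (Editor's illustrative note, not part of the original source: a posting
+ * list tuple with three heap TIDs produces three items[] entries, all of
+ * which share the single truncated "base" tuple stored in currTuples by
+ * this function; _bt_savepostingitem() only records the shared offset.)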
+ */ +static int +_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid, IndexTuple itup) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + Assert(BTreeTupleIsPosting(itup)); + + currItem->heapTid = *heapTid; + currItem->indexOffset = offnum; + if (so->currTuples) + { + /* Save base IndexTuple (truncate posting list) */ + IndexTuple base; + Size itupsz = BTreeTupleGetPostingOffset(itup); + + itupsz = MAXALIGN(itupsz); + currItem->tupleOffset = so->currPos.nextTupleOffset; + base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset); + memcpy(base, itup, itupsz); + /* Defensively reduce work area index tuple header size */ + base->t_info &= ~INDEX_SIZE_MASK; + base->t_info |= itupsz; + so->currPos.nextTupleOffset += itupsz; + + return currItem->tupleOffset; + } + + return 0; +} + +/* + * Save an index item into so->currPos.items[itemIndex] for current posting + * tuple. + * + * Assumes that _bt_setuppostingitems() has already been called for current + * posting list tuple. Caller passes its return value as tupleOffset. + */ +static inline void +_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid, int tupleOffset) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + currItem->heapTid = *heapTid; + currItem->indexOffset = offnum; + + /* + * Have index-only scans return the same base IndexTuple for every TID + * that originates from the same posting list + */ + if (so->currTuples) + currItem->tupleOffset = tupleOffset; +} + +/* + * _bt_steppage() -- Step to next page containing valid data for scan + * + * On entry, if so->currPos.buf is valid the buffer is pinned but not locked; + * if pinned, we'll drop the pin before moving to next page. The buffer is + * not locked on entry. + * + * For success on a scan using a non-MVCC snapshot we hold a pin, but not a + * read lock, on that page. If we do not hold the pin, we set so->currPos.buf + * to InvalidBuffer. We return true to indicate success. + */ +static bool +_bt_steppage(IndexScanDesc scan, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BlockNumber blkno = InvalidBlockNumber; + bool status; + + Assert(BTScanPosIsValid(so->currPos)); + + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0) + _bt_killitems(scan); + + /* + * Before we modify currPos, make a copy of the page data if there was a + * mark position that needs it. + */ + if (so->markItemIndex >= 0) + { + /* bump pin on current buffer for assignment to mark buffer */ + if (BTScanPosIsPinned(so->currPos)) + IncrBufferRefCount(so->currPos.buf); + memcpy(&so->markPos, &so->currPos, + offsetof(BTScanPosData, items[1]) + + so->currPos.lastItem * sizeof(BTScanPosItem)); + if (so->markTuples) + memcpy(so->markTuples, so->currTuples, + so->currPos.nextTupleOffset); + so->markPos.itemIndex = so->markItemIndex; + so->markItemIndex = -1; + } + + if (ScanDirectionIsForward(dir)) + { + /* Walk right to the next page with data */ + if (scan->parallel_scan != NULL) + { + /* + * Seize the scan to get the next block number; if the scan has + * ended already, bail out. + */ + status = _bt_parallel_seize(scan, &blkno); + if (!status) + { + /* release the previous buffer, if pinned */ + BTScanPosUnpinIfPinned(so->currPos); + BTScanPosInvalidate(so->currPos); + return false; + } + } + else + { + /* Not parallel, so use the previously-saved nextPage link. 
*/ + blkno = so->currPos.nextPage; + } + + /* Remember we left a page with data */ + so->currPos.moreLeft = true; + + /* release the previous buffer, if pinned */ + BTScanPosUnpinIfPinned(so->currPos); + } + else + { + /* Remember we left a page with data */ + so->currPos.moreRight = true; + + if (scan->parallel_scan != NULL) + { + /* + * Seize the scan to get the current block number; if the scan has + * ended already, bail out. + */ + status = _bt_parallel_seize(scan, &blkno); + BTScanPosUnpinIfPinned(so->currPos); + if (!status) + { + BTScanPosInvalidate(so->currPos); + return false; + } + } + else + { + /* Not parallel, so just use our own notion of the current page */ + blkno = so->currPos.currPage; + } + } + + if (!_bt_readnextpage(scan, blkno, dir)) + return false; + + /* Drop the lock, and maybe the pin, on the current page */ + _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + + return true; +} + +/* + * _bt_readnextpage() -- Read next page containing valid data for scan + * + * On success exit, so->currPos is updated to contain data from the next + * interesting page. Caller is responsible to release lock and pin on + * buffer on success. We return true to indicate success. + * + * If there are no more matching records in the given direction, we drop all + * locks and pins, set so->currPos.buf to InvalidBuffer, and return false. + */ +static bool +_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Relation rel; + Page page; + BTPageOpaque opaque; + bool status; + + rel = scan->indexRelation; + + if (ScanDirectionIsForward(dir)) + { + for (;;) + { + /* + * if we're at end of scan, give up and mark parallel scan as + * done, so that all the workers can finish their scan + */ + if (blkno == P_NONE || !so->currPos.moreRight) + { + _bt_parallel_done(scan); + BTScanPosInvalidate(so->currPos); + return false; + } + /* check for interrupts while we're not holding any buffer lock */ + CHECK_FOR_INTERRUPTS(); + /* step right one page */ + so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(so->currPos.buf); + TestForOldSnapshot(scan->xs_snapshot, rel, page); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + /* check for deleted page */ + if (!P_IGNORE(opaque)) + { + PredicateLockPage(rel, blkno, scan->xs_snapshot); + /* see if there are any matches on this page */ + /* note that this will clear moreRight if we can stop */ + if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque))) + break; + } + else if (scan->parallel_scan != NULL) + { + /* allow next page be processed by parallel worker */ + _bt_parallel_release(scan, opaque->btpo_next); + } + + /* nope, keep going */ + if (scan->parallel_scan != NULL) + { + _bt_relbuf(rel, so->currPos.buf); + status = _bt_parallel_seize(scan, &blkno); + if (!status) + { + BTScanPosInvalidate(so->currPos); + return false; + } + } + else + { + blkno = opaque->btpo_next; + _bt_relbuf(rel, so->currPos.buf); + } + } + } + else + { + /* + * Should only happen in parallel cases, when some other backend + * advanced the scan. + */ + if (so->currPos.currPage != blkno) + { + BTScanPosUnpinIfPinned(so->currPos); + so->currPos.currPage = blkno; + } + + /* + * Walk left to the next page with data. This is much more complex + * than the walk-right case because of the possibility that the page + * to our left splits while we are in flight to it, plus the + * possibility that the page we were on gets deleted after we leave + * it. See nbtree/README for details. 
+ * + * It might be possible to rearrange this code to have less overhead + * in pinning and locking, but that would require capturing the left + * pointer when the page is initially read, and using it here, along + * with big changes to _bt_walk_left() and the code below. It is not + * clear whether this would be a win, since if the page immediately to + * the left splits after we read this page and before we step left, we + * would need to visit more pages than with the current code. + * + * Note that if we change the code so that we drop the pin for a scan + * which uses a non-MVCC snapshot, we will need to modify the code for + * walking left, to allow for the possibility that a referenced page + * has been deleted. As long as the buffer is pinned or the snapshot + * is MVCC the page cannot move past the half-dead state to fully + * deleted. + */ + if (BTScanPosIsPinned(so->currPos)) + _bt_lockbuf(rel, so->currPos.buf, BT_READ); + else + so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ); + + for (;;) + { + /* Done if we know there are no matching keys to the left */ + if (!so->currPos.moreLeft) + { + _bt_relbuf(rel, so->currPos.buf); + _bt_parallel_done(scan); + BTScanPosInvalidate(so->currPos); + return false; + } + + /* Step to next physical page */ + so->currPos.buf = _bt_walk_left(rel, so->currPos.buf, + scan->xs_snapshot); + + /* if we're physically at end of index, return failure */ + if (so->currPos.buf == InvalidBuffer) + { + _bt_parallel_done(scan); + BTScanPosInvalidate(so->currPos); + return false; + } + + /* + * Okay, we managed to move left to a non-deleted page. Done if + * it's not half-dead and contains matching tuples. Else loop back + * and do it all again. + */ + page = BufferGetPage(so->currPos.buf); + TestForOldSnapshot(scan->xs_snapshot, rel, page); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (!P_IGNORE(opaque)) + { + PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot); + /* see if there are any matches on this page */ + /* note that this will clear moreLeft if we can stop */ + if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page))) + break; + } + else if (scan->parallel_scan != NULL) + { + /* allow next page be processed by parallel worker */ + _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf)); + } + + /* + * For parallel scans, get the last page scanned as it is quite + * possible that by the time we try to seize the scan, some other + * worker has already advanced the scan to a different page. We + * must continue based on the latest page scanned by any worker. + */ + if (scan->parallel_scan != NULL) + { + _bt_relbuf(rel, so->currPos.buf); + status = _bt_parallel_seize(scan, &blkno); + if (!status) + { + BTScanPosInvalidate(so->currPos); + return false; + } + so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ); + } + } + } + + return true; +} + +/* + * _bt_parallel_readpage() -- Read current page containing valid data for scan + * + * On success, release lock and maybe pin on buffer. We return true to + * indicate success. 
+ */ +static bool +_bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + _bt_initialize_more_data(so, dir); + + if (!_bt_readnextpage(scan, blkno, dir)) + return false; + + /* Drop the lock, and maybe the pin, on the current page */ + _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + + return true; +} + +/* + * _bt_walk_left() -- step left one page, if possible + * + * The given buffer must be pinned and read-locked. This will be dropped + * before stepping left. On return, we have pin and read lock on the + * returned page, instead. + * + * Returns InvalidBuffer if there is no page to the left (no lock is held + * in that case). + * + * When working on a non-leaf level, it is possible for the returned page + * to be half-dead; the caller should check that condition and step left + * again if it's important. + */ +static Buffer +_bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot) +{ + Page page; + BTPageOpaque opaque; + + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + for (;;) + { + BlockNumber obknum; + BlockNumber lblkno; + BlockNumber blkno; + int tries; + + /* if we're at end of tree, release buf and return failure */ + if (P_LEFTMOST(opaque)) + { + _bt_relbuf(rel, buf); + break; + } + /* remember original page we are stepping left from */ + obknum = BufferGetBlockNumber(buf); + /* step left */ + blkno = lblkno = opaque->btpo_prev; + _bt_relbuf(rel, buf); + /* check for interrupts while we're not holding any buffer lock */ + CHECK_FOR_INTERRUPTS(); + buf = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(buf); + TestForOldSnapshot(snapshot, rel, page); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* + * If this isn't the page we want, walk right till we find what we + * want --- but go no more than four hops (an arbitrary limit). If we + * don't find the correct page by then, the most likely bet is that + * the original page got deleted and isn't in the sibling chain at all + * anymore, not that its left sibling got split more than four times. + * + * Note that it is correct to test P_ISDELETED not P_IGNORE here, + * because half-dead pages are still in the sibling chain. Caller + * must reject half-dead pages if wanted. + */ + tries = 0; + for (;;) + { + if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum) + { + /* Found desired page, return it */ + return buf; + } + if (P_RIGHTMOST(opaque) || ++tries > 4) + break; + blkno = opaque->btpo_next; + buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ); + page = BufferGetPage(buf); + TestForOldSnapshot(snapshot, rel, page); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + } + + /* Return to the original page to see what's up */ + buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ); + page = BufferGetPage(buf); + TestForOldSnapshot(snapshot, rel, page); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (P_ISDELETED(opaque)) + { + /* + * It was deleted. Move right to first nondeleted page (there + * must be one); that is the page that has acquired the deleted + * one's keyspace, so stepping left from it will take us where we + * want to be. 
+ */ + for (;;) + { + if (P_RIGHTMOST(opaque)) + elog(ERROR, "fell off the end of index \"%s\"", + RelationGetRelationName(rel)); + blkno = opaque->btpo_next; + buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ); + page = BufferGetPage(buf); + TestForOldSnapshot(snapshot, rel, page); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (!P_ISDELETED(opaque)) + break; + } + + /* + * Now return to top of loop, resetting obknum to point to this + * nondeleted page, and try again. + */ + } + else + { + /* + * It wasn't deleted; the explanation had better be that the page + * to the left got split or deleted. Without this check, we'd go + * into an infinite loop if there's anything wrong. + */ + if (opaque->btpo_prev == lblkno) + elog(ERROR, "could not find left sibling of block %u in index \"%s\"", + obknum, RelationGetRelationName(rel)); + /* Okay to try again with new lblkno value */ + } + } + + return InvalidBuffer; +} + +/* + * _bt_get_endpoint() -- Find the first or last page on a given tree level + * + * If the index is empty, we will return InvalidBuffer; any other failure + * condition causes ereport(). We will not return a dead page. + * + * The returned buffer is pinned and read-locked. + */ +Buffer +_bt_get_endpoint(Relation rel, uint32 level, bool rightmost, + Snapshot snapshot) +{ + Buffer buf; + Page page; + BTPageOpaque opaque; + OffsetNumber offnum; + BlockNumber blkno; + IndexTuple itup; + + /* + * If we are looking for a leaf page, okay to descend from fast root; + * otherwise better descend from true root. (There is no point in being + * smarter about intermediate levels.) + */ + if (level == 0) + buf = _bt_getroot(rel, BT_READ); + else + buf = _bt_gettrueroot(rel); + + if (!BufferIsValid(buf)) + return InvalidBuffer; + + page = BufferGetPage(buf); + TestForOldSnapshot(snapshot, rel, page); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + for (;;) + { + /* + * If we landed on a deleted page, step right to find a live page + * (there must be one). Also, if we want the rightmost page, step + * right if needed to get to it (this could happen if the page split + * since we obtained a pointer to it). + */ + while (P_IGNORE(opaque) || + (rightmost && !P_RIGHTMOST(opaque))) + { + blkno = opaque->btpo_next; + if (blkno == P_NONE) + elog(ERROR, "fell off the end of index \"%s\"", + RelationGetRelationName(rel)); + buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ); + page = BufferGetPage(buf); + TestForOldSnapshot(snapshot, rel, page); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + } + + /* Done? */ + if (opaque->btpo_level == level) + break; + if (opaque->btpo_level < level) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("btree level %u not found in index \"%s\"", + level, RelationGetRelationName(rel)))); + + /* Descend to leftmost or rightmost child page */ + if (rightmost) + offnum = PageGetMaxOffsetNumber(page); + else + offnum = P_FIRSTDATAKEY(opaque); + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + blkno = BTreeTupleGetDownLink(itup); + + buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + } + + return buf; +} + +/* + * _bt_endpoint() -- Find the first or last page in the index, and scan + * from there to the first key satisfying all the quals. 
+ * + * This is used by _bt_first() to set up a scan when we've determined + * that the scan must start at the beginning or end of the index (for + * a forward or backward scan respectively). Exit conditions are the + * same as for _bt_first(). + */ +static bool +_bt_endpoint(IndexScanDesc scan, ScanDirection dir) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Buffer buf; + Page page; + BTPageOpaque opaque; + OffsetNumber start; + BTScanPosItem *currItem; + + /* + * Scan down to the leftmost or rightmost leaf page. This is a simplified + * version of _bt_search(). We don't maintain a stack since we know we + * won't need it. + */ + buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot); + + if (!BufferIsValid(buf)) + { + /* + * Empty index. Lock the whole relation, as nothing finer to lock + * exists. + */ + PredicateLockRelation(rel, scan->xs_snapshot); + BTScanPosInvalidate(so->currPos); + return false; + } + + PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + Assert(P_ISLEAF(opaque)); + + if (ScanDirectionIsForward(dir)) + { + /* There could be dead pages to the left, so not this: */ + /* Assert(P_LEFTMOST(opaque)); */ + + start = P_FIRSTDATAKEY(opaque); + } + else if (ScanDirectionIsBackward(dir)) + { + Assert(P_RIGHTMOST(opaque)); + + start = PageGetMaxOffsetNumber(page); + } + else + { + elog(ERROR, "invalid scan direction: %d", (int) dir); + start = 0; /* keep compiler quiet */ + } + + /* remember which buffer we have pinned */ + so->currPos.buf = buf; + + _bt_initialize_more_data(so, dir); + + /* + * Now load data from the first page of the scan. + */ + if (!_bt_readpage(scan, dir, start)) + { + /* + * There's no actually-matching data on this page. Try to advance to + * the next page. Return false if there's no matching data at all. + */ + _bt_unlockbuf(scan->indexRelation, so->currPos.buf); + if (!_bt_steppage(scan, dir)) + return false; + } + else + { + /* Drop the lock, and maybe the pin, on the current page */ + _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + } + + /* OK, itemIndex says what to return */ + currItem = &so->currPos.items[so->currPos.itemIndex]; + scan->xs_heaptid = currItem->heapTid; + if (scan->xs_want_itup) + scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + + return true; +} + +/* + * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately + * for scan direction + */ +static inline void +_bt_initialize_more_data(BTScanOpaque so, ScanDirection dir) +{ + /* initialize moreLeft/moreRight appropriately for scan direction */ + if (ScanDirectionIsForward(dir)) + { + so->currPos.moreLeft = false; + so->currPos.moreRight = true; + } + else + { + so->currPos.moreLeft = true; + so->currPos.moreRight = false; + } + so->numKilled = 0; /* just paranoia */ + so->markItemIndex = -1; /* ditto */ +} diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c new file mode 100644 index 0000000..78f78e7 --- /dev/null +++ b/src/backend/access/nbtree/nbtsort.c @@ -0,0 +1,2016 @@ +/*------------------------------------------------------------------------- + * + * nbtsort.c + * Build a btree from sorted input by loading leaf pages sequentially. + * + * NOTES + * + * We use tuplesort.c to sort the given index tuples into order. + * Then we scan the index tuples in order and build the btree pages + * for each level. 
We load source tuples into leaf-level pages. + * Whenever we fill a page at one level, we add a link to it to its + * parent level (starting a new parent level if necessary). When + * done, we write out each final page on each level, adding it to + * its parent level. When we have only one page on a level, it must be + * the root -- it can be attached to the btree metapage and we are done. + * + * It is not wise to pack the pages entirely full, since then *any* + * insertion would cause a split (and not only of the leaf page; the need + * for a split would cascade right up the tree). The steady-state load + * factor for btrees is usually estimated at 70%. We choose to pack leaf + * pages to the user-controllable fill factor (default 90%) while upper pages + * are always packed to 70%. This gives us reasonable density (there aren't + * many upper pages if the keys are reasonable-size) without risking a lot of + * cascading splits during early insertions. + * + * Formerly the index pages being built were kept in shared buffers, but + * that is of no value (since other backends have no interest in them yet) + * and it created locking problems for CHECKPOINT, because the upper-level + * pages were held exclusive-locked for long periods. Now we just build + * the pages in local memory and smgrwrite or smgrextend them as we finish + * them. They will need to be re-read into shared buffers on first use after + * the build finishes. + * + * This code isn't concerned about the FSM at all. The caller is responsible + * for initializing that. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtsort.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/parallel.h" +#include "access/relscan.h" +#include "access/table.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "catalog/index.h" +#include "commands/progress.h" +#include "executor/instrument.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/smgr.h" +#include "tcop/tcopprot.h" /* pgrminclude ignore */ +#include "utils/rel.h" +#include "utils/sortsupport.h" +#include "utils/tuplesort.h" + + +/* Magic numbers for parallel state sharing */ +#define PARALLEL_KEY_BTREE_SHARED UINT64CONST(0xA000000000000001) +#define PARALLEL_KEY_TUPLESORT UINT64CONST(0xA000000000000002) +#define PARALLEL_KEY_TUPLESORT_SPOOL2 UINT64CONST(0xA000000000000003) +#define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xA000000000000004) +#define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xA000000000000005) +#define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xA000000000000006) + +/* + * DISABLE_LEADER_PARTICIPATION disables the leader's participation in + * parallel index builds. This may be useful as a debugging aid. +#undef DISABLE_LEADER_PARTICIPATION + */ + +/* + * Status record for spooling/sorting phase. (Note we may have two of + * these due to the special requirements for uniqueness-checking with + * dead tuples.) + */ +typedef struct BTSpool +{ + Tuplesortstate *sortstate; /* state data for tuplesort.c */ + Relation heap; + Relation index; + bool isunique; +} BTSpool; + +/* + * Status for index builds performed in parallel. This is allocated in a + * dynamic shared memory segment. 
Note that there is a separate tuplesort TOC + * entry, private to tuplesort.c but allocated by this module on its behalf. + */ +typedef struct BTShared +{ + /* + * These fields are not modified during the sort. They primarily exist + * for the benefit of worker processes that need to create BTSpool state + * corresponding to that used by the leader. + */ + Oid heaprelid; + Oid indexrelid; + bool isunique; + bool isconcurrent; + int scantuplesortstates; + + /* + * workersdonecv is used to monitor the progress of workers. All parallel + * participants must indicate that they are done before leader can use + * mutable state that workers maintain during scan (and before leader can + * proceed to tuplesort_performsort()). + */ + ConditionVariable workersdonecv; + + /* + * mutex protects all fields before heapdesc. + * + * These fields contain status information of interest to B-Tree index + * builds that must work just the same when an index is built in parallel. + */ + slock_t mutex; + + /* + * Mutable state that is maintained by workers, and reported back to + * leader at end of parallel scan. + * + * nparticipantsdone is number of worker processes finished. + * + * reltuples is the total number of input heap tuples. + * + * havedead indicates if RECENTLY_DEAD tuples were encountered during + * build. + * + * indtuples is the total number of tuples that made it into the index. + * + * brokenhotchain indicates if any worker detected a broken HOT chain + * during build. + */ + int nparticipantsdone; + double reltuples; + bool havedead; + double indtuples; + bool brokenhotchain; + + /* + * ParallelTableScanDescData data follows. Can't directly embed here, as + * implementations of the parallel table scan desc interface might need + * stronger alignment. + */ +} BTShared; + +/* + * Return pointer to a BTShared's parallel table scan. + * + * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just + * MAXALIGN. + */ +#define ParallelTableScanFromBTShared(shared) \ + (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared))) + +/* + * Status for leader in parallel index build. + */ +typedef struct BTLeader +{ + /* parallel context itself */ + ParallelContext *pcxt; + + /* + * nparticipanttuplesorts is the exact number of worker processes + * successfully launched, plus one leader process if it participates as a + * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader + * participating as a worker). + */ + int nparticipanttuplesorts; + + /* + * Leader process convenience pointers to shared state (leader avoids TOC + * lookups). + * + * btshared is the shared state for entire build. sharedsort is the + * shared, tuplesort-managed state passed to each process tuplesort. + * sharedsort2 is the corresponding btspool2 shared state, used only when + * building unique indexes. snapshot is the snapshot used by the scan iff + * an MVCC snapshot is required. + */ + BTShared *btshared; + Sharedsort *sharedsort; + Sharedsort *sharedsort2; + Snapshot snapshot; + WalUsage *walusage; + BufferUsage *bufferusage; +} BTLeader; + +/* + * Working state for btbuild and its callback. + * + * When parallel CREATE INDEX is used, there is a BTBuildState for each + * participant. + */ +typedef struct BTBuildState +{ + bool isunique; + bool havedead; + Relation heap; + BTSpool *spool; + + /* + * spool2 is needed only when the index is a unique index. Dead tuples are + * put into spool2 instead of spool in order to avoid uniqueness check. 
+ */ + BTSpool *spool2; + double indtuples; + + /* + * btleader is only present when a parallel index build is performed, and + * only in the leader process. (Actually, only the leader has a + * BTBuildState. Workers have their own spool and spool2, though.) + */ + BTLeader *btleader; +} BTBuildState; + +/* + * Status record for a btree page being built. We have one of these + * for each active tree level. + */ +typedef struct BTPageState +{ + Page btps_page; /* workspace for page building */ + BlockNumber btps_blkno; /* block # to write this page at */ + IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */ + OffsetNumber btps_lastoff; /* last item offset loaded */ + Size btps_lastextra; /* last item's extra posting list space */ + uint32 btps_level; /* tree level (0 = leaf) */ + Size btps_full; /* "full" if less than this much free space */ + struct BTPageState *btps_next; /* link to parent level, if any */ +} BTPageState; + +/* + * Overall status record for index writing phase. + */ +typedef struct BTWriteState +{ + Relation heap; + Relation index; + BTScanInsert inskey; /* generic insertion scankey */ + bool btws_use_wal; /* dump pages to WAL? */ + BlockNumber btws_pages_alloced; /* # pages allocated */ + BlockNumber btws_pages_written; /* # pages written out */ + Page btws_zeropage; /* workspace for filling zeroes */ +} BTWriteState; + + +static double _bt_spools_heapscan(Relation heap, Relation index, + BTBuildState *buildstate, IndexInfo *indexInfo); +static void _bt_spooldestroy(BTSpool *btspool); +static void _bt_spool(BTSpool *btspool, ItemPointer self, + Datum *values, bool *isnull); +static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2); +static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values, + bool *isnull, bool tupleIsAlive, void *state); +static Page _bt_blnewpage(uint32 level); +static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level); +static void _bt_slideleft(Page rightmostpage); +static void _bt_sortaddtup(Page page, Size itemsize, + IndexTuple itup, OffsetNumber itup_off, + bool newfirstdataitem); +static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, + IndexTuple itup, Size truncextra); +static void _bt_sort_dedup_finish_pending(BTWriteState *wstate, + BTPageState *state, + BTDedupState dstate); +static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state); +static void _bt_load(BTWriteState *wstate, + BTSpool *btspool, BTSpool *btspool2); +static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, + int request); +static void _bt_end_parallel(BTLeader *btleader); +static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot); +static double _bt_parallel_heapscan(BTBuildState *buildstate, + bool *brokenhotchain); +static void _bt_leader_participate_as_worker(BTBuildState *buildstate); +static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, + BTShared *btshared, Sharedsort *sharedsort, + Sharedsort *sharedsort2, int sortmem, + bool progress); + + +/* + * btbuild() -- build a new btree index. 
+ */ +IndexBuildResult * +btbuild(Relation heap, Relation index, IndexInfo *indexInfo) +{ + IndexBuildResult *result; + BTBuildState buildstate; + double reltuples; + +#ifdef BTREE_BUILD_STATS + if (log_btree_build_stats) + ResetUsage(); +#endif /* BTREE_BUILD_STATS */ + + buildstate.isunique = indexInfo->ii_Unique; + buildstate.havedead = false; + buildstate.heap = heap; + buildstate.spool = NULL; + buildstate.spool2 = NULL; + buildstate.indtuples = 0; + buildstate.btleader = NULL; + + /* + * We expect to be called exactly once for any index relation. If that's + * not the case, big trouble's what we have. + */ + if (RelationGetNumberOfBlocks(index) != 0) + elog(ERROR, "index \"%s\" already contains data", + RelationGetRelationName(index)); + + reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo); + + /* + * Finish the build by (1) completing the sort of the spool file, (2) + * inserting the sorted tuples into btree pages and (3) building the upper + * levels. Finally, it may also be necessary to end use of parallelism. + */ + _bt_leafbuild(buildstate.spool, buildstate.spool2); + _bt_spooldestroy(buildstate.spool); + if (buildstate.spool2) + _bt_spooldestroy(buildstate.spool2); + if (buildstate.btleader) + _bt_end_parallel(buildstate.btleader); + + result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); + + result->heap_tuples = reltuples; + result->index_tuples = buildstate.indtuples; + +#ifdef BTREE_BUILD_STATS + if (log_btree_build_stats) + { + ShowUsage("BTREE BUILD STATS"); + ResetUsage(); + } +#endif /* BTREE_BUILD_STATS */ + + return result; +} + +/* + * Create and initialize one or two spool structures, and save them in caller's + * buildstate argument. May also fill-in fields within indexInfo used by index + * builds. + * + * Scans the heap, possibly in parallel, filling spools with IndexTuples. This + * routine encapsulates all aspects of managing parallelism. Caller need only + * call _bt_end_parallel() in parallel case after it is done with spool/spool2. + * + * Returns the total number of heap tuples scanned. + */ +static double +_bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, + IndexInfo *indexInfo) +{ + BTSpool *btspool = (BTSpool *) palloc0(sizeof(BTSpool)); + SortCoordinate coordinate = NULL; + double reltuples = 0; + + /* + * We size the sort area as maintenance_work_mem rather than work_mem to + * speed index creation. This should be OK since a single backend can't + * run multiple index creations in parallel (see also: notes on + * parallelism and maintenance_work_mem below). + */ + btspool->heap = heap; + btspool->index = index; + btspool->isunique = indexInfo->ii_Unique; + + /* Save as primary spool */ + buildstate->spool = btspool; + + /* Report table scan phase started */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, + PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN); + + /* Attempt to launch parallel worker scan when required */ + if (indexInfo->ii_ParallelWorkers > 0) + _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent, + indexInfo->ii_ParallelWorkers); + + /* + * If parallel build requested and at least one worker process was + * successfully launched, set up coordination state + */ + if (buildstate->btleader) + { + coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData)); + coordinate->isWorker = false; + coordinate->nParticipants = + buildstate->btleader->nparticipanttuplesorts; + coordinate->sharedsort = buildstate->btleader->sharedsort; + } + + /* + * Begin serial/leader tuplesort. 
+ * + * In cases where parallelism is involved, the leader receives the same + * share of maintenance_work_mem as a serial sort (it is generally treated + * in the same way as a serial sort once we return). Parallel worker + * Tuplesortstates will have received only a fraction of + * maintenance_work_mem, though. + * + * We rely on the lifetime of the Leader Tuplesortstate almost not + * overlapping with any worker Tuplesortstate's lifetime. There may be + * some small overlap, but that's okay because we rely on leader + * Tuplesortstate only allocating a small, fixed amount of memory here. + * When its tuplesort_performsort() is called (by our caller), and + * significant amounts of memory are likely to be used, all workers must + * have already freed almost all memory held by their Tuplesortstates + * (they are about to go away completely, too). The overall effect is + * that maintenance_work_mem always represents an absolute high watermark + * on the amount of memory used by a CREATE INDEX operation, regardless of + * the use of parallelism or any other factor. + */ + buildstate->spool->sortstate = + tuplesort_begin_index_btree(heap, index, buildstate->isunique, + maintenance_work_mem, coordinate, + false); + + /* + * If building a unique index, put dead tuples in a second spool to keep + * them out of the uniqueness check. We expect that the second spool (for + * dead tuples) won't get very full, so we give it only work_mem. + */ + if (indexInfo->ii_Unique) + { + BTSpool *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool)); + SortCoordinate coordinate2 = NULL; + + /* Initialize secondary spool */ + btspool2->heap = heap; + btspool2->index = index; + btspool2->isunique = false; + /* Save as secondary spool */ + buildstate->spool2 = btspool2; + + if (buildstate->btleader) + { + /* + * Set up non-private state that is passed to + * tuplesort_begin_index_btree() about the basic high level + * coordination of a parallel sort. + */ + coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData)); + coordinate2->isWorker = false; + coordinate2->nParticipants = + buildstate->btleader->nparticipanttuplesorts; + coordinate2->sharedsort = buildstate->btleader->sharedsort2; + } + + /* + * We expect that the second one (for dead tuples) won't get very + * full, so we give it only work_mem + */ + buildstate->spool2->sortstate = + tuplesort_begin_index_btree(heap, index, false, work_mem, + coordinate2, false); + } + + /* Fill spool using either serial or parallel heap scan */ + if (!buildstate->btleader) + reltuples = table_index_build_scan(heap, index, indexInfo, true, true, + _bt_build_callback, (void *) buildstate, + NULL); + else + reltuples = _bt_parallel_heapscan(buildstate, + &indexInfo->ii_BrokenHotChain); + + /* + * Set the progress target for the next phase. Reset the block number + * values set by table_index_build_scan + */ + { + const int progress_index[] = { + PROGRESS_CREATEIDX_TUPLES_TOTAL, + PROGRESS_SCAN_BLOCKS_TOTAL, + PROGRESS_SCAN_BLOCKS_DONE + }; + const int64 progress_vals[] = { + buildstate->indtuples, + 0, 0 + }; + + pgstat_progress_update_multi_param(3, progress_index, progress_vals); + } + + /* okay, all heap tuples are spooled */ + if (buildstate->spool2 && !buildstate->havedead) + { + /* spool2 turns out to be unnecessary */ + _bt_spooldestroy(buildstate->spool2); + buildstate->spool2 = NULL; + } + + return reltuples; +} + +/* + * clean up a spool structure and its substructures. 
+ */ +static void +_bt_spooldestroy(BTSpool *btspool) +{ + tuplesort_end(btspool->sortstate); + pfree(btspool); +} + +/* + * spool an index entry into the sort file. + */ +static void +_bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull) +{ + tuplesort_putindextuplevalues(btspool->sortstate, btspool->index, + self, values, isnull); +} + +/* + * given a spool loaded by successive calls to _bt_spool, + * create an entire btree. + */ +static void +_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) +{ + BTWriteState wstate; + +#ifdef BTREE_BUILD_STATS + if (log_btree_build_stats) + { + ShowUsage("BTREE BUILD (Spool) STATISTICS"); + ResetUsage(); + } +#endif /* BTREE_BUILD_STATS */ + + /* Execute the sort */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, + PROGRESS_BTREE_PHASE_PERFORMSORT_1); + tuplesort_performsort(btspool->sortstate); + if (btspool2) + { + pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, + PROGRESS_BTREE_PHASE_PERFORMSORT_2); + tuplesort_performsort(btspool2->sortstate); + } + + wstate.heap = btspool->heap; + wstate.index = btspool->index; + wstate.inskey = _bt_mkscankey(wstate.index, NULL); + /* _bt_mkscankey() won't set allequalimage without metapage */ + wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true); + wstate.btws_use_wal = RelationNeedsWAL(wstate.index); + + /* reserve the metapage */ + wstate.btws_pages_alloced = BTREE_METAPAGE + 1; + wstate.btws_pages_written = 0; + wstate.btws_zeropage = NULL; /* until needed */ + + pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, + PROGRESS_BTREE_PHASE_LEAF_LOAD); + _bt_load(&wstate, btspool, btspool2); +} + +/* + * Per-tuple callback for table_index_build_scan + */ +static void +_bt_build_callback(Relation index, + ItemPointer tid, + Datum *values, + bool *isnull, + bool tupleIsAlive, + void *state) +{ + BTBuildState *buildstate = (BTBuildState *) state; + + /* + * insert the index tuple into the appropriate spool file for subsequent + * processing + */ + if (tupleIsAlive || buildstate->spool2 == NULL) + _bt_spool(buildstate->spool, tid, values, isnull); + else + { + /* dead tuples are put into spool2 */ + buildstate->havedead = true; + _bt_spool(buildstate->spool2, tid, values, isnull); + } + + buildstate->indtuples += 1; +} + +/* + * allocate workspace for a new, clean btree page, not linked to any siblings. + */ +static Page +_bt_blnewpage(uint32 level) +{ + Page page; + BTPageOpaque opaque; + + page = (Page) palloc(BLCKSZ); + + /* Zero the page and set up standard page header info */ + _bt_pageinit(page, BLCKSZ); + + /* Initialize BT opaque state */ + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + opaque->btpo_prev = opaque->btpo_next = P_NONE; + opaque->btpo_level = level; + opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF; + opaque->btpo_cycleid = 0; + + /* Make the P_HIKEY line pointer appear allocated */ + ((PageHeader) page)->pd_lower += sizeof(ItemIdData); + + return page; +} + +/* + * emit a completed btree page, and release the working storage. + */ +static void +_bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno) +{ + /* Ensure rd_smgr is open (could have been closed by relcache flush!) */ + RelationOpenSmgr(wstate->index); + + /* XLOG stuff */ + if (wstate->btws_use_wal) + { + /* We use the XLOG_FPI record type for this */ + log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page, true); + } + + /* + * If we have to write pages nonsequentially, fill in the space with + * zeroes until we come back and overwrite. 
This is not logically + * necessary on standard Unix filesystems (unwritten space will read as + * zeroes anyway), but it should help to avoid fragmentation. The dummy + * pages aren't WAL-logged though. + */ + while (blkno > wstate->btws_pages_written) + { + if (!wstate->btws_zeropage) + wstate->btws_zeropage = (Page) palloc0(BLCKSZ); + /* don't set checksum for all-zero page */ + smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, + wstate->btws_pages_written++, + (char *) wstate->btws_zeropage, + true); + } + + PageSetChecksumInplace(page, blkno); + + /* + * Now write the page. There's no need for smgr to schedule an fsync for + * this write; we'll do it ourselves before ending the build. + */ + if (blkno == wstate->btws_pages_written) + { + /* extending the file... */ + smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, blkno, + (char *) page, true); + wstate->btws_pages_written++; + } + else + { + /* overwriting a block we zero-filled before */ + smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno, + (char *) page, true); + } + + pfree(page); +} + +/* + * allocate and initialize a new BTPageState. the returned structure + * is suitable for immediate use by _bt_buildadd. + */ +static BTPageState * +_bt_pagestate(BTWriteState *wstate, uint32 level) +{ + BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState)); + + /* create initial page for level */ + state->btps_page = _bt_blnewpage(level); + + /* and assign it a page position */ + state->btps_blkno = wstate->btws_pages_alloced++; + + state->btps_lowkey = NULL; + /* initialize lastoff so first item goes into P_FIRSTKEY */ + state->btps_lastoff = P_HIKEY; + state->btps_lastextra = 0; + state->btps_level = level; + /* set "full" threshold based on level. See notes at head of file. */ + if (level > 0) + state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100); + else + state->btps_full = BTGetTargetPageFreeSpace(wstate->index); + + /* no parent level, yet */ + state->btps_next = NULL; + + return state; +} + +/* + * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to + * P_HIKEY, overwriting P_HIKEY). + * + * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the + * rightmost page on its level is not supposed to get a high key. Now that + * it's clear that this page is a rightmost page, remove the unneeded empty + * P_HIKEY line pointer space. + */ +static void +_bt_slideleft(Page rightmostpage) +{ + OffsetNumber off; + OffsetNumber maxoff; + ItemId previi; + + maxoff = PageGetMaxOffsetNumber(rightmostpage); + Assert(maxoff >= P_FIRSTKEY); + previi = PageGetItemId(rightmostpage, P_HIKEY); + for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off)) + { + ItemId thisii = PageGetItemId(rightmostpage, off); + + *previi = *thisii; + previi = thisii; + } + ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData); +} + +/* + * Add an item to a page being built. + * + * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant + * raises an error directly. + * + * Note that our nbtsort.c caller does not know yet if the page will be + * rightmost. Offset P_FIRSTKEY is always assumed to be the first data key by + * caller. Page that turns out to be the rightmost on its level is fixed by + * calling _bt_slideleft(). 
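+ *
+ * When newfirstdataitem is true, caller is adding the first data item to an
+ * internal page. Its key is truncated to zero attributes here, since the
+ * leftmost downlink on an internal page is a "minus infinity" item.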
+ */ +static void +_bt_sortaddtup(Page page, + Size itemsize, + IndexTuple itup, + OffsetNumber itup_off, + bool newfirstdataitem) +{ + IndexTupleData trunctuple; + + if (newfirstdataitem) + { + trunctuple = *itup; + trunctuple.t_info = sizeof(IndexTupleData); + BTreeTupleSetNAtts(&trunctuple, 0, false); + itup = &trunctuple; + itemsize = sizeof(IndexTupleData); + } + + if (PageAddItem(page, (Item) itup, itemsize, itup_off, + false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add item to the index page"); +} + +/*---------- + * Add an item to a disk page from the sort output (or add a posting list + * item formed from the sort output). + * + * We must be careful to observe the page layout conventions of nbtsearch.c: + * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY. + * - on non-leaf pages, the key portion of the first item need not be + * stored, we should store only the link. + * + * A leaf page being built looks like: + * + * +----------------+---------------------------------+ + * | PageHeaderData | linp0 linp1 linp2 ... | + * +-----------+----+---------------------------------+ + * | ... linpN | | + * +-----------+--------------------------------------+ + * | ^ last | + * | | + * +-------------+------------------------------------+ + * | | itemN ... | + * +-------------+------------------+-----------------+ + * | ... item3 item2 item1 | "special space" | + * +--------------------------------+-----------------+ + * + * Contrast this with the diagram in bufpage.h; note the mismatch + * between linps and items. This is because we reserve linp0 as a + * placeholder for the pointer to the "high key" item; when we have + * filled up the page, we will set linp0 to point to itemN and clear + * linpN. On the other hand, if we find this is the last (rightmost) + * page, we leave the items alone and slide the linp array over. If + * the high key is to be truncated, offset 1 is deleted, and we insert + * the truncated high key at offset 1. + * + * 'last' pointer indicates the last offset added to the page. + * + * 'truncextra' is the size of the posting list in itup, if any. This + * information is stashed for the next call here, when we may benefit + * from considering the impact of truncating away the posting list on + * the page before deciding to finish the page off. Posting lists are + * often relatively large, so it is worth going to the trouble of + * accounting for the saving from truncating away the posting list of + * the tuple that becomes the high key (that may be the only way to + * get close to target free space on the page). Note that this is + * only used for the soft fillfactor-wise limit, not the critical hard + * limit. + *---------- + */ +static void +_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, + Size truncextra) +{ + Page npage; + BlockNumber nblkno; + OffsetNumber last_off; + Size last_truncextra; + Size pgspc; + Size itupsz; + bool isleaf; + + /* + * This is a handy place to check for cancel interrupts during the btree + * load phase of index creation. 
+ */ + CHECK_FOR_INTERRUPTS(); + + npage = state->btps_page; + nblkno = state->btps_blkno; + last_off = state->btps_lastoff; + last_truncextra = state->btps_lastextra; + state->btps_lastextra = truncextra; + + pgspc = PageGetFreeSpace(npage); + itupsz = IndexTupleSize(itup); + itupsz = MAXALIGN(itupsz); + /* Leaf case has slightly different rules due to suffix truncation */ + isleaf = (state->btps_level == 0); + + /* + * Check whether the new item can fit on a btree page on current level at + * all. + * + * Every newly built index will treat heap TID as part of the keyspace, + * which imposes the requirement that new high keys must occasionally have + * a heap TID appended within _bt_truncate(). That may leave a new pivot + * tuple one or two MAXALIGN() quantums larger than the original + * firstright tuple it's derived from. v4 deals with the problem by + * decreasing the limit on the size of tuples inserted on the leaf level + * by the same small amount. Enforce the new v4+ limit on the leaf level, + * and the old limit on internal levels, since pivot tuples may need to + * make use of the reserved space. This should never fail on internal + * pages. + */ + if (unlikely(itupsz > BTMaxItemSize(npage))) + _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage, + itup); + + /* + * Check to see if current page will fit new item, with space left over to + * append a heap TID during suffix truncation when page is a leaf page. + * + * It is guaranteed that we can fit at least 2 non-pivot tuples plus a + * high key with heap TID when finishing off a leaf page, since we rely on + * _bt_check_third_page() rejecting oversized non-pivot tuples. On + * internal pages we can always fit 3 pivot tuples with larger internal + * page tuple limit (includes page high key). + * + * Most of the time, a page is only "full" in the sense that the soft + * fillfactor-wise limit has been exceeded. However, we must always leave + * at least two items plus a high key on each page before starting a new + * page. Disregard fillfactor and insert on "full" current page if we + * don't have the minimum number of items yet. (Note that we deliberately + * assume that suffix truncation neither enlarges nor shrinks new high key + * when applying soft limit, except when last tuple has a posting list.) + */ + Assert(last_truncextra == 0 || isleaf); + if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) || + (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY)) + { + /* + * Finish off the page and write it out. + */ + Page opage = npage; + BlockNumber oblkno = nblkno; + ItemId ii; + ItemId hii; + IndexTuple oitup; + + /* Create new page of same level */ + npage = _bt_blnewpage(state->btps_level); + + /* and assign it a page position */ + nblkno = wstate->btws_pages_alloced++; + + /* + * We copy the last item on the page into the new page, and then + * rearrange the old page so that the 'last item' becomes its high key + * rather than a true data item. There had better be at least two + * items on the page already, else the page would be empty of useful + * data. + */ + Assert(last_off > P_FIRSTKEY); + ii = PageGetItemId(opage, last_off); + oitup = (IndexTuple) PageGetItem(opage, ii); + _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY, + !isleaf); + + /* + * Move 'last' into the high key position on opage. 
_bt_blnewpage() + * allocated empty space for a line pointer when opage was first + * created, so this is a matter of rearranging already-allocated space + * on page, and initializing high key line pointer. (Actually, leaf + * pages must also swap oitup with a truncated version of oitup, which + * is sometimes larger than oitup, though never by more than the space + * needed to append a heap TID.) + */ + hii = PageGetItemId(opage, P_HIKEY); + *hii = *ii; + ItemIdSetUnused(ii); /* redundant */ + ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData); + + if (isleaf) + { + IndexTuple lastleft; + IndexTuple truncated; + + /* + * Truncate away any unneeded attributes from high key on leaf + * level. This is only done at the leaf level because downlinks + * in internal pages are either negative infinity items, or get + * their contents from copying from one level down. See also: + * _bt_split(). + * + * We don't try to bias our choice of split point to make it more + * likely that _bt_truncate() can truncate away more attributes, + * whereas the split point used within _bt_split() is chosen much + * more delicately. Even still, the lastleft and firstright + * tuples passed to _bt_truncate() here are at least not fully + * equal to each other when deduplication is used, unless there is + * a large group of duplicates (also, unique index builds usually + * have few or no spool2 duplicates). When the split point is + * between two unequal tuples, _bt_truncate() will avoid including + * a heap TID in the new high key, which is the most important + * benefit of suffix truncation. + * + * Overwrite the old item with new truncated high key directly. + * oitup is already located at the physical beginning of tuple + * space, so this should directly reuse the existing tuple space. + */ + ii = PageGetItemId(opage, OffsetNumberPrev(last_off)); + lastleft = (IndexTuple) PageGetItem(opage, ii); + + Assert(IndexTupleSize(oitup) > last_truncextra); + truncated = _bt_truncate(wstate->index, lastleft, oitup, + wstate->inskey); + if (!PageIndexTupleOverwrite(opage, P_HIKEY, (Item) truncated, + IndexTupleSize(truncated))) + elog(ERROR, "failed to add high key to the index page"); + pfree(truncated); + + /* oitup should continue to point to the page's high key */ + hii = PageGetItemId(opage, P_HIKEY); + oitup = (IndexTuple) PageGetItem(opage, hii); + } + + /* + * Link the old page into its parent, using its low key. If we don't + * have a parent, we have to create one; this adds a new btree level. + */ + if (state->btps_next == NULL) + state->btps_next = _bt_pagestate(wstate, state->btps_level + 1); + + Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <= + IndexRelationGetNumberOfKeyAttributes(wstate->index) && + BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) || + P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage))); + Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 || + !P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage))); + BTreeTupleSetDownLink(state->btps_lowkey, oblkno); + _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0); + pfree(state->btps_lowkey); + + /* + * Save a copy of the high key from the old page. It is also the low + * key for the new page. + */ + state->btps_lowkey = CopyIndexTuple(oitup); + + /* + * Set the sibling links for both pages. 
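+ * The new page's btpo_next stays P_NONE for now; it is updated if and
+ * when yet another page gets started at this level.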
+ */ + { + BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage); + BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage); + + oopaque->btpo_next = nblkno; + nopaque->btpo_prev = oblkno; + nopaque->btpo_next = P_NONE; /* redundant */ + } + + /* + * Write out the old page. We never need to touch it again, so we can + * free the opage workspace too. + */ + _bt_blwritepage(wstate, opage, oblkno); + + /* + * Reset last_off to point to new page + */ + last_off = P_FIRSTKEY; + } + + /* + * By here, either original page is still the current page, or a new page + * was created that became the current page. Either way, the current page + * definitely has space for new item. + * + * If the new item is the first for its page, it must also be the first + * item on its entire level. On later same-level pages, a low key for a + * page will be copied from the prior page in the code above. Generate a + * minus infinity low key here instead. + */ + if (last_off == P_HIKEY) + { + Assert(state->btps_lowkey == NULL); + state->btps_lowkey = palloc0(sizeof(IndexTupleData)); + state->btps_lowkey->t_info = sizeof(IndexTupleData); + BTreeTupleSetNAtts(state->btps_lowkey, 0, false); + } + + /* + * Add the new item into the current page. + */ + last_off = OffsetNumberNext(last_off); + _bt_sortaddtup(npage, itupsz, itup, last_off, + !isleaf && last_off == P_FIRSTKEY); + + state->btps_page = npage; + state->btps_blkno = nblkno; + state->btps_lastoff = last_off; +} + +/* + * Finalize pending posting list tuple, and add it to the index. Final tuple + * is based on saved base tuple, and saved list of heap TIDs. + * + * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple + * using _bt_buildadd(). + */ +static void +_bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state, + BTDedupState dstate) +{ + Assert(dstate->nitems > 0); + + if (dstate->nitems == 1) + _bt_buildadd(wstate, state, dstate->base, 0); + else + { + IndexTuple postingtuple; + Size truncextra; + + /* form a tuple with a posting list */ + postingtuple = _bt_form_posting(dstate->base, + dstate->htids, + dstate->nhtids); + /* Calculate posting list overhead */ + truncextra = IndexTupleSize(postingtuple) - + BTreeTupleGetPostingOffset(postingtuple); + + _bt_buildadd(wstate, state, postingtuple, truncextra); + pfree(postingtuple); + } + + dstate->nmaxitems = 0; + dstate->nhtids = 0; + dstate->nitems = 0; + dstate->phystupsize = 0; +} + +/* + * Finish writing out the completed btree. + */ +static void +_bt_uppershutdown(BTWriteState *wstate, BTPageState *state) +{ + BTPageState *s; + BlockNumber rootblkno = P_NONE; + uint32 rootlevel = 0; + Page metapage; + + /* + * Each iteration of this loop completes one more level of the tree. + */ + for (s = state; s != NULL; s = s->btps_next) + { + BlockNumber blkno; + BTPageOpaque opaque; + + blkno = s->btps_blkno; + opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page); + + /* + * We have to link the last page on this level to somewhere. + * + * If we're at the top, it's the root, so attach it to the metapage. + * Otherwise, add an entry for it to its parent using its low key. + * This may cause the last page of the parent level to split, but + * that's not a problem -- we haven't gotten to it yet. 
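+ *
+ * (The BTPageState list is chained from the leaf level upward via
+ * btps_next, so the leaf level is finished first and the prospective
+ * root level last.)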
+ */ + if (s->btps_next == NULL) + { + opaque->btpo_flags |= BTP_ROOT; + rootblkno = blkno; + rootlevel = s->btps_level; + } + else + { + Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <= + IndexRelationGetNumberOfKeyAttributes(wstate->index) && + BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) || + P_LEFTMOST(opaque)); + Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 || + !P_LEFTMOST(opaque)); + BTreeTupleSetDownLink(s->btps_lowkey, blkno); + _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0); + pfree(s->btps_lowkey); + s->btps_lowkey = NULL; + } + + /* + * This is the rightmost page, so the ItemId array needs to be slid + * back one slot. Then we can dump out the page. + */ + _bt_slideleft(s->btps_page); + _bt_blwritepage(wstate, s->btps_page, s->btps_blkno); + s->btps_page = NULL; /* writepage freed the workspace */ + } + + /* + * As the last step in the process, construct the metapage and make it + * point to the new root (unless we had no data at all, in which case it's + * set to point to "P_NONE"). This changes the index to the "valid" state + * by filling in a valid magic number in the metapage. + */ + metapage = (Page) palloc(BLCKSZ); + _bt_initmetapage(metapage, rootblkno, rootlevel, + wstate->inskey->allequalimage); + _bt_blwritepage(wstate, metapage, BTREE_METAPAGE); +} + +/* + * Read tuples in correct sort order from tuplesort, and load them into + * btree leaves. + */ +static void +_bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) +{ + BTPageState *state = NULL; + bool merge = (btspool2 != NULL); + IndexTuple itup, + itup2 = NULL; + bool load1; + TupleDesc tupdes = RelationGetDescr(wstate->index); + int i, + keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index); + SortSupport sortKeys; + int64 tuples_done = 0; + bool deduplicate; + + deduplicate = wstate->inskey->allequalimage && !btspool->isunique && + BTGetDeduplicateItems(wstate->index); + + if (merge) + { + /* + * Another BTSpool for dead tuples exists. Now we have to merge + * btspool and btspool2. + */ + + /* the preparation of merge */ + itup = tuplesort_getindextuple(btspool->sortstate, true); + itup2 = tuplesort_getindextuple(btspool2->sortstate, true); + + /* Prepare SortSupport data for each column */ + sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData)); + + for (i = 0; i < keysz; i++) + { + SortSupport sortKey = sortKeys + i; + ScanKey scanKey = wstate->inskey->scankeys + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Abbreviation is not supported here */ + sortKey->abbreviate = false; + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey); + } + + for (;;) + { + load1 = true; /* load BTSpool next ? 
*/ + if (itup2 == NULL) + { + if (itup == NULL) + break; + } + else if (itup != NULL) + { + int32 compare = 0; + + for (i = 1; i <= keysz; i++) + { + SortSupport entry; + Datum attrDatum1, + attrDatum2; + bool isNull1, + isNull2; + + entry = sortKeys + i - 1; + attrDatum1 = index_getattr(itup, i, tupdes, &isNull1); + attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2); + + compare = ApplySortComparator(attrDatum1, isNull1, + attrDatum2, isNull2, + entry); + if (compare > 0) + { + load1 = false; + break; + } + else if (compare < 0) + break; + } + + /* + * If key values are equal, we sort on ItemPointer. This is + * required for btree indexes, since heap TID is treated as an + * implicit last key attribute in order to ensure that all + * keys in the index are physically unique. + */ + if (compare == 0) + { + compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid); + Assert(compare != 0); + if (compare > 0) + load1 = false; + } + } + else + load1 = false; + + /* When we see first tuple, create first index page */ + if (state == NULL) + state = _bt_pagestate(wstate, 0); + + if (load1) + { + _bt_buildadd(wstate, state, itup, 0); + itup = tuplesort_getindextuple(btspool->sortstate, true); + } + else + { + _bt_buildadd(wstate, state, itup2, 0); + itup2 = tuplesort_getindextuple(btspool2->sortstate, true); + } + + /* Report progress */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, + ++tuples_done); + } + pfree(sortKeys); + } + else if (deduplicate) + { + /* merge is unnecessary, deduplicate into posting lists */ + BTDedupState dstate; + + dstate = (BTDedupState) palloc(sizeof(BTDedupStateData)); + dstate->deduplicate = true; /* unused */ + dstate->nmaxitems = 0; /* unused */ + dstate->maxpostingsize = 0; /* set later */ + /* Metadata about base tuple of current pending posting list */ + dstate->base = NULL; + dstate->baseoff = InvalidOffsetNumber; /* unused */ + dstate->basetupsize = 0; + /* Metadata about current pending posting list TIDs */ + dstate->htids = NULL; + dstate->nhtids = 0; + dstate->nitems = 0; + dstate->phystupsize = 0; /* unused */ + dstate->nintervals = 0; /* unused */ + + while ((itup = tuplesort_getindextuple(btspool->sortstate, + true)) != NULL) + { + /* When we see first tuple, create first index page */ + if (state == NULL) + { + state = _bt_pagestate(wstate, 0); + + /* + * Limit size of posting list tuples to 1/10 space we want to + * leave behind on the page, plus space for final item's line + * pointer. This is equal to the space that we'd like to + * leave behind on each leaf page when fillfactor is 90, + * allowing us to get close to fillfactor% space utilization + * when there happen to be a great many duplicates. (This + * makes higher leaf fillfactor settings ineffective when + * building indexes that have many duplicates, but packing + * leaf pages full with few very large tuples doesn't seem + * like a useful goal.) + */ + dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) - + sizeof(ItemIdData); + Assert(dstate->maxpostingsize <= BTMaxItemSize(state->btps_page) && + dstate->maxpostingsize <= INDEX_SIZE_MASK); + dstate->htids = palloc(dstate->maxpostingsize); + + /* start new pending posting list with itup copy */ + _bt_dedup_start_pending(dstate, CopyIndexTuple(itup), + InvalidOffsetNumber); + } + else if (_bt_keep_natts_fast(wstate->index, dstate->base, + itup) > keysz && + _bt_dedup_save_htid(dstate, itup)) + { + /* + * Tuple is equal to base tuple of pending posting list. Heap + * TID from itup has been saved in state. 
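+ * Nothing more to do here; the saved TID will be emitted as part of
+ * the posting list when the pending list is finished off by
+ * _bt_sort_dedup_finish_pending().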
+ */ + } + else + { + /* + * Tuple is not equal to pending posting list tuple, or + * _bt_dedup_save_htid() opted to not merge current item into + * pending posting list. + */ + _bt_sort_dedup_finish_pending(wstate, state, dstate); + pfree(dstate->base); + + /* start new pending posting list with itup copy */ + _bt_dedup_start_pending(dstate, CopyIndexTuple(itup), + InvalidOffsetNumber); + } + + /* Report progress */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, + ++tuples_done); + } + + if (state) + { + /* + * Handle the last item (there must be a last item when the + * tuplesort returned one or more tuples) + */ + _bt_sort_dedup_finish_pending(wstate, state, dstate); + pfree(dstate->base); + pfree(dstate->htids); + } + + pfree(dstate); + } + else + { + /* merging and deduplication are both unnecessary */ + while ((itup = tuplesort_getindextuple(btspool->sortstate, + true)) != NULL) + { + /* When we see first tuple, create first index page */ + if (state == NULL) + state = _bt_pagestate(wstate, 0); + + _bt_buildadd(wstate, state, itup, 0); + + /* Report progress */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, + ++tuples_done); + } + } + + /* Close down final pages and write the metapage */ + _bt_uppershutdown(wstate, state); + + /* + * When we WAL-logged index pages, we must nonetheless fsync index files. + * Since we're building outside shared buffers, a CHECKPOINT occurring + * during the build has no way to flush the previously written data to + * disk (indeed it won't know the index even exists). A crash later on + * would replay WAL from the checkpoint, therefore it wouldn't replay our + * earlier WAL entries. If we do not fsync those pages here, they might + * still not be on disk when the crash occurs. + */ + if (wstate->btws_use_wal) + { + RelationOpenSmgr(wstate->index); + smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM); + } +} + +/* + * Create parallel context, and launch workers for leader. + * + * buildstate argument should be initialized (with the exception of the + * tuplesort state in spools, which may later be created based on shared + * state initially set up here). + * + * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY. + * + * request is the target number of parallel worker processes to launch. + * + * Sets buildstate's BTLeader, which caller must use to shut down parallel + * mode by passing it to _bt_end_parallel() at the very end of its index + * build. If not even a single worker process can be launched, this is + * never set, and caller should proceed with a serial index build. + */ +static void +_bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) +{ + ParallelContext *pcxt; + int scantuplesortstates; + Snapshot snapshot; + Size estbtshared; + Size estsort; + BTShared *btshared; + Sharedsort *sharedsort; + Sharedsort *sharedsort2; + BTSpool *btspool = buildstate->spool; + BTLeader *btleader = (BTLeader *) palloc0(sizeof(BTLeader)); + WalUsage *walusage; + BufferUsage *bufferusage; + bool leaderparticipates = true; + int querylen; + +#ifdef DISABLE_LEADER_PARTICIPATION + leaderparticipates = false; +#endif + + /* + * Enter parallel mode, and create context for parallel build of btree + * index + */ + EnterParallelMode(); + Assert(request > 0); + pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main", + request); + + scantuplesortstates = leaderparticipates ? request + 1 : request; + + /* + * Prepare for scan of the base relation. 
In a normal index build, we use + * SnapshotAny because we must retrieve all tuples and do our own time + * qual checks (because we have to index RECENTLY_DEAD tuples). In a + * concurrent build, we take a regular MVCC snapshot and index whatever's + * live according to that. + */ + if (!isconcurrent) + snapshot = SnapshotAny; + else + snapshot = RegisterSnapshot(GetTransactionSnapshot()); + + /* + * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and + * PARALLEL_KEY_TUPLESORT tuplesort workspace + */ + estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot); + shm_toc_estimate_chunk(&pcxt->estimator, estbtshared); + estsort = tuplesort_estimate_shared(scantuplesortstates); + shm_toc_estimate_chunk(&pcxt->estimator, estsort); + + /* + * Unique case requires a second spool, and so we may have to account for + * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2 + */ + if (!btspool->isunique) + shm_toc_estimate_keys(&pcxt->estimator, 2); + else + { + shm_toc_estimate_chunk(&pcxt->estimator, estsort); + shm_toc_estimate_keys(&pcxt->estimator, 3); + } + + /* + * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE + * and PARALLEL_KEY_BUFFER_USAGE. + * + * If there are no extensions loaded that care, we could skip this. We + * have no way of knowing whether anyone's looking at pgWalUsage or + * pgBufferUsage, so do it unconditionally. + */ + shm_toc_estimate_chunk(&pcxt->estimator, + mul_size(sizeof(WalUsage), pcxt->nworkers)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + shm_toc_estimate_chunk(&pcxt->estimator, + mul_size(sizeof(BufferUsage), pcxt->nworkers)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */ + if (debug_query_string) + { + querylen = strlen(debug_query_string); + shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1); + shm_toc_estimate_keys(&pcxt->estimator, 1); + } + else + querylen = 0; /* keep compiler quiet */ + + /* Everyone's had a chance to ask for space, so now create the DSM */ + InitializeParallelDSM(pcxt); + + /* If no DSM segment was available, back out (do serial build) */ + if (pcxt->seg == NULL) + { + if (IsMVCCSnapshot(snapshot)) + UnregisterSnapshot(snapshot); + DestroyParallelContext(pcxt); + ExitParallelMode(); + return; + } + + /* Store shared build state, for which we reserved space */ + btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared); + /* Initialize immutable state */ + btshared->heaprelid = RelationGetRelid(btspool->heap); + btshared->indexrelid = RelationGetRelid(btspool->index); + btshared->isunique = btspool->isunique; + btshared->isconcurrent = isconcurrent; + btshared->scantuplesortstates = scantuplesortstates; + ConditionVariableInit(&btshared->workersdonecv); + SpinLockInit(&btshared->mutex); + /* Initialize mutable state */ + btshared->nparticipantsdone = 0; + btshared->reltuples = 0.0; + btshared->havedead = false; + btshared->indtuples = 0.0; + btshared->brokenhotchain = false; + table_parallelscan_initialize(btspool->heap, + ParallelTableScanFromBTShared(btshared), + snapshot); + + /* + * Store shared tuplesort-private state, for which we reserved space. + * Then, initialize opaque state using tuplesort routine. 
+ */ + sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort); + tuplesort_initialize_shared(sharedsort, scantuplesortstates, + pcxt->seg); + + shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort); + + /* Unique case requires a second spool, and associated shared state */ + if (!btspool->isunique) + sharedsort2 = NULL; + else + { + /* + * Store additional shared tuplesort-private state, for which we + * reserved space. Then, initialize opaque state using tuplesort + * routine. + */ + sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort); + tuplesort_initialize_shared(sharedsort2, scantuplesortstates, + pcxt->seg); + + shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2); + } + + /* Store query string for workers */ + if (debug_query_string) + { + char *sharedquery; + + sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1); + memcpy(sharedquery, debug_query_string, querylen + 1); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery); + } + + /* + * Allocate space for each worker's WalUsage and BufferUsage; no need to + * initialize. + */ + walusage = shm_toc_allocate(pcxt->toc, + mul_size(sizeof(WalUsage), pcxt->nworkers)); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage); + bufferusage = shm_toc_allocate(pcxt->toc, + mul_size(sizeof(BufferUsage), pcxt->nworkers)); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage); + + /* Launch workers, saving status for leader/caller */ + LaunchParallelWorkers(pcxt); + btleader->pcxt = pcxt; + btleader->nparticipanttuplesorts = pcxt->nworkers_launched; + if (leaderparticipates) + btleader->nparticipanttuplesorts++; + btleader->btshared = btshared; + btleader->sharedsort = sharedsort; + btleader->sharedsort2 = sharedsort2; + btleader->snapshot = snapshot; + btleader->walusage = walusage; + btleader->bufferusage = bufferusage; + + /* If no workers were successfully launched, back out (do serial build) */ + if (pcxt->nworkers_launched == 0) + { + _bt_end_parallel(btleader); + return; + } + + /* Save leader state now that it's clear build will be parallel */ + buildstate->btleader = btleader; + + /* Join heap scan ourselves */ + if (leaderparticipates) + _bt_leader_participate_as_worker(buildstate); + + /* + * Caller needs to wait for all launched workers when we return. Make + * sure that the failure-to-start case will not hang forever. + */ + WaitForParallelWorkersToAttach(pcxt); +} + +/* + * Shut down workers, destroy parallel context, and end parallel mode. + */ +static void +_bt_end_parallel(BTLeader *btleader) +{ + int i; + + /* Shutdown worker processes */ + WaitForParallelWorkersToFinish(btleader->pcxt); + + /* + * Next, accumulate WAL usage. (This must wait for the workers to finish, + * or we might get incomplete data.) + */ + for (i = 0; i < btleader->pcxt->nworkers_launched; i++) + InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]); + + /* Free last reference to MVCC snapshot, if one was used */ + if (IsMVCCSnapshot(btleader->snapshot)) + UnregisterSnapshot(btleader->snapshot); + DestroyParallelContext(btleader->pcxt); + ExitParallelMode(); +} + +/* + * Returns size of shared memory required to store state for a parallel + * btree index build based on the snapshot its parallel scan will use. + */ +static Size +_bt_parallel_estimate_shared(Relation heap, Snapshot snapshot) +{ + /* c.f. 
shm_toc_allocate as to why BUFFERALIGN is used */ + return add_size(BUFFERALIGN(sizeof(BTShared)), + table_parallelscan_estimate(heap, snapshot)); +} + +/* + * Within leader, wait for end of heap scan. + * + * When called, parallel heap scan started by _bt_begin_parallel() will + * already be underway within worker processes (when leader participates + * as a worker, we should end up here just as workers are finishing). + * + * Fills in fields needed for ambuild statistics, and lets caller set + * field indicating that some worker encountered a broken HOT chain. + * + * Returns the total number of heap tuples scanned. + */ +static double +_bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain) +{ + BTShared *btshared = buildstate->btleader->btshared; + int nparticipanttuplesorts; + double reltuples; + + nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts; + for (;;) + { + SpinLockAcquire(&btshared->mutex); + if (btshared->nparticipantsdone == nparticipanttuplesorts) + { + buildstate->havedead = btshared->havedead; + buildstate->indtuples = btshared->indtuples; + *brokenhotchain = btshared->brokenhotchain; + reltuples = btshared->reltuples; + SpinLockRelease(&btshared->mutex); + break; + } + SpinLockRelease(&btshared->mutex); + + ConditionVariableSleep(&btshared->workersdonecv, + WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN); + } + + ConditionVariableCancelSleep(); + + return reltuples; +} + +/* + * Within leader, participate as a parallel worker. + */ +static void +_bt_leader_participate_as_worker(BTBuildState *buildstate) +{ + BTLeader *btleader = buildstate->btleader; + BTSpool *leaderworker; + BTSpool *leaderworker2; + int sortmem; + + /* Allocate memory and initialize private spool */ + leaderworker = (BTSpool *) palloc0(sizeof(BTSpool)); + leaderworker->heap = buildstate->spool->heap; + leaderworker->index = buildstate->spool->index; + leaderworker->isunique = buildstate->spool->isunique; + + /* Initialize second spool, if required */ + if (!btleader->btshared->isunique) + leaderworker2 = NULL; + else + { + /* Allocate memory for worker's own private secondary spool */ + leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool)); + + /* Initialize worker's own secondary spool */ + leaderworker2->heap = leaderworker->heap; + leaderworker2->index = leaderworker->index; + leaderworker2->isunique = false; + } + + /* + * Might as well use reliable figure when doling out maintenance_work_mem + * (when requested number of workers were not launched, this will be + * somewhat higher than it is for other workers). + */ + sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts; + + /* Perform work common to all participants */ + _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared, + btleader->sharedsort, btleader->sharedsort2, + sortmem, true); + +#ifdef BTREE_BUILD_STATS + if (log_btree_build_stats) + { + ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS"); + ResetUsage(); + } +#endif /* BTREE_BUILD_STATS */ +} + +/* + * Perform work within a launched parallel process. 
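+ *
+ * This is the entry point registered with CreateParallelContext() by
+ * _bt_begin_parallel(). Each worker looks up the shared state in the TOC,
+ * performs its share of the heap scan and sort, and reports its results
+ * back through BTShared.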
+ */ +void +_bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) +{ + char *sharedquery; + BTSpool *btspool; + BTSpool *btspool2; + BTShared *btshared; + Sharedsort *sharedsort; + Sharedsort *sharedsort2; + Relation heapRel; + Relation indexRel; + LOCKMODE heapLockmode; + LOCKMODE indexLockmode; + WalUsage *walusage; + BufferUsage *bufferusage; + int sortmem; + +#ifdef BTREE_BUILD_STATS + if (log_btree_build_stats) + ResetUsage(); +#endif /* BTREE_BUILD_STATS */ + + /* + * The only possible status flag that can be set to the parallel worker is + * PROC_IN_SAFE_IC. + */ + Assert((MyProc->statusFlags == 0) || + (MyProc->statusFlags == PROC_IN_SAFE_IC)); + + /* Set debug_query_string for individual workers first */ + sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true); + debug_query_string = sharedquery; + + /* Report the query string from leader */ + pgstat_report_activity(STATE_RUNNING, debug_query_string); + + /* Look up nbtree shared state */ + btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false); + + /* Open relations using lock modes known to be obtained by index.c */ + if (!btshared->isconcurrent) + { + heapLockmode = ShareLock; + indexLockmode = AccessExclusiveLock; + } + else + { + heapLockmode = ShareUpdateExclusiveLock; + indexLockmode = RowExclusiveLock; + } + + /* Open relations within worker */ + heapRel = table_open(btshared->heaprelid, heapLockmode); + indexRel = index_open(btshared->indexrelid, indexLockmode); + + /* Initialize worker's own spool */ + btspool = (BTSpool *) palloc0(sizeof(BTSpool)); + btspool->heap = heapRel; + btspool->index = indexRel; + btspool->isunique = btshared->isunique; + + /* Look up shared state private to tuplesort.c */ + sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false); + tuplesort_attach_shared(sharedsort, seg); + if (!btshared->isunique) + { + btspool2 = NULL; + sharedsort2 = NULL; + } + else + { + /* Allocate memory for worker's own private secondary spool */ + btspool2 = (BTSpool *) palloc0(sizeof(BTSpool)); + + /* Initialize worker's own secondary spool */ + btspool2->heap = btspool->heap; + btspool2->index = btspool->index; + btspool2->isunique = false; + /* Look up shared state private to tuplesort.c */ + sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false); + tuplesort_attach_shared(sharedsort2, seg); + } + + /* Prepare to track buffer usage during parallel execution */ + InstrStartParallelQuery(); + + /* Perform sorting of spool, and possibly a spool2 */ + sortmem = maintenance_work_mem / btshared->scantuplesortstates; + _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort, + sharedsort2, sortmem, false); + + /* Report WAL/buffer usage during parallel execution */ + bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); + walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); + InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], + &walusage[ParallelWorkerNumber]); + +#ifdef BTREE_BUILD_STATS + if (log_btree_build_stats) + { + ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS"); + ResetUsage(); + } +#endif /* BTREE_BUILD_STATS */ + + index_close(indexRel, indexLockmode); + table_close(heapRel, heapLockmode); +} + +/* + * Perform a worker's portion of a parallel sort. + * + * This generates a tuplesort for passed btspool, and a second tuplesort + * state if a second btspool is need (i.e. for unique index builds). All + * other spool fields should already be set when this is called. 
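+ *
+ * btshared and the Sharedsort arguments point to the shared state set up
+ * by _bt_begin_parallel(); workers obtain them through TOC lookups, while
+ * a participating leader passes its convenience pointers directly.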
+ * + * sortmem is the amount of working memory to use within each worker, + * expressed in KBs. + * + * When this returns, workers are done, and need only release resources. + */ +static void +_bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, + BTShared *btshared, Sharedsort *sharedsort, + Sharedsort *sharedsort2, int sortmem, bool progress) +{ + SortCoordinate coordinate; + BTBuildState buildstate; + TableScanDesc scan; + double reltuples; + IndexInfo *indexInfo; + + /* Initialize local tuplesort coordination state */ + coordinate = palloc0(sizeof(SortCoordinateData)); + coordinate->isWorker = true; + coordinate->nParticipants = -1; + coordinate->sharedsort = sharedsort; + + /* Begin "partial" tuplesort */ + btspool->sortstate = tuplesort_begin_index_btree(btspool->heap, + btspool->index, + btspool->isunique, + sortmem, coordinate, + false); + + /* + * Just as with serial case, there may be a second spool. If so, a + * second, dedicated spool2 partial tuplesort is required. + */ + if (btspool2) + { + SortCoordinate coordinate2; + + /* + * We expect that the second one (for dead tuples) won't get very + * full, so we give it only work_mem (unless sortmem is less for + * worker). Worker processes are generally permitted to allocate + * work_mem independently. + */ + coordinate2 = palloc0(sizeof(SortCoordinateData)); + coordinate2->isWorker = true; + coordinate2->nParticipants = -1; + coordinate2->sharedsort = sharedsort2; + btspool2->sortstate = + tuplesort_begin_index_btree(btspool->heap, btspool->index, false, + Min(sortmem, work_mem), coordinate2, + false); + } + + /* Fill in buildstate for _bt_build_callback() */ + buildstate.isunique = btshared->isunique; + buildstate.havedead = false; + buildstate.heap = btspool->heap; + buildstate.spool = btspool; + buildstate.spool2 = btspool2; + buildstate.indtuples = 0; + buildstate.btleader = NULL; + + /* Join parallel scan */ + indexInfo = BuildIndexInfo(btspool->index); + indexInfo->ii_Concurrent = btshared->isconcurrent; + scan = table_beginscan_parallel(btspool->heap, + ParallelTableScanFromBTShared(btshared)); + reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo, + true, progress, _bt_build_callback, + (void *) &buildstate, scan); + + /* Execute this worker's part of the sort */ + if (progress) + pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, + PROGRESS_BTREE_PHASE_PERFORMSORT_1); + tuplesort_performsort(btspool->sortstate); + if (btspool2) + { + if (progress) + pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, + PROGRESS_BTREE_PHASE_PERFORMSORT_2); + tuplesort_performsort(btspool2->sortstate); + } + + /* + * Done. Record ambuild statistics, and whether we encountered a broken + * HOT chain. 
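+ *
+ * The leader blocks in _bt_parallel_heapscan() until nparticipantsdone
+ * equals nparticipanttuplesorts, so the condition variable signal below
+ * is what eventually releases it.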
+ */ + SpinLockAcquire(&btshared->mutex); + btshared->nparticipantsdone++; + btshared->reltuples += reltuples; + if (buildstate.havedead) + btshared->havedead = true; + btshared->indtuples += buildstate.indtuples; + if (indexInfo->ii_BrokenHotChain) + btshared->brokenhotchain = true; + SpinLockRelease(&btshared->mutex); + + /* Notify leader */ + ConditionVariableSignal(&btshared->workersdonecv); + + /* We can end tuplesorts immediately */ + tuplesort_end(btspool->sortstate); + if (btspool2) + tuplesort_end(btspool2->sortstate); +} diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c new file mode 100644 index 0000000..3485e93 --- /dev/null +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -0,0 +1,1190 @@ +/*------------------------------------------------------------------------- + * + * nbtsplitloc.c + * Choose split point code for Postgres btree implementation. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtsplitloc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/nbtree.h" +#include "storage/lmgr.h" + +typedef enum +{ + /* strategy for searching through materialized list of split points */ + SPLIT_DEFAULT, /* give some weight to truncation */ + SPLIT_MANY_DUPLICATES, /* find minimally distinguishing point */ + SPLIT_SINGLE_VALUE /* leave left page almost full */ +} FindSplitStrat; + +typedef struct +{ + /* details of free space left by split */ + int16 curdelta; /* current leftfree/rightfree delta */ + int16 leftfree; /* space left on left page post-split */ + int16 rightfree; /* space left on right page post-split */ + + /* split point identifying fields (returned by _bt_findsplitloc) */ + OffsetNumber firstrightoff; /* first origpage item on rightpage */ + bool newitemonleft; /* new item goes on left, or right? 
*/ + +} SplitPoint; + +typedef struct +{ + /* context data for _bt_recsplitloc */ + Relation rel; /* index relation */ + Page origpage; /* page undergoing split */ + IndexTuple newitem; /* new item (cause of page split) */ + Size newitemsz; /* size of newitem (includes line pointer) */ + bool is_leaf; /* T if splitting a leaf page */ + bool is_rightmost; /* T if splitting rightmost page on level */ + OffsetNumber newitemoff; /* where the new item is to be inserted */ + int leftspace; /* space available for items on left page */ + int rightspace; /* space available for items on right page */ + int olddataitemstotal; /* space taken by old items */ + Size minfirstrightsz; /* smallest firstright size */ + + /* candidate split point data */ + int maxsplits; /* maximum number of splits */ + int nsplits; /* current number of splits */ + SplitPoint *splits; /* all candidate split points for page */ + int interval; /* current range of acceptable split points */ +} FindSplitData; + +static void _bt_recsplitloc(FindSplitData *state, + OffsetNumber firstrightoff, bool newitemonleft, + int olddataitemstoleft, + Size firstrightofforigpagetuplesz); +static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult, + bool usemult); +static int _bt_splitcmp(const void *arg1, const void *arg2); +static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, + int leaffillfactor, bool *usemult); +static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid); +static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, + bool *newitemonleft, FindSplitStrat strategy); +static int _bt_defaultinterval(FindSplitData *state); +static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage, + SplitPoint *rightpage, FindSplitStrat *strategy); +static void _bt_interval_edges(FindSplitData *state, + SplitPoint **leftinterval, SplitPoint **rightinterval); +static inline int _bt_split_penalty(FindSplitData *state, SplitPoint *split); +static inline IndexTuple _bt_split_lastleft(FindSplitData *state, + SplitPoint *split); +static inline IndexTuple _bt_split_firstright(FindSplitData *state, + SplitPoint *split); + + +/* + * _bt_findsplitloc() -- find an appropriate place to split a page. + * + * The main goal here is to equalize the free space that will be on each + * split page, *after accounting for the inserted tuple*. (If we fail to + * account for it, we might find ourselves with too little room on the page + * that it needs to go into!) + * + * If the page is the rightmost page on its level, we instead try to arrange + * to leave the left split page fillfactor% full. In this way, when we are + * inserting successively increasing keys (consider sequences, timestamps, + * etc) we will end up with a tree whose pages are about fillfactor% full, + * instead of the 50% full result that we'd get without this special case. + * This is the same as nbtsort.c produces for a newly-created tree. Note + * that leaf and nonleaf pages use different fillfactors. Note also that + * there are a number of further special cases where fillfactor is not + * applied in the standard way. + * + * We are passed the intended insert position of the new tuple, expressed as + * the offsetnumber of the tuple it must go in front of (this could be + * maxoff+1 if the tuple is to go at the end). The new tuple itself is also + * passed, since it's needed to give some weight to how effective suffix + * truncation will be. 
The implementation picks the split point that + * maximizes the effectiveness of suffix truncation from a small list of + * alternative candidate split points that leave each side of the split with + * about the same share of free space. Suffix truncation is secondary to + * equalizing free space, except in cases with large numbers of duplicates. + * Note that it is always assumed that caller goes on to perform truncation, + * even with pg_upgrade'd indexes where that isn't actually the case + * (!heapkeyspace indexes). See nbtree/README for more information about + * suffix truncation. + * + * We return the index of the first existing tuple that should go on the + * righthand page (which is called firstrightoff), plus a boolean + * indicating whether the new tuple goes on the left or right page. You + * can think of the returned state as a point _between_ two adjacent data + * items (laftleft and firstright data items) on an imaginary version of + * origpage that already includes newitem. The bool is necessary to + * disambiguate the case where firstrightoff == newitemoff (i.e. it is + * sometimes needed to determine if the firstright tuple for the split is + * newitem rather than the tuple from origpage at offset firstrightoff). + */ +OffsetNumber +_bt_findsplitloc(Relation rel, + Page origpage, + OffsetNumber newitemoff, + Size newitemsz, + IndexTuple newitem, + bool *newitemonleft) +{ + BTPageOpaque opaque; + int leftspace, + rightspace, + olddataitemstotal, + olddataitemstoleft, + perfectpenalty, + leaffillfactor; + FindSplitData state; + FindSplitStrat strategy; + ItemId itemid; + OffsetNumber offnum, + maxoff, + firstrightoff; + double fillfactormult; + bool usemult; + SplitPoint leftpage, + rightpage; + + opaque = (BTPageOpaque) PageGetSpecialPointer(origpage); + maxoff = PageGetMaxOffsetNumber(origpage); + + /* Total free space available on a btree page, after fixed overhead */ + leftspace = rightspace = + PageGetPageSize(origpage) - SizeOfPageHeaderData - + MAXALIGN(sizeof(BTPageOpaqueData)); + + /* The right page will have the same high key as the old page */ + if (!P_RIGHTMOST(opaque)) + { + itemid = PageGetItemId(origpage, P_HIKEY); + rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) + + sizeof(ItemIdData)); + } + + /* Count up total space in data items before actually scanning 'em */ + olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(origpage); + leaffillfactor = BTGetFillFactor(rel); + + /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ + newitemsz += sizeof(ItemIdData); + state.rel = rel; + state.origpage = origpage; + state.newitem = newitem; + state.newitemsz = newitemsz; + state.is_leaf = P_ISLEAF(opaque); + state.is_rightmost = P_RIGHTMOST(opaque); + state.leftspace = leftspace; + state.rightspace = rightspace; + state.olddataitemstotal = olddataitemstotal; + state.minfirstrightsz = SIZE_MAX; + state.newitemoff = newitemoff; + + /* newitem cannot be a posting list item */ + Assert(!BTreeTupleIsPosting(newitem)); + + /* + * nsplits should never exceed maxoff because there will be at most as + * many candidate split points as there are points _between_ tuples, once + * you imagine that the new item is already on the original page (the + * final number of splits may be slightly lower because not all points + * between tuples will be legal). 
+ */ + state.maxsplits = maxoff; + state.splits = palloc(sizeof(SplitPoint) * state.maxsplits); + state.nsplits = 0; + + /* + * Scan through the data items and calculate space usage for a split at + * each possible position + */ + olddataitemstoleft = 0; + + for (offnum = P_FIRSTDATAKEY(opaque); + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + Size itemsz; + + itemid = PageGetItemId(origpage, offnum); + itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData); + + /* + * When item offset number is not newitemoff, neither side of the + * split can be newitem. Record a split after the previous data item + * from original page, but before the current data item from original + * page. (_bt_recsplitloc() will reject the split when there are no + * previous items, which we rely on.) + */ + if (offnum < newitemoff) + _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz); + else if (offnum > newitemoff) + _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz); + else + { + /* + * Record a split after all "offnum < newitemoff" original page + * data items, but before newitem + */ + _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz); + + /* + * Record a split after newitem, but before data item from + * original page at offset newitemoff/current offset + */ + _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz); + } + + olddataitemstoleft += itemsz; + } + + /* + * Record a split after all original page data items, but before newitem. + * (Though only when it's possible that newitem will end up alone on new + * right page.) + */ + Assert(olddataitemstoleft == olddataitemstotal); + if (newitemoff > maxoff) + _bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0); + + /* + * I believe it is not possible to fail to find a feasible split, but just + * in case ... + */ + if (state.nsplits == 0) + elog(ERROR, "could not find a feasible split point for index \"%s\"", + RelationGetRelationName(rel)); + + /* + * Start search for a split point among list of legal split points. Give + * primary consideration to equalizing available free space in each half + * of the split initially (start with default strategy), while applying + * rightmost and split-after-new-item optimizations where appropriate. + * Either of the two other fallback strategies may be required for cases + * with a large number of duplicates around the original/space-optimal + * split point. + * + * Default strategy gives some weight to suffix truncation in deciding a + * split point on leaf pages. It attempts to select a split point where a + * distinguishing attribute appears earlier in the new high key for the + * left side of the split, in order to maximize the number of trailing + * attributes that can be truncated away. Only candidate split points + * that imply an acceptable balance of free space on each side are + * considered. See _bt_defaultinterval(). 
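A toy enumeration of the candidate split points that the loop above records may help: one candidate per gap between adjacent items of an imaginary page that already contains the new item, with two candidates at the new item's own offset. The offsets below are invented, legality checks are omitted, and the first data offset is simplified to 1 (the real code starts at P_FIRSTDATAKEY and rejects illegal points in _bt_recsplitloc).

#include <stdio.h>

static void
record_split(int firstright, int newitemonleft)
{
    printf("candidate: firstright=%d, newitem goes %s\n",
           firstright, newitemonleft ? "left" : "right");
}

int
main(void)
{
    int maxoff = 4;        /* existing data items at offsets 1..4 */
    int newitemoff = 3;    /* incoming tuple would be inserted at offset 3 */

    for (int off = 1; off <= maxoff; off++)
    {
        if (off < newitemoff)
            record_split(off, 0);
        else if (off > newitemoff)
            record_split(off, 1);
        else
        {
            record_split(off, 0);   /* split just before the new item */
            record_split(off, 1);   /* split just after the new item */
        }
    }
    if (newitemoff > maxoff)
        record_split(newitemoff, 0);    /* new item alone on the right */
    return 0;
}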
+ */ + if (!state.is_leaf) + { + /* fillfactormult only used on rightmost page */ + usemult = state.is_rightmost; + fillfactormult = BTREE_NONLEAF_FILLFACTOR / 100.0; + } + else if (state.is_rightmost) + { + /* Rightmost leaf page -- fillfactormult always used */ + usemult = true; + fillfactormult = leaffillfactor / 100.0; + } + else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult)) + { + /* + * New item inserted at rightmost point among a localized grouping on + * a leaf page -- apply "split after new item" optimization, either by + * applying leaf fillfactor multiplier, or by choosing the exact split + * point that leaves newitem as lastleft. (usemult is set for us.) + */ + if (usemult) + { + /* fillfactormult should be set based on leaf fillfactor */ + fillfactormult = leaffillfactor / 100.0; + } + else + { + /* find precise split point after newitemoff */ + for (int i = 0; i < state.nsplits; i++) + { + SplitPoint *split = state.splits + i; + + if (split->newitemonleft && + newitemoff == split->firstrightoff) + { + pfree(state.splits); + *newitemonleft = true; + return newitemoff; + } + } + + /* + * Cannot legally split after newitemoff; proceed with split + * without using fillfactor multiplier. This is defensive, and + * should never be needed in practice. + */ + fillfactormult = 0.50; + } + } + else + { + /* Other leaf page. 50:50 page split. */ + usemult = false; + /* fillfactormult not used, but be tidy */ + fillfactormult = 0.50; + } + + /* + * Save leftmost and rightmost splits for page before original ordinal + * sort order is lost by delta/fillfactormult sort + */ + leftpage = state.splits[0]; + rightpage = state.splits[state.nsplits - 1]; + + /* Give split points a fillfactormult-wise delta, and sort on deltas */ + _bt_deltasortsplits(&state, fillfactormult, usemult); + + /* Determine split interval for default strategy */ + state.interval = _bt_defaultinterval(&state); + + /* + * Determine if default strategy/split interval will produce a + * sufficiently distinguishing split, or if we should change strategies. + * Alternative strategies change the range of split points that are + * considered acceptable (split interval), and possibly change + * fillfactormult, in order to deal with pages with a large number of + * duplicates gracefully. + * + * Pass low and high splits for the entire page (actually, they're for an + * imaginary version of the page that includes newitem). These are used + * when the initial split interval encloses split points that are full of + * duplicates, and we need to consider if it's even possible to avoid + * appending a heap TID. + */ + perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy); + + if (strategy == SPLIT_DEFAULT) + { + /* + * Default strategy worked out (always works out with internal page). + * Original split interval still stands. + */ + } + + /* + * Many duplicates strategy is used when a heap TID would otherwise be + * appended, but the page isn't completely full of logical duplicates. + * + * The split interval is widened to include all legal candidate split + * points. There might be a few as two distinct values in the whole-page + * split interval, though it's also possible that most of the values on + * the page are unique. 
The final split point will either be to the + * immediate left or to the immediate right of the group of duplicate + * tuples that enclose the first/delta-optimal split point (perfect + * penalty was set so that the lowest delta split point that avoids + * appending a heap TID will be chosen). Maximizing the number of + * attributes that can be truncated away is not a goal of the many + * duplicates strategy. + * + * Single value strategy is used when it is impossible to avoid appending + * a heap TID. It arranges to leave the left page very full. This + * maximizes space utilization in cases where tuples with the same + * attribute values span many pages. Newly inserted duplicates will tend + * to have higher heap TID values, so we'll end up splitting to the right + * consistently. (Single value strategy is harmless though not + * particularly useful with !heapkeyspace indexes.) + */ + else if (strategy == SPLIT_MANY_DUPLICATES) + { + Assert(state.is_leaf); + /* Shouldn't try to truncate away extra user attributes */ + Assert(perfectpenalty == + IndexRelationGetNumberOfKeyAttributes(state.rel)); + /* No need to resort splits -- no change in fillfactormult/deltas */ + state.interval = state.nsplits; + } + else if (strategy == SPLIT_SINGLE_VALUE) + { + Assert(state.is_leaf); + /* Split near the end of the page */ + usemult = true; + fillfactormult = BTREE_SINGLEVAL_FILLFACTOR / 100.0; + /* Resort split points with new delta */ + _bt_deltasortsplits(&state, fillfactormult, usemult); + /* Appending a heap TID is unavoidable, so interval of 1 is fine */ + state.interval = 1; + } + + /* + * Search among acceptable split points (using final split interval) for + * the entry that has the lowest penalty, and is therefore expected to + * maximize fan-out. Sets *newitemonleft for us. + */ + firstrightoff = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft, + strategy); + pfree(state.splits); + + return firstrightoff; +} + +/* + * Subroutine to record a particular point between two tuples (possibly the + * new item) on page (ie, combination of firstrightoff and newitemonleft + * settings) in *state for later analysis. This is also a convenient point to + * check if the split is legal (if it isn't, it won't be recorded). + * + * firstrightoff is the offset of the first item on the original page that + * goes to the right page, and firstrightofforigpagetuplesz is the size of + * that tuple. firstrightoff can be > max offset, which means that all the + * old items go to the left page and only the new item goes to the right page. + * We don't actually use firstrightofforigpagetuplesz in that case (actually, + * we don't use it for _any_ split where the firstright tuple happens to be + * newitem). + * + * olddataitemstoleft is the total size of all old items to the left of the + * split point that is recorded here when legal. Should not include + * newitemsz, since that is handled here. + */ +static void +_bt_recsplitloc(FindSplitData *state, + OffsetNumber firstrightoff, + bool newitemonleft, + int olddataitemstoleft, + Size firstrightofforigpagetuplesz) +{ + int16 leftfree, + rightfree; + Size firstrightsz; + Size postingsz = 0; + bool newitemisfirstright; + + /* Is the new item going to be split point's firstright tuple? 
*/ + newitemisfirstright = (firstrightoff == state->newitemoff && + !newitemonleft); + + if (newitemisfirstright) + firstrightsz = state->newitemsz; + else + { + firstrightsz = firstrightofforigpagetuplesz; + + /* + * Calculate suffix truncation space saving when firstright tuple is a + * posting list tuple, though only when the tuple is over 64 bytes + * including line pointer overhead (arbitrary). This avoids accessing + * the tuple in cases where its posting list must be very small (if + * tuple has one at all). + * + * Note: We don't do this in the case where firstright tuple is + * newitem, since newitem cannot have a posting list. + */ + if (state->is_leaf && firstrightsz > 64) + { + ItemId itemid; + IndexTuple newhighkey; + + itemid = PageGetItemId(state->origpage, firstrightoff); + newhighkey = (IndexTuple) PageGetItem(state->origpage, itemid); + + if (BTreeTupleIsPosting(newhighkey)) + postingsz = IndexTupleSize(newhighkey) - + BTreeTupleGetPostingOffset(newhighkey); + } + } + + /* Account for all the old tuples */ + leftfree = state->leftspace - olddataitemstoleft; + rightfree = state->rightspace - + (state->olddataitemstotal - olddataitemstoleft); + + /* + * The first item on the right page becomes the high key of the left page; + * therefore it counts against left space as well as right space (we + * cannot assume that suffix truncation will make it any smaller). When + * index has included attributes, then those attributes of left page high + * key will be truncated leaving that page with slightly more free space. + * However, that shouldn't affect our ability to find valid split + * location, since we err in the direction of being pessimistic about free + * space on the left half. Besides, even when suffix truncation of + * non-TID attributes occurs, the new high key often won't even be a + * single MAXALIGN() quantum smaller than the firstright tuple it's based + * on. + * + * If we are on the leaf level, assume that suffix truncation cannot avoid + * adding a heap TID to the left half's new high key when splitting at the + * leaf level. In practice the new high key will often be smaller and + * will rarely be larger, but conservatively assume the worst case. We do + * go to the trouble of subtracting away posting list overhead, though + * only when it looks like it will make an appreciable difference. + * (Posting lists are the only case where truncation will typically make + * the final high key far smaller than firstright, so being a bit more + * precise there noticeably improves the balance of free space.) + */ + if (state->is_leaf) + leftfree -= (int16) (firstrightsz + + MAXALIGN(sizeof(ItemPointerData)) - + postingsz); + else + leftfree -= (int16) firstrightsz; + + /* account for the new item */ + if (newitemonleft) + leftfree -= (int16) state->newitemsz; + else + rightfree -= (int16) state->newitemsz; + + /* + * If we are not on the leaf level, we will be able to discard the key + * data from the first item that winds up on the right page. 
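The free-space bookkeeping in _bt_recsplitloc above can be illustrated with a simplified, leaf-only sketch: charge each old item to the side it lands on, charge the would-be high key (the firstright tuple plus, pessimistically, a heap TID) to the left page, and charge the new item to whichever side it goes. All sizes are illustrative byte counts; alignment and posting-list savings are ignored, and the 8-byte TID figure merely stands in for the MAXALIGN'd ItemPointerData size.

#include <stdio.h>

typedef struct
{
    int leftfree;
    int rightfree;
} ToySplit;

static ToySplit
toy_recsplitloc(int leftspace, int rightspace,
                int olddatatotal, int olddatatoleft,
                int firstrightsz, int newitemsz, int newitemonleft)
{
    ToySplit s;
    int tidsz = 8;   /* rough stand-in for a MAXALIGN'd heap TID */

    s.leftfree = leftspace - olddatatoleft;
    s.rightfree = rightspace - (olddatatotal - olddatatoleft);
    s.leftfree -= firstrightsz + tidsz;     /* left page's new high key */
    if (newitemonleft)
        s.leftfree -= newitemsz;
    else
        s.rightfree -= newitemsz;
    return s;
}

int
main(void)
{
    ToySplit s = toy_recsplitloc(8000, 8000, 6000, 3000, 24, 24, 0);

    printf("leftfree=%d rightfree=%d legal=%s\n",
           s.leftfree, s.rightfree,
           (s.leftfree >= 0 && s.rightfree >= 0) ? "yes" : "no");
    return 0;
}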
+ */ + if (!state->is_leaf) + rightfree += (int16) firstrightsz - + (int16) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData)); + + /* Record split if legal */ + if (leftfree >= 0 && rightfree >= 0) + { + Assert(state->nsplits < state->maxsplits); + + /* Determine smallest firstright tuple size among legal splits */ + state->minfirstrightsz = Min(state->minfirstrightsz, firstrightsz); + + state->splits[state->nsplits].curdelta = 0; + state->splits[state->nsplits].leftfree = leftfree; + state->splits[state->nsplits].rightfree = rightfree; + state->splits[state->nsplits].firstrightoff = firstrightoff; + state->splits[state->nsplits].newitemonleft = newitemonleft; + state->nsplits++; + } +} + +/* + * Subroutine to assign space deltas to materialized array of candidate split + * points based on current fillfactor, and to sort array using that fillfactor + */ +static void +_bt_deltasortsplits(FindSplitData *state, double fillfactormult, + bool usemult) +{ + for (int i = 0; i < state->nsplits; i++) + { + SplitPoint *split = state->splits + i; + int16 delta; + + if (usemult) + delta = fillfactormult * split->leftfree - + (1.0 - fillfactormult) * split->rightfree; + else + delta = split->leftfree - split->rightfree; + + if (delta < 0) + delta = -delta; + + /* Save delta */ + split->curdelta = delta; + } + + qsort(state->splits, state->nsplits, sizeof(SplitPoint), _bt_splitcmp); +} + +/* + * qsort-style comparator used by _bt_deltasortsplits() + */ +static int +_bt_splitcmp(const void *arg1, const void *arg2) +{ + SplitPoint *split1 = (SplitPoint *) arg1; + SplitPoint *split2 = (SplitPoint *) arg2; + + if (split1->curdelta > split2->curdelta) + return 1; + if (split1->curdelta < split2->curdelta) + return -1; + + return 0; +} + +/* + * Subroutine to determine whether or not a non-rightmost leaf page should be + * split immediately after the would-be original page offset for the + * new/incoming tuple (or should have leaf fillfactor applied when new item is + * to the right on original page). This is appropriate when there is a + * pattern of localized monotonically increasing insertions into a composite + * index, where leading attribute values form local groupings, and we + * anticipate further insertions of the same/current grouping (new item's + * grouping) in the near future. This can be thought of as a variation on + * applying leaf fillfactor during rightmost leaf page splits, since cases + * that benefit will converge on packing leaf pages leaffillfactor% full over + * time. + * + * We may leave extra free space remaining on the rightmost page of a "most + * significant column" grouping of tuples if that grouping never ends up + * having future insertions that use the free space. That effect is + * self-limiting; a future grouping that becomes the "nearest on the right" + * grouping of the affected grouping usually puts the extra free space to good + * use. + * + * Caller uses optimization when routine returns true, though the exact action + * taken by caller varies. Caller uses original leaf page fillfactor in + * standard way rather than using the new item offset directly when *usemult + * was also set to true here. Otherwise, caller applies optimization by + * locating the legal split point that makes the new tuple the lastleft tuple + * for the split. 
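A standalone sketch of the weighted-delta sort performed by _bt_deltasortsplits above, using plain ints and the C library qsort. The free-space figures are invented, and fillfactormult is set to 0.90 as if a rightmost leaf split with the default fillfactor were being weighted.

#include <stdio.h>
#include <stdlib.h>

typedef struct
{
    int leftfree;
    int rightfree;
    int delta;
} ToySplit;

static int
cmp_delta(const void *a, const void *b)
{
    const ToySplit *s1 = a;
    const ToySplit *s2 = b;

    return (s1->delta > s2->delta) - (s1->delta < s2->delta);
}

int
main(void)
{
    ToySplit splits[] = {
        {3000, 1000, 0}, {2200, 1800, 0}, {1500, 2500, 0},
    };
    int nsplits = 3;
    double fillfactormult = 0.90;   /* weight splits toward a full left page */

    for (int i = 0; i < nsplits; i++)
    {
        double d = fillfactormult * splits[i].leftfree -
            (1.0 - fillfactormult) * splits[i].rightfree;

        splits[i].delta = (int) (d < 0 ? -d : d);
    }
    qsort(splits, nsplits, sizeof(ToySplit), cmp_delta);

    for (int i = 0; i < nsplits; i++)
        printf("delta=%d (leftfree=%d, rightfree=%d)\n",
               splits[i].delta, splits[i].leftfree, splits[i].rightfree);
    return 0;
}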
+ */ +static bool +_bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, + int leaffillfactor, bool *usemult) +{ + int16 nkeyatts; + ItemId itemid; + IndexTuple tup; + int keepnatts; + + Assert(state->is_leaf && !state->is_rightmost); + + nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel); + + /* Single key indexes not considered here */ + if (nkeyatts == 1) + return false; + + /* Ascending insertion pattern never inferred when new item is first */ + if (state->newitemoff == P_FIRSTKEY) + return false; + + /* + * Only apply optimization on pages with equisized tuples, since ordinal + * keys are likely to be fixed-width. Testing if the new tuple is + * variable width directly might also work, but that fails to apply the + * optimization to indexes with a numeric_ops attribute. + * + * Conclude that page has equisized tuples when the new item is the same + * width as the smallest item observed during pass over page, and other + * non-pivot tuples must be the same width as well. (Note that the + * possibly-truncated existing high key isn't counted in + * olddataitemstotal, and must be subtracted from maxoff.) + */ + if (state->newitemsz != state->minfirstrightsz) + return false; + if (state->newitemsz * (maxoff - 1) != state->olddataitemstotal) + return false; + + /* + * Avoid applying optimization when tuples are wider than a tuple + * consisting of two non-NULL int8/int64 attributes (or four non-NULL + * int4/int32 attributes) + */ + if (state->newitemsz > + MAXALIGN(sizeof(IndexTupleData) + sizeof(int64) * 2) + + sizeof(ItemIdData)) + return false; + + /* + * At least the first attribute's value must be equal to the corresponding + * value in previous tuple to apply optimization. New item cannot be a + * duplicate, either. + * + * Handle case where new item is to the right of all items on the existing + * page. This is suggestive of monotonically increasing insertions in + * itself, so the "heap TID adjacency" test is not applied here. + */ + if (state->newitemoff > maxoff) + { + itemid = PageGetItemId(state->origpage, maxoff); + tup = (IndexTuple) PageGetItem(state->origpage, itemid); + keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); + + if (keepnatts > 1 && keepnatts <= nkeyatts) + { + *usemult = true; + return true; + } + + return false; + } + + /* + * "Low cardinality leading column, high cardinality suffix column" + * indexes with a random insertion pattern (e.g., an index with a boolean + * column, such as an index on '(book_is_in_print, book_isbn)') present us + * with a risk of consistently misapplying the optimization. We're + * willing to accept very occasional misapplication of the optimization, + * provided the cases where we get it wrong are rare and self-limiting. + * + * Heap TID adjacency strongly suggests that the item just to the left was + * inserted very recently, which limits overapplication of the + * optimization. Besides, all inappropriate cases triggered here will + * still split in the middle of the page on average. 
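The heap-TID adjacency test described above (and implemented by _bt_adjacenthtid below) reduces to two cases; here is a minimal standalone version using plain structs instead of ItemPointers, with invented block and offset numbers. Offset 1 plays the role of FirstOffsetNumber.

#include <stdbool.h>
#include <stdio.h>

typedef struct
{
    unsigned int block;     /* heap block number */
    unsigned int offset;    /* line pointer offset within the block */
} ToyHeapTid;

static bool
toy_adjacent(ToyHeapTid low, ToyHeapTid high)
{
    if (low.block == high.block)
        return true;        /* same heap page: optimistically assume adjacency */
    if (low.block + 1 == high.block && high.offset == 1)
        return true;        /* first slot of the very next heap page */
    return false;
}

int
main(void)
{
    ToyHeapTid a = {41, 107};
    ToyHeapTid b = {42, 1};
    ToyHeapTid c = {44, 1};

    printf("%d %d\n", toy_adjacent(a, b), toy_adjacent(a, c));   /* prints: 1 0 */
    return 0;
}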
+ */ + itemid = PageGetItemId(state->origpage, OffsetNumberPrev(state->newitemoff)); + tup = (IndexTuple) PageGetItem(state->origpage, itemid); + /* Do cheaper test first */ + if (BTreeTupleIsPosting(tup) || + !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) + return false; + /* Check same conditions as rightmost item case, too */ + keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); + + if (keepnatts > 1 && keepnatts <= nkeyatts) + { + double interp = (double) state->newitemoff / ((double) maxoff + 1); + double leaffillfactormult = (double) leaffillfactor / 100.0; + + /* + * Don't allow caller to split after a new item when it will result in + * a split point to the right of the point that a leaf fillfactor + * split would use -- have caller apply leaf fillfactor instead + */ + *usemult = interp > leaffillfactormult; + + return true; + } + + return false; +} + +/* + * Subroutine for determining if two heap TIDS are "adjacent". + * + * Adjacent means that the high TID is very likely to have been inserted into + * heap relation immediately after the low TID, probably during the current + * transaction. + */ +static bool +_bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid) +{ + BlockNumber lowblk, + highblk; + + lowblk = ItemPointerGetBlockNumber(lowhtid); + highblk = ItemPointerGetBlockNumber(highhtid); + + /* Make optimistic assumption of adjacency when heap blocks match */ + if (lowblk == highblk) + return true; + + /* When heap block one up, second offset should be FirstOffsetNumber */ + if (lowblk + 1 == highblk && + ItemPointerGetOffsetNumber(highhtid) == FirstOffsetNumber) + return true; + + return false; +} + +/* + * Subroutine to find the "best" split point among candidate split points. + * The best split point is the split point with the lowest penalty among split + * points that fall within current/final split interval. Penalty is an + * abstract score, with a definition that varies depending on whether we're + * splitting a leaf page or an internal page. See _bt_split_penalty() for + * details. + * + * "perfectpenalty" is assumed to be the lowest possible penalty among + * candidate split points. This allows us to return early without wasting + * cycles on calculating the first differing attribute for all candidate + * splits when that clearly cannot improve our choice (or when we only want a + * minimally distinguishing split point, and don't want to make the split any + * more unbalanced than is necessary). + * + * We return the index of the first existing tuple that should go on the right + * page, plus a boolean indicating if new item is on left of split point. + */ +static OffsetNumber +_bt_bestsplitloc(FindSplitData *state, int perfectpenalty, + bool *newitemonleft, FindSplitStrat strategy) +{ + int bestpenalty, + lowsplit; + int highsplit = Min(state->interval, state->nsplits); + SplitPoint *final; + + bestpenalty = INT_MAX; + lowsplit = 0; + for (int i = lowsplit; i < highsplit; i++) + { + int penalty; + + penalty = _bt_split_penalty(state, state->splits + i); + + if (penalty < bestpenalty) + { + bestpenalty = penalty; + lowsplit = i; + } + + if (penalty <= perfectpenalty) + break; + } + + final = &state->splits[lowsplit]; + + /* + * There is a risk that the "many duplicates" strategy will repeatedly do + * the wrong thing when there are monotonically decreasing insertions to + * the right of a large group of duplicates. Repeated splits could leave + * a succession of right half pages with free space that can never be + * used. 
This must be avoided. + * + * Consider the example of the leftmost page in a single integer attribute + * NULLS FIRST index which is almost filled with NULLs. Monotonically + * decreasing integer insertions might cause the same leftmost page to + * split repeatedly at the same point. Each split derives its new high + * key from the lowest current value to the immediate right of the large + * group of NULLs, which will always be higher than all future integer + * insertions, directing all future integer insertions to the same + * leftmost page. + */ + if (strategy == SPLIT_MANY_DUPLICATES && !state->is_rightmost && + !final->newitemonleft && final->firstrightoff >= state->newitemoff && + final->firstrightoff < state->newitemoff + 9) + { + /* + * Avoid the problem by performing a 50:50 split when the new item is + * just to the right of the would-be "many duplicates" split point. + * (Note that the test used for an insert that is "just to the right" + * of the split point is conservative.) + */ + final = &state->splits[0]; + } + + *newitemonleft = final->newitemonleft; + return final->firstrightoff; +} + +#define LEAF_SPLIT_DISTANCE 0.050 +#define INTERNAL_SPLIT_DISTANCE 0.075 + +/* + * Return a split interval to use for the default strategy. This is a limit + * on the number of candidate split points to give further consideration to. + * Only a fraction of all candidate splits points (those located at the start + * of the now-sorted splits array) fall within the split interval. Split + * interval is applied within _bt_bestsplitloc(). + * + * Split interval represents an acceptable range of split points -- those that + * have leftfree and rightfree values that are acceptably balanced. The final + * split point chosen is the split point with the lowest "penalty" among split + * points in this split interval (unless we change our entire strategy, in + * which case the interval also changes -- see _bt_strategy()). + * + * The "Prefix B-Trees" paper calls split interval sigma l for leaf splits, + * and sigma b for internal ("branch") splits. It's hard to provide a + * theoretical justification for the size of the split interval, though it's + * clear that a small split interval can make tuples on level L+1 much smaller + * on average, without noticeably affecting space utilization on level L. + * (Note that the way that we calculate split interval might need to change if + * suffix truncation is taught to truncate tuples "within" the last + * attribute/datum for data types like text, which is more or less how it is + * assumed to work in the paper.) + */ +static int +_bt_defaultinterval(FindSplitData *state) +{ + SplitPoint *spaceoptimal; + int16 tolerance, + lowleftfree, + lowrightfree, + highleftfree, + highrightfree; + + /* + * Determine leftfree and rightfree values that are higher and lower than + * we're willing to tolerate. Note that the final split interval will be + * about 10% of nsplits in the common case where all non-pivot tuples + * (data items) from a leaf page are uniformly sized. We're a bit more + * aggressive when splitting internal pages. 
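The penalty search in _bt_bestsplitloc above can be sketched independently of the nbtree data structures: scan the delta-sorted candidates within the split interval, remember the lowest penalty seen, and stop early once a candidate reaches the perfect penalty, since no later candidate can do better. The penalties here are made-up integers.

#include <limits.h>
#include <stdio.h>

static int
toy_bestsplit(const int *penalty, int interval, int perfectpenalty)
{
    int bestpenalty = INT_MAX;
    int best = 0;

    for (int i = 0; i < interval; i++)
    {
        if (penalty[i] < bestpenalty)
        {
            bestpenalty = penalty[i];
            best = i;
        }
        if (penalty[i] <= perfectpenalty)
            break;          /* cannot be beaten; stop scanning */
    }
    return best;
}

int
main(void)
{
    int penalty[] = {3, 2, 1, 2, 3};

    printf("best=%d\n", toy_bestsplit(penalty, 5, 1));  /* stops at index 2 */
    return 0;
}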
+ */ + if (state->is_leaf) + tolerance = state->olddataitemstotal * LEAF_SPLIT_DISTANCE; + else + tolerance = state->olddataitemstotal * INTERNAL_SPLIT_DISTANCE; + + /* First candidate split point is the most evenly balanced */ + spaceoptimal = state->splits; + lowleftfree = spaceoptimal->leftfree - tolerance; + lowrightfree = spaceoptimal->rightfree - tolerance; + highleftfree = spaceoptimal->leftfree + tolerance; + highrightfree = spaceoptimal->rightfree + tolerance; + + /* + * Iterate through split points, starting from the split immediately after + * 'spaceoptimal'. Find the first split point that divides free space so + * unevenly that including it in the split interval would be unacceptable. + */ + for (int i = 1; i < state->nsplits; i++) + { + SplitPoint *split = state->splits + i; + + /* Cannot use curdelta here, since its value is often weighted */ + if (split->leftfree < lowleftfree || split->rightfree < lowrightfree || + split->leftfree > highleftfree || split->rightfree > highrightfree) + return i; + } + + return state->nsplits; +} + +/* + * Subroutine to decide whether split should use default strategy/initial + * split interval, or whether it should finish splitting the page using + * alternative strategies (this is only possible with leaf pages). + * + * Caller uses alternative strategy (or sticks with default strategy) based + * on how *strategy is set here. Return value is "perfect penalty", which is + * passed to _bt_bestsplitloc() as a final constraint on how far caller is + * willing to go to avoid appending a heap TID when using the many duplicates + * strategy (it also saves _bt_bestsplitloc() useless cycles). + */ +static int +_bt_strategy(FindSplitData *state, SplitPoint *leftpage, + SplitPoint *rightpage, FindSplitStrat *strategy) +{ + IndexTuple leftmost, + rightmost; + SplitPoint *leftinterval, + *rightinterval; + int perfectpenalty; + int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel); + + /* Assume that alternative strategy won't be used for now */ + *strategy = SPLIT_DEFAULT; + + /* + * Use smallest observed firstright item size for entire page (actually, + * entire imaginary version of page that includes newitem) as perfect + * penalty on internal pages. This can save cycles in the common case + * where most or all splits (not just splits within interval) have + * firstright tuples that are the same size. + */ + if (!state->is_leaf) + return state->minfirstrightsz; + + /* + * Use leftmost and rightmost tuples from leftmost and rightmost splits in + * current split interval + */ + _bt_interval_edges(state, &leftinterval, &rightinterval); + leftmost = _bt_split_lastleft(state, leftinterval); + rightmost = _bt_split_firstright(state, rightinterval); + + /* + * If initial split interval can produce a split point that will at least + * avoid appending a heap TID in new high key, we're done. Finish split + * with default strategy and initial split interval. + */ + perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); + if (perfectpenalty <= indnkeyatts) + return perfectpenalty; + + /* + * Work out how caller should finish split when even their "perfect" + * penalty for initial/default split interval indicates that the interval + * does not contain even a single split that avoids appending a heap TID. + * + * Use the leftmost split's lastleft tuple and the rightmost split's + * firstright tuple to assess every possible split. 
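A simplified version of the interval computation above, with invented free-space figures: the candidate array is assumed to be sorted by delta, so splits[0] is the space-optimal candidate, and the interval grows until some candidate's leftfree or rightfree strays more than the tolerance from it.

#include <stdio.h>

typedef struct
{
    int leftfree;
    int rightfree;
} ToySplit;

static int
toy_default_interval(const ToySplit *splits, int nsplits, int tolerance)
{
    ToySplit best = splits[0];

    for (int i = 1; i < nsplits; i++)
    {
        if (splits[i].leftfree < best.leftfree - tolerance ||
            splits[i].leftfree > best.leftfree + tolerance ||
            splits[i].rightfree < best.rightfree - tolerance ||
            splits[i].rightfree > best.rightfree + tolerance)
            return i;
    }
    return nsplits;
}

int
main(void)
{
    ToySplit splits[] = {{2000, 2010}, {2040, 1970}, {2300, 1700}, {2600, 1400}};
    /* e.g. 6000 bytes of old data items times LEAF_SPLIT_DISTANCE */
    int tolerance = (int) (6000 * 0.050);

    printf("interval=%d\n", toy_default_interval(splits, 4, tolerance));
    return 0;
}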
+ */ + leftmost = _bt_split_lastleft(state, leftpage); + rightmost = _bt_split_firstright(state, rightpage); + + /* + * If page (including new item) has many duplicates but is not entirely + * full of duplicates, a many duplicates strategy split will be performed. + * If page is entirely full of duplicates, a single value strategy split + * will be performed. + */ + perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); + if (perfectpenalty <= indnkeyatts) + { + *strategy = SPLIT_MANY_DUPLICATES; + + /* + * Many duplicates strategy should split at either side the group of + * duplicates that enclose the delta-optimal split point. Return + * indnkeyatts rather than the true perfect penalty to make that + * happen. (If perfectpenalty was returned here then low cardinality + * composite indexes could have continual unbalanced splits.) + * + * Note that caller won't go through with a many duplicates split in + * rare cases where it looks like there are ever-decreasing insertions + * to the immediate right of the split point. This must happen just + * before a final decision is made, within _bt_bestsplitloc(). + */ + return indnkeyatts; + } + + /* + * Single value strategy is only appropriate with ever-increasing heap + * TIDs; otherwise, original default strategy split should proceed to + * avoid pathological performance. Use page high key to infer if this is + * the rightmost page among pages that store the same duplicate value. + * This should not prevent insertions of heap TIDs that are slightly out + * of order from using single value strategy, since that's expected with + * concurrent inserters of the same duplicate value. + */ + else if (state->is_rightmost) + *strategy = SPLIT_SINGLE_VALUE; + else + { + ItemId itemid; + IndexTuple hikey; + + itemid = PageGetItemId(state->origpage, P_HIKEY); + hikey = (IndexTuple) PageGetItem(state->origpage, itemid); + perfectpenalty = _bt_keep_natts_fast(state->rel, hikey, + state->newitem); + if (perfectpenalty <= indnkeyatts) + *strategy = SPLIT_SINGLE_VALUE; + else + { + /* + * Have caller finish split using default strategy, since page + * does not appear to be the rightmost page for duplicates of the + * value the page is filled with + */ + } + } + + return perfectpenalty; +} + +/* + * Subroutine to locate leftmost and rightmost splits for current/default + * split interval. Note that it will be the same split iff there is only one + * split in interval. 
+ */ +static void +_bt_interval_edges(FindSplitData *state, SplitPoint **leftinterval, + SplitPoint **rightinterval) +{ + int highsplit = Min(state->interval, state->nsplits); + SplitPoint *deltaoptimal; + + deltaoptimal = state->splits; + *leftinterval = NULL; + *rightinterval = NULL; + + /* + * Delta is an absolute distance to optimal split point, so both the + * leftmost and rightmost split point will usually be at the end of the + * array + */ + for (int i = highsplit - 1; i >= 0; i--) + { + SplitPoint *distant = state->splits + i; + + if (distant->firstrightoff < deltaoptimal->firstrightoff) + { + if (*leftinterval == NULL) + *leftinterval = distant; + } + else if (distant->firstrightoff > deltaoptimal->firstrightoff) + { + if (*rightinterval == NULL) + *rightinterval = distant; + } + else if (!distant->newitemonleft && deltaoptimal->newitemonleft) + { + /* + * "incoming tuple will become firstright" (distant) is to the + * left of "incoming tuple will become lastleft" (delta-optimal) + */ + Assert(distant->firstrightoff == state->newitemoff); + if (*leftinterval == NULL) + *leftinterval = distant; + } + else if (distant->newitemonleft && !deltaoptimal->newitemonleft) + { + /* + * "incoming tuple will become lastleft" (distant) is to the right + * of "incoming tuple will become firstright" (delta-optimal) + */ + Assert(distant->firstrightoff == state->newitemoff); + if (*rightinterval == NULL) + *rightinterval = distant; + } + else + { + /* There was only one or two splits in initial split interval */ + Assert(distant == deltaoptimal); + if (*leftinterval == NULL) + *leftinterval = distant; + if (*rightinterval == NULL) + *rightinterval = distant; + } + + if (*leftinterval && *rightinterval) + return; + } + + Assert(false); +} + +/* + * Subroutine to find penalty for caller's candidate split point. + * + * On leaf pages, penalty is the attribute number that distinguishes each side + * of a split. It's the last attribute that needs to be included in new high + * key for left page. It can be greater than the number of key attributes in + * cases where a heap TID will need to be appended during truncation. + * + * On internal pages, penalty is simply the size of the firstright tuple for + * the split (including line pointer overhead). This tuple will become the + * new high key for the left page. 
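For the leaf-page penalty described above, a toy stand-in for _bt_keep_natts_fast may be useful: it reports how many leading key attributes must be kept to distinguish lastleft from firstright, with plain int arrays standing in for index tuples. A result one greater than the number of key attributes corresponds to the case where only a heap TID can tell the tuples apart.

#include <stdio.h>

#define NKEYATTS 3

static int
toy_penalty(const int *lastleft, const int *firstright)
{
    int keep = 1;

    for (int attr = 0; attr < NKEYATTS; attr++)
    {
        if (lastleft[attr] != firstright[attr])
            break;
        keep++;
    }
    return keep;    /* NKEYATTS + 1: only a heap TID can distinguish them */
}

int
main(void)
{
    int a[NKEYATTS] = {7, 3, 9};
    int b[NKEYATTS] = {7, 4, 1};
    int c[NKEYATTS] = {7, 3, 9};

    printf("penalty(a,b)=%d penalty(a,c)=%d\n",
           toy_penalty(a, b), toy_penalty(a, c));   /* prints: 2 and 4 */
    return 0;
}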
+ */ +static inline int +_bt_split_penalty(FindSplitData *state, SplitPoint *split) +{ + IndexTuple lastleft; + IndexTuple firstright; + + if (!state->is_leaf) + { + ItemId itemid; + + if (!split->newitemonleft && + split->firstrightoff == state->newitemoff) + return state->newitemsz; + + itemid = PageGetItemId(state->origpage, split->firstrightoff); + + return MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData); + } + + lastleft = _bt_split_lastleft(state, split); + firstright = _bt_split_firstright(state, split); + + return _bt_keep_natts_fast(state->rel, lastleft, firstright); +} + +/* + * Subroutine to get a lastleft IndexTuple for a split point + */ +static inline IndexTuple +_bt_split_lastleft(FindSplitData *state, SplitPoint *split) +{ + ItemId itemid; + + if (split->newitemonleft && split->firstrightoff == state->newitemoff) + return state->newitem; + + itemid = PageGetItemId(state->origpage, + OffsetNumberPrev(split->firstrightoff)); + return (IndexTuple) PageGetItem(state->origpage, itemid); +} + +/* + * Subroutine to get a firstright IndexTuple for a split point + */ +static inline IndexTuple +_bt_split_firstright(FindSplitData *state, SplitPoint *split) +{ + ItemId itemid; + + if (!split->newitemonleft && split->firstrightoff == state->newitemoff) + return state->newitem; + + itemid = PageGetItemId(state->origpage, split->firstrightoff); + return (IndexTuple) PageGetItem(state->origpage, itemid); +} diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c new file mode 100644 index 0000000..d524310 --- /dev/null +++ b/src/backend/access/nbtree/nbtutils.c @@ -0,0 +1,2751 @@ +/*------------------------------------------------------------------------- + * + * nbtutils.c + * Utility code for Postgres btree implementation. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtutils.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include + +#include "access/nbtree.h" +#include "access/reloptions.h" +#include "access/relscan.h" +#include "catalog/catalog.h" +#include "commands/progress.h" +#include "lib/qunique.h" +#include "miscadmin.h" +#include "utils/array.h" +#include "utils/datum.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/rel.h" + + +typedef struct BTSortArrayContext +{ + FmgrInfo flinfo; + Oid collation; + bool reverse; +} BTSortArrayContext; + +static Datum _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey, + StrategyNumber strat, + Datum *elems, int nelems); +static int _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey, + bool reverse, + Datum *elems, int nelems); +static int _bt_compare_array_elements(const void *a, const void *b, void *arg); +static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, + ScanKey leftarg, ScanKey rightarg, + bool *result); +static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption); +static void _bt_mark_scankey_required(ScanKey skey); +static bool _bt_check_rowcompare(ScanKey skey, + IndexTuple tuple, int tupnatts, TupleDesc tupdesc, + ScanDirection dir, bool *continuescan); +static int _bt_keep_natts(Relation rel, IndexTuple lastleft, + IndexTuple firstright, BTScanInsert itup_key); + + +/* + * _bt_mkscankey + * Build an insertion scan key that contains comparison data from itup + * as well as comparator routines appropriate to the key datatypes. + * + * When itup is a non-pivot tuple, the returned insertion scan key is + * suitable for finding a place for it to go on the leaf level. Pivot + * tuples can be used to re-find leaf page with matching high key, but + * then caller needs to set scan key's pivotsearch field to true. This + * allows caller to search for a leaf page with a matching high key, + * which is usually to the left of the first leaf page a non-pivot match + * might appear on. + * + * The result is intended for use with _bt_compare() and _bt_truncate(). + * Callers that don't need to fill out the insertion scankey arguments + * (e.g. they use an ad-hoc comparison routine, or only need a scankey + * for _bt_truncate()) can pass a NULL index tuple. The scankey will + * be initialized as if an "all truncated" pivot tuple was passed + * instead. + * + * Note that we may occasionally have to share lock the metapage to + * determine whether or not the keys in the index are expected to be + * unique (i.e. if this is a "heapkeyspace" index). We assume a + * heapkeyspace index when caller passes a NULL tuple, allowing index + * build callers to avoid accessing the non-existent metapage. We + * also assume that the index is _not_ allequalimage when a NULL tuple + * is passed; CREATE INDEX callers call _bt_allequalimage() to set the + * field themselves. + */ +BTScanInsert +_bt_mkscankey(Relation rel, IndexTuple itup) +{ + BTScanInsert key; + ScanKey skey; + TupleDesc itupdesc; + int indnkeyatts; + int16 *indoption; + int tupnatts; + int i; + + itupdesc = RelationGetDescr(rel); + indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + indoption = rel->rd_indoption; + tupnatts = itup ? 
BTreeTupleGetNAtts(itup, rel) : 0; + + Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel)); + + /* + * We'll execute search using scan key constructed on key columns. + * Truncated attributes and non-key attributes are omitted from the final + * scan key. + */ + key = palloc(offsetof(BTScanInsertData, scankeys) + + sizeof(ScanKeyData) * indnkeyatts); + if (itup) + _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage); + else + { + /* Utility statement callers can set these fields themselves */ + key->heapkeyspace = true; + key->allequalimage = false; + } + key->anynullkeys = false; /* initial assumption */ + key->nextkey = false; + key->pivotsearch = false; + key->keysz = Min(indnkeyatts, tupnatts); + key->scantid = key->heapkeyspace && itup ? + BTreeTupleGetHeapTID(itup) : NULL; + skey = key->scankeys; + for (i = 0; i < indnkeyatts; i++) + { + FmgrInfo *procinfo; + Datum arg; + bool null; + int flags; + + /* + * We can use the cached (default) support procs since no cross-type + * comparison can be needed. + */ + procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC); + + /* + * Key arguments built from truncated attributes (or when caller + * provides no tuple) are defensively represented as NULL values. They + * should never be used. + */ + if (i < tupnatts) + arg = index_getattr(itup, i + 1, itupdesc, &null); + else + { + arg = (Datum) 0; + null = true; + } + flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT); + ScanKeyEntryInitializeWithInfo(&skey[i], + flags, + (AttrNumber) (i + 1), + InvalidStrategy, + InvalidOid, + rel->rd_indcollation[i], + procinfo, + arg); + /* Record if any key attribute is NULL (or truncated) */ + if (null) + key->anynullkeys = true; + } + + return key; +} + +/* + * free a retracement stack made by _bt_search. + */ +void +_bt_freestack(BTStack stack) +{ + BTStack ostack; + + while (stack != NULL) + { + ostack = stack; + stack = stack->bts_parent; + pfree(ostack); + } +} + + +/* + * _bt_preprocess_array_keys() -- Preprocess SK_SEARCHARRAY scan keys + * + * If there are any SK_SEARCHARRAY scan keys, deconstruct the array(s) and + * set up BTArrayKeyInfo info for each one that is an equality-type key. + * Prepare modified scan keys in so->arrayKeyData, which will hold the current + * array elements during each primitive indexscan operation. For inequality + * array keys, it's sufficient to find the extreme element value and replace + * the whole array with that scalar value. + * + * Note: the reason we need so->arrayKeyData, rather than just scribbling + * on scan->keyData, is that callers are permitted to call btrescan without + * supplying a new set of scankey data. + */ +void +_bt_preprocess_array_keys(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + int numberOfKeys = scan->numberOfKeys; + int16 *indoption = scan->indexRelation->rd_indoption; + int numArrayKeys; + ScanKey cur; + int i; + MemoryContext oldContext; + + /* Quick check to see if there are any array keys */ + numArrayKeys = 0; + for (i = 0; i < numberOfKeys; i++) + { + cur = &scan->keyData[i]; + if (cur->sk_flags & SK_SEARCHARRAY) + { + numArrayKeys++; + Assert(!(cur->sk_flags & (SK_ROW_HEADER | SK_SEARCHNULL | SK_SEARCHNOTNULL))); + /* If any arrays are null as a whole, we can quit right now. */ + if (cur->sk_flags & SK_ISNULL) + { + so->numArrayKeys = -1; + so->arrayKeyData = NULL; + return; + } + } + } + + /* Quit if nothing to do. 
*/ + if (numArrayKeys == 0) + { + so->numArrayKeys = 0; + so->arrayKeyData = NULL; + return; + } + + /* + * Make a scan-lifespan context to hold array-associated data, or reset it + * if we already have one from a previous rescan cycle. + */ + if (so->arrayContext == NULL) + so->arrayContext = AllocSetContextCreate(CurrentMemoryContext, + "BTree array context", + ALLOCSET_SMALL_SIZES); + else + MemoryContextReset(so->arrayContext); + + oldContext = MemoryContextSwitchTo(so->arrayContext); + + /* Create modifiable copy of scan->keyData in the workspace context */ + so->arrayKeyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData)); + memcpy(so->arrayKeyData, + scan->keyData, + scan->numberOfKeys * sizeof(ScanKeyData)); + + /* Allocate space for per-array data in the workspace context */ + so->arrayKeys = (BTArrayKeyInfo *) palloc0(numArrayKeys * sizeof(BTArrayKeyInfo)); + + /* Now process each array key */ + numArrayKeys = 0; + for (i = 0; i < numberOfKeys; i++) + { + ArrayType *arrayval; + int16 elmlen; + bool elmbyval; + char elmalign; + int num_elems; + Datum *elem_values; + bool *elem_nulls; + int num_nonnulls; + int j; + + cur = &so->arrayKeyData[i]; + if (!(cur->sk_flags & SK_SEARCHARRAY)) + continue; + + /* + * First, deconstruct the array into elements. Anything allocated + * here (including a possibly detoasted array value) is in the + * workspace context. + */ + arrayval = DatumGetArrayTypeP(cur->sk_argument); + /* We could cache this data, but not clear it's worth it */ + get_typlenbyvalalign(ARR_ELEMTYPE(arrayval), + &elmlen, &elmbyval, &elmalign); + deconstruct_array(arrayval, + ARR_ELEMTYPE(arrayval), + elmlen, elmbyval, elmalign, + &elem_values, &elem_nulls, &num_elems); + + /* + * Compress out any null elements. We can ignore them since we assume + * all btree operators are strict. + */ + num_nonnulls = 0; + for (j = 0; j < num_elems; j++) + { + if (!elem_nulls[j]) + elem_values[num_nonnulls++] = elem_values[j]; + } + + /* We could pfree(elem_nulls) now, but not worth the cycles */ + + /* If there's no non-nulls, the scan qual is unsatisfiable */ + if (num_nonnulls == 0) + { + numArrayKeys = -1; + break; + } + + /* + * If the comparison operator is not equality, then the array qual + * degenerates to a simple comparison against the smallest or largest + * non-null array element, as appropriate. + */ + switch (cur->sk_strategy) + { + case BTLessStrategyNumber: + case BTLessEqualStrategyNumber: + cur->sk_argument = + _bt_find_extreme_element(scan, cur, + BTGreaterStrategyNumber, + elem_values, num_nonnulls); + continue; + case BTEqualStrategyNumber: + /* proceed with rest of loop */ + break; + case BTGreaterEqualStrategyNumber: + case BTGreaterStrategyNumber: + cur->sk_argument = + _bt_find_extreme_element(scan, cur, + BTLessStrategyNumber, + elem_values, num_nonnulls); + continue; + default: + elog(ERROR, "unrecognized StrategyNumber: %d", + (int) cur->sk_strategy); + break; + } + + /* + * Sort the non-null elements and eliminate any duplicates. We must + * sort in the same ordering used by the index column, so that the + * successive primitive indexscans produce data in index order. + */ + num_elems = _bt_sort_array_elements(scan, cur, + (indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0, + elem_values, num_nonnulls); + + /* + * And set up the BTArrayKeyInfo data. 
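The inequality-array simplification described above boils down to picking a single extreme element. In the sketch below, which uses plain ints and no opfamily lookup, a qual like "x < ANY('{5,2,9}')" only needs the greatest element, and "x > ANY(...)" only the least.

#include <stdio.h>

static int
toy_extreme_element(const int *elems, int nelems, int want_greatest)
{
    int result = elems[0];

    for (int i = 1; i < nelems; i++)
    {
        if (want_greatest ? elems[i] > result : elems[i] < result)
            result = elems[i];
    }
    return result;
}

int
main(void)
{
    int elems[] = {5, 2, 9};

    /* "x < ANY" keeps 9; "x > ANY" keeps 2 */
    printf("%d %d\n",
           toy_extreme_element(elems, 3, 1),
           toy_extreme_element(elems, 3, 0));
    return 0;
}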
+ */ + so->arrayKeys[numArrayKeys].scan_key = i; + so->arrayKeys[numArrayKeys].num_elems = num_elems; + so->arrayKeys[numArrayKeys].elem_values = elem_values; + numArrayKeys++; + } + + so->numArrayKeys = numArrayKeys; + + MemoryContextSwitchTo(oldContext); +} + +/* + * _bt_find_extreme_element() -- get least or greatest array element + * + * scan and skey identify the index column, whose opfamily determines the + * comparison semantics. strat should be BTLessStrategyNumber to get the + * least element, or BTGreaterStrategyNumber to get the greatest. + */ +static Datum +_bt_find_extreme_element(IndexScanDesc scan, ScanKey skey, + StrategyNumber strat, + Datum *elems, int nelems) +{ + Relation rel = scan->indexRelation; + Oid elemtype, + cmp_op; + RegProcedure cmp_proc; + FmgrInfo flinfo; + Datum result; + int i; + + /* + * Determine the nominal datatype of the array elements. We have to + * support the convention that sk_subtype == InvalidOid means the opclass + * input type; this is a hack to simplify life for ScanKeyInit(). + */ + elemtype = skey->sk_subtype; + if (elemtype == InvalidOid) + elemtype = rel->rd_opcintype[skey->sk_attno - 1]; + + /* + * Look up the appropriate comparison operator in the opfamily. + * + * Note: it's possible that this would fail, if the opfamily is + * incomplete, but it seems quite unlikely that an opfamily would omit + * non-cross-type comparison operators for any datatype that it supports + * at all. + */ + cmp_op = get_opfamily_member(rel->rd_opfamily[skey->sk_attno - 1], + elemtype, + elemtype, + strat); + if (!OidIsValid(cmp_op)) + elog(ERROR, "missing operator %d(%u,%u) in opfamily %u", + strat, elemtype, elemtype, + rel->rd_opfamily[skey->sk_attno - 1]); + cmp_proc = get_opcode(cmp_op); + if (!RegProcedureIsValid(cmp_proc)) + elog(ERROR, "missing oprcode for operator %u", cmp_op); + + fmgr_info(cmp_proc, &flinfo); + + Assert(nelems > 0); + result = elems[0]; + for (i = 1; i < nelems; i++) + { + if (DatumGetBool(FunctionCall2Coll(&flinfo, + skey->sk_collation, + elems[i], + result))) + result = elems[i]; + } + + return result; +} + +/* + * _bt_sort_array_elements() -- sort and de-dup array elements + * + * The array elements are sorted in-place, and the new number of elements + * after duplicate removal is returned. + * + * scan and skey identify the index column, whose opfamily determines the + * comparison semantics. If reverse is true, we sort in descending order. + */ +static int +_bt_sort_array_elements(IndexScanDesc scan, ScanKey skey, + bool reverse, + Datum *elems, int nelems) +{ + Relation rel = scan->indexRelation; + Oid elemtype; + RegProcedure cmp_proc; + BTSortArrayContext cxt; + + if (nelems <= 1) + return nelems; /* no work to do */ + + /* + * Determine the nominal datatype of the array elements. We have to + * support the convention that sk_subtype == InvalidOid means the opclass + * input type; this is a hack to simplify life for ScanKeyInit(). + */ + elemtype = skey->sk_subtype; + if (elemtype == InvalidOid) + elemtype = rel->rd_opcintype[skey->sk_attno - 1]; + + /* + * Look up the appropriate comparison function in the opfamily. + * + * Note: it's possible that this would fail, if the opfamily is + * incomplete, but it seems quite unlikely that an opfamily would omit + * non-cross-type support functions for any datatype that it supports at + * all. 
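The sort-and-deduplicate behaviour of _bt_sort_array_elements can be illustrated without the fmgr or opfamily machinery: sort plain ints with qsort, then squeeze out adjacent duplicates in place and return the new count, which is the role qunique_arg plays for the real code.

#include <stdio.h>
#include <stdlib.h>

/* Ascending comparator for plain ints; the real code calls the opfamily's
 * BTORDER_PROC support function through fmgr instead. */
static int
cmp_int(const void *a, const void *b)
{
    int ia = *(const int *) a;
    int ib = *(const int *) b;

    return (ia > ib) - (ia < ib);
}

static int
toy_sort_dedup(int *elems, int nelems)
{
    int nout = 1;

    if (nelems <= 1)
        return nelems;
    qsort(elems, nelems, sizeof(int), cmp_int);
    for (int i = 1; i < nelems; i++)
    {
        if (elems[i] != elems[nout - 1])
            elems[nout++] = elems[i];
    }
    return nout;
}

int
main(void)
{
    int elems[] = {7, 3, 7, 1, 3};
    int n = toy_sort_dedup(elems, 5);

    for (int i = 0; i < n; i++)
        printf("%d ", elems[i]);    /* prints: 1 3 7 */
    printf("\n");
    return 0;
}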
+ */ + cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1], + elemtype, + elemtype, + BTORDER_PROC); + if (!RegProcedureIsValid(cmp_proc)) + elog(ERROR, "missing support function %d(%u,%u) in opfamily %u", + BTORDER_PROC, elemtype, elemtype, + rel->rd_opfamily[skey->sk_attno - 1]); + + /* Sort the array elements */ + fmgr_info(cmp_proc, &cxt.flinfo); + cxt.collation = skey->sk_collation; + cxt.reverse = reverse; + qsort_arg((void *) elems, nelems, sizeof(Datum), + _bt_compare_array_elements, (void *) &cxt); + + /* Now scan the sorted elements and remove duplicates */ + return qunique_arg(elems, nelems, sizeof(Datum), + _bt_compare_array_elements, &cxt); +} + +/* + * qsort_arg comparator for sorting array elements + */ +static int +_bt_compare_array_elements(const void *a, const void *b, void *arg) +{ + Datum da = *((const Datum *) a); + Datum db = *((const Datum *) b); + BTSortArrayContext *cxt = (BTSortArrayContext *) arg; + int32 compare; + + compare = DatumGetInt32(FunctionCall2Coll(&cxt->flinfo, + cxt->collation, + da, db)); + if (cxt->reverse) + INVERT_COMPARE_RESULT(compare); + return compare; +} + +/* + * _bt_start_array_keys() -- Initialize array keys at start of a scan + * + * Set up the cur_elem counters and fill in the first sk_argument value for + * each array scankey. We can't do this until we know the scan direction. + */ +void +_bt_start_array_keys(IndexScanDesc scan, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + int i; + + for (i = 0; i < so->numArrayKeys; i++) + { + BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; + ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key]; + + Assert(curArrayKey->num_elems > 0); + if (ScanDirectionIsBackward(dir)) + curArrayKey->cur_elem = curArrayKey->num_elems - 1; + else + curArrayKey->cur_elem = 0; + skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem]; + } +} + +/* + * _bt_advance_array_keys() -- Advance to next set of array elements + * + * Returns true if there is another set of values to consider, false if not. + * On true result, the scankeys are initialized with the next set of values. + */ +bool +_bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + bool found = false; + int i; + + /* + * We must advance the last array key most quickly, since it will + * correspond to the lowest-order index column among the available + * qualifications. This is necessary to ensure correct ordering of output + * when there are multiple array keys. + */ + for (i = so->numArrayKeys - 1; i >= 0; i--) + { + BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; + ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key]; + int cur_elem = curArrayKey->cur_elem; + int num_elems = curArrayKey->num_elems; + + if (ScanDirectionIsBackward(dir)) + { + if (--cur_elem < 0) + { + cur_elem = num_elems - 1; + found = false; /* need to advance next array key */ + } + else + found = true; + } + else + { + if (++cur_elem >= num_elems) + { + cur_elem = 0; + found = false; /* need to advance next array key */ + } + else + found = true; + } + + curArrayKey->cur_elem = cur_elem; + skey->sk_argument = curArrayKey->elem_values[cur_elem]; + if (found) + break; + } + + /* advance parallel scan */ + if (scan->parallel_scan != NULL) + _bt_parallel_advance_array_keys(scan); + + return found; +} + +/* + * _bt_mark_array_keys() -- Handle array keys during btmarkpos + * + * Save the current state of the array keys as the "mark" position. 
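The array-key advancement loop in _bt_advance_array_keys above behaves like an odometer, with the last array key ticking fastest. Below is a minimal standalone version over made-up element counts, assuming a forward scan direction.

#include <stdbool.h>
#include <stdio.h>

#define NARRAYS 2

static bool
toy_advance(int *cur, const int *nelems)
{
    for (int i = NARRAYS - 1; i >= 0; i--)
    {
        if (++cur[i] < nelems[i])
            return true;    /* this key advanced; outer keys stay put */
        cur[i] = 0;         /* wrapped around; advance the next key out */
    }
    return false;           /* every combination has been visited */
}

int
main(void)
{
    int nelems[NARRAYS] = {2, 3};
    int cur[NARRAYS] = {0, 0};

    do
        printf("combination: (%d, %d)\n", cur[0], cur[1]);
    while (toy_advance(cur, nelems));
    return 0;
}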
+ */ +void +_bt_mark_array_keys(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + int i; + + for (i = 0; i < so->numArrayKeys; i++) + { + BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; + + curArrayKey->mark_elem = curArrayKey->cur_elem; + } +} + +/* + * _bt_restore_array_keys() -- Handle array keys during btrestrpos + * + * Restore the array keys to where they were when the mark was set. + */ +void +_bt_restore_array_keys(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + bool changed = false; + int i; + + /* Restore each array key to its position when the mark was set */ + for (i = 0; i < so->numArrayKeys; i++) + { + BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; + ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key]; + int mark_elem = curArrayKey->mark_elem; + + if (curArrayKey->cur_elem != mark_elem) + { + curArrayKey->cur_elem = mark_elem; + skey->sk_argument = curArrayKey->elem_values[mark_elem]; + changed = true; + } + } + + /* + * If we changed any keys, we must redo _bt_preprocess_keys. That might + * sound like overkill, but in cases with multiple keys per index column + * it seems necessary to do the full set of pushups. + */ + if (changed) + { + _bt_preprocess_keys(scan); + /* The mark should have been set on a consistent set of keys... */ + Assert(so->qual_ok); + } +} + + +/* + * _bt_preprocess_keys() -- Preprocess scan keys + * + * The given search-type keys (in scan->keyData[] or so->arrayKeyData[]) + * are copied to so->keyData[] with possible transformation. + * scan->numberOfKeys is the number of input keys, so->numberOfKeys gets + * the number of output keys (possibly less, never greater). + * + * The output keys are marked with additional sk_flags bits beyond the + * system-standard bits supplied by the caller. The DESC and NULLS_FIRST + * indoption bits for the relevant index attribute are copied into the flags. + * Also, for a DESC column, we commute (flip) all the sk_strategy numbers + * so that the index sorts in the desired direction. + * + * One key purpose of this routine is to discover which scan keys must be + * satisfied to continue the scan. It also attempts to eliminate redundant + * keys and detect contradictory keys. (If the index opfamily provides + * incomplete sets of cross-type operators, we may fail to detect redundant + * or contradictory keys, but we can survive that.) + * + * The output keys must be sorted by index attribute. Presently we expect + * (but verify) that the input keys are already so sorted --- this is done + * by match_clauses_to_index() in indxpath.c. Some reordering of the keys + * within each attribute may be done as a byproduct of the processing here, + * but no other code depends on that. + * + * The output keys are marked with flags SK_BT_REQFWD and/or SK_BT_REQBKWD + * if they must be satisfied in order to continue the scan forward or backward + * respectively. _bt_checkkeys uses these flags. For example, if the quals + * are "x = 1 AND y < 4 AND z < 5", then _bt_checkkeys will reject a tuple + * (1,2,7), but we must continue the scan in case there are tuples (1,3,z). + * But once we reach tuples like (1,4,z) we can stop scanning because no + * later tuples could match. This is reflected by marking the x and y keys, + * but not the z key, with SK_BT_REQFWD. In general, the keys for leading + * attributes with "=" keys are marked both SK_BT_REQFWD and SK_BT_REQBKWD. 
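To make the forward-scan example above concrete ("x = 1 AND y < 4 AND z < 5", with tuples such as (1,2,7) and (1,4,z)), here is a heavily simplified sketch of the required-key idea: the x and y keys are treated as required for a forward scan, the z key is not, and the scan is assumed to have already been positioned at the first potential match, as _bt_first would arrange. This only illustrates the flag semantics, not _bt_checkkeys itself.

#include <stdbool.h>
#include <stdio.h>

static bool
toy_checkkeys(const int tup[3], bool *continuescan)
{
    *continuescan = true;

    if (tup[0] != 1)        /* required "=" key on x has stopped matching */
    {
        *continuescan = false;
        return false;
    }
    if (!(tup[1] < 4))      /* required "<" key on y fails: scan can stop */
    {
        *continuescan = false;
        return false;
    }
    if (!(tup[2] < 5))      /* z key fails, but later tuples could still match */
        return false;
    return true;
}

int
main(void)
{
    bool cont;
    bool match;
    int t1[3] = {1, 2, 7};  /* rejected, but the scan must continue */
    int t2[3] = {1, 4, 2};  /* rejected, and the scan can stop */

    match = toy_checkkeys(t1, &cont);
    printf("t1: match=%d continue=%d\n", match, cont);
    match = toy_checkkeys(t2, &cont);
    printf("t2: match=%d continue=%d\n", match, cont);
    return 0;
}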
+ * For the first attribute without an "=" key, any "<" and "<=" keys are
+ * marked SK_BT_REQFWD while any ">" and ">=" keys are marked SK_BT_REQBKWD.
+ * This can be seen to be correct by considering the above example. Note
+ * in particular that if there are no keys for a given attribute, the keys for
+ * subsequent attributes can never be required; for instance "WHERE y = 4"
+ * requires a full-index scan.
+ *
+ * If possible, redundant keys are eliminated: we keep only the tightest
+ * >/>= bound and the tightest </<= bound, and if there's an = key then
+ * that's the only one returned. (So, we return either a single = key,
+ * or one or two boundary-condition keys for each attr.) However, if we
+ * cannot compare two keys for lack of a suitable cross-type operator,
+ * we cannot eliminate either. If there are two such keys of the same
+ * operator strategy, the second one is just pushed into the output array
+ * without further processing here. We may also emit both >/>= or both
+ * </<= keys if we can't compare them. The logic about required keys still
+ * works if we don't eliminate redundant keys.
+ *
+ * Note that one reason we need direction-sensitive required-key flags is
+ * precisely that we may not be able to eliminate redundant keys. Suppose
+ * we have "x > 4::int AND x > 10::bigint", and we are unable to determine
+ * which key is more restrictive for lack of a suitable cross-type operator.
+ * _bt_first will arbitrarily pick one of the keys to do the initial
+ * positioning with. If it picks x > 4, then the x > 10 condition will fail
+ * until we reach index entries > 10; but we can't stop the scan just because
+ * x > 10 is failing. On the other hand, if we are scanning backwards, then
+ * failure of either key is indeed enough to stop the scan. (In general, when
+ * inequality keys are present, the initial-positioning code only promises to
+ * position before the first possible match, not exactly at the first match,
+ * for a forward scan; or after the last match for a backward scan.)
+ *
+ * As a byproduct of this work, we can detect contradictory quals such
+ * as "x = 1 AND x > 2". If we see that, we return so->qual_ok = false,
+ * indicating the scan need not be run at all since no tuples can match.
+ * (In this case we do not bother completing the output key array!)
+ * Again, missing cross-type operators might cause us to fail to prove the
+ * quals contradictory when they really are, but the scan will work correctly.
+ *
+ * Row comparison keys are currently also treated without any smarts:
+ * we just transfer them into the preprocessed array without any
+ * editorialization. We can treat them the same as an ordinary inequality
+ * comparison on the row's first index column, for the purposes of the logic
+ * about required keys.
+ *
+ * Note: the reason we have to copy the preprocessed scan keys into private
+ * storage is that we are modifying the array based on comparisons of the
+ * key argument values, which could change on a rescan or after moving to
+ * new elements of array keys. Therefore we can't overwrite the source data.
+ */
+void
+_bt_preprocess_keys(IndexScanDesc scan)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ int numberOfKeys = scan->numberOfKeys;
+ int16 *indoption = scan->indexRelation->rd_indoption;
+ int new_numberOfKeys;
+ int numberOfEqualCols;
+ ScanKey inkeys;
+ ScanKey outkeys;
+ ScanKey cur;
+ ScanKey xform[BTMaxStrategyNumber];
+ bool test_result;
+ int i,
+ j;
+ AttrNumber attno;
+
+ /* initialize result variables */
+ so->qual_ok = true;
+ so->numberOfKeys = 0;
+
+ if (numberOfKeys < 1)
+ return; /* done if qual-less scan */
+
+ /*
+ * Read so->arrayKeyData if array keys are present, else scan->keyData
+ */
+ if (so->arrayKeyData != NULL)
+ inkeys = so->arrayKeyData;
+ else
+ inkeys = scan->keyData;
+
+ outkeys = so->keyData;
+ cur = &inkeys[0];
+ /* we check that input keys are correctly ordered */
+ if (cur->sk_attno < 1)
+ elog(ERROR, "btree index keys must be ordered by attribute");
+
+ /* We can short-circuit most of the work if there's just one key */
+ if (numberOfKeys == 1)
+ {
+ /* Apply indoption to scankey (might change sk_strategy!)
*/ + if (!_bt_fix_scankey_strategy(cur, indoption)) + so->qual_ok = false; + memcpy(outkeys, cur, sizeof(ScanKeyData)); + so->numberOfKeys = 1; + /* We can mark the qual as required if it's for first index col */ + if (cur->sk_attno == 1) + _bt_mark_scankey_required(outkeys); + return; + } + + /* + * Otherwise, do the full set of pushups. + */ + new_numberOfKeys = 0; + numberOfEqualCols = 0; + + /* + * Initialize for processing of keys for attr 1. + * + * xform[i] points to the currently best scan key of strategy type i+1; it + * is NULL if we haven't yet found such a key for this attr. + */ + attno = 1; + memset(xform, 0, sizeof(xform)); + + /* + * Loop iterates from 0 to numberOfKeys inclusive; we use the last pass to + * handle after-last-key processing. Actual exit from the loop is at the + * "break" statement below. + */ + for (i = 0;; cur++, i++) + { + if (i < numberOfKeys) + { + /* Apply indoption to scankey (might change sk_strategy!) */ + if (!_bt_fix_scankey_strategy(cur, indoption)) + { + /* NULL can't be matched, so give up */ + so->qual_ok = false; + return; + } + } + + /* + * If we are at the end of the keys for a particular attr, finish up + * processing and emit the cleaned-up keys. + */ + if (i == numberOfKeys || cur->sk_attno != attno) + { + int priorNumberOfEqualCols = numberOfEqualCols; + + /* check input keys are correctly ordered */ + if (i < numberOfKeys && cur->sk_attno < attno) + elog(ERROR, "btree index keys must be ordered by attribute"); + + /* + * If = has been specified, all other keys can be eliminated as + * redundant. If we have a case like key = 1 AND key > 2, we can + * set qual_ok to false and abandon further processing. + * + * We also have to deal with the case of "key IS NULL", which is + * unsatisfiable in combination with any other index condition. By + * the time we get here, that's been classified as an equality + * check, and we've rejected any combination of it with a regular + * equality condition; but not with other types of conditions. 
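+ *
+ * For example, "x IS NULL AND x > 42" arrives here with the IS NULL key
+ * sitting in the equality slot; pairing it with the ">" key below proves
+ * the qual unsatisfiable, so we can give up on the scan immediately.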
+ */ + if (xform[BTEqualStrategyNumber - 1]) + { + ScanKey eq = xform[BTEqualStrategyNumber - 1]; + + for (j = BTMaxStrategyNumber; --j >= 0;) + { + ScanKey chk = xform[j]; + + if (!chk || j == (BTEqualStrategyNumber - 1)) + continue; + + if (eq->sk_flags & SK_SEARCHNULL) + { + /* IS NULL is contradictory to anything else */ + so->qual_ok = false; + return; + } + + if (_bt_compare_scankey_args(scan, chk, eq, chk, + &test_result)) + { + if (!test_result) + { + /* keys proven mutually contradictory */ + so->qual_ok = false; + return; + } + /* else discard the redundant non-equality key */ + xform[j] = NULL; + } + /* else, cannot determine redundancy, keep both keys */ + } + /* track number of attrs for which we have "=" keys */ + numberOfEqualCols++; + } + + /* try to keep only one of <, <= */ + if (xform[BTLessStrategyNumber - 1] + && xform[BTLessEqualStrategyNumber - 1]) + { + ScanKey lt = xform[BTLessStrategyNumber - 1]; + ScanKey le = xform[BTLessEqualStrategyNumber - 1]; + + if (_bt_compare_scankey_args(scan, le, lt, le, + &test_result)) + { + if (test_result) + xform[BTLessEqualStrategyNumber - 1] = NULL; + else + xform[BTLessStrategyNumber - 1] = NULL; + } + } + + /* try to keep only one of >, >= */ + if (xform[BTGreaterStrategyNumber - 1] + && xform[BTGreaterEqualStrategyNumber - 1]) + { + ScanKey gt = xform[BTGreaterStrategyNumber - 1]; + ScanKey ge = xform[BTGreaterEqualStrategyNumber - 1]; + + if (_bt_compare_scankey_args(scan, ge, gt, ge, + &test_result)) + { + if (test_result) + xform[BTGreaterEqualStrategyNumber - 1] = NULL; + else + xform[BTGreaterStrategyNumber - 1] = NULL; + } + } + + /* + * Emit the cleaned-up keys into the outkeys[] array, and then + * mark them if they are required. They are required (possibly + * only in one direction) if all attrs before this one had "=". + */ + for (j = BTMaxStrategyNumber; --j >= 0;) + { + if (xform[j]) + { + ScanKey outkey = &outkeys[new_numberOfKeys++]; + + memcpy(outkey, xform[j], sizeof(ScanKeyData)); + if (priorNumberOfEqualCols == attno - 1) + _bt_mark_scankey_required(outkey); + } + } + + /* + * Exit loop here if done. + */ + if (i == numberOfKeys) + break; + + /* Re-initialize for new attno */ + attno = cur->sk_attno; + memset(xform, 0, sizeof(xform)); + } + + /* check strategy this key's operator corresponds to */ + j = cur->sk_strategy - 1; + + /* if row comparison, push it directly to the output array */ + if (cur->sk_flags & SK_ROW_HEADER) + { + ScanKey outkey = &outkeys[new_numberOfKeys++]; + + memcpy(outkey, cur, sizeof(ScanKeyData)); + if (numberOfEqualCols == attno - 1) + _bt_mark_scankey_required(outkey); + + /* + * We don't support RowCompare using equality; such a qual would + * mess up the numberOfEqualCols tracking. + */ + Assert(j != (BTEqualStrategyNumber - 1)); + continue; + } + + /* have we seen one of these before? */ + if (xform[j] == NULL) + { + /* nope, so remember this scankey */ + xform[j] = cur; + } + else + { + /* yup, keep only the more restrictive key */ + if (_bt_compare_scankey_args(scan, cur, cur, xform[j], + &test_result)) + { + if (test_result) + xform[j] = cur; + else if (j == (BTEqualStrategyNumber - 1)) + { + /* key == a && key == b, but a != b */ + so->qual_ok = false; + return; + } + /* else old key is more restrictive, keep it */ + } + else + { + /* + * We can't determine which key is more restrictive. Keep the + * previous one in xform[j] and push this one directly to the + * output array. 
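+ *
+ * This is the "x > 4::int AND x > 10::bigint" case from the header
+ * comment: with no suitable cross-type operator, both keys are emitted
+ * and _bt_checkkeys() simply evaluates each of them against every tuple.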
+ */ + ScanKey outkey = &outkeys[new_numberOfKeys++]; + + memcpy(outkey, cur, sizeof(ScanKeyData)); + if (numberOfEqualCols == attno - 1) + _bt_mark_scankey_required(outkey); + } + } + } + + so->numberOfKeys = new_numberOfKeys; +} + +/* + * Compare two scankey values using a specified operator. + * + * The test we want to perform is logically "leftarg op rightarg", where + * leftarg and rightarg are the sk_argument values in those ScanKeys, and + * the comparison operator is the one in the op ScanKey. However, in + * cross-data-type situations we may need to look up the correct operator in + * the index's opfamily: it is the one having amopstrategy = op->sk_strategy + * and amoplefttype/amoprighttype equal to the two argument datatypes. + * + * If the opfamily doesn't supply a complete set of cross-type operators we + * may not be able to make the comparison. If we can make the comparison + * we store the operator result in *result and return true. We return false + * if the comparison could not be made. + * + * Note: op always points at the same ScanKey as either leftarg or rightarg. + * Since we don't scribble on the scankeys, this aliasing should cause no + * trouble. + * + * Note: this routine needs to be insensitive to any DESC option applied + * to the index column. For example, "x < 4" is a tighter constraint than + * "x < 5" regardless of which way the index is sorted. + */ +static bool +_bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, + ScanKey leftarg, ScanKey rightarg, + bool *result) +{ + Relation rel = scan->indexRelation; + Oid lefttype, + righttype, + optype, + opcintype, + cmp_op; + StrategyNumber strat; + + /* + * First, deal with cases where one or both args are NULL. This should + * only happen when the scankeys represent IS NULL/NOT NULL conditions. + */ + if ((leftarg->sk_flags | rightarg->sk_flags) & SK_ISNULL) + { + bool leftnull, + rightnull; + + if (leftarg->sk_flags & SK_ISNULL) + { + Assert(leftarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL)); + leftnull = true; + } + else + leftnull = false; + if (rightarg->sk_flags & SK_ISNULL) + { + Assert(rightarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL)); + rightnull = true; + } + else + rightnull = false; + + /* + * We treat NULL as either greater than or less than all other values. + * Since true > false, the tests below work correctly for NULLS LAST + * logic. If the index is NULLS FIRST, we need to flip the strategy. + */ + strat = op->sk_strategy; + if (op->sk_flags & SK_BT_NULLS_FIRST) + strat = BTCommuteStrategyNumber(strat); + + switch (strat) + { + case BTLessStrategyNumber: + *result = (leftnull < rightnull); + break; + case BTLessEqualStrategyNumber: + *result = (leftnull <= rightnull); + break; + case BTEqualStrategyNumber: + *result = (leftnull == rightnull); + break; + case BTGreaterEqualStrategyNumber: + *result = (leftnull >= rightnull); + break; + case BTGreaterStrategyNumber: + *result = (leftnull > rightnull); + break; + default: + elog(ERROR, "unrecognized StrategyNumber: %d", (int) strat); + *result = false; /* keep compiler quiet */ + break; + } + return true; + } + + /* + * The opfamily we need to worry about is identified by the index column. + */ + Assert(leftarg->sk_attno == rightarg->sk_attno); + + opcintype = rel->rd_opcintype[leftarg->sk_attno - 1]; + + /* + * Determine the actual datatypes of the ScanKey arguments. We have to + * support the convention that sk_subtype == InvalidOid means the opclass + * input type; this is a hack to simplify life for ScanKeyInit(). 
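+ *
+ * (A scankey built with ScanKeyInit() leaves sk_subtype set to InvalidOid,
+ * which under this convention means its argument is of the opclass input
+ * type itself.)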
+ */ + lefttype = leftarg->sk_subtype; + if (lefttype == InvalidOid) + lefttype = opcintype; + righttype = rightarg->sk_subtype; + if (righttype == InvalidOid) + righttype = opcintype; + optype = op->sk_subtype; + if (optype == InvalidOid) + optype = opcintype; + + /* + * If leftarg and rightarg match the types expected for the "op" scankey, + * we can use its already-looked-up comparison function. + */ + if (lefttype == opcintype && righttype == optype) + { + *result = DatumGetBool(FunctionCall2Coll(&op->sk_func, + op->sk_collation, + leftarg->sk_argument, + rightarg->sk_argument)); + return true; + } + + /* + * Otherwise, we need to go to the syscache to find the appropriate + * operator. (This cannot result in infinite recursion, since no + * indexscan initiated by syscache lookup will use cross-data-type + * operators.) + * + * If the sk_strategy was flipped by _bt_fix_scankey_strategy, we have to + * un-flip it to get the correct opfamily member. + */ + strat = op->sk_strategy; + if (op->sk_flags & SK_BT_DESC) + strat = BTCommuteStrategyNumber(strat); + + cmp_op = get_opfamily_member(rel->rd_opfamily[leftarg->sk_attno - 1], + lefttype, + righttype, + strat); + if (OidIsValid(cmp_op)) + { + RegProcedure cmp_proc = get_opcode(cmp_op); + + if (RegProcedureIsValid(cmp_proc)) + { + *result = DatumGetBool(OidFunctionCall2Coll(cmp_proc, + op->sk_collation, + leftarg->sk_argument, + rightarg->sk_argument)); + return true; + } + } + + /* Can't make the comparison */ + *result = false; /* suppress compiler warnings */ + return false; +} + +/* + * Adjust a scankey's strategy and flags setting as needed for indoptions. + * + * We copy the appropriate indoption value into the scankey sk_flags + * (shifting to avoid clobbering system-defined flag bits). Also, if + * the DESC option is set, commute (flip) the operator strategy number. + * + * A secondary purpose is to check for IS NULL/NOT NULL scankeys and set up + * the strategy field correctly for them. + * + * Lastly, for ordinary scankeys (not IS NULL/NOT NULL), we check for a + * NULL comparison value. Since all btree operators are assumed strict, + * a NULL means that the qual cannot be satisfied. We return true if the + * comparison value isn't NULL, or false if the scan should be abandoned. + * + * This function is applied to the *input* scankey structure; therefore + * on a rescan we will be looking at already-processed scankeys. Hence + * we have to be careful not to re-commute the strategy if we already did it. + * It's a bit ugly to modify the caller's copy of the scankey but in practice + * there shouldn't be any problem, since the index's indoptions are certainly + * not going to change while the scankey survives. + */ +static bool +_bt_fix_scankey_strategy(ScanKey skey, int16 *indoption) +{ + int addflags; + + addflags = indoption[skey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT; + + /* + * We treat all btree operators as strict (even if they're not so marked + * in pg_proc). This means that it is impossible for an operator condition + * with a NULL comparison constant to succeed, and we can reject it right + * away. + * + * However, we now also support "x IS NULL" clauses as search conditions, + * so in that case keep going. The planner has not filled in any + * particular strategy in this case, so set it to BTEqualStrategyNumber + * --- we can treat IS NULL as an equality operator for purposes of search + * strategy. + * + * Likewise, "x IS NOT NULL" is supported. 
We treat that as either "less + * than NULL" in a NULLS LAST index, or "greater than NULL" in a NULLS + * FIRST index. + * + * Note: someday we might have to fill in sk_collation from the index + * column's collation. At the moment this is a non-issue because we'll + * never actually call the comparison operator on a NULL. + */ + if (skey->sk_flags & SK_ISNULL) + { + /* SK_ISNULL shouldn't be set in a row header scankey */ + Assert(!(skey->sk_flags & SK_ROW_HEADER)); + + /* Set indoption flags in scankey (might be done already) */ + skey->sk_flags |= addflags; + + /* Set correct strategy for IS NULL or NOT NULL search */ + if (skey->sk_flags & SK_SEARCHNULL) + { + skey->sk_strategy = BTEqualStrategyNumber; + skey->sk_subtype = InvalidOid; + skey->sk_collation = InvalidOid; + } + else if (skey->sk_flags & SK_SEARCHNOTNULL) + { + if (skey->sk_flags & SK_BT_NULLS_FIRST) + skey->sk_strategy = BTGreaterStrategyNumber; + else + skey->sk_strategy = BTLessStrategyNumber; + skey->sk_subtype = InvalidOid; + skey->sk_collation = InvalidOid; + } + else + { + /* regular qual, so it cannot be satisfied */ + return false; + } + + /* Needn't do the rest */ + return true; + } + + /* Adjust strategy for DESC, if we didn't already */ + if ((addflags & SK_BT_DESC) && !(skey->sk_flags & SK_BT_DESC)) + skey->sk_strategy = BTCommuteStrategyNumber(skey->sk_strategy); + skey->sk_flags |= addflags; + + /* If it's a row header, fix row member flags and strategies similarly */ + if (skey->sk_flags & SK_ROW_HEADER) + { + ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument); + + for (;;) + { + Assert(subkey->sk_flags & SK_ROW_MEMBER); + addflags = indoption[subkey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT; + if ((addflags & SK_BT_DESC) && !(subkey->sk_flags & SK_BT_DESC)) + subkey->sk_strategy = BTCommuteStrategyNumber(subkey->sk_strategy); + subkey->sk_flags |= addflags; + if (subkey->sk_flags & SK_ROW_END) + break; + subkey++; + } + } + + return true; +} + +/* + * Mark a scankey as "required to continue the scan". + * + * Depending on the operator type, the key may be required for both scan + * directions or just one. Also, if the key is a row comparison header, + * we have to mark its first subsidiary ScanKey as required. (Subsequent + * subsidiary ScanKeys are normally for lower-order columns, and thus + * cannot be required, since they're after the first non-equality scankey.) + * + * Note: when we set required-key flag bits in a subsidiary scankey, we are + * scribbling on a data structure belonging to the index AM's caller, not on + * our private copy. This should be OK because the marking will not change + * from scan to scan within a query, and so we'd just re-mark the same way + * anyway on a rescan. Something to keep an eye on though. 
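+ *
+ * For example, a "<=" key is marked SK_BT_REQFWD only: once it fails during
+ * a forward scan, no later (larger) tuple can satisfy it, whereas a backward
+ * scan may still reach smaller values that do.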
+ */ +static void +_bt_mark_scankey_required(ScanKey skey) +{ + int addflags; + + switch (skey->sk_strategy) + { + case BTLessStrategyNumber: + case BTLessEqualStrategyNumber: + addflags = SK_BT_REQFWD; + break; + case BTEqualStrategyNumber: + addflags = SK_BT_REQFWD | SK_BT_REQBKWD; + break; + case BTGreaterEqualStrategyNumber: + case BTGreaterStrategyNumber: + addflags = SK_BT_REQBKWD; + break; + default: + elog(ERROR, "unrecognized StrategyNumber: %d", + (int) skey->sk_strategy); + addflags = 0; /* keep compiler quiet */ + break; + } + + skey->sk_flags |= addflags; + + if (skey->sk_flags & SK_ROW_HEADER) + { + ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument); + + /* First subkey should be same column/operator as the header */ + Assert(subkey->sk_flags & SK_ROW_MEMBER); + Assert(subkey->sk_attno == skey->sk_attno); + Assert(subkey->sk_strategy == skey->sk_strategy); + subkey->sk_flags |= addflags; + } +} + +/* + * Test whether an indextuple satisfies all the scankey conditions. + * + * Return true if so, false if not. If the tuple fails to pass the qual, + * we also determine whether there's any need to continue the scan beyond + * this tuple, and set *continuescan accordingly. See comments for + * _bt_preprocess_keys(), above, about how this is done. + * + * Forward scan callers can pass a high key tuple in the hopes of having + * us set *continuescan to false, and avoiding an unnecessary visit to + * the page to the right. + * + * scan: index scan descriptor (containing a search-type scankey) + * tuple: index tuple to test + * tupnatts: number of attributes in tupnatts (high key may be truncated) + * dir: direction we are scanning in + * continuescan: output parameter (will be set correctly in all cases) + */ +bool +_bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, + ScanDirection dir, bool *continuescan) +{ + TupleDesc tupdesc; + BTScanOpaque so; + int keysz; + int ikey; + ScanKey key; + + Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts); + + *continuescan = true; /* default assumption */ + + tupdesc = RelationGetDescr(scan->indexRelation); + so = (BTScanOpaque) scan->opaque; + keysz = so->numberOfKeys; + + for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++) + { + Datum datum; + bool isNull; + Datum test; + + if (key->sk_attno > tupnatts) + { + /* + * This attribute is truncated (must be high key). The value for + * this attribute in the first non-pivot tuple on the page to the + * right could be any possible value. Assume that truncated + * attribute passes the qual. + */ + Assert(ScanDirectionIsForward(dir)); + Assert(BTreeTupleIsPivot(tuple)); + continue; + } + + /* row-comparison keys need special processing */ + if (key->sk_flags & SK_ROW_HEADER) + { + if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir, + continuescan)) + continue; + return false; + } + + datum = index_getattr(tuple, + key->sk_attno, + tupdesc, + &isNull); + + if (key->sk_flags & SK_ISNULL) + { + /* Handle IS NULL/NOT NULL tests */ + if (key->sk_flags & SK_SEARCHNULL) + { + if (isNull) + continue; /* tuple satisfies this qual */ + } + else + { + Assert(key->sk_flags & SK_SEARCHNOTNULL); + if (!isNull) + continue; /* tuple satisfies this qual */ + } + + /* + * Tuple fails this qual. If it's a required qual for the current + * scan direction, then we can conclude no further tuples will + * pass, either. 
+ */ + if ((key->sk_flags & SK_BT_REQFWD) && + ScanDirectionIsForward(dir)) + *continuescan = false; + else if ((key->sk_flags & SK_BT_REQBKWD) && + ScanDirectionIsBackward(dir)) + *continuescan = false; + + /* + * In any case, this indextuple doesn't match the qual. + */ + return false; + } + + if (isNull) + { + if (key->sk_flags & SK_BT_NULLS_FIRST) + { + /* + * Since NULLs are sorted before non-NULLs, we know we have + * reached the lower limit of the range of values for this + * index attr. On a backward scan, we can stop if this qual + * is one of the "must match" subset. We can stop regardless + * of whether the qual is > or <, so long as it's required, + * because it's not possible for any future tuples to pass. On + * a forward scan, however, we must keep going, because we may + * have initially positioned to the start of the index. + */ + if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && + ScanDirectionIsBackward(dir)) + *continuescan = false; + } + else + { + /* + * Since NULLs are sorted after non-NULLs, we know we have + * reached the upper limit of the range of values for this + * index attr. On a forward scan, we can stop if this qual is + * one of the "must match" subset. We can stop regardless of + * whether the qual is > or <, so long as it's required, + * because it's not possible for any future tuples to pass. On + * a backward scan, however, we must keep going, because we + * may have initially positioned to the end of the index. + */ + if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && + ScanDirectionIsForward(dir)) + *continuescan = false; + } + + /* + * In any case, this indextuple doesn't match the qual. + */ + return false; + } + + test = FunctionCall2Coll(&key->sk_func, key->sk_collation, + datum, key->sk_argument); + + if (!DatumGetBool(test)) + { + /* + * Tuple fails this qual. If it's a required qual for the current + * scan direction, then we can conclude no further tuples will + * pass, either. + * + * Note: because we stop the scan as soon as any required equality + * qual fails, it is critical that equality quals be used for the + * initial positioning in _bt_first() when they are available. See + * comments in _bt_first(). + */ + if ((key->sk_flags & SK_BT_REQFWD) && + ScanDirectionIsForward(dir)) + *continuescan = false; + else if ((key->sk_flags & SK_BT_REQBKWD) && + ScanDirectionIsBackward(dir)) + *continuescan = false; + + /* + * In any case, this indextuple doesn't match the qual. + */ + return false; + } + } + + /* If we get here, the tuple passes all index quals. */ + return true; +} + +/* + * Test whether an indextuple satisfies a row-comparison scan condition. + * + * Return true if so, false if not. If not, also clear *continuescan if + * it's not possible for any future tuples in the current scan direction + * to pass the qual. + * + * This is a subroutine for _bt_checkkeys, which see for more info. + */ +static bool +_bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, + TupleDesc tupdesc, ScanDirection dir, bool *continuescan) +{ + ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument); + int32 cmpresult = 0; + bool result; + + /* First subkey should be same as the header says */ + Assert(subkey->sk_attno == skey->sk_attno); + + /* Loop over columns of the row condition */ + for (;;) + { + Datum datum; + bool isNull; + + Assert(subkey->sk_flags & SK_ROW_MEMBER); + + if (subkey->sk_attno > tupnatts) + { + /* + * This attribute is truncated (must be high key). 
The value for + * this attribute in the first non-pivot tuple on the page to the + * right could be any possible value. Assume that truncated + * attribute passes the qual. + */ + Assert(ScanDirectionIsForward(dir)); + Assert(BTreeTupleIsPivot(tuple)); + cmpresult = 0; + if (subkey->sk_flags & SK_ROW_END) + break; + subkey++; + continue; + } + + datum = index_getattr(tuple, + subkey->sk_attno, + tupdesc, + &isNull); + + if (isNull) + { + if (subkey->sk_flags & SK_BT_NULLS_FIRST) + { + /* + * Since NULLs are sorted before non-NULLs, we know we have + * reached the lower limit of the range of values for this + * index attr. On a backward scan, we can stop if this qual + * is one of the "must match" subset. We can stop regardless + * of whether the qual is > or <, so long as it's required, + * because it's not possible for any future tuples to pass. On + * a forward scan, however, we must keep going, because we may + * have initially positioned to the start of the index. + */ + if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && + ScanDirectionIsBackward(dir)) + *continuescan = false; + } + else + { + /* + * Since NULLs are sorted after non-NULLs, we know we have + * reached the upper limit of the range of values for this + * index attr. On a forward scan, we can stop if this qual is + * one of the "must match" subset. We can stop regardless of + * whether the qual is > or <, so long as it's required, + * because it's not possible for any future tuples to pass. On + * a backward scan, however, we must keep going, because we + * may have initially positioned to the end of the index. + */ + if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && + ScanDirectionIsForward(dir)) + *continuescan = false; + } + + /* + * In any case, this indextuple doesn't match the qual. + */ + return false; + } + + if (subkey->sk_flags & SK_ISNULL) + { + /* + * Unlike the simple-scankey case, this isn't a disallowed case. + * But it can never match. If all the earlier row comparison + * columns are required for the scan direction, we can stop the + * scan, because there can't be another tuple that will succeed. + */ + if (subkey != (ScanKey) DatumGetPointer(skey->sk_argument)) + subkey--; + if ((subkey->sk_flags & SK_BT_REQFWD) && + ScanDirectionIsForward(dir)) + *continuescan = false; + else if ((subkey->sk_flags & SK_BT_REQBKWD) && + ScanDirectionIsBackward(dir)) + *continuescan = false; + return false; + } + + /* Perform the test --- three-way comparison not bool operator */ + cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func, + subkey->sk_collation, + datum, + subkey->sk_argument)); + + if (subkey->sk_flags & SK_BT_DESC) + INVERT_COMPARE_RESULT(cmpresult); + + /* Done comparing if unequal, else advance to next column */ + if (cmpresult != 0) + break; + + if (subkey->sk_flags & SK_ROW_END) + break; + subkey++; + } + + /* + * At this point cmpresult indicates the overall result of the row + * comparison, and subkey points to the deciding column (or the last + * column if the result is "="). 
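+ *
+ * For example, evaluating "(a, b) > (1, 2)" against a tuple with a = 1
+ * falls through to the b column, so cmpresult reflects b versus 2 and the
+ * ">" branch of the switch below decides the outcome.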
+ */ + switch (subkey->sk_strategy) + { + /* EQ and NE cases aren't allowed here */ + case BTLessStrategyNumber: + result = (cmpresult < 0); + break; + case BTLessEqualStrategyNumber: + result = (cmpresult <= 0); + break; + case BTGreaterEqualStrategyNumber: + result = (cmpresult >= 0); + break; + case BTGreaterStrategyNumber: + result = (cmpresult > 0); + break; + default: + elog(ERROR, "unrecognized RowCompareType: %d", + (int) subkey->sk_strategy); + result = 0; /* keep compiler quiet */ + break; + } + + if (!result) + { + /* + * Tuple fails this qual. If it's a required qual for the current + * scan direction, then we can conclude no further tuples will pass, + * either. Note we have to look at the deciding column, not + * necessarily the first or last column of the row condition. + */ + if ((subkey->sk_flags & SK_BT_REQFWD) && + ScanDirectionIsForward(dir)) + *continuescan = false; + else if ((subkey->sk_flags & SK_BT_REQBKWD) && + ScanDirectionIsBackward(dir)) + *continuescan = false; + } + + return result; +} + +/* + * _bt_killitems - set LP_DEAD state for items an indexscan caller has + * told us were killed + * + * scan->opaque, referenced locally through so, contains information about the + * current page and killed tuples thereon (generally, this should only be + * called if so->numKilled > 0). + * + * The caller does not have a lock on the page and may or may not have the + * page pinned in a buffer. Note that read-lock is sufficient for setting + * LP_DEAD status (which is only a hint). + * + * We match items by heap TID before assuming they are the right ones to + * delete. We cope with cases where items have moved right due to insertions. + * If an item has moved off the current page due to a split, we'll fail to + * find it and do nothing (this is not an error case --- we assume the item + * will eventually get marked in a future indexscan). + * + * Note that if we hold a pin on the target page continuously from initially + * reading the items until applying this function, VACUUM cannot have deleted + * any items from the page, and so there is no need to search left from the + * recorded offset. (This observation also guarantees that the item is still + * the right one to delete, which might otherwise be questionable since heap + * TIDs can get recycled.) This holds true even if the page has been modified + * by inserts and page splits, so there is no need to consult the LSN. + * + * If the pin was released after reading the page, then we re-read it. If it + * has been modified since we read it (as determined by the LSN), we dare not + * flag any entries because it is possible that the old entry was vacuumed + * away and the TID was re-used by a completely different heap tuple. + */ +void +_bt_killitems(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Page page; + BTPageOpaque opaque; + OffsetNumber minoff; + OffsetNumber maxoff; + int i; + int numKilled = so->numKilled; + bool killedsomething = false; + bool droppedpin PG_USED_FOR_ASSERTS_ONLY; + + Assert(BTScanPosIsValid(so->currPos)); + + /* + * Always reset the scan state, so we don't look for same items on other + * pages. + */ + so->numKilled = 0; + + if (BTScanPosIsPinned(so->currPos)) + { + /* + * We have held the pin on this page since we read the index tuples, + * so all we need to do is lock it. The pin will have prevented + * re-use of any TID on the page, so there is no need to check the + * LSN. 
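+ *
+ * (Btree VACUUM must obtain a cleanup lock on a leaf page before it can
+ * remove items from it, and our pin prevents that, so no TID on this page
+ * can have been recycled since we read it.)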
+ */ + droppedpin = false; + _bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ); + + page = BufferGetPage(so->currPos.buf); + } + else + { + Buffer buf; + + droppedpin = true; + /* Attempt to re-read the buffer, getting pin and lock. */ + buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ); + + page = BufferGetPage(buf); + if (BufferGetLSNAtomic(buf) == so->currPos.lsn) + so->currPos.buf = buf; + else + { + /* Modified while not pinned means hinting is not safe. */ + _bt_relbuf(scan->indexRelation, buf); + return; + } + } + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + + for (i = 0; i < numKilled; i++) + { + int itemIndex = so->killedItems[i]; + BTScanPosItem *kitem = &so->currPos.items[itemIndex]; + OffsetNumber offnum = kitem->indexOffset; + + Assert(itemIndex >= so->currPos.firstItem && + itemIndex <= so->currPos.lastItem); + if (offnum < minoff) + continue; /* pure paranoia */ + while (offnum <= maxoff) + { + ItemId iid = PageGetItemId(page, offnum); + IndexTuple ituple = (IndexTuple) PageGetItem(page, iid); + bool killtuple = false; + + if (BTreeTupleIsPosting(ituple)) + { + int pi = i + 1; + int nposting = BTreeTupleGetNPosting(ituple); + int j; + + /* + * We rely on the convention that heap TIDs in the scanpos + * items array are stored in ascending heap TID order for a + * group of TIDs that originally came from a posting list + * tuple. This convention even applies during backwards + * scans, where returning the TIDs in descending order might + * seem more natural. This is about effectiveness, not + * correctness. + * + * Note that the page may have been modified in almost any way + * since we first read it (in the !droppedpin case), so it's + * possible that this posting list tuple wasn't a posting list + * tuple when we first encountered its heap TIDs. + */ + for (j = 0; j < nposting; j++) + { + ItemPointer item = BTreeTupleGetPostingN(ituple, j); + + if (!ItemPointerEquals(item, &kitem->heapTid)) + break; /* out of posting list loop */ + + /* + * kitem must have matching offnum when heap TIDs match, + * though only in the common case where the page can't + * have been concurrently modified + */ + Assert(kitem->indexOffset == offnum || !droppedpin); + + /* + * Read-ahead to later kitems here. + * + * We rely on the assumption that not advancing kitem here + * will prevent us from considering the posting list tuple + * fully dead by not matching its next heap TID in next + * loop iteration. + * + * If, on the other hand, this is the final heap TID in + * the posting list tuple, then tuple gets killed + * regardless (i.e. we handle the case where the last + * kitem is also the last heap TID in the last index tuple + * correctly -- posting tuple still gets killed). + */ + if (pi < numKilled) + kitem = &so->currPos.items[so->killedItems[pi++]]; + } + + /* + * Don't bother advancing the outermost loop's int iterator to + * avoid processing killed items that relate to the same + * offnum/posting list tuple. This micro-optimization hardly + * seems worth it. (Further iterations of the outermost loop + * will fail to match on this same posting list's first heap + * TID instead, so we'll advance to the next offnum/index + * tuple pretty quickly.) + */ + if (j == nposting) + killtuple = true; + } + else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid)) + killtuple = true; + + /* + * Mark index item as dead, if it isn't already. 
Since this + * happens while holding a buffer lock possibly in shared mode, + * it's possible that multiple processes attempt to do this + * simultaneously, leading to multiple full-page images being sent + * to WAL (if wal_log_hints or data checksums are enabled), which + * is undesirable. + */ + if (killtuple && !ItemIdIsDead(iid)) + { + /* found the item/all posting list items */ + ItemIdMarkDead(iid); + killedsomething = true; + break; /* out of inner search loop */ + } + offnum = OffsetNumberNext(offnum); + } + } + + /* + * Since this can be redone later if needed, mark as dirty hint. + * + * Whenever we mark anything LP_DEAD, we also set the page's + * BTP_HAS_GARBAGE flag, which is likewise just a hint. (Note that we + * only rely on the page-level flag in !heapkeyspace indexes.) + */ + if (killedsomething) + { + opaque->btpo_flags |= BTP_HAS_GARBAGE; + MarkBufferDirtyHint(so->currPos.buf, true); + } + + _bt_unlockbuf(scan->indexRelation, so->currPos.buf); +} + + +/* + * The following routines manage a shared-memory area in which we track + * assignment of "vacuum cycle IDs" to currently-active btree vacuuming + * operations. There is a single counter which increments each time we + * start a vacuum to assign it a cycle ID. Since multiple vacuums could + * be active concurrently, we have to track the cycle ID for each active + * vacuum; this requires at most MaxBackends entries (usually far fewer). + * We assume at most one vacuum can be active for a given index. + * + * Access to the shared memory area is controlled by BtreeVacuumLock. + * In principle we could use a separate lmgr locktag for each index, + * but a single LWLock is much cheaper, and given the short time that + * the lock is ever held, the concurrency hit should be minimal. + */ + +typedef struct BTOneVacInfo +{ + LockRelId relid; /* global identifier of an index */ + BTCycleId cycleid; /* cycle ID for its active VACUUM */ +} BTOneVacInfo; + +typedef struct BTVacInfo +{ + BTCycleId cycle_ctr; /* cycle ID most recently assigned */ + int num_vacuums; /* number of currently active VACUUMs */ + int max_vacuums; /* allocated length of vacuums[] array */ + BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER]; +} BTVacInfo; + +static BTVacInfo *btvacinfo; + + +/* + * _bt_vacuum_cycleid --- get the active vacuum cycle ID for an index, + * or zero if there is no active VACUUM + * + * Note: for correct interlocking, the caller must already hold pin and + * exclusive lock on each buffer it will store the cycle ID into. This + * ensures that even if a VACUUM starts immediately afterwards, it cannot + * process those pages until the page split is complete. + */ +BTCycleId +_bt_vacuum_cycleid(Relation rel) +{ + BTCycleId result = 0; + int i; + + /* Share lock is enough since this is a read-only operation */ + LWLockAcquire(BtreeVacuumLock, LW_SHARED); + + for (i = 0; i < btvacinfo->num_vacuums; i++) + { + BTOneVacInfo *vac = &btvacinfo->vacuums[i]; + + if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId && + vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId) + { + result = vac->cycleid; + break; + } + } + + LWLockRelease(BtreeVacuumLock); + return result; +} + +/* + * _bt_start_vacuum --- assign a cycle ID to a just-starting VACUUM operation + * + * Note: the caller must guarantee that it will eventually call + * _bt_end_vacuum, else we'll permanently leak an array slot. 
To ensure + * that this happens even in elog(FATAL) scenarios, the appropriate coding + * is not just a PG_TRY, but + * PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel)) + */ +BTCycleId +_bt_start_vacuum(Relation rel) +{ + BTCycleId result; + int i; + BTOneVacInfo *vac; + + LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE); + + /* + * Assign the next cycle ID, being careful to avoid zero as well as the + * reserved high values. + */ + result = ++(btvacinfo->cycle_ctr); + if (result == 0 || result > MAX_BT_CYCLE_ID) + result = btvacinfo->cycle_ctr = 1; + + /* Let's just make sure there's no entry already for this index */ + for (i = 0; i < btvacinfo->num_vacuums; i++) + { + vac = &btvacinfo->vacuums[i]; + if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId && + vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId) + { + /* + * Unlike most places in the backend, we have to explicitly + * release our LWLock before throwing an error. This is because + * we expect _bt_end_vacuum() to be called before transaction + * abort cleanup can run to release LWLocks. + */ + LWLockRelease(BtreeVacuumLock); + elog(ERROR, "multiple active vacuums for index \"%s\"", + RelationGetRelationName(rel)); + } + } + + /* OK, add an entry */ + if (btvacinfo->num_vacuums >= btvacinfo->max_vacuums) + { + LWLockRelease(BtreeVacuumLock); + elog(ERROR, "out of btvacinfo slots"); + } + vac = &btvacinfo->vacuums[btvacinfo->num_vacuums]; + vac->relid = rel->rd_lockInfo.lockRelId; + vac->cycleid = result; + btvacinfo->num_vacuums++; + + LWLockRelease(BtreeVacuumLock); + return result; +} + +/* + * _bt_end_vacuum --- mark a btree VACUUM operation as done + * + * Note: this is deliberately coded not to complain if no entry is found; + * this allows the caller to put PG_TRY around the start_vacuum operation. + */ +void +_bt_end_vacuum(Relation rel) +{ + int i; + + LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE); + + /* Find the array entry */ + for (i = 0; i < btvacinfo->num_vacuums; i++) + { + BTOneVacInfo *vac = &btvacinfo->vacuums[i]; + + if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId && + vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId) + { + /* Remove it by shifting down the last entry */ + *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1]; + btvacinfo->num_vacuums--; + break; + } + } + + LWLockRelease(BtreeVacuumLock); +} + +/* + * _bt_end_vacuum wrapped as an on_shmem_exit callback function + */ +void +_bt_end_vacuum_callback(int code, Datum arg) +{ + _bt_end_vacuum((Relation) DatumGetPointer(arg)); +} + +/* + * BTreeShmemSize --- report amount of shared memory space needed + */ +Size +BTreeShmemSize(void) +{ + Size size; + + size = offsetof(BTVacInfo, vacuums); + size = add_size(size, mul_size(MaxBackends, sizeof(BTOneVacInfo))); + return size; +} + +/* + * BTreeShmemInit --- initialize this module's shared memory + */ +void +BTreeShmemInit(void) +{ + bool found; + + btvacinfo = (BTVacInfo *) ShmemInitStruct("BTree Vacuum State", + BTreeShmemSize(), + &found); + + if (!IsUnderPostmaster) + { + /* Initialize shared memory area */ + Assert(!found); + + /* + * It doesn't really matter what the cycle counter starts at, but + * having it always start the same doesn't seem good. Seed with + * low-order bits of time() instead. 
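+ *
+ * (Any starting value works: _bt_start_vacuum() forces the counter back
+ * into the valid 1..MAX_BT_CYCLE_ID range whenever an increment takes it
+ * outside.)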
+ */ + btvacinfo->cycle_ctr = (BTCycleId) time(NULL); + + btvacinfo->num_vacuums = 0; + btvacinfo->max_vacuums = MaxBackends; + } + else + Assert(found); +} + +bytea * +btoptions(Datum reloptions, bool validate) +{ + static const relopt_parse_elt tab[] = { + {"fillfactor", RELOPT_TYPE_INT, offsetof(BTOptions, fillfactor)}, + {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL, + offsetof(BTOptions, vacuum_cleanup_index_scale_factor)}, + {"deduplicate_items", RELOPT_TYPE_BOOL, + offsetof(BTOptions, deduplicate_items)} + + }; + + return (bytea *) build_reloptions(reloptions, validate, + RELOPT_KIND_BTREE, + sizeof(BTOptions), + tab, lengthof(tab)); + +} + +/* + * btproperty() -- Check boolean properties of indexes. + * + * This is optional, but handling AMPROP_RETURNABLE here saves opening the rel + * to call btcanreturn. + */ +bool +btproperty(Oid index_oid, int attno, + IndexAMProperty prop, const char *propname, + bool *res, bool *isnull) +{ + switch (prop) + { + case AMPROP_RETURNABLE: + /* answer only for columns, not AM or whole index */ + if (attno == 0) + return false; + /* otherwise, btree can always return data */ + *res = true; + return true; + + default: + return false; /* punt to generic code */ + } +} + +/* + * btbuildphasename() -- Return name of index build phase. + */ +char * +btbuildphasename(int64 phasenum) +{ + switch (phasenum) + { + case PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE: + return "initializing"; + case PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN: + return "scanning table"; + case PROGRESS_BTREE_PHASE_PERFORMSORT_1: + return "sorting live tuples"; + case PROGRESS_BTREE_PHASE_PERFORMSORT_2: + return "sorting dead tuples"; + case PROGRESS_BTREE_PHASE_LEAF_LOAD: + return "loading tuples in tree"; + default: + return NULL; + } +} + +/* + * _bt_truncate() -- create tuple without unneeded suffix attributes. + * + * Returns truncated pivot index tuple allocated in caller's memory context, + * with key attributes copied from caller's firstright argument. If rel is + * an INCLUDE index, non-key attributes will definitely be truncated away, + * since they're not part of the key space. More aggressive suffix + * truncation can take place when it's clear that the returned tuple does not + * need one or more suffix key attributes. We only need to keep firstright + * attributes up to and including the first non-lastleft-equal attribute. + * Caller's insertion scankey is used to compare the tuples; the scankey's + * argument values are not considered here. + * + * Note that returned tuple's t_tid offset will hold the number of attributes + * present, so the original item pointer offset is not represented. Caller + * should only change truncated tuple's downlink. Note also that truncated + * key attributes are treated as containing "minus infinity" values by + * _bt_compare(). + * + * In the worst case (when a heap TID must be appended to distinguish lastleft + * from firstright), the size of the returned tuple is the size of firstright + * plus the size of an additional MAXALIGN()'d item pointer. This guarantee + * is important, since callers need to stay under the 1/3 of a page + * restriction on tuple size. If this routine is ever taught to truncate + * within an attribute/datum, it will need to avoid returning an enlarged + * tuple to caller when truncation + TOAST compression ends up enlarging the + * final datum. 
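+ *
+ * For example, if lastleft is ("apple", 5) and firstright is ("banana", 1)
+ * in a two-key-column index, the first attribute already distinguishes the
+ * halves, so the new pivot keeps just "banana" and truncates the rest.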
+ */ +IndexTuple +_bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, + BTScanInsert itup_key) +{ + TupleDesc itupdesc = RelationGetDescr(rel); + int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + int keepnatts; + IndexTuple pivot; + IndexTuple tidpivot; + ItemPointer pivotheaptid; + Size newsize; + + /* + * We should only ever truncate non-pivot tuples from leaf pages. It's + * never okay to truncate when splitting an internal page. + */ + Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright)); + + /* Determine how many attributes must be kept in truncated tuple */ + keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key); + +#ifdef DEBUG_NO_TRUNCATE + /* Force truncation to be ineffective for testing purposes */ + keepnatts = nkeyatts + 1; +#endif + + pivot = index_truncate_tuple(itupdesc, firstright, + Min(keepnatts, nkeyatts)); + + if (BTreeTupleIsPosting(pivot)) + { + /* + * index_truncate_tuple() just returns a straight copy of firstright + * when it has no attributes to truncate. When that happens, we may + * need to truncate away a posting list here instead. + */ + Assert(keepnatts == nkeyatts || keepnatts == nkeyatts + 1); + Assert(IndexRelationGetNumberOfAttributes(rel) == nkeyatts); + pivot->t_info &= ~INDEX_SIZE_MASK; + pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright)); + } + + /* + * If there is a distinguishing key attribute within pivot tuple, we're + * done + */ + if (keepnatts <= nkeyatts) + { + BTreeTupleSetNAtts(pivot, keepnatts, false); + return pivot; + } + + /* + * We have to store a heap TID in the new pivot tuple, since no non-TID + * key attribute value in firstright distinguishes the right side of the + * split from the left side. nbtree conceptualizes this case as an + * inability to truncate away any key attributes, since heap TID is + * treated as just another key attribute (despite lacking a pg_attribute + * entry). + * + * Use enlarged space that holds a copy of pivot. We need the extra space + * to store a heap TID at the end (using the special pivot tuple + * representation). Note that the original pivot already has firstright's + * possible posting list/non-key attribute values removed at this point. + */ + newsize = MAXALIGN(IndexTupleSize(pivot)) + MAXALIGN(sizeof(ItemPointerData)); + tidpivot = palloc0(newsize); + memcpy(tidpivot, pivot, MAXALIGN(IndexTupleSize(pivot))); + /* Cannot leak memory here */ + pfree(pivot); + + /* + * Store all of firstright's key attribute values plus a tiebreaker heap + * TID value in enlarged pivot tuple + */ + tidpivot->t_info &= ~INDEX_SIZE_MASK; + tidpivot->t_info |= newsize; + BTreeTupleSetNAtts(tidpivot, nkeyatts, true); + pivotheaptid = BTreeTupleGetHeapTID(tidpivot); + + /* + * Lehman & Yao use lastleft as the leaf high key in all cases, but don't + * consider suffix truncation. It seems like a good idea to follow that + * example in cases where no truncation takes place -- use lastleft's heap + * TID. (This is also the closest value to negative infinity that's + * legally usable.) + */ + ItemPointerCopy(BTreeTupleGetMaxHeapTID(lastleft), pivotheaptid); + + /* + * We're done. Assert() that heap TID invariants hold before returning. + * + * Lehman and Yao require that the downlink to the right page, which is to + * be inserted into the parent page in the second phase of a page split be + * a strict lower bound on items on the right page, and a non-strict upper + * bound for items on the left page. 
Assert that heap TIDs follow these + * invariants, since a heap TID value is apparently needed as a + * tiebreaker. + */ +#ifndef DEBUG_NO_TRUNCATE + Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(lastleft), + BTreeTupleGetHeapTID(firstright)) < 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(lastleft)) >= 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(firstright)) < 0); +#else + + /* + * Those invariants aren't guaranteed to hold for lastleft + firstright + * heap TID attribute values when they're considered here only because + * DEBUG_NO_TRUNCATE is defined (a heap TID is probably not actually + * needed as a tiebreaker). DEBUG_NO_TRUNCATE must therefore use a heap + * TID value that always works as a strict lower bound for items to the + * right. In particular, it must avoid using firstright's leading key + * attribute values along with lastleft's heap TID value when lastleft's + * TID happens to be greater than firstright's TID. + */ + ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid); + + /* + * Pivot heap TID should never be fully equal to firstright. Note that + * the pivot heap TID will still end up equal to lastleft's heap TID when + * that's the only usable value. + */ + ItemPointerSetOffsetNumber(pivotheaptid, + OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid))); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(firstright)) < 0); +#endif + + return tidpivot; +} + +/* + * _bt_keep_natts - how many key attributes to keep when truncating. + * + * Caller provides two tuples that enclose a split point. Caller's insertion + * scankey is used to compare the tuples; the scankey's argument values are + * not considered here. + * + * This can return a number of attributes that is one greater than the + * number of key attributes for the index relation. This indicates that the + * caller must use a heap TID as a unique-ifier in new pivot tuple. + */ +static int +_bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, + BTScanInsert itup_key) +{ + int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + TupleDesc itupdesc = RelationGetDescr(rel); + int keepnatts; + ScanKey scankey; + + /* + * _bt_compare() treats truncated key attributes as having the value minus + * infinity, which would break searches within !heapkeyspace indexes. We + * must still truncate away non-key attribute values, though. + */ + if (!itup_key->heapkeyspace) + return nkeyatts; + + scankey = itup_key->scankeys; + keepnatts = 1; + for (int attnum = 1; attnum <= nkeyatts; attnum++, scankey++) + { + Datum datum1, + datum2; + bool isNull1, + isNull2; + + datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); + datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); + + if (isNull1 != isNull2) + break; + + if (!isNull1 && + DatumGetInt32(FunctionCall2Coll(&scankey->sk_func, + scankey->sk_collation, + datum1, + datum2)) != 0) + break; + + keepnatts++; + } + + /* + * Assert that _bt_keep_natts_fast() agrees with us in passing. This is + * expected in an allequalimage index. + */ + Assert(!itup_key->allequalimage || + keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright)); + + return keepnatts; +} + +/* + * _bt_keep_natts_fast - fast bitwise variant of _bt_keep_natts. + * + * This is exported so that a candidate split point can have its effect on + * suffix truncation inexpensively evaluated ahead of time when finding a + * split location. 
A naive bitwise approach to datum comparisons is used to + * save cycles. + * + * The approach taken here usually provides the same answer as _bt_keep_natts + * will (for the same pair of tuples from a heapkeyspace index), since the + * majority of btree opclasses can never indicate that two datums are equal + * unless they're bitwise equal after detoasting. When an index only has + * "equal image" columns, routine is guaranteed to give the same result as + * _bt_keep_natts would. + * + * Callers can rely on the fact that attributes considered equal here are + * definitely also equal according to _bt_keep_natts, even when the index uses + * an opclass or collation that is not "allequalimage"/deduplication-safe. + * This weaker guarantee is good enough for nbtsplitloc.c caller, since false + * negatives generally only have the effect of making leaf page splits use a + * more balanced split point. + */ +int +_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) +{ + TupleDesc itupdesc = RelationGetDescr(rel); + int keysz = IndexRelationGetNumberOfKeyAttributes(rel); + int keepnatts; + + keepnatts = 1; + for (int attnum = 1; attnum <= keysz; attnum++) + { + Datum datum1, + datum2; + bool isNull1, + isNull2; + Form_pg_attribute att; + + datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); + datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); + att = TupleDescAttr(itupdesc, attnum - 1); + + if (isNull1 != isNull2) + break; + + if (!isNull1 && + !datum_image_eq(datum1, datum2, att->attbyval, att->attlen)) + break; + + keepnatts++; + } + + return keepnatts; +} + +/* + * _bt_check_natts() -- Verify tuple has expected number of attributes. + * + * Returns value indicating if the expected number of attributes were found + * for a particular offset on page. This can be used as a general purpose + * sanity check. + * + * Testing a tuple directly with BTreeTupleGetNAtts() should generally be + * preferred to calling here. That's usually more convenient, and is always + * more explicit. Call here instead when offnum's tuple may be a negative + * infinity tuple that uses the pre-v11 on-disk representation, or when a low + * context check is appropriate. This routine is as strict as possible about + * what is expected on each version of btree. 
+ */ +bool +_bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum) +{ + int16 natts = IndexRelationGetNumberOfAttributes(rel); + int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + IndexTuple itup; + int tupnatts; + + /* + * We cannot reliably test a deleted or half-dead page, since they have + * dummy high keys + */ + if (P_IGNORE(opaque)) + return true; + + Assert(offnum >= FirstOffsetNumber && + offnum <= PageGetMaxOffsetNumber(page)); + + /* + * Mask allocated for number of keys in index tuple must be able to fit + * maximum possible number of index attributes + */ + StaticAssertStmt(BT_OFFSET_MASK >= INDEX_MAX_KEYS, + "BT_OFFSET_MASK can't fit INDEX_MAX_KEYS"); + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + tupnatts = BTreeTupleGetNAtts(itup, rel); + + /* !heapkeyspace indexes do not support deduplication */ + if (!heapkeyspace && BTreeTupleIsPosting(itup)) + return false; + + /* Posting list tuples should never have "pivot heap TID" bit set */ + if (BTreeTupleIsPosting(itup) && + (ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & + BT_PIVOT_HEAP_TID_ATTR) != 0) + return false; + + /* INCLUDE indexes do not support deduplication */ + if (natts != nkeyatts && BTreeTupleIsPosting(itup)) + return false; + + if (P_ISLEAF(opaque)) + { + if (offnum >= P_FIRSTDATAKEY(opaque)) + { + /* + * Non-pivot tuple should never be explicitly marked as a pivot + * tuple + */ + if (BTreeTupleIsPivot(itup)) + return false; + + /* + * Leaf tuples that are not the page high key (non-pivot tuples) + * should never be truncated. (Note that tupnatts must have been + * inferred, even with a posting list tuple, because only pivot + * tuples store tupnatts directly.) + */ + return tupnatts == natts; + } + else + { + /* + * Rightmost page doesn't contain a page high key, so tuple was + * checked above as ordinary leaf tuple + */ + Assert(!P_RIGHTMOST(opaque)); + + /* + * !heapkeyspace high key tuple contains only key attributes. Note + * that tupnatts will only have been explicitly represented in + * !heapkeyspace indexes that happen to have non-key attributes. + */ + if (!heapkeyspace) + return tupnatts == nkeyatts; + + /* Use generic heapkeyspace pivot tuple handling */ + } + } + else /* !P_ISLEAF(opaque) */ + { + if (offnum == P_FIRSTDATAKEY(opaque)) + { + /* + * The first tuple on any internal page (possibly the first after + * its high key) is its negative infinity tuple. Negative + * infinity tuples are always truncated to zero attributes. They + * are a particular kind of pivot tuple. + */ + if (heapkeyspace) + return tupnatts == 0; + + /* + * The number of attributes won't be explicitly represented if the + * negative infinity tuple was generated during a page split that + * occurred with a version of Postgres before v11. There must be + * a problem when there is an explicit representation that is + * non-zero, or when there is no explicit representation and the + * tuple is evidently not a pre-pg_upgrade tuple. + * + * Prior to v11, downlinks always had P_HIKEY as their offset. + * Accept that as an alternative indication of a valid + * !heapkeyspace negative infinity tuple. + */ + return tupnatts == 0 || + ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY; + } + else + { + /* + * !heapkeyspace downlink tuple with separator key contains only + * key attributes. 
Note that tupnatts will only have been + * explicitly represented in !heapkeyspace indexes that happen to + * have non-key attributes. + */ + if (!heapkeyspace) + return tupnatts == nkeyatts; + + /* Use generic heapkeyspace pivot tuple handling */ + } + + } + + /* Handle heapkeyspace pivot tuples (excluding minus infinity items) */ + Assert(heapkeyspace); + + /* + * Explicit representation of the number of attributes is mandatory with + * heapkeyspace index pivot tuples, regardless of whether or not there are + * non-key attributes. + */ + if (!BTreeTupleIsPivot(itup)) + return false; + + /* Pivot tuple should not use posting list representation (redundant) */ + if (BTreeTupleIsPosting(itup)) + return false; + + /* + * Heap TID is a tiebreaker key attribute, so it cannot be untruncated + * when any other key attribute is truncated + */ + if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts) + return false; + + /* + * Pivot tuple must have at least one untruncated key attribute (minus + * infinity pivot tuples are the only exception). Pivot tuples can never + * represent that there is a value present for a key attribute that + * exceeds pg_index.indnkeyatts for the index. + */ + return tupnatts > 0 && tupnatts <= nkeyatts; +} + +/* + * + * _bt_check_third_page() -- check whether tuple fits on a btree page at all. + * + * We actually need to be able to fit three items on every page, so restrict + * any one item to 1/3 the per-page available space. Note that itemsz should + * not include the ItemId overhead. + * + * It might be useful to apply TOAST methods rather than throw an error here. + * Using out of line storage would break assumptions made by suffix truncation + * and by contrib/amcheck, though. + */ +void +_bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, + Page page, IndexTuple newtup) +{ + Size itemsz; + BTPageOpaque opaque; + + itemsz = MAXALIGN(IndexTupleSize(newtup)); + + /* Double check item size against limit */ + if (itemsz <= BTMaxItemSize(page)) + return; + + /* + * Tuple is probably too large to fit on page, but it's possible that the + * index uses version 2 or version 3, or that page is an internal page, in + * which case a slightly higher limit applies. + */ + if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid(page)) + return; + + /* + * Internal page insertions cannot fail here, because that would mean that + * an earlier leaf level insertion that should have failed didn't + */ + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (!P_ISLEAF(opaque)) + elog(ERROR, "cannot insert oversized tuple of size %zu on internal page of index \"%s\"", + itemsz, RelationGetRelationName(rel)); + + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"", + itemsz, + needheaptidspace ? BTREE_VERSION : BTREE_NOVAC_VERSION, + needheaptidspace ? BTMaxItemSize(page) : + BTMaxItemSizeNoHeapTid(page), + RelationGetRelationName(rel)), + errdetail("Index row references tuple (%u,%u) in relation \"%s\".", + ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)), + ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)), + RelationGetRelationName(heap)), + errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n" + "Consider a function index of an MD5 hash of the value, " + "or use full text indexing."), + errtableconstraint(heap, RelationGetRelationName(rel)))); +} + +/* + * Are all attributes in rel "equality is image equality" attributes? 
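+ *
+ * (Being an "equal image" attribute means that datums which compare as equal
+ * are also bitwise equal after detoasting.  For example, numeric_ops does not
+ * qualify, because numeric datums such as 1.0 and 1.00 compare as equal while
+ * differing in display scale and hence in binary representation; text under a
+ * nondeterministic collation fails for a similar reason.)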
+ * + * We use each attribute's BTEQUALIMAGE_PROC opclass procedure. If any + * opclass either lacks a BTEQUALIMAGE_PROC procedure or returns false, we + * return false; otherwise we return true. + * + * Returned boolean value is stored in index metapage during index builds. + * Deduplication can only be used when we return true. + */ +bool +_bt_allequalimage(Relation rel, bool debugmessage) +{ + bool allequalimage = true; + + /* INCLUDE indexes don't support deduplication */ + if (IndexRelationGetNumberOfAttributes(rel) != + IndexRelationGetNumberOfKeyAttributes(rel)) + return false; + + /* + * There is no special reason why deduplication cannot work with system + * relations (i.e. with system catalog indexes and TOAST indexes). We + * deem deduplication unsafe for these indexes all the same, since the + * alternative is to force users to always use deduplication, without + * being able to opt out. (ALTER INDEX is not supported with system + * indexes, so users would have no way to set the deduplicate_items + * storage parameter to 'off'.) + */ + if (IsSystemRelation(rel)) + return false; + + for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(rel); i++) + { + Oid opfamily = rel->rd_opfamily[i]; + Oid opcintype = rel->rd_opcintype[i]; + Oid collation = rel->rd_indcollation[i]; + Oid equalimageproc; + + equalimageproc = get_opfamily_proc(opfamily, opcintype, opcintype, + BTEQUALIMAGE_PROC); + + /* + * If there is no BTEQUALIMAGE_PROC then deduplication is assumed to + * be unsafe. Otherwise, actually call proc and see what it says. + */ + if (!OidIsValid(equalimageproc) || + !DatumGetBool(OidFunctionCall1Coll(equalimageproc, collation, + ObjectIdGetDatum(opcintype)))) + { + allequalimage = false; + break; + } + } + + /* + * Don't elog() until here to avoid reporting on a system relation index + * or an INCLUDE index + */ + if (debugmessage) + { + if (allequalimage) + elog(DEBUG1, "index \"%s\" can safely use deduplication", + RelationGetRelationName(rel)); + else + elog(DEBUG1, "index \"%s\" cannot use deduplication", + RelationGetRelationName(rel)); + } + + return allequalimage; +} diff --git a/src/backend/access/nbtree/nbtvalidate.c b/src/backend/access/nbtree/nbtvalidate.c new file mode 100644 index 0000000..7acb64e --- /dev/null +++ b/src/backend/access/nbtree/nbtvalidate.c @@ -0,0 +1,380 @@ +/*------------------------------------------------------------------------- + * + * nbtvalidate.c + * Opclass validator for btree. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtvalidate.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/amvalidate.h" +#include "access/htup_details.h" +#include "access/nbtree.h" +#include "access/xact.h" +#include "catalog/pg_am.h" +#include "catalog/pg_amop.h" +#include "catalog/pg_amproc.h" +#include "catalog/pg_opclass.h" +#include "catalog/pg_opfamily.h" +#include "catalog/pg_type.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/regproc.h" +#include "utils/syscache.h" + + +/* + * Validator for a btree opclass. + * + * Some of the checks done here cover the whole opfamily, and therefore are + * redundant when checking each opclass in a family. But they don't run long + * enough to be much of a problem, so we accept the duplication rather than + * complicate the amvalidate API. 
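+ *
+ * (Like other index AM validators, this is normally reached through the
+ * SQL-callable amvalidate() function rather than called directly.)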
+ */ +bool +btvalidate(Oid opclassoid) +{ + bool result = true; + HeapTuple classtup; + Form_pg_opclass classform; + Oid opfamilyoid; + Oid opcintype; + char *opclassname; + HeapTuple familytup; + Form_pg_opfamily familyform; + char *opfamilyname; + CatCList *proclist, + *oprlist; + List *grouplist; + OpFamilyOpFuncGroup *opclassgroup; + List *familytypes; + int usefulgroups; + int i; + ListCell *lc; + + /* Fetch opclass information */ + classtup = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclassoid)); + if (!HeapTupleIsValid(classtup)) + elog(ERROR, "cache lookup failed for operator class %u", opclassoid); + classform = (Form_pg_opclass) GETSTRUCT(classtup); + + opfamilyoid = classform->opcfamily; + opcintype = classform->opcintype; + opclassname = NameStr(classform->opcname); + + /* Fetch opfamily information */ + familytup = SearchSysCache1(OPFAMILYOID, ObjectIdGetDatum(opfamilyoid)); + if (!HeapTupleIsValid(familytup)) + elog(ERROR, "cache lookup failed for operator family %u", opfamilyoid); + familyform = (Form_pg_opfamily) GETSTRUCT(familytup); + + opfamilyname = NameStr(familyform->opfname); + + /* Fetch all operators and support functions of the opfamily */ + oprlist = SearchSysCacheList1(AMOPSTRATEGY, ObjectIdGetDatum(opfamilyoid)); + proclist = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(opfamilyoid)); + + /* Check individual support functions */ + for (i = 0; i < proclist->n_members; i++) + { + HeapTuple proctup = &proclist->members[i]->tuple; + Form_pg_amproc procform = (Form_pg_amproc) GETSTRUCT(proctup); + bool ok; + + /* Check procedure numbers and function signatures */ + switch (procform->amprocnum) + { + case BTORDER_PROC: + ok = check_amproc_signature(procform->amproc, INT4OID, true, + 2, 2, procform->amproclefttype, + procform->amprocrighttype); + break; + case BTSORTSUPPORT_PROC: + ok = check_amproc_signature(procform->amproc, VOIDOID, true, + 1, 1, INTERNALOID); + break; + case BTINRANGE_PROC: + ok = check_amproc_signature(procform->amproc, BOOLOID, true, + 5, 5, + procform->amproclefttype, + procform->amproclefttype, + procform->amprocrighttype, + BOOLOID, BOOLOID); + break; + case BTEQUALIMAGE_PROC: + ok = check_amproc_signature(procform->amproc, BOOLOID, true, + 1, 1, OIDOID); + break; + case BTOPTIONS_PROC: + ok = check_amoptsproc_signature(procform->amproc); + break; + default: + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains function %s with invalid support number %d", + opfamilyname, "btree", + format_procedure(procform->amproc), + procform->amprocnum))); + result = false; + continue; /* don't want additional message */ + } + + if (!ok) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains function %s with wrong signature for support number %d", + opfamilyname, "btree", + format_procedure(procform->amproc), + procform->amprocnum))); + result = false; + } + } + + /* Check individual operators */ + for (i = 0; i < oprlist->n_members; i++) + { + HeapTuple oprtup = &oprlist->members[i]->tuple; + Form_pg_amop oprform = (Form_pg_amop) GETSTRUCT(oprtup); + + /* Check that only allowed strategy numbers exist */ + if (oprform->amopstrategy < 1 || + oprform->amopstrategy > BTMaxStrategyNumber) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains operator %s with invalid strategy number %d", + opfamilyname, "btree", + 
format_operator(oprform->amopopr), + oprform->amopstrategy))); + result = false; + } + + /* btree doesn't support ORDER BY operators */ + if (oprform->amoppurpose != AMOP_SEARCH || + OidIsValid(oprform->amopsortfamily)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains invalid ORDER BY specification for operator %s", + opfamilyname, "btree", + format_operator(oprform->amopopr)))); + result = false; + } + + /* Check operator signature --- same for all btree strategies */ + if (!check_amop_signature(oprform->amopopr, BOOLOID, + oprform->amoplefttype, + oprform->amoprighttype)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains operator %s with wrong signature", + opfamilyname, "btree", + format_operator(oprform->amopopr)))); + result = false; + } + } + + /* Now check for inconsistent groups of operators/functions */ + grouplist = identify_opfamily_groups(oprlist, proclist); + usefulgroups = 0; + opclassgroup = NULL; + familytypes = NIL; + foreach(lc, grouplist) + { + OpFamilyOpFuncGroup *thisgroup = (OpFamilyOpFuncGroup *) lfirst(lc); + + /* + * It is possible for an in_range support function to have a RHS type + * that is otherwise irrelevant to the opfamily --- for instance, SQL + * requires the datetime_ops opclass to have range support with an + * interval offset. So, if this group appears to contain only an + * in_range function, ignore it: it doesn't represent a pair of + * supported types. + */ + if (thisgroup->operatorset == 0 && + thisgroup->functionset == (1 << BTINRANGE_PROC)) + continue; + + /* Else count it as a relevant group */ + usefulgroups++; + + /* Remember the group exactly matching the test opclass */ + if (thisgroup->lefttype == opcintype && + thisgroup->righttype == opcintype) + opclassgroup = thisgroup; + + /* + * Identify all distinct data types handled in this opfamily. This + * implementation is O(N^2), but there aren't likely to be enough + * types in the family for it to matter. + */ + familytypes = list_append_unique_oid(familytypes, thisgroup->lefttype); + familytypes = list_append_unique_oid(familytypes, thisgroup->righttype); + + /* + * Complain if there seems to be an incomplete set of either operators + * or support functions for this datatype pair. The sortsupport, + * in_range, and equalimage functions are considered optional. 
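+ *
+ * (A complete operator set is the five btree search strategies <, <=, =,
+ * >= and >, tested as a bitmask below; BTORDER_PROC is the only required
+ * support function.)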
+ */ + if (thisgroup->operatorset != + ((1 << BTLessStrategyNumber) | + (1 << BTLessEqualStrategyNumber) | + (1 << BTEqualStrategyNumber) | + (1 << BTGreaterEqualStrategyNumber) | + (1 << BTGreaterStrategyNumber))) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s is missing operator(s) for types %s and %s", + opfamilyname, "btree", + format_type_be(thisgroup->lefttype), + format_type_be(thisgroup->righttype)))); + result = false; + } + if ((thisgroup->functionset & (1 << BTORDER_PROC)) == 0) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s is missing support function for types %s and %s", + opfamilyname, "btree", + format_type_be(thisgroup->lefttype), + format_type_be(thisgroup->righttype)))); + result = false; + } + } + + /* Check that the originally-named opclass is supported */ + /* (if group is there, we already checked it adequately above) */ + if (!opclassgroup) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator class \"%s\" of access method %s is missing operator(s)", + opclassname, "btree"))); + result = false; + } + + /* + * Complain if the opfamily doesn't have entries for all possible + * combinations of its supported datatypes. While missing cross-type + * operators are not fatal, they do limit the planner's ability to derive + * additional qual clauses from equivalence classes, so it seems + * reasonable to insist that all built-in btree opfamilies be complete. + */ + if (usefulgroups != (list_length(familytypes) * list_length(familytypes))) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s is missing cross-type operator(s)", + opfamilyname, "btree"))); + result = false; + } + + ReleaseCatCacheList(proclist); + ReleaseCatCacheList(oprlist); + ReleaseSysCache(familytup); + ReleaseSysCache(classtup); + + return result; +} + +/* + * Prechecking function for adding operators/functions to a btree opfamily. + */ +void +btadjustmembers(Oid opfamilyoid, + Oid opclassoid, + List *operators, + List *functions) +{ + Oid opcintype; + ListCell *lc; + + /* + * Btree operators and comparison support functions are always "loose" + * members of the opfamily if they are cross-type. If they are not + * cross-type, we prefer to tie them to the appropriate opclass ... but if + * the user hasn't created one, we can't do that, and must fall back to + * using the opfamily dependency. (We mustn't force creation of an + * opclass in such a case, as leaving an incomplete opclass laying about + * would be bad. Throwing an error is another undesirable alternative.) + * + * This behavior results in a bit of a dump/reload hazard, in that the + * order of restoring objects could affect what dependencies we end up + * with. pg_dump's existing behavior will preserve the dependency choices + * in most cases, but not if a cross-type operator has been bound tightly + * into an opclass. That's a mistake anyway, so silently "fixing" it + * isn't awful. + * + * Optional support functions are always "loose" family members. + * + * To avoid repeated lookups, we remember the most recently used opclass's + * input type. 
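+ *
+ * (As an example of the resulting dependencies: a cross-type operator such
+ * as int4 < int8 in the integer_ops family becomes a soft dependency on the
+ * opfamily, while int4 < int4 gets a hard dependency on the int4_ops
+ * opclass, assuming that opclass exists.)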
+ */ + if (OidIsValid(opclassoid)) + { + /* During CREATE OPERATOR CLASS, need CCI to see the pg_opclass row */ + CommandCounterIncrement(); + opcintype = get_opclass_input_type(opclassoid); + } + else + opcintype = InvalidOid; + + /* + * We handle operators and support functions almost identically, so rather + * than duplicate this code block, just join the lists. + */ + foreach(lc, list_concat_copy(operators, functions)) + { + OpFamilyMember *op = (OpFamilyMember *) lfirst(lc); + + if (op->is_func && op->number != BTORDER_PROC) + { + /* Optional support proc, so always a soft family dependency */ + op->ref_is_hard = false; + op->ref_is_family = true; + op->refobjid = opfamilyoid; + } + else if (op->lefttype != op->righttype) + { + /* Cross-type, so always a soft family dependency */ + op->ref_is_hard = false; + op->ref_is_family = true; + op->refobjid = opfamilyoid; + } + else + { + /* Not cross-type; is there a suitable opclass? */ + if (op->lefttype != opcintype) + { + /* Avoid repeating this expensive lookup, even if it fails */ + opcintype = op->lefttype; + opclassoid = opclass_for_family_datatype(BTREE_AM_OID, + opfamilyoid, + opcintype); + } + if (OidIsValid(opclassoid)) + { + /* Hard dependency on opclass */ + op->ref_is_hard = true; + op->ref_is_family = false; + op->refobjid = opclassoid; + } + else + { + /* We're stuck, so make a soft dependency on the opfamily */ + op->ref_is_hard = false; + op->ref_is_family = true; + op->refobjid = opfamilyoid; + } + } + } +} diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c new file mode 100644 index 0000000..786c08c --- /dev/null +++ b/src/backend/access/nbtree/nbtxlog.c @@ -0,0 +1,1126 @@ +/*------------------------------------------------------------------------- + * + * nbtxlog.c + * WAL replay logic for btrees. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtxlog.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/bufmask.h" +#include "access/nbtree.h" +#include "access/nbtxlog.h" +#include "access/transam.h" +#include "access/xlog.h" +#include "access/xlogutils.h" +#include "miscadmin.h" +#include "storage/procarray.h" +#include "utils/memutils.h" + +static MemoryContext opCtx; /* working memory for operations */ + +/* + * _bt_restore_page -- re-enter all the index tuples on a page + * + * The page is freshly init'd, and *from (length len) is a copy of what + * had been its upper part (pd_upper to pd_special). We assume that the + * tuples had been added to the page in item-number order, and therefore + * the one with highest item number appears first (lowest on the page). + */ +static void +_bt_restore_page(Page page, char *from, int len) +{ + IndexTupleData itupdata; + Size itemsz; + char *end = from + len; + Item items[MaxIndexTuplesPerPage]; + uint16 itemsizes[MaxIndexTuplesPerPage]; + int i; + int nitems; + + /* + * To get the items back in the original order, we add them to the page in + * reverse. To figure out where one tuple ends and another begins, we + * have to scan them in forward order first. + */ + i = 0; + while (from < end) + { + /* + * As we step through the items, 'from' won't always be properly + * aligned, so we need to use memcpy(). 
Further, we use Item (which + * is just a char*) here for our items array for the same reason; + * wouldn't want the compiler or anyone thinking that an item is + * aligned when it isn't. + */ + memcpy(&itupdata, from, sizeof(IndexTupleData)); + itemsz = IndexTupleSize(&itupdata); + itemsz = MAXALIGN(itemsz); + + items[i] = (Item) from; + itemsizes[i] = itemsz; + i++; + + from += itemsz; + } + nitems = i; + + for (i = nitems - 1; i >= 0; i--) + { + if (PageAddItem(page, items[i], itemsizes[i], nitems - i, + false, false) == InvalidOffsetNumber) + elog(PANIC, "_bt_restore_page: cannot add item to page"); + } +} + +static void +_bt_restore_meta(XLogReaderState *record, uint8 block_id) +{ + XLogRecPtr lsn = record->EndRecPtr; + Buffer metabuf; + Page metapg; + BTMetaPageData *md; + BTPageOpaque pageop; + xl_btree_metadata *xlrec; + char *ptr; + Size len; + + metabuf = XLogInitBufferForRedo(record, block_id); + ptr = XLogRecGetBlockData(record, block_id, &len); + + Assert(len == sizeof(xl_btree_metadata)); + Assert(BufferGetBlockNumber(metabuf) == BTREE_METAPAGE); + xlrec = (xl_btree_metadata *) ptr; + metapg = BufferGetPage(metabuf); + + _bt_pageinit(metapg, BufferGetPageSize(metabuf)); + + md = BTPageGetMeta(metapg); + md->btm_magic = BTREE_MAGIC; + md->btm_version = xlrec->version; + md->btm_root = xlrec->root; + md->btm_level = xlrec->level; + md->btm_fastroot = xlrec->fastroot; + md->btm_fastlevel = xlrec->fastlevel; + /* Cannot log BTREE_MIN_VERSION index metapage without upgrade */ + Assert(md->btm_version >= BTREE_NOVAC_VERSION); + md->btm_last_cleanup_num_delpages = xlrec->last_cleanup_num_delpages; + md->btm_last_cleanup_num_heap_tuples = -1.0; + md->btm_allequalimage = xlrec->allequalimage; + + pageop = (BTPageOpaque) PageGetSpecialPointer(metapg); + pageop->btpo_flags = BTP_META; + + /* + * Set pd_lower just past the end of the metadata. This is essential, + * because without doing so, metadata will be lost if xlog.c compresses + * the page. + */ + ((PageHeader) metapg)->pd_lower = + ((char *) md + sizeof(BTMetaPageData)) - (char *) metapg; + + PageSetLSN(metapg, lsn); + MarkBufferDirty(metabuf); + UnlockReleaseBuffer(metabuf); +} + +/* + * _bt_clear_incomplete_split -- clear INCOMPLETE_SPLIT flag on a page + * + * This is a common subroutine of the redo functions of all the WAL record + * types that can insert a downlink: insert, split, and newroot. + */ +static void +_bt_clear_incomplete_split(XLogReaderState *record, uint8 block_id) +{ + XLogRecPtr lsn = record->EndRecPtr; + Buffer buf; + + if (XLogReadBufferForRedo(record, block_id, &buf) == BLK_NEEDS_REDO) + { + Page page = (Page) BufferGetPage(buf); + BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page); + + Assert(P_INCOMPLETE_SPLIT(pageop)); + pageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; + + PageSetLSN(page, lsn); + MarkBufferDirty(buf); + } + if (BufferIsValid(buf)) + UnlockReleaseBuffer(buf); +} + +static void +btree_xlog_insert(bool isleaf, bool ismeta, bool posting, + XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record); + Buffer buffer; + Page page; + + /* + * Insertion to an internal page finishes an incomplete split at the child + * level. Clear the incomplete-split flag in the child. Note: during + * normal operation, the child and parent pages are locked at the same + * time (the locks are coupled), so that clearing the flag and inserting + * the downlink appear atomic to other backends. 
We don't bother with + * that during replay, because readers don't care about the + * incomplete-split flag and there cannot be updates happening. + */ + if (!isleaf) + _bt_clear_incomplete_split(record, 1); + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + Size datalen; + char *datapos = XLogRecGetBlockData(record, 0, &datalen); + + page = BufferGetPage(buffer); + + if (!posting) + { + /* Simple retail insertion */ + if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum, + false, false) == InvalidOffsetNumber) + elog(PANIC, "failed to add new item"); + } + else + { + ItemId itemid; + IndexTuple oposting, + newitem, + nposting; + uint16 postingoff; + + /* + * A posting list split occurred during leaf page insertion. WAL + * record data will start with an offset number representing the + * point in an existing posting list that a split occurs at. + * + * Use _bt_swap_posting() to repeat posting list split steps from + * primary. Note that newitem from WAL record is 'orignewitem', + * not the final version of newitem that is actually inserted on + * page. + */ + postingoff = *((uint16 *) datapos); + datapos += sizeof(uint16); + datalen -= sizeof(uint16); + + itemid = PageGetItemId(page, OffsetNumberPrev(xlrec->offnum)); + oposting = (IndexTuple) PageGetItem(page, itemid); + + /* Use mutable, aligned newitem copy in _bt_swap_posting() */ + Assert(isleaf && postingoff > 0); + newitem = CopyIndexTuple((IndexTuple) datapos); + nposting = _bt_swap_posting(newitem, oposting, postingoff); + + /* Replace existing posting list with post-split version */ + memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting))); + + /* Insert "final" new item (not orignewitem from WAL stream) */ + Assert(IndexTupleSize(newitem) == datalen); + if (PageAddItem(page, (Item) newitem, datalen, xlrec->offnum, + false, false) == InvalidOffsetNumber) + elog(PANIC, "failed to add posting split new item"); + } + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* + * Note: in normal operation, we'd update the metapage while still holding + * lock on the page we inserted into. But during replay it's not + * necessary to hold that lock, since no other index updates can be + * happening concurrently, and readers will cope fine with following an + * obsolete link from the metapage. + */ + if (ismeta) + _bt_restore_meta(record, 2); +} + +static void +btree_xlog_split(bool newitemonleft, XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record); + bool isleaf = (xlrec->level == 0); + Buffer buf; + Buffer rbuf; + Page rpage; + BTPageOpaque ropaque; + char *datapos; + Size datalen; + BlockNumber origpagenumber; + BlockNumber rightpagenumber; + BlockNumber spagenumber; + + XLogRecGetBlockTag(record, 0, NULL, NULL, &origpagenumber); + XLogRecGetBlockTag(record, 1, NULL, NULL, &rightpagenumber); + if (!XLogRecGetBlockTag(record, 2, NULL, NULL, &spagenumber)) + spagenumber = P_NONE; + + /* + * Clear the incomplete split flag on the appropriate child page one level + * down when origpage/buf is an internal page (there must have been + * cascading page splits during original execution in the event of an + * internal page split). This is like the corresponding btree_xlog_insert + * call for internal pages. 
We're not clearing the incomplete split flag + * for the current page split here (you can think of this as part of the + * insert of newitem that the page split action needs to perform in + * passing). + * + * Like in btree_xlog_insert, this can be done before locking other pages. + * We never need to couple cross-level locks in REDO routines. + */ + if (!isleaf) + _bt_clear_incomplete_split(record, 3); + + /* Reconstruct right (new) sibling page from scratch */ + rbuf = XLogInitBufferForRedo(record, 1); + datapos = XLogRecGetBlockData(record, 1, &datalen); + rpage = (Page) BufferGetPage(rbuf); + + _bt_pageinit(rpage, BufferGetPageSize(rbuf)); + ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); + + ropaque->btpo_prev = origpagenumber; + ropaque->btpo_next = spagenumber; + ropaque->btpo_level = xlrec->level; + ropaque->btpo_flags = isleaf ? BTP_LEAF : 0; + ropaque->btpo_cycleid = 0; + + _bt_restore_page(rpage, datapos, datalen); + + PageSetLSN(rpage, lsn); + MarkBufferDirty(rbuf); + + /* Now reconstruct original page (left half of split) */ + if (XLogReadBufferForRedo(record, 0, &buf) == BLK_NEEDS_REDO) + { + /* + * To retain the same physical order of the tuples that they had, we + * initialize a temporary empty page for the left page and add all the + * items to that in item number order. This mirrors how _bt_split() + * works. Retaining the same physical order makes WAL consistency + * checking possible. See also _bt_restore_page(), which does the + * same for the right page. + */ + Page origpage = (Page) BufferGetPage(buf); + BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage); + OffsetNumber off; + IndexTuple newitem = NULL, + left_hikey = NULL, + nposting = NULL; + Size newitemsz = 0, + left_hikeysz = 0; + Page leftpage; + OffsetNumber leftoff, + replacepostingoff = InvalidOffsetNumber; + + datapos = XLogRecGetBlockData(record, 0, &datalen); + + if (newitemonleft || xlrec->postingoff != 0) + { + newitem = (IndexTuple) datapos; + newitemsz = MAXALIGN(IndexTupleSize(newitem)); + datapos += newitemsz; + datalen -= newitemsz; + + if (xlrec->postingoff != 0) + { + ItemId itemid; + IndexTuple oposting; + + /* Posting list must be at offset number before new item's */ + replacepostingoff = OffsetNumberPrev(xlrec->newitemoff); + + /* Use mutable, aligned newitem copy in _bt_swap_posting() */ + newitem = CopyIndexTuple(newitem); + itemid = PageGetItemId(origpage, replacepostingoff); + oposting = (IndexTuple) PageGetItem(origpage, itemid); + nposting = _bt_swap_posting(newitem, oposting, + xlrec->postingoff); + } + } + + /* + * Extract left hikey and its size. We assume that 16-bit alignment + * is enough to apply IndexTupleSize (since it's fetching from a + * uint16 field). 
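+ * (IndexTupleSize() only reads the tuple's uint16 t_info field, so 2-byte
+ * alignment of datapos is sufficient for that; the size we pass to
+ * PageAddItem() below is MAXALIGN()'d separately.)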
+ */ + left_hikey = (IndexTuple) datapos; + left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey)); + datapos += left_hikeysz; + datalen -= left_hikeysz; + + Assert(datalen == 0); + + leftpage = PageGetTempPageCopySpecial(origpage); + + /* Add high key tuple from WAL record to temp page */ + leftoff = P_HIKEY; + if (PageAddItem(leftpage, (Item) left_hikey, left_hikeysz, P_HIKEY, + false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add high key to left page after split"); + leftoff = OffsetNumberNext(leftoff); + + for (off = P_FIRSTDATAKEY(oopaque); off < xlrec->firstrightoff; off++) + { + ItemId itemid; + Size itemsz; + IndexTuple item; + + /* Add replacement posting list when required */ + if (off == replacepostingoff) + { + Assert(newitemonleft || + xlrec->firstrightoff == xlrec->newitemoff); + if (PageAddItem(leftpage, (Item) nposting, + MAXALIGN(IndexTupleSize(nposting)), leftoff, + false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add new posting list item to left page after split"); + leftoff = OffsetNumberNext(leftoff); + continue; /* don't insert oposting */ + } + + /* add the new item if it was inserted on left page */ + else if (newitemonleft && off == xlrec->newitemoff) + { + if (PageAddItem(leftpage, (Item) newitem, newitemsz, leftoff, + false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add new item to left page after split"); + leftoff = OffsetNumberNext(leftoff); + } + + itemid = PageGetItemId(origpage, off); + itemsz = ItemIdGetLength(itemid); + item = (IndexTuple) PageGetItem(origpage, itemid); + if (PageAddItem(leftpage, (Item) item, itemsz, leftoff, + false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add old item to left page after split"); + leftoff = OffsetNumberNext(leftoff); + } + + /* cope with possibility that newitem goes at the end */ + if (newitemonleft && off == xlrec->newitemoff) + { + if (PageAddItem(leftpage, (Item) newitem, newitemsz, leftoff, + false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add new item to left page after split"); + leftoff = OffsetNumberNext(leftoff); + } + + PageRestoreTempPage(leftpage, origpage); + + /* Fix opaque fields */ + oopaque->btpo_flags = BTP_INCOMPLETE_SPLIT; + if (isleaf) + oopaque->btpo_flags |= BTP_LEAF; + oopaque->btpo_next = rightpagenumber; + oopaque->btpo_cycleid = 0; + + PageSetLSN(origpage, lsn); + MarkBufferDirty(buf); + } + + /* Fix left-link of the page to the right of the new right sibling */ + if (spagenumber != P_NONE) + { + Buffer sbuf; + + if (XLogReadBufferForRedo(record, 2, &sbuf) == BLK_NEEDS_REDO) + { + Page spage = (Page) BufferGetPage(sbuf); + BTPageOpaque spageop = (BTPageOpaque) PageGetSpecialPointer(spage); + + spageop->btpo_prev = rightpagenumber; + + PageSetLSN(spage, lsn); + MarkBufferDirty(sbuf); + } + if (BufferIsValid(sbuf)) + UnlockReleaseBuffer(sbuf); + } + + /* + * Finally, release the remaining buffers. sbuf, rbuf, and buf must be + * released together, so that readers cannot observe inconsistencies. 
+ */ + UnlockReleaseBuffer(rbuf); + if (BufferIsValid(buf)) + UnlockReleaseBuffer(buf); +} + +static void +btree_xlog_dedup(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_btree_dedup *xlrec = (xl_btree_dedup *) XLogRecGetData(record); + Buffer buf; + + if (XLogReadBufferForRedo(record, 0, &buf) == BLK_NEEDS_REDO) + { + char *ptr = XLogRecGetBlockData(record, 0, NULL); + Page page = (Page) BufferGetPage(buf); + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + OffsetNumber offnum, + minoff, + maxoff; + BTDedupState state; + BTDedupInterval *intervals; + Page newpage; + + state = (BTDedupState) palloc(sizeof(BTDedupStateData)); + state->deduplicate = true; /* unused */ + state->nmaxitems = 0; /* unused */ + /* Conservatively use larger maxpostingsize than primary */ + state->maxpostingsize = BTMaxItemSize(page); + state->base = NULL; + state->baseoff = InvalidOffsetNumber; + state->basetupsize = 0; + state->htids = palloc(state->maxpostingsize); + state->nhtids = 0; + state->nitems = 0; + state->phystupsize = 0; + state->nintervals = 0; + + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + newpage = PageGetTempPageCopySpecial(page); + + if (!P_RIGHTMOST(opaque)) + { + ItemId itemid = PageGetItemId(page, P_HIKEY); + Size itemsz = ItemIdGetLength(itemid); + IndexTuple item = (IndexTuple) PageGetItem(page, itemid); + + if (PageAddItem(newpage, (Item) item, itemsz, P_HIKEY, + false, false) == InvalidOffsetNumber) + elog(ERROR, "deduplication failed to add highkey"); + } + + intervals = (BTDedupInterval *) ptr; + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + + if (offnum == minoff) + _bt_dedup_start_pending(state, itup, offnum); + else if (state->nintervals < xlrec->nintervals && + state->baseoff == intervals[state->nintervals].baseoff && + state->nitems < intervals[state->nintervals].nitems) + { + if (!_bt_dedup_save_htid(state, itup)) + elog(ERROR, "deduplication failed to add heap tid to pending posting list"); + } + else + { + _bt_dedup_finish_pending(newpage, state); + _bt_dedup_start_pending(state, itup, offnum); + } + } + + _bt_dedup_finish_pending(newpage, state); + Assert(state->nintervals == xlrec->nintervals); + Assert(memcmp(state->intervals, intervals, + state->nintervals * sizeof(BTDedupInterval)) == 0); + + if (P_HAS_GARBAGE(opaque)) + { + BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(newpage); + + nopaque->btpo_flags &= ~BTP_HAS_GARBAGE; + } + + PageRestoreTempPage(newpage, page); + PageSetLSN(page, lsn); + MarkBufferDirty(buf); + } + + if (BufferIsValid(buf)) + UnlockReleaseBuffer(buf); +} + +static void +btree_xlog_updates(Page page, OffsetNumber *updatedoffsets, + xl_btree_update *updates, int nupdated) +{ + BTVacuumPosting vacposting; + IndexTuple origtuple; + ItemId itemid; + Size itemsz; + + for (int i = 0; i < nupdated; i++) + { + itemid = PageGetItemId(page, updatedoffsets[i]); + origtuple = (IndexTuple) PageGetItem(page, itemid); + + vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) + + updates->ndeletedtids * sizeof(uint16)); + vacposting->updatedoffset = updatedoffsets[i]; + vacposting->itup = origtuple; + vacposting->ndeletedtids = updates->ndeletedtids; + memcpy(vacposting->deletetids, + (char *) updates + SizeOfBtreeUpdate, + updates->ndeletedtids * sizeof(uint16)); + + _bt_update_posting(vacposting); + + /* Overwrite updated 
version of tuple */ + itemsz = MAXALIGN(IndexTupleSize(vacposting->itup)); + if (!PageIndexTupleOverwrite(page, updatedoffsets[i], + (Item) vacposting->itup, itemsz)) + elog(PANIC, "failed to update partially dead item"); + + pfree(vacposting->itup); + pfree(vacposting); + + /* advance to next xl_btree_update from array */ + updates = (xl_btree_update *) + ((char *) updates + SizeOfBtreeUpdate + + updates->ndeletedtids * sizeof(uint16)); + } +} + +static void +btree_xlog_vacuum(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record); + Buffer buffer; + Page page; + BTPageOpaque opaque; + + /* + * We need to take a cleanup lock here, just like btvacuumpage(). However, + * it isn't necessary to exhaustively get a cleanup lock on every block in + * the index during recovery (just getting a cleanup lock on pages with + * items to kill suffices). See nbtree/README for details. + */ + if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer) + == BLK_NEEDS_REDO) + { + char *ptr = XLogRecGetBlockData(record, 0, NULL); + + page = (Page) BufferGetPage(buffer); + + if (xlrec->nupdated > 0) + { + OffsetNumber *updatedoffsets; + xl_btree_update *updates; + + updatedoffsets = (OffsetNumber *) + (ptr + xlrec->ndeleted * sizeof(OffsetNumber)); + updates = (xl_btree_update *) ((char *) updatedoffsets + + xlrec->nupdated * + sizeof(OffsetNumber)); + + btree_xlog_updates(page, updatedoffsets, updates, xlrec->nupdated); + } + + if (xlrec->ndeleted > 0) + PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted); + + /* + * Mark the page as not containing any LP_DEAD items --- see comments + * in _bt_delitems_vacuum(). + */ + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + opaque->btpo_flags &= ~BTP_HAS_GARBAGE; + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +static void +btree_xlog_delete(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record); + Buffer buffer; + Page page; + BTPageOpaque opaque; + + /* + * If we have any conflict processing to do, it must happen before we + * update the page + */ + if (InHotStandby) + { + RelFileNode rnode; + + XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL); + + ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, rnode); + } + + /* + * We don't need to take a cleanup lock to apply these changes. See + * nbtree/README for details. 
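+ * (Contrast with btree_xlog_vacuum() above, which asks
+ * XLogReadBufferForRedoExtended() for a cleanup lock on the target block.)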
+ */ + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + char *ptr = XLogRecGetBlockData(record, 0, NULL); + + page = (Page) BufferGetPage(buffer); + + if (xlrec->nupdated > 0) + { + OffsetNumber *updatedoffsets; + xl_btree_update *updates; + + updatedoffsets = (OffsetNumber *) + (ptr + xlrec->ndeleted * sizeof(OffsetNumber)); + updates = (xl_btree_update *) ((char *) updatedoffsets + + xlrec->nupdated * + sizeof(OffsetNumber)); + + btree_xlog_updates(page, updatedoffsets, updates, xlrec->nupdated); + } + + if (xlrec->ndeleted > 0) + PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted); + + /* Mark the page as not containing any LP_DEAD items */ + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + opaque->btpo_flags &= ~BTP_HAS_GARBAGE; + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +static void +btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_btree_mark_page_halfdead *xlrec = (xl_btree_mark_page_halfdead *) XLogRecGetData(record); + Buffer buffer; + Page page; + BTPageOpaque pageop; + IndexTupleData trunctuple; + + /* + * In normal operation, we would lock all the pages this WAL record + * touches before changing any of them. In WAL replay, it should be okay + * to lock just one page at a time, since no concurrent index updates can + * be happening, and readers should not care whether they arrive at the + * target page or not (since it's surely empty). + */ + + /* to-be-deleted subtree's parent page */ + if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) + { + OffsetNumber poffset; + ItemId itemid; + IndexTuple itup; + OffsetNumber nextoffset; + BlockNumber rightsib; + + page = (Page) BufferGetPage(buffer); + pageop = (BTPageOpaque) PageGetSpecialPointer(page); + + poffset = xlrec->poffset; + + nextoffset = OffsetNumberNext(poffset); + itemid = PageGetItemId(page, nextoffset); + itup = (IndexTuple) PageGetItem(page, itemid); + rightsib = BTreeTupleGetDownLink(itup); + + itemid = PageGetItemId(page, poffset); + itup = (IndexTuple) PageGetItem(page, itemid); + BTreeTupleSetDownLink(itup, rightsib); + nextoffset = OffsetNumberNext(poffset); + PageIndexTupleDelete(page, nextoffset); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + + /* + * Don't need to couple cross-level locks in REDO routines, so release + * lock on internal page immediately + */ + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* Rewrite the leaf page as a halfdead page */ + buffer = XLogInitBufferForRedo(record, 0); + page = (Page) BufferGetPage(buffer); + + _bt_pageinit(page, BufferGetPageSize(buffer)); + pageop = (BTPageOpaque) PageGetSpecialPointer(page); + + pageop->btpo_prev = xlrec->leftblk; + pageop->btpo_next = xlrec->rightblk; + pageop->btpo_level = 0; + pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF; + pageop->btpo_cycleid = 0; + + /* + * Construct a dummy high key item that points to top parent page (value + * is InvalidBlockNumber when the top parent page is the leaf page itself) + */ + MemSet(&trunctuple, 0, sizeof(IndexTupleData)); + trunctuple.t_info = sizeof(IndexTupleData); + BTreeTupleSetTopParent(&trunctuple, xlrec->topparent); + + if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY, + false, false) == InvalidOffsetNumber) + elog(ERROR, "could not add dummy high key to half-dead page"); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + 
UnlockReleaseBuffer(buffer); +} + + +static void +btree_xlog_unlink_page(uint8 info, XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) XLogRecGetData(record); + BlockNumber leftsib; + BlockNumber rightsib; + uint32 level; + bool isleaf; + FullTransactionId safexid; + Buffer leftbuf; + Buffer target; + Buffer rightbuf; + Page page; + BTPageOpaque pageop; + + leftsib = xlrec->leftsib; + rightsib = xlrec->rightsib; + level = xlrec->level; + isleaf = (level == 0); + safexid = xlrec->safexid; + + /* No leaftopparent for level 0 (leaf page) or level 1 target */ + Assert(!BlockNumberIsValid(xlrec->leaftopparent) || level > 1); + + /* + * In normal operation, we would lock all the pages this WAL record + * touches before changing any of them. In WAL replay, we at least lock + * the pages in the same standard left-to-right order (leftsib, target, + * rightsib), and don't release the sibling locks until the target is + * marked deleted. + */ + + /* Fix right-link of left sibling, if any */ + if (leftsib != P_NONE) + { + if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO) + { + page = (Page) BufferGetPage(leftbuf); + pageop = (BTPageOpaque) PageGetSpecialPointer(page); + pageop->btpo_next = rightsib; + + PageSetLSN(page, lsn); + MarkBufferDirty(leftbuf); + } + } + else + leftbuf = InvalidBuffer; + + /* Rewrite target page as empty deleted page */ + target = XLogInitBufferForRedo(record, 0); + page = (Page) BufferGetPage(target); + + _bt_pageinit(page, BufferGetPageSize(target)); + pageop = (BTPageOpaque) PageGetSpecialPointer(page); + + pageop->btpo_prev = leftsib; + pageop->btpo_next = rightsib; + pageop->btpo_level = level; + BTPageSetDeleted(page, safexid); + if (isleaf) + pageop->btpo_flags |= BTP_LEAF; + pageop->btpo_cycleid = 0; + + PageSetLSN(page, lsn); + MarkBufferDirty(target); + + /* Fix left-link of right sibling */ + if (XLogReadBufferForRedo(record, 2, &rightbuf) == BLK_NEEDS_REDO) + { + page = (Page) BufferGetPage(rightbuf); + pageop = (BTPageOpaque) PageGetSpecialPointer(page); + pageop->btpo_prev = leftsib; + + PageSetLSN(page, lsn); + MarkBufferDirty(rightbuf); + } + + /* Release siblings */ + if (BufferIsValid(leftbuf)) + UnlockReleaseBuffer(leftbuf); + if (BufferIsValid(rightbuf)) + UnlockReleaseBuffer(rightbuf); + + /* Release target */ + UnlockReleaseBuffer(target); + + /* + * If we deleted a parent of the targeted leaf page, instead of the leaf + * itself, update the leaf to point to the next remaining child in the + * to-be-deleted subtree + */ + if (XLogRecHasBlockRef(record, 3)) + { + /* + * There is no real data on the page, so we just re-create it from + * scratch using the information from the WAL record. + * + * Note that we don't end up here when the target page is also the + * leafbuf page. There is no need to add a dummy hikey item with a + * top parent link when deleting leafbuf because it's the last page + * we'll delete in the subtree undergoing deletion. 
+ */ + Buffer leafbuf; + IndexTupleData trunctuple; + + Assert(!isleaf); + + leafbuf = XLogInitBufferForRedo(record, 3); + page = (Page) BufferGetPage(leafbuf); + + _bt_pageinit(page, BufferGetPageSize(leafbuf)); + pageop = (BTPageOpaque) PageGetSpecialPointer(page); + + pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF; + pageop->btpo_prev = xlrec->leafleftsib; + pageop->btpo_next = xlrec->leafrightsib; + pageop->btpo_level = 0; + pageop->btpo_cycleid = 0; + + /* Add a dummy hikey item */ + MemSet(&trunctuple, 0, sizeof(IndexTupleData)); + trunctuple.t_info = sizeof(IndexTupleData); + BTreeTupleSetTopParent(&trunctuple, xlrec->leaftopparent); + + if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY, + false, false) == InvalidOffsetNumber) + elog(ERROR, "could not add dummy high key to half-dead page"); + + PageSetLSN(page, lsn); + MarkBufferDirty(leafbuf); + UnlockReleaseBuffer(leafbuf); + } + + /* Update metapage if needed */ + if (info == XLOG_BTREE_UNLINK_PAGE_META) + _bt_restore_meta(record, 4); +} + +static void +btree_xlog_newroot(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_btree_newroot *xlrec = (xl_btree_newroot *) XLogRecGetData(record); + Buffer buffer; + Page page; + BTPageOpaque pageop; + char *ptr; + Size len; + + buffer = XLogInitBufferForRedo(record, 0); + page = (Page) BufferGetPage(buffer); + + _bt_pageinit(page, BufferGetPageSize(buffer)); + pageop = (BTPageOpaque) PageGetSpecialPointer(page); + + pageop->btpo_flags = BTP_ROOT; + pageop->btpo_prev = pageop->btpo_next = P_NONE; + pageop->btpo_level = xlrec->level; + if (xlrec->level == 0) + pageop->btpo_flags |= BTP_LEAF; + pageop->btpo_cycleid = 0; + + if (xlrec->level > 0) + { + ptr = XLogRecGetBlockData(record, 0, &len); + _bt_restore_page(page, ptr, len); + + /* Clear the incomplete-split flag in left child */ + _bt_clear_incomplete_split(record, 1); + } + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + UnlockReleaseBuffer(buffer); + + _bt_restore_meta(record, 2); +} + +/* + * In general VACUUM must defer recycling as a way of avoiding certain race + * conditions. Deleted pages contain a safexid value that is used by VACUUM + * to determine whether or not it's safe to place a page that was deleted by + * VACUUM earlier into the FSM now. See nbtree/README. + * + * As far as any backend operating during original execution is concerned, the + * FSM is a cache of recycle-safe pages; the mere presence of the page in the + * FSM indicates that the page must already be safe to recycle (actually, + * _bt_getbuf() verifies it's safe using BTPageIsRecyclable(), but that's just + * because it would be unwise to completely trust the FSM, given its current + * limitations). + * + * This isn't sufficient to prevent similar concurrent recycling race + * conditions during Hot Standby, though. For that we need to log a + * xl_btree_reuse_page record at the point that a page is actually recycled + * and reused for an entirely unrelated page inside _bt_split(). These + * records include the same safexid value from the original deleted page, + * stored in the record's latestRemovedFullXid field. + * + * The GlobalVisCheckRemovableFullXid() test in BTPageIsRecyclable() is used + * to determine if it's safe to recycle a page. This mirrors our own test: + * the PGPROC->xmin > limitXmin test inside GetConflictingVirtualXIDs(). + * Consequently, one XID value achieves the same exclusion effect on primary + * and standby. 
+ */ +static void +btree_xlog_reuse_page(XLogReaderState *record) +{ + xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) XLogRecGetData(record); + + if (InHotStandby) + ResolveRecoveryConflictWithSnapshotFullXid(xlrec->latestRemovedFullXid, + xlrec->node); +} + +void +btree_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + MemoryContext oldCtx; + + oldCtx = MemoryContextSwitchTo(opCtx); + switch (info) + { + case XLOG_BTREE_INSERT_LEAF: + btree_xlog_insert(true, false, false, record); + break; + case XLOG_BTREE_INSERT_UPPER: + btree_xlog_insert(false, false, false, record); + break; + case XLOG_BTREE_INSERT_META: + btree_xlog_insert(false, true, false, record); + break; + case XLOG_BTREE_SPLIT_L: + btree_xlog_split(true, record); + break; + case XLOG_BTREE_SPLIT_R: + btree_xlog_split(false, record); + break; + case XLOG_BTREE_INSERT_POST: + btree_xlog_insert(true, false, true, record); + break; + case XLOG_BTREE_DEDUP: + btree_xlog_dedup(record); + break; + case XLOG_BTREE_VACUUM: + btree_xlog_vacuum(record); + break; + case XLOG_BTREE_DELETE: + btree_xlog_delete(record); + break; + case XLOG_BTREE_MARK_PAGE_HALFDEAD: + btree_xlog_mark_page_halfdead(info, record); + break; + case XLOG_BTREE_UNLINK_PAGE: + case XLOG_BTREE_UNLINK_PAGE_META: + btree_xlog_unlink_page(info, record); + break; + case XLOG_BTREE_NEWROOT: + btree_xlog_newroot(record); + break; + case XLOG_BTREE_REUSE_PAGE: + btree_xlog_reuse_page(record); + break; + case XLOG_BTREE_META_CLEANUP: + _bt_restore_meta(record, 0); + break; + default: + elog(PANIC, "btree_redo: unknown op code %u", info); + } + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(opCtx); +} + +void +btree_xlog_startup(void) +{ + opCtx = AllocSetContextCreate(CurrentMemoryContext, + "Btree recovery temporary context", + ALLOCSET_DEFAULT_SIZES); +} + +void +btree_xlog_cleanup(void) +{ + MemoryContextDelete(opCtx); + opCtx = NULL; +} + +/* + * Mask a btree page before performing consistency checks on it. + */ +void +btree_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + BTPageOpaque maskopaq; + + mask_page_lsn_and_checksum(page); + + mask_page_hint_bits(page); + mask_unused_space(page); + + maskopaq = (BTPageOpaque) PageGetSpecialPointer(page); + + if (P_ISLEAF(maskopaq)) + { + /* + * In btree leaf pages, it is possible to modify the LP_FLAGS without + * emitting any WAL record. Hence, mask the line pointer flags. See + * _bt_killitems(), _bt_check_unique() for details. + */ + mask_lp_flags(page); + } + + /* + * BTP_HAS_GARBAGE is just an un-logged hint bit. So, mask it. See + * _bt_delete_or_dedup_one_page(), _bt_killitems(), and _bt_check_unique() + * for details. + */ + maskopaq->btpo_flags &= ~BTP_HAS_GARBAGE; + + /* + * During replay of a btree page split, we don't set the BTP_SPLIT_END + * flag of the right sibling and initialize the cycle_id to 0 for the same + * page. See btree_xlog_split() for details. + */ + maskopaq->btpo_flags &= ~BTP_SPLIT_END; + maskopaq->btpo_cycleid = 0; +} diff --git a/src/backend/access/rmgrdesc/Makefile b/src/backend/access/rmgrdesc/Makefile new file mode 100644 index 0000000..f88d72f --- /dev/null +++ b/src/backend/access/rmgrdesc/Makefile @@ -0,0 +1,34 @@ +# +# Makefile for the rmgr descriptor routines +# +# src/backend/access/rmgrdesc/Makefile +# + +subdir = src/backend/access/rmgrdesc +top_builddir = ../../../.. 
+include $(top_builddir)/src/Makefile.global + +OBJS = \ + brindesc.o \ + clogdesc.o \ + committsdesc.o \ + dbasedesc.o \ + genericdesc.o \ + gindesc.o \ + gistdesc.o \ + hashdesc.o \ + heapdesc.o \ + logicalmsgdesc.o \ + mxactdesc.o \ + nbtdesc.o \ + relmapdesc.o \ + replorigindesc.o \ + seqdesc.o \ + smgrdesc.o \ + spgdesc.o \ + standbydesc.o \ + tblspcdesc.o \ + xactdesc.o \ + xlogdesc.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/rmgrdesc/brindesc.c b/src/backend/access/rmgrdesc/brindesc.c new file mode 100644 index 0000000..b6265a4 --- /dev/null +++ b/src/backend/access/rmgrdesc/brindesc.c @@ -0,0 +1,107 @@ +/*------------------------------------------------------------------------- + * + * brindesc.c + * rmgr descriptor routines for BRIN indexes + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/brindesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/brin_xlog.h" + +void +brin_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + info &= XLOG_BRIN_OPMASK; + if (info == XLOG_BRIN_CREATE_INDEX) + { + xl_brin_createidx *xlrec = (xl_brin_createidx *) rec; + + appendStringInfo(buf, "v%d pagesPerRange %u", + xlrec->version, xlrec->pagesPerRange); + } + else if (info == XLOG_BRIN_INSERT) + { + xl_brin_insert *xlrec = (xl_brin_insert *) rec; + + appendStringInfo(buf, "heapBlk %u pagesPerRange %u offnum %u", + xlrec->heapBlk, + xlrec->pagesPerRange, + xlrec->offnum); + } + else if (info == XLOG_BRIN_UPDATE) + { + xl_brin_update *xlrec = (xl_brin_update *) rec; + + appendStringInfo(buf, "heapBlk %u pagesPerRange %u old offnum %u, new offnum %u", + xlrec->insert.heapBlk, + xlrec->insert.pagesPerRange, + xlrec->oldOffnum, + xlrec->insert.offnum); + } + else if (info == XLOG_BRIN_SAMEPAGE_UPDATE) + { + xl_brin_samepage_update *xlrec = (xl_brin_samepage_update *) rec; + + appendStringInfo(buf, "offnum %u", xlrec->offnum); + } + else if (info == XLOG_BRIN_REVMAP_EXTEND) + { + xl_brin_revmap_extend *xlrec = (xl_brin_revmap_extend *) rec; + + appendStringInfo(buf, "targetBlk %u", xlrec->targetBlk); + } + else if (info == XLOG_BRIN_DESUMMARIZE) + { + xl_brin_desummarize *xlrec = (xl_brin_desummarize *) rec; + + appendStringInfo(buf, "pagesPerRange %u, heapBlk %u, page offset %u", + xlrec->pagesPerRange, xlrec->heapBlk, xlrec->regOffset); + } +} + +const char * +brin_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG_BRIN_CREATE_INDEX: + id = "CREATE_INDEX"; + break; + case XLOG_BRIN_INSERT: + id = "INSERT"; + break; + case XLOG_BRIN_INSERT | XLOG_BRIN_INIT_PAGE: + id = "INSERT+INIT"; + break; + case XLOG_BRIN_UPDATE: + id = "UPDATE"; + break; + case XLOG_BRIN_UPDATE | XLOG_BRIN_INIT_PAGE: + id = "UPDATE+INIT"; + break; + case XLOG_BRIN_SAMEPAGE_UPDATE: + id = "SAMEPAGE_UPDATE"; + break; + case XLOG_BRIN_REVMAP_EXTEND: + id = "REVMAP_EXTEND"; + break; + case XLOG_BRIN_DESUMMARIZE: + id = "DESUMMARIZE"; + break; + } + + return id; +} diff --git a/src/backend/access/rmgrdesc/clogdesc.c b/src/backend/access/rmgrdesc/clogdesc.c new file mode 100644 index 0000000..b12f43a --- /dev/null +++ b/src/backend/access/rmgrdesc/clogdesc.c @@ -0,0 +1,59 @@ 
+/*------------------------------------------------------------------------- + * + * clogdesc.c + * rmgr descriptor routines for access/transam/clog.c + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/clogdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/clog.h" + + +void +clog_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + if (info == CLOG_ZEROPAGE) + { + int pageno; + + memcpy(&pageno, rec, sizeof(int)); + appendStringInfo(buf, "page %d", pageno); + } + else if (info == CLOG_TRUNCATE) + { + xl_clog_truncate xlrec; + + memcpy(&xlrec, rec, sizeof(xl_clog_truncate)); + appendStringInfo(buf, "page %d; oldestXact %u", + xlrec.pageno, xlrec.oldestXact); + } +} + +const char * +clog_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case CLOG_ZEROPAGE: + id = "ZEROPAGE"; + break; + case CLOG_TRUNCATE: + id = "TRUNCATE"; + break; + } + + return id; +} diff --git a/src/backend/access/rmgrdesc/committsdesc.c b/src/backend/access/rmgrdesc/committsdesc.c new file mode 100644 index 0000000..26bad44 --- /dev/null +++ b/src/backend/access/rmgrdesc/committsdesc.c @@ -0,0 +1,55 @@ +/*------------------------------------------------------------------------- + * + * committsdesc.c + * rmgr descriptor routines for access/transam/commit_ts.c + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/committsdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/commit_ts.h" +#include "utils/timestamp.h" + + +void +commit_ts_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + if (info == COMMIT_TS_ZEROPAGE) + { + int pageno; + + memcpy(&pageno, rec, sizeof(int)); + appendStringInfo(buf, "%d", pageno); + } + else if (info == COMMIT_TS_TRUNCATE) + { + xl_commit_ts_truncate *trunc = (xl_commit_ts_truncate *) rec; + + appendStringInfo(buf, "pageno %d, oldestXid %u", + trunc->pageno, trunc->oldestXid); + } +} + +const char * +commit_ts_identify(uint8 info) +{ + switch (info) + { + case COMMIT_TS_ZEROPAGE: + return "ZEROPAGE"; + case COMMIT_TS_TRUNCATE: + return "TRUNCATE"; + default: + return NULL; + } +} diff --git a/src/backend/access/rmgrdesc/dbasedesc.c b/src/backend/access/rmgrdesc/dbasedesc.c new file mode 100644 index 0000000..2660984 --- /dev/null +++ b/src/backend/access/rmgrdesc/dbasedesc.c @@ -0,0 +1,63 @@ +/*------------------------------------------------------------------------- + * + * dbasedesc.c + * rmgr descriptor routines for commands/dbcommands.c + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/dbasedesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "commands/dbcommands_xlog.h" +#include "lib/stringinfo.h" + + +void +dbase_desc(StringInfo buf, XLogReaderState 
*record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + if (info == XLOG_DBASE_CREATE) + { + xl_dbase_create_rec *xlrec = (xl_dbase_create_rec *) rec; + + appendStringInfo(buf, "copy dir %u/%u to %u/%u", + xlrec->src_tablespace_id, xlrec->src_db_id, + xlrec->tablespace_id, xlrec->db_id); + } + else if (info == XLOG_DBASE_DROP) + { + xl_dbase_drop_rec *xlrec = (xl_dbase_drop_rec *) rec; + int i; + + appendStringInfoString(buf, "dir"); + for (i = 0; i < xlrec->ntablespaces; i++) + appendStringInfo(buf, " %u/%u", + xlrec->tablespace_ids[i], xlrec->db_id); + } +} + +const char * +dbase_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG_DBASE_CREATE: + id = "CREATE"; + break; + case XLOG_DBASE_DROP: + id = "DROP"; + break; + } + + return id; +} diff --git a/src/backend/access/rmgrdesc/genericdesc.c b/src/backend/access/rmgrdesc/genericdesc.c new file mode 100644 index 0000000..7242d0d --- /dev/null +++ b/src/backend/access/rmgrdesc/genericdesc.c @@ -0,0 +1,56 @@ +/*------------------------------------------------------------------------- + * + * genericdesc.c + * rmgr descriptor routines for access/transam/generic_xlog.c + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/rmgrdesc/genericdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/generic_xlog.h" +#include "lib/stringinfo.h" +#include "storage/relfilenode.h" + +/* + * Description of generic xlog record: write page regions that this record + * overrides. + */ +void +generic_desc(StringInfo buf, XLogReaderState *record) +{ + Pointer ptr = XLogRecGetData(record), + end = ptr + XLogRecGetDataLen(record); + + while (ptr < end) + { + OffsetNumber offset, + length; + + memcpy(&offset, ptr, sizeof(offset)); + ptr += sizeof(offset); + memcpy(&length, ptr, sizeof(length)); + ptr += sizeof(length); + ptr += length; + + if (ptr < end) + appendStringInfo(buf, "offset %u, length %u; ", offset, length); + else + appendStringInfo(buf, "offset %u, length %u", offset, length); + } +} + +/* + * Identification of generic xlog record: we don't distinguish any subtypes + * inside generic xlog records. 
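+ * The string returned here is what tools such as pg_waldump report as the record type; the per-region detail shown for each record comes from generic_desc above, one "offset N, length N" entry per modified region.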
+ */ +const char * +generic_identify(uint8 info) +{ + return "Generic"; +} diff --git a/src/backend/access/rmgrdesc/gindesc.c b/src/backend/access/rmgrdesc/gindesc.c new file mode 100644 index 0000000..ee9e69c --- /dev/null +++ b/src/backend/access/rmgrdesc/gindesc.c @@ -0,0 +1,218 @@ +/*------------------------------------------------------------------------- + * + * gindesc.c + * rmgr descriptor routines for access/transam/gin/ginxlog.c + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/gindesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/ginxlog.h" +#include "access/xlogutils.h" +#include "lib/stringinfo.h" +#include "storage/relfilenode.h" + +static void +desc_recompress_leaf(StringInfo buf, ginxlogRecompressDataLeaf *insertData) +{ + int i; + char *walbuf = ((char *) insertData) + sizeof(ginxlogRecompressDataLeaf); + + appendStringInfo(buf, " %d segments:", (int) insertData->nactions); + + for (i = 0; i < insertData->nactions; i++) + { + uint8 a_segno = *((uint8 *) (walbuf++)); + uint8 a_action = *((uint8 *) (walbuf++)); + uint16 nitems = 0; + int newsegsize = 0; + + if (a_action == GIN_SEGMENT_INSERT || + a_action == GIN_SEGMENT_REPLACE) + { + newsegsize = SizeOfGinPostingList((GinPostingList *) walbuf); + walbuf += SHORTALIGN(newsegsize); + } + + if (a_action == GIN_SEGMENT_ADDITEMS) + { + memcpy(&nitems, walbuf, sizeof(uint16)); + walbuf += sizeof(uint16); + walbuf += nitems * sizeof(ItemPointerData); + } + + switch (a_action) + { + case GIN_SEGMENT_ADDITEMS: + appendStringInfo(buf, " %d (add %d items)", a_segno, nitems); + break; + case GIN_SEGMENT_DELETE: + appendStringInfo(buf, " %d (delete)", a_segno); + break; + case GIN_SEGMENT_INSERT: + appendStringInfo(buf, " %d (insert)", a_segno); + break; + case GIN_SEGMENT_REPLACE: + appendStringInfo(buf, " %d (replace)", a_segno); + break; + default: + appendStringInfo(buf, " %d unknown action %d ???", a_segno, a_action); + /* cannot decode unrecognized actions further */ + return; + } + } +} + +void +gin_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_GIN_CREATE_PTREE: + /* no further information */ + break; + case XLOG_GIN_INSERT: + { + ginxlogInsert *xlrec = (ginxlogInsert *) rec; + + appendStringInfo(buf, "isdata: %c isleaf: %c", + (xlrec->flags & GIN_INSERT_ISDATA) ? 'T' : 'F', + (xlrec->flags & GIN_INSERT_ISLEAF) ? 
'T' : 'F'); + if (!(xlrec->flags & GIN_INSERT_ISLEAF)) + { + char *payload = rec + sizeof(ginxlogInsert); + BlockNumber leftChildBlkno; + BlockNumber rightChildBlkno; + + leftChildBlkno = BlockIdGetBlockNumber((BlockId) payload); + payload += sizeof(BlockIdData); + rightChildBlkno = BlockIdGetBlockNumber((BlockId) payload); + payload += sizeof(BlockNumber); + appendStringInfo(buf, " children: %u/%u", + leftChildBlkno, rightChildBlkno); + } + if (XLogRecHasBlockImage(record, 0)) + { + if (XLogRecBlockImageApply(record, 0)) + appendStringInfoString(buf, " (full page image)"); + else + appendStringInfoString(buf, " (full page image, for WAL verification)"); + } + else + { + char *payload = XLogRecGetBlockData(record, 0, NULL); + + if (!(xlrec->flags & GIN_INSERT_ISDATA)) + appendStringInfo(buf, " isdelete: %c", + (((ginxlogInsertEntry *) payload)->isDelete) ? 'T' : 'F'); + else if (xlrec->flags & GIN_INSERT_ISLEAF) + desc_recompress_leaf(buf, (ginxlogRecompressDataLeaf *) payload); + else + { + ginxlogInsertDataInternal *insertData = + (ginxlogInsertDataInternal *) payload; + + appendStringInfo(buf, " pitem: %u-%u/%u", + PostingItemGetBlockNumber(&insertData->newitem), + ItemPointerGetBlockNumber(&insertData->newitem.key), + ItemPointerGetOffsetNumber(&insertData->newitem.key)); + } + } + } + break; + case XLOG_GIN_SPLIT: + { + ginxlogSplit *xlrec = (ginxlogSplit *) rec; + + appendStringInfo(buf, "isrootsplit: %c", + (((ginxlogSplit *) rec)->flags & GIN_SPLIT_ROOT) ? 'T' : 'F'); + appendStringInfo(buf, " isdata: %c isleaf: %c", + (xlrec->flags & GIN_INSERT_ISDATA) ? 'T' : 'F', + (xlrec->flags & GIN_INSERT_ISLEAF) ? 'T' : 'F'); + } + break; + case XLOG_GIN_VACUUM_PAGE: + /* no further information */ + break; + case XLOG_GIN_VACUUM_DATA_LEAF_PAGE: + { + if (XLogRecHasBlockImage(record, 0)) + { + if (XLogRecBlockImageApply(record, 0)) + appendStringInfoString(buf, " (full page image)"); + else + appendStringInfoString(buf, " (full page image, for WAL verification)"); + } + else + { + ginxlogVacuumDataLeafPage *xlrec = + (ginxlogVacuumDataLeafPage *) XLogRecGetBlockData(record, 0, NULL); + + desc_recompress_leaf(buf, &xlrec->data); + } + } + break; + case XLOG_GIN_DELETE_PAGE: + /* no further information */ + break; + case XLOG_GIN_UPDATE_META_PAGE: + /* no further information */ + break; + case XLOG_GIN_INSERT_LISTPAGE: + /* no further information */ + break; + case XLOG_GIN_DELETE_LISTPAGE: + appendStringInfo(buf, "ndeleted: %d", + ((ginxlogDeleteListPages *) rec)->ndeleted); + break; + } +} + +const char * +gin_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG_GIN_CREATE_PTREE: + id = "CREATE_PTREE"; + break; + case XLOG_GIN_INSERT: + id = "INSERT"; + break; + case XLOG_GIN_SPLIT: + id = "SPLIT"; + break; + case XLOG_GIN_VACUUM_PAGE: + id = "VACUUM_PAGE"; + break; + case XLOG_GIN_VACUUM_DATA_LEAF_PAGE: + id = "VACUUM_DATA_LEAF_PAGE"; + break; + case XLOG_GIN_DELETE_PAGE: + id = "DELETE_PAGE"; + break; + case XLOG_GIN_UPDATE_META_PAGE: + id = "UPDATE_META_PAGE"; + break; + case XLOG_GIN_INSERT_LISTPAGE: + id = "INSERT_LISTPAGE"; + break; + case XLOG_GIN_DELETE_LISTPAGE: + id = "DELETE_LISTPAGE"; + break; + } + + return id; +} diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c new file mode 100644 index 0000000..8ae3112 --- /dev/null +++ b/src/backend/access/rmgrdesc/gistdesc.c @@ -0,0 +1,117 @@ +/*------------------------------------------------------------------------- + * + * gistdesc.c + * rmgr 
descriptor routines for access/gist/gistxlog.c + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/gistdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/gistxlog.h" +#include "lib/stringinfo.h" +#include "storage/relfilenode.h" + +static void +out_gistxlogPageUpdate(StringInfo buf, gistxlogPageUpdate *xlrec) +{ +} + +static void +out_gistxlogPageReuse(StringInfo buf, gistxlogPageReuse *xlrec) +{ + appendStringInfo(buf, "rel %u/%u/%u; blk %u; latestRemovedXid %u:%u", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode, xlrec->block, + EpochFromFullTransactionId(xlrec->latestRemovedFullXid), + XidFromFullTransactionId(xlrec->latestRemovedFullXid)); +} + +static void +out_gistxlogDelete(StringInfo buf, gistxlogDelete *xlrec) +{ + appendStringInfo(buf, "delete: latestRemovedXid %u, nitems: %u", + xlrec->latestRemovedXid, xlrec->ntodelete); + +} + +static void +out_gistxlogPageSplit(StringInfo buf, gistxlogPageSplit *xlrec) +{ + appendStringInfo(buf, "page_split: splits to %d pages", + xlrec->npage); +} + +static void +out_gistxlogPageDelete(StringInfo buf, gistxlogPageDelete *xlrec) +{ + appendStringInfo(buf, "deleteXid %u:%u; downlink %u", + EpochFromFullTransactionId(xlrec->deleteXid), + XidFromFullTransactionId(xlrec->deleteXid), + xlrec->downlinkOffset); +} + +void +gist_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_GIST_PAGE_UPDATE: + out_gistxlogPageUpdate(buf, (gistxlogPageUpdate *) rec); + break; + case XLOG_GIST_PAGE_REUSE: + out_gistxlogPageReuse(buf, (gistxlogPageReuse *) rec); + break; + case XLOG_GIST_DELETE: + out_gistxlogDelete(buf, (gistxlogDelete *) rec); + break; + case XLOG_GIST_PAGE_SPLIT: + out_gistxlogPageSplit(buf, (gistxlogPageSplit *) rec); + break; + case XLOG_GIST_PAGE_DELETE: + out_gistxlogPageDelete(buf, (gistxlogPageDelete *) rec); + break; + case XLOG_GIST_ASSIGN_LSN: + /* No details to write out */ + break; + } +} + +const char * +gist_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG_GIST_PAGE_UPDATE: + id = "PAGE_UPDATE"; + break; + case XLOG_GIST_DELETE: + id = "DELETE"; + break; + case XLOG_GIST_PAGE_REUSE: + id = "PAGE_REUSE"; + break; + case XLOG_GIST_PAGE_SPLIT: + id = "PAGE_SPLIT"; + break; + case XLOG_GIST_PAGE_DELETE: + id = "PAGE_DELETE"; + break; + case XLOG_GIST_ASSIGN_LSN: + id = "ASSIGN_LSN"; + break; + } + + return id; +} diff --git a/src/backend/access/rmgrdesc/hashdesc.c b/src/backend/access/rmgrdesc/hashdesc.c new file mode 100644 index 0000000..90ccea0 --- /dev/null +++ b/src/backend/access/rmgrdesc/hashdesc.c @@ -0,0 +1,172 @@ +/*------------------------------------------------------------------------- + * + * hashdesc.c + * rmgr descriptor routines for access/hash/hash.c + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/hashdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/hash_xlog.h" + +void +hash_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = 
XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_HASH_INIT_META_PAGE: + { + xl_hash_init_meta_page *xlrec = (xl_hash_init_meta_page *) rec; + + appendStringInfo(buf, "num_tuples %g, fillfactor %d", + xlrec->num_tuples, xlrec->ffactor); + break; + } + case XLOG_HASH_INIT_BITMAP_PAGE: + { + xl_hash_init_bitmap_page *xlrec = (xl_hash_init_bitmap_page *) rec; + + appendStringInfo(buf, "bmsize %d", xlrec->bmsize); + break; + } + case XLOG_HASH_INSERT: + { + xl_hash_insert *xlrec = (xl_hash_insert *) rec; + + appendStringInfo(buf, "off %u", xlrec->offnum); + break; + } + case XLOG_HASH_ADD_OVFL_PAGE: + { + xl_hash_add_ovfl_page *xlrec = (xl_hash_add_ovfl_page *) rec; + + appendStringInfo(buf, "bmsize %d, bmpage_found %c", + xlrec->bmsize, (xlrec->bmpage_found) ? 'T' : 'F'); + break; + } + case XLOG_HASH_SPLIT_ALLOCATE_PAGE: + { + xl_hash_split_allocate_page *xlrec = (xl_hash_split_allocate_page *) rec; + + appendStringInfo(buf, "new_bucket %u, meta_page_masks_updated %c, issplitpoint_changed %c", + xlrec->new_bucket, + (xlrec->flags & XLH_SPLIT_META_UPDATE_MASKS) ? 'T' : 'F', + (xlrec->flags & XLH_SPLIT_META_UPDATE_SPLITPOINT) ? 'T' : 'F'); + break; + } + case XLOG_HASH_SPLIT_COMPLETE: + { + xl_hash_split_complete *xlrec = (xl_hash_split_complete *) rec; + + appendStringInfo(buf, "old_bucket_flag %u, new_bucket_flag %u", + xlrec->old_bucket_flag, xlrec->new_bucket_flag); + break; + } + case XLOG_HASH_MOVE_PAGE_CONTENTS: + { + xl_hash_move_page_contents *xlrec = (xl_hash_move_page_contents *) rec; + + appendStringInfo(buf, "ntups %d, is_primary %c", + xlrec->ntups, + xlrec->is_prim_bucket_same_wrt ? 'T' : 'F'); + break; + } + case XLOG_HASH_SQUEEZE_PAGE: + { + xl_hash_squeeze_page *xlrec = (xl_hash_squeeze_page *) rec; + + appendStringInfo(buf, "prevblkno %u, nextblkno %u, ntups %d, is_primary %c", + xlrec->prevblkno, + xlrec->nextblkno, + xlrec->ntups, + xlrec->is_prim_bucket_same_wrt ? 'T' : 'F'); + break; + } + case XLOG_HASH_DELETE: + { + xl_hash_delete *xlrec = (xl_hash_delete *) rec; + + appendStringInfo(buf, "clear_dead_marking %c, is_primary %c", + xlrec->clear_dead_marking ? 'T' : 'F', + xlrec->is_primary_bucket_page ? 
'T' : 'F'); + break; + } + case XLOG_HASH_UPDATE_META_PAGE: + { + xl_hash_update_meta_page *xlrec = (xl_hash_update_meta_page *) rec; + + appendStringInfo(buf, "ntuples %g", + xlrec->ntuples); + break; + } + case XLOG_HASH_VACUUM_ONE_PAGE: + { + xl_hash_vacuum_one_page *xlrec = (xl_hash_vacuum_one_page *) rec; + + appendStringInfo(buf, "ntuples %d, latestRemovedXid %u", + xlrec->ntuples, + xlrec->latestRemovedXid); + break; + } + } +} + +const char * +hash_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG_HASH_INIT_META_PAGE: + id = "INIT_META_PAGE"; + break; + case XLOG_HASH_INIT_BITMAP_PAGE: + id = "INIT_BITMAP_PAGE"; + break; + case XLOG_HASH_INSERT: + id = "INSERT"; + break; + case XLOG_HASH_ADD_OVFL_PAGE: + id = "ADD_OVFL_PAGE"; + break; + case XLOG_HASH_SPLIT_ALLOCATE_PAGE: + id = "SPLIT_ALLOCATE_PAGE"; + break; + case XLOG_HASH_SPLIT_PAGE: + id = "SPLIT_PAGE"; + break; + case XLOG_HASH_SPLIT_COMPLETE: + id = "SPLIT_COMPLETE"; + break; + case XLOG_HASH_MOVE_PAGE_CONTENTS: + id = "MOVE_PAGE_CONTENTS"; + break; + case XLOG_HASH_SQUEEZE_PAGE: + id = "SQUEEZE_PAGE"; + break; + case XLOG_HASH_DELETE: + id = "DELETE"; + break; + case XLOG_HASH_SPLIT_CLEANUP: + id = "SPLIT_CLEANUP"; + break; + case XLOG_HASH_UPDATE_META_PAGE: + id = "UPDATE_META_PAGE"; + break; + case XLOG_HASH_VACUUM_ONE_PAGE: + id = "VACUUM_ONE_PAGE"; + } + + return id; +} diff --git a/src/backend/access/rmgrdesc/heapdesc.c b/src/backend/access/rmgrdesc/heapdesc.c new file mode 100644 index 0000000..5c29fd9 --- /dev/null +++ b/src/backend/access/rmgrdesc/heapdesc.c @@ -0,0 +1,265 @@ +/*------------------------------------------------------------------------- + * + * heapdesc.c + * rmgr descriptor routines for access/heap/heapam.c + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/heapdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heapam_xlog.h" + +static void +out_infobits(StringInfo buf, uint8 infobits) +{ + if (infobits & XLHL_XMAX_IS_MULTI) + appendStringInfoString(buf, "IS_MULTI "); + if (infobits & XLHL_XMAX_LOCK_ONLY) + appendStringInfoString(buf, "LOCK_ONLY "); + if (infobits & XLHL_XMAX_EXCL_LOCK) + appendStringInfoString(buf, "EXCL_LOCK "); + if (infobits & XLHL_XMAX_KEYSHR_LOCK) + appendStringInfoString(buf, "KEYSHR_LOCK "); + if (infobits & XLHL_KEYS_UPDATED) + appendStringInfoString(buf, "KEYS_UPDATED "); +} + +void +heap_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + info &= XLOG_HEAP_OPMASK; + if (info == XLOG_HEAP_INSERT) + { + xl_heap_insert *xlrec = (xl_heap_insert *) rec; + + appendStringInfo(buf, "off %u flags 0x%02X", xlrec->offnum, + xlrec->flags); + } + else if (info == XLOG_HEAP_DELETE) + { + xl_heap_delete *xlrec = (xl_heap_delete *) rec; + + appendStringInfo(buf, "off %u flags 0x%02X ", + xlrec->offnum, + xlrec->flags); + out_infobits(buf, xlrec->infobits_set); + } + else if (info == XLOG_HEAP_UPDATE) + { + xl_heap_update *xlrec = (xl_heap_update *) rec; + + appendStringInfo(buf, "off %u xmax %u flags 0x%02X ", + xlrec->old_offnum, + xlrec->old_xmax, + xlrec->flags); + out_infobits(buf, xlrec->old_infobits_set); + appendStringInfo(buf, "; new off %u xmax %u", + xlrec->new_offnum, + xlrec->new_xmax); + } + 
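+ /* A HOT update carries the same xl_heap_update payload as a plain update, so it is described identically below. */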
else if (info == XLOG_HEAP_HOT_UPDATE) + { + xl_heap_update *xlrec = (xl_heap_update *) rec; + + appendStringInfo(buf, "off %u xmax %u flags 0x%02X ", + xlrec->old_offnum, + xlrec->old_xmax, + xlrec->flags); + out_infobits(buf, xlrec->old_infobits_set); + appendStringInfo(buf, "; new off %u xmax %u", + xlrec->new_offnum, + xlrec->new_xmax); + } + else if (info == XLOG_HEAP_TRUNCATE) + { + xl_heap_truncate *xlrec = (xl_heap_truncate *) rec; + int i; + + if (xlrec->flags & XLH_TRUNCATE_CASCADE) + appendStringInfoString(buf, "cascade "); + if (xlrec->flags & XLH_TRUNCATE_RESTART_SEQS) + appendStringInfoString(buf, "restart_seqs "); + appendStringInfo(buf, "nrelids %u relids", xlrec->nrelids); + for (i = 0; i < xlrec->nrelids; i++) + appendStringInfo(buf, " %u", xlrec->relids[i]); + } + else if (info == XLOG_HEAP_CONFIRM) + { + xl_heap_confirm *xlrec = (xl_heap_confirm *) rec; + + appendStringInfo(buf, "off %u", xlrec->offnum); + } + else if (info == XLOG_HEAP_LOCK) + { + xl_heap_lock *xlrec = (xl_heap_lock *) rec; + + appendStringInfo(buf, "off %u: xid %u: flags 0x%02X ", + xlrec->offnum, xlrec->locking_xid, xlrec->flags); + out_infobits(buf, xlrec->infobits_set); + } + else if (info == XLOG_HEAP_INPLACE) + { + xl_heap_inplace *xlrec = (xl_heap_inplace *) rec; + + appendStringInfo(buf, "off %u", xlrec->offnum); + } +} +void +heap2_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + info &= XLOG_HEAP_OPMASK; + if (info == XLOG_HEAP2_PRUNE) + { + xl_heap_prune *xlrec = (xl_heap_prune *) rec; + + appendStringInfo(buf, "latestRemovedXid %u nredirected %u ndead %u", + xlrec->latestRemovedXid, + xlrec->nredirected, + xlrec->ndead); + } + else if (info == XLOG_HEAP2_VACUUM) + { + xl_heap_vacuum *xlrec = (xl_heap_vacuum *) rec; + + appendStringInfo(buf, "nunused %u", xlrec->nunused); + } + else if (info == XLOG_HEAP2_FREEZE_PAGE) + { + xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) rec; + + appendStringInfo(buf, "cutoff xid %u ntuples %u", + xlrec->cutoff_xid, xlrec->ntuples); + } + else if (info == XLOG_HEAP2_VISIBLE) + { + xl_heap_visible *xlrec = (xl_heap_visible *) rec; + + appendStringInfo(buf, "cutoff xid %u flags 0x%02X", + xlrec->cutoff_xid, xlrec->flags); + } + else if (info == XLOG_HEAP2_MULTI_INSERT) + { + xl_heap_multi_insert *xlrec = (xl_heap_multi_insert *) rec; + + appendStringInfo(buf, "%d tuples flags 0x%02X", xlrec->ntuples, + xlrec->flags); + } + else if (info == XLOG_HEAP2_LOCK_UPDATED) + { + xl_heap_lock_updated *xlrec = (xl_heap_lock_updated *) rec; + + appendStringInfo(buf, "off %u: xmax %u: flags 0x%02X ", + xlrec->offnum, xlrec->xmax, xlrec->flags); + out_infobits(buf, xlrec->infobits_set); + } + else if (info == XLOG_HEAP2_NEW_CID) + { + xl_heap_new_cid *xlrec = (xl_heap_new_cid *) rec; + + appendStringInfo(buf, "rel %u/%u/%u; tid %u/%u", + xlrec->target_node.spcNode, + xlrec->target_node.dbNode, + xlrec->target_node.relNode, + ItemPointerGetBlockNumber(&(xlrec->target_tid)), + ItemPointerGetOffsetNumber(&(xlrec->target_tid))); + appendStringInfo(buf, "; cmin: %u, cmax: %u, combo: %u", + xlrec->cmin, xlrec->cmax, xlrec->combocid); + } +} + +const char * +heap_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG_HEAP_INSERT: + id = "INSERT"; + break; + case XLOG_HEAP_INSERT | XLOG_HEAP_INIT_PAGE: + id = "INSERT+INIT"; + break; + case XLOG_HEAP_DELETE: + id = "DELETE"; + break; + case XLOG_HEAP_UPDATE: + id = "UPDATE"; + break; + 
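+ /* The +INIT variants mark records whose redo (re)initializes the target page before applying the change. */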
case XLOG_HEAP_UPDATE | XLOG_HEAP_INIT_PAGE: + id = "UPDATE+INIT"; + break; + case XLOG_HEAP_HOT_UPDATE: + id = "HOT_UPDATE"; + break; + case XLOG_HEAP_HOT_UPDATE | XLOG_HEAP_INIT_PAGE: + id = "HOT_UPDATE+INIT"; + break; + case XLOG_HEAP_TRUNCATE: + id = "TRUNCATE"; + break; + case XLOG_HEAP_CONFIRM: + id = "HEAP_CONFIRM"; + break; + case XLOG_HEAP_LOCK: + id = "LOCK"; + break; + case XLOG_HEAP_INPLACE: + id = "INPLACE"; + break; + } + + return id; +} + +const char * +heap2_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG_HEAP2_PRUNE: + id = "PRUNE"; + break; + case XLOG_HEAP2_VACUUM: + id = "VACUUM"; + break; + case XLOG_HEAP2_FREEZE_PAGE: + id = "FREEZE_PAGE"; + break; + case XLOG_HEAP2_VISIBLE: + id = "VISIBLE"; + break; + case XLOG_HEAP2_MULTI_INSERT: + id = "MULTI_INSERT"; + break; + case XLOG_HEAP2_MULTI_INSERT | XLOG_HEAP_INIT_PAGE: + id = "MULTI_INSERT+INIT"; + break; + case XLOG_HEAP2_LOCK_UPDATED: + id = "LOCK_UPDATED"; + break; + case XLOG_HEAP2_NEW_CID: + id = "NEW_CID"; + break; + case XLOG_HEAP2_REWRITE: + id = "REWRITE"; + break; + } + + return id; +} diff --git a/src/backend/access/rmgrdesc/logicalmsgdesc.c b/src/backend/access/rmgrdesc/logicalmsgdesc.c new file mode 100644 index 0000000..d64ce2e --- /dev/null +++ b/src/backend/access/rmgrdesc/logicalmsgdesc.c @@ -0,0 +1,52 @@ +/*------------------------------------------------------------------------- + * + * logicalmsgdesc.c + * rmgr descriptor routines for replication/logical/message.c + * + * Portions Copyright (c) 2015-2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/logicalmsgdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "replication/message.h" + +void +logicalmsg_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + if (info == XLOG_LOGICAL_MESSAGE) + { + xl_logical_message *xlrec = (xl_logical_message *) rec; + char *prefix = xlrec->message; + char *message = xlrec->message + xlrec->prefix_size; + char *sep = ""; + + Assert(prefix[xlrec->prefix_size] != '\0'); + + appendStringInfo(buf, "%s, prefix \"%s\"; payload (%zu bytes): ", + xlrec->transactional ? 
"transactional" : "non-transactional", + prefix, xlrec->message_size); + /* Write message payload as a series of hex bytes */ + for (int cnt = 0; cnt < xlrec->message_size; cnt++) + { + appendStringInfo(buf, "%s%02X", sep, (unsigned char) message[cnt]); + sep = " "; + } + } +} + +const char * +logicalmsg_identify(uint8 info) +{ + if ((info & ~XLR_INFO_MASK) == XLOG_LOGICAL_MESSAGE) + return "MESSAGE"; + + return NULL; +} diff --git a/src/backend/access/rmgrdesc/mxactdesc.c b/src/backend/access/rmgrdesc/mxactdesc.c new file mode 100644 index 0000000..8c37690 --- /dev/null +++ b/src/backend/access/rmgrdesc/mxactdesc.c @@ -0,0 +1,105 @@ +/*------------------------------------------------------------------------- + * + * mxactdesc.c + * rmgr descriptor routines for access/transam/multixact.c + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/mxactdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/multixact.h" + +static void +out_member(StringInfo buf, MultiXactMember *member) +{ + appendStringInfo(buf, "%u ", member->xid); + switch (member->status) + { + case MultiXactStatusForKeyShare: + appendStringInfoString(buf, "(keysh) "); + break; + case MultiXactStatusForShare: + appendStringInfoString(buf, "(sh) "); + break; + case MultiXactStatusForNoKeyUpdate: + appendStringInfoString(buf, "(fornokeyupd) "); + break; + case MultiXactStatusForUpdate: + appendStringInfoString(buf, "(forupd) "); + break; + case MultiXactStatusNoKeyUpdate: + appendStringInfoString(buf, "(nokeyupd) "); + break; + case MultiXactStatusUpdate: + appendStringInfoString(buf, "(upd) "); + break; + default: + appendStringInfoString(buf, "(unk) "); + break; + } +} + +void +multixact_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE || + info == XLOG_MULTIXACT_ZERO_MEM_PAGE) + { + int pageno; + + memcpy(&pageno, rec, sizeof(int)); + appendStringInfo(buf, "%d", pageno); + } + else if (info == XLOG_MULTIXACT_CREATE_ID) + { + xl_multixact_create *xlrec = (xl_multixact_create *) rec; + int i; + + appendStringInfo(buf, "%u offset %u nmembers %d: ", xlrec->mid, + xlrec->moff, xlrec->nmembers); + for (i = 0; i < xlrec->nmembers; i++) + out_member(buf, &xlrec->members[i]); + } + else if (info == XLOG_MULTIXACT_TRUNCATE_ID) + { + xl_multixact_truncate *xlrec = (xl_multixact_truncate *) rec; + + appendStringInfo(buf, "offsets [%u, %u), members [%u, %u)", + xlrec->startTruncOff, xlrec->endTruncOff, + xlrec->startTruncMemb, xlrec->endTruncMemb); + } +} + +const char * +multixact_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG_MULTIXACT_ZERO_OFF_PAGE: + id = "ZERO_OFF_PAGE"; + break; + case XLOG_MULTIXACT_ZERO_MEM_PAGE: + id = "ZERO_MEM_PAGE"; + break; + case XLOG_MULTIXACT_CREATE_ID: + id = "CREATE_ID"; + break; + case XLOG_MULTIXACT_TRUNCATE_ID: + id = "TRUNCATE_ID"; + break; + } + + return id; +} diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c new file mode 100644 index 0000000..710efbd --- /dev/null +++ b/src/backend/access/rmgrdesc/nbtdesc.c @@ -0,0 +1,178 @@ +/*------------------------------------------------------------------------- + * + * nbtdesc.c + * rmgr 
descriptor routines for access/nbtree/nbtxlog.c + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/nbtdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/nbtxlog.h" + +void +btree_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_BTREE_INSERT_LEAF: + case XLOG_BTREE_INSERT_UPPER: + case XLOG_BTREE_INSERT_META: + case XLOG_BTREE_INSERT_POST: + { + xl_btree_insert *xlrec = (xl_btree_insert *) rec; + + appendStringInfo(buf, "off %u", xlrec->offnum); + break; + } + case XLOG_BTREE_SPLIT_L: + case XLOG_BTREE_SPLIT_R: + { + xl_btree_split *xlrec = (xl_btree_split *) rec; + + appendStringInfo(buf, "level %u, firstrightoff %d, newitemoff %d, postingoff %d", + xlrec->level, xlrec->firstrightoff, + xlrec->newitemoff, xlrec->postingoff); + break; + } + case XLOG_BTREE_DEDUP: + { + xl_btree_dedup *xlrec = (xl_btree_dedup *) rec; + + appendStringInfo(buf, "nintervals %u", xlrec->nintervals); + break; + } + case XLOG_BTREE_VACUUM: + { + xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec; + + appendStringInfo(buf, "ndeleted %u; nupdated %u", + xlrec->ndeleted, xlrec->nupdated); + break; + } + case XLOG_BTREE_DELETE: + { + xl_btree_delete *xlrec = (xl_btree_delete *) rec; + + appendStringInfo(buf, "latestRemovedXid %u; ndeleted %u; nupdated %u", + xlrec->latestRemovedXid, xlrec->ndeleted, xlrec->nupdated); + break; + } + case XLOG_BTREE_MARK_PAGE_HALFDEAD: + { + xl_btree_mark_page_halfdead *xlrec = (xl_btree_mark_page_halfdead *) rec; + + appendStringInfo(buf, "topparent %u; leaf %u; left %u; right %u", + xlrec->topparent, xlrec->leafblk, xlrec->leftblk, xlrec->rightblk); + break; + } + case XLOG_BTREE_UNLINK_PAGE_META: + case XLOG_BTREE_UNLINK_PAGE: + { + xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) rec; + + appendStringInfo(buf, "left %u; right %u; level %u; safexid %u:%u; ", + xlrec->leftsib, xlrec->rightsib, xlrec->level, + EpochFromFullTransactionId(xlrec->safexid), + XidFromFullTransactionId(xlrec->safexid)); + appendStringInfo(buf, "leafleft %u; leafright %u; leaftopparent %u", + xlrec->leafleftsib, xlrec->leafrightsib, + xlrec->leaftopparent); + break; + } + case XLOG_BTREE_NEWROOT: + { + xl_btree_newroot *xlrec = (xl_btree_newroot *) rec; + + appendStringInfo(buf, "lev %u", xlrec->level); + break; + } + case XLOG_BTREE_REUSE_PAGE: + { + xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) rec; + + appendStringInfo(buf, "rel %u/%u/%u; latestRemovedXid %u:%u", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode, + EpochFromFullTransactionId(xlrec->latestRemovedFullXid), + XidFromFullTransactionId(xlrec->latestRemovedFullXid)); + break; + } + case XLOG_BTREE_META_CLEANUP: + { + xl_btree_metadata *xlrec; + + xlrec = (xl_btree_metadata *) XLogRecGetBlockData(record, 0, + NULL); + appendStringInfo(buf, "last_cleanup_num_delpages %u", + xlrec->last_cleanup_num_delpages); + break; + } + } +} + +const char * +btree_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG_BTREE_INSERT_LEAF: + id = "INSERT_LEAF"; + break; + case XLOG_BTREE_INSERT_UPPER: + id = "INSERT_UPPER"; + break; + case XLOG_BTREE_INSERT_META: + id = "INSERT_META"; + break; + case XLOG_BTREE_SPLIT_L: + id = 
"SPLIT_L"; + break; + case XLOG_BTREE_SPLIT_R: + id = "SPLIT_R"; + break; + case XLOG_BTREE_INSERT_POST: + id = "INSERT_POST"; + break; + case XLOG_BTREE_DEDUP: + id = "DEDUP"; + break; + case XLOG_BTREE_VACUUM: + id = "VACUUM"; + break; + case XLOG_BTREE_DELETE: + id = "DELETE"; + break; + case XLOG_BTREE_MARK_PAGE_HALFDEAD: + id = "MARK_PAGE_HALFDEAD"; + break; + case XLOG_BTREE_UNLINK_PAGE: + id = "UNLINK_PAGE"; + break; + case XLOG_BTREE_UNLINK_PAGE_META: + id = "UNLINK_PAGE_META"; + break; + case XLOG_BTREE_NEWROOT: + id = "NEWROOT"; + break; + case XLOG_BTREE_REUSE_PAGE: + id = "REUSE_PAGE"; + break; + case XLOG_BTREE_META_CLEANUP: + id = "META_CLEANUP"; + break; + } + + return id; +} diff --git a/src/backend/access/rmgrdesc/relmapdesc.c b/src/backend/access/rmgrdesc/relmapdesc.c new file mode 100644 index 0000000..2f9d4f5 --- /dev/null +++ b/src/backend/access/rmgrdesc/relmapdesc.c @@ -0,0 +1,47 @@ +/*------------------------------------------------------------------------- + * + * relmapdesc.c + * rmgr descriptor routines for utils/cache/relmapper.c + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/relmapdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "utils/relmapper.h" + +void +relmap_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + if (info == XLOG_RELMAP_UPDATE) + { + xl_relmap_update *xlrec = (xl_relmap_update *) rec; + + appendStringInfo(buf, "database %u tablespace %u size %u", + xlrec->dbid, xlrec->tsid, xlrec->nbytes); + } +} + +const char * +relmap_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG_RELMAP_UPDATE: + id = "UPDATE"; + break; + } + + return id; +} diff --git a/src/backend/access/rmgrdesc/replorigindesc.c b/src/backend/access/rmgrdesc/replorigindesc.c new file mode 100644 index 0000000..1f314c4 --- /dev/null +++ b/src/backend/access/rmgrdesc/replorigindesc.c @@ -0,0 +1,62 @@ +/*------------------------------------------------------------------------- + * + * replorigindesc.c + * rmgr descriptor routines for replication/logical/origin.c + * + * Portions Copyright (c) 2015-2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/replorigindesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "replication/origin.h" + +void +replorigin_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_REPLORIGIN_SET: + { + xl_replorigin_set *xlrec; + + xlrec = (xl_replorigin_set *) rec; + + appendStringInfo(buf, "set %u; lsn %X/%X; force: %d", + xlrec->node_id, + LSN_FORMAT_ARGS(xlrec->remote_lsn), + xlrec->force); + break; + } + case XLOG_REPLORIGIN_DROP: + { + xl_replorigin_drop *xlrec; + + xlrec = (xl_replorigin_drop *) rec; + + appendStringInfo(buf, "drop %u", xlrec->node_id); + break; + } + } +} + +const char * +replorigin_identify(uint8 info) +{ + switch (info) + { + case XLOG_REPLORIGIN_SET: + return "SET"; + case XLOG_REPLORIGIN_DROP: + return "DROP"; + default: + return NULL; + } +} diff --git a/src/backend/access/rmgrdesc/seqdesc.c 
b/src/backend/access/rmgrdesc/seqdesc.c new file mode 100644 index 0000000..0bd2946 --- /dev/null +++ b/src/backend/access/rmgrdesc/seqdesc.c @@ -0,0 +1,46 @@ +/*------------------------------------------------------------------------- + * + * seqdesc.c + * rmgr descriptor routines for commands/sequence.c + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/seqdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "commands/sequence.h" + + +void +seq_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + xl_seq_rec *xlrec = (xl_seq_rec *) rec; + + if (info == XLOG_SEQ_LOG) + appendStringInfo(buf, "rel %u/%u/%u", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode); +} + +const char * +seq_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG_SEQ_LOG: + id = "LOG"; + break; + } + + return id; +} diff --git a/src/backend/access/rmgrdesc/smgrdesc.c b/src/backend/access/rmgrdesc/smgrdesc.c new file mode 100644 index 0000000..7755553 --- /dev/null +++ b/src/backend/access/rmgrdesc/smgrdesc.c @@ -0,0 +1,61 @@ +/*------------------------------------------------------------------------- + * + * smgrdesc.c + * rmgr descriptor routines for catalog/storage.c + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/smgrdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "catalog/storage_xlog.h" + + +void +smgr_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + if (info == XLOG_SMGR_CREATE) + { + xl_smgr_create *xlrec = (xl_smgr_create *) rec; + char *path = relpathperm(xlrec->rnode, xlrec->forkNum); + + appendStringInfoString(buf, path); + pfree(path); + } + else if (info == XLOG_SMGR_TRUNCATE) + { + xl_smgr_truncate *xlrec = (xl_smgr_truncate *) rec; + char *path = relpathperm(xlrec->rnode, MAIN_FORKNUM); + + appendStringInfo(buf, "%s to %u blocks flags %d", path, + xlrec->blkno, xlrec->flags); + pfree(path); + } +} + +const char * +smgr_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG_SMGR_CREATE: + id = "CREATE"; + break; + case XLOG_SMGR_TRUNCATE: + id = "TRUNCATE"; + break; + } + + return id; +} diff --git a/src/backend/access/rmgrdesc/spgdesc.c b/src/backend/access/rmgrdesc/spgdesc.c new file mode 100644 index 0000000..0fefe38 --- /dev/null +++ b/src/backend/access/rmgrdesc/spgdesc.c @@ -0,0 +1,164 @@ +/*------------------------------------------------------------------------- + * + * spgdesc.c + * rmgr descriptor routines for access/spgist/spgxlog.c + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/spgdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/spgxlog.h" + +void +spg_desc(StringInfo buf, XLogReaderState 
*record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_SPGIST_ADD_LEAF: + { + spgxlogAddLeaf *xlrec = (spgxlogAddLeaf *) rec; + + appendStringInfo(buf, "off: %u, headoff: %u, parentoff: %u, nodeI: %u", + xlrec->offnumLeaf, xlrec->offnumHeadLeaf, + xlrec->offnumParent, xlrec->nodeI); + if (xlrec->newPage) + appendStringInfoString(buf, " (newpage)"); + if (xlrec->storesNulls) + appendStringInfoString(buf, " (nulls)"); + } + break; + case XLOG_SPGIST_MOVE_LEAFS: + { + spgxlogMoveLeafs *xlrec = (spgxlogMoveLeafs *) rec; + + appendStringInfo(buf, "nmoves: %u, parentoff: %u, nodeI: %u", + xlrec->nMoves, + xlrec->offnumParent, xlrec->nodeI); + if (xlrec->newPage) + appendStringInfoString(buf, " (newpage)"); + if (xlrec->replaceDead) + appendStringInfoString(buf, " (replacedead)"); + if (xlrec->storesNulls) + appendStringInfoString(buf, " (nulls)"); + } + break; + case XLOG_SPGIST_ADD_NODE: + { + spgxlogAddNode *xlrec = (spgxlogAddNode *) rec; + + appendStringInfo(buf, "off: %u, newoff: %u, parentBlk: %d, " + "parentoff: %u, nodeI: %u", + xlrec->offnum, + xlrec->offnumNew, + xlrec->parentBlk, + xlrec->offnumParent, + xlrec->nodeI); + if (xlrec->newPage) + appendStringInfoString(buf, " (newpage)"); + } + break; + case XLOG_SPGIST_SPLIT_TUPLE: + { + spgxlogSplitTuple *xlrec = (spgxlogSplitTuple *) rec; + + appendStringInfo(buf, "prefixoff: %u, postfixoff: %u", + xlrec->offnumPrefix, + xlrec->offnumPostfix); + if (xlrec->newPage) + appendStringInfoString(buf, " (newpage)"); + if (xlrec->postfixBlkSame) + appendStringInfoString(buf, " (same)"); + } + break; + case XLOG_SPGIST_PICKSPLIT: + { + spgxlogPickSplit *xlrec = (spgxlogPickSplit *) rec; + + appendStringInfo(buf, "ndelete: %u, ninsert: %u, inneroff: %u, " + "parentoff: %u, nodeI: %u", + xlrec->nDelete, xlrec->nInsert, + xlrec->offnumInner, + xlrec->offnumParent, xlrec->nodeI); + if (xlrec->innerIsParent) + appendStringInfoString(buf, " (innerIsParent)"); + if (xlrec->storesNulls) + appendStringInfoString(buf, " (nulls)"); + if (xlrec->isRootSplit) + appendStringInfoString(buf, " (isRootSplit)"); + } + break; + case XLOG_SPGIST_VACUUM_LEAF: + { + spgxlogVacuumLeaf *xlrec = (spgxlogVacuumLeaf *) rec; + + appendStringInfo(buf, "ndead: %u, nplaceholder: %u, nmove: %u, nchain: %u", + xlrec->nDead, xlrec->nPlaceholder, + xlrec->nMove, xlrec->nChain); + } + break; + case XLOG_SPGIST_VACUUM_ROOT: + { + spgxlogVacuumRoot *xlrec = (spgxlogVacuumRoot *) rec; + + appendStringInfo(buf, "ndelete: %u", + xlrec->nDelete); + } + break; + case XLOG_SPGIST_VACUUM_REDIRECT: + { + spgxlogVacuumRedirect *xlrec = (spgxlogVacuumRedirect *) rec; + + appendStringInfo(buf, "ntoplaceholder: %u, firstplaceholder: %u, newestredirectxid: %u", + xlrec->nToPlaceholder, + xlrec->firstPlaceholder, + xlrec->newestRedirectXid); + } + break; + } +} + +const char * +spg_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG_SPGIST_ADD_LEAF: + id = "ADD_LEAF"; + break; + case XLOG_SPGIST_MOVE_LEAFS: + id = "MOVE_LEAFS"; + break; + case XLOG_SPGIST_ADD_NODE: + id = "ADD_NODE"; + break; + case XLOG_SPGIST_SPLIT_TUPLE: + id = "SPLIT_TUPLE"; + break; + case XLOG_SPGIST_PICKSPLIT: + id = "PICKSPLIT"; + break; + case XLOG_SPGIST_VACUUM_LEAF: + id = "VACUUM_LEAF"; + break; + case XLOG_SPGIST_VACUUM_ROOT: + id = "VACUUM_ROOT"; + break; + case XLOG_SPGIST_VACUUM_REDIRECT: + id = "VACUUM_REDIRECT"; + break; + } + + return id; +} diff --git 
a/src/backend/access/rmgrdesc/standbydesc.c b/src/backend/access/rmgrdesc/standbydesc.c new file mode 100644 index 0000000..01ee7ac --- /dev/null +++ b/src/backend/access/rmgrdesc/standbydesc.c @@ -0,0 +1,135 @@ +/*------------------------------------------------------------------------- + * + * standbydesc.c + * rmgr descriptor routines for storage/ipc/standby.c + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/standbydesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/standbydefs.h" + +static void +standby_desc_running_xacts(StringInfo buf, xl_running_xacts *xlrec) +{ + int i; + + appendStringInfo(buf, "nextXid %u latestCompletedXid %u oldestRunningXid %u", + xlrec->nextXid, + xlrec->latestCompletedXid, + xlrec->oldestRunningXid); + if (xlrec->xcnt > 0) + { + appendStringInfo(buf, "; %d xacts:", xlrec->xcnt); + for (i = 0; i < xlrec->xcnt; i++) + appendStringInfo(buf, " %u", xlrec->xids[i]); + } + + if (xlrec->subxid_overflow) + appendStringInfoString(buf, "; subxid ovf"); +} + +void +standby_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + if (info == XLOG_STANDBY_LOCK) + { + xl_standby_locks *xlrec = (xl_standby_locks *) rec; + int i; + + for (i = 0; i < xlrec->nlocks; i++) + appendStringInfo(buf, "xid %u db %u rel %u ", + xlrec->locks[i].xid, xlrec->locks[i].dbOid, + xlrec->locks[i].relOid); + } + else if (info == XLOG_RUNNING_XACTS) + { + xl_running_xacts *xlrec = (xl_running_xacts *) rec; + + standby_desc_running_xacts(buf, xlrec); + } + else if (info == XLOG_INVALIDATIONS) + { + xl_invalidations *xlrec = (xl_invalidations *) rec; + + standby_desc_invalidations(buf, xlrec->nmsgs, xlrec->msgs, + xlrec->dbId, xlrec->tsId, + xlrec->relcacheInitFileInval); + } +} + +const char * +standby_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG_STANDBY_LOCK: + id = "LOCK"; + break; + case XLOG_RUNNING_XACTS: + id = "RUNNING_XACTS"; + break; + case XLOG_INVALIDATIONS: + id = "INVALIDATIONS"; + break; + } + + return id; +} + +/* + * This routine is used by both standby_desc and xact_desc, because + * transaction commits and XLOG_INVALIDATIONS messages contain invalidations; + * it seems pointless to duplicate the code. 
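+ * The dbId and tsId arguments are consulted only when relcacheInitFileInval is set; they identify the database whose relcache init file must be invalidated.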
+ */ +void +standby_desc_invalidations(StringInfo buf, + int nmsgs, SharedInvalidationMessage *msgs, + Oid dbId, Oid tsId, + bool relcacheInitFileInval) +{ + int i; + + /* Do nothing if there are no invalidation messages */ + if (nmsgs <= 0) + return; + + if (relcacheInitFileInval) + appendStringInfo(buf, "; relcache init file inval dbid %u tsid %u", + dbId, tsId); + + appendStringInfoString(buf, "; inval msgs:"); + for (i = 0; i < nmsgs; i++) + { + SharedInvalidationMessage *msg = &msgs[i]; + + if (msg->id >= 0) + appendStringInfo(buf, " catcache %d", msg->id); + else if (msg->id == SHAREDINVALCATALOG_ID) + appendStringInfo(buf, " catalog %u", msg->cat.catId); + else if (msg->id == SHAREDINVALRELCACHE_ID) + appendStringInfo(buf, " relcache %u", msg->rc.relId); + /* not expected, but print something anyway */ + else if (msg->id == SHAREDINVALSMGR_ID) + appendStringInfoString(buf, " smgr"); + /* not expected, but print something anyway */ + else if (msg->id == SHAREDINVALRELMAP_ID) + appendStringInfo(buf, " relmap db %u", msg->rm.dbId); + else if (msg->id == SHAREDINVALSNAPSHOT_ID) + appendStringInfo(buf, " snapshot %u", msg->sn.relId); + else + appendStringInfo(buf, " unrecognized id %d", msg->id); + } +} diff --git a/src/backend/access/rmgrdesc/tblspcdesc.c b/src/backend/access/rmgrdesc/tblspcdesc.c new file mode 100644 index 0000000..cb356ea --- /dev/null +++ b/src/backend/access/rmgrdesc/tblspcdesc.c @@ -0,0 +1,56 @@ +/*------------------------------------------------------------------------- + * + * tblspcdesc.c + * rmgr descriptor routines for commands/tablespace.c + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/tblspcdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "commands/tablespace.h" + + +void +tblspc_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + if (info == XLOG_TBLSPC_CREATE) + { + xl_tblspc_create_rec *xlrec = (xl_tblspc_create_rec *) rec; + + appendStringInfo(buf, "%u \"%s\"", xlrec->ts_id, xlrec->ts_path); + } + else if (info == XLOG_TBLSPC_DROP) + { + xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) rec; + + appendStringInfo(buf, "%u", xlrec->ts_id); + } +} + +const char * +tblspc_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG_TBLSPC_CREATE: + id = "CREATE"; + break; + case XLOG_TBLSPC_DROP: + id = "DROP"; + break; + } + + return id; +} diff --git a/src/backend/access/rmgrdesc/xactdesc.c b/src/backend/access/rmgrdesc/xactdesc.c new file mode 100644 index 0000000..4b0d10f --- /dev/null +++ b/src/backend/access/rmgrdesc/xactdesc.c @@ -0,0 +1,438 @@ +/*------------------------------------------------------------------------- + * + * xactdesc.c + * rmgr descriptor routines for access/transam/xact.c + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/xactdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/transam.h" +#include "access/xact.h" +#include "storage/sinval.h" +#include "storage/standbydefs.h" +#include "utils/timestamp.h" + +/* + * Parse 
the WAL format of an xact commit and abort records into an easier to + * understand format. + * + * This routines are in xactdesc.c because they're accessed in backend (when + * replaying WAL) and frontend (pg_waldump) code. This file is the only xact + * specific one shared between both. They're complicated enough that + * duplication would be bothersome. + */ + +void +ParseCommitRecord(uint8 info, xl_xact_commit *xlrec, xl_xact_parsed_commit *parsed) +{ + char *data = ((char *) xlrec) + MinSizeOfXactCommit; + + memset(parsed, 0, sizeof(*parsed)); + + parsed->xinfo = 0; /* default, if no XLOG_XACT_HAS_INFO is + * present */ + + parsed->xact_time = xlrec->xact_time; + + if (info & XLOG_XACT_HAS_INFO) + { + xl_xact_xinfo *xl_xinfo = (xl_xact_xinfo *) data; + + parsed->xinfo = xl_xinfo->xinfo; + + data += sizeof(xl_xact_xinfo); + } + + if (parsed->xinfo & XACT_XINFO_HAS_DBINFO) + { + xl_xact_dbinfo *xl_dbinfo = (xl_xact_dbinfo *) data; + + parsed->dbId = xl_dbinfo->dbId; + parsed->tsId = xl_dbinfo->tsId; + + data += sizeof(xl_xact_dbinfo); + } + + if (parsed->xinfo & XACT_XINFO_HAS_SUBXACTS) + { + xl_xact_subxacts *xl_subxacts = (xl_xact_subxacts *) data; + + parsed->nsubxacts = xl_subxacts->nsubxacts; + parsed->subxacts = xl_subxacts->subxacts; + + data += MinSizeOfXactSubxacts; + data += parsed->nsubxacts * sizeof(TransactionId); + } + + if (parsed->xinfo & XACT_XINFO_HAS_RELFILENODES) + { + xl_xact_relfilenodes *xl_relfilenodes = (xl_xact_relfilenodes *) data; + + parsed->nrels = xl_relfilenodes->nrels; + parsed->xnodes = xl_relfilenodes->xnodes; + + data += MinSizeOfXactRelfilenodes; + data += xl_relfilenodes->nrels * sizeof(RelFileNode); + } + + if (parsed->xinfo & XACT_XINFO_HAS_INVALS) + { + xl_xact_invals *xl_invals = (xl_xact_invals *) data; + + parsed->nmsgs = xl_invals->nmsgs; + parsed->msgs = xl_invals->msgs; + + data += MinSizeOfXactInvals; + data += xl_invals->nmsgs * sizeof(SharedInvalidationMessage); + } + + if (parsed->xinfo & XACT_XINFO_HAS_TWOPHASE) + { + xl_xact_twophase *xl_twophase = (xl_xact_twophase *) data; + + parsed->twophase_xid = xl_twophase->xid; + + data += sizeof(xl_xact_twophase); + + if (parsed->xinfo & XACT_XINFO_HAS_GID) + { + strlcpy(parsed->twophase_gid, data, sizeof(parsed->twophase_gid)); + data += strlen(data) + 1; + } + } + + /* Note: no alignment is guaranteed after this point */ + + if (parsed->xinfo & XACT_XINFO_HAS_ORIGIN) + { + xl_xact_origin xl_origin; + + /* no alignment is guaranteed, so copy onto stack */ + memcpy(&xl_origin, data, sizeof(xl_origin)); + + parsed->origin_lsn = xl_origin.origin_lsn; + parsed->origin_timestamp = xl_origin.origin_timestamp; + + data += sizeof(xl_xact_origin); + } +} + +void +ParseAbortRecord(uint8 info, xl_xact_abort *xlrec, xl_xact_parsed_abort *parsed) +{ + char *data = ((char *) xlrec) + MinSizeOfXactAbort; + + memset(parsed, 0, sizeof(*parsed)); + + parsed->xinfo = 0; /* default, if no XLOG_XACT_HAS_INFO is + * present */ + + parsed->xact_time = xlrec->xact_time; + + if (info & XLOG_XACT_HAS_INFO) + { + xl_xact_xinfo *xl_xinfo = (xl_xact_xinfo *) data; + + parsed->xinfo = xl_xinfo->xinfo; + + data += sizeof(xl_xact_xinfo); + } + + if (parsed->xinfo & XACT_XINFO_HAS_DBINFO) + { + xl_xact_dbinfo *xl_dbinfo = (xl_xact_dbinfo *) data; + + parsed->dbId = xl_dbinfo->dbId; + parsed->tsId = xl_dbinfo->tsId; + + data += sizeof(xl_xact_dbinfo); + } + + if (parsed->xinfo & XACT_XINFO_HAS_SUBXACTS) + { + xl_xact_subxacts *xl_subxacts = (xl_xact_subxacts *) data; + + parsed->nsubxacts = xl_subxacts->nsubxacts; + 
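+ /* As in ParseCommitRecord above, the parsed arrays point directly into the WAL record; nothing is copied. */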
parsed->subxacts = xl_subxacts->subxacts; + + data += MinSizeOfXactSubxacts; + data += parsed->nsubxacts * sizeof(TransactionId); + } + + if (parsed->xinfo & XACT_XINFO_HAS_RELFILENODES) + { + xl_xact_relfilenodes *xl_relfilenodes = (xl_xact_relfilenodes *) data; + + parsed->nrels = xl_relfilenodes->nrels; + parsed->xnodes = xl_relfilenodes->xnodes; + + data += MinSizeOfXactRelfilenodes; + data += xl_relfilenodes->nrels * sizeof(RelFileNode); + } + + if (parsed->xinfo & XACT_XINFO_HAS_TWOPHASE) + { + xl_xact_twophase *xl_twophase = (xl_xact_twophase *) data; + + parsed->twophase_xid = xl_twophase->xid; + + data += sizeof(xl_xact_twophase); + + if (parsed->xinfo & XACT_XINFO_HAS_GID) + { + strlcpy(parsed->twophase_gid, data, sizeof(parsed->twophase_gid)); + data += strlen(data) + 1; + } + } + + /* Note: no alignment is guaranteed after this point */ + + if (parsed->xinfo & XACT_XINFO_HAS_ORIGIN) + { + xl_xact_origin xl_origin; + + /* no alignment is guaranteed, so copy onto stack */ + memcpy(&xl_origin, data, sizeof(xl_origin)); + + parsed->origin_lsn = xl_origin.origin_lsn; + parsed->origin_timestamp = xl_origin.origin_timestamp; + + data += sizeof(xl_xact_origin); + } +} + +/* + * ParsePrepareRecord + */ +void +ParsePrepareRecord(uint8 info, xl_xact_prepare *xlrec, xl_xact_parsed_prepare *parsed) +{ + char *bufptr; + + bufptr = ((char *) xlrec) + MAXALIGN(sizeof(xl_xact_prepare)); + + memset(parsed, 0, sizeof(*parsed)); + + parsed->xact_time = xlrec->prepared_at; + parsed->origin_lsn = xlrec->origin_lsn; + parsed->origin_timestamp = xlrec->origin_timestamp; + parsed->twophase_xid = xlrec->xid; + parsed->dbId = xlrec->database; + parsed->nsubxacts = xlrec->nsubxacts; + parsed->nrels = xlrec->ncommitrels; + parsed->nabortrels = xlrec->nabortrels; + parsed->nmsgs = xlrec->ninvalmsgs; + + strncpy(parsed->twophase_gid, bufptr, xlrec->gidlen); + bufptr += MAXALIGN(xlrec->gidlen); + + parsed->subxacts = (TransactionId *) bufptr; + bufptr += MAXALIGN(xlrec->nsubxacts * sizeof(TransactionId)); + + parsed->xnodes = (RelFileNode *) bufptr; + bufptr += MAXALIGN(xlrec->ncommitrels * sizeof(RelFileNode)); + + parsed->abortnodes = (RelFileNode *) bufptr; + bufptr += MAXALIGN(xlrec->nabortrels * sizeof(RelFileNode)); + + parsed->msgs = (SharedInvalidationMessage *) bufptr; + bufptr += MAXALIGN(xlrec->ninvalmsgs * sizeof(SharedInvalidationMessage)); +} + +static void +xact_desc_relations(StringInfo buf, char *label, int nrels, + RelFileNode *xnodes) +{ + int i; + + if (nrels > 0) + { + appendStringInfo(buf, "; %s:", label); + for (i = 0; i < nrels; i++) + { + char *path = relpathperm(xnodes[i], MAIN_FORKNUM); + + appendStringInfo(buf, " %s", path); + pfree(path); + } + } +} + +static void +xact_desc_subxacts(StringInfo buf, int nsubxacts, TransactionId *subxacts) +{ + int i; + + if (nsubxacts > 0) + { + appendStringInfoString(buf, "; subxacts:"); + for (i = 0; i < nsubxacts; i++) + appendStringInfo(buf, " %u", subxacts[i]); + } +} + +static void +xact_desc_commit(StringInfo buf, uint8 info, xl_xact_commit *xlrec, RepOriginId origin_id) +{ + xl_xact_parsed_commit parsed; + + ParseCommitRecord(info, xlrec, &parsed); + + /* If this is a prepared xact, show the xid of the original xact */ + if (TransactionIdIsValid(parsed.twophase_xid)) + appendStringInfo(buf, "%u: ", parsed.twophase_xid); + + appendStringInfoString(buf, timestamptz_to_str(xlrec->xact_time)); + + xact_desc_relations(buf, "rels", parsed.nrels, parsed.xnodes); + xact_desc_subxacts(buf, parsed.nsubxacts, parsed.subxacts); + + 
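+ /* Invalidation messages attached to the commit are printed by the descriptor routine shared with standbydesc.c. */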
standby_desc_invalidations(buf, parsed.nmsgs, parsed.msgs, parsed.dbId, + parsed.tsId, + XactCompletionRelcacheInitFileInval(parsed.xinfo)); + + if (XactCompletionForceSyncCommit(parsed.xinfo)) + appendStringInfoString(buf, "; sync"); + + if (parsed.xinfo & XACT_XINFO_HAS_ORIGIN) + { + appendStringInfo(buf, "; origin: node %u, lsn %X/%X, at %s", + origin_id, + LSN_FORMAT_ARGS(parsed.origin_lsn), + timestamptz_to_str(parsed.origin_timestamp)); + } +} + +static void +xact_desc_abort(StringInfo buf, uint8 info, xl_xact_abort *xlrec) +{ + xl_xact_parsed_abort parsed; + + ParseAbortRecord(info, xlrec, &parsed); + + /* If this is a prepared xact, show the xid of the original xact */ + if (TransactionIdIsValid(parsed.twophase_xid)) + appendStringInfo(buf, "%u: ", parsed.twophase_xid); + + appendStringInfoString(buf, timestamptz_to_str(xlrec->xact_time)); + + xact_desc_relations(buf, "rels", parsed.nrels, parsed.xnodes); + xact_desc_subxacts(buf, parsed.nsubxacts, parsed.subxacts); +} + +static void +xact_desc_prepare(StringInfo buf, uint8 info, xl_xact_prepare *xlrec) +{ + xl_xact_parsed_prepare parsed; + + ParsePrepareRecord(info, xlrec, &parsed); + + appendStringInfo(buf, "gid %s: ", parsed.twophase_gid); + appendStringInfoString(buf, timestamptz_to_str(parsed.xact_time)); + + xact_desc_relations(buf, "rels(commit)", parsed.nrels, parsed.xnodes); + xact_desc_relations(buf, "rels(abort)", parsed.nabortrels, + parsed.abortnodes); + xact_desc_subxacts(buf, parsed.nsubxacts, parsed.subxacts); + + standby_desc_invalidations(buf, parsed.nmsgs, parsed.msgs, parsed.dbId, + parsed.tsId, xlrec->initfileinval); +} + +static void +xact_desc_assignment(StringInfo buf, xl_xact_assignment *xlrec) +{ + int i; + + appendStringInfoString(buf, "subxacts:"); + + for (i = 0; i < xlrec->nsubxacts; i++) + appendStringInfo(buf, " %u", xlrec->xsub[i]); +} + +void +xact_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK; + + if (info == XLOG_XACT_COMMIT || info == XLOG_XACT_COMMIT_PREPARED) + { + xl_xact_commit *xlrec = (xl_xact_commit *) rec; + + xact_desc_commit(buf, XLogRecGetInfo(record), xlrec, + XLogRecGetOrigin(record)); + } + else if (info == XLOG_XACT_ABORT || info == XLOG_XACT_ABORT_PREPARED) + { + xl_xact_abort *xlrec = (xl_xact_abort *) rec; + + xact_desc_abort(buf, XLogRecGetInfo(record), xlrec); + } + else if (info == XLOG_XACT_PREPARE) + { + xl_xact_prepare *xlrec = (xl_xact_prepare *) rec; + + xact_desc_prepare(buf, XLogRecGetInfo(record), xlrec); + } + else if (info == XLOG_XACT_ASSIGNMENT) + { + xl_xact_assignment *xlrec = (xl_xact_assignment *) rec; + + /* + * Note that we ignore the WAL record's xid, since we're more + * interested in the top-level xid that issued the record and which + * xids are being reported here. 
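+ * (Both are shown: xtop is the issuing top-level xid, and the xsub[] array holds the subtransaction xids being assigned to it.)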
+ */ + appendStringInfo(buf, "xtop %u: ", xlrec->xtop); + xact_desc_assignment(buf, xlrec); + } + else if (info == XLOG_XACT_INVALIDATIONS) + { + xl_xact_invals *xlrec = (xl_xact_invals *) rec; + + standby_desc_invalidations(buf, xlrec->nmsgs, xlrec->msgs, InvalidOid, + InvalidOid, false); + } +} + +const char * +xact_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & XLOG_XACT_OPMASK) + { + case XLOG_XACT_COMMIT: + id = "COMMIT"; + break; + case XLOG_XACT_PREPARE: + id = "PREPARE"; + break; + case XLOG_XACT_ABORT: + id = "ABORT"; + break; + case XLOG_XACT_COMMIT_PREPARED: + id = "COMMIT_PREPARED"; + break; + case XLOG_XACT_ABORT_PREPARED: + id = "ABORT_PREPARED"; + break; + case XLOG_XACT_ASSIGNMENT: + id = "ASSIGNMENT"; + break; + case XLOG_XACT_INVALIDATIONS: + id = "INVALIDATION"; + break; + } + + return id; +} diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c new file mode 100644 index 0000000..5bf2346 --- /dev/null +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -0,0 +1,202 @@ +/*------------------------------------------------------------------------- + * + * xlogdesc.c + * rmgr descriptor routines for access/transam/xlog.c + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/xlogdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/transam.h" +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "catalog/pg_control.h" +#include "utils/guc.h" +#include "utils/timestamp.h" + +/* + * GUC support + */ +const struct config_enum_entry wal_level_options[] = { + {"minimal", WAL_LEVEL_MINIMAL, false}, + {"replica", WAL_LEVEL_REPLICA, false}, + {"archive", WAL_LEVEL_REPLICA, true}, /* deprecated */ + {"hot_standby", WAL_LEVEL_REPLICA, true}, /* deprecated */ + {"logical", WAL_LEVEL_LOGICAL, false}, + {NULL, 0, false} +}; + +void +xlog_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + if (info == XLOG_CHECKPOINT_SHUTDOWN || + info == XLOG_CHECKPOINT_ONLINE) + { + CheckPoint *checkpoint = (CheckPoint *) rec; + + appendStringInfo(buf, "redo %X/%X; " + "tli %u; prev tli %u; fpw %s; xid %u:%u; oid %u; multi %u; offset %u; " + "oldest xid %u in DB %u; oldest multi %u in DB %u; " + "oldest/newest commit timestamp xid: %u/%u; " + "oldest running xid %u; %s", + LSN_FORMAT_ARGS(checkpoint->redo), + checkpoint->ThisTimeLineID, + checkpoint->PrevTimeLineID, + checkpoint->fullPageWrites ? "true" : "false", + EpochFromFullTransactionId(checkpoint->nextXid), + XidFromFullTransactionId(checkpoint->nextXid), + checkpoint->nextOid, + checkpoint->nextMulti, + checkpoint->nextMultiOffset, + checkpoint->oldestXid, + checkpoint->oldestXidDB, + checkpoint->oldestMulti, + checkpoint->oldestMultiDB, + checkpoint->oldestCommitTsXid, + checkpoint->newestCommitTsXid, + checkpoint->oldestActiveXid, + (info == XLOG_CHECKPOINT_SHUTDOWN) ? 
"shutdown" : "online"); + } + else if (info == XLOG_NEXTOID) + { + Oid nextOid; + + memcpy(&nextOid, rec, sizeof(Oid)); + appendStringInfo(buf, "%u", nextOid); + } + else if (info == XLOG_RESTORE_POINT) + { + xl_restore_point *xlrec = (xl_restore_point *) rec; + + appendStringInfoString(buf, xlrec->rp_name); + } + else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT) + { + /* no further information to print */ + } + else if (info == XLOG_BACKUP_END) + { + XLogRecPtr startpoint; + + memcpy(&startpoint, rec, sizeof(XLogRecPtr)); + appendStringInfo(buf, "%X/%X", LSN_FORMAT_ARGS(startpoint)); + } + else if (info == XLOG_PARAMETER_CHANGE) + { + xl_parameter_change xlrec; + const char *wal_level_str; + const struct config_enum_entry *entry; + + memcpy(&xlrec, rec, sizeof(xl_parameter_change)); + + /* Find a string representation for wal_level */ + wal_level_str = "?"; + for (entry = wal_level_options; entry->name; entry++) + { + if (entry->val == xlrec.wal_level) + { + wal_level_str = entry->name; + break; + } + } + + appendStringInfo(buf, "max_connections=%d max_worker_processes=%d " + "max_wal_senders=%d max_prepared_xacts=%d " + "max_locks_per_xact=%d wal_level=%s " + "wal_log_hints=%s track_commit_timestamp=%s", + xlrec.MaxConnections, + xlrec.max_worker_processes, + xlrec.max_wal_senders, + xlrec.max_prepared_xacts, + xlrec.max_locks_per_xact, + wal_level_str, + xlrec.wal_log_hints ? "on" : "off", + xlrec.track_commit_timestamp ? "on" : "off"); + } + else if (info == XLOG_FPW_CHANGE) + { + bool fpw; + + memcpy(&fpw, rec, sizeof(bool)); + appendStringInfoString(buf, fpw ? "true" : "false"); + } + else if (info == XLOG_END_OF_RECOVERY) + { + xl_end_of_recovery xlrec; + + memcpy(&xlrec, rec, sizeof(xl_end_of_recovery)); + appendStringInfo(buf, "tli %u; prev tli %u; time %s", + xlrec.ThisTimeLineID, xlrec.PrevTimeLineID, + timestamptz_to_str(xlrec.end_time)); + } + else if (info == XLOG_OVERWRITE_CONTRECORD) + { + xl_overwrite_contrecord xlrec; + + memcpy(&xlrec, rec, sizeof(xl_overwrite_contrecord)); + appendStringInfo(buf, "lsn %X/%X; time %s", + LSN_FORMAT_ARGS(xlrec.overwritten_lsn), + timestamptz_to_str(xlrec.overwrite_time)); + } +} + +const char * +xlog_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG_CHECKPOINT_SHUTDOWN: + id = "CHECKPOINT_SHUTDOWN"; + break; + case XLOG_CHECKPOINT_ONLINE: + id = "CHECKPOINT_ONLINE"; + break; + case XLOG_NOOP: + id = "NOOP"; + break; + case XLOG_NEXTOID: + id = "NEXTOID"; + break; + case XLOG_SWITCH: + id = "SWITCH"; + break; + case XLOG_BACKUP_END: + id = "BACKUP_END"; + break; + case XLOG_PARAMETER_CHANGE: + id = "PARAMETER_CHANGE"; + break; + case XLOG_RESTORE_POINT: + id = "RESTORE_POINT"; + break; + case XLOG_FPW_CHANGE: + id = "FPW_CHANGE"; + break; + case XLOG_END_OF_RECOVERY: + id = "END_OF_RECOVERY"; + break; + case XLOG_OVERWRITE_CONTRECORD: + id = "OVERWRITE_CONTRECORD"; + break; + case XLOG_FPI: + id = "FPI"; + break; + case XLOG_FPI_FOR_HINT: + id = "FPI_FOR_HINT"; + break; + } + + return id; +} diff --git a/src/backend/access/spgist/Makefile b/src/backend/access/spgist/Makefile new file mode 100644 index 0000000..8ed3b4a --- /dev/null +++ b/src/backend/access/spgist/Makefile @@ -0,0 +1,28 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/spgist +# +# IDENTIFICATION +# src/backend/access/spgist/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/spgist 
+top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + spgdoinsert.o \ + spginsert.o \ + spgkdtreeproc.o \ + spgproc.o \ + spgquadtreeproc.o \ + spgscan.o \ + spgtextproc.o \ + spgutils.o \ + spgvacuum.o \ + spgvalidate.o \ + spgxlog.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/spgist/README b/src/backend/access/spgist/README new file mode 100644 index 0000000..7117e02 --- /dev/null +++ b/src/backend/access/spgist/README @@ -0,0 +1,389 @@ +src/backend/access/spgist/README + +SP-GiST is an abbreviation of space-partitioned GiST. It provides a +generalized infrastructure for implementing space-partitioned data +structures, such as quadtrees, k-d trees, and radix trees (tries). When +implemented in main memory, these structures are usually designed as a set of +dynamically-allocated nodes linked by pointers. This is not suitable for +direct storing on disk, since the chains of pointers can be rather long and +require too many disk accesses. In contrast, disk based data structures +should have a high fanout to minimize I/O. The challenge is to map tree +nodes to disk pages in such a way that the search algorithm accesses only a +few disk pages, even if it traverses many nodes. + + +COMMON STRUCTURE DESCRIPTION + +Logically, an SP-GiST tree is a set of tuples, each of which can be either +an inner or leaf tuple. Each inner tuple contains "nodes", which are +(label,pointer) pairs, where the pointer (ItemPointerData) is a pointer to +another inner tuple or to the head of a list of leaf tuples. Inner tuples +can have different numbers of nodes (children). Branches can be of different +depth (actually, there is no control or code to support balancing), which +means that the tree is non-balanced. However, leaf and inner tuples cannot +be intermixed at the same level: a downlink from a node of an inner tuple +leads either to one inner tuple, or to a list of leaf tuples. + +The SP-GiST core requires that inner and leaf tuples fit on a single index +page, and even more stringently that the list of leaf tuples reached from a +single inner-tuple node all be stored on the same index page. (Restricting +such lists to not cross pages reduces seeks, and allows the list links to be +stored as simple 2-byte OffsetNumbers.) SP-GiST index opclasses should +therefore ensure that not too many nodes can be needed in one inner tuple, +and that inner-tuple prefixes and leaf-node datum values not be too large. + +Inner and leaf tuples are stored separately: the former are stored only on +"inner" pages, the latter only on "leaf" pages. Also, there are special +restrictions on the root page. Early in an index's life, when there is only +one page's worth of data, the root page contains an unorganized set of leaf +tuples. After the first page split has occurred, the root is required to +contain exactly one inner tuple. + +When the search traversal algorithm reaches an inner tuple, it chooses a set +of nodes to continue tree traverse in depth. If it reaches a leaf page it +scans a list of leaf tuples to find the ones that match the query. SP-GiST +also supports ordered (nearest-neighbor) searches - that is during scan pending +nodes are put into priority queue, so traversal is performed by the +closest-first model. + + +The insertion algorithm descends the tree similarly, except it must choose +just one node to descend to from each inner tuple. 
Insertion might also have +to modify the inner tuple before it can descend: it could add a new node, or +it could "split" the tuple to obtain a less-specific prefix that can match +the value to be inserted. If it's necessary to append a new leaf tuple to a +list and there is no free space on page, then SP-GiST creates a new inner +tuple and distributes leaf tuples into a set of lists on, perhaps, several +pages. + +An inner tuple consists of: + + optional prefix value - all successors must be consistent with it. + Example: + radix tree - prefix value is a common prefix string + quad tree - centroid + k-d tree - one coordinate + + list of nodes, where node is a (label, pointer) pair. + Example of a label: a single character for radix tree + +A leaf tuple consists of: + + a leaf value + Example: + radix tree - the rest of string (postfix) + quad and k-d tree - the point itself + + ItemPointer to the corresponding heap tuple + nextOffset number of next leaf tuple in a chain on a leaf page + + optional nulls bitmask + optional INCLUDE-column values + +For compatibility with pre-v14 indexes, a leaf tuple has a nulls bitmask +only if there are null values (among the leaf value and the INCLUDE values) +*and* there is at least one INCLUDE column. The null-ness of the leaf +value can be inferred from whether the tuple is on a "nulls page" (see below) +so it is not necessary to represent it explicitly. But we include it anyway +in a bitmask used with INCLUDE values, so that standard tuple deconstruction +code can be used. + + +NULLS HANDLING + +We assume that SPGiST-indexable operators are strict (can never succeed for +null inputs). It is still desirable to index nulls, so that whole-table +indexscans are possible and so that "x IS NULL" can be implemented by an +SPGiST indexscan. However, we prefer that SPGiST index opclasses not have +to cope with nulls. Therefore, the main tree of an SPGiST index does not +include any null entries. We store null entries in a separate SPGiST tree +occupying a disjoint set of pages (in particular, its own root page). +Insertions and searches in the nulls tree do not use any of the +opclass-supplied functions, but just use hardwired logic comparable to +AllTheSame cases in the normal tree. + + +INSERTION ALGORITHM + +Insertion algorithm is designed to keep the tree in a consistent state at +any moment. Here is a simplified insertion algorithm specification +(numbers refer to notes below): + + Start with the first tuple on the root page (1) + + loop: + if (page is leaf) then + if (enough space) + insert on page and exit (5) + else (7) + call PickSplitFn() (2) + end if + else + switch (chooseFn()) + case MatchNode - descend through selected node + case AddNode - add node and then retry chooseFn (3, 6) + case SplitTuple - split inner tuple to prefix and postfix, then + retry chooseFn with the prefix tuple (4, 6) + end if + +Notes: + +(1) Initially, we just dump leaf tuples into the root page until it is full; +then we split it. Once the root is not a leaf page, it can have only one +inner tuple, so as to keep the amount of free space on the root as large as +possible. Both of these rules are meant to postpone doing PickSplit on the +root for as long as possible, so that the topmost partitioning of the search +space is as good as we can easily make it. + +(2) Current implementation allows to do picksplit and insert a new leaf tuple +in one operation, if the new list of leaf tuples fits on one page. 
It's +always possible for trees with small nodes like quad tree or k-d tree, but +radix trees may require another picksplit. + +(3) Addition of node must keep size of inner tuple small enough to fit on a +page. After addition, inner tuple could become too large to be stored on +current page because of other tuples on page. In this case it will be moved +to another inner page (see notes about page management). When moving tuple to +another page, we can't change the numbers of other tuples on the page, else +we'd make downlink pointers to them invalid. To prevent that, SP-GiST leaves +a "placeholder" tuple, which can be reused later whenever another tuple is +added to the page. See also Concurrency and Vacuum sections below. Right now +only radix trees could add a node to the tuple; quad trees and k-d trees +make all possible nodes at once in PickSplitFn() call. + +(4) Prefix value could only partially match a new value, so the SplitTuple +action allows breaking the current tree branch into upper and lower sections. +Another way to say it is that we can split the current inner tuple into +"prefix" and "postfix" parts, where the prefix part is able to match the +incoming new value. Consider example of insertion into a radix tree. We use +the following notation, where tuple's id is just for discussion (no such id +is actually stored): + +inner tuple: {tuple id}(prefix string)[ comma separated list of node labels ] +leaf tuple: {tuple id} + +Suppose we need to insert string 'www.gogo.com' into inner tuple + + {1}(www.google.com/)[a, i] + +The string does not match the prefix so we cannot descend. We must +split the inner tuple into two tuples: + + {2}(www.go)[o] - prefix tuple + | + {3}(gle.com/)[a,i] - postfix tuple + +On the next iteration of loop we find that 'www.gogo.com' matches the +prefix, but not any node label, so we add a node [g] to tuple {2}: + + NIL (no child exists yet) + | + {2}(www.go)[o, g] + | + {3}(gle.com/)[a,i] + +Now we can descend through the [g] node, which will cause us to update +the target string to just 'o.com'. Finally, we'll insert a leaf tuple +bearing that string: + + {4} + | + {2}(www.go)[o, g] + | + {3}(gle.com/)[a,i] + +As we can see, the original tuple's node array moves to postfix tuple without +any changes. Note also that SP-GiST core assumes that prefix tuple is not +larger than old inner tuple. That allows us to store prefix tuple directly +in place of old inner tuple. SP-GiST core will try to store postfix tuple on +the same page if possible, but will use another page if there is not enough +free space (see notes 5 and 6). Currently, quad and k-d trees don't use this +feature, because they have no concept of a prefix being "inconsistent" with +any new value. They grow their depth only by PickSplitFn() call. + +(5) If pointer from node of parent is a NIL pointer, algorithm chooses a leaf +page to store on. At first, it tries to use the last-used leaf page with the +largest free space (which we track in each backend) to better utilize disk +space. If that's not large enough, then the algorithm allocates a new page. + +(6) Management of inner pages is very similar to management of leaf pages, +described in (5). + +(7) Actually, current implementation can move the whole list of leaf tuples +and a new tuple to another page, if the list is short enough. This improves +space utilization, but doesn't change the basis of the algorithm. 
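+
+To make the SplitTuple/AddNode steps of notes (3) and (4) concrete, here is a
+small standalone C sketch (illustration only, not SP-GiST code; the node list
+[a,i] and the two strings are hard-coded from the radix-tree example above).
+It derives the prefix tuple, the postfix tuple, the new node label, and the
+remaining leaf value:
+
+    #include <stdio.h>
+
+    int
+    main(void)
+    {
+        const char *tuple_prefix = "www.google.com/";  /* prefix of {1} */
+        const char *new_value = "www.gogo.com";        /* value being inserted */
+        size_t      common = 0;
+
+        /* length of the longest common prefix of the two strings */
+        while (tuple_prefix[common] && new_value[common] &&
+               tuple_prefix[common] == new_value[common])
+            common++;
+
+        /* SplitTuple: {2} keeps the common prefix, {3} keeps the rest */
+        printf("prefix tuple  {2}: (%.*s)[%c]\n",
+               (int) common, tuple_prefix, tuple_prefix[common]);
+        printf("postfix tuple {3}: (%s)[a,i]\n", tuple_prefix + common + 1);
+
+        /* AddNode on the next loop iteration: label for the unmatched char */
+        printf("node added to {2}: [%c]\n", new_value[common]);
+
+        /* what is left of the value after descending through that node */
+        printf("leaf value    {4}: %s\n", new_value + common + 1);
+        return 0;
+    }
+
+Running it prints (www.go)[o], (gle.com/)[a,i], [g] and o.com, matching
+tuples {2} and {3}, the added node, and the leaf tuple {4} shown in note (4).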
+ + +CONCURRENCY + +While descending the tree, the insertion algorithm holds exclusive lock on +two tree levels at a time, ie both parent and child pages (but parent and +child pages can be the same, see notes above). There is a possibility of +deadlock between two insertions if there are cross-referenced pages in +different branches. That is, if inner tuple on page M has a child on page N +while an inner tuple from another branch is on page N and has a child on +page M, then two insertions descending the two branches could deadlock, +since they will each hold their parent page's lock while trying to get the +child page's lock. + +Currently, we deal with this by conditionally locking buffers as we descend +the tree. If we fail to get lock on a buffer, we release both buffers and +restart the insertion process. This is potentially inefficient, but the +locking costs of a more deterministic approach seem very high. + +To reduce the number of cases where that happens, we introduce a concept of +"triple parity" of pages: if inner tuple is on page with BlockNumber N, then +its child tuples should be placed on the same page, or else on a page with +BlockNumber M where (N+1) mod 3 == M mod 3. This rule ensures that tuples +on page M will have no children on page N, since (M+1) mod 3 != N mod 3. +That makes it unlikely that two insertion processes will conflict against +each other while descending the tree. It's not perfect though: in the first +place, we could still get a deadlock among three or more insertion processes, +and in the second place, it's impractical to preserve this invariant in every +case when we expand or split an inner tuple. So we still have to allow for +deadlocks. + +Insertion may also need to take locks on an additional inner and/or leaf page +to add tuples of the right type(s), when there's not enough room on the pages +it descended through. However, we don't care exactly which such page we add +to, so deadlocks can be avoided by conditionally locking the additional +buffers: if we fail to get lock on an additional page, just try another one. + +Search traversal algorithm is rather traditional. At each non-leaf level, it +share-locks the page, identifies which node(s) in the current inner tuple +need to be visited, and puts those addresses on a stack of pages to examine +later. It then releases lock on the current buffer before visiting the next +stack item. So only one page is locked at a time, and no deadlock is +possible. But instead, we have to worry about race conditions: by the time +we arrive at a pointed-to page, a concurrent insertion could have replaced +the target inner tuple (or leaf tuple chain) with data placed elsewhere. +To handle that, whenever the insertion algorithm changes a nonempty downlink +in an inner tuple, it places a "redirect tuple" in place of the lower-level +inner tuple or leaf-tuple chain head that the link formerly led to. Scans +(though not insertions) must be prepared to honor such redirects. Only a +scan that had already visited the parent level could possibly reach such a +redirect tuple, so we can remove redirects once all active transactions have +been flushed out of the system. + + +DEAD TUPLES + +Tuples on leaf pages can be in one of four states: + +SPGIST_LIVE: normal, live pointer to a heap tuple. + +SPGIST_REDIRECT: placeholder that contains a link to another place in the +index. When a chain of leaf tuples has to be moved to another page, a +redirect tuple is inserted in place of the chain's head tuple. 
The parent +inner tuple's downlink is updated when this happens, but concurrent scans +might be "in flight" from the parent page to the child page (since they +release lock on the parent page before attempting to lock the child). +The redirect pointer serves to tell such a scan where to go. A redirect +pointer is only needed for as long as such concurrent scans could be in +progress. Eventually, it's converted into a PLACEHOLDER dead tuple by +VACUUM, and is then a candidate for replacement. Searches that find such +a tuple (which should never be part of a chain) should immediately proceed +to the other place, forgetting about the redirect tuple. Insertions that +reach such a tuple should raise error, since a valid downlink should never +point to such a tuple. + +SPGIST_DEAD: tuple is dead, but it cannot be removed or moved to a +different offset on the page because there is a link leading to it from +some inner tuple elsewhere in the index. (Such a tuple is never part of a +chain, since we don't need one unless there is nothing live left in its +chain.) Searches should ignore such entries. If an insertion action +arrives at such a tuple, it should either replace it in-place (if there's +room on the page to hold the desired new leaf tuple) or replace it with a +redirection pointer to wherever it puts the new leaf tuple. + +SPGIST_PLACEHOLDER: tuple is dead, and there are known to be no links to +it from elsewhere. When a live tuple is deleted or moved away, and not +replaced by a redirect pointer, it is replaced by a placeholder to keep +the offsets of later tuples on the same page from changing. Placeholders +can be freely replaced when adding a new tuple to the page, and also +VACUUM will delete any that are at the end of the range of valid tuple +offsets. Both searches and insertions should complain if a link from +elsewhere leads them to a placeholder tuple. + +When the root page is also a leaf, all its tuple should be in LIVE state; +there's no need for the others since there are no links and no need to +preserve offset numbers. + +Tuples on inner pages can be in LIVE, REDIRECT, or PLACEHOLDER states. +The REDIRECT state has the same function as on leaf pages, to send +concurrent searches to the place where they need to go after an inner +tuple is moved to another page. Expired REDIRECT pointers are converted +to PLACEHOLDER status by VACUUM, and are then candidates for replacement. +DEAD state is not currently possible, since VACUUM does not attempt to +remove unused inner tuples. + + +VACUUM + +VACUUM (or more precisely, spgbulkdelete) performs a single sequential scan +over the entire index. On both leaf and inner pages, we can convert old +REDIRECT tuples into PLACEHOLDER status, and then remove any PLACEHOLDERs +that are at the end of the page (since they aren't needed to preserve the +offsets of any live tuples). On leaf pages, we scan for tuples that need +to be deleted because their heap TIDs match a vacuum target TID. + +If we find a deletable tuple that is not at the head of its chain, we +can simply replace it with a PLACEHOLDER, updating the chain links to +remove it from the chain. If it is at the head of its chain, but there's +at least one live tuple remaining in the chain, we move that live tuple +to the head tuple's offset, replacing it with a PLACEHOLDER to preserve +the offsets of other tuples. This keeps the parent inner tuple's downlink +valid. 
If we find ourselves deleting all live tuples in a chain, we +replace the head tuple with a DEAD tuple and the rest with PLACEHOLDERS. +The parent inner tuple's downlink thus points to the DEAD tuple, and the +rules explained in the previous section keep everything working. + +VACUUM doesn't know a-priori which tuples are heads of their chains, but +it can easily figure that out by constructing a predecessor array that's +the reverse map of the nextOffset links (ie, when we see tuple x links to +tuple y, we set predecessor[y] = x). Then head tuples are the ones with +no predecessor. + +Because insertions can occur while VACUUM runs, a pure sequential scan +could miss deleting some target leaf tuples, because they could get moved +from a not-yet-visited leaf page to an already-visited leaf page as a +consequence of a PickSplit or MoveLeafs operation. Failing to delete any +target TID is not acceptable, so we have to extend the algorithm to cope +with such cases. We recognize that such a move might have occurred when +we see a leaf-page REDIRECT tuple whose XID indicates it might have been +created after the VACUUM scan started. We add the redirection target TID +to a "pending list" of places we need to recheck. Between pages of the +main sequential scan, we empty the pending list by visiting each listed +TID. If it points to an inner tuple (from a PickSplit), add each downlink +TID to the pending list. If it points to a leaf page, vacuum that page. +(We could just vacuum the single pointed-to chain, but vacuuming the +whole page simplifies the code and reduces the odds of VACUUM having to +modify the same page multiple times.) To ensure that pending-list +processing can never get into an endless loop, even in the face of +concurrent index changes, we don't remove list entries immediately but +only after we've completed all pending-list processing; instead we just +mark items as done after processing them. Adding a TID that's already in +the list is a no-op, whether or not that item is marked done yet. + +spgbulkdelete also updates the index's free space map. + +Currently, spgvacuumcleanup has nothing to do if spgbulkdelete was +performed; otherwise, it does an spgbulkdelete scan with an empty target +list, so as to clean up redirections and placeholders, update the free +space map, and gather statistics. + + +LAST USED PAGE MANAGEMENT + +The list of last used pages contains four pages - a leaf page and three +inner pages, one from each "triple parity" group. (Actually, there's one +such list for the main tree and a separate one for the nulls tree.) This +list is stored between calls on the index meta page, but updates are never +WAL-logged to decrease WAL traffic. Incorrect data on meta page isn't +critical, because we could allocate a new page at any moment. 
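+
+The cache keeps one inner page per parity class because of the "triple
+parity" rule from the Concurrency section: an inner tuple on block N may only
+have children on block N itself or on a block M with (N+1) mod 3 == M mod 3.
+When that rule holds, two distinct pages can never hold each other's children
+(which is what rules out the simple two-page deadlock); the Concurrency
+section notes the invariant cannot always be preserved, so deadlocks must
+still be handled. A toy standalone check of that property (illustration only,
+not SP-GiST code):
+
+    #include <assert.h>
+    #include <stdio.h>
+
+    /* may block m hold children of an inner tuple on block n? */
+    static int
+    parity_ok(unsigned n, unsigned m)
+    {
+        return m == n || (n + 1) % 3 == m % 3;
+    }
+
+    int
+    main(void)
+    {
+        unsigned    n,
+                    m;
+
+        /* only residues mod 3 matter, so checking 0..2 covers all blocks */
+        for (n = 0; n < 3; n++)
+            for (m = 0; m < 3; m++)
+                if (n != m && parity_ok(n, m))
+                    assert(!parity_ok(m, n));   /* no mutual parent/child pages */
+
+        printf("no two distinct pages can hold each other's children\n");
+        return 0;
+    }
+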
+ + +AUTHORS + + Teodor Sigaev + Oleg Bartunov diff --git a/src/backend/access/spgist/spgdoinsert.c b/src/backend/access/spgist/spgdoinsert.c new file mode 100644 index 0000000..70557bc --- /dev/null +++ b/src/backend/access/spgist/spgdoinsert.c @@ -0,0 +1,2354 @@ +/*------------------------------------------------------------------------- + * + * spgdoinsert.c + * implementation of insert algorithm + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/spgist/spgdoinsert.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/genam.h" +#include "access/spgist_private.h" +#include "access/spgxlog.h" +#include "access/xloginsert.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "utils/rel.h" + + +/* + * SPPageDesc tracks all info about a page we are inserting into. In some + * situations it actually identifies a tuple, or even a specific node within + * an inner tuple. But any of the fields can be invalid. If the buffer + * field is valid, it implies we hold pin and exclusive lock on that buffer. + * page pointer should be valid exactly when buffer is. + */ +typedef struct SPPageDesc +{ + BlockNumber blkno; /* block number, or InvalidBlockNumber */ + Buffer buffer; /* page's buffer number, or InvalidBuffer */ + Page page; /* pointer to page buffer, or NULL */ + OffsetNumber offnum; /* offset of tuple, or InvalidOffsetNumber */ + int node; /* node number within inner tuple, or -1 */ +} SPPageDesc; + + +/* + * Set the item pointer in the nodeN'th entry in inner tuple tup. This + * is used to update the parent inner tuple's downlink after a move or + * split operation. + */ +void +spgUpdateNodeLink(SpGistInnerTuple tup, int nodeN, + BlockNumber blkno, OffsetNumber offset) +{ + int i; + SpGistNodeTuple node; + + SGITITERATE(tup, i, node) + { + if (i == nodeN) + { + ItemPointerSet(&node->t_tid, blkno, offset); + return; + } + } + + elog(ERROR, "failed to find requested node %d in SPGiST inner tuple", + nodeN); +} + +/* + * Form a new inner tuple containing one more node than the given one, with + * the specified label datum, inserted at offset "offset" in the node array. + * The new tuple's prefix is the same as the old one's. + * + * Note that the new node initially has an invalid downlink. We'll find a + * page to point it to later. + */ +static SpGistInnerTuple +addNode(SpGistState *state, SpGistInnerTuple tuple, Datum label, int offset) +{ + SpGistNodeTuple node, + *nodes; + int i; + + /* if offset is negative, insert at end */ + if (offset < 0) + offset = tuple->nNodes; + else if (offset > tuple->nNodes) + elog(ERROR, "invalid offset for adding node to SPGiST inner tuple"); + + nodes = palloc(sizeof(SpGistNodeTuple) * (tuple->nNodes + 1)); + SGITITERATE(tuple, i, node) + { + if (i < offset) + nodes[i] = node; + else + nodes[i + 1] = node; + } + + nodes[offset] = spgFormNodeTuple(state, label, false); + + return spgFormInnerTuple(state, + (tuple->prefixSize > 0), + SGITDATUM(tuple, state), + tuple->nNodes + 1, + nodes); +} + +/* qsort comparator for sorting OffsetNumbers */ +static int +cmpOffsetNumbers(const void *a, const void *b) +{ + if (*(const OffsetNumber *) a == *(const OffsetNumber *) b) + return 0; + return (*(const OffsetNumber *) a > *(const OffsetNumber *) b) ? 
1 : -1; +} + +/* + * Delete multiple tuples from an index page, preserving tuple offset numbers. + * + * The first tuple in the given list is replaced with a dead tuple of type + * "firststate" (REDIRECT/DEAD/PLACEHOLDER); the remaining tuples are replaced + * with dead tuples of type "reststate". If either firststate or reststate + * is REDIRECT, blkno/offnum specify where to link to. + * + * NB: this is used during WAL replay, so beware of trying to make it too + * smart. In particular, it shouldn't use "state" except for calling + * spgFormDeadTuple(). This is also used in a critical section, so no + * pallocs either! + */ +void +spgPageIndexMultiDelete(SpGistState *state, Page page, + OffsetNumber *itemnos, int nitems, + int firststate, int reststate, + BlockNumber blkno, OffsetNumber offnum) +{ + OffsetNumber firstItem; + OffsetNumber sortednos[MaxIndexTuplesPerPage]; + SpGistDeadTuple tuple = NULL; + int i; + + if (nitems == 0) + return; /* nothing to do */ + + /* + * For efficiency we want to use PageIndexMultiDelete, which requires the + * targets to be listed in sorted order, so we have to sort the itemnos + * array. (This also greatly simplifies the math for reinserting the + * replacement tuples.) However, we must not scribble on the caller's + * array, so we have to make a copy. + */ + memcpy(sortednos, itemnos, sizeof(OffsetNumber) * nitems); + if (nitems > 1) + qsort(sortednos, nitems, sizeof(OffsetNumber), cmpOffsetNumbers); + + PageIndexMultiDelete(page, sortednos, nitems); + + firstItem = itemnos[0]; + + for (i = 0; i < nitems; i++) + { + OffsetNumber itemno = sortednos[i]; + int tupstate; + + tupstate = (itemno == firstItem) ? firststate : reststate; + if (tuple == NULL || tuple->tupstate != tupstate) + tuple = spgFormDeadTuple(state, tupstate, blkno, offnum); + + if (PageAddItem(page, (Item) tuple, tuple->size, + itemno, false, false) != itemno) + elog(ERROR, "failed to add item of size %u to SPGiST index page", + tuple->size); + + if (tupstate == SPGIST_REDIRECT) + SpGistPageGetOpaque(page)->nRedirection++; + else if (tupstate == SPGIST_PLACEHOLDER) + SpGistPageGetOpaque(page)->nPlaceholder++; + } +} + +/* + * Update the parent inner tuple's downlink, and mark the parent buffer + * dirty (this must be the last change to the parent page in the current + * WAL action). 
+ */ +static void +saveNodeLink(Relation index, SPPageDesc *parent, + BlockNumber blkno, OffsetNumber offnum) +{ + SpGistInnerTuple innerTuple; + + innerTuple = (SpGistInnerTuple) PageGetItem(parent->page, + PageGetItemId(parent->page, parent->offnum)); + + spgUpdateNodeLink(innerTuple, parent->node, blkno, offnum); + + MarkBufferDirty(parent->buffer); +} + +/* + * Add a leaf tuple to a leaf page where there is known to be room for it + */ +static void +addLeafTuple(Relation index, SpGistState *state, SpGistLeafTuple leafTuple, + SPPageDesc *current, SPPageDesc *parent, bool isNulls, bool isNew) +{ + spgxlogAddLeaf xlrec; + + xlrec.newPage = isNew; + xlrec.storesNulls = isNulls; + + /* these will be filled below as needed */ + xlrec.offnumLeaf = InvalidOffsetNumber; + xlrec.offnumHeadLeaf = InvalidOffsetNumber; + xlrec.offnumParent = InvalidOffsetNumber; + xlrec.nodeI = 0; + + START_CRIT_SECTION(); + + if (current->offnum == InvalidOffsetNumber || + SpGistBlockIsRoot(current->blkno)) + { + /* Tuple is not part of a chain */ + SGLT_SET_NEXTOFFSET(leafTuple, InvalidOffsetNumber); + current->offnum = SpGistPageAddNewItem(state, current->page, + (Item) leafTuple, leafTuple->size, + NULL, false); + + xlrec.offnumLeaf = current->offnum; + + /* Must update parent's downlink if any */ + if (parent->buffer != InvalidBuffer) + { + xlrec.offnumParent = parent->offnum; + xlrec.nodeI = parent->node; + + saveNodeLink(index, parent, current->blkno, current->offnum); + } + } + else + { + /* + * Tuple must be inserted into existing chain. We mustn't change the + * chain's head address, but we don't need to chase the entire chain + * to put the tuple at the end; we can insert it second. + * + * Also, it's possible that the "chain" consists only of a DEAD tuple, + * in which case we should replace the DEAD tuple in-place. 
+ */ + SpGistLeafTuple head; + OffsetNumber offnum; + + head = (SpGistLeafTuple) PageGetItem(current->page, + PageGetItemId(current->page, current->offnum)); + if (head->tupstate == SPGIST_LIVE) + { + SGLT_SET_NEXTOFFSET(leafTuple, SGLT_GET_NEXTOFFSET(head)); + offnum = SpGistPageAddNewItem(state, current->page, + (Item) leafTuple, leafTuple->size, + NULL, false); + + /* + * re-get head of list because it could have been moved on page, + * and set new second element + */ + head = (SpGistLeafTuple) PageGetItem(current->page, + PageGetItemId(current->page, current->offnum)); + SGLT_SET_NEXTOFFSET(head, offnum); + + xlrec.offnumLeaf = offnum; + xlrec.offnumHeadLeaf = current->offnum; + } + else if (head->tupstate == SPGIST_DEAD) + { + SGLT_SET_NEXTOFFSET(leafTuple, InvalidOffsetNumber); + PageIndexTupleDelete(current->page, current->offnum); + if (PageAddItem(current->page, + (Item) leafTuple, leafTuple->size, + current->offnum, false, false) != current->offnum) + elog(ERROR, "failed to add item of size %u to SPGiST index page", + leafTuple->size); + + /* WAL replay distinguishes this case by equal offnums */ + xlrec.offnumLeaf = current->offnum; + xlrec.offnumHeadLeaf = current->offnum; + } + else + elog(ERROR, "unexpected SPGiST tuple state: %d", head->tupstate); + } + + MarkBufferDirty(current->buffer); + + if (RelationNeedsWAL(index) && !state->isBuild) + { + XLogRecPtr recptr; + int flags; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + XLogRegisterData((char *) leafTuple, leafTuple->size); + + flags = REGBUF_STANDARD; + if (xlrec.newPage) + flags |= REGBUF_WILL_INIT; + XLogRegisterBuffer(0, current->buffer, flags); + if (xlrec.offnumParent != InvalidOffsetNumber) + XLogRegisterBuffer(1, parent->buffer, REGBUF_STANDARD); + + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_ADD_LEAF); + + PageSetLSN(current->page, recptr); + + /* update parent only if we actually changed it */ + if (xlrec.offnumParent != InvalidOffsetNumber) + { + PageSetLSN(parent->page, recptr); + } + } + + END_CRIT_SECTION(); +} + +/* + * Count the number and total size of leaf tuples in the chain starting at + * current->offnum. Return number into *nToSplit and total size as function + * result. + * + * Klugy special case when considering the root page (i.e., root is a leaf + * page, but we're about to split for the first time): return fake large + * values to force spgdoinsert() to take the doPickSplit rather than + * moveLeafs code path. moveLeafs is not prepared to deal with root page. 
+ */ +static int +checkSplitConditions(Relation index, SpGistState *state, + SPPageDesc *current, int *nToSplit) +{ + int i, + n = 0, + totalSize = 0; + + if (SpGistBlockIsRoot(current->blkno)) + { + /* return impossible values to force split */ + *nToSplit = BLCKSZ; + return BLCKSZ; + } + + i = current->offnum; + while (i != InvalidOffsetNumber) + { + SpGistLeafTuple it; + + Assert(i >= FirstOffsetNumber && + i <= PageGetMaxOffsetNumber(current->page)); + it = (SpGistLeafTuple) PageGetItem(current->page, + PageGetItemId(current->page, i)); + if (it->tupstate == SPGIST_LIVE) + { + n++; + totalSize += it->size + sizeof(ItemIdData); + } + else if (it->tupstate == SPGIST_DEAD) + { + /* We could see a DEAD tuple as first/only chain item */ + Assert(i == current->offnum); + Assert(SGLT_GET_NEXTOFFSET(it) == InvalidOffsetNumber); + /* Don't count it in result, because it won't go to other page */ + } + else + elog(ERROR, "unexpected SPGiST tuple state: %d", it->tupstate); + + i = SGLT_GET_NEXTOFFSET(it); + } + + *nToSplit = n; + + return totalSize; +} + +/* + * current points to a leaf-tuple chain that we wanted to add newLeafTuple to, + * but the chain has to be moved because there's not enough room to add + * newLeafTuple to its page. We use this method when the chain contains + * very little data so a split would be inefficient. We are sure we can + * fit the chain plus newLeafTuple on one other page. + */ +static void +moveLeafs(Relation index, SpGistState *state, + SPPageDesc *current, SPPageDesc *parent, + SpGistLeafTuple newLeafTuple, bool isNulls) +{ + int i, + nDelete, + nInsert, + size; + Buffer nbuf; + Page npage; + SpGistLeafTuple it; + OffsetNumber r = InvalidOffsetNumber, + startOffset = InvalidOffsetNumber; + bool replaceDead = false; + OffsetNumber *toDelete; + OffsetNumber *toInsert; + BlockNumber nblkno; + spgxlogMoveLeafs xlrec; + char *leafdata, + *leafptr; + + /* This doesn't work on root page */ + Assert(parent->buffer != InvalidBuffer); + Assert(parent->buffer != current->buffer); + + /* Locate the tuples to be moved, and count up the space needed */ + i = PageGetMaxOffsetNumber(current->page); + toDelete = (OffsetNumber *) palloc(sizeof(OffsetNumber) * i); + toInsert = (OffsetNumber *) palloc(sizeof(OffsetNumber) * (i + 1)); + + size = newLeafTuple->size + sizeof(ItemIdData); + + nDelete = 0; + i = current->offnum; + while (i != InvalidOffsetNumber) + { + SpGistLeafTuple it; + + Assert(i >= FirstOffsetNumber && + i <= PageGetMaxOffsetNumber(current->page)); + it = (SpGistLeafTuple) PageGetItem(current->page, + PageGetItemId(current->page, i)); + + if (it->tupstate == SPGIST_LIVE) + { + toDelete[nDelete] = i; + size += it->size + sizeof(ItemIdData); + nDelete++; + } + else if (it->tupstate == SPGIST_DEAD) + { + /* We could see a DEAD tuple as first/only chain item */ + Assert(i == current->offnum); + Assert(SGLT_GET_NEXTOFFSET(it) == InvalidOffsetNumber); + /* We don't want to move it, so don't count it in size */ + toDelete[nDelete] = i; + nDelete++; + replaceDead = true; + } + else + elog(ERROR, "unexpected SPGiST tuple state: %d", it->tupstate); + + i = SGLT_GET_NEXTOFFSET(it); + } + + /* Find a leaf page that will hold them */ + nbuf = SpGistGetBuffer(index, GBUF_LEAF | (isNulls ? 
GBUF_NULLS : 0), + size, &xlrec.newPage); + npage = BufferGetPage(nbuf); + nblkno = BufferGetBlockNumber(nbuf); + Assert(nblkno != current->blkno); + + leafdata = leafptr = palloc(size); + + START_CRIT_SECTION(); + + /* copy all the old tuples to new page, unless they're dead */ + nInsert = 0; + if (!replaceDead) + { + for (i = 0; i < nDelete; i++) + { + it = (SpGistLeafTuple) PageGetItem(current->page, + PageGetItemId(current->page, toDelete[i])); + Assert(it->tupstate == SPGIST_LIVE); + + /* + * Update chain link (notice the chain order gets reversed, but we + * don't care). We're modifying the tuple on the source page + * here, but it's okay since we're about to delete it. + */ + SGLT_SET_NEXTOFFSET(it, r); + + r = SpGistPageAddNewItem(state, npage, (Item) it, it->size, + &startOffset, false); + + toInsert[nInsert] = r; + nInsert++; + + /* save modified tuple into leafdata as well */ + memcpy(leafptr, it, it->size); + leafptr += it->size; + } + } + + /* add the new tuple as well */ + SGLT_SET_NEXTOFFSET(newLeafTuple, r); + r = SpGistPageAddNewItem(state, npage, + (Item) newLeafTuple, newLeafTuple->size, + &startOffset, false); + toInsert[nInsert] = r; + nInsert++; + memcpy(leafptr, newLeafTuple, newLeafTuple->size); + leafptr += newLeafTuple->size; + + /* + * Now delete the old tuples, leaving a redirection pointer behind for the + * first one, unless we're doing an index build; in which case there can't + * be any concurrent scan so we need not provide a redirect. + */ + spgPageIndexMultiDelete(state, current->page, toDelete, nDelete, + state->isBuild ? SPGIST_PLACEHOLDER : SPGIST_REDIRECT, + SPGIST_PLACEHOLDER, + nblkno, r); + + /* Update parent's downlink and mark parent page dirty */ + saveNodeLink(index, parent, nblkno, r); + + /* Mark the leaf pages too */ + MarkBufferDirty(current->buffer); + MarkBufferDirty(nbuf); + + if (RelationNeedsWAL(index) && !state->isBuild) + { + XLogRecPtr recptr; + + /* prepare WAL info */ + STORE_STATE(state, xlrec.stateSrc); + + xlrec.nMoves = nDelete; + xlrec.replaceDead = replaceDead; + xlrec.storesNulls = isNulls; + + xlrec.offnumParent = parent->offnum; + xlrec.nodeI = parent->node; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfSpgxlogMoveLeafs); + XLogRegisterData((char *) toDelete, + sizeof(OffsetNumber) * nDelete); + XLogRegisterData((char *) toInsert, + sizeof(OffsetNumber) * nInsert); + XLogRegisterData((char *) leafdata, leafptr - leafdata); + + XLogRegisterBuffer(0, current->buffer, REGBUF_STANDARD); + XLogRegisterBuffer(1, nbuf, REGBUF_STANDARD | (xlrec.newPage ? REGBUF_WILL_INIT : 0)); + XLogRegisterBuffer(2, parent->buffer, REGBUF_STANDARD); + + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_MOVE_LEAFS); + + PageSetLSN(current->page, recptr); + PageSetLSN(npage, recptr); + PageSetLSN(parent->page, recptr); + } + + END_CRIT_SECTION(); + + /* Update local free-space cache and release new buffer */ + SpGistSetLastUsedPage(index, nbuf); + UnlockReleaseBuffer(nbuf); +} + +/* + * Update previously-created redirection tuple with appropriate destination + * + * We use this when it's not convenient to know the destination first. + * The tuple should have been made with the "impossible" destination of + * the metapage. 
+ */ +static void +setRedirectionTuple(SPPageDesc *current, OffsetNumber position, + BlockNumber blkno, OffsetNumber offnum) +{ + SpGistDeadTuple dt; + + dt = (SpGistDeadTuple) PageGetItem(current->page, + PageGetItemId(current->page, position)); + Assert(dt->tupstate == SPGIST_REDIRECT); + Assert(ItemPointerGetBlockNumber(&dt->pointer) == SPGIST_METAPAGE_BLKNO); + ItemPointerSet(&dt->pointer, blkno, offnum); +} + +/* + * Test to see if the user-defined picksplit function failed to do its job, + * ie, it put all the leaf tuples into the same node. + * If so, randomly divide the tuples into several nodes (all with the same + * label) and return true to select allTheSame mode for this inner tuple. + * + * (This code is also used to forcibly select allTheSame mode for nulls.) + * + * If we know that the leaf tuples wouldn't all fit on one page, then we + * exclude the last tuple (which is the incoming new tuple that forced a split) + * from the check to see if more than one node is used. The reason for this + * is that if the existing tuples are put into only one chain, then even if + * we move them all to an empty page, there would still not be room for the + * new tuple, so we'd get into an infinite loop of picksplit attempts. + * Forcing allTheSame mode dodges this problem by ensuring the old tuples will + * be split across pages. (Exercise for the reader: figure out why this + * fixes the problem even when there is only one old tuple.) + */ +static bool +checkAllTheSame(spgPickSplitIn *in, spgPickSplitOut *out, bool tooBig, + bool *includeNew) +{ + int theNode; + int limit; + int i; + + /* For the moment, assume we can include the new leaf tuple */ + *includeNew = true; + + /* If there's only the new leaf tuple, don't select allTheSame mode */ + if (in->nTuples <= 1) + return false; + + /* If tuple set doesn't fit on one page, ignore the new tuple in test */ + limit = tooBig ? in->nTuples - 1 : in->nTuples; + + /* Check to see if more than one node is populated */ + theNode = out->mapTuplesToNodes[0]; + for (i = 1; i < limit; i++) + { + if (out->mapTuplesToNodes[i] != theNode) + return false; + } + + /* Nope, so override the picksplit function's decisions */ + + /* If the new tuple is in its own node, it can't be included in split */ + if (tooBig && out->mapTuplesToNodes[in->nTuples - 1] != theNode) + *includeNew = false; + + out->nNodes = 8; /* arbitrary number of child nodes */ + + /* Random assignment of tuples to nodes (note we include new tuple) */ + for (i = 0; i < in->nTuples; i++) + out->mapTuplesToNodes[i] = i % out->nNodes; + + /* The opclass may not use node labels, but if it does, duplicate 'em */ + if (out->nodeLabels) + { + Datum theLabel = out->nodeLabels[theNode]; + + out->nodeLabels = (Datum *) palloc(sizeof(Datum) * out->nNodes); + for (i = 0; i < out->nNodes; i++) + out->nodeLabels[i] = theLabel; + } + + /* We don't touch the prefix or the leaf tuple datum assignments */ + + return true; +} + +/* + * current points to a leaf-tuple chain that we wanted to add newLeafTuple to, + * but the chain has to be split because there's not enough room to add + * newLeafTuple to its page. + * + * This function splits the leaf tuple set according to picksplit's rules, + * creating one or more new chains that are spread across the current page + * and an additional leaf page (we assume that two leaf pages will be + * sufficient). A new inner tuple is created, and the parent downlink + * pointer is updated to point to that inner tuple instead of the leaf chain. 
+ * + * On exit, current contains the address of the new inner tuple. + * + * Returns true if we successfully inserted newLeafTuple during this function, + * false if caller still has to do it (meaning another picksplit operation is + * probably needed). Failure could occur if the picksplit result is fairly + * unbalanced, or if newLeafTuple is just plain too big to fit on a page. + * Because we force the picksplit result to be at least two chains, each + * cycle will get rid of at least one leaf tuple from the chain, so the loop + * will eventually terminate if lack of balance is the issue. If the tuple + * is too big, we assume that repeated picksplit operations will eventually + * make it small enough by repeated prefix-stripping. A broken opclass could + * make this an infinite loop, though, so spgdoinsert() checks that the + * leaf datums get smaller each time. + */ +static bool +doPickSplit(Relation index, SpGistState *state, + SPPageDesc *current, SPPageDesc *parent, + SpGistLeafTuple newLeafTuple, + int level, bool isNulls, bool isNew) +{ + bool insertedNew = false; + spgPickSplitIn in; + spgPickSplitOut out; + FmgrInfo *procinfo; + bool includeNew; + int i, + max, + n; + SpGistInnerTuple innerTuple; + SpGistNodeTuple node, + *nodes; + Buffer newInnerBuffer, + newLeafBuffer; + uint8 *leafPageSelect; + int *leafSizes; + OffsetNumber *toDelete; + OffsetNumber *toInsert; + OffsetNumber redirectTuplePos = InvalidOffsetNumber; + OffsetNumber startOffsets[2]; + SpGistLeafTuple *oldLeafs; + SpGistLeafTuple *newLeafs; + Datum leafDatums[INDEX_MAX_KEYS]; + bool leafIsnulls[INDEX_MAX_KEYS]; + int spaceToDelete; + int currentFreeSpace; + int totalLeafSizes; + bool allTheSame; + spgxlogPickSplit xlrec; + char *leafdata, + *leafptr; + SPPageDesc saveCurrent; + int nToDelete, + nToInsert, + maxToInclude; + + in.level = level; + + /* + * Allocate per-leaf-tuple work arrays with max possible size + */ + max = PageGetMaxOffsetNumber(current->page); + n = max + 1; + in.datums = (Datum *) palloc(sizeof(Datum) * n); + toDelete = (OffsetNumber *) palloc(sizeof(OffsetNumber) * n); + toInsert = (OffsetNumber *) palloc(sizeof(OffsetNumber) * n); + oldLeafs = (SpGistLeafTuple *) palloc(sizeof(SpGistLeafTuple) * n); + newLeafs = (SpGistLeafTuple *) palloc(sizeof(SpGistLeafTuple) * n); + leafPageSelect = (uint8 *) palloc(sizeof(uint8) * n); + + STORE_STATE(state, xlrec.stateSrc); + + /* + * Form list of leaf tuples which will be distributed as split result; + * also, count up the amount of space that will be freed from current. + * (Note that in the non-root case, we won't actually delete the old + * tuples, only replace them with redirects or placeholders.) + */ + nToInsert = 0; + nToDelete = 0; + spaceToDelete = 0; + if (SpGistBlockIsRoot(current->blkno)) + { + /* + * We are splitting the root (which up to now is also a leaf page). + * Its tuples are not linked, so scan sequentially to get them all. We + * ignore the original value of current->offnum. + */ + for (i = FirstOffsetNumber; i <= max; i++) + { + SpGistLeafTuple it; + + it = (SpGistLeafTuple) PageGetItem(current->page, + PageGetItemId(current->page, i)); + if (it->tupstate == SPGIST_LIVE) + { + in.datums[nToInsert] = + isNulls ? 
(Datum) 0 : SGLTDATUM(it, state); + oldLeafs[nToInsert] = it; + nToInsert++; + toDelete[nToDelete] = i; + nToDelete++; + /* we will delete the tuple altogether, so count full space */ + spaceToDelete += it->size + sizeof(ItemIdData); + } + else /* tuples on root should be live */ + elog(ERROR, "unexpected SPGiST tuple state: %d", it->tupstate); + } + } + else + { + /* Normal case, just collect the leaf tuples in the chain */ + i = current->offnum; + while (i != InvalidOffsetNumber) + { + SpGistLeafTuple it; + + Assert(i >= FirstOffsetNumber && i <= max); + it = (SpGistLeafTuple) PageGetItem(current->page, + PageGetItemId(current->page, i)); + if (it->tupstate == SPGIST_LIVE) + { + in.datums[nToInsert] = + isNulls ? (Datum) 0 : SGLTDATUM(it, state); + oldLeafs[nToInsert] = it; + nToInsert++; + toDelete[nToDelete] = i; + nToDelete++; + /* we will not delete the tuple, only replace with dead */ + Assert(it->size >= SGDTSIZE); + spaceToDelete += it->size - SGDTSIZE; + } + else if (it->tupstate == SPGIST_DEAD) + { + /* We could see a DEAD tuple as first/only chain item */ + Assert(i == current->offnum); + Assert(SGLT_GET_NEXTOFFSET(it) == InvalidOffsetNumber); + toDelete[nToDelete] = i; + nToDelete++; + /* replacing it with redirect will save no space */ + } + else + elog(ERROR, "unexpected SPGiST tuple state: %d", it->tupstate); + + i = SGLT_GET_NEXTOFFSET(it); + } + } + in.nTuples = nToInsert; + + /* + * We may not actually insert new tuple because another picksplit may be + * necessary due to too large value, but we will try to allocate enough + * space to include it; and in any case it has to be included in the input + * for the picksplit function. So don't increment nToInsert yet. + */ + in.datums[in.nTuples] = + isNulls ? (Datum) 0 : SGLTDATUM(newLeafTuple, state); + oldLeafs[in.nTuples] = newLeafTuple; + in.nTuples++; + + memset(&out, 0, sizeof(out)); + + if (!isNulls) + { + /* + * Perform split using user-defined method. + */ + procinfo = index_getprocinfo(index, 1, SPGIST_PICKSPLIT_PROC); + FunctionCall2Coll(procinfo, + index->rd_indcollation[0], + PointerGetDatum(&in), + PointerGetDatum(&out)); + + /* + * Form new leaf tuples and count up the total space needed. + */ + totalLeafSizes = 0; + for (i = 0; i < in.nTuples; i++) + { + if (state->leafTupDesc->natts > 1) + spgDeformLeafTuple(oldLeafs[i], + state->leafTupDesc, + leafDatums, + leafIsnulls, + isNulls); + + leafDatums[spgKeyColumn] = out.leafTupleDatums[i]; + leafIsnulls[spgKeyColumn] = false; + + newLeafs[i] = spgFormLeafTuple(state, &oldLeafs[i]->heapPtr, + leafDatums, + leafIsnulls); + totalLeafSizes += newLeafs[i]->size + sizeof(ItemIdData); + } + } + else + { + /* + * Perform dummy split that puts all tuples into one node. + * checkAllTheSame will override this and force allTheSame mode. + */ + out.hasPrefix = false; + out.nNodes = 1; + out.nodeLabels = NULL; + out.mapTuplesToNodes = palloc0(sizeof(int) * in.nTuples); + + /* + * Form new leaf tuples and count up the total space needed. + */ + totalLeafSizes = 0; + for (i = 0; i < in.nTuples; i++) + { + if (state->leafTupDesc->natts > 1) + spgDeformLeafTuple(oldLeafs[i], + state->leafTupDesc, + leafDatums, + leafIsnulls, + isNulls); + + /* + * Nulls tree can contain only null key values. 
+ */ + leafDatums[spgKeyColumn] = (Datum) 0; + leafIsnulls[spgKeyColumn] = true; + + newLeafs[i] = spgFormLeafTuple(state, &oldLeafs[i]->heapPtr, + leafDatums, + leafIsnulls); + totalLeafSizes += newLeafs[i]->size + sizeof(ItemIdData); + } + } + + /* + * Check to see if the picksplit function failed to separate the values, + * ie, it put them all into the same child node. If so, select allTheSame + * mode and create a random split instead. See comments for + * checkAllTheSame as to why we need to know if the new leaf tuples could + * fit on one page. + */ + allTheSame = checkAllTheSame(&in, &out, + totalLeafSizes > SPGIST_PAGE_CAPACITY, + &includeNew); + + /* + * If checkAllTheSame decided we must exclude the new tuple, don't + * consider it any further. + */ + if (includeNew) + maxToInclude = in.nTuples; + else + { + maxToInclude = in.nTuples - 1; + totalLeafSizes -= newLeafs[in.nTuples - 1]->size + sizeof(ItemIdData); + } + + /* + * Allocate per-node work arrays. Since checkAllTheSame could replace + * out.nNodes with a value larger than the number of tuples on the input + * page, we can't allocate these arrays before here. + */ + nodes = (SpGistNodeTuple *) palloc(sizeof(SpGistNodeTuple) * out.nNodes); + leafSizes = (int *) palloc0(sizeof(int) * out.nNodes); + + /* + * Form nodes of inner tuple and inner tuple itself + */ + for (i = 0; i < out.nNodes; i++) + { + Datum label = (Datum) 0; + bool labelisnull = (out.nodeLabels == NULL); + + if (!labelisnull) + label = out.nodeLabels[i]; + nodes[i] = spgFormNodeTuple(state, label, labelisnull); + } + innerTuple = spgFormInnerTuple(state, + out.hasPrefix, out.prefixDatum, + out.nNodes, nodes); + innerTuple->allTheSame = allTheSame; + + /* + * Update nodes[] array to point into the newly formed innerTuple, so that + * we can adjust their downlinks below. + */ + SGITITERATE(innerTuple, i, node) + { + nodes[i] = node; + } + + /* + * Re-scan new leaf tuples and count up the space needed under each node. + */ + for (i = 0; i < maxToInclude; i++) + { + n = out.mapTuplesToNodes[i]; + if (n < 0 || n >= out.nNodes) + elog(ERROR, "inconsistent result of SPGiST picksplit function"); + leafSizes[n] += newLeafs[i]->size + sizeof(ItemIdData); + } + + /* + * To perform the split, we must insert a new inner tuple, which can't go + * on a leaf page; and unless we are splitting the root page, we must then + * update the parent tuple's downlink to point to the inner tuple. If + * there is room, we'll put the new inner tuple on the same page as the + * parent tuple, otherwise we need another non-leaf buffer. But if the + * parent page is the root, we can't add the new inner tuple there, + * because the root page must have only one inner tuple. + */ + xlrec.initInner = false; + if (parent->buffer != InvalidBuffer && + !SpGistBlockIsRoot(parent->blkno) && + (SpGistPageGetFreeSpace(parent->page, 1) >= + innerTuple->size + sizeof(ItemIdData))) + { + /* New inner tuple will fit on parent page */ + newInnerBuffer = parent->buffer; + } + else if (parent->buffer != InvalidBuffer) + { + /* Send tuple to page with next triple parity (see README) */ + newInnerBuffer = SpGistGetBuffer(index, + GBUF_INNER_PARITY(parent->blkno + 1) | + (isNulls ? GBUF_NULLS : 0), + innerTuple->size + sizeof(ItemIdData), + &xlrec.initInner); + } + else + { + /* Root page split ... 
inner tuple will go to root page */ + newInnerBuffer = InvalidBuffer; + } + + /* + * The new leaf tuples converted from the existing ones should require the + * same or less space, and therefore should all fit onto one page + * (although that's not necessarily the current page, since we can't + * delete the old tuples but only replace them with placeholders). + * However, the incoming new tuple might not also fit, in which case we + * might need another picksplit cycle to reduce it some more. + * + * If there's not room to put everything back onto the current page, then + * we decide on a per-node basis which tuples go to the new page. (We do + * it like that because leaf tuple chains can't cross pages, so we must + * place all leaf tuples belonging to the same parent node on the same + * page.) + * + * If we are splitting the root page (turning it from a leaf page into an + * inner page), then no leaf tuples can go back to the current page; they + * must all go somewhere else. + */ + if (!SpGistBlockIsRoot(current->blkno)) + currentFreeSpace = PageGetExactFreeSpace(current->page) + spaceToDelete; + else + currentFreeSpace = 0; /* prevent assigning any tuples to current */ + + xlrec.initDest = false; + + if (totalLeafSizes <= currentFreeSpace) + { + /* All the leaf tuples will fit on current page */ + newLeafBuffer = InvalidBuffer; + /* mark new leaf tuple as included in insertions, if allowed */ + if (includeNew) + { + nToInsert++; + insertedNew = true; + } + for (i = 0; i < nToInsert; i++) + leafPageSelect[i] = 0; /* signifies current page */ + } + else if (in.nTuples == 1 && totalLeafSizes > SPGIST_PAGE_CAPACITY) + { + /* + * We're trying to split up a long value by repeated suffixing, but + * it's not going to fit yet. Don't bother allocating a second leaf + * buffer that we won't be able to use. + */ + newLeafBuffer = InvalidBuffer; + Assert(includeNew); + Assert(nToInsert == 0); + } + else + { + /* We will need another leaf page */ + uint8 *nodePageSelect; + int curspace; + int newspace; + + newLeafBuffer = SpGistGetBuffer(index, + GBUF_LEAF | (isNulls ? GBUF_NULLS : 0), + Min(totalLeafSizes, + SPGIST_PAGE_CAPACITY), + &xlrec.initDest); + + /* + * Attempt to assign node groups to the two pages. We might fail to + * do so, even if totalLeafSizes is less than the available space, + * because we can't split a group across pages. 
+ */ + nodePageSelect = (uint8 *) palloc(sizeof(uint8) * out.nNodes); + + curspace = currentFreeSpace; + newspace = PageGetExactFreeSpace(BufferGetPage(newLeafBuffer)); + for (i = 0; i < out.nNodes; i++) + { + if (leafSizes[i] <= curspace) + { + nodePageSelect[i] = 0; /* signifies current page */ + curspace -= leafSizes[i]; + } + else + { + nodePageSelect[i] = 1; /* signifies new leaf page */ + newspace -= leafSizes[i]; + } + } + if (curspace >= 0 && newspace >= 0) + { + /* Successful assignment, so we can include the new leaf tuple */ + if (includeNew) + { + nToInsert++; + insertedNew = true; + } + } + else if (includeNew) + { + /* We must exclude the new leaf tuple from the split */ + int nodeOfNewTuple = out.mapTuplesToNodes[in.nTuples - 1]; + + leafSizes[nodeOfNewTuple] -= + newLeafs[in.nTuples - 1]->size + sizeof(ItemIdData); + + /* Repeat the node assignment process --- should succeed now */ + curspace = currentFreeSpace; + newspace = PageGetExactFreeSpace(BufferGetPage(newLeafBuffer)); + for (i = 0; i < out.nNodes; i++) + { + if (leafSizes[i] <= curspace) + { + nodePageSelect[i] = 0; /* signifies current page */ + curspace -= leafSizes[i]; + } + else + { + nodePageSelect[i] = 1; /* signifies new leaf page */ + newspace -= leafSizes[i]; + } + } + if (curspace < 0 || newspace < 0) + elog(ERROR, "failed to divide leaf tuple groups across pages"); + } + else + { + /* oops, we already excluded new tuple ... should not get here */ + elog(ERROR, "failed to divide leaf tuple groups across pages"); + } + /* Expand the per-node assignments to be shown per leaf tuple */ + for (i = 0; i < nToInsert; i++) + { + n = out.mapTuplesToNodes[i]; + leafPageSelect[i] = nodePageSelect[n]; + } + } + + /* Start preparing WAL record */ + xlrec.nDelete = 0; + xlrec.initSrc = isNew; + xlrec.storesNulls = isNulls; + xlrec.isRootSplit = SpGistBlockIsRoot(current->blkno); + + leafdata = leafptr = (char *) palloc(totalLeafSizes); + + /* Here we begin making the changes to the target pages */ + START_CRIT_SECTION(); + + /* + * Delete old leaf tuples from current buffer, except when we're splitting + * the root; in that case there's no need because we'll re-init the page + * below. We do this first to make room for reinserting new leaf tuples. + */ + if (!SpGistBlockIsRoot(current->blkno)) + { + /* + * Init buffer instead of deleting individual tuples, but only if + * there aren't any other live tuples and only during build; otherwise + * we need to set a redirection tuple for concurrent scans. + */ + if (state->isBuild && + nToDelete + SpGistPageGetOpaque(current->page)->nPlaceholder == + PageGetMaxOffsetNumber(current->page)) + { + SpGistInitBuffer(current->buffer, + SPGIST_LEAF | (isNulls ? SPGIST_NULLS : 0)); + xlrec.initSrc = true; + } + else if (isNew) + { + /* don't expose the freshly init'd buffer as a backup block */ + Assert(nToDelete == 0); + } + else + { + xlrec.nDelete = nToDelete; + + if (!state->isBuild) + { + /* + * Need to create redirect tuple (it will point to new inner + * tuple) but right now the new tuple's location is not known + * yet. So, set the redirection pointer to "impossible" value + * and remember its position to update tuple later. + */ + if (nToDelete > 0) + redirectTuplePos = toDelete[0]; + spgPageIndexMultiDelete(state, current->page, + toDelete, nToDelete, + SPGIST_REDIRECT, + SPGIST_PLACEHOLDER, + SPGIST_METAPAGE_BLKNO, + FirstOffsetNumber); + } + else + { + /* + * During index build there is not concurrent searches, so we + * don't need to create redirection tuple. 
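+ * + * The old leaf tuples can simply become placeholders, since nothing else + * can be scanning the index while it is being built.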
+ */ + spgPageIndexMultiDelete(state, current->page, + toDelete, nToDelete, + SPGIST_PLACEHOLDER, + SPGIST_PLACEHOLDER, + InvalidBlockNumber, + InvalidOffsetNumber); + } + } + } + + /* + * Put leaf tuples on proper pages, and update downlinks in innerTuple's + * nodes. + */ + startOffsets[0] = startOffsets[1] = InvalidOffsetNumber; + for (i = 0; i < nToInsert; i++) + { + SpGistLeafTuple it = newLeafs[i]; + Buffer leafBuffer; + BlockNumber leafBlock; + OffsetNumber newoffset; + + /* Which page is it going to? */ + leafBuffer = leafPageSelect[i] ? newLeafBuffer : current->buffer; + leafBlock = BufferGetBlockNumber(leafBuffer); + + /* Link tuple into correct chain for its node */ + n = out.mapTuplesToNodes[i]; + + if (ItemPointerIsValid(&nodes[n]->t_tid)) + { + Assert(ItemPointerGetBlockNumber(&nodes[n]->t_tid) == leafBlock); + SGLT_SET_NEXTOFFSET(it, ItemPointerGetOffsetNumber(&nodes[n]->t_tid)); + } + else + SGLT_SET_NEXTOFFSET(it, InvalidOffsetNumber); + + /* Insert it on page */ + newoffset = SpGistPageAddNewItem(state, BufferGetPage(leafBuffer), + (Item) it, it->size, + &startOffsets[leafPageSelect[i]], + false); + toInsert[i] = newoffset; + + /* ... and complete the chain linking */ + ItemPointerSet(&nodes[n]->t_tid, leafBlock, newoffset); + + /* Also copy leaf tuple into WAL data */ + memcpy(leafptr, newLeafs[i], newLeafs[i]->size); + leafptr += newLeafs[i]->size; + } + + /* + * We're done modifying the other leaf buffer (if any), so mark it dirty. + * current->buffer will be marked below, after we're entirely done + * modifying it. + */ + if (newLeafBuffer != InvalidBuffer) + { + MarkBufferDirty(newLeafBuffer); + } + + /* Remember current buffer, since we're about to change "current" */ + saveCurrent = *current; + + /* + * Store the new innerTuple + */ + if (newInnerBuffer == parent->buffer && newInnerBuffer != InvalidBuffer) + { + /* + * new inner tuple goes to parent page + */ + Assert(current->buffer != parent->buffer); + + /* Repoint "current" at the new inner tuple */ + current->blkno = parent->blkno; + current->buffer = parent->buffer; + current->page = parent->page; + xlrec.offnumInner = current->offnum = + SpGistPageAddNewItem(state, current->page, + (Item) innerTuple, innerTuple->size, + NULL, false); + + /* + * Update parent node link and mark parent page dirty + */ + xlrec.innerIsParent = true; + xlrec.offnumParent = parent->offnum; + xlrec.nodeI = parent->node; + saveNodeLink(index, parent, current->blkno, current->offnum); + + /* + * Update redirection link (in old current buffer) + */ + if (redirectTuplePos != InvalidOffsetNumber) + setRedirectionTuple(&saveCurrent, redirectTuplePos, + current->blkno, current->offnum); + + /* Done modifying old current buffer, mark it dirty */ + MarkBufferDirty(saveCurrent.buffer); + } + else if (parent->buffer != InvalidBuffer) + { + /* + * new inner tuple will be stored on a new page + */ + Assert(newInnerBuffer != InvalidBuffer); + + /* Repoint "current" at the new inner tuple */ + current->buffer = newInnerBuffer; + current->blkno = BufferGetBlockNumber(current->buffer); + current->page = BufferGetPage(current->buffer); + xlrec.offnumInner = current->offnum = + SpGistPageAddNewItem(state, current->page, + (Item) innerTuple, innerTuple->size, + NULL, false); + + /* Done modifying new current buffer, mark it dirty */ + MarkBufferDirty(current->buffer); + + /* + * Update parent node link and mark parent page dirty + */ + xlrec.innerIsParent = (parent->buffer == current->buffer); + xlrec.offnumParent = parent->offnum; + xlrec.nodeI = 
parent->node; + saveNodeLink(index, parent, current->blkno, current->offnum); + + /* + * Update redirection link (in old current buffer) + */ + if (redirectTuplePos != InvalidOffsetNumber) + setRedirectionTuple(&saveCurrent, redirectTuplePos, + current->blkno, current->offnum); + + /* Done modifying old current buffer, mark it dirty */ + MarkBufferDirty(saveCurrent.buffer); + } + else + { + /* + * Splitting root page, which was a leaf but now becomes inner page + * (and so "current" continues to point at it) + */ + Assert(SpGistBlockIsRoot(current->blkno)); + Assert(redirectTuplePos == InvalidOffsetNumber); + + SpGistInitBuffer(current->buffer, (isNulls ? SPGIST_NULLS : 0)); + xlrec.initInner = true; + xlrec.innerIsParent = false; + + xlrec.offnumInner = current->offnum = + PageAddItem(current->page, (Item) innerTuple, innerTuple->size, + InvalidOffsetNumber, false, false); + if (current->offnum != FirstOffsetNumber) + elog(ERROR, "failed to add item of size %u to SPGiST index page", + innerTuple->size); + + /* No parent link to update, nor redirection to do */ + xlrec.offnumParent = InvalidOffsetNumber; + xlrec.nodeI = 0; + + /* Done modifying new current buffer, mark it dirty */ + MarkBufferDirty(current->buffer); + + /* saveCurrent doesn't represent a different buffer */ + saveCurrent.buffer = InvalidBuffer; + } + + if (RelationNeedsWAL(index) && !state->isBuild) + { + XLogRecPtr recptr; + int flags; + + XLogBeginInsert(); + + xlrec.nInsert = nToInsert; + XLogRegisterData((char *) &xlrec, SizeOfSpgxlogPickSplit); + + XLogRegisterData((char *) toDelete, + sizeof(OffsetNumber) * xlrec.nDelete); + XLogRegisterData((char *) toInsert, + sizeof(OffsetNumber) * xlrec.nInsert); + XLogRegisterData((char *) leafPageSelect, + sizeof(uint8) * xlrec.nInsert); + XLogRegisterData((char *) innerTuple, innerTuple->size); + XLogRegisterData(leafdata, leafptr - leafdata); + + /* Old leaf page */ + if (BufferIsValid(saveCurrent.buffer)) + { + flags = REGBUF_STANDARD; + if (xlrec.initSrc) + flags |= REGBUF_WILL_INIT; + XLogRegisterBuffer(0, saveCurrent.buffer, flags); + } + + /* New leaf page */ + if (BufferIsValid(newLeafBuffer)) + { + flags = REGBUF_STANDARD; + if (xlrec.initDest) + flags |= REGBUF_WILL_INIT; + XLogRegisterBuffer(1, newLeafBuffer, flags); + } + + /* Inner page */ + flags = REGBUF_STANDARD; + if (xlrec.initInner) + flags |= REGBUF_WILL_INIT; + XLogRegisterBuffer(2, current->buffer, flags); + + /* Parent page, if different from inner page */ + if (parent->buffer != InvalidBuffer) + { + if (parent->buffer != current->buffer) + XLogRegisterBuffer(3, parent->buffer, REGBUF_STANDARD); + else + Assert(xlrec.innerIsParent); + } + + /* Issue the WAL record */ + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_PICKSPLIT); + + /* Update page LSNs on all affected pages */ + if (newLeafBuffer != InvalidBuffer) + { + Page page = BufferGetPage(newLeafBuffer); + + PageSetLSN(page, recptr); + } + + if (saveCurrent.buffer != InvalidBuffer) + { + Page page = BufferGetPage(saveCurrent.buffer); + + PageSetLSN(page, recptr); + } + + PageSetLSN(current->page, recptr); + + if (parent->buffer != InvalidBuffer) + { + PageSetLSN(parent->page, recptr); + } + } + + END_CRIT_SECTION(); + + /* Update local free-space cache and unlock buffers */ + if (newLeafBuffer != InvalidBuffer) + { + SpGistSetLastUsedPage(index, newLeafBuffer); + UnlockReleaseBuffer(newLeafBuffer); + } + if (saveCurrent.buffer != InvalidBuffer) + { + SpGistSetLastUsedPage(index, saveCurrent.buffer); + UnlockReleaseBuffer(saveCurrent.buffer); + } + + 
return insertedNew; +} + +/* + * spgMatchNode action: descend to N'th child node of current inner tuple + */ +static void +spgMatchNodeAction(Relation index, SpGistState *state, + SpGistInnerTuple innerTuple, + SPPageDesc *current, SPPageDesc *parent, int nodeN) +{ + int i; + SpGistNodeTuple node; + + /* Release previous parent buffer if any */ + if (parent->buffer != InvalidBuffer && + parent->buffer != current->buffer) + { + SpGistSetLastUsedPage(index, parent->buffer); + UnlockReleaseBuffer(parent->buffer); + } + + /* Repoint parent to specified node of current inner tuple */ + parent->blkno = current->blkno; + parent->buffer = current->buffer; + parent->page = current->page; + parent->offnum = current->offnum; + parent->node = nodeN; + + /* Locate that node */ + SGITITERATE(innerTuple, i, node) + { + if (i == nodeN) + break; + } + + if (i != nodeN) + elog(ERROR, "failed to find requested node %d in SPGiST inner tuple", + nodeN); + + /* Point current to the downlink location, if any */ + if (ItemPointerIsValid(&node->t_tid)) + { + current->blkno = ItemPointerGetBlockNumber(&node->t_tid); + current->offnum = ItemPointerGetOffsetNumber(&node->t_tid); + } + else + { + /* Downlink is empty, so we'll need to find a new page */ + current->blkno = InvalidBlockNumber; + current->offnum = InvalidOffsetNumber; + } + + current->buffer = InvalidBuffer; + current->page = NULL; +} + +/* + * spgAddNode action: add a node to the inner tuple at current + */ +static void +spgAddNodeAction(Relation index, SpGistState *state, + SpGistInnerTuple innerTuple, + SPPageDesc *current, SPPageDesc *parent, + int nodeN, Datum nodeLabel) +{ + SpGistInnerTuple newInnerTuple; + spgxlogAddNode xlrec; + + /* Should not be applied to nulls */ + Assert(!SpGistPageStoresNulls(current->page)); + + /* Construct new inner tuple with additional node */ + newInnerTuple = addNode(state, innerTuple, nodeLabel, nodeN); + + /* Prepare WAL record */ + STORE_STATE(state, xlrec.stateSrc); + xlrec.offnum = current->offnum; + + /* we don't fill these unless we need to change the parent downlink */ + xlrec.parentBlk = -1; + xlrec.offnumParent = InvalidOffsetNumber; + xlrec.nodeI = 0; + + /* we don't fill these unless tuple has to be moved */ + xlrec.offnumNew = InvalidOffsetNumber; + xlrec.newPage = false; + + if (PageGetExactFreeSpace(current->page) >= + newInnerTuple->size - innerTuple->size) + { + /* + * We can replace the inner tuple by new version in-place + */ + START_CRIT_SECTION(); + + PageIndexTupleDelete(current->page, current->offnum); + if (PageAddItem(current->page, + (Item) newInnerTuple, newInnerTuple->size, + current->offnum, false, false) != current->offnum) + elog(ERROR, "failed to add item of size %u to SPGiST index page", + newInnerTuple->size); + + MarkBufferDirty(current->buffer); + + if (RelationNeedsWAL(index) && !state->isBuild) + { + XLogRecPtr recptr; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + XLogRegisterData((char *) newInnerTuple, newInnerTuple->size); + + XLogRegisterBuffer(0, current->buffer, REGBUF_STANDARD); + + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_ADD_NODE); + + PageSetLSN(current->page, recptr); + } + + END_CRIT_SECTION(); + } + else + { + /* + * move inner tuple to another page, and update parent + */ + SpGistDeadTuple dt; + SPPageDesc saveCurrent; + + /* + * It should not be possible to get here for the root page, since we + * allow only one inner tuple on the root page, and spgFormInnerTuple + * always checks that inner tuples don't exceed the size of a 
page. + */ + if (SpGistBlockIsRoot(current->blkno)) + elog(ERROR, "cannot enlarge root tuple any more"); + Assert(parent->buffer != InvalidBuffer); + + saveCurrent = *current; + + xlrec.offnumParent = parent->offnum; + xlrec.nodeI = parent->node; + + /* + * obtain new buffer with the same parity as current, since it will be + * a child of same parent tuple + */ + current->buffer = SpGistGetBuffer(index, + GBUF_INNER_PARITY(current->blkno), + newInnerTuple->size + sizeof(ItemIdData), + &xlrec.newPage); + current->blkno = BufferGetBlockNumber(current->buffer); + current->page = BufferGetPage(current->buffer); + + /* + * Let's just make real sure new current isn't same as old. Right now + * that's impossible, but if SpGistGetBuffer ever got smart enough to + * delete placeholder tuples before checking space, maybe it wouldn't + * be impossible. The case would appear to work except that WAL + * replay would be subtly wrong, so I think a mere assert isn't enough + * here. + */ + if (current->blkno == saveCurrent.blkno) + elog(ERROR, "SPGiST new buffer shouldn't be same as old buffer"); + + /* + * New current and parent buffer will both be modified; but note that + * parent buffer could be same as either new or old current. + */ + if (parent->buffer == saveCurrent.buffer) + xlrec.parentBlk = 0; + else if (parent->buffer == current->buffer) + xlrec.parentBlk = 1; + else + xlrec.parentBlk = 2; + + START_CRIT_SECTION(); + + /* insert new ... */ + xlrec.offnumNew = current->offnum = + SpGistPageAddNewItem(state, current->page, + (Item) newInnerTuple, newInnerTuple->size, + NULL, false); + + MarkBufferDirty(current->buffer); + + /* update parent's downlink and mark parent page dirty */ + saveNodeLink(index, parent, current->blkno, current->offnum); + + /* + * Replace old tuple with a placeholder or redirection tuple. Unless + * doing an index build, we have to insert a redirection tuple for + * possible concurrent scans. We can't just delete it in any case, + * because that could change the offsets of other tuples on the page, + * breaking downlinks from their parents. 
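+ * + * The redirection tuple created below records the inner tuple's new + * location (current->blkno, current->offnum), so a concurrent scan that + * arrives at the old slot can follow it to the moved tuple.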
+ */ + if (state->isBuild) + dt = spgFormDeadTuple(state, SPGIST_PLACEHOLDER, + InvalidBlockNumber, InvalidOffsetNumber); + else + dt = spgFormDeadTuple(state, SPGIST_REDIRECT, + current->blkno, current->offnum); + + PageIndexTupleDelete(saveCurrent.page, saveCurrent.offnum); + if (PageAddItem(saveCurrent.page, (Item) dt, dt->size, + saveCurrent.offnum, + false, false) != saveCurrent.offnum) + elog(ERROR, "failed to add item of size %u to SPGiST index page", + dt->size); + + if (state->isBuild) + SpGistPageGetOpaque(saveCurrent.page)->nPlaceholder++; + else + SpGistPageGetOpaque(saveCurrent.page)->nRedirection++; + + MarkBufferDirty(saveCurrent.buffer); + + if (RelationNeedsWAL(index) && !state->isBuild) + { + XLogRecPtr recptr; + int flags; + + XLogBeginInsert(); + + /* orig page */ + XLogRegisterBuffer(0, saveCurrent.buffer, REGBUF_STANDARD); + /* new page */ + flags = REGBUF_STANDARD; + if (xlrec.newPage) + flags |= REGBUF_WILL_INIT; + XLogRegisterBuffer(1, current->buffer, flags); + /* parent page (if different from orig and new) */ + if (xlrec.parentBlk == 2) + XLogRegisterBuffer(2, parent->buffer, REGBUF_STANDARD); + + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + XLogRegisterData((char *) newInnerTuple, newInnerTuple->size); + + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_ADD_NODE); + + /* we don't bother to check if any of these are redundant */ + PageSetLSN(current->page, recptr); + PageSetLSN(parent->page, recptr); + PageSetLSN(saveCurrent.page, recptr); + } + + END_CRIT_SECTION(); + + /* Release saveCurrent if it's not same as current or parent */ + if (saveCurrent.buffer != current->buffer && + saveCurrent.buffer != parent->buffer) + { + SpGistSetLastUsedPage(index, saveCurrent.buffer); + UnlockReleaseBuffer(saveCurrent.buffer); + } + } +} + +/* + * spgSplitNode action: split inner tuple at current into prefix and postfix + */ +static void +spgSplitNodeAction(Relation index, SpGistState *state, + SpGistInnerTuple innerTuple, + SPPageDesc *current, spgChooseOut *out) +{ + SpGistInnerTuple prefixTuple, + postfixTuple; + SpGistNodeTuple node, + *nodes; + BlockNumber postfixBlkno; + OffsetNumber postfixOffset; + int i; + spgxlogSplitTuple xlrec; + Buffer newBuffer = InvalidBuffer; + + /* Should not be applied to nulls */ + Assert(!SpGistPageStoresNulls(current->page)); + + /* Check opclass gave us sane values */ + if (out->result.splitTuple.prefixNNodes <= 0 || + out->result.splitTuple.prefixNNodes > SGITMAXNNODES) + elog(ERROR, "invalid number of prefix nodes: %d", + out->result.splitTuple.prefixNNodes); + if (out->result.splitTuple.childNodeN < 0 || + out->result.splitTuple.childNodeN >= + out->result.splitTuple.prefixNNodes) + elog(ERROR, "invalid child node number: %d", + out->result.splitTuple.childNodeN); + + /* + * Construct new prefix tuple with requested number of nodes. We'll fill + * in the childNodeN'th node's downlink below. 
+ */ + nodes = (SpGistNodeTuple *) palloc(sizeof(SpGistNodeTuple) * + out->result.splitTuple.prefixNNodes); + + for (i = 0; i < out->result.splitTuple.prefixNNodes; i++) + { + Datum label = (Datum) 0; + bool labelisnull; + + labelisnull = (out->result.splitTuple.prefixNodeLabels == NULL); + if (!labelisnull) + label = out->result.splitTuple.prefixNodeLabels[i]; + nodes[i] = spgFormNodeTuple(state, label, labelisnull); + } + + prefixTuple = spgFormInnerTuple(state, + out->result.splitTuple.prefixHasPrefix, + out->result.splitTuple.prefixPrefixDatum, + out->result.splitTuple.prefixNNodes, + nodes); + + /* it must fit in the space that innerTuple now occupies */ + if (prefixTuple->size > innerTuple->size) + elog(ERROR, "SPGiST inner-tuple split must not produce longer prefix"); + + /* + * Construct new postfix tuple, containing all nodes of innerTuple with + * same node datums, but with the prefix specified by the picksplit + * function. + */ + nodes = palloc(sizeof(SpGistNodeTuple) * innerTuple->nNodes); + SGITITERATE(innerTuple, i, node) + { + nodes[i] = node; + } + + postfixTuple = spgFormInnerTuple(state, + out->result.splitTuple.postfixHasPrefix, + out->result.splitTuple.postfixPrefixDatum, + innerTuple->nNodes, nodes); + + /* Postfix tuple is allTheSame if original tuple was */ + postfixTuple->allTheSame = innerTuple->allTheSame; + + /* prep data for WAL record */ + xlrec.newPage = false; + + /* + * If we can't fit both tuples on the current page, get a new page for the + * postfix tuple. In particular, can't split to the root page. + * + * For the space calculation, note that prefixTuple replaces innerTuple + * but postfixTuple will be a new entry. + */ + if (SpGistBlockIsRoot(current->blkno) || + SpGistPageGetFreeSpace(current->page, 1) + innerTuple->size < + prefixTuple->size + postfixTuple->size + sizeof(ItemIdData)) + { + /* + * Choose page with next triple parity, because postfix tuple is a + * child of prefix one + */ + newBuffer = SpGistGetBuffer(index, + GBUF_INNER_PARITY(current->blkno + 1), + postfixTuple->size + sizeof(ItemIdData), + &xlrec.newPage); + } + + START_CRIT_SECTION(); + + /* + * Replace old tuple by prefix tuple + */ + PageIndexTupleDelete(current->page, current->offnum); + xlrec.offnumPrefix = PageAddItem(current->page, + (Item) prefixTuple, prefixTuple->size, + current->offnum, false, false); + if (xlrec.offnumPrefix != current->offnum) + elog(ERROR, "failed to add item of size %u to SPGiST index page", + prefixTuple->size); + + /* + * put postfix tuple into appropriate page + */ + if (newBuffer == InvalidBuffer) + { + postfixBlkno = current->blkno; + xlrec.offnumPostfix = postfixOffset = + SpGistPageAddNewItem(state, current->page, + (Item) postfixTuple, postfixTuple->size, + NULL, false); + xlrec.postfixBlkSame = true; + } + else + { + postfixBlkno = BufferGetBlockNumber(newBuffer); + xlrec.offnumPostfix = postfixOffset = + SpGistPageAddNewItem(state, BufferGetPage(newBuffer), + (Item) postfixTuple, postfixTuple->size, + NULL, false); + MarkBufferDirty(newBuffer); + xlrec.postfixBlkSame = false; + } + + /* + * And set downlink pointer in the prefix tuple to point to postfix tuple. + * (We can't avoid this step by doing the above two steps in opposite + * order, because there might not be enough space on the page to insert + * the postfix tuple first.) We have to update the local copy of the + * prefixTuple too, because that's what will be written to WAL. 
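+ * + * This is why spgUpdateNodeLink is called twice below: once on the local + * prefixTuple and once on the copy that has already been placed on the + * page.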
+ */ + spgUpdateNodeLink(prefixTuple, out->result.splitTuple.childNodeN, + postfixBlkno, postfixOffset); + prefixTuple = (SpGistInnerTuple) PageGetItem(current->page, + PageGetItemId(current->page, current->offnum)); + spgUpdateNodeLink(prefixTuple, out->result.splitTuple.childNodeN, + postfixBlkno, postfixOffset); + + MarkBufferDirty(current->buffer); + + if (RelationNeedsWAL(index) && !state->isBuild) + { + XLogRecPtr recptr; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + XLogRegisterData((char *) prefixTuple, prefixTuple->size); + XLogRegisterData((char *) postfixTuple, postfixTuple->size); + + XLogRegisterBuffer(0, current->buffer, REGBUF_STANDARD); + if (newBuffer != InvalidBuffer) + { + int flags; + + flags = REGBUF_STANDARD; + if (xlrec.newPage) + flags |= REGBUF_WILL_INIT; + XLogRegisterBuffer(1, newBuffer, flags); + } + + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_SPLIT_TUPLE); + + PageSetLSN(current->page, recptr); + + if (newBuffer != InvalidBuffer) + { + PageSetLSN(BufferGetPage(newBuffer), recptr); + } + } + + END_CRIT_SECTION(); + + /* Update local free-space cache and release buffer */ + if (newBuffer != InvalidBuffer) + { + SpGistSetLastUsedPage(index, newBuffer); + UnlockReleaseBuffer(newBuffer); + } +} + +/* + * Insert one item into the index. + * + * Returns true on success, false if we failed to complete the insertion + * (typically because of conflict with a concurrent insert). In the latter + * case, caller should re-call spgdoinsert() with the same args. + */ +bool +spgdoinsert(Relation index, SpGistState *state, + ItemPointer heapPtr, Datum *datums, bool *isnulls) +{ + bool result = true; + TupleDesc leafDescriptor = state->leafTupDesc; + bool isnull = isnulls[spgKeyColumn]; + int level = 0; + Datum leafDatums[INDEX_MAX_KEYS]; + int leafSize; + int bestLeafSize; + int numNoProgressCycles = 0; + SPPageDesc current, + parent; + FmgrInfo *procinfo = NULL; + + /* + * Look up FmgrInfo of the user-defined choose function once, to save + * cycles in the loop below. + */ + if (!isnull) + procinfo = index_getprocinfo(index, 1, SPGIST_CHOOSE_PROC); + + /* + * Prepare the leaf datum to insert. + * + * If an optional "compress" method is provided, then call it to form the + * leaf key datum from the input datum. Otherwise, store the input datum + * as is. Since we don't use index_form_tuple in this AM, we have to make + * sure value to be inserted is not toasted; FormIndexDatum doesn't + * guarantee that. But we assume the "compress" method to return an + * untoasted value. 
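+ * + * Hence, in the key-column code below, PG_DETOAST_DATUM is applied only + * in the no-compress path and only for varlena (attlen == -1) values.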
+ */ + if (!isnull) + { + if (OidIsValid(index_getprocid(index, 1, SPGIST_COMPRESS_PROC))) + { + FmgrInfo *compressProcinfo = NULL; + + compressProcinfo = index_getprocinfo(index, 1, SPGIST_COMPRESS_PROC); + leafDatums[spgKeyColumn] = + FunctionCall1Coll(compressProcinfo, + index->rd_indcollation[spgKeyColumn], + datums[spgKeyColumn]); + } + else + { + Assert(state->attLeafType.type == state->attType.type); + + if (state->attType.attlen == -1) + leafDatums[spgKeyColumn] = + PointerGetDatum(PG_DETOAST_DATUM(datums[spgKeyColumn])); + else + leafDatums[spgKeyColumn] = datums[spgKeyColumn]; + } + } + else + leafDatums[spgKeyColumn] = (Datum) 0; + + /* Likewise, ensure that any INCLUDE values are not toasted */ + for (int i = spgFirstIncludeColumn; i < leafDescriptor->natts; i++) + { + if (!isnulls[i]) + { + if (TupleDescAttr(leafDescriptor, i)->attlen == -1) + leafDatums[i] = PointerGetDatum(PG_DETOAST_DATUM(datums[i])); + else + leafDatums[i] = datums[i]; + } + else + leafDatums[i] = (Datum) 0; + } + + /* + * Compute space needed for a leaf tuple containing the given data. + */ + leafSize = SpGistGetLeafTupleSize(leafDescriptor, leafDatums, isnulls); + /* Account for an item pointer, too */ + leafSize += sizeof(ItemIdData); + + /* + * If it isn't gonna fit, and the opclass can't reduce the datum size by + * suffixing, bail out now rather than doing a lot of useless work. + */ + if (leafSize > SPGIST_PAGE_CAPACITY && + (isnull || !state->config.longValuesOK)) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("index row size %zu exceeds maximum %zu for index \"%s\"", + leafSize - sizeof(ItemIdData), + SPGIST_PAGE_CAPACITY - sizeof(ItemIdData), + RelationGetRelationName(index)), + errhint("Values larger than a buffer page cannot be indexed."))); + bestLeafSize = leafSize; + + /* Initialize "current" to the appropriate root page */ + current.blkno = isnull ? SPGIST_NULL_BLKNO : SPGIST_ROOT_BLKNO; + current.buffer = InvalidBuffer; + current.page = NULL; + current.offnum = FirstOffsetNumber; + current.node = -1; + + /* "parent" is invalid for the moment */ + parent.blkno = InvalidBlockNumber; + parent.buffer = InvalidBuffer; + parent.page = NULL; + parent.offnum = InvalidOffsetNumber; + parent.node = -1; + + /* + * Before entering the loop, try to clear any pending interrupt condition. + * If a query cancel is pending, we might as well accept it now not later; + * while if a non-canceling condition is pending, servicing it here avoids + * having to restart the insertion and redo all the work so far. + */ + CHECK_FOR_INTERRUPTS(); + + for (;;) + { + bool isNew = false; + + /* + * Bail out if query cancel is pending. We must have this somewhere + * in the loop since a broken opclass could produce an infinite + * picksplit loop. However, because we'll be holding buffer lock(s) + * after the first iteration, ProcessInterrupts() wouldn't be able to + * throw a cancel error here. Hence, if we see that an interrupt is + * pending, break out of the loop and deal with the situation below. + * Set result = false because we must restart the insertion if the + * interrupt isn't a query-cancel-or-die case. + */ + if (INTERRUPTS_PENDING_CONDITION()) + { + result = false; + break; + } + + if (current.blkno == InvalidBlockNumber) + { + /* + * Create a leaf page. If leafSize is too large to fit on a page, + * we won't actually use the page yet, but it simplifies the API + * for doPickSplit to always have a leaf page at hand; so just + * quietly limit our request to a page size. 
+ */ + current.buffer = + SpGistGetBuffer(index, + GBUF_LEAF | (isnull ? GBUF_NULLS : 0), + Min(leafSize, SPGIST_PAGE_CAPACITY), + &isNew); + current.blkno = BufferGetBlockNumber(current.buffer); + } + else if (parent.buffer == InvalidBuffer) + { + /* we hold no parent-page lock, so no deadlock is possible */ + current.buffer = ReadBuffer(index, current.blkno); + LockBuffer(current.buffer, BUFFER_LOCK_EXCLUSIVE); + } + else if (current.blkno != parent.blkno) + { + /* descend to a new child page */ + current.buffer = ReadBuffer(index, current.blkno); + + /* + * Attempt to acquire lock on child page. We must beware of + * deadlock against another insertion process descending from that + * page to our parent page (see README). If we fail to get lock, + * abandon the insertion and tell our caller to start over. + * + * XXX this could be improved, because failing to get lock on a + * buffer is not proof of a deadlock situation; the lock might be + * held by a reader, or even just background writer/checkpointer + * process. Perhaps it'd be worth retrying after sleeping a bit? + */ + if (!ConditionalLockBuffer(current.buffer)) + { + ReleaseBuffer(current.buffer); + UnlockReleaseBuffer(parent.buffer); + return false; + } + } + else + { + /* inner tuple can be stored on the same page as parent one */ + current.buffer = parent.buffer; + } + current.page = BufferGetPage(current.buffer); + + /* should not arrive at a page of the wrong type */ + if (isnull ? !SpGistPageStoresNulls(current.page) : + SpGistPageStoresNulls(current.page)) + elog(ERROR, "SPGiST index page %u has wrong nulls flag", + current.blkno); + + if (SpGistPageIsLeaf(current.page)) + { + SpGistLeafTuple leafTuple; + int nToSplit, + sizeToSplit; + + leafTuple = spgFormLeafTuple(state, heapPtr, leafDatums, isnulls); + if (leafTuple->size + sizeof(ItemIdData) <= + SpGistPageGetFreeSpace(current.page, 1)) + { + /* it fits on page, so insert it and we're done */ + addLeafTuple(index, state, leafTuple, + ¤t, &parent, isnull, isNew); + break; + } + else if ((sizeToSplit = + checkSplitConditions(index, state, ¤t, + &nToSplit)) < SPGIST_PAGE_CAPACITY / 2 && + nToSplit < 64 && + leafTuple->size + sizeof(ItemIdData) + sizeToSplit <= SPGIST_PAGE_CAPACITY) + { + /* + * the amount of data is pretty small, so just move the whole + * chain to another leaf page rather than splitting it. + */ + Assert(!isNew); + moveLeafs(index, state, ¤t, &parent, leafTuple, isnull); + break; /* we're done */ + } + else + { + /* picksplit */ + if (doPickSplit(index, state, ¤t, &parent, + leafTuple, level, isnull, isNew)) + break; /* doPickSplit installed new tuples */ + + /* leaf tuple will not be inserted yet */ + pfree(leafTuple); + + /* + * current now describes new inner tuple, go insert into it + */ + Assert(!SpGistPageIsLeaf(current.page)); + goto process_inner_tuple; + } + } + else /* non-leaf page */ + { + /* + * Apply the opclass choose function to figure out how to insert + * the given datum into the current inner tuple. + */ + SpGistInnerTuple innerTuple; + spgChooseIn in; + spgChooseOut out; + + /* + * spgAddNode and spgSplitTuple cases will loop back to here to + * complete the insertion operation. Just in case the choose + * function is broken and produces add or split requests + * repeatedly, check for query cancel (see comments above). 
+ */ + process_inner_tuple: + if (INTERRUPTS_PENDING_CONDITION()) + { + result = false; + break; + } + + innerTuple = (SpGistInnerTuple) PageGetItem(current.page, + PageGetItemId(current.page, current.offnum)); + + in.datum = datums[spgKeyColumn]; + in.leafDatum = leafDatums[spgKeyColumn]; + in.level = level; + in.allTheSame = innerTuple->allTheSame; + in.hasPrefix = (innerTuple->prefixSize > 0); + in.prefixDatum = SGITDATUM(innerTuple, state); + in.nNodes = innerTuple->nNodes; + in.nodeLabels = spgExtractNodeLabels(state, innerTuple); + + memset(&out, 0, sizeof(out)); + + if (!isnull) + { + /* use user-defined choose method */ + FunctionCall2Coll(procinfo, + index->rd_indcollation[0], + PointerGetDatum(&in), + PointerGetDatum(&out)); + } + else + { + /* force "match" action (to insert to random subnode) */ + out.resultType = spgMatchNode; + } + + if (innerTuple->allTheSame) + { + /* + * It's not allowed to do an AddNode at an allTheSame tuple. + * Opclass must say "match", in which case we choose a random + * one of the nodes to descend into, or "split". + */ + if (out.resultType == spgAddNode) + elog(ERROR, "cannot add a node to an allTheSame inner tuple"); + else if (out.resultType == spgMatchNode) + out.result.matchNode.nodeN = random() % innerTuple->nNodes; + } + + switch (out.resultType) + { + case spgMatchNode: + /* Descend to N'th child node */ + spgMatchNodeAction(index, state, innerTuple, + ¤t, &parent, + out.result.matchNode.nodeN); + /* Adjust level as per opclass request */ + level += out.result.matchNode.levelAdd; + /* Replace leafDatum and recompute leafSize */ + if (!isnull) + { + leafDatums[spgKeyColumn] = out.result.matchNode.restDatum; + leafSize = SpGistGetLeafTupleSize(leafDescriptor, + leafDatums, isnulls); + leafSize += sizeof(ItemIdData); + } + + /* + * Check new tuple size; fail if it can't fit, unless the + * opclass says it can handle the situation by suffixing. + * + * However, the opclass can only shorten the leaf datum, + * which may not be enough to ever make the tuple fit, + * since INCLUDE columns might alone use more than a page. + * Depending on the opclass' behavior, that could lead to + * an infinite loop --- spgtextproc.c, for example, will + * just repeatedly generate an empty-string leaf datum + * once it runs out of data. Actual bugs in opclasses + * might cause infinite looping, too. To detect such a + * loop, check to see if we are making progress by + * reducing the leafSize in each pass. This is a bit + * tricky though. Because of alignment considerations, + * the total tuple size might not decrease on every pass. + * Also, there are edge cases where the choose method + * might seem to not make progress for a cycle or two. + * Somewhat arbitrarily, we allow up to 10 no-progress + * iterations before failing. (This limit should be more + * than MAXALIGN, to accommodate opclasses that trim one + * byte from the leaf datum per pass.) 
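+ * + * Concretely: a pass that shrinks leafSize below the best size seen so + * far resets the counter, any other pass increments it, and once ten + * passes in a row make no progress the row-size error below is raised.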
+ */ + if (leafSize > SPGIST_PAGE_CAPACITY) + { + bool ok = false; + + if (state->config.longValuesOK && !isnull) + { + if (leafSize < bestLeafSize) + { + ok = true; + bestLeafSize = leafSize; + numNoProgressCycles = 0; + } + else if (++numNoProgressCycles < 10) + ok = true; + } + if (!ok) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("index row size %zu exceeds maximum %zu for index \"%s\"", + leafSize - sizeof(ItemIdData), + SPGIST_PAGE_CAPACITY - sizeof(ItemIdData), + RelationGetRelationName(index)), + errhint("Values larger than a buffer page cannot be indexed."))); + } + + /* + * Loop around and attempt to insert the new leafDatum at + * "current" (which might reference an existing child + * tuple, or might be invalid to force us to find a new + * page for the tuple). + */ + break; + case spgAddNode: + /* AddNode is not sensible if nodes don't have labels */ + if (in.nodeLabels == NULL) + elog(ERROR, "cannot add a node to an inner tuple without node labels"); + /* Add node to inner tuple, per request */ + spgAddNodeAction(index, state, innerTuple, + ¤t, &parent, + out.result.addNode.nodeN, + out.result.addNode.nodeLabel); + + /* + * Retry insertion into the enlarged node. We assume that + * we'll get a MatchNode result this time. + */ + goto process_inner_tuple; + break; + case spgSplitTuple: + /* Split inner tuple, per request */ + spgSplitNodeAction(index, state, innerTuple, + ¤t, &out); + + /* Retry insertion into the split node */ + goto process_inner_tuple; + break; + default: + elog(ERROR, "unrecognized SPGiST choose result: %d", + (int) out.resultType); + break; + } + } + } /* end loop */ + + /* + * Release any buffers we're still holding. Beware of possibility that + * current and parent reference same buffer. + */ + if (current.buffer != InvalidBuffer) + { + SpGistSetLastUsedPage(index, current.buffer); + UnlockReleaseBuffer(current.buffer); + } + if (parent.buffer != InvalidBuffer && + parent.buffer != current.buffer) + { + SpGistSetLastUsedPage(index, parent.buffer); + UnlockReleaseBuffer(parent.buffer); + } + + /* + * We do not support being called while some outer function is holding a + * buffer lock (or any other reason to postpone query cancels). If that + * were the case, telling the caller to retry would create an infinite + * loop. + */ + Assert(INTERRUPTS_CAN_BE_PROCESSED()); + + /* + * Finally, check for interrupts again. If there was a query cancel, + * ProcessInterrupts() will be able to throw the error here. If it was + * some other kind of interrupt that can just be cleared, return false to + * tell our caller to retry. + */ + CHECK_FOR_INTERRUPTS(); + + return result; +} diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c new file mode 100644 index 0000000..1af0af7 --- /dev/null +++ b/src/backend/access/spgist/spginsert.c @@ -0,0 +1,243 @@ +/*------------------------------------------------------------------------- + * + * spginsert.c + * Externally visible index creation/insertion routines + * + * All the actual insertion logic is in spgdoinsert.c. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/spgist/spginsert.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/genam.h" +#include "access/spgist_private.h" +#include "access/spgxlog.h" +#include "access/tableam.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "catalog/index.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "storage/smgr.h" +#include "utils/memutils.h" +#include "utils/rel.h" + + +typedef struct +{ + SpGistState spgstate; /* SPGiST's working state */ + int64 indtuples; /* total number of tuples indexed */ + MemoryContext tmpCtx; /* per-tuple temporary context */ +} SpGistBuildState; + + +/* Callback to process one heap tuple during table_index_build_scan */ +static void +spgistBuildCallback(Relation index, ItemPointer tid, Datum *values, + bool *isnull, bool tupleIsAlive, void *state) +{ + SpGistBuildState *buildstate = (SpGistBuildState *) state; + MemoryContext oldCtx; + + /* Work in temp context, and reset it after each tuple */ + oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx); + + /* + * Even though no concurrent insertions can be happening, we still might + * get a buffer-locking failure due to bgwriter or checkpointer taking a + * lock on some buffer. So we need to be willing to retry. We can flush + * any temp data when retrying. + */ + while (!spgdoinsert(index, &buildstate->spgstate, tid, + values, isnull)) + { + MemoryContextReset(buildstate->tmpCtx); + } + + /* Update total tuple count */ + buildstate->indtuples += 1; + + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(buildstate->tmpCtx); +} + +/* + * Build an SP-GiST index. 
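+ * + * In outline: initialize the metapage and the two root pages (regular and + * nulls), scan the heap inserting each tuple via spgdoinsert, and finally, + * if WAL is needed, log all pages in one pass with log_newpage_range, + * since per-tuple WAL logging is skipped during the build.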
+ */ +IndexBuildResult * +spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) +{ + IndexBuildResult *result; + double reltuples; + SpGistBuildState buildstate; + Buffer metabuffer, + rootbuffer, + nullbuffer; + + if (RelationGetNumberOfBlocks(index) != 0) + elog(ERROR, "index \"%s\" already contains data", + RelationGetRelationName(index)); + + /* + * Initialize the meta page and root pages + */ + metabuffer = SpGistNewBuffer(index); + rootbuffer = SpGistNewBuffer(index); + nullbuffer = SpGistNewBuffer(index); + + Assert(BufferGetBlockNumber(metabuffer) == SPGIST_METAPAGE_BLKNO); + Assert(BufferGetBlockNumber(rootbuffer) == SPGIST_ROOT_BLKNO); + Assert(BufferGetBlockNumber(nullbuffer) == SPGIST_NULL_BLKNO); + + START_CRIT_SECTION(); + + SpGistInitMetapage(BufferGetPage(metabuffer)); + MarkBufferDirty(metabuffer); + SpGistInitBuffer(rootbuffer, SPGIST_LEAF); + MarkBufferDirty(rootbuffer); + SpGistInitBuffer(nullbuffer, SPGIST_LEAF | SPGIST_NULLS); + MarkBufferDirty(nullbuffer); + + + END_CRIT_SECTION(); + + UnlockReleaseBuffer(metabuffer); + UnlockReleaseBuffer(rootbuffer); + UnlockReleaseBuffer(nullbuffer); + + /* + * Now insert all the heap data into the index + */ + initSpGistState(&buildstate.spgstate, index); + buildstate.spgstate.isBuild = true; + buildstate.indtuples = 0; + + buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext, + "SP-GiST build temporary context", + ALLOCSET_DEFAULT_SIZES); + + reltuples = table_index_build_scan(heap, index, indexInfo, true, true, + spgistBuildCallback, (void *) &buildstate, + NULL); + + MemoryContextDelete(buildstate.tmpCtx); + + SpGistUpdateMetaPage(index); + + /* + * We didn't write WAL records as we built the index, so if WAL-logging is + * required, write all pages to the WAL now. + */ + if (RelationNeedsWAL(index)) + { + log_newpage_range(index, MAIN_FORKNUM, + 0, RelationGetNumberOfBlocks(index), + true); + } + + result = (IndexBuildResult *) palloc0(sizeof(IndexBuildResult)); + result->heap_tuples = reltuples; + result->index_tuples = buildstate.indtuples; + + return result; +} + +/* + * Build an empty SPGiST index in the initialization fork + */ +void +spgbuildempty(Relation index) +{ + Page page; + + /* Construct metapage. */ + page = (Page) palloc(BLCKSZ); + SpGistInitMetapage(page); + + /* + * Write the page and log it unconditionally. This is important + * particularly for indexes created on tablespaces and databases whose + * creation happened after the last redo pointer as recovery removes any + * of their existing content when the corresponding create records are + * replayed. + */ + PageSetChecksumInplace(page, SPGIST_METAPAGE_BLKNO); + smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_METAPAGE_BLKNO, + (char *) page, true); + log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, + SPGIST_METAPAGE_BLKNO, page, true); + + /* Likewise for the root page. */ + SpGistInitPage(page, SPGIST_LEAF); + + PageSetChecksumInplace(page, SPGIST_ROOT_BLKNO); + smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_ROOT_BLKNO, + (char *) page, true); + log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, + SPGIST_ROOT_BLKNO, page, true); + + /* Likewise for the null-tuples root page. 
*/ + SpGistInitPage(page, SPGIST_LEAF | SPGIST_NULLS); + + PageSetChecksumInplace(page, SPGIST_NULL_BLKNO); + smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_NULL_BLKNO, + (char *) page, true); + log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, + SPGIST_NULL_BLKNO, page, true); + + /* + * An immediate sync is required even if we xlog'd the pages, because the + * writes did not go through shared buffers and therefore a concurrent + * checkpoint may have moved the redo pointer past our xlog record. + */ + smgrimmedsync(index->rd_smgr, INIT_FORKNUM); +} + +/* + * Insert one new tuple into an SPGiST index. + */ +bool +spginsert(Relation index, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + IndexInfo *indexInfo) +{ + SpGistState spgstate; + MemoryContext oldCtx; + MemoryContext insertCtx; + + insertCtx = AllocSetContextCreate(CurrentMemoryContext, + "SP-GiST insert temporary context", + ALLOCSET_DEFAULT_SIZES); + oldCtx = MemoryContextSwitchTo(insertCtx); + + initSpGistState(&spgstate, index); + + /* + * We might have to repeat spgdoinsert() multiple times, if conflicts + * occur with concurrent insertions. If so, reset the insertCtx each time + * to avoid cumulative memory consumption. That means we also have to + * redo initSpGistState(), but it's cheap enough not to matter. + */ + while (!spgdoinsert(index, &spgstate, ht_ctid, values, isnull)) + { + MemoryContextReset(insertCtx); + initSpGistState(&spgstate, index); + } + + SpGistUpdateMetaPage(index); + + MemoryContextSwitchTo(oldCtx); + MemoryContextDelete(insertCtx); + + /* return false since we've not done any unique check */ + return false; +} diff --git a/src/backend/access/spgist/spgkdtreeproc.c b/src/backend/access/spgist/spgkdtreeproc.c new file mode 100644 index 0000000..d9b3f6a --- /dev/null +++ b/src/backend/access/spgist/spgkdtreeproc.c @@ -0,0 +1,349 @@ +/*------------------------------------------------------------------------- + * + * spgkdtreeproc.c + * implementation of k-d tree over points for SP-GiST + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/spgist/spgkdtreeproc.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/spgist.h" +#include "access/spgist_private.h" +#include "access/stratnum.h" +#include "catalog/pg_type.h" +#include "utils/builtins.h" +#include "utils/float.h" +#include "utils/geo_decls.h" + + +Datum +spg_kd_config(PG_FUNCTION_ARGS) +{ + /* spgConfigIn *cfgin = (spgConfigIn *) PG_GETARG_POINTER(0); */ + spgConfigOut *cfg = (spgConfigOut *) PG_GETARG_POINTER(1); + + cfg->prefixType = FLOAT8OID; + cfg->labelType = VOIDOID; /* we don't need node labels */ + cfg->canReturnData = true; + cfg->longValuesOK = false; + PG_RETURN_VOID(); +} + +static int +getSide(double coord, bool isX, Point *tst) +{ + double tstcoord = (isX) ? 
tst->x : tst->y; + + if (coord == tstcoord) + return 0; + else if (coord > tstcoord) + return 1; + else + return -1; +} + +Datum +spg_kd_choose(PG_FUNCTION_ARGS) +{ + spgChooseIn *in = (spgChooseIn *) PG_GETARG_POINTER(0); + spgChooseOut *out = (spgChooseOut *) PG_GETARG_POINTER(1); + Point *inPoint = DatumGetPointP(in->datum); + double coord; + + if (in->allTheSame) + elog(ERROR, "allTheSame should not occur for k-d trees"); + + Assert(in->hasPrefix); + coord = DatumGetFloat8(in->prefixDatum); + + Assert(in->nNodes == 2); + + out->resultType = spgMatchNode; + out->result.matchNode.nodeN = + (getSide(coord, in->level % 2, inPoint) > 0) ? 0 : 1; + out->result.matchNode.levelAdd = 1; + out->result.matchNode.restDatum = PointPGetDatum(inPoint); + + PG_RETURN_VOID(); +} + +typedef struct SortedPoint +{ + Point *p; + int i; +} SortedPoint; + +static int +x_cmp(const void *a, const void *b) +{ + SortedPoint *pa = (SortedPoint *) a; + SortedPoint *pb = (SortedPoint *) b; + + if (pa->p->x == pb->p->x) + return 0; + return (pa->p->x > pb->p->x) ? 1 : -1; +} + +static int +y_cmp(const void *a, const void *b) +{ + SortedPoint *pa = (SortedPoint *) a; + SortedPoint *pb = (SortedPoint *) b; + + if (pa->p->y == pb->p->y) + return 0; + return (pa->p->y > pb->p->y) ? 1 : -1; +} + + +Datum +spg_kd_picksplit(PG_FUNCTION_ARGS) +{ + spgPickSplitIn *in = (spgPickSplitIn *) PG_GETARG_POINTER(0); + spgPickSplitOut *out = (spgPickSplitOut *) PG_GETARG_POINTER(1); + int i; + int middle; + SortedPoint *sorted; + double coord; + + sorted = palloc(sizeof(*sorted) * in->nTuples); + for (i = 0; i < in->nTuples; i++) + { + sorted[i].p = DatumGetPointP(in->datums[i]); + sorted[i].i = i; + } + + qsort(sorted, in->nTuples, sizeof(*sorted), + (in->level % 2) ? x_cmp : y_cmp); + middle = in->nTuples >> 1; + coord = (in->level % 2) ? sorted[middle].p->x : sorted[middle].p->y; + + out->hasPrefix = true; + out->prefixDatum = Float8GetDatum(coord); + + out->nNodes = 2; + out->nodeLabels = NULL; /* we don't need node labels */ + + out->mapTuplesToNodes = palloc(sizeof(int) * in->nTuples); + out->leafTupleDatums = palloc(sizeof(Datum) * in->nTuples); + + /* + * Note: points that have coordinates exactly equal to coord may get + * classified into either node, depending on where they happen to fall in + * the sorted list. This is okay as long as the inner_consistent function + * descends into both sides for such cases. This is better than the + * alternative of trying to have an exact boundary, because it keeps the + * tree balanced even when we have many instances of the same point value. + * So we should never trigger the allTheSame logic. + */ + for (i = 0; i < in->nTuples; i++) + { + Point *p = sorted[i].p; + int n = sorted[i].i; + + out->mapTuplesToNodes[n] = (i < middle) ? 
0 : 1; + out->leafTupleDatums[n] = PointPGetDatum(p); + } + + PG_RETURN_VOID(); +} + +Datum +spg_kd_inner_consistent(PG_FUNCTION_ARGS) +{ + spgInnerConsistentIn *in = (spgInnerConsistentIn *) PG_GETARG_POINTER(0); + spgInnerConsistentOut *out = (spgInnerConsistentOut *) PG_GETARG_POINTER(1); + double coord; + int which; + int i; + BOX bboxes[2]; + + Assert(in->hasPrefix); + coord = DatumGetFloat8(in->prefixDatum); + + if (in->allTheSame) + elog(ERROR, "allTheSame should not occur for k-d trees"); + + Assert(in->nNodes == 2); + + /* "which" is a bitmask of children that satisfy all constraints */ + which = (1 << 1) | (1 << 2); + + for (i = 0; i < in->nkeys; i++) + { + Point *query = DatumGetPointP(in->scankeys[i].sk_argument); + BOX *boxQuery; + + switch (in->scankeys[i].sk_strategy) + { + case RTLeftStrategyNumber: + if ((in->level % 2) != 0 && FPlt(query->x, coord)) + which &= (1 << 1); + break; + case RTRightStrategyNumber: + if ((in->level % 2) != 0 && FPgt(query->x, coord)) + which &= (1 << 2); + break; + case RTSameStrategyNumber: + if ((in->level % 2) != 0) + { + if (FPlt(query->x, coord)) + which &= (1 << 1); + else if (FPgt(query->x, coord)) + which &= (1 << 2); + } + else + { + if (FPlt(query->y, coord)) + which &= (1 << 1); + else if (FPgt(query->y, coord)) + which &= (1 << 2); + } + break; + case RTBelowStrategyNumber: + case RTOldBelowStrategyNumber: + if ((in->level % 2) == 0 && FPlt(query->y, coord)) + which &= (1 << 1); + break; + case RTAboveStrategyNumber: + case RTOldAboveStrategyNumber: + if ((in->level % 2) == 0 && FPgt(query->y, coord)) + which &= (1 << 2); + break; + case RTContainedByStrategyNumber: + + /* + * For this operator, the query is a box not a point. We + * cheat to the extent of assuming that DatumGetPointP won't + * do anything that would be bad for a pointer-to-box. + */ + boxQuery = DatumGetBoxP(in->scankeys[i].sk_argument); + + if ((in->level % 2) != 0) + { + if (FPlt(boxQuery->high.x, coord)) + which &= (1 << 1); + else if (FPgt(boxQuery->low.x, coord)) + which &= (1 << 2); + } + else + { + if (FPlt(boxQuery->high.y, coord)) + which &= (1 << 1); + else if (FPgt(boxQuery->low.y, coord)) + which &= (1 << 2); + } + break; + default: + elog(ERROR, "unrecognized strategy number: %d", + in->scankeys[i].sk_strategy); + break; + } + + if (which == 0) + break; /* no need to consider remaining conditions */ + } + + /* We must descend into the children identified by which */ + out->nNodes = 0; + + /* Fast-path for no matching children */ + if (!which) + PG_RETURN_VOID(); + + out->nodeNumbers = (int *) palloc(sizeof(int) * 2); + + /* + * When ordering scan keys are specified, we've to calculate distance for + * them. In order to do that, we need calculate bounding boxes for both + * children nodes. Calculation of those bounding boxes on non-zero level + * require knowledge of bounding box of upper node. So, we save bounding + * boxes to traversalValues. 
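+ * + * At level zero the starting area is an unbounded box; otherwise it is + * the box saved by the parent. That box is then cut at coord: along the + * x axis on odd levels and along the y axis on even levels, giving the + * bounding boxes of the two child nodes.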
+ */ + if (in->norderbys > 0) + { + BOX infArea; + BOX *area; + + out->distances = (double **) palloc(sizeof(double *) * in->nNodes); + out->traversalValues = (void **) palloc(sizeof(void *) * in->nNodes); + + if (in->level == 0) + { + float8 inf = get_float8_infinity(); + + infArea.high.x = inf; + infArea.high.y = inf; + infArea.low.x = -inf; + infArea.low.y = -inf; + area = &infArea; + } + else + { + area = (BOX *) in->traversalValue; + Assert(area); + } + + bboxes[0].low = area->low; + bboxes[1].high = area->high; + + if (in->level % 2) + { + /* split box by x */ + bboxes[0].high.x = bboxes[1].low.x = coord; + bboxes[0].high.y = area->high.y; + bboxes[1].low.y = area->low.y; + } + else + { + /* split box by y */ + bboxes[0].high.y = bboxes[1].low.y = coord; + bboxes[0].high.x = area->high.x; + bboxes[1].low.x = area->low.x; + } + } + + for (i = 1; i <= 2; i++) + { + if (which & (1 << i)) + { + out->nodeNumbers[out->nNodes] = i - 1; + + if (in->norderbys > 0) + { + MemoryContext oldCtx = MemoryContextSwitchTo(in->traversalMemoryContext); + BOX *box = box_copy(&bboxes[i - 1]); + + MemoryContextSwitchTo(oldCtx); + + out->traversalValues[out->nNodes] = box; + + out->distances[out->nNodes] = spg_key_orderbys_distances(BoxPGetDatum(box), false, + in->orderbys, in->norderbys); + } + + out->nNodes++; + } + } + + /* Set up level increments, too */ + out->levelAdds = (int *) palloc(sizeof(int) * 2); + out->levelAdds[0] = 1; + out->levelAdds[1] = 1; + + PG_RETURN_VOID(); +} + +/* + * spg_kd_leaf_consistent() is the same as spg_quad_leaf_consistent(), + * since we support the same operators and the same leaf data type. + * So we just borrow that function. + */ diff --git a/src/backend/access/spgist/spgproc.c b/src/backend/access/spgist/spgproc.c new file mode 100644 index 0000000..1bad5d6 --- /dev/null +++ b/src/backend/access/spgist/spgproc.c @@ -0,0 +1,88 @@ +/*------------------------------------------------------------------------- + * + * spgproc.c + * Common supporting procedures for SP-GiST opclasses. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/spgist/spgproc.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include + +#include "access/spgist_private.h" +#include "utils/builtins.h" +#include "utils/float.h" +#include "utils/geo_decls.h" + +#define point_point_distance(p1,p2) \ + DatumGetFloat8(DirectFunctionCall2(point_distance, \ + PointPGetDatum(p1), PointPGetDatum(p2))) + +/* Point-box distance in the assumption that box is aligned by axis */ +static double +point_box_distance(Point *point, BOX *box) +{ + double dx, + dy; + + if (isnan(point->x) || isnan(box->low.x) || + isnan(point->y) || isnan(box->low.y)) + return get_float8_nan(); + + if (point->x < box->low.x) + dx = box->low.x - point->x; + else if (point->x > box->high.x) + dx = point->x - box->high.x; + else + dx = 0.0; + + if (point->y < box->low.y) + dy = box->low.y - point->y; + else if (point->y > box->high.y) + dy = point->y - box->high.y; + else + dy = 0.0; + + return HYPOT(dx, dy); +} + +/* + * Returns distances from given key to array of ordering scan keys. Leaf key + * is expected to be point, non-leaf key is expected to be box. Scan key + * arguments are expected to be points. 
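+ * + * For example, for a non-leaf key whose box spans (0,0) to (2,2), an + * ordering point at (5,6) yields a distance of HYPOT(3, 4) = 5.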
+ */ +double * +spg_key_orderbys_distances(Datum key, bool isLeaf, + ScanKey orderbys, int norderbys) +{ + int sk_num; + double *distances = (double *) palloc(norderbys * sizeof(double)), + *distance = distances; + + for (sk_num = 0; sk_num < norderbys; ++sk_num, ++orderbys, ++distance) + { + Point *point = DatumGetPointP(orderbys->sk_argument); + + *distance = isLeaf ? point_point_distance(point, DatumGetPointP(key)) + : point_box_distance(point, DatumGetBoxP(key)); + } + + return distances; +} + +BOX * +box_copy(BOX *orig) +{ + BOX *result = palloc(sizeof(BOX)); + + *result = *orig; + return result; +} diff --git a/src/backend/access/spgist/spgquadtreeproc.c b/src/backend/access/spgist/spgquadtreeproc.c new file mode 100644 index 0000000..a52d924 --- /dev/null +++ b/src/backend/access/spgist/spgquadtreeproc.c @@ -0,0 +1,471 @@ +/*------------------------------------------------------------------------- + * + * spgquadtreeproc.c + * implementation of quad tree over points for SP-GiST + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/spgist/spgquadtreeproc.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/spgist.h" +#include "access/spgist_private.h" +#include "access/stratnum.h" +#include "catalog/pg_type.h" +#include "utils/builtins.h" +#include "utils/float.h" +#include "utils/geo_decls.h" + +Datum +spg_quad_config(PG_FUNCTION_ARGS) +{ + /* spgConfigIn *cfgin = (spgConfigIn *) PG_GETARG_POINTER(0); */ + spgConfigOut *cfg = (spgConfigOut *) PG_GETARG_POINTER(1); + + cfg->prefixType = POINTOID; + cfg->labelType = VOIDOID; /* we don't need node labels */ + cfg->canReturnData = true; + cfg->longValuesOK = false; + PG_RETURN_VOID(); +} + +#define SPTEST(f, x, y) \ + DatumGetBool(DirectFunctionCall2(f, PointPGetDatum(x), PointPGetDatum(y))) + +/* + * Determine which quadrant a point falls into, relative to the centroid. + * + * Quadrants are identified like this: + * + * 4 | 1 + * ----+----- + * 3 | 2 + * + * Points on one of the axes are taken to lie in the lowest-numbered + * adjacent quadrant. 
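+ * + * For example, with centroid (0,0), the point (3,-2) falls in quadrant 2, + * while (3,0), lying on the boundary of quadrants 1 and 2, is assigned to + * quadrant 1.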
+ */ +static int16 +getQuadrant(Point *centroid, Point *tst) +{ + if ((SPTEST(point_above, tst, centroid) || + SPTEST(point_horiz, tst, centroid)) && + (SPTEST(point_right, tst, centroid) || + SPTEST(point_vert, tst, centroid))) + return 1; + + if (SPTEST(point_below, tst, centroid) && + (SPTEST(point_right, tst, centroid) || + SPTEST(point_vert, tst, centroid))) + return 2; + + if ((SPTEST(point_below, tst, centroid) || + SPTEST(point_horiz, tst, centroid)) && + SPTEST(point_left, tst, centroid)) + return 3; + + if (SPTEST(point_above, tst, centroid) && + SPTEST(point_left, tst, centroid)) + return 4; + + elog(ERROR, "getQuadrant: impossible case"); + return 0; +} + +/* Returns bounding box of a given quadrant inside given bounding box */ +static BOX * +getQuadrantArea(BOX *bbox, Point *centroid, int quadrant) +{ + BOX *result = (BOX *) palloc(sizeof(BOX)); + + switch (quadrant) + { + case 1: + result->high = bbox->high; + result->low = *centroid; + break; + case 2: + result->high.x = bbox->high.x; + result->high.y = centroid->y; + result->low.x = centroid->x; + result->low.y = bbox->low.y; + break; + case 3: + result->high = *centroid; + result->low = bbox->low; + break; + case 4: + result->high.x = centroid->x; + result->high.y = bbox->high.y; + result->low.x = bbox->low.x; + result->low.y = centroid->y; + break; + } + + return result; +} + +Datum +spg_quad_choose(PG_FUNCTION_ARGS) +{ + spgChooseIn *in = (spgChooseIn *) PG_GETARG_POINTER(0); + spgChooseOut *out = (spgChooseOut *) PG_GETARG_POINTER(1); + Point *inPoint = DatumGetPointP(in->datum), + *centroid; + + if (in->allTheSame) + { + out->resultType = spgMatchNode; + /* nodeN will be set by core */ + out->result.matchNode.levelAdd = 0; + out->result.matchNode.restDatum = PointPGetDatum(inPoint); + PG_RETURN_VOID(); + } + + Assert(in->hasPrefix); + centroid = DatumGetPointP(in->prefixDatum); + + Assert(in->nNodes == 4); + + out->resultType = spgMatchNode; + out->result.matchNode.nodeN = getQuadrant(centroid, inPoint) - 1; + out->result.matchNode.levelAdd = 0; + out->result.matchNode.restDatum = PointPGetDatum(inPoint); + + PG_RETURN_VOID(); +} + +#ifdef USE_MEDIAN +static int +x_cmp(const void *a, const void *b, void *arg) +{ + Point *pa = *(Point **) a; + Point *pb = *(Point **) b; + + if (pa->x == pb->x) + return 0; + return (pa->x > pb->x) ? 1 : -1; +} + +static int +y_cmp(const void *a, const void *b, void *arg) +{ + Point *pa = *(Point **) a; + Point *pb = *(Point **) b; + + if (pa->y == pb->y) + return 0; + return (pa->y > pb->y) ? 
1 : -1; +} +#endif + +Datum +spg_quad_picksplit(PG_FUNCTION_ARGS) +{ + spgPickSplitIn *in = (spgPickSplitIn *) PG_GETARG_POINTER(0); + spgPickSplitOut *out = (spgPickSplitOut *) PG_GETARG_POINTER(1); + int i; + Point *centroid; + +#ifdef USE_MEDIAN + /* Use the median values of x and y as the centroid point */ + Point **sorted; + + sorted = palloc(sizeof(*sorted) * in->nTuples); + for (i = 0; i < in->nTuples; i++) + sorted[i] = DatumGetPointP(in->datums[i]); + + centroid = palloc(sizeof(*centroid)); + + qsort(sorted, in->nTuples, sizeof(*sorted), x_cmp); + centroid->x = sorted[in->nTuples >> 1]->x; + qsort(sorted, in->nTuples, sizeof(*sorted), y_cmp); + centroid->y = sorted[in->nTuples >> 1]->y; +#else + /* Use the average values of x and y as the centroid point */ + centroid = palloc0(sizeof(*centroid)); + + for (i = 0; i < in->nTuples; i++) + { + centroid->x += DatumGetPointP(in->datums[i])->x; + centroid->y += DatumGetPointP(in->datums[i])->y; + } + + centroid->x /= in->nTuples; + centroid->y /= in->nTuples; +#endif + + out->hasPrefix = true; + out->prefixDatum = PointPGetDatum(centroid); + + out->nNodes = 4; + out->nodeLabels = NULL; /* we don't need node labels */ + + out->mapTuplesToNodes = palloc(sizeof(int) * in->nTuples); + out->leafTupleDatums = palloc(sizeof(Datum) * in->nTuples); + + for (i = 0; i < in->nTuples; i++) + { + Point *p = DatumGetPointP(in->datums[i]); + int quadrant = getQuadrant(centroid, p) - 1; + + out->leafTupleDatums[i] = PointPGetDatum(p); + out->mapTuplesToNodes[i] = quadrant; + } + + PG_RETURN_VOID(); +} + + +Datum +spg_quad_inner_consistent(PG_FUNCTION_ARGS) +{ + spgInnerConsistentIn *in = (spgInnerConsistentIn *) PG_GETARG_POINTER(0); + spgInnerConsistentOut *out = (spgInnerConsistentOut *) PG_GETARG_POINTER(1); + Point *centroid; + BOX infbbox; + BOX *bbox = NULL; + int which; + int i; + + Assert(in->hasPrefix); + centroid = DatumGetPointP(in->prefixDatum); + + /* + * When ordering scan keys are specified, we've to calculate distance for + * them. In order to do that, we need calculate bounding boxes for all + * children nodes. Calculation of those bounding boxes on non-zero level + * require knowledge of bounding box of upper node. So, we save bounding + * boxes to traversalValues. 
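 * For example, starting from the unbounded box at level 0, a node whose
 * parent box is {(0,0),(10,10)} with centroid (4,6) gets these quadrant
 * boxes from getQuadrantArea(): quadrant 1 is {(4,6),(10,10)}, quadrant 2
 * is {(4,0),(10,6)}, quadrant 3 is {(0,0),(4,6)} and quadrant 4 is
 * {(0,6),(4,10)}.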
+ */ + if (in->norderbys > 0) + { + out->distances = (double **) palloc(sizeof(double *) * in->nNodes); + out->traversalValues = (void **) palloc(sizeof(void *) * in->nNodes); + + if (in->level == 0) + { + double inf = get_float8_infinity(); + + infbbox.high.x = inf; + infbbox.high.y = inf; + infbbox.low.x = -inf; + infbbox.low.y = -inf; + bbox = &infbbox; + } + else + { + bbox = in->traversalValue; + Assert(bbox); + } + } + + if (in->allTheSame) + { + /* Report that all nodes should be visited */ + out->nNodes = in->nNodes; + out->nodeNumbers = (int *) palloc(sizeof(int) * in->nNodes); + for (i = 0; i < in->nNodes; i++) + { + out->nodeNumbers[i] = i; + + if (in->norderbys > 0) + { + MemoryContext oldCtx = MemoryContextSwitchTo(in->traversalMemoryContext); + + /* Use parent quadrant box as traversalValue */ + BOX *quadrant = box_copy(bbox); + + MemoryContextSwitchTo(oldCtx); + + out->traversalValues[i] = quadrant; + out->distances[i] = spg_key_orderbys_distances(BoxPGetDatum(quadrant), false, + in->orderbys, in->norderbys); + } + } + PG_RETURN_VOID(); + } + + Assert(in->nNodes == 4); + + /* "which" is a bitmask of quadrants that satisfy all constraints */ + which = (1 << 1) | (1 << 2) | (1 << 3) | (1 << 4); + + for (i = 0; i < in->nkeys; i++) + { + Point *query = DatumGetPointP(in->scankeys[i].sk_argument); + BOX *boxQuery; + + switch (in->scankeys[i].sk_strategy) + { + case RTLeftStrategyNumber: + if (SPTEST(point_right, centroid, query)) + which &= (1 << 3) | (1 << 4); + break; + case RTRightStrategyNumber: + if (SPTEST(point_left, centroid, query)) + which &= (1 << 1) | (1 << 2); + break; + case RTSameStrategyNumber: + which &= (1 << getQuadrant(centroid, query)); + break; + case RTBelowStrategyNumber: + case RTOldBelowStrategyNumber: + if (SPTEST(point_above, centroid, query)) + which &= (1 << 2) | (1 << 3); + break; + case RTAboveStrategyNumber: + case RTOldAboveStrategyNumber: + if (SPTEST(point_below, centroid, query)) + which &= (1 << 1) | (1 << 4); + break; + case RTContainedByStrategyNumber: + + /* + * For this operator, the query is a box not a point. We + * cheat to the extent of assuming that DatumGetPointP won't + * do anything that would be bad for a pointer-to-box. 
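 * As an example of the corner check below: with centroid (0,0) and a
 * query box {(1,1),(2,2)}, all four corners land in quadrant 1, so "which"
 * is reduced to quadrant 1 only; a query box {(-1,-1),(1,1)} contains the
 * centroid, so all four quadrants stay eligible.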
+ */ + boxQuery = DatumGetBoxP(in->scankeys[i].sk_argument); + + if (DatumGetBool(DirectFunctionCall2(box_contain_pt, + PointerGetDatum(boxQuery), + PointerGetDatum(centroid)))) + { + /* centroid is in box, so all quadrants are OK */ + } + else + { + /* identify quadrant(s) containing all corners of box */ + Point p; + int r = 0; + + p = boxQuery->low; + r |= 1 << getQuadrant(centroid, &p); + p.y = boxQuery->high.y; + r |= 1 << getQuadrant(centroid, &p); + p = boxQuery->high; + r |= 1 << getQuadrant(centroid, &p); + p.x = boxQuery->low.x; + r |= 1 << getQuadrant(centroid, &p); + + which &= r; + } + break; + default: + elog(ERROR, "unrecognized strategy number: %d", + in->scankeys[i].sk_strategy); + break; + } + + if (which == 0) + break; /* no need to consider remaining conditions */ + } + + out->levelAdds = palloc(sizeof(int) * 4); + for (i = 0; i < 4; ++i) + out->levelAdds[i] = 1; + + /* We must descend into the quadrant(s) identified by which */ + out->nodeNumbers = (int *) palloc(sizeof(int) * 4); + out->nNodes = 0; + + for (i = 1; i <= 4; i++) + { + if (which & (1 << i)) + { + out->nodeNumbers[out->nNodes] = i - 1; + + if (in->norderbys > 0) + { + MemoryContext oldCtx = MemoryContextSwitchTo(in->traversalMemoryContext); + BOX *quadrant = getQuadrantArea(bbox, centroid, i); + + MemoryContextSwitchTo(oldCtx); + + out->traversalValues[out->nNodes] = quadrant; + + out->distances[out->nNodes] = spg_key_orderbys_distances(BoxPGetDatum(quadrant), false, + in->orderbys, in->norderbys); + } + + out->nNodes++; + } + } + + PG_RETURN_VOID(); +} + + +Datum +spg_quad_leaf_consistent(PG_FUNCTION_ARGS) +{ + spgLeafConsistentIn *in = (spgLeafConsistentIn *) PG_GETARG_POINTER(0); + spgLeafConsistentOut *out = (spgLeafConsistentOut *) PG_GETARG_POINTER(1); + Point *datum = DatumGetPointP(in->leafDatum); + bool res; + int i; + + /* all tests are exact */ + out->recheck = false; + + /* leafDatum is what it is... */ + out->leafValue = in->leafDatum; + + /* Perform the required comparison(s) */ + res = true; + for (i = 0; i < in->nkeys; i++) + { + Point *query = DatumGetPointP(in->scankeys[i].sk_argument); + + switch (in->scankeys[i].sk_strategy) + { + case RTLeftStrategyNumber: + res = SPTEST(point_left, datum, query); + break; + case RTRightStrategyNumber: + res = SPTEST(point_right, datum, query); + break; + case RTSameStrategyNumber: + res = SPTEST(point_eq, datum, query); + break; + case RTBelowStrategyNumber: + case RTOldBelowStrategyNumber: + res = SPTEST(point_below, datum, query); + break; + case RTAboveStrategyNumber: + case RTOldAboveStrategyNumber: + res = SPTEST(point_above, datum, query); + break; + case RTContainedByStrategyNumber: + + /* + * For this operator, the query is a box not a point. We + * cheat to the extent of assuming that DatumGetPointP won't + * do anything that would be bad for a pointer-to-box. 
+ */ + res = SPTEST(box_contain_pt, query, datum); + break; + default: + elog(ERROR, "unrecognized strategy number: %d", + in->scankeys[i].sk_strategy); + break; + } + + if (!res) + break; + } + + if (res && in->norderbys > 0) + /* ok, it passes -> let's compute the distances */ + out->distances = spg_key_orderbys_distances(in->leafDatum, true, + in->orderbys, in->norderbys); + + PG_RETURN_BOOL(res); +} diff --git a/src/backend/access/spgist/spgscan.c b/src/backend/access/spgist/spgscan.c new file mode 100644 index 0000000..401a1a8 --- /dev/null +++ b/src/backend/access/spgist/spgscan.c @@ -0,0 +1,1097 @@ +/*------------------------------------------------------------------------- + * + * spgscan.c + * routines for scanning SP-GiST indexes + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/spgist/spgscan.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/genam.h" +#include "access/relscan.h" +#include "access/spgist_private.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/bufmgr.h" +#include "utils/datum.h" +#include "utils/float.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +typedef void (*storeRes_func) (SpGistScanOpaque so, ItemPointer heapPtr, + Datum leafValue, bool isNull, + SpGistLeafTuple leafTuple, bool recheck, + bool recheckDistances, double *distances); + +/* + * Pairing heap comparison function for the SpGistSearchItem queue. + * KNN-searches currently only support NULLS LAST. So, preserve this logic + * here. + */ +static int +pairingheap_SpGistSearchItem_cmp(const pairingheap_node *a, + const pairingheap_node *b, void *arg) +{ + const SpGistSearchItem *sa = (const SpGistSearchItem *) a; + const SpGistSearchItem *sb = (const SpGistSearchItem *) b; + SpGistScanOpaque so = (SpGistScanOpaque) arg; + int i; + + if (sa->isNull) + { + if (!sb->isNull) + return -1; + } + else if (sb->isNull) + { + return 1; + } + else + { + /* Order according to distance comparison */ + for (i = 0; i < so->numberOfNonNullOrderBys; i++) + { + if (isnan(sa->distances[i]) && isnan(sb->distances[i])) + continue; /* NaN == NaN */ + if (isnan(sa->distances[i])) + return -1; /* NaN > number */ + if (isnan(sb->distances[i])) + return 1; /* number < NaN */ + if (sa->distances[i] != sb->distances[i]) + return (sa->distances[i] < sb->distances[i]) ? 1 : -1; + } + } + + /* Leaf items go before inner pages, to ensure a depth-first search */ + if (sa->isLeaf && !sb->isLeaf) + return 1; + if (!sa->isLeaf && sb->isLeaf) + return -1; + + return 0; +} + +static void +spgFreeSearchItem(SpGistScanOpaque so, SpGistSearchItem *item) +{ + /* value is of type attType if isLeaf, else of type attLeafType */ + /* (no, that is not backwards; yes, it's confusing) */ + if (!(item->isLeaf ? 
so->state.attType.attbyval : + so->state.attLeafType.attbyval) && + DatumGetPointer(item->value) != NULL) + pfree(DatumGetPointer(item->value)); + + if (item->leafTuple) + pfree(item->leafTuple); + + if (item->traversalValue) + pfree(item->traversalValue); + + pfree(item); +} + +/* + * Add SpGistSearchItem to queue + * + * Called in queue context + */ +static void +spgAddSearchItemToQueue(SpGistScanOpaque so, SpGistSearchItem *item) +{ + pairingheap_add(so->scanQueue, &item->phNode); +} + +static SpGistSearchItem * +spgAllocSearchItem(SpGistScanOpaque so, bool isnull, double *distances) +{ + /* allocate distance array only for non-NULL items */ + SpGistSearchItem *item = + palloc(SizeOfSpGistSearchItem(isnull ? 0 : so->numberOfNonNullOrderBys)); + + item->isNull = isnull; + + if (!isnull && so->numberOfNonNullOrderBys > 0) + memcpy(item->distances, distances, + sizeof(item->distances[0]) * so->numberOfNonNullOrderBys); + + return item; +} + +static void +spgAddStartItem(SpGistScanOpaque so, bool isnull) +{ + SpGistSearchItem *startEntry = + spgAllocSearchItem(so, isnull, so->zeroDistances); + + ItemPointerSet(&startEntry->heapPtr, + isnull ? SPGIST_NULL_BLKNO : SPGIST_ROOT_BLKNO, + FirstOffsetNumber); + startEntry->isLeaf = false; + startEntry->level = 0; + startEntry->value = (Datum) 0; + startEntry->leafTuple = NULL; + startEntry->traversalValue = NULL; + startEntry->recheck = false; + startEntry->recheckDistances = false; + + spgAddSearchItemToQueue(so, startEntry); +} + +/* + * Initialize queue to search the root page, resetting + * any previously active scan + */ +static void +resetSpGistScanOpaque(SpGistScanOpaque so) +{ + MemoryContext oldCtx; + + MemoryContextReset(so->traversalCxt); + + oldCtx = MemoryContextSwitchTo(so->traversalCxt); + + /* initialize queue only for distance-ordered scans */ + so->scanQueue = pairingheap_allocate(pairingheap_SpGistSearchItem_cmp, so); + + if (so->searchNulls) + /* Add a work item to scan the null index entries */ + spgAddStartItem(so, true); + + if (so->searchNonNulls) + /* Add a work item to scan the non-null index entries */ + spgAddStartItem(so, false); + + MemoryContextSwitchTo(oldCtx); + + if (so->numberOfOrderBys > 0) + { + /* Must pfree distances to avoid memory leak */ + int i; + + for (i = 0; i < so->nPtrs; i++) + if (so->distances[i]) + pfree(so->distances[i]); + } + + if (so->want_itup) + { + /* Must pfree reconstructed tuples to avoid memory leak */ + int i; + + for (i = 0; i < so->nPtrs; i++) + pfree(so->reconTups[i]); + } + so->iPtr = so->nPtrs = 0; +} + +/* + * Prepare scan keys in SpGistScanOpaque from caller-given scan keys + * + * Sets searchNulls, searchNonNulls, numberOfKeys, keyData fields of *so. + * + * The point here is to eliminate null-related considerations from what the + * opclass consistent functions need to deal with. We assume all SPGiST- + * indexable operators are strict, so any null RHS value makes the scan + * condition unsatisfiable. We also pull out any IS NULL/IS NOT NULL + * conditions; their effect is reflected into searchNulls/searchNonNulls. + */ +static void +spgPrepareScanKeys(IndexScanDesc scan) +{ + SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque; + bool qual_ok; + bool haveIsNull; + bool haveNotNull; + int nkeys; + int i; + + so->numberOfOrderBys = scan->numberOfOrderBys; + so->orderByData = scan->orderByData; + + if (so->numberOfOrderBys <= 0) + so->numberOfNonNullOrderBys = 0; + else + { + int j = 0; + + /* + * Remove all NULL keys, but remember their offsets in the original + * array. 
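 * For instance, with three ordering keys of which the second is NULL, the
 * non-NULL keys are compacted into slots 0 and 1 of orderByData and
 * nonNullOrderByOffsets becomes {0, -1, 1}; storeGettuple() later uses
 * those offsets to put each distance back in its original position.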
+ */ + for (i = 0; i < scan->numberOfOrderBys; i++) + { + ScanKey skey = &so->orderByData[i]; + + if (skey->sk_flags & SK_ISNULL) + so->nonNullOrderByOffsets[i] = -1; + else + { + if (i != j) + so->orderByData[j] = *skey; + + so->nonNullOrderByOffsets[i] = j++; + } + } + + so->numberOfNonNullOrderBys = j; + } + + if (scan->numberOfKeys <= 0) + { + /* If no quals, whole-index scan is required */ + so->searchNulls = true; + so->searchNonNulls = true; + so->numberOfKeys = 0; + return; + } + + /* Examine the given quals */ + qual_ok = true; + haveIsNull = haveNotNull = false; + nkeys = 0; + for (i = 0; i < scan->numberOfKeys; i++) + { + ScanKey skey = &scan->keyData[i]; + + if (skey->sk_flags & SK_SEARCHNULL) + haveIsNull = true; + else if (skey->sk_flags & SK_SEARCHNOTNULL) + haveNotNull = true; + else if (skey->sk_flags & SK_ISNULL) + { + /* ordinary qual with null argument - unsatisfiable */ + qual_ok = false; + break; + } + else + { + /* ordinary qual, propagate into so->keyData */ + so->keyData[nkeys++] = *skey; + /* this effectively creates a not-null requirement */ + haveNotNull = true; + } + } + + /* IS NULL in combination with something else is unsatisfiable */ + if (haveIsNull && haveNotNull) + qual_ok = false; + + /* Emit results */ + if (qual_ok) + { + so->searchNulls = haveIsNull; + so->searchNonNulls = haveNotNull; + so->numberOfKeys = nkeys; + } + else + { + so->searchNulls = false; + so->searchNonNulls = false; + so->numberOfKeys = 0; + } +} + +IndexScanDesc +spgbeginscan(Relation rel, int keysz, int orderbysz) +{ + IndexScanDesc scan; + SpGistScanOpaque so; + int i; + + scan = RelationGetIndexScan(rel, keysz, orderbysz); + + so = (SpGistScanOpaque) palloc0(sizeof(SpGistScanOpaqueData)); + if (keysz > 0) + so->keyData = (ScanKey) palloc(sizeof(ScanKeyData) * keysz); + else + so->keyData = NULL; + initSpGistState(&so->state, scan->indexRelation); + + so->tempCxt = AllocSetContextCreate(CurrentMemoryContext, + "SP-GiST search temporary context", + ALLOCSET_DEFAULT_SIZES); + so->traversalCxt = AllocSetContextCreate(CurrentMemoryContext, + "SP-GiST traversal-value context", + ALLOCSET_DEFAULT_SIZES); + + /* + * Set up reconTupDesc and xs_hitupdesc in case it's an index-only scan, + * making sure that the key column is shown as being of type attType. + * (It's rather annoying to do this work when it might be wasted, but for + * most opclasses we can re-use the index reldesc instead of making one.) 
+ */ + so->reconTupDesc = scan->xs_hitupdesc = + getSpGistTupleDesc(rel, &so->state.attType); + + /* Allocate various arrays needed for order-by scans */ + if (scan->numberOfOrderBys > 0) + { + /* This will be filled in spgrescan, but allocate the space here */ + so->orderByTypes = (Oid *) + palloc(sizeof(Oid) * scan->numberOfOrderBys); + so->nonNullOrderByOffsets = (int *) + palloc(sizeof(int) * scan->numberOfOrderBys); + + /* These arrays have constant contents, so we can fill them now */ + so->zeroDistances = (double *) + palloc(sizeof(double) * scan->numberOfOrderBys); + so->infDistances = (double *) + palloc(sizeof(double) * scan->numberOfOrderBys); + + for (i = 0; i < scan->numberOfOrderBys; i++) + { + so->zeroDistances[i] = 0.0; + so->infDistances[i] = get_float8_infinity(); + } + + scan->xs_orderbyvals = (Datum *) + palloc0(sizeof(Datum) * scan->numberOfOrderBys); + scan->xs_orderbynulls = (bool *) + palloc(sizeof(bool) * scan->numberOfOrderBys); + memset(scan->xs_orderbynulls, true, + sizeof(bool) * scan->numberOfOrderBys); + } + + fmgr_info_copy(&so->innerConsistentFn, + index_getprocinfo(rel, 1, SPGIST_INNER_CONSISTENT_PROC), + CurrentMemoryContext); + + fmgr_info_copy(&so->leafConsistentFn, + index_getprocinfo(rel, 1, SPGIST_LEAF_CONSISTENT_PROC), + CurrentMemoryContext); + + so->indexCollation = rel->rd_indcollation[0]; + + scan->opaque = so; + + return scan; +} + +void +spgrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, + ScanKey orderbys, int norderbys) +{ + SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque; + + /* copy scankeys into local storage */ + if (scankey && scan->numberOfKeys > 0) + memmove(scan->keyData, scankey, + scan->numberOfKeys * sizeof(ScanKeyData)); + + /* initialize order-by data if needed */ + if (orderbys && scan->numberOfOrderBys > 0) + { + int i; + + memmove(scan->orderByData, orderbys, + scan->numberOfOrderBys * sizeof(ScanKeyData)); + + for (i = 0; i < scan->numberOfOrderBys; i++) + { + ScanKey skey = &scan->orderByData[i]; + + /* + * Look up the datatype returned by the original ordering + * operator. SP-GiST always uses a float8 for the distance + * function, but the ordering operator could be anything else. + * + * XXX: The distance function is only allowed to be lossy if the + * ordering operator's result type is float4 or float8. Otherwise + * we don't know how to return the distance to the executor. But + * we cannot check that here, as we won't know if the distance + * function is lossy until it returns *recheck = true for the + * first time. 
+ */ + so->orderByTypes[i] = get_func_rettype(skey->sk_func.fn_oid); + } + } + + /* preprocess scankeys, set up the representation in *so */ + spgPrepareScanKeys(scan); + + /* set up starting queue entries */ + resetSpGistScanOpaque(so); + + /* count an indexscan for stats */ + pgstat_count_index_scan(scan->indexRelation); +} + +void +spgendscan(IndexScanDesc scan) +{ + SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque; + + MemoryContextDelete(so->tempCxt); + MemoryContextDelete(so->traversalCxt); + + if (so->keyData) + pfree(so->keyData); + + if (so->state.leafTupDesc && + so->state.leafTupDesc != RelationGetDescr(so->state.index)) + FreeTupleDesc(so->state.leafTupDesc); + + if (so->state.deadTupleStorage) + pfree(so->state.deadTupleStorage); + + if (scan->numberOfOrderBys > 0) + { + pfree(so->orderByTypes); + pfree(so->nonNullOrderByOffsets); + pfree(so->zeroDistances); + pfree(so->infDistances); + pfree(scan->xs_orderbyvals); + pfree(scan->xs_orderbynulls); + } + + pfree(so); +} + +/* + * Leaf SpGistSearchItem constructor, called in queue context + */ +static SpGistSearchItem * +spgNewHeapItem(SpGistScanOpaque so, int level, SpGistLeafTuple leafTuple, + Datum leafValue, bool recheck, bool recheckDistances, + bool isnull, double *distances) +{ + SpGistSearchItem *item = spgAllocSearchItem(so, isnull, distances); + + item->level = level; + item->heapPtr = leafTuple->heapPtr; + + /* + * If we need the reconstructed value, copy it to queue cxt out of tmp + * cxt. Caution: the leaf_consistent method may not have supplied a value + * if we didn't ask it to, and mildly-broken methods might supply one of + * the wrong type. The correct leafValue type is attType not leafType. + */ + if (so->want_itup) + { + item->value = isnull ? (Datum) 0 : + datumCopy(leafValue, so->state.attType.attbyval, + so->state.attType.attlen); + + /* + * If we're going to need to reconstruct INCLUDE attributes, store the + * whole leaf tuple so we can get the INCLUDE attributes out of it. 
+ */ + if (so->state.leafTupDesc->natts > 1) + { + item->leafTuple = palloc(leafTuple->size); + memcpy(item->leafTuple, leafTuple, leafTuple->size); + } + else + item->leafTuple = NULL; + } + else + { + item->value = (Datum) 0; + item->leafTuple = NULL; + } + item->traversalValue = NULL; + item->isLeaf = true; + item->recheck = recheck; + item->recheckDistances = recheckDistances; + + return item; +} + +/* + * Test whether a leaf tuple satisfies all the scan keys + * + * *reportedSome is set to true if: + * the scan is not ordered AND the item satisfies the scankeys + */ +static bool +spgLeafTest(SpGistScanOpaque so, SpGistSearchItem *item, + SpGistLeafTuple leafTuple, bool isnull, + bool *reportedSome, storeRes_func storeRes) +{ + Datum leafValue; + double *distances; + bool result; + bool recheck; + bool recheckDistances; + + if (isnull) + { + /* Should not have arrived on a nulls page unless nulls are wanted */ + Assert(so->searchNulls); + leafValue = (Datum) 0; + distances = NULL; + recheck = false; + recheckDistances = false; + result = true; + } + else + { + spgLeafConsistentIn in; + spgLeafConsistentOut out; + + /* use temp context for calling leaf_consistent */ + MemoryContext oldCxt = MemoryContextSwitchTo(so->tempCxt); + + in.scankeys = so->keyData; + in.nkeys = so->numberOfKeys; + in.orderbys = so->orderByData; + in.norderbys = so->numberOfNonNullOrderBys; + Assert(!item->isLeaf); /* else reconstructedValue would be wrong type */ + in.reconstructedValue = item->value; + in.traversalValue = item->traversalValue; + in.level = item->level; + in.returnData = so->want_itup; + in.leafDatum = SGLTDATUM(leafTuple, &so->state); + + out.leafValue = (Datum) 0; + out.recheck = false; + out.distances = NULL; + out.recheckDistances = false; + + result = DatumGetBool(FunctionCall2Coll(&so->leafConsistentFn, + so->indexCollation, + PointerGetDatum(&in), + PointerGetDatum(&out))); + recheck = out.recheck; + recheckDistances = out.recheckDistances; + leafValue = out.leafValue; + distances = out.distances; + + MemoryContextSwitchTo(oldCxt); + } + + if (result) + { + /* item passes the scankeys */ + if (so->numberOfNonNullOrderBys > 0) + { + /* the scan is ordered -> add the item to the queue */ + MemoryContext oldCxt = MemoryContextSwitchTo(so->traversalCxt); + SpGistSearchItem *heapItem = spgNewHeapItem(so, item->level, + leafTuple, + leafValue, + recheck, + recheckDistances, + isnull, + distances); + + spgAddSearchItemToQueue(so, heapItem); + + MemoryContextSwitchTo(oldCxt); + } + else + { + /* non-ordered scan, so report the item right away */ + Assert(!recheckDistances); + storeRes(so, &leafTuple->heapPtr, leafValue, isnull, + leafTuple, recheck, false, NULL); + *reportedSome = true; + } + } + + return result; +} + +/* A bundle initializer for inner_consistent methods */ +static void +spgInitInnerConsistentIn(spgInnerConsistentIn *in, + SpGistScanOpaque so, + SpGistSearchItem *item, + SpGistInnerTuple innerTuple) +{ + in->scankeys = so->keyData; + in->orderbys = so->orderByData; + in->nkeys = so->numberOfKeys; + in->norderbys = so->numberOfNonNullOrderBys; + Assert(!item->isLeaf); /* else reconstructedValue would be wrong type */ + in->reconstructedValue = item->value; + in->traversalMemoryContext = so->traversalCxt; + in->traversalValue = item->traversalValue; + in->level = item->level; + in->returnData = so->want_itup; + in->allTheSame = innerTuple->allTheSame; + in->hasPrefix = (innerTuple->prefixSize > 0); + in->prefixDatum = SGITDATUM(innerTuple, &so->state); + in->nNodes = 
innerTuple->nNodes; + in->nodeLabels = spgExtractNodeLabels(&so->state, innerTuple); +} + +static SpGistSearchItem * +spgMakeInnerItem(SpGistScanOpaque so, + SpGistSearchItem *parentItem, + SpGistNodeTuple tuple, + spgInnerConsistentOut *out, int i, bool isnull, + double *distances) +{ + SpGistSearchItem *item = spgAllocSearchItem(so, isnull, distances); + + item->heapPtr = tuple->t_tid; + item->level = out->levelAdds ? parentItem->level + out->levelAdds[i] + : parentItem->level; + + /* Must copy value out of temp context */ + /* (recall that reconstructed values are of type leafType) */ + item->value = out->reconstructedValues + ? datumCopy(out->reconstructedValues[i], + so->state.attLeafType.attbyval, + so->state.attLeafType.attlen) + : (Datum) 0; + + item->leafTuple = NULL; + + /* + * Elements of out.traversalValues should be allocated in + * in.traversalMemoryContext, which is actually a long lived context of + * index scan. + */ + item->traversalValue = + out->traversalValues ? out->traversalValues[i] : NULL; + + item->isLeaf = false; + item->recheck = false; + item->recheckDistances = false; + + return item; +} + +static void +spgInnerTest(SpGistScanOpaque so, SpGistSearchItem *item, + SpGistInnerTuple innerTuple, bool isnull) +{ + MemoryContext oldCxt = MemoryContextSwitchTo(so->tempCxt); + spgInnerConsistentOut out; + int nNodes = innerTuple->nNodes; + int i; + + memset(&out, 0, sizeof(out)); + + if (!isnull) + { + spgInnerConsistentIn in; + + spgInitInnerConsistentIn(&in, so, item, innerTuple); + + /* use user-defined inner consistent method */ + FunctionCall2Coll(&so->innerConsistentFn, + so->indexCollation, + PointerGetDatum(&in), + PointerGetDatum(&out)); + } + else + { + /* force all children to be visited */ + out.nNodes = nNodes; + out.nodeNumbers = (int *) palloc(sizeof(int) * nNodes); + for (i = 0; i < nNodes; i++) + out.nodeNumbers[i] = i; + } + + /* If allTheSame, they should all or none of them match */ + if (innerTuple->allTheSame && out.nNodes != 0 && out.nNodes != nNodes) + elog(ERROR, "inconsistent inner_consistent results for allTheSame inner tuple"); + + if (out.nNodes) + { + /* collect node pointers */ + SpGistNodeTuple node; + SpGistNodeTuple *nodes = (SpGistNodeTuple *) palloc(sizeof(SpGistNodeTuple) * nNodes); + + SGITITERATE(innerTuple, i, node) + { + nodes[i] = node; + } + + MemoryContextSwitchTo(so->traversalCxt); + + for (i = 0; i < out.nNodes; i++) + { + int nodeN = out.nodeNumbers[i]; + SpGistSearchItem *innerItem; + double *distances; + + Assert(nodeN >= 0 && nodeN < nNodes); + + node = nodes[nodeN]; + + if (!ItemPointerIsValid(&node->t_tid)) + continue; + + /* + * Use infinity distances if innerConsistentFn() failed to return + * them or if is a NULL item (their distances are really unused). + */ + distances = out.distances ? 
out.distances[i] : so->infDistances; + + innerItem = spgMakeInnerItem(so, item, node, &out, i, isnull, + distances); + + spgAddSearchItemToQueue(so, innerItem); + } + } + + MemoryContextSwitchTo(oldCxt); +} + +/* Returns a next item in an (ordered) scan or null if the index is exhausted */ +static SpGistSearchItem * +spgGetNextQueueItem(SpGistScanOpaque so) +{ + if (pairingheap_is_empty(so->scanQueue)) + return NULL; /* Done when both heaps are empty */ + + /* Return item; caller is responsible to pfree it */ + return (SpGistSearchItem *) pairingheap_remove_first(so->scanQueue); +} + +enum SpGistSpecialOffsetNumbers +{ + SpGistBreakOffsetNumber = InvalidOffsetNumber, + SpGistRedirectOffsetNumber = MaxOffsetNumber + 1, + SpGistErrorOffsetNumber = MaxOffsetNumber + 2 +}; + +static OffsetNumber +spgTestLeafTuple(SpGistScanOpaque so, + SpGistSearchItem *item, + Page page, OffsetNumber offset, + bool isnull, bool isroot, + bool *reportedSome, + storeRes_func storeRes) +{ + SpGistLeafTuple leafTuple = (SpGistLeafTuple) + PageGetItem(page, PageGetItemId(page, offset)); + + if (leafTuple->tupstate != SPGIST_LIVE) + { + if (!isroot) /* all tuples on root should be live */ + { + if (leafTuple->tupstate == SPGIST_REDIRECT) + { + /* redirection tuple should be first in chain */ + Assert(offset == ItemPointerGetOffsetNumber(&item->heapPtr)); + /* transfer attention to redirect point */ + item->heapPtr = ((SpGistDeadTuple) leafTuple)->pointer; + Assert(ItemPointerGetBlockNumber(&item->heapPtr) != SPGIST_METAPAGE_BLKNO); + return SpGistRedirectOffsetNumber; + } + + if (leafTuple->tupstate == SPGIST_DEAD) + { + /* dead tuple should be first in chain */ + Assert(offset == ItemPointerGetOffsetNumber(&item->heapPtr)); + /* No live entries on this page */ + Assert(SGLT_GET_NEXTOFFSET(leafTuple) == InvalidOffsetNumber); + return SpGistBreakOffsetNumber; + } + } + + /* We should not arrive at a placeholder */ + elog(ERROR, "unexpected SPGiST tuple state: %d", leafTuple->tupstate); + return SpGistErrorOffsetNumber; + } + + Assert(ItemPointerIsValid(&leafTuple->heapPtr)); + + spgLeafTest(so, item, leafTuple, isnull, reportedSome, storeRes); + + return SGLT_GET_NEXTOFFSET(leafTuple); +} + +/* + * Walk the tree and report all tuples passing the scan quals to the storeRes + * subroutine. + * + * If scanWholeIndex is true, we'll do just that. If not, we'll stop at the + * next page boundary once we have reported at least one tuple. 
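 * For ordered scans, the pairing heap (see pairingheap_SpGistSearchItem_cmp
 * above) hands back the closest pending item each time: for example, a leaf
 * item queued at distance 1.5 is returned before an inner item at distance
 * 2.0, and on ties the leaf item wins, so a known-closest tuple is reported
 * before we descend any further.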
+ */ +static void +spgWalk(Relation index, SpGistScanOpaque so, bool scanWholeIndex, + storeRes_func storeRes, Snapshot snapshot) +{ + Buffer buffer = InvalidBuffer; + bool reportedSome = false; + + while (scanWholeIndex || !reportedSome) + { + SpGistSearchItem *item = spgGetNextQueueItem(so); + + if (item == NULL) + break; /* No more items in queue -> done */ + +redirect: + /* Check for interrupts, just in case of infinite loop */ + CHECK_FOR_INTERRUPTS(); + + if (item->isLeaf) + { + /* We store heap items in the queue only in case of ordered search */ + Assert(so->numberOfNonNullOrderBys > 0); + storeRes(so, &item->heapPtr, item->value, item->isNull, + item->leafTuple, item->recheck, + item->recheckDistances, item->distances); + reportedSome = true; + } + else + { + BlockNumber blkno = ItemPointerGetBlockNumber(&item->heapPtr); + OffsetNumber offset = ItemPointerGetOffsetNumber(&item->heapPtr); + Page page; + bool isnull; + + if (buffer == InvalidBuffer) + { + buffer = ReadBuffer(index, blkno); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + } + else if (blkno != BufferGetBlockNumber(buffer)) + { + UnlockReleaseBuffer(buffer); + buffer = ReadBuffer(index, blkno); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + } + + /* else new pointer points to the same page, no work needed */ + + page = BufferGetPage(buffer); + TestForOldSnapshot(snapshot, index, page); + + isnull = SpGistPageStoresNulls(page) ? true : false; + + if (SpGistPageIsLeaf(page)) + { + /* Page is a leaf - that is, all it's tuples are heap items */ + OffsetNumber max = PageGetMaxOffsetNumber(page); + + if (SpGistBlockIsRoot(blkno)) + { + /* When root is a leaf, examine all its tuples */ + for (offset = FirstOffsetNumber; offset <= max; offset++) + (void) spgTestLeafTuple(so, item, page, offset, + isnull, true, + &reportedSome, storeRes); + } + else + { + /* Normal case: just examine the chain we arrived at */ + while (offset != InvalidOffsetNumber) + { + Assert(offset >= FirstOffsetNumber && offset <= max); + offset = spgTestLeafTuple(so, item, page, offset, + isnull, false, + &reportedSome, storeRes); + if (offset == SpGistRedirectOffsetNumber) + goto redirect; + } + } + } + else /* page is inner */ + { + SpGistInnerTuple innerTuple = (SpGistInnerTuple) + PageGetItem(page, PageGetItemId(page, offset)); + + if (innerTuple->tupstate != SPGIST_LIVE) + { + if (innerTuple->tupstate == SPGIST_REDIRECT) + { + /* transfer attention to redirect point */ + item->heapPtr = ((SpGistDeadTuple) innerTuple)->pointer; + Assert(ItemPointerGetBlockNumber(&item->heapPtr) != + SPGIST_METAPAGE_BLKNO); + goto redirect; + } + elog(ERROR, "unexpected SPGiST tuple state: %d", + innerTuple->tupstate); + } + + spgInnerTest(so, item, innerTuple, isnull); + } + } + + /* done with this scan item */ + spgFreeSearchItem(so, item); + /* clear temp context before proceeding to the next one */ + MemoryContextReset(so->tempCxt); + } + + if (buffer != InvalidBuffer) + UnlockReleaseBuffer(buffer); +} + + +/* storeRes subroutine for getbitmap case */ +static void +storeBitmap(SpGistScanOpaque so, ItemPointer heapPtr, + Datum leafValue, bool isnull, + SpGistLeafTuple leafTuple, bool recheck, + bool recheckDistances, double *distances) +{ + Assert(!recheckDistances && !distances); + tbm_add_tuples(so->tbm, heapPtr, 1, recheck); + so->ntids++; +} + +int64 +spggetbitmap(IndexScanDesc scan, TIDBitmap *tbm) +{ + SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque; + + /* Copy want_itup to *so so we don't need to pass it around separately */ + so->want_itup = false; + + so->tbm 
= tbm; + so->ntids = 0; + + spgWalk(scan->indexRelation, so, true, storeBitmap, scan->xs_snapshot); + + return so->ntids; +} + +/* storeRes subroutine for gettuple case */ +static void +storeGettuple(SpGistScanOpaque so, ItemPointer heapPtr, + Datum leafValue, bool isnull, + SpGistLeafTuple leafTuple, bool recheck, + bool recheckDistances, double *nonNullDistances) +{ + Assert(so->nPtrs < MaxIndexTuplesPerPage); + so->heapPtrs[so->nPtrs] = *heapPtr; + so->recheck[so->nPtrs] = recheck; + so->recheckDistances[so->nPtrs] = recheckDistances; + + if (so->numberOfOrderBys > 0) + { + if (isnull || so->numberOfNonNullOrderBys <= 0) + so->distances[so->nPtrs] = NULL; + else + { + IndexOrderByDistance *distances = + palloc(sizeof(distances[0]) * so->numberOfOrderBys); + int i; + + for (i = 0; i < so->numberOfOrderBys; i++) + { + int offset = so->nonNullOrderByOffsets[i]; + + if (offset >= 0) + { + /* Copy non-NULL distance value */ + distances[i].value = nonNullDistances[offset]; + distances[i].isnull = false; + } + else + { + /* Set distance's NULL flag. */ + distances[i].value = 0.0; + distances[i].isnull = true; + } + } + + so->distances[so->nPtrs] = distances; + } + } + + if (so->want_itup) + { + /* + * Reconstruct index data. We have to copy the datum out of the temp + * context anyway, so we may as well create the tuple here. + */ + Datum leafDatums[INDEX_MAX_KEYS]; + bool leafIsnulls[INDEX_MAX_KEYS]; + + /* We only need to deform the old tuple if it has INCLUDE attributes */ + if (so->state.leafTupDesc->natts > 1) + spgDeformLeafTuple(leafTuple, so->state.leafTupDesc, + leafDatums, leafIsnulls, isnull); + + leafDatums[spgKeyColumn] = leafValue; + leafIsnulls[spgKeyColumn] = isnull; + + so->reconTups[so->nPtrs] = heap_form_tuple(so->reconTupDesc, + leafDatums, + leafIsnulls); + } + so->nPtrs++; +} + +bool +spggettuple(IndexScanDesc scan, ScanDirection dir) +{ + SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque; + + if (dir != ForwardScanDirection) + elog(ERROR, "SP-GiST only supports forward scan direction"); + + /* Copy want_itup to *so so we don't need to pass it around separately */ + so->want_itup = scan->xs_want_itup; + + for (;;) + { + if (so->iPtr < so->nPtrs) + { + /* continuing to return reported tuples */ + scan->xs_heaptid = so->heapPtrs[so->iPtr]; + scan->xs_recheck = so->recheck[so->iPtr]; + scan->xs_hitup = so->reconTups[so->iPtr]; + + if (so->numberOfOrderBys > 0) + index_store_float8_orderby_distances(scan, so->orderByTypes, + so->distances[so->iPtr], + so->recheckDistances[so->iPtr]); + so->iPtr++; + return true; + } + + if (so->numberOfOrderBys > 0) + { + /* Must pfree distances to avoid memory leak */ + int i; + + for (i = 0; i < so->nPtrs; i++) + if (so->distances[i]) + pfree(so->distances[i]); + } + + if (so->want_itup) + { + /* Must pfree reconstructed tuples to avoid memory leak */ + int i; + + for (i = 0; i < so->nPtrs; i++) + pfree(so->reconTups[i]); + } + so->iPtr = so->nPtrs = 0; + + spgWalk(scan->indexRelation, so, false, storeGettuple, + scan->xs_snapshot); + + if (so->nPtrs == 0) + break; /* must have completed scan */ + } + + return false; +} + +bool +spgcanreturn(Relation index, int attno) +{ + SpGistCache *cache; + + /* INCLUDE attributes can always be fetched for index-only scans */ + if (attno > 1) + return true; + + /* We can do it if the opclass config function says so */ + cache = spgGetCache(index); + + return cache->config.canReturnData; +} diff --git a/src/backend/access/spgist/spgtextproc.c b/src/backend/access/spgist/spgtextproc.c new file mode 
100644 index 0000000..f340555 --- /dev/null +++ b/src/backend/access/spgist/spgtextproc.c @@ -0,0 +1,699 @@ +/*------------------------------------------------------------------------- + * + * spgtextproc.c + * implementation of radix tree (compressed trie) over text + * + * In a text_ops SPGiST index, inner tuples can have a prefix which is the + * common prefix of all strings indexed under that tuple. The node labels + * represent the next byte of the string(s) after the prefix. Assuming we + * always use the longest possible prefix, we will get more than one node + * label unless the prefix length is restricted by SPGIST_MAX_PREFIX_LENGTH. + * + * To reconstruct the indexed string for any index entry, concatenate the + * inner-tuple prefixes and node labels starting at the root and working + * down to the leaf entry, then append the datum in the leaf entry. + * (While descending the tree, "level" is the number of bytes reconstructed + * so far.) + * + * However, there are two special cases for node labels: -1 indicates that + * there are no more bytes after the prefix-so-far, and -2 indicates that we + * had to split an existing allTheSame tuple (in such a case we have to create + * a node label that doesn't correspond to any string byte). In either case, + * the node label does not contribute anything to the reconstructed string. + * + * Previously, we used a node label of zero for both special cases, but + * this was problematic because one can't tell whether a string ending at + * the current level can be pushed down into such a child node. For + * backwards compatibility, we still support such node labels for reading; + * but no new entries will ever be pushed down into a zero-labeled child. + * No new entries ever get pushed into a -2-labeled child, either. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/spgist/spgtextproc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/spgist.h" +#include "catalog/pg_type.h" +#include "mb/pg_wchar.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/pg_locale.h" +#include "utils/varlena.h" + + +/* + * In the worst case, an inner tuple in a text radix tree could have as many + * as 258 nodes (one for each possible byte value, plus the two special + * cases). Each node can take 16 bytes on MAXALIGN=8 machines. The inner + * tuple must fit on an index page of size BLCKSZ. Rather than assuming we + * know the exact amount of overhead imposed by page headers, tuple headers, + * etc, we leave 100 bytes for that (the actual overhead should be no more + * than 56 bytes at this writing, so there is slop in this number). + * So we can safely create prefixes up to BLCKSZ - 258 * 16 - 100 bytes long. + * Unfortunately, because 258 * 16 is over 4K, there is no safe prefix length + * when BLCKSZ is less than 8K; it is always possible to get "SPGiST inner + * tuple size exceeds maximum" if there are too many distinct next-byte values + * at a given place in the tree. Since use of nonstandard block sizes appears + * to be negligible in the field, we just live with that fact for now, + * choosing a max prefix size of 32 bytes when BLCKSZ is configured smaller + * than default. 
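 * With the default BLCKSZ of 8192 this works out to 8192 - 4128 - 100 =
 * 3964 bytes of maximum prefix; with BLCKSZ = 4096 the same expression is
 * negative, so the Max() below falls back to the 32-byte limit.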
+ */ +#define SPGIST_MAX_PREFIX_LENGTH Max((int) (BLCKSZ - 258 * 16 - 100), 32) + +/* + * Strategy for collation aware operator on text is equal to btree strategy + * plus value of 10. + * + * Current collation aware strategies and their corresponding btree strategies: + * 11 BTLessStrategyNumber + * 12 BTLessEqualStrategyNumber + * 14 BTGreaterEqualStrategyNumber + * 15 BTGreaterStrategyNumber + */ +#define SPG_STRATEGY_ADDITION (10) +#define SPG_IS_COLLATION_AWARE_STRATEGY(s) ((s) > SPG_STRATEGY_ADDITION \ + && (s) != RTPrefixStrategyNumber) + +/* Struct for sorting values in picksplit */ +typedef struct spgNodePtr +{ + Datum d; + int i; + int16 c; +} spgNodePtr; + + +Datum +spg_text_config(PG_FUNCTION_ARGS) +{ + /* spgConfigIn *cfgin = (spgConfigIn *) PG_GETARG_POINTER(0); */ + spgConfigOut *cfg = (spgConfigOut *) PG_GETARG_POINTER(1); + + cfg->prefixType = TEXTOID; + cfg->labelType = INT2OID; + cfg->canReturnData = true; + cfg->longValuesOK = true; /* suffixing will shorten long values */ + PG_RETURN_VOID(); +} + +/* + * Form a text datum from the given not-necessarily-null-terminated string, + * using short varlena header format if possible + */ +static Datum +formTextDatum(const char *data, int datalen) +{ + char *p; + + p = (char *) palloc(datalen + VARHDRSZ); + + if (datalen + VARHDRSZ_SHORT <= VARATT_SHORT_MAX) + { + SET_VARSIZE_SHORT(p, datalen + VARHDRSZ_SHORT); + if (datalen) + memcpy(p + VARHDRSZ_SHORT, data, datalen); + } + else + { + SET_VARSIZE(p, datalen + VARHDRSZ); + memcpy(p + VARHDRSZ, data, datalen); + } + + return PointerGetDatum(p); +} + +/* + * Find the length of the common prefix of a and b + */ +static int +commonPrefix(const char *a, const char *b, int lena, int lenb) +{ + int i = 0; + + while (i < lena && i < lenb && *a == *b) + { + a++; + b++; + i++; + } + + return i; +} + +/* + * Binary search an array of int16 datums for a match to c + * + * On success, *i gets the match location; on failure, it gets where to insert + */ +static bool +searchChar(Datum *nodeLabels, int nNodes, int16 c, int *i) +{ + int StopLow = 0, + StopHigh = nNodes; + + while (StopLow < StopHigh) + { + int StopMiddle = (StopLow + StopHigh) >> 1; + int16 middle = DatumGetInt16(nodeLabels[StopMiddle]); + + if (c < middle) + StopHigh = StopMiddle; + else if (c > middle) + StopLow = StopMiddle + 1; + else + { + *i = StopMiddle; + return true; + } + } + + *i = StopHigh; + return false; +} + +Datum +spg_text_choose(PG_FUNCTION_ARGS) +{ + spgChooseIn *in = (spgChooseIn *) PG_GETARG_POINTER(0); + spgChooseOut *out = (spgChooseOut *) PG_GETARG_POINTER(1); + text *inText = DatumGetTextPP(in->datum); + char *inStr = VARDATA_ANY(inText); + int inSize = VARSIZE_ANY_EXHDR(inText); + char *prefixStr = NULL; + int prefixSize = 0; + int commonLen = 0; + int16 nodeChar = 0; + int i = 0; + + /* Check for prefix match, set nodeChar to first byte after prefix */ + if (in->hasPrefix) + { + text *prefixText = DatumGetTextPP(in->prefixDatum); + + prefixStr = VARDATA_ANY(prefixText); + prefixSize = VARSIZE_ANY_EXHDR(prefixText); + + commonLen = commonPrefix(inStr + in->level, + prefixStr, + inSize - in->level, + prefixSize); + + if (commonLen == prefixSize) + { + if (inSize - in->level > commonLen) + nodeChar = *(unsigned char *) (inStr + in->level + commonLen); + else + nodeChar = -1; + } + else + { + /* Must split tuple because incoming value doesn't match prefix */ + out->resultType = spgSplitTuple; + + if (commonLen == 0) + { + out->result.splitTuple.prefixHasPrefix = false; + } + else + { + 
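/*
 * Example of this split: if the stored prefix is "abcdef" and the
 * incoming value continues with "abcx...", then commonLen is 3, the
 * new upper tuple keeps prefix "abc" with a single node labeled 'd',
 * and the original tuple becomes the lower tuple under that node with
 * its prefix shortened to "ef".
 */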
out->result.splitTuple.prefixHasPrefix = true; + out->result.splitTuple.prefixPrefixDatum = + formTextDatum(prefixStr, commonLen); + } + out->result.splitTuple.prefixNNodes = 1; + out->result.splitTuple.prefixNodeLabels = + (Datum *) palloc(sizeof(Datum)); + out->result.splitTuple.prefixNodeLabels[0] = + Int16GetDatum(*(unsigned char *) (prefixStr + commonLen)); + + out->result.splitTuple.childNodeN = 0; + + if (prefixSize - commonLen == 1) + { + out->result.splitTuple.postfixHasPrefix = false; + } + else + { + out->result.splitTuple.postfixHasPrefix = true; + out->result.splitTuple.postfixPrefixDatum = + formTextDatum(prefixStr + commonLen + 1, + prefixSize - commonLen - 1); + } + + PG_RETURN_VOID(); + } + } + else if (inSize > in->level) + { + nodeChar = *(unsigned char *) (inStr + in->level); + } + else + { + nodeChar = -1; + } + + /* Look up nodeChar in the node label array */ + if (searchChar(in->nodeLabels, in->nNodes, nodeChar, &i)) + { + /* + * Descend to existing node. (If in->allTheSame, the core code will + * ignore our nodeN specification here, but that's OK. We still have + * to provide the correct levelAdd and restDatum values, and those are + * the same regardless of which node gets chosen by core.) + */ + int levelAdd; + + out->resultType = spgMatchNode; + out->result.matchNode.nodeN = i; + levelAdd = commonLen; + if (nodeChar >= 0) + levelAdd++; + out->result.matchNode.levelAdd = levelAdd; + if (inSize - in->level - levelAdd > 0) + out->result.matchNode.restDatum = + formTextDatum(inStr + in->level + levelAdd, + inSize - in->level - levelAdd); + else + out->result.matchNode.restDatum = + formTextDatum(NULL, 0); + } + else if (in->allTheSame) + { + /* + * Can't use AddNode action, so split the tuple. The upper tuple has + * the same prefix as before and uses a dummy node label -2 for the + * lower tuple. The lower tuple has no prefix and the same node + * labels as the original tuple. + * + * Note: it might seem tempting to shorten the upper tuple's prefix, + * if it has one, then use its last byte as label for the lower tuple. + * But that doesn't win since we know the incoming value matches the + * whole prefix: we'd just end up splitting the lower tuple again. 
+ */ + out->resultType = spgSplitTuple; + out->result.splitTuple.prefixHasPrefix = in->hasPrefix; + out->result.splitTuple.prefixPrefixDatum = in->prefixDatum; + out->result.splitTuple.prefixNNodes = 1; + out->result.splitTuple.prefixNodeLabels = (Datum *) palloc(sizeof(Datum)); + out->result.splitTuple.prefixNodeLabels[0] = Int16GetDatum(-2); + out->result.splitTuple.childNodeN = 0; + out->result.splitTuple.postfixHasPrefix = false; + } + else + { + /* Add a node for the not-previously-seen nodeChar value */ + out->resultType = spgAddNode; + out->result.addNode.nodeLabel = Int16GetDatum(nodeChar); + out->result.addNode.nodeN = i; + } + + PG_RETURN_VOID(); +} + +/* qsort comparator to sort spgNodePtr structs by "c" */ +static int +cmpNodePtr(const void *a, const void *b) +{ + const spgNodePtr *aa = (const spgNodePtr *) a; + const spgNodePtr *bb = (const spgNodePtr *) b; + + return aa->c - bb->c; +} + +Datum +spg_text_picksplit(PG_FUNCTION_ARGS) +{ + spgPickSplitIn *in = (spgPickSplitIn *) PG_GETARG_POINTER(0); + spgPickSplitOut *out = (spgPickSplitOut *) PG_GETARG_POINTER(1); + text *text0 = DatumGetTextPP(in->datums[0]); + int i, + commonLen; + spgNodePtr *nodes; + + /* Identify longest common prefix, if any */ + commonLen = VARSIZE_ANY_EXHDR(text0); + for (i = 1; i < in->nTuples && commonLen > 0; i++) + { + text *texti = DatumGetTextPP(in->datums[i]); + int tmp = commonPrefix(VARDATA_ANY(text0), + VARDATA_ANY(texti), + VARSIZE_ANY_EXHDR(text0), + VARSIZE_ANY_EXHDR(texti)); + + if (tmp < commonLen) + commonLen = tmp; + } + + /* + * Limit the prefix length, if necessary, to ensure that the resulting + * inner tuple will fit on a page. + */ + commonLen = Min(commonLen, SPGIST_MAX_PREFIX_LENGTH); + + /* Set node prefix to be that string, if it's not empty */ + if (commonLen == 0) + { + out->hasPrefix = false; + } + else + { + out->hasPrefix = true; + out->prefixDatum = formTextDatum(VARDATA_ANY(text0), commonLen); + } + + /* Extract the node label (first non-common byte) from each value */ + nodes = (spgNodePtr *) palloc(sizeof(spgNodePtr) * in->nTuples); + + for (i = 0; i < in->nTuples; i++) + { + text *texti = DatumGetTextPP(in->datums[i]); + + if (commonLen < VARSIZE_ANY_EXHDR(texti)) + nodes[i].c = *(unsigned char *) (VARDATA_ANY(texti) + commonLen); + else + nodes[i].c = -1; /* use -1 if string is all common */ + nodes[i].i = i; + nodes[i].d = in->datums[i]; + } + + /* + * Sort by label values so that we can group the values into nodes. This + * also ensures that the nodes are ordered by label value, allowing the + * use of binary search in searchChar. 
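 * For instance, for the values "cat", "car" and "cow" the common prefix is
 * "c", the extracted labels are 'a', 'a' and 'o', and after sorting we end
 * up with two nodes: 'a' holding the leaf datums "t" and "r", and 'o'
 * holding "w".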
+ */ + qsort(nodes, in->nTuples, sizeof(*nodes), cmpNodePtr); + + /* And emit results */ + out->nNodes = 0; + out->nodeLabels = (Datum *) palloc(sizeof(Datum) * in->nTuples); + out->mapTuplesToNodes = (int *) palloc(sizeof(int) * in->nTuples); + out->leafTupleDatums = (Datum *) palloc(sizeof(Datum) * in->nTuples); + + for (i = 0; i < in->nTuples; i++) + { + text *texti = DatumGetTextPP(nodes[i].d); + Datum leafD; + + if (i == 0 || nodes[i].c != nodes[i - 1].c) + { + out->nodeLabels[out->nNodes] = Int16GetDatum(nodes[i].c); + out->nNodes++; + } + + if (commonLen < VARSIZE_ANY_EXHDR(texti)) + leafD = formTextDatum(VARDATA_ANY(texti) + commonLen + 1, + VARSIZE_ANY_EXHDR(texti) - commonLen - 1); + else + leafD = formTextDatum(NULL, 0); + + out->leafTupleDatums[nodes[i].i] = leafD; + out->mapTuplesToNodes[nodes[i].i] = out->nNodes - 1; + } + + PG_RETURN_VOID(); +} + +Datum +spg_text_inner_consistent(PG_FUNCTION_ARGS) +{ + spgInnerConsistentIn *in = (spgInnerConsistentIn *) PG_GETARG_POINTER(0); + spgInnerConsistentOut *out = (spgInnerConsistentOut *) PG_GETARG_POINTER(1); + bool collate_is_c = lc_collate_is_c(PG_GET_COLLATION()); + text *reconstructedValue; + text *reconstrText; + int maxReconstrLen; + text *prefixText = NULL; + int prefixSize = 0; + int i; + + /* + * Reconstruct values represented at this tuple, including parent data, + * prefix of this tuple if any, and the node label if it's non-dummy. + * in->level should be the length of the previously reconstructed value, + * and the number of bytes added here is prefixSize or prefixSize + 1. + * + * Note: we assume that in->reconstructedValue isn't toasted and doesn't + * have a short varlena header. This is okay because it must have been + * created by a previous invocation of this routine, and we always emit + * long-format reconstructed values. + */ + reconstructedValue = (text *) DatumGetPointer(in->reconstructedValue); + Assert(reconstructedValue == NULL ? in->level == 0 : + VARSIZE_ANY_EXHDR(reconstructedValue) == in->level); + + maxReconstrLen = in->level + 1; + if (in->hasPrefix) + { + prefixText = DatumGetTextPP(in->prefixDatum); + prefixSize = VARSIZE_ANY_EXHDR(prefixText); + maxReconstrLen += prefixSize; + } + + reconstrText = palloc(VARHDRSZ + maxReconstrLen); + SET_VARSIZE(reconstrText, VARHDRSZ + maxReconstrLen); + + if (in->level) + memcpy(VARDATA(reconstrText), + VARDATA(reconstructedValue), + in->level); + if (prefixSize) + memcpy(((char *) VARDATA(reconstrText)) + in->level, + VARDATA_ANY(prefixText), + prefixSize); + /* last byte of reconstrText will be filled in below */ + + /* + * Scan the child nodes. For each one, complete the reconstructed value + * and see if it's consistent with the query. If so, emit an entry into + * the output arrays. 
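 * For example, if the value reconstructed so far is "ca" (level 2) and this
 * tuple's prefix is "r", then maxReconstrLen is 4; a child labeled 't'
 * yields the candidate "cart" with levelAdd 2, while the dummy label -1
 * yields just "car" with levelAdd 1.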
+ */ + out->nodeNumbers = (int *) palloc(sizeof(int) * in->nNodes); + out->levelAdds = (int *) palloc(sizeof(int) * in->nNodes); + out->reconstructedValues = (Datum *) palloc(sizeof(Datum) * in->nNodes); + out->nNodes = 0; + + for (i = 0; i < in->nNodes; i++) + { + int16 nodeChar = DatumGetInt16(in->nodeLabels[i]); + int thisLen; + bool res = true; + int j; + + /* If nodeChar is a dummy value, don't include it in data */ + if (nodeChar <= 0) + thisLen = maxReconstrLen - 1; + else + { + ((unsigned char *) VARDATA(reconstrText))[maxReconstrLen - 1] = nodeChar; + thisLen = maxReconstrLen; + } + + for (j = 0; j < in->nkeys; j++) + { + StrategyNumber strategy = in->scankeys[j].sk_strategy; + text *inText; + int inSize; + int r; + + /* + * If it's a collation-aware operator, but the collation is C, we + * can treat it as non-collation-aware. With non-C collation we + * need to traverse whole tree :-( so there's no point in making + * any check here. (Note also that our reconstructed value may + * well end with a partial multibyte character, so that applying + * any encoding-sensitive test to it would be risky anyhow.) + */ + if (SPG_IS_COLLATION_AWARE_STRATEGY(strategy)) + { + if (collate_is_c) + strategy -= SPG_STRATEGY_ADDITION; + else + continue; + } + + inText = DatumGetTextPP(in->scankeys[j].sk_argument); + inSize = VARSIZE_ANY_EXHDR(inText); + + r = memcmp(VARDATA(reconstrText), VARDATA_ANY(inText), + Min(inSize, thisLen)); + + switch (strategy) + { + case BTLessStrategyNumber: + case BTLessEqualStrategyNumber: + if (r > 0) + res = false; + break; + case BTEqualStrategyNumber: + if (r != 0 || inSize < thisLen) + res = false; + break; + case BTGreaterEqualStrategyNumber: + case BTGreaterStrategyNumber: + if (r < 0) + res = false; + break; + case RTPrefixStrategyNumber: + if (r != 0) + res = false; + break; + default: + elog(ERROR, "unrecognized strategy number: %d", + in->scankeys[j].sk_strategy); + break; + } + + if (!res) + break; /* no need to consider remaining conditions */ + } + + if (res) + { + out->nodeNumbers[out->nNodes] = i; + out->levelAdds[out->nNodes] = thisLen - in->level; + SET_VARSIZE(reconstrText, VARHDRSZ + thisLen); + out->reconstructedValues[out->nNodes] = + datumCopy(PointerGetDatum(reconstrText), false, -1); + out->nNodes++; + } + } + + PG_RETURN_VOID(); +} + +Datum +spg_text_leaf_consistent(PG_FUNCTION_ARGS) +{ + spgLeafConsistentIn *in = (spgLeafConsistentIn *) PG_GETARG_POINTER(0); + spgLeafConsistentOut *out = (spgLeafConsistentOut *) PG_GETARG_POINTER(1); + int level = in->level; + text *leafValue, + *reconstrValue = NULL; + char *fullValue; + int fullLen; + bool res; + int j; + + /* all tests are exact */ + out->recheck = false; + + leafValue = DatumGetTextPP(in->leafDatum); + + /* As above, in->reconstructedValue isn't toasted or short. */ + if (DatumGetPointer(in->reconstructedValue)) + reconstrValue = (text *) DatumGetPointer(in->reconstructedValue); + + Assert(reconstrValue == NULL ? 
level == 0 : + VARSIZE_ANY_EXHDR(reconstrValue) == level); + + /* Reconstruct the full string represented by this leaf tuple */ + fullLen = level + VARSIZE_ANY_EXHDR(leafValue); + if (VARSIZE_ANY_EXHDR(leafValue) == 0 && level > 0) + { + fullValue = VARDATA(reconstrValue); + out->leafValue = PointerGetDatum(reconstrValue); + } + else + { + text *fullText = palloc(VARHDRSZ + fullLen); + + SET_VARSIZE(fullText, VARHDRSZ + fullLen); + fullValue = VARDATA(fullText); + if (level) + memcpy(fullValue, VARDATA(reconstrValue), level); + if (VARSIZE_ANY_EXHDR(leafValue) > 0) + memcpy(fullValue + level, VARDATA_ANY(leafValue), + VARSIZE_ANY_EXHDR(leafValue)); + out->leafValue = PointerGetDatum(fullText); + } + + /* Perform the required comparison(s) */ + res = true; + for (j = 0; j < in->nkeys; j++) + { + StrategyNumber strategy = in->scankeys[j].sk_strategy; + text *query = DatumGetTextPP(in->scankeys[j].sk_argument); + int queryLen = VARSIZE_ANY_EXHDR(query); + int r; + + if (strategy == RTPrefixStrategyNumber) + { + /* + * if level >= length of query then reconstrValue must begin with + * query (prefix) string, so we don't need to check it again. + */ + res = (level >= queryLen) || + DatumGetBool(DirectFunctionCall2Coll(text_starts_with, + PG_GET_COLLATION(), + out->leafValue, + PointerGetDatum(query))); + + if (!res) /* no need to consider remaining conditions */ + break; + + continue; + } + + if (SPG_IS_COLLATION_AWARE_STRATEGY(strategy)) + { + /* Collation-aware comparison */ + strategy -= SPG_STRATEGY_ADDITION; + + /* If asserts enabled, verify encoding of reconstructed string */ + Assert(pg_verifymbstr(fullValue, fullLen, false)); + + r = varstr_cmp(fullValue, fullLen, + VARDATA_ANY(query), queryLen, + PG_GET_COLLATION()); + } + else + { + /* Non-collation-aware comparison */ + r = memcmp(fullValue, VARDATA_ANY(query), Min(queryLen, fullLen)); + + if (r == 0) + { + if (queryLen > fullLen) + r = -1; + else if (queryLen < fullLen) + r = 1; + } + } + + switch (strategy) + { + case BTLessStrategyNumber: + res = (r < 0); + break; + case BTLessEqualStrategyNumber: + res = (r <= 0); + break; + case BTEqualStrategyNumber: + res = (r == 0); + break; + case BTGreaterEqualStrategyNumber: + res = (r >= 0); + break; + case BTGreaterStrategyNumber: + res = (r > 0); + break; + default: + elog(ERROR, "unrecognized strategy number: %d", + in->scankeys[j].sk_strategy); + res = false; + break; + } + + if (!res) + break; /* no need to consider remaining conditions */ + } + + PG_RETURN_BOOL(res); +} diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c new file mode 100644 index 0000000..4484805 --- /dev/null +++ b/src/backend/access/spgist/spgutils.c @@ -0,0 +1,1350 @@ +/*------------------------------------------------------------------------- + * + * spgutils.c + * various support functions for SP-GiST + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/spgist/spgutils.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/amvalidate.h" +#include "access/htup_details.h" +#include "access/reloptions.h" +#include "access/spgist_private.h" +#include "access/toast_compression.h" +#include "access/transam.h" +#include "access/xact.h" +#include "catalog/pg_amop.h" +#include "commands/vacuum.h" +#include "nodes/nodeFuncs.h" +#include "parser/parse_coerce.h" 
+#include "storage/bufmgr.h" +#include "storage/indexfsm.h" +#include "storage/lmgr.h" +#include "utils/builtins.h" +#include "utils/catcache.h" +#include "utils/index_selfuncs.h" +#include "utils/lsyscache.h" +#include "utils/syscache.h" + + +/* + * SP-GiST handler function: return IndexAmRoutine with access method parameters + * and callbacks. + */ +Datum +spghandler(PG_FUNCTION_ARGS) +{ + IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); + + amroutine->amstrategies = 0; + amroutine->amsupport = SPGISTNProc; + amroutine->amoptsprocnum = SPGIST_OPTIONS_PROC; + amroutine->amcanorder = false; + amroutine->amcanorderbyop = true; + amroutine->amcanbackward = false; + amroutine->amcanunique = false; + amroutine->amcanmulticol = false; + amroutine->amoptionalkey = true; + amroutine->amsearcharray = false; + amroutine->amsearchnulls = true; + amroutine->amstorage = true; + amroutine->amclusterable = false; + amroutine->ampredlocks = false; + amroutine->amcanparallel = false; + amroutine->amcaninclude = true; + amroutine->amusemaintenanceworkmem = false; + amroutine->amparallelvacuumoptions = + VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP; + amroutine->amkeytype = InvalidOid; + + amroutine->ambuild = spgbuild; + amroutine->ambuildempty = spgbuildempty; + amroutine->aminsert = spginsert; + amroutine->ambulkdelete = spgbulkdelete; + amroutine->amvacuumcleanup = spgvacuumcleanup; + amroutine->amcanreturn = spgcanreturn; + amroutine->amcostestimate = spgcostestimate; + amroutine->amoptions = spgoptions; + amroutine->amproperty = spgproperty; + amroutine->ambuildphasename = NULL; + amroutine->amvalidate = spgvalidate; + amroutine->amadjustmembers = spgadjustmembers; + amroutine->ambeginscan = spgbeginscan; + amroutine->amrescan = spgrescan; + amroutine->amgettuple = spggettuple; + amroutine->amgetbitmap = spggetbitmap; + amroutine->amendscan = spgendscan; + amroutine->ammarkpos = NULL; + amroutine->amrestrpos = NULL; + amroutine->amestimateparallelscan = NULL; + amroutine->aminitparallelscan = NULL; + amroutine->amparallelrescan = NULL; + + PG_RETURN_POINTER(amroutine); +} + +/* + * GetIndexInputType + * Determine the nominal input data type for an index column + * + * We define the "nominal" input type as the associated opclass's opcintype, + * or if that is a polymorphic type, the base type of the heap column or + * expression that is the index's input. The reason for preferring the + * opcintype is that non-polymorphic opclasses probably don't want to hear + * about binary-compatible input types. For instance, if a text opclass + * is being used with a varchar heap column, we want to report "text" not + * "varchar". Likewise, opclasses don't want to hear about domain types, + * so if we do consult the actual input type, we make sure to flatten domains. + * + * At some point maybe this should go somewhere else, but it's not clear + * if any other index AMs have a use for it. + */ +static Oid +GetIndexInputType(Relation index, AttrNumber indexcol) +{ + Oid opcintype; + AttrNumber heapcol; + List *indexprs; + ListCell *indexpr_item; + + Assert(index->rd_index != NULL); + Assert(indexcol > 0 && indexcol <= index->rd_index->indnkeyatts); + opcintype = index->rd_opcintype[indexcol - 1]; + if (!IsPolymorphicType(opcintype)) + return opcintype; + heapcol = index->rd_index->indkey.values[indexcol - 1]; + if (heapcol != 0) /* Simple index column? 
*/ + return getBaseType(get_atttype(index->rd_index->indrelid, heapcol)); + + /* + * If the index expressions are already cached, skip calling + * RelationGetIndexExpressions, as it will make a copy which is overkill. + * We're not going to modify the trees, and we're not going to do anything + * that would invalidate the relcache entry before we're done. + */ + if (index->rd_indexprs) + indexprs = index->rd_indexprs; + else + indexprs = RelationGetIndexExpressions(index); + indexpr_item = list_head(indexprs); + for (int i = 1; i <= index->rd_index->indnkeyatts; i++) + { + if (index->rd_index->indkey.values[i - 1] == 0) + { + /* expression column */ + if (indexpr_item == NULL) + elog(ERROR, "wrong number of index expressions"); + if (i == indexcol) + return getBaseType(exprType((Node *) lfirst(indexpr_item))); + indexpr_item = lnext(indexprs, indexpr_item); + } + } + elog(ERROR, "wrong number of index expressions"); + return InvalidOid; /* keep compiler quiet */ +} + +/* Fill in a SpGistTypeDesc struct with info about the specified data type */ +static void +fillTypeDesc(SpGistTypeDesc *desc, Oid type) +{ + HeapTuple tp; + Form_pg_type typtup; + + desc->type = type; + tp = SearchSysCache1(TYPEOID, ObjectIdGetDatum(type)); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for type %u", type); + typtup = (Form_pg_type) GETSTRUCT(tp); + desc->attlen = typtup->typlen; + desc->attbyval = typtup->typbyval; + desc->attalign = typtup->typalign; + desc->attstorage = typtup->typstorage; + ReleaseSysCache(tp); +} + +/* + * Fetch local cache of AM-specific info about the index, initializing it + * if necessary + */ +SpGistCache * +spgGetCache(Relation index) +{ + SpGistCache *cache; + + if (index->rd_amcache == NULL) + { + Oid atttype; + spgConfigIn in; + FmgrInfo *procinfo; + Buffer metabuffer; + SpGistMetaPageData *metadata; + + cache = MemoryContextAllocZero(index->rd_indexcxt, + sizeof(SpGistCache)); + + /* SPGiST must have one key column and can also have INCLUDE columns */ + Assert(IndexRelationGetNumberOfKeyAttributes(index) == 1); + Assert(IndexRelationGetNumberOfAttributes(index) <= INDEX_MAX_KEYS); + + /* + * Get the actual (well, nominal) data type of the key column. We + * pass this to the opclass config function so that polymorphic + * opclasses are possible. + */ + atttype = GetIndexInputType(index, spgKeyColumn + 1); + + /* Call the config function to get config info for the opclass */ + in.attType = atttype; + + procinfo = index_getprocinfo(index, 1, SPGIST_CONFIG_PROC); + FunctionCall2Coll(procinfo, + index->rd_indcollation[spgKeyColumn], + PointerGetDatum(&in), + PointerGetDatum(&cache->config)); + + /* + * If leafType isn't specified, use the declared index column type, + * which index.c will have derived from the opclass's opcintype. + * (Although we now make spgvalidate.c warn if these aren't the same, + * old user-defined opclasses may not set the STORAGE parameter + * correctly, so believe leafType if it's given.) + */ + if (!OidIsValid(cache->config.leafType)) + { + cache->config.leafType = + TupleDescAttr(RelationGetDescr(index), spgKeyColumn)->atttypid; + + /* + * If index column type is binary-coercible to atttype (for + * example, it's a domain over atttype), treat it as plain atttype + * to avoid thinking we need to compress. 
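+ * (Otherwise the leafType != atttype test below would demand a compress
+ * method even though none is actually needed for a binary-coercible type.)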
+ */ + if (cache->config.leafType != atttype && + IsBinaryCoercible(cache->config.leafType, atttype)) + cache->config.leafType = atttype; + } + + /* Get the information we need about each relevant datatype */ + fillTypeDesc(&cache->attType, atttype); + + if (cache->config.leafType != atttype) + { + if (!OidIsValid(index_getprocid(index, 1, SPGIST_COMPRESS_PROC))) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("compress method must be defined when leaf type is different from input type"))); + + fillTypeDesc(&cache->attLeafType, cache->config.leafType); + } + else + { + /* Save lookups in this common case */ + cache->attLeafType = cache->attType; + } + + fillTypeDesc(&cache->attPrefixType, cache->config.prefixType); + fillTypeDesc(&cache->attLabelType, cache->config.labelType); + + /* Last, get the lastUsedPages data from the metapage */ + metabuffer = ReadBuffer(index, SPGIST_METAPAGE_BLKNO); + LockBuffer(metabuffer, BUFFER_LOCK_SHARE); + + metadata = SpGistPageGetMeta(BufferGetPage(metabuffer)); + + if (metadata->magicNumber != SPGIST_MAGIC_NUMBER) + elog(ERROR, "index \"%s\" is not an SP-GiST index", + RelationGetRelationName(index)); + + cache->lastUsedPages = metadata->lastUsedPages; + + UnlockReleaseBuffer(metabuffer); + + index->rd_amcache = (void *) cache; + } + else + { + /* assume it's up to date */ + cache = (SpGistCache *) index->rd_amcache; + } + + return cache; +} + +/* + * Compute a tuple descriptor for leaf tuples or index-only-scan result tuples. + * + * We can use the relcache's tupdesc as-is in many cases, and it's always + * OK so far as any INCLUDE columns are concerned. However, the entry for + * the key column has to match leafType in the first case or attType in the + * second case. While the relcache's tupdesc *should* show leafType, this + * might not hold for legacy user-defined opclasses, since before v14 they + * were not allowed to declare their true storage type in CREATE OPCLASS. + * Also, attType can be different from what is in the relcache. + * + * This function gives back either a pointer to the relcache's tupdesc + * if that is suitable, or a palloc'd copy that's been adjusted to match + * the specified key column type. We can avoid doing any catalog lookups + * here by insisting that the caller pass an SpGistTypeDesc not just an OID. 
+ */ +TupleDesc +getSpGistTupleDesc(Relation index, SpGistTypeDesc *keyType) +{ + TupleDesc outTupDesc; + Form_pg_attribute att; + + if (keyType->type == + TupleDescAttr(RelationGetDescr(index), spgKeyColumn)->atttypid) + outTupDesc = RelationGetDescr(index); + else + { + outTupDesc = CreateTupleDescCopy(RelationGetDescr(index)); + att = TupleDescAttr(outTupDesc, spgKeyColumn); + /* It's sufficient to update the type-dependent fields of the column */ + att->atttypid = keyType->type; + att->atttypmod = -1; + att->attlen = keyType->attlen; + att->attbyval = keyType->attbyval; + att->attalign = keyType->attalign; + att->attstorage = keyType->attstorage; + /* We shouldn't need to bother with making these valid: */ + att->attcompression = InvalidCompressionMethod; + att->attcollation = InvalidOid; + /* In case we changed typlen, we'd better reset following offsets */ + for (int i = spgFirstIncludeColumn; i < outTupDesc->natts; i++) + TupleDescAttr(outTupDesc, i)->attcacheoff = -1; + } + return outTupDesc; +} + +/* Initialize SpGistState for working with the given index */ +void +initSpGistState(SpGistState *state, Relation index) +{ + SpGistCache *cache; + + state->index = index; + + /* Get cached static information about index */ + cache = spgGetCache(index); + + state->config = cache->config; + state->attType = cache->attType; + state->attLeafType = cache->attLeafType; + state->attPrefixType = cache->attPrefixType; + state->attLabelType = cache->attLabelType; + + /* Ensure we have a valid descriptor for leaf tuples */ + state->leafTupDesc = getSpGistTupleDesc(state->index, &state->attLeafType); + + /* Make workspace for constructing dead tuples */ + state->deadTupleStorage = palloc0(SGDTSIZE); + + /* Set XID to use in redirection tuples */ + state->myXid = GetTopTransactionIdIfAny(); + + /* Assume we're not in an index build (spgbuild will override) */ + state->isBuild = false; +} + +/* + * Allocate a new page (either by recycling, or by extending the index file). + * + * The returned buffer is already pinned and exclusive-locked. + * Caller is responsible for initializing the page by calling SpGistInitBuffer. + */ +Buffer +SpGistNewBuffer(Relation index) +{ + Buffer buffer; + bool needLock; + + /* First, try to get a page from FSM */ + for (;;) + { + BlockNumber blkno = GetFreeIndexPage(index); + + if (blkno == InvalidBlockNumber) + break; /* nothing known to FSM */ + + /* + * The fixed pages shouldn't ever be listed in FSM, but just in case + * one is, ignore it. + */ + if (SpGistBlockIsFixed(blkno)) + continue; + + buffer = ReadBuffer(index, blkno); + + /* + * We have to guard against the possibility that someone else already + * recycled this page; the buffer may be locked if so. 
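+ * (That is why the code below uses ConditionalLockBuffer: if the lock is
+ * not immediately available, we just drop this candidate page and ask the
+ * FSM for another one.)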
+ */ + if (ConditionalLockBuffer(buffer)) + { + Page page = BufferGetPage(buffer); + + if (PageIsNew(page)) + return buffer; /* OK to use, if never initialized */ + + if (SpGistPageIsDeleted(page) || PageIsEmpty(page)) + return buffer; /* OK to use */ + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + } + + /* Can't use it, so release buffer and try again */ + ReleaseBuffer(buffer); + } + + /* Must extend the file */ + needLock = !RELATION_IS_LOCAL(index); + if (needLock) + LockRelationForExtension(index, ExclusiveLock); + + buffer = ReadBuffer(index, P_NEW); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + if (needLock) + UnlockRelationForExtension(index, ExclusiveLock); + + return buffer; +} + +/* + * Update index metapage's lastUsedPages info from local cache, if possible + * + * Updating meta page isn't critical for index working, so + * 1 use ConditionalLockBuffer to improve concurrency + * 2 don't WAL-log metabuffer changes to decrease WAL traffic + */ +void +SpGistUpdateMetaPage(Relation index) +{ + SpGistCache *cache = (SpGistCache *) index->rd_amcache; + + if (cache != NULL) + { + Buffer metabuffer; + + metabuffer = ReadBuffer(index, SPGIST_METAPAGE_BLKNO); + + if (ConditionalLockBuffer(metabuffer)) + { + Page metapage = BufferGetPage(metabuffer); + SpGistMetaPageData *metadata = SpGistPageGetMeta(metapage); + + metadata->lastUsedPages = cache->lastUsedPages; + + /* + * Set pd_lower just past the end of the metadata. This is + * essential, because without doing so, metadata will be lost if + * xlog.c compresses the page. (We must do this here because + * pre-v11 versions of PG did not set the metapage's pd_lower + * correctly, so a pg_upgraded index might contain the wrong + * value.) + */ + ((PageHeader) metapage)->pd_lower = + ((char *) metadata + sizeof(SpGistMetaPageData)) - (char *) metapage; + + MarkBufferDirty(metabuffer); + UnlockReleaseBuffer(metabuffer); + } + else + { + ReleaseBuffer(metabuffer); + } + } +} + +/* Macro to select proper element of lastUsedPages cache depending on flags */ +/* Masking flags with SPGIST_CACHED_PAGES is just for paranoia's sake */ +#define GET_LUP(c, f) (&(c)->lastUsedPages.cachedPage[((unsigned int) (f)) % SPGIST_CACHED_PAGES]) + +/* + * Allocate and initialize a new buffer of the type and parity specified by + * flags. The returned buffer is already pinned and exclusive-locked. + * + * When requesting an inner page, if we get one with the wrong parity, + * we just release the buffer and try again. We will get a different page + * because GetFreeIndexPage will have marked the page used in FSM. The page + * is entered in our local lastUsedPages cache, so there's some hope of + * making use of it later in this session, but otherwise we rely on VACUUM + * to eventually re-enter the page in FSM, making it available for recycling. + * Note that such a page does not get marked dirty here, so unless it's used + * fairly soon, the buffer will just get discarded and the page will remain + * as it was on disk. + * + * When we return a buffer to the caller, the page is *not* entered into + * the lastUsedPages cache; we expect the caller will do so after it's taken + * whatever space it will use. This is because after the caller has used up + * some space, the page might have less space than whatever was cached already + * so we'd rather not trash the old cache entry. 
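+ * (A page's parity is derived from its block number, cf. GBUF_INNER_PARITY;
+ * the caller requests a particular parity via the GBUF_PARITY_MASK bits of
+ * "flags", which is why a freshly extended page can come back with the
+ * "wrong" parity.)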
+ */ +static Buffer +allocNewBuffer(Relation index, int flags) +{ + SpGistCache *cache = spgGetCache(index); + uint16 pageflags = 0; + + if (GBUF_REQ_LEAF(flags)) + pageflags |= SPGIST_LEAF; + if (GBUF_REQ_NULLS(flags)) + pageflags |= SPGIST_NULLS; + + for (;;) + { + Buffer buffer; + + buffer = SpGistNewBuffer(index); + SpGistInitBuffer(buffer, pageflags); + + if (pageflags & SPGIST_LEAF) + { + /* Leaf pages have no parity concerns, so just use it */ + return buffer; + } + else + { + BlockNumber blkno = BufferGetBlockNumber(buffer); + int blkFlags = GBUF_INNER_PARITY(blkno); + + if ((flags & GBUF_PARITY_MASK) == blkFlags) + { + /* Page has right parity, use it */ + return buffer; + } + else + { + /* Page has wrong parity, record it in cache and try again */ + if (pageflags & SPGIST_NULLS) + blkFlags |= GBUF_NULLS; + cache->lastUsedPages.cachedPage[blkFlags].blkno = blkno; + cache->lastUsedPages.cachedPage[blkFlags].freeSpace = + PageGetExactFreeSpace(BufferGetPage(buffer)); + UnlockReleaseBuffer(buffer); + } + } + } +} + +/* + * Get a buffer of the type and parity specified by flags, having at least + * as much free space as indicated by needSpace. We use the lastUsedPages + * cache to assign the same buffer previously requested when possible. + * The returned buffer is already pinned and exclusive-locked. + * + * *isNew is set true if the page was initialized here, false if it was + * already valid. + */ +Buffer +SpGistGetBuffer(Relation index, int flags, int needSpace, bool *isNew) +{ + SpGistCache *cache = spgGetCache(index); + SpGistLastUsedPage *lup; + + /* Bail out if even an empty page wouldn't meet the demand */ + if (needSpace > SPGIST_PAGE_CAPACITY) + elog(ERROR, "desired SPGiST tuple size is too big"); + + /* + * If possible, increase the space request to include relation's + * fillfactor. This ensures that when we add unrelated tuples to a page, + * we try to keep 100-fillfactor% available for adding tuples that are + * related to the ones already on it. But fillfactor mustn't cause an + * error for requests that would otherwise be legal. + */ + needSpace += SpGistGetTargetPageFreeSpace(index); + needSpace = Min(needSpace, SPGIST_PAGE_CAPACITY); + + /* Get the cache entry for this flags setting */ + lup = GET_LUP(cache, flags); + + /* If we have nothing cached, just turn it over to allocNewBuffer */ + if (lup->blkno == InvalidBlockNumber) + { + *isNew = true; + return allocNewBuffer(index, flags); + } + + /* fixed pages should never be in cache */ + Assert(!SpGistBlockIsFixed(lup->blkno)); + + /* If cached freeSpace isn't enough, don't bother looking at the page */ + if (lup->freeSpace >= needSpace) + { + Buffer buffer; + Page page; + + buffer = ReadBuffer(index, lup->blkno); + + if (!ConditionalLockBuffer(buffer)) + { + /* + * buffer is locked by another process, so return a new buffer + */ + ReleaseBuffer(buffer); + *isNew = true; + return allocNewBuffer(index, flags); + } + + page = BufferGetPage(buffer); + + if (PageIsNew(page) || SpGistPageIsDeleted(page) || PageIsEmpty(page)) + { + /* OK to initialize the page */ + uint16 pageflags = 0; + + if (GBUF_REQ_LEAF(flags)) + pageflags |= SPGIST_LEAF; + if (GBUF_REQ_NULLS(flags)) + pageflags |= SPGIST_NULLS; + SpGistInitBuffer(buffer, pageflags); + lup->freeSpace = PageGetExactFreeSpace(page) - needSpace; + *isNew = true; + return buffer; + } + + /* + * Check that page is of right type and has enough space. We must + * recheck this since our cache isn't necessarily up to date. + */ + if ((GBUF_REQ_LEAF(flags) ? 
SpGistPageIsLeaf(page) : !SpGistPageIsLeaf(page)) && + (GBUF_REQ_NULLS(flags) ? SpGistPageStoresNulls(page) : !SpGistPageStoresNulls(page))) + { + int freeSpace = PageGetExactFreeSpace(page); + + if (freeSpace >= needSpace) + { + /* Success, update freespace info and return the buffer */ + lup->freeSpace = freeSpace - needSpace; + *isNew = false; + return buffer; + } + } + + /* + * fallback to allocation of new buffer + */ + UnlockReleaseBuffer(buffer); + } + + /* No success with cache, so return a new buffer */ + *isNew = true; + return allocNewBuffer(index, flags); +} + +/* + * Update lastUsedPages cache when done modifying a page. + * + * We update the appropriate cache entry if it already contained this page + * (its freeSpace is likely obsolete), or if this page has more space than + * whatever we had cached. + */ +void +SpGistSetLastUsedPage(Relation index, Buffer buffer) +{ + SpGistCache *cache = spgGetCache(index); + SpGistLastUsedPage *lup; + int freeSpace; + Page page = BufferGetPage(buffer); + BlockNumber blkno = BufferGetBlockNumber(buffer); + int flags; + + /* Never enter fixed pages (root pages) in cache, though */ + if (SpGistBlockIsFixed(blkno)) + return; + + if (SpGistPageIsLeaf(page)) + flags = GBUF_LEAF; + else + flags = GBUF_INNER_PARITY(blkno); + if (SpGistPageStoresNulls(page)) + flags |= GBUF_NULLS; + + lup = GET_LUP(cache, flags); + + freeSpace = PageGetExactFreeSpace(page); + if (lup->blkno == InvalidBlockNumber || lup->blkno == blkno || + lup->freeSpace < freeSpace) + { + lup->blkno = blkno; + lup->freeSpace = freeSpace; + } +} + +/* + * Initialize an SPGiST page to empty, with specified flags + */ +void +SpGistInitPage(Page page, uint16 f) +{ + SpGistPageOpaque opaque; + + PageInit(page, BLCKSZ, sizeof(SpGistPageOpaqueData)); + opaque = SpGistPageGetOpaque(page); + opaque->flags = f; + opaque->spgist_page_id = SPGIST_PAGE_ID; +} + +/* + * Initialize a buffer's page to empty, with specified flags + */ +void +SpGistInitBuffer(Buffer b, uint16 f) +{ + Assert(BufferGetPageSize(b) == BLCKSZ); + SpGistInitPage(BufferGetPage(b), f); +} + +/* + * Initialize metadata page + */ +void +SpGistInitMetapage(Page page) +{ + SpGistMetaPageData *metadata; + int i; + + SpGistInitPage(page, SPGIST_META); + metadata = SpGistPageGetMeta(page); + memset(metadata, 0, sizeof(SpGistMetaPageData)); + metadata->magicNumber = SPGIST_MAGIC_NUMBER; + + /* initialize last-used-page cache to empty */ + for (i = 0; i < SPGIST_CACHED_PAGES; i++) + metadata->lastUsedPages.cachedPage[i].blkno = InvalidBlockNumber; + + /* + * Set pd_lower just past the end of the metadata. This is essential, + * because without doing so, metadata will be lost if xlog.c compresses + * the page. + */ + ((PageHeader) page)->pd_lower = + ((char *) metadata + sizeof(SpGistMetaPageData)) - (char *) page; +} + +/* + * reloptions processing for SPGiST + */ +bytea * +spgoptions(Datum reloptions, bool validate) +{ + static const relopt_parse_elt tab[] = { + {"fillfactor", RELOPT_TYPE_INT, offsetof(SpGistOptions, fillfactor)}, + }; + + return (bytea *) build_reloptions(reloptions, validate, + RELOPT_KIND_SPGIST, + sizeof(SpGistOptions), + tab, lengthof(tab)); + +} + +/* + * Get the space needed to store a non-null datum of the indicated type + * in an inner tuple (that is, as a prefix or node label). + * Note the result is already rounded up to a MAXALIGN boundary. + * Here we follow the convention that pass-by-val types are just stored + * in their Datum representation (compare memcpyInnerDatum). 
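+ * (Hence a pass-by-value prefix or label always costs MAXALIGN(sizeof(Datum))
+ * bytes here, regardless of its nominal typlen.)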
+ */ +unsigned int +SpGistGetInnerTypeSize(SpGistTypeDesc *att, Datum datum) +{ + unsigned int size; + + if (att->attbyval) + size = sizeof(Datum); + else if (att->attlen > 0) + size = att->attlen; + else + size = VARSIZE_ANY(datum); + + return MAXALIGN(size); +} + +/* + * Copy the given non-null datum to *target, in the inner-tuple case + */ +static void +memcpyInnerDatum(void *target, SpGistTypeDesc *att, Datum datum) +{ + unsigned int size; + + if (att->attbyval) + { + memcpy(target, &datum, sizeof(Datum)); + } + else + { + size = (att->attlen > 0) ? att->attlen : VARSIZE_ANY(datum); + memcpy(target, DatumGetPointer(datum), size); + } +} + +/* + * Compute space required for a leaf tuple holding the given data. + * + * This must match the size-calculation portion of spgFormLeafTuple. + */ +Size +SpGistGetLeafTupleSize(TupleDesc tupleDescriptor, + Datum *datums, bool *isnulls) +{ + Size size; + Size data_size; + bool needs_null_mask = false; + int natts = tupleDescriptor->natts; + + /* + * Decide whether we need a nulls bitmask. + * + * If there is only a key attribute (natts == 1), never use a bitmask, for + * compatibility with the pre-v14 layout of leaf tuples. Otherwise, we + * need one if any attribute is null. + */ + if (natts > 1) + { + for (int i = 0; i < natts; i++) + { + if (isnulls[i]) + { + needs_null_mask = true; + break; + } + } + } + + /* + * Calculate size of the data part; same as for heap tuples. + */ + data_size = heap_compute_data_size(tupleDescriptor, datums, isnulls); + + /* + * Compute total size. + */ + size = SGLTHDRSZ(needs_null_mask); + size += data_size; + size = MAXALIGN(size); + + /* + * Ensure that we can replace the tuple with a dead tuple later. This test + * is unnecessary when there are any non-null attributes, but be safe. + */ + if (size < SGDTSIZE) + size = SGDTSIZE; + + return size; +} + +/* + * Construct a leaf tuple containing the given heap TID and datum values + */ +SpGistLeafTuple +spgFormLeafTuple(SpGistState *state, ItemPointer heapPtr, + Datum *datums, bool *isnulls) +{ + SpGistLeafTuple tup; + TupleDesc tupleDescriptor = state->leafTupDesc; + Size size; + Size hoff; + Size data_size; + bool needs_null_mask = false; + int natts = tupleDescriptor->natts; + char *tp; /* ptr to tuple data */ + uint16 tupmask = 0; /* unused heap_fill_tuple output */ + + /* + * Decide whether we need a nulls bitmask. + * + * If there is only a key attribute (natts == 1), never use a bitmask, for + * compatibility with the pre-v14 layout of leaf tuples. Otherwise, we + * need one if any attribute is null. + */ + if (natts > 1) + { + for (int i = 0; i < natts; i++) + { + if (isnulls[i]) + { + needs_null_mask = true; + break; + } + } + } + + /* + * Calculate size of the data part; same as for heap tuples. + */ + data_size = heap_compute_data_size(tupleDescriptor, datums, isnulls); + + /* + * Compute total size. + */ + hoff = SGLTHDRSZ(needs_null_mask); + size = hoff + data_size; + size = MAXALIGN(size); + + /* + * Ensure that we can replace the tuple with a dead tuple later. This test + * is unnecessary when there are any non-null attributes, but be safe. 
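+ * (SGDTSIZE is the fixed size of a dead or placeholder tuple; compare
+ * spgFormDeadTuple below.)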
+ */ + if (size < SGDTSIZE) + size = SGDTSIZE; + + /* OK, form the tuple */ + tup = (SpGistLeafTuple) palloc0(size); + + tup->size = size; + SGLT_SET_NEXTOFFSET(tup, InvalidOffsetNumber); + tup->heapPtr = *heapPtr; + + tp = (char *) tup + hoff; + + if (needs_null_mask) + { + bits8 *bp; /* ptr to null bitmap in tuple */ + + /* Set nullmask presence bit in SpGistLeafTuple header */ + SGLT_SET_HASNULLMASK(tup, true); + /* Fill the data area and null mask */ + bp = (bits8 *) ((char *) tup + sizeof(SpGistLeafTupleData)); + heap_fill_tuple(tupleDescriptor, datums, isnulls, tp, data_size, + &tupmask, bp); + } + else if (natts > 1 || !isnulls[spgKeyColumn]) + { + /* Fill data area only */ + heap_fill_tuple(tupleDescriptor, datums, isnulls, tp, data_size, + &tupmask, (bits8 *) NULL); + } + /* otherwise we have no data, nor a bitmap, to fill */ + + return tup; +} + +/* + * Construct a node (to go into an inner tuple) containing the given label + * + * Note that the node's downlink is just set invalid here. Caller will fill + * it in later. + */ +SpGistNodeTuple +spgFormNodeTuple(SpGistState *state, Datum label, bool isnull) +{ + SpGistNodeTuple tup; + unsigned int size; + unsigned short infomask = 0; + + /* compute space needed (note result is already maxaligned) */ + size = SGNTHDRSZ; + if (!isnull) + size += SpGistGetInnerTypeSize(&state->attLabelType, label); + + /* + * Here we make sure that the size will fit in the field reserved for it + * in t_info. + */ + if ((size & INDEX_SIZE_MASK) != size) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("index row requires %zu bytes, maximum size is %zu", + (Size) size, (Size) INDEX_SIZE_MASK))); + + tup = (SpGistNodeTuple) palloc0(size); + + if (isnull) + infomask |= INDEX_NULL_MASK; + /* we don't bother setting the INDEX_VAR_MASK bit */ + infomask |= size; + tup->t_info = infomask; + + /* The TID field will be filled in later */ + ItemPointerSetInvalid(&tup->t_tid); + + if (!isnull) + memcpyInnerDatum(SGNTDATAPTR(tup), &state->attLabelType, label); + + return tup; +} + +/* + * Construct an inner tuple containing the given prefix and node array + */ +SpGistInnerTuple +spgFormInnerTuple(SpGistState *state, bool hasPrefix, Datum prefix, + int nNodes, SpGistNodeTuple *nodes) +{ + SpGistInnerTuple tup; + unsigned int size; + unsigned int prefixSize; + int i; + char *ptr; + + /* Compute size needed */ + if (hasPrefix) + prefixSize = SpGistGetInnerTypeSize(&state->attPrefixType, prefix); + else + prefixSize = 0; + + size = SGITHDRSZ + prefixSize; + + /* Note: we rely on node tuple sizes to be maxaligned already */ + for (i = 0; i < nNodes; i++) + size += IndexTupleSize(nodes[i]); + + /* + * Ensure that we can replace the tuple with a dead tuple later. This + * test is unnecessary given current tuple layouts, but let's be safe. 
+ */ + if (size < SGDTSIZE) + size = SGDTSIZE; + + /* + * Inner tuple should be small enough to fit on a page + */ + if (size > SPGIST_PAGE_CAPACITY - sizeof(ItemIdData)) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("SP-GiST inner tuple size %zu exceeds maximum %zu", + (Size) size, + SPGIST_PAGE_CAPACITY - sizeof(ItemIdData)), + errhint("Values larger than a buffer page cannot be indexed."))); + + /* + * Check for overflow of header fields --- probably can't fail if the + * above succeeded, but let's be paranoid + */ + if (size > SGITMAXSIZE || + prefixSize > SGITMAXPREFIXSIZE || + nNodes > SGITMAXNNODES) + elog(ERROR, "SPGiST inner tuple header field is too small"); + + /* OK, form the tuple */ + tup = (SpGistInnerTuple) palloc0(size); + + tup->nNodes = nNodes; + tup->prefixSize = prefixSize; + tup->size = size; + + if (hasPrefix) + memcpyInnerDatum(SGITDATAPTR(tup), &state->attPrefixType, prefix); + + ptr = (char *) SGITNODEPTR(tup); + + for (i = 0; i < nNodes; i++) + { + SpGistNodeTuple node = nodes[i]; + + memcpy(ptr, node, IndexTupleSize(node)); + ptr += IndexTupleSize(node); + } + + return tup; +} + +/* + * Construct a "dead" tuple to replace a tuple being deleted. + * + * The state can be SPGIST_REDIRECT, SPGIST_DEAD, or SPGIST_PLACEHOLDER. + * For a REDIRECT tuple, a pointer (blkno+offset) must be supplied, and + * the xid field is filled in automatically. + * + * This is called in critical sections, so we don't use palloc; the tuple + * is built in preallocated storage. It should be copied before another + * call with different parameters can occur. + */ +SpGistDeadTuple +spgFormDeadTuple(SpGistState *state, int tupstate, + BlockNumber blkno, OffsetNumber offnum) +{ + SpGistDeadTuple tuple = (SpGistDeadTuple) state->deadTupleStorage; + + tuple->tupstate = tupstate; + tuple->size = SGDTSIZE; + SGLT_SET_NEXTOFFSET(tuple, InvalidOffsetNumber); + + if (tupstate == SPGIST_REDIRECT) + { + ItemPointerSet(&tuple->pointer, blkno, offnum); + Assert(TransactionIdIsValid(state->myXid)); + tuple->xid = state->myXid; + } + else + { + ItemPointerSetInvalid(&tuple->pointer); + tuple->xid = InvalidTransactionId; + } + + return tuple; +} + +/* + * Convert an SPGiST leaf tuple into Datum/isnull arrays. + * + * The caller must allocate sufficient storage for the output arrays. + * (INDEX_MAX_KEYS entries should be enough.) + */ +void +spgDeformLeafTuple(SpGistLeafTuple tup, TupleDesc tupleDescriptor, + Datum *datums, bool *isnulls, bool keyColumnIsNull) +{ + bool hasNullsMask = SGLT_GET_HASNULLMASK(tup); + char *tp; /* ptr to tuple data */ + bits8 *bp; /* ptr to null bitmap in tuple */ + + if (keyColumnIsNull && tupleDescriptor->natts == 1) + { + /* + * Trivial case: there is only the key attribute and we're in a nulls + * tree. The hasNullsMask bit in the tuple header should not be set + * (and thus we can't use index_deform_tuple_internal), but + * nonetheless the result is NULL. + * + * Note: currently this is dead code, because noplace calls this when + * there is only the key attribute. But we should cover the case. + */ + Assert(!hasNullsMask); + + datums[spgKeyColumn] = (Datum) 0; + isnulls[spgKeyColumn] = true; + return; + } + + tp = (char *) tup + SGLTHDRSZ(hasNullsMask); + bp = (bits8 *) ((char *) tup + sizeof(SpGistLeafTupleData)); + + index_deform_tuple_internal(tupleDescriptor, + datums, isnulls, + tp, bp, hasNullsMask); + + /* + * Key column isnull value from the tuple should be consistent with + * keyColumnIsNull flag from the caller. 
+ */ + Assert(keyColumnIsNull == isnulls[spgKeyColumn]); +} + +/* + * Extract the label datums of the nodes within innerTuple + * + * Returns NULL if label datums are NULLs + */ +Datum * +spgExtractNodeLabels(SpGistState *state, SpGistInnerTuple innerTuple) +{ + Datum *nodeLabels; + int i; + SpGistNodeTuple node; + + /* Either all the labels must be NULL, or none. */ + node = SGITNODEPTR(innerTuple); + if (IndexTupleHasNulls(node)) + { + SGITITERATE(innerTuple, i, node) + { + if (!IndexTupleHasNulls(node)) + elog(ERROR, "some but not all node labels are null in SPGiST inner tuple"); + } + /* They're all null, so just return NULL */ + return NULL; + } + else + { + nodeLabels = (Datum *) palloc(sizeof(Datum) * innerTuple->nNodes); + SGITITERATE(innerTuple, i, node) + { + if (IndexTupleHasNulls(node)) + elog(ERROR, "some but not all node labels are null in SPGiST inner tuple"); + nodeLabels[i] = SGNTDATUM(node, state); + } + return nodeLabels; + } +} + +/* + * Add a new item to the page, replacing a PLACEHOLDER item if possible. + * Return the location it's inserted at, or InvalidOffsetNumber on failure. + * + * If startOffset isn't NULL, we start searching for placeholders at + * *startOffset, and update that to the next place to search. This is just + * an optimization for repeated insertions. + * + * If errorOK is false, we throw error when there's not enough room, + * rather than returning InvalidOffsetNumber. + */ +OffsetNumber +SpGistPageAddNewItem(SpGistState *state, Page page, Item item, Size size, + OffsetNumber *startOffset, bool errorOK) +{ + SpGistPageOpaque opaque = SpGistPageGetOpaque(page); + OffsetNumber i, + maxoff, + offnum; + + if (opaque->nPlaceholder > 0 && + PageGetExactFreeSpace(page) + SGDTSIZE >= MAXALIGN(size)) + { + /* Try to replace a placeholder */ + maxoff = PageGetMaxOffsetNumber(page); + offnum = InvalidOffsetNumber; + + for (;;) + { + if (startOffset && *startOffset != InvalidOffsetNumber) + i = *startOffset; + else + i = FirstOffsetNumber; + for (; i <= maxoff; i++) + { + SpGistDeadTuple it = (SpGistDeadTuple) PageGetItem(page, + PageGetItemId(page, i)); + + if (it->tupstate == SPGIST_PLACEHOLDER) + { + offnum = i; + break; + } + } + + /* Done if we found a placeholder */ + if (offnum != InvalidOffsetNumber) + break; + + if (startOffset && *startOffset != InvalidOffsetNumber) + { + /* Hint was no good, re-search from beginning */ + *startOffset = InvalidOffsetNumber; + continue; + } + + /* Hmm, no placeholder found? */ + opaque->nPlaceholder = 0; + break; + } + + if (offnum != InvalidOffsetNumber) + { + /* Replace the placeholder tuple */ + PageIndexTupleDelete(page, offnum); + + offnum = PageAddItem(page, item, size, offnum, false, false); + + /* + * We should not have failed given the size check at the top of + * the function, but test anyway. If we did fail, we must PANIC + * because we've already deleted the placeholder tuple, and + * there's no other way to keep the damage from getting to disk. 
+ */ + if (offnum != InvalidOffsetNumber) + { + Assert(opaque->nPlaceholder > 0); + opaque->nPlaceholder--; + if (startOffset) + *startOffset = offnum + 1; + } + else + elog(PANIC, "failed to add item of size %u to SPGiST index page", + (int) size); + + return offnum; + } + } + + /* No luck in replacing a placeholder, so just add it to the page */ + offnum = PageAddItem(page, item, size, + InvalidOffsetNumber, false, false); + + if (offnum == InvalidOffsetNumber && !errorOK) + elog(ERROR, "failed to add item of size %u to SPGiST index page", + (int) size); + + return offnum; +} + +/* + * spgproperty() -- Check boolean properties of indexes. + * + * This is optional for most AMs, but is required for SP-GiST because the core + * property code doesn't support AMPROP_DISTANCE_ORDERABLE. + */ +bool +spgproperty(Oid index_oid, int attno, + IndexAMProperty prop, const char *propname, + bool *res, bool *isnull) +{ + Oid opclass, + opfamily, + opcintype; + CatCList *catlist; + int i; + + /* Only answer column-level inquiries */ + if (attno == 0) + return false; + + switch (prop) + { + case AMPROP_DISTANCE_ORDERABLE: + break; + default: + return false; + } + + /* + * Currently, SP-GiST distance-ordered scans require that there be a + * distance operator in the opclass with the default types. So we assume + * that if such a operator exists, then there's a reason for it. + */ + + /* First we need to know the column's opclass. */ + opclass = get_index_column_opclass(index_oid, attno); + if (!OidIsValid(opclass)) + { + *isnull = true; + return true; + } + + /* Now look up the opclass family and input datatype. */ + if (!get_opclass_opfamily_and_input_type(opclass, &opfamily, &opcintype)) + { + *isnull = true; + return true; + } + + /* And now we can check whether the operator is provided. 
*/ + catlist = SearchSysCacheList1(AMOPSTRATEGY, + ObjectIdGetDatum(opfamily)); + + *res = false; + + for (i = 0; i < catlist->n_members; i++) + { + HeapTuple amoptup = &catlist->members[i]->tuple; + Form_pg_amop amopform = (Form_pg_amop) GETSTRUCT(amoptup); + + if (amopform->amoppurpose == AMOP_ORDER && + (amopform->amoplefttype == opcintype || + amopform->amoprighttype == opcintype) && + opfamily_can_sort_type(amopform->amopsortfamily, + get_op_rettype(amopform->amopopr))) + { + *res = true; + break; + } + } + + ReleaseSysCacheList(catlist); + + *isnull = false; + + return true; +} diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c new file mode 100644 index 0000000..76fb037 --- /dev/null +++ b/src/backend/access/spgist/spgvacuum.c @@ -0,0 +1,975 @@ +/*------------------------------------------------------------------------- + * + * spgvacuum.c + * vacuum for SP-GiST + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/spgist/spgvacuum.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/genam.h" +#include "access/spgist_private.h" +#include "access/spgxlog.h" +#include "access/transam.h" +#include "access/xloginsert.h" +#include "catalog/storage_xlog.h" +#include "commands/vacuum.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "storage/indexfsm.h" +#include "storage/lmgr.h" +#include "utils/snapmgr.h" + + +/* Entry in pending-list of TIDs we need to revisit */ +typedef struct spgVacPendingItem +{ + ItemPointerData tid; /* redirection target to visit */ + bool done; /* have we dealt with this? */ + struct spgVacPendingItem *next; /* list link */ +} spgVacPendingItem; + +/* Local state for vacuum operations */ +typedef struct spgBulkDeleteState +{ + /* Parameters passed in to spgvacuumscan */ + IndexVacuumInfo *info; + IndexBulkDeleteResult *stats; + IndexBulkDeleteCallback callback; + void *callback_state; + + /* Additional working state */ + SpGistState spgstate; /* for SPGiST operations that need one */ + spgVacPendingItem *pendingList; /* TIDs we need to (re)visit */ + TransactionId myXmin; /* for detecting newly-added redirects */ + BlockNumber lastFilledBlock; /* last non-deletable block */ +} spgBulkDeleteState; + + +/* + * Add TID to pendingList, but only if not already present. + * + * Note that new items are always appended at the end of the list; this + * ensures that scans of the list don't miss items added during the scan. 
+ */ +static void +spgAddPendingTID(spgBulkDeleteState *bds, ItemPointer tid) +{ + spgVacPendingItem *pitem; + spgVacPendingItem **listLink; + + /* search the list for pre-existing entry */ + listLink = &bds->pendingList; + while (*listLink != NULL) + { + pitem = *listLink; + if (ItemPointerEquals(tid, &pitem->tid)) + return; /* already in list, do nothing */ + listLink = &pitem->next; + } + /* not there, so append new entry */ + pitem = (spgVacPendingItem *) palloc(sizeof(spgVacPendingItem)); + pitem->tid = *tid; + pitem->done = false; + pitem->next = NULL; + *listLink = pitem; +} + +/* + * Clear pendingList + */ +static void +spgClearPendingList(spgBulkDeleteState *bds) +{ + spgVacPendingItem *pitem; + spgVacPendingItem *nitem; + + for (pitem = bds->pendingList; pitem != NULL; pitem = nitem) + { + nitem = pitem->next; + /* All items in list should have been dealt with */ + Assert(pitem->done); + pfree(pitem); + } + bds->pendingList = NULL; +} + +/* + * Vacuum a regular (non-root) leaf page + * + * We must delete tuples that are targeted for deletion by the VACUUM, + * but not move any tuples that are referenced by outside links; we assume + * those are the ones that are heads of chains. + * + * If we find a REDIRECT that was made by a concurrently-running transaction, + * we must add its target TID to pendingList. (We don't try to visit the + * target immediately, first because we don't want VACUUM locking more than + * one buffer at a time, and second because the duplicate-filtering logic + * in spgAddPendingTID is useful to ensure we can't get caught in an infinite + * loop in the face of continuous concurrent insertions.) + * + * If forPending is true, we are examining the page as a consequence of + * chasing a redirect link, not as part of the normal sequential scan. + * We still vacuum the page normally, but we don't increment the stats + * about live tuples; else we'd double-count those tuples, since the page + * has been or will be visited in the sequential scan as well. 
+ */ +static void +vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer, + bool forPending) +{ + Page page = BufferGetPage(buffer); + spgxlogVacuumLeaf xlrec; + OffsetNumber toDead[MaxIndexTuplesPerPage]; + OffsetNumber toPlaceholder[MaxIndexTuplesPerPage]; + OffsetNumber moveSrc[MaxIndexTuplesPerPage]; + OffsetNumber moveDest[MaxIndexTuplesPerPage]; + OffsetNumber chainSrc[MaxIndexTuplesPerPage]; + OffsetNumber chainDest[MaxIndexTuplesPerPage]; + OffsetNumber predecessor[MaxIndexTuplesPerPage + 1]; + bool deletable[MaxIndexTuplesPerPage + 1]; + int nDeletable; + OffsetNumber i, + max = PageGetMaxOffsetNumber(page); + + memset(predecessor, 0, sizeof(predecessor)); + memset(deletable, 0, sizeof(deletable)); + nDeletable = 0; + + /* Scan page, identify tuples to delete, accumulate stats */ + for (i = FirstOffsetNumber; i <= max; i++) + { + SpGistLeafTuple lt; + + lt = (SpGistLeafTuple) PageGetItem(page, + PageGetItemId(page, i)); + if (lt->tupstate == SPGIST_LIVE) + { + Assert(ItemPointerIsValid(<->heapPtr)); + + if (bds->callback(<->heapPtr, bds->callback_state)) + { + bds->stats->tuples_removed += 1; + deletable[i] = true; + nDeletable++; + } + else + { + if (!forPending) + bds->stats->num_index_tuples += 1; + } + + /* Form predecessor map, too */ + if (SGLT_GET_NEXTOFFSET(lt) != InvalidOffsetNumber) + { + /* paranoia about corrupted chain links */ + if (SGLT_GET_NEXTOFFSET(lt) < FirstOffsetNumber || + SGLT_GET_NEXTOFFSET(lt) > max || + predecessor[SGLT_GET_NEXTOFFSET(lt)] != InvalidOffsetNumber) + elog(ERROR, "inconsistent tuple chain links in page %u of index \"%s\"", + BufferGetBlockNumber(buffer), + RelationGetRelationName(index)); + predecessor[SGLT_GET_NEXTOFFSET(lt)] = i; + } + } + else if (lt->tupstate == SPGIST_REDIRECT) + { + SpGistDeadTuple dt = (SpGistDeadTuple) lt; + + Assert(SGLT_GET_NEXTOFFSET(dt) == InvalidOffsetNumber); + Assert(ItemPointerIsValid(&dt->pointer)); + + /* + * Add target TID to pending list if the redirection could have + * happened since VACUUM started. + * + * Note: we could make a tighter test by seeing if the xid is + * "running" according to the active snapshot; but snapmgr.c + * doesn't currently export a suitable API, and it's not entirely + * clear that a tighter test is worth the cycles anyway. + */ + if (TransactionIdFollowsOrEquals(dt->xid, bds->myXmin)) + spgAddPendingTID(bds, &dt->pointer); + } + else + { + Assert(SGLT_GET_NEXTOFFSET(lt) == InvalidOffsetNumber); + } + } + + if (nDeletable == 0) + return; /* nothing more to do */ + + /*---------- + * Figure out exactly what we have to do. We do this separately from + * actually modifying the page, mainly so that we have a representation + * that can be dumped into WAL and then the replay code can do exactly + * the same thing. The output of this step consists of six arrays + * describing four kinds of operations, to be performed in this order: + * + * toDead[]: tuple numbers to be replaced with DEAD tuples + * toPlaceholder[]: tuple numbers to be replaced with PLACEHOLDER tuples + * moveSrc[]: tuple numbers that need to be relocated to another offset + * (replacing the tuple there) and then replaced with PLACEHOLDER tuples + * moveDest[]: new locations for moveSrc tuples + * chainSrc[]: tuple numbers whose chain links (nextOffset) need updates + * chainDest[]: new values of nextOffset for chainSrc members + * + * It's easiest to figure out what we have to do by processing tuple + * chains, so we iterate over all the tuples (not just the deletable + * ones!) 
to identify chain heads, then chase down each chain and make + * work item entries for deletable tuples within the chain. + *---------- + */ + xlrec.nDead = xlrec.nPlaceholder = xlrec.nMove = xlrec.nChain = 0; + + for (i = FirstOffsetNumber; i <= max; i++) + { + SpGistLeafTuple head; + bool interveningDeletable; + OffsetNumber prevLive; + OffsetNumber j; + + head = (SpGistLeafTuple) PageGetItem(page, + PageGetItemId(page, i)); + if (head->tupstate != SPGIST_LIVE) + continue; /* can't be a chain member */ + if (predecessor[i] != 0) + continue; /* not a chain head */ + + /* initialize ... */ + interveningDeletable = false; + prevLive = deletable[i] ? InvalidOffsetNumber : i; + + /* scan down the chain ... */ + j = SGLT_GET_NEXTOFFSET(head); + while (j != InvalidOffsetNumber) + { + SpGistLeafTuple lt; + + lt = (SpGistLeafTuple) PageGetItem(page, + PageGetItemId(page, j)); + if (lt->tupstate != SPGIST_LIVE) + { + /* all tuples in chain should be live */ + elog(ERROR, "unexpected SPGiST tuple state: %d", + lt->tupstate); + } + + if (deletable[j]) + { + /* This tuple should be replaced by a placeholder */ + toPlaceholder[xlrec.nPlaceholder] = j; + xlrec.nPlaceholder++; + /* previous live tuple's chain link will need an update */ + interveningDeletable = true; + } + else if (prevLive == InvalidOffsetNumber) + { + /* + * This is the first live tuple in the chain. It has to move + * to the head position. + */ + moveSrc[xlrec.nMove] = j; + moveDest[xlrec.nMove] = i; + xlrec.nMove++; + /* Chain updates will be applied after the move */ + prevLive = i; + interveningDeletable = false; + } + else + { + /* + * Second or later live tuple. Arrange to re-chain it to the + * previous live one, if there was a gap. + */ + if (interveningDeletable) + { + chainSrc[xlrec.nChain] = prevLive; + chainDest[xlrec.nChain] = j; + xlrec.nChain++; + } + prevLive = j; + interveningDeletable = false; + } + + j = SGLT_GET_NEXTOFFSET(lt); + } + + if (prevLive == InvalidOffsetNumber) + { + /* The chain is entirely removable, so we need a DEAD tuple */ + toDead[xlrec.nDead] = i; + xlrec.nDead++; + } + else if (interveningDeletable) + { + /* One or more deletions at end of chain, so close it off */ + chainSrc[xlrec.nChain] = prevLive; + chainDest[xlrec.nChain] = InvalidOffsetNumber; + xlrec.nChain++; + } + } + + /* sanity check ... */ + if (nDeletable != xlrec.nDead + xlrec.nPlaceholder + xlrec.nMove) + elog(ERROR, "inconsistent counts of deletable tuples"); + + /* Do the updates */ + START_CRIT_SECTION(); + + spgPageIndexMultiDelete(&bds->spgstate, page, + toDead, xlrec.nDead, + SPGIST_DEAD, SPGIST_DEAD, + InvalidBlockNumber, InvalidOffsetNumber); + + spgPageIndexMultiDelete(&bds->spgstate, page, + toPlaceholder, xlrec.nPlaceholder, + SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER, + InvalidBlockNumber, InvalidOffsetNumber); + + /* + * We implement the move step by swapping the line pointers of the source + * and target tuples, then replacing the newly-source tuples with + * placeholders. This is perhaps unduly friendly with the page data + * representation, but it's fast and doesn't risk page overflow when a + * tuple to be relocated is large. 
+ */ + for (i = 0; i < xlrec.nMove; i++) + { + ItemId idSrc = PageGetItemId(page, moveSrc[i]); + ItemId idDest = PageGetItemId(page, moveDest[i]); + ItemIdData tmp; + + tmp = *idSrc; + *idSrc = *idDest; + *idDest = tmp; + } + + spgPageIndexMultiDelete(&bds->spgstate, page, + moveSrc, xlrec.nMove, + SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER, + InvalidBlockNumber, InvalidOffsetNumber); + + for (i = 0; i < xlrec.nChain; i++) + { + SpGistLeafTuple lt; + + lt = (SpGistLeafTuple) PageGetItem(page, + PageGetItemId(page, chainSrc[i])); + Assert(lt->tupstate == SPGIST_LIVE); + SGLT_SET_NEXTOFFSET(lt, chainDest[i]); + } + + MarkBufferDirty(buffer); + + if (RelationNeedsWAL(index)) + { + XLogRecPtr recptr; + + XLogBeginInsert(); + + STORE_STATE(&bds->spgstate, xlrec.stateSrc); + + XLogRegisterData((char *) &xlrec, SizeOfSpgxlogVacuumLeaf); + /* sizeof(xlrec) should be a multiple of sizeof(OffsetNumber) */ + XLogRegisterData((char *) toDead, sizeof(OffsetNumber) * xlrec.nDead); + XLogRegisterData((char *) toPlaceholder, sizeof(OffsetNumber) * xlrec.nPlaceholder); + XLogRegisterData((char *) moveSrc, sizeof(OffsetNumber) * xlrec.nMove); + XLogRegisterData((char *) moveDest, sizeof(OffsetNumber) * xlrec.nMove); + XLogRegisterData((char *) chainSrc, sizeof(OffsetNumber) * xlrec.nChain); + XLogRegisterData((char *) chainDest, sizeof(OffsetNumber) * xlrec.nChain); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_LEAF); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); +} + +/* + * Vacuum a root page when it is also a leaf + * + * On the root, we just delete any dead leaf tuples; no fancy business + */ +static void +vacuumLeafRoot(spgBulkDeleteState *bds, Relation index, Buffer buffer) +{ + Page page = BufferGetPage(buffer); + spgxlogVacuumRoot xlrec; + OffsetNumber toDelete[MaxIndexTuplesPerPage]; + OffsetNumber i, + max = PageGetMaxOffsetNumber(page); + + xlrec.nDelete = 0; + + /* Scan page, identify tuples to delete, accumulate stats */ + for (i = FirstOffsetNumber; i <= max; i++) + { + SpGistLeafTuple lt; + + lt = (SpGistLeafTuple) PageGetItem(page, + PageGetItemId(page, i)); + if (lt->tupstate == SPGIST_LIVE) + { + Assert(ItemPointerIsValid(<->heapPtr)); + + if (bds->callback(<->heapPtr, bds->callback_state)) + { + bds->stats->tuples_removed += 1; + toDelete[xlrec.nDelete] = i; + xlrec.nDelete++; + } + else + { + bds->stats->num_index_tuples += 1; + } + } + else + { + /* all tuples on root should be live */ + elog(ERROR, "unexpected SPGiST tuple state: %d", + lt->tupstate); + } + } + + if (xlrec.nDelete == 0) + return; /* nothing more to do */ + + /* Do the update */ + START_CRIT_SECTION(); + + /* The tuple numbers are in order, so we can use PageIndexMultiDelete */ + PageIndexMultiDelete(page, toDelete, xlrec.nDelete); + + MarkBufferDirty(buffer); + + if (RelationNeedsWAL(index)) + { + XLogRecPtr recptr; + + XLogBeginInsert(); + + /* Prepare WAL record */ + STORE_STATE(&bds->spgstate, xlrec.stateSrc); + + XLogRegisterData((char *) &xlrec, SizeOfSpgxlogVacuumRoot); + /* sizeof(xlrec) should be a multiple of sizeof(OffsetNumber) */ + XLogRegisterData((char *) toDelete, + sizeof(OffsetNumber) * xlrec.nDelete); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_ROOT); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); +} + +/* + * Clean up redirect and placeholder tuples on the given page + * + * Redirect tuples can be marked placeholder once they're old enough. 
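+ * ("Old enough" is judged with GlobalVisTestIsRemovableXid() below: the
+ * redirect's creating XID must be older than any horizon a concurrent scan
+ * could still care about.)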
+ * Placeholder tuples can be removed if it won't change the offsets of + * non-placeholder ones. + * + * Unlike the routines above, this works on both leaf and inner pages. + */ +static void +vacuumRedirectAndPlaceholder(Relation index, Buffer buffer) +{ + Page page = BufferGetPage(buffer); + SpGistPageOpaque opaque = SpGistPageGetOpaque(page); + OffsetNumber i, + max = PageGetMaxOffsetNumber(page), + firstPlaceholder = InvalidOffsetNumber; + bool hasNonPlaceholder = false; + bool hasUpdate = false; + OffsetNumber itemToPlaceholder[MaxIndexTuplesPerPage]; + OffsetNumber itemnos[MaxIndexTuplesPerPage]; + spgxlogVacuumRedirect xlrec; + GlobalVisState *vistest; + + xlrec.nToPlaceholder = 0; + xlrec.newestRedirectXid = InvalidTransactionId; + + /* XXX: providing heap relation would allow more pruning */ + vistest = GlobalVisTestFor(NULL); + + START_CRIT_SECTION(); + + /* + * Scan backwards to convert old redirection tuples to placeholder tuples, + * and identify location of last non-placeholder tuple while at it. + */ + for (i = max; + i >= FirstOffsetNumber && + (opaque->nRedirection > 0 || !hasNonPlaceholder); + i--) + { + SpGistDeadTuple dt; + + dt = (SpGistDeadTuple) PageGetItem(page, PageGetItemId(page, i)); + + if (dt->tupstate == SPGIST_REDIRECT && + GlobalVisTestIsRemovableXid(vistest, dt->xid)) + { + dt->tupstate = SPGIST_PLACEHOLDER; + Assert(opaque->nRedirection > 0); + opaque->nRedirection--; + opaque->nPlaceholder++; + + /* remember newest XID among the removed redirects */ + if (!TransactionIdIsValid(xlrec.newestRedirectXid) || + TransactionIdPrecedes(xlrec.newestRedirectXid, dt->xid)) + xlrec.newestRedirectXid = dt->xid; + + ItemPointerSetInvalid(&dt->pointer); + + itemToPlaceholder[xlrec.nToPlaceholder] = i; + xlrec.nToPlaceholder++; + + hasUpdate = true; + } + + if (dt->tupstate == SPGIST_PLACEHOLDER) + { + if (!hasNonPlaceholder) + firstPlaceholder = i; + } + else + { + hasNonPlaceholder = true; + } + } + + /* + * Any placeholder tuples at the end of page can safely be removed. We + * can't remove ones before the last non-placeholder, though, because we + * can't alter the offset numbers of non-placeholder tuples. + */ + if (firstPlaceholder != InvalidOffsetNumber) + { + /* + * We do not store this array to rdata because it's easy to recreate. 
+ */ + for (i = firstPlaceholder; i <= max; i++) + itemnos[i - firstPlaceholder] = i; + + i = max - firstPlaceholder + 1; + Assert(opaque->nPlaceholder >= i); + opaque->nPlaceholder -= i; + + /* The array is surely sorted, so can use PageIndexMultiDelete */ + PageIndexMultiDelete(page, itemnos, i); + + hasUpdate = true; + } + + xlrec.firstPlaceholder = firstPlaceholder; + + if (hasUpdate) + MarkBufferDirty(buffer); + + if (hasUpdate && RelationNeedsWAL(index)) + { + XLogRecPtr recptr; + + XLogBeginInsert(); + + XLogRegisterData((char *) &xlrec, SizeOfSpgxlogVacuumRedirect); + XLogRegisterData((char *) itemToPlaceholder, + sizeof(OffsetNumber) * xlrec.nToPlaceholder); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_REDIRECT); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); +} + +/* + * Process one page during a bulkdelete scan + */ +static void +spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno) +{ + Relation index = bds->info->index; + Buffer buffer; + Page page; + + /* call vacuum_delay_point while not holding any buffer lock */ + vacuum_delay_point(); + + buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, + RBM_NORMAL, bds->info->strategy); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + page = (Page) BufferGetPage(buffer); + + if (PageIsNew(page)) + { + /* + * We found an all-zero page, which could happen if the database + * crashed just after extending the file. Recycle it. + */ + } + else if (PageIsEmpty(page)) + { + /* nothing to do */ + } + else if (SpGistPageIsLeaf(page)) + { + if (SpGistBlockIsRoot(blkno)) + { + vacuumLeafRoot(bds, index, buffer); + /* no need for vacuumRedirectAndPlaceholder */ + } + else + { + vacuumLeafPage(bds, index, buffer, false); + vacuumRedirectAndPlaceholder(index, buffer); + } + } + else + { + /* inner page */ + vacuumRedirectAndPlaceholder(index, buffer); + } + + /* + * The root pages must never be deleted, nor marked as available in FSM, + * because we don't want them ever returned by a search for a place to put + * a new tuple. Otherwise, check for empty page, and make sure the FSM + * knows about it. 
+ */ + if (!SpGistBlockIsRoot(blkno)) + { + if (PageIsNew(page) || PageIsEmpty(page)) + { + RecordFreeIndexPage(index, blkno); + bds->stats->pages_deleted++; + } + else + { + SpGistSetLastUsedPage(index, buffer); + bds->lastFilledBlock = blkno; + } + } + + UnlockReleaseBuffer(buffer); +} + +/* + * Process the pending-TID list between pages of the main scan + */ +static void +spgprocesspending(spgBulkDeleteState *bds) +{ + Relation index = bds->info->index; + spgVacPendingItem *pitem; + spgVacPendingItem *nitem; + BlockNumber blkno; + Buffer buffer; + Page page; + + for (pitem = bds->pendingList; pitem != NULL; pitem = pitem->next) + { + if (pitem->done) + continue; /* ignore already-done items */ + + /* call vacuum_delay_point while not holding any buffer lock */ + vacuum_delay_point(); + + /* examine the referenced page */ + blkno = ItemPointerGetBlockNumber(&pitem->tid); + buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, + RBM_NORMAL, bds->info->strategy); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + page = (Page) BufferGetPage(buffer); + + if (PageIsNew(page) || SpGistPageIsDeleted(page)) + { + /* Probably shouldn't happen, but ignore it */ + } + else if (SpGistPageIsLeaf(page)) + { + if (SpGistBlockIsRoot(blkno)) + { + /* this should definitely not happen */ + elog(ERROR, "redirection leads to root page of index \"%s\"", + RelationGetRelationName(index)); + } + + /* deal with any deletable tuples */ + vacuumLeafPage(bds, index, buffer, true); + /* might as well do this while we are here */ + vacuumRedirectAndPlaceholder(index, buffer); + + SpGistSetLastUsedPage(index, buffer); + + /* + * We can mark as done not only this item, but any later ones + * pointing at the same page, since we vacuumed the whole page. + */ + pitem->done = true; + for (nitem = pitem->next; nitem != NULL; nitem = nitem->next) + { + if (ItemPointerGetBlockNumber(&nitem->tid) == blkno) + nitem->done = true; + } + } + else + { + /* + * On an inner page, visit the referenced inner tuple and add all + * its downlinks to the pending list. We might have pending items + * for more than one inner tuple on the same page (in fact this is + * pretty likely given the way space allocation works), so get + * them all while we are here. 
+ */ + for (nitem = pitem; nitem != NULL; nitem = nitem->next) + { + if (nitem->done) + continue; + if (ItemPointerGetBlockNumber(&nitem->tid) == blkno) + { + OffsetNumber offset; + SpGistInnerTuple innerTuple; + + offset = ItemPointerGetOffsetNumber(&nitem->tid); + innerTuple = (SpGistInnerTuple) PageGetItem(page, + PageGetItemId(page, offset)); + if (innerTuple->tupstate == SPGIST_LIVE) + { + SpGistNodeTuple node; + int i; + + SGITITERATE(innerTuple, i, node) + { + if (ItemPointerIsValid(&node->t_tid)) + spgAddPendingTID(bds, &node->t_tid); + } + } + else if (innerTuple->tupstate == SPGIST_REDIRECT) + { + /* transfer attention to redirect point */ + spgAddPendingTID(bds, + &((SpGistDeadTuple) innerTuple)->pointer); + } + else + elog(ERROR, "unexpected SPGiST tuple state: %d", + innerTuple->tupstate); + + nitem->done = true; + } + } + } + + UnlockReleaseBuffer(buffer); + } + + spgClearPendingList(bds); +} + +/* + * Perform a bulkdelete scan + */ +static void +spgvacuumscan(spgBulkDeleteState *bds) +{ + Relation index = bds->info->index; + bool needLock; + BlockNumber num_pages, + blkno; + + /* Finish setting up spgBulkDeleteState */ + initSpGistState(&bds->spgstate, index); + bds->pendingList = NULL; + bds->myXmin = GetActiveSnapshot()->xmin; + bds->lastFilledBlock = SPGIST_LAST_FIXED_BLKNO; + + /* + * Reset counts that will be incremented during the scan; needed in case + * of multiple scans during a single VACUUM command + */ + bds->stats->estimated_count = false; + bds->stats->num_index_tuples = 0; + bds->stats->pages_deleted = 0; + + /* We can skip locking for new or temp relations */ + needLock = !RELATION_IS_LOCAL(index); + + /* + * The outer loop iterates over all index pages except the metapage, in + * physical order (we hope the kernel will cooperate in providing + * read-ahead for speed). It is critical that we visit all leaf pages, + * including ones added after we start the scan, else we might fail to + * delete some deletable tuples. See more extensive comments about this + * in btvacuumscan(). + */ + blkno = SPGIST_METAPAGE_BLKNO + 1; + for (;;) + { + /* Get the current relation length */ + if (needLock) + LockRelationForExtension(index, ExclusiveLock); + num_pages = RelationGetNumberOfBlocks(index); + if (needLock) + UnlockRelationForExtension(index, ExclusiveLock); + + /* Quit if we've scanned the whole relation */ + if (blkno >= num_pages) + break; + /* Iterate over pages, then loop back to recheck length */ + for (; blkno < num_pages; blkno++) + { + spgvacuumpage(bds, blkno); + /* empty the pending-list after each page */ + if (bds->pendingList != NULL) + spgprocesspending(bds); + } + } + + /* Propagate local lastUsedPages cache to metablock */ + SpGistUpdateMetaPage(index); + + /* + * If we found any empty pages (and recorded them in the FSM), then + * forcibly update the upper-level FSM pages to ensure that searchers can + * find them. It's possible that the pages were also found during + * previous scans and so this is a waste of time, but it's cheap enough + * relative to scanning the index that it shouldn't matter much, and + * making sure that free pages are available sooner not later seems + * worthwhile. + * + * Note that if no empty pages exist, we don't bother vacuuming the FSM at + * all. + */ + if (bds->stats->pages_deleted > 0) + IndexFreeSpaceMapVacuum(index); + + /* + * Truncate index if possible + * + * XXX disabled because it's unsafe due to possible concurrent inserts. 
+ * We'd have to rescan the pages to make sure they're still empty, and it + * doesn't seem worth it. Note that btree doesn't do this either. + * + * Another reason not to truncate is that it could invalidate the cached + * pages-with-freespace pointers in the metapage and other backends' + * relation caches, that is leave them pointing to nonexistent pages. + * Adding RelationGetNumberOfBlocks calls to protect the places that use + * those pointers would be unduly expensive. + */ +#ifdef NOT_USED + if (num_pages > bds->lastFilledBlock + 1) + { + BlockNumber lastBlock = num_pages - 1; + + num_pages = bds->lastFilledBlock + 1; + RelationTruncate(index, num_pages); + bds->stats->pages_removed += lastBlock - bds->lastFilledBlock; + bds->stats->pages_deleted -= lastBlock - bds->lastFilledBlock; + } +#endif + + /* Report final stats */ + bds->stats->num_pages = num_pages; + bds->stats->pages_newly_deleted = bds->stats->pages_deleted; + bds->stats->pages_free = bds->stats->pages_deleted; +} + +/* + * Bulk deletion of all index entries pointing to a set of heap tuples. + * The set of target tuples is specified via a callback routine that tells + * whether any given heap tuple (identified by ItemPointer) is being deleted. + * + * Result: a palloc'd struct containing statistical info for VACUUM displays. + */ +IndexBulkDeleteResult * +spgbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, void *callback_state) +{ + spgBulkDeleteState bds; + + /* allocate stats if first time through, else re-use existing struct */ + if (stats == NULL) + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + bds.info = info; + bds.stats = stats; + bds.callback = callback; + bds.callback_state = callback_state; + + spgvacuumscan(&bds); + + return stats; +} + +/* Dummy callback to delete no tuples during spgvacuumcleanup */ +static bool +dummy_callback(ItemPointer itemptr, void *state) +{ + return false; +} + +/* + * Post-VACUUM cleanup. + * + * Result: a palloc'd struct containing statistical info for VACUUM displays. + */ +IndexBulkDeleteResult * +spgvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) +{ + spgBulkDeleteState bds; + + /* No-op in ANALYZE ONLY mode */ + if (info->analyze_only) + return stats; + + /* + * We don't need to scan the index if there was a preceding bulkdelete + * pass. Otherwise, make a pass that won't delete any live tuples, but + * might still accomplish useful stuff with redirect/placeholder cleanup + * and/or FSM housekeeping, and in any case will provide stats. + */ + if (stats == NULL) + { + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + bds.info = info; + bds.stats = stats; + bds.callback = dummy_callback; + bds.callback_state = NULL; + + spgvacuumscan(&bds); + } + + /* + * It's quite possible for us to be fooled by concurrent tuple moves into + * double-counting some index tuples, so disbelieve any total that exceeds + * the underlying heap's count ... if we know that accurately. Otherwise + * this might just make matters worse. 
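+ * (That is, clamp num_index_tuples to info->num_heap_tuples, but only when
+ * the heap count is itself not an estimate.)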
+ */ + if (!info->estimated_count) + { + if (stats->num_index_tuples > info->num_heap_tuples) + stats->num_index_tuples = info->num_heap_tuples; + } + + return stats; +} diff --git a/src/backend/access/spgist/spgvalidate.c b/src/backend/access/spgist/spgvalidate.c new file mode 100644 index 0000000..472a28b --- /dev/null +++ b/src/backend/access/spgist/spgvalidate.c @@ -0,0 +1,392 @@ +/*------------------------------------------------------------------------- + * + * spgvalidate.c + * Opclass validator for SP-GiST. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/spgist/spgvalidate.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/amvalidate.h" +#include "access/htup_details.h" +#include "access/spgist_private.h" +#include "catalog/pg_amop.h" +#include "catalog/pg_amproc.h" +#include "catalog/pg_opclass.h" +#include "catalog/pg_opfamily.h" +#include "catalog/pg_type.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/regproc.h" +#include "utils/syscache.h" + + +/* + * Validator for an SP-GiST opclass. + * + * Some of the checks done here cover the whole opfamily, and therefore are + * redundant when checking each opclass in a family. But they don't run long + * enough to be much of a problem, so we accept the duplication rather than + * complicate the amvalidate API. + */ +bool +spgvalidate(Oid opclassoid) +{ + bool result = true; + HeapTuple classtup; + Form_pg_opclass classform; + Oid opfamilyoid; + Oid opcintype; + Oid opckeytype; + char *opclassname; + HeapTuple familytup; + Form_pg_opfamily familyform; + char *opfamilyname; + CatCList *proclist, + *oprlist; + List *grouplist; + OpFamilyOpFuncGroup *opclassgroup; + int i; + ListCell *lc; + spgConfigIn configIn; + spgConfigOut configOut; + Oid configOutLefttype = InvalidOid; + Oid configOutRighttype = InvalidOid; + Oid configOutLeafType = InvalidOid; + + /* Fetch opclass information */ + classtup = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclassoid)); + if (!HeapTupleIsValid(classtup)) + elog(ERROR, "cache lookup failed for operator class %u", opclassoid); + classform = (Form_pg_opclass) GETSTRUCT(classtup); + + opfamilyoid = classform->opcfamily; + opcintype = classform->opcintype; + opckeytype = classform->opckeytype; + opclassname = NameStr(classform->opcname); + + /* Fetch opfamily information */ + familytup = SearchSysCache1(OPFAMILYOID, ObjectIdGetDatum(opfamilyoid)); + if (!HeapTupleIsValid(familytup)) + elog(ERROR, "cache lookup failed for operator family %u", opfamilyoid); + familyform = (Form_pg_opfamily) GETSTRUCT(familytup); + + opfamilyname = NameStr(familyform->opfname); + + /* Fetch all operators and support functions of the opfamily */ + oprlist = SearchSysCacheList1(AMOPSTRATEGY, ObjectIdGetDatum(opfamilyoid)); + proclist = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(opfamilyoid)); + grouplist = identify_opfamily_groups(oprlist, proclist); + + /* Check individual support functions */ + for (i = 0; i < proclist->n_members; i++) + { + HeapTuple proctup = &proclist->members[i]->tuple; + Form_pg_amproc procform = (Form_pg_amproc) GETSTRUCT(proctup); + bool ok; + + /* + * All SP-GiST support functions should be registered with matching + * left/right types + */ + if (procform->amproclefttype != procform->amprocrighttype) + { + ereport(INFO, + 
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains support function %s with different left and right input types", + opfamilyname, "spgist", + format_procedure(procform->amproc)))); + result = false; + } + + /* Check procedure numbers and function signatures */ + switch (procform->amprocnum) + { + case SPGIST_CONFIG_PROC: + ok = check_amproc_signature(procform->amproc, VOIDOID, true, + 2, 2, INTERNALOID, INTERNALOID); + configIn.attType = procform->amproclefttype; + memset(&configOut, 0, sizeof(configOut)); + + OidFunctionCall2(procform->amproc, + PointerGetDatum(&configIn), + PointerGetDatum(&configOut)); + + configOutLefttype = procform->amproclefttype; + configOutRighttype = procform->amprocrighttype; + + /* Default leaf type is opckeytype or input type */ + if (OidIsValid(opckeytype)) + configOutLeafType = opckeytype; + else + configOutLeafType = procform->amproclefttype; + + /* If some other leaf datum type is specified, warn */ + if (OidIsValid(configOut.leafType) && + configOutLeafType != configOut.leafType) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("SP-GiST leaf data type %s does not match declared type %s", + format_type_be(configOut.leafType), + format_type_be(configOutLeafType)))); + result = false; + configOutLeafType = configOut.leafType; + } + + /* + * When leaf and attribute types are the same, compress + * function is not required and we set corresponding bit in + * functionset for later group consistency check. + */ + if (configOutLeafType == configIn.attType) + { + foreach(lc, grouplist) + { + OpFamilyOpFuncGroup *group = lfirst(lc); + + if (group->lefttype == procform->amproclefttype && + group->righttype == procform->amprocrighttype) + { + group->functionset |= + ((uint64) 1) << SPGIST_COMPRESS_PROC; + break; + } + } + } + break; + case SPGIST_CHOOSE_PROC: + case SPGIST_PICKSPLIT_PROC: + case SPGIST_INNER_CONSISTENT_PROC: + ok = check_amproc_signature(procform->amproc, VOIDOID, true, + 2, 2, INTERNALOID, INTERNALOID); + break; + case SPGIST_LEAF_CONSISTENT_PROC: + ok = check_amproc_signature(procform->amproc, BOOLOID, true, + 2, 2, INTERNALOID, INTERNALOID); + break; + case SPGIST_COMPRESS_PROC: + if (configOutLefttype != procform->amproclefttype || + configOutRighttype != procform->amprocrighttype) + ok = false; + else + ok = check_amproc_signature(procform->amproc, + configOutLeafType, true, + 1, 1, procform->amproclefttype); + break; + case SPGIST_OPTIONS_PROC: + ok = check_amoptsproc_signature(procform->amproc); + break; + default: + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains function %s with invalid support number %d", + opfamilyname, "spgist", + format_procedure(procform->amproc), + procform->amprocnum))); + result = false; + continue; /* don't want additional message */ + } + + if (!ok) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains function %s with wrong signature for support number %d", + opfamilyname, "spgist", + format_procedure(procform->amproc), + procform->amprocnum))); + result = false; + } + } + + /* Check individual operators */ + for (i = 0; i < oprlist->n_members; i++) + { + HeapTuple oprtup = &oprlist->members[i]->tuple; + Form_pg_amop oprform = (Form_pg_amop) GETSTRUCT(oprtup); + Oid op_rettype; + + /* TODO: Check that only allowed strategy numbers exist */ + if (oprform->amopstrategy < 1 || 
oprform->amopstrategy > 63) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains operator %s with invalid strategy number %d", + opfamilyname, "spgist", + format_operator(oprform->amopopr), + oprform->amopstrategy))); + result = false; + } + + /* spgist supports ORDER BY operators */ + if (oprform->amoppurpose != AMOP_SEARCH) + { + /* ... and operator result must match the claimed btree opfamily */ + op_rettype = get_op_rettype(oprform->amopopr); + if (!opfamily_can_sort_type(oprform->amopsortfamily, op_rettype)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains invalid ORDER BY specification for operator %s", + opfamilyname, "spgist", + format_operator(oprform->amopopr)))); + result = false; + } + } + else + op_rettype = BOOLOID; + + /* Check operator signature --- same for all spgist strategies */ + if (!check_amop_signature(oprform->amopopr, op_rettype, + oprform->amoplefttype, + oprform->amoprighttype)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains operator %s with wrong signature", + opfamilyname, "spgist", + format_operator(oprform->amopopr)))); + result = false; + } + } + + /* Now check for inconsistent groups of operators/functions */ + opclassgroup = NULL; + foreach(lc, grouplist) + { + OpFamilyOpFuncGroup *thisgroup = (OpFamilyOpFuncGroup *) lfirst(lc); + + /* Remember the group exactly matching the test opclass */ + if (thisgroup->lefttype == opcintype && + thisgroup->righttype == opcintype) + opclassgroup = thisgroup; + + /* + * Complain if there are any datatype pairs with functions but no + * operators. This is about the best we can do for now to detect + * missing operators. + */ + if (thisgroup->operatorset == 0) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s is missing operator(s) for types %s and %s", + opfamilyname, "spgist", + format_type_be(thisgroup->lefttype), + format_type_be(thisgroup->righttype)))); + result = false; + } + + /* + * Complain if we're missing functions for any datatype, remembering + * that SP-GiST doesn't use cross-type support functions. + */ + if (thisgroup->lefttype != thisgroup->righttype) + continue; + + for (i = 1; i <= SPGISTNProc; i++) + { + if ((thisgroup->functionset & (((uint64) 1) << i)) != 0) + continue; /* got it */ + if (i == SPGIST_OPTIONS_PROC) + continue; /* optional method */ + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s is missing support function %d for type %s", + opfamilyname, "spgist", i, + format_type_be(thisgroup->lefttype)))); + result = false; + } + } + + /* Check that the originally-named opclass is supported */ + /* (if group is there, we already checked it adequately above) */ + if (!opclassgroup) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator class \"%s\" of access method %s is missing operator(s)", + opclassname, "spgist"))); + result = false; + } + + ReleaseCatCacheList(proclist); + ReleaseCatCacheList(oprlist); + ReleaseSysCache(familytup); + ReleaseSysCache(classtup); + + return result; +} + +/* + * Prechecking function for adding operators/functions to an SP-GiST opfamily. 
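+ * (This routine only assigns dependency types to the new members: operators
+ * and the optional support functions get soft dependencies on the opfamily,
+ * while the five required support functions get hard dependencies.)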
+ */ +void +spgadjustmembers(Oid opfamilyoid, + Oid opclassoid, + List *operators, + List *functions) +{ + ListCell *lc; + + /* + * Operator members of an SP-GiST opfamily should never have hard + * dependencies, since their connection to the opfamily depends only on + * what the support functions think, and that can be altered. For + * consistency, we make all soft dependencies point to the opfamily, + * though a soft dependency on the opclass would work as well in the + * CREATE OPERATOR CLASS case. + */ + foreach(lc, operators) + { + OpFamilyMember *op = (OpFamilyMember *) lfirst(lc); + + op->ref_is_hard = false; + op->ref_is_family = true; + op->refobjid = opfamilyoid; + } + + /* + * Required support functions should have hard dependencies. Preferably + * those are just dependencies on the opclass, but if we're in ALTER + * OPERATOR FAMILY, we leave the dependency pointing at the whole + * opfamily. (Given that SP-GiST opclasses generally don't share + * opfamilies, it seems unlikely to be worth working harder.) + */ + foreach(lc, functions) + { + OpFamilyMember *op = (OpFamilyMember *) lfirst(lc); + + switch (op->number) + { + case SPGIST_CONFIG_PROC: + case SPGIST_CHOOSE_PROC: + case SPGIST_PICKSPLIT_PROC: + case SPGIST_INNER_CONSISTENT_PROC: + case SPGIST_LEAF_CONSISTENT_PROC: + /* Required support function */ + op->ref_is_hard = true; + break; + case SPGIST_COMPRESS_PROC: + case SPGIST_OPTIONS_PROC: + /* Optional, so force it to be a soft family dependency */ + op->ref_is_hard = false; + op->ref_is_family = true; + op->refobjid = opfamilyoid; + break; + default: + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("support function number %d is invalid for access method %s", + op->number, "spgist"))); + break; + } + } +} diff --git a/src/backend/access/spgist/spgxlog.c b/src/backend/access/spgist/spgxlog.c new file mode 100644 index 0000000..3dfd2aa --- /dev/null +++ b/src/backend/access/spgist/spgxlog.c @@ -0,0 +1,1013 @@ +/*------------------------------------------------------------------------- + * + * spgxlog.c + * WAL replay logic for SP-GiST + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/spgist/spgxlog.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/bufmask.h" +#include "access/spgist_private.h" +#include "access/spgxlog.h" +#include "access/transam.h" +#include "access/xlog.h" +#include "access/xlogutils.h" +#include "storage/standby.h" +#include "utils/memutils.h" + + +static MemoryContext opCtx; /* working memory for operations */ + + +/* + * Prepare a dummy SpGistState, with just the minimum info needed for replay. + * + * At present, all we need is enough info to support spgFormDeadTuple(), + * plus the isBuild flag. + */ +static void +fillFakeState(SpGistState *state, spgxlogState stateSrc) +{ + memset(state, 0, sizeof(*state)); + + state->myXid = stateSrc.myXid; + state->isBuild = stateSrc.isBuild; + state->deadTupleStorage = palloc0(SGDTSIZE); +} + +/* + * Add a leaf tuple, or replace an existing placeholder tuple. This is used + * to replay SpGistPageAddNewItem() operations. If the offset points at an + * existing tuple, it had better be a placeholder tuple. 
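+ * (An existing placeholder is removed first, with the page's nPlaceholder
+ * count decremented, and the replacement tuple is then inserted at the same
+ * offset.)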
+ */ +static void +addOrReplaceTuple(Page page, Item tuple, int size, OffsetNumber offset) +{ + if (offset <= PageGetMaxOffsetNumber(page)) + { + SpGistDeadTuple dt = (SpGistDeadTuple) PageGetItem(page, + PageGetItemId(page, offset)); + + if (dt->tupstate != SPGIST_PLACEHOLDER) + elog(ERROR, "SPGiST tuple to be replaced is not a placeholder"); + + Assert(SpGistPageGetOpaque(page)->nPlaceholder > 0); + SpGistPageGetOpaque(page)->nPlaceholder--; + + PageIndexTupleDelete(page, offset); + } + + Assert(offset <= PageGetMaxOffsetNumber(page) + 1); + + if (PageAddItem(page, tuple, size, offset, false, false) != offset) + elog(ERROR, "failed to add item of size %u to SPGiST index page", + size); +} + +static void +spgRedoAddLeaf(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + char *ptr = XLogRecGetData(record); + spgxlogAddLeaf *xldata = (spgxlogAddLeaf *) ptr; + char *leafTuple; + SpGistLeafTupleData leafTupleHdr; + Buffer buffer; + Page page; + XLogRedoAction action; + + ptr += sizeof(spgxlogAddLeaf); + leafTuple = ptr; + /* the leaf tuple is unaligned, so make a copy to access its header */ + memcpy(&leafTupleHdr, leafTuple, sizeof(SpGistLeafTupleData)); + + /* + * In normal operation we would have both current and parent pages locked + * simultaneously; but in WAL replay it should be safe to update the leaf + * page before updating the parent. + */ + if (xldata->newPage) + { + buffer = XLogInitBufferForRedo(record, 0); + SpGistInitBuffer(buffer, + SPGIST_LEAF | (xldata->storesNulls ? SPGIST_NULLS : 0)); + action = BLK_NEEDS_REDO; + } + else + action = XLogReadBufferForRedo(record, 0, &buffer); + + if (action == BLK_NEEDS_REDO) + { + page = BufferGetPage(buffer); + + /* insert new tuple */ + if (xldata->offnumLeaf != xldata->offnumHeadLeaf) + { + /* normal cases, tuple was added by SpGistPageAddNewItem */ + addOrReplaceTuple(page, (Item) leafTuple, leafTupleHdr.size, + xldata->offnumLeaf); + + /* update head tuple's chain link if needed */ + if (xldata->offnumHeadLeaf != InvalidOffsetNumber) + { + SpGistLeafTuple head; + + head = (SpGistLeafTuple) PageGetItem(page, + PageGetItemId(page, xldata->offnumHeadLeaf)); + Assert(SGLT_GET_NEXTOFFSET(head) == SGLT_GET_NEXTOFFSET(&leafTupleHdr)); + SGLT_SET_NEXTOFFSET(head, xldata->offnumLeaf); + } + } + else + { + /* replacing a DEAD tuple */ + PageIndexTupleDelete(page, xldata->offnumLeaf); + if (PageAddItem(page, + (Item) leafTuple, leafTupleHdr.size, + xldata->offnumLeaf, false, false) != xldata->offnumLeaf) + elog(ERROR, "failed to add item of size %u to SPGiST index page", + leafTupleHdr.size); + } + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* update parent downlink if necessary */ + if (xldata->offnumParent != InvalidOffsetNumber) + { + if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) + { + SpGistInnerTuple tuple; + BlockNumber blknoLeaf; + + XLogRecGetBlockTag(record, 0, NULL, NULL, &blknoLeaf); + + page = BufferGetPage(buffer); + + tuple = (SpGistInnerTuple) PageGetItem(page, + PageGetItemId(page, xldata->offnumParent)); + + spgUpdateNodeLink(tuple, xldata->nodeI, + blknoLeaf, xldata->offnumLeaf); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + } +} + +static void +spgRedoMoveLeafs(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + char *ptr = XLogRecGetData(record); + spgxlogMoveLeafs *xldata = (spgxlogMoveLeafs *) ptr; + SpGistState state; + 
OffsetNumber *toDelete; + OffsetNumber *toInsert; + int nInsert; + Buffer buffer; + Page page; + XLogRedoAction action; + BlockNumber blknoDst; + + XLogRecGetBlockTag(record, 1, NULL, NULL, &blknoDst); + + fillFakeState(&state, xldata->stateSrc); + + nInsert = xldata->replaceDead ? 1 : xldata->nMoves + 1; + + ptr += SizeOfSpgxlogMoveLeafs; + toDelete = (OffsetNumber *) ptr; + ptr += sizeof(OffsetNumber) * xldata->nMoves; + toInsert = (OffsetNumber *) ptr; + ptr += sizeof(OffsetNumber) * nInsert; + + /* now ptr points to the list of leaf tuples */ + + /* + * In normal operation we would have all three pages (source, dest, and + * parent) locked simultaneously; but in WAL replay it should be safe to + * update them one at a time, as long as we do it in the right order. + */ + + /* Insert tuples on the dest page (do first, so redirect is valid) */ + if (xldata->newPage) + { + buffer = XLogInitBufferForRedo(record, 1); + SpGistInitBuffer(buffer, + SPGIST_LEAF | (xldata->storesNulls ? SPGIST_NULLS : 0)); + action = BLK_NEEDS_REDO; + } + else + action = XLogReadBufferForRedo(record, 1, &buffer); + + if (action == BLK_NEEDS_REDO) + { + int i; + + page = BufferGetPage(buffer); + + for (i = 0; i < nInsert; i++) + { + char *leafTuple; + SpGistLeafTupleData leafTupleHdr; + + /* + * the tuples are not aligned, so must copy to access the size + * field. + */ + leafTuple = ptr; + memcpy(&leafTupleHdr, leafTuple, + sizeof(SpGistLeafTupleData)); + + addOrReplaceTuple(page, (Item) leafTuple, + leafTupleHdr.size, toInsert[i]); + ptr += leafTupleHdr.size; + } + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* Delete tuples from the source page, inserting a redirection pointer */ + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = BufferGetPage(buffer); + + spgPageIndexMultiDelete(&state, page, toDelete, xldata->nMoves, + state.isBuild ? 
SPGIST_PLACEHOLDER : SPGIST_REDIRECT, + SPGIST_PLACEHOLDER, + blknoDst, + toInsert[nInsert - 1]); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* And update the parent downlink */ + if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO) + { + SpGistInnerTuple tuple; + + page = BufferGetPage(buffer); + + tuple = (SpGistInnerTuple) PageGetItem(page, + PageGetItemId(page, xldata->offnumParent)); + + spgUpdateNodeLink(tuple, xldata->nodeI, + blknoDst, toInsert[nInsert - 1]); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +static void +spgRedoAddNode(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + char *ptr = XLogRecGetData(record); + spgxlogAddNode *xldata = (spgxlogAddNode *) ptr; + char *innerTuple; + SpGistInnerTupleData innerTupleHdr; + SpGistState state; + Buffer buffer; + Page page; + XLogRedoAction action; + + ptr += sizeof(spgxlogAddNode); + innerTuple = ptr; + /* the tuple is unaligned, so make a copy to access its header */ + memcpy(&innerTupleHdr, innerTuple, sizeof(SpGistInnerTupleData)); + + fillFakeState(&state, xldata->stateSrc); + + if (!XLogRecHasBlockRef(record, 1)) + { + /* update in place */ + Assert(xldata->parentBlk == -1); + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = BufferGetPage(buffer); + + PageIndexTupleDelete(page, xldata->offnum); + if (PageAddItem(page, (Item) innerTuple, innerTupleHdr.size, + xldata->offnum, + false, false) != xldata->offnum) + elog(ERROR, "failed to add item of size %u to SPGiST index page", + innerTupleHdr.size); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + } + else + { + BlockNumber blkno; + BlockNumber blknoNew; + + XLogRecGetBlockTag(record, 0, NULL, NULL, &blkno); + XLogRecGetBlockTag(record, 1, NULL, NULL, &blknoNew); + + /* + * In normal operation we would have all three pages (source, dest, + * and parent) locked simultaneously; but in WAL replay it should be + * safe to update them one at a time, as long as we do it in the right + * order. We must insert the new tuple before replacing the old tuple + * with the redirect tuple. + */ + + /* Install new tuple first so redirect is valid */ + if (xldata->newPage) + { + /* AddNode is not used for nulls pages */ + buffer = XLogInitBufferForRedo(record, 1); + SpGistInitBuffer(buffer, 0); + action = BLK_NEEDS_REDO; + } + else + action = XLogReadBufferForRedo(record, 1, &buffer); + if (action == BLK_NEEDS_REDO) + { + page = BufferGetPage(buffer); + + addOrReplaceTuple(page, (Item) innerTuple, + innerTupleHdr.size, xldata->offnumNew); + + /* + * If parent is in this same page, update it now. 
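+ * (xldata->parentBlk identifies which registered block holds the parent
+ * tuple: 0 = original page, 1 = new page, 2 = a separate parent page, and
+ * -1 when the inner tuple was updated in place and no downlink change is
+ * needed.)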
+ */ + if (xldata->parentBlk == 1) + { + SpGistInnerTuple parentTuple; + + parentTuple = (SpGistInnerTuple) PageGetItem(page, + PageGetItemId(page, xldata->offnumParent)); + + spgUpdateNodeLink(parentTuple, xldata->nodeI, + blknoNew, xldata->offnumNew); + } + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* Delete old tuple, replacing it with redirect or placeholder tuple */ + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + SpGistDeadTuple dt; + + page = BufferGetPage(buffer); + + if (state.isBuild) + dt = spgFormDeadTuple(&state, SPGIST_PLACEHOLDER, + InvalidBlockNumber, + InvalidOffsetNumber); + else + dt = spgFormDeadTuple(&state, SPGIST_REDIRECT, + blknoNew, + xldata->offnumNew); + + PageIndexTupleDelete(page, xldata->offnum); + if (PageAddItem(page, (Item) dt, dt->size, + xldata->offnum, + false, false) != xldata->offnum) + elog(ERROR, "failed to add item of size %u to SPGiST index page", + dt->size); + + if (state.isBuild) + SpGistPageGetOpaque(page)->nPlaceholder++; + else + SpGistPageGetOpaque(page)->nRedirection++; + + /* + * If parent is in this same page, update it now. + */ + if (xldata->parentBlk == 0) + { + SpGistInnerTuple parentTuple; + + parentTuple = (SpGistInnerTuple) PageGetItem(page, + PageGetItemId(page, xldata->offnumParent)); + + spgUpdateNodeLink(parentTuple, xldata->nodeI, + blknoNew, xldata->offnumNew); + } + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* + * Update parent downlink (if we didn't do it as part of the source or + * destination page update already). + */ + if (xldata->parentBlk == 2) + { + if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO) + { + SpGistInnerTuple parentTuple; + + page = BufferGetPage(buffer); + + parentTuple = (SpGistInnerTuple) PageGetItem(page, + PageGetItemId(page, xldata->offnumParent)); + + spgUpdateNodeLink(parentTuple, xldata->nodeI, + blknoNew, xldata->offnumNew); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + } + } +} + +static void +spgRedoSplitTuple(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + char *ptr = XLogRecGetData(record); + spgxlogSplitTuple *xldata = (spgxlogSplitTuple *) ptr; + char *prefixTuple; + SpGistInnerTupleData prefixTupleHdr; + char *postfixTuple; + SpGistInnerTupleData postfixTupleHdr; + Buffer buffer; + Page page; + XLogRedoAction action; + + ptr += sizeof(spgxlogSplitTuple); + prefixTuple = ptr; + /* the prefix tuple is unaligned, so make a copy to access its header */ + memcpy(&prefixTupleHdr, prefixTuple, sizeof(SpGistInnerTupleData)); + ptr += prefixTupleHdr.size; + postfixTuple = ptr; + /* postfix tuple is also unaligned */ + memcpy(&postfixTupleHdr, postfixTuple, sizeof(SpGistInnerTupleData)); + + /* + * In normal operation we would have both pages locked simultaneously; but + * in WAL replay it should be safe to update them one at a time, as long + * as we do it in the right order. 
+ */ + + /* insert postfix tuple first to avoid dangling link */ + if (!xldata->postfixBlkSame) + { + if (xldata->newPage) + { + buffer = XLogInitBufferForRedo(record, 1); + /* SplitTuple is not used for nulls pages */ + SpGistInitBuffer(buffer, 0); + action = BLK_NEEDS_REDO; + } + else + action = XLogReadBufferForRedo(record, 1, &buffer); + if (action == BLK_NEEDS_REDO) + { + page = BufferGetPage(buffer); + + addOrReplaceTuple(page, (Item) postfixTuple, + postfixTupleHdr.size, xldata->offnumPostfix); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + } + + /* now handle the original page */ + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = BufferGetPage(buffer); + + PageIndexTupleDelete(page, xldata->offnumPrefix); + if (PageAddItem(page, (Item) prefixTuple, prefixTupleHdr.size, + xldata->offnumPrefix, false, false) != xldata->offnumPrefix) + elog(ERROR, "failed to add item of size %u to SPGiST index page", + prefixTupleHdr.size); + + if (xldata->postfixBlkSame) + addOrReplaceTuple(page, (Item) postfixTuple, + postfixTupleHdr.size, + xldata->offnumPostfix); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +static void +spgRedoPickSplit(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + char *ptr = XLogRecGetData(record); + spgxlogPickSplit *xldata = (spgxlogPickSplit *) ptr; + char *innerTuple; + SpGistInnerTupleData innerTupleHdr; + SpGistState state; + OffsetNumber *toDelete; + OffsetNumber *toInsert; + uint8 *leafPageSelect; + Buffer srcBuffer; + Buffer destBuffer; + Buffer innerBuffer; + Page srcPage; + Page destPage; + Page page; + int i; + BlockNumber blknoInner; + XLogRedoAction action; + + XLogRecGetBlockTag(record, 2, NULL, NULL, &blknoInner); + + fillFakeState(&state, xldata->stateSrc); + + ptr += SizeOfSpgxlogPickSplit; + toDelete = (OffsetNumber *) ptr; + ptr += sizeof(OffsetNumber) * xldata->nDelete; + toInsert = (OffsetNumber *) ptr; + ptr += sizeof(OffsetNumber) * xldata->nInsert; + leafPageSelect = (uint8 *) ptr; + ptr += sizeof(uint8) * xldata->nInsert; + + innerTuple = ptr; + /* the inner tuple is unaligned, so make a copy to access its header */ + memcpy(&innerTupleHdr, innerTuple, sizeof(SpGistInnerTupleData)); + ptr += innerTupleHdr.size; + + /* now ptr points to the list of leaf tuples */ + + if (xldata->isRootSplit) + { + /* when splitting root, we touch it only in the guise of new inner */ + srcBuffer = InvalidBuffer; + srcPage = NULL; + } + else if (xldata->initSrc) + { + /* just re-init the source page */ + srcBuffer = XLogInitBufferForRedo(record, 0); + srcPage = (Page) BufferGetPage(srcBuffer); + + SpGistInitBuffer(srcBuffer, + SPGIST_LEAF | (xldata->storesNulls ? SPGIST_NULLS : 0)); + /* don't update LSN etc till we're done with it */ + } + else + { + /* + * Delete the specified tuples from source page. (In case we're in + * Hot Standby, we need to hold lock on the page till we're done + * inserting leaf tuples and the new inner tuple, else the added + * redirect tuple will be a dangling link.) + */ + srcPage = NULL; + if (XLogReadBufferForRedo(record, 0, &srcBuffer) == BLK_NEEDS_REDO) + { + srcPage = BufferGetPage(srcBuffer); + + /* + * We have it a bit easier here than in doPickSplit(), because we + * know the inner tuple's location already, so we can inject the + * correct redirection tuple now. 
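+ * (As in the other redo routines, when the index is still being built the
+ * deleted tuples become plain placeholders rather than redirects, since no
+ * concurrent scan can be following them.)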
+ */ + if (!state.isBuild) + spgPageIndexMultiDelete(&state, srcPage, + toDelete, xldata->nDelete, + SPGIST_REDIRECT, + SPGIST_PLACEHOLDER, + blknoInner, + xldata->offnumInner); + else + spgPageIndexMultiDelete(&state, srcPage, + toDelete, xldata->nDelete, + SPGIST_PLACEHOLDER, + SPGIST_PLACEHOLDER, + InvalidBlockNumber, + InvalidOffsetNumber); + + /* don't update LSN etc till we're done with it */ + } + } + + /* try to access dest page if any */ + if (!XLogRecHasBlockRef(record, 1)) + { + destBuffer = InvalidBuffer; + destPage = NULL; + } + else if (xldata->initDest) + { + /* just re-init the dest page */ + destBuffer = XLogInitBufferForRedo(record, 1); + destPage = (Page) BufferGetPage(destBuffer); + + SpGistInitBuffer(destBuffer, + SPGIST_LEAF | (xldata->storesNulls ? SPGIST_NULLS : 0)); + /* don't update LSN etc till we're done with it */ + } + else + { + /* + * We could probably release the page lock immediately in the + * full-page-image case, but for safety let's hold it till later. + */ + if (XLogReadBufferForRedo(record, 1, &destBuffer) == BLK_NEEDS_REDO) + destPage = (Page) BufferGetPage(destBuffer); + else + destPage = NULL; /* don't do any page updates */ + } + + /* restore leaf tuples to src and/or dest page */ + for (i = 0; i < xldata->nInsert; i++) + { + char *leafTuple; + SpGistLeafTupleData leafTupleHdr; + + /* the tuples are not aligned, so must copy to access the size field. */ + leafTuple = ptr; + memcpy(&leafTupleHdr, leafTuple, sizeof(SpGistLeafTupleData)); + ptr += leafTupleHdr.size; + + page = leafPageSelect[i] ? destPage : srcPage; + if (page == NULL) + continue; /* no need to touch this page */ + + addOrReplaceTuple(page, (Item) leafTuple, leafTupleHdr.size, + toInsert[i]); + } + + /* Now update src and dest page LSNs if needed */ + if (srcPage != NULL) + { + PageSetLSN(srcPage, lsn); + MarkBufferDirty(srcBuffer); + } + if (destPage != NULL) + { + PageSetLSN(destPage, lsn); + MarkBufferDirty(destBuffer); + } + + /* restore new inner tuple */ + if (xldata->initInner) + { + innerBuffer = XLogInitBufferForRedo(record, 2); + SpGistInitBuffer(innerBuffer, (xldata->storesNulls ? SPGIST_NULLS : 0)); + action = BLK_NEEDS_REDO; + } + else + action = XLogReadBufferForRedo(record, 2, &innerBuffer); + + if (action == BLK_NEEDS_REDO) + { + page = BufferGetPage(innerBuffer); + + addOrReplaceTuple(page, (Item) innerTuple, innerTupleHdr.size, + xldata->offnumInner); + + /* if inner is also parent, update link while we're here */ + if (xldata->innerIsParent) + { + SpGistInnerTuple parent; + + parent = (SpGistInnerTuple) PageGetItem(page, + PageGetItemId(page, xldata->offnumParent)); + spgUpdateNodeLink(parent, xldata->nodeI, + blknoInner, xldata->offnumInner); + } + + PageSetLSN(page, lsn); + MarkBufferDirty(innerBuffer); + } + if (BufferIsValid(innerBuffer)) + UnlockReleaseBuffer(innerBuffer); + + /* + * Now we can release the leaf-page locks. It's okay to do this before + * updating the parent downlink. 
+ */ + if (BufferIsValid(srcBuffer)) + UnlockReleaseBuffer(srcBuffer); + if (BufferIsValid(destBuffer)) + UnlockReleaseBuffer(destBuffer); + + /* update parent downlink, unless we did it above */ + if (XLogRecHasBlockRef(record, 3)) + { + Buffer parentBuffer; + + if (XLogReadBufferForRedo(record, 3, &parentBuffer) == BLK_NEEDS_REDO) + { + SpGistInnerTuple parent; + + page = BufferGetPage(parentBuffer); + + parent = (SpGistInnerTuple) PageGetItem(page, + PageGetItemId(page, xldata->offnumParent)); + spgUpdateNodeLink(parent, xldata->nodeI, + blknoInner, xldata->offnumInner); + + PageSetLSN(page, lsn); + MarkBufferDirty(parentBuffer); + } + if (BufferIsValid(parentBuffer)) + UnlockReleaseBuffer(parentBuffer); + } + else + Assert(xldata->innerIsParent || xldata->isRootSplit); +} + +static void +spgRedoVacuumLeaf(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + char *ptr = XLogRecGetData(record); + spgxlogVacuumLeaf *xldata = (spgxlogVacuumLeaf *) ptr; + OffsetNumber *toDead; + OffsetNumber *toPlaceholder; + OffsetNumber *moveSrc; + OffsetNumber *moveDest; + OffsetNumber *chainSrc; + OffsetNumber *chainDest; + SpGistState state; + Buffer buffer; + Page page; + int i; + + fillFakeState(&state, xldata->stateSrc); + + ptr += SizeOfSpgxlogVacuumLeaf; + toDead = (OffsetNumber *) ptr; + ptr += sizeof(OffsetNumber) * xldata->nDead; + toPlaceholder = (OffsetNumber *) ptr; + ptr += sizeof(OffsetNumber) * xldata->nPlaceholder; + moveSrc = (OffsetNumber *) ptr; + ptr += sizeof(OffsetNumber) * xldata->nMove; + moveDest = (OffsetNumber *) ptr; + ptr += sizeof(OffsetNumber) * xldata->nMove; + chainSrc = (OffsetNumber *) ptr; + ptr += sizeof(OffsetNumber) * xldata->nChain; + chainDest = (OffsetNumber *) ptr; + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = BufferGetPage(buffer); + + spgPageIndexMultiDelete(&state, page, + toDead, xldata->nDead, + SPGIST_DEAD, SPGIST_DEAD, + InvalidBlockNumber, + InvalidOffsetNumber); + + spgPageIndexMultiDelete(&state, page, + toPlaceholder, xldata->nPlaceholder, + SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER, + InvalidBlockNumber, + InvalidOffsetNumber); + + /* see comments in vacuumLeafPage() */ + for (i = 0; i < xldata->nMove; i++) + { + ItemId idSrc = PageGetItemId(page, moveSrc[i]); + ItemId idDest = PageGetItemId(page, moveDest[i]); + ItemIdData tmp; + + tmp = *idSrc; + *idSrc = *idDest; + *idDest = tmp; + } + + spgPageIndexMultiDelete(&state, page, + moveSrc, xldata->nMove, + SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER, + InvalidBlockNumber, + InvalidOffsetNumber); + + for (i = 0; i < xldata->nChain; i++) + { + SpGistLeafTuple lt; + + lt = (SpGistLeafTuple) PageGetItem(page, + PageGetItemId(page, chainSrc[i])); + Assert(lt->tupstate == SPGIST_LIVE); + SGLT_SET_NEXTOFFSET(lt, chainDest[i]); + } + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +static void +spgRedoVacuumRoot(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + char *ptr = XLogRecGetData(record); + spgxlogVacuumRoot *xldata = (spgxlogVacuumRoot *) ptr; + OffsetNumber *toDelete; + Buffer buffer; + Page page; + + toDelete = xldata->offsets; + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = BufferGetPage(buffer); + + /* The tuple numbers are in order */ + PageIndexMultiDelete(page, toDelete, xldata->nDelete); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +static void 
+spgRedoVacuumRedirect(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + char *ptr = XLogRecGetData(record); + spgxlogVacuumRedirect *xldata = (spgxlogVacuumRedirect *) ptr; + OffsetNumber *itemToPlaceholder; + Buffer buffer; + + itemToPlaceholder = xldata->offsets; + + /* + * If any redirection tuples are being removed, make sure there are no + * live Hot Standby transactions that might need to see them. + */ + if (InHotStandby) + { + if (TransactionIdIsValid(xldata->newestRedirectXid)) + { + RelFileNode node; + + XLogRecGetBlockTag(record, 0, &node, NULL, NULL); + ResolveRecoveryConflictWithSnapshot(xldata->newestRedirectXid, + node); + } + } + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + Page page = BufferGetPage(buffer); + SpGistPageOpaque opaque = SpGistPageGetOpaque(page); + int i; + + /* Convert redirect pointers to plain placeholders */ + for (i = 0; i < xldata->nToPlaceholder; i++) + { + SpGistDeadTuple dt; + + dt = (SpGistDeadTuple) PageGetItem(page, + PageGetItemId(page, itemToPlaceholder[i])); + Assert(dt->tupstate == SPGIST_REDIRECT); + dt->tupstate = SPGIST_PLACEHOLDER; + ItemPointerSetInvalid(&dt->pointer); + } + + Assert(opaque->nRedirection >= xldata->nToPlaceholder); + opaque->nRedirection -= xldata->nToPlaceholder; + opaque->nPlaceholder += xldata->nToPlaceholder; + + /* Remove placeholder tuples at end of page */ + if (xldata->firstPlaceholder != InvalidOffsetNumber) + { + int max = PageGetMaxOffsetNumber(page); + OffsetNumber *toDelete; + + toDelete = palloc(sizeof(OffsetNumber) * max); + + for (i = xldata->firstPlaceholder; i <= max; i++) + toDelete[i - xldata->firstPlaceholder] = i; + + i = max - xldata->firstPlaceholder + 1; + Assert(opaque->nPlaceholder >= i); + opaque->nPlaceholder -= i; + + /* The array is sorted, so can use PageIndexMultiDelete */ + PageIndexMultiDelete(page, toDelete, i); + + pfree(toDelete); + } + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +void +spg_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + MemoryContext oldCxt; + + oldCxt = MemoryContextSwitchTo(opCtx); + switch (info) + { + case XLOG_SPGIST_ADD_LEAF: + spgRedoAddLeaf(record); + break; + case XLOG_SPGIST_MOVE_LEAFS: + spgRedoMoveLeafs(record); + break; + case XLOG_SPGIST_ADD_NODE: + spgRedoAddNode(record); + break; + case XLOG_SPGIST_SPLIT_TUPLE: + spgRedoSplitTuple(record); + break; + case XLOG_SPGIST_PICKSPLIT: + spgRedoPickSplit(record); + break; + case XLOG_SPGIST_VACUUM_LEAF: + spgRedoVacuumLeaf(record); + break; + case XLOG_SPGIST_VACUUM_ROOT: + spgRedoVacuumRoot(record); + break; + case XLOG_SPGIST_VACUUM_REDIRECT: + spgRedoVacuumRedirect(record); + break; + default: + elog(PANIC, "spg_redo: unknown op code %u", info); + } + + MemoryContextSwitchTo(oldCxt); + MemoryContextReset(opCtx); +} + +void +spg_xlog_startup(void) +{ + opCtx = AllocSetContextCreate(CurrentMemoryContext, + "SP-GiST temporary context", + ALLOCSET_DEFAULT_SIZES); +} + +void +spg_xlog_cleanup(void) +{ + MemoryContextDelete(opCtx); + opCtx = NULL; +} + +/* + * Mask a SpGist page before performing consistency checks on it. + */ +void +spg_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + PageHeader pagehdr = (PageHeader) page; + + mask_page_lsn_and_checksum(page); + + mask_page_hint_bits(page); + + /* + * Mask the unused space, but only if the page's pd_lower appears to have + * been set correctly. 
+ */ + if (pagehdr->pd_lower >= SizeOfPageHeaderData) + mask_unused_space(page); +} diff --git a/src/backend/access/table/Makefile b/src/backend/access/table/Makefile new file mode 100644 index 0000000..9aba3ff --- /dev/null +++ b/src/backend/access/table/Makefile @@ -0,0 +1,21 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/table +# +# IDENTIFICATION +# src/backend/access/table/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/table +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + table.o \ + tableam.o \ + tableamapi.o \ + toast_helper.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/table/table.c b/src/backend/access/table/table.c new file mode 100644 index 0000000..545007e --- /dev/null +++ b/src/backend/access/table/table.c @@ -0,0 +1,170 @@ +/*------------------------------------------------------------------------- + * + * table.c + * Generic routines for table related code. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/table/table.c + * + * + * NOTES + * This file contains table_ routines that implement access to tables (in + * contrast to other relation types like indexes) that are independent of + * individual table access methods. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/relation.h" +#include "access/table.h" +#include "storage/lmgr.h" + + +/* ---------------- + * table_open - open a table relation by relation OID + * + * This is essentially relation_open plus check that the relation + * is not an index nor a composite type. (The caller should also + * check that it's not a view or foreign table before assuming it has + * storage.) + * ---------------- + */ +Relation +table_open(Oid relationId, LOCKMODE lockmode) +{ + Relation r; + + r = relation_open(relationId, lockmode); + + if (r->rd_rel->relkind == RELKIND_INDEX || + r->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is an index", + RelationGetRelationName(r)))); + else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is a composite type", + RelationGetRelationName(r)))); + + return r; +} + + +/* ---------------- + * try_table_open - open a table relation by relation OID + * + * Same as table_open, except return NULL instead of failing + * if the relation does not exist. 
+ * ---------------- + */ +Relation +try_table_open(Oid relationId, LOCKMODE lockmode) +{ + Relation r; + + r = try_relation_open(relationId, lockmode); + + /* leave if table does not exist */ + if (!r) + return NULL; + + if (r->rd_rel->relkind == RELKIND_INDEX || + r->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is an index", + RelationGetRelationName(r)))); + else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is a composite type", + RelationGetRelationName(r)))); + + return r; +} + +/* ---------------- + * table_openrv - open a table relation specified + * by a RangeVar node + * + * As above, but relation is specified by a RangeVar. + * ---------------- + */ +Relation +table_openrv(const RangeVar *relation, LOCKMODE lockmode) +{ + Relation r; + + r = relation_openrv(relation, lockmode); + + if (r->rd_rel->relkind == RELKIND_INDEX || + r->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is an index", + RelationGetRelationName(r)))); + else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is a composite type", + RelationGetRelationName(r)))); + + return r; +} + +/* ---------------- + * table_openrv_extended - open a table relation specified + * by a RangeVar node + * + * As above, but optionally return NULL instead of failing for + * relation-not-found. + * ---------------- + */ +Relation +table_openrv_extended(const RangeVar *relation, LOCKMODE lockmode, + bool missing_ok) +{ + Relation r; + + r = relation_openrv_extended(relation, lockmode, missing_ok); + + if (r) + { + if (r->rd_rel->relkind == RELKIND_INDEX || + r->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is an index", + RelationGetRelationName(r)))); + else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is a composite type", + RelationGetRelationName(r)))); + } + + return r; +} + +/* ---------------- + * table_close - close a table + * + * If lockmode is not "NoLock", we then release the specified lock. + * + * Note that it is often sensible to hold a lock beyond relation_close; + * in that case, the lock is released automatically at xact end. + * ---------------- + */ +void +table_close(Relation relation, LOCKMODE lockmode) +{ + relation_close(relation, lockmode); +} diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c new file mode 100644 index 0000000..5ea5bdd --- /dev/null +++ b/src/backend/access/table/tableam.c @@ -0,0 +1,765 @@ +/*---------------------------------------------------------------------- + * + * tableam.c + * Table access method routines too big to be inline functions. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/table/tableam.c + * + * NOTES + * Note that most function in here are documented in tableam.h, rather than + * here. That's because there's a lot of inline functions in tableam.h and + * it'd be harder to understand if one constantly had to switch between files. 
+ * + *---------------------------------------------------------------------- + */ +#include "postgres.h" + +#include + +#include "access/syncscan.h" +#include "access/tableam.h" +#include "access/xact.h" +#include "optimizer/plancat.h" +#include "port/pg_bitutils.h" +#include "storage/bufmgr.h" +#include "storage/shmem.h" +#include "storage/smgr.h" + +/* + * Constants to control the behavior of block allocation to parallel workers + * during a parallel seqscan. Technically these values do not need to be + * powers of 2, but having them as powers of 2 makes the math more optimal + * and makes the ramp-down stepping more even. + */ + +/* The number of I/O chunks we try to break a parallel seqscan down into */ +#define PARALLEL_SEQSCAN_NCHUNKS 2048 +/* Ramp down size of allocations when we've only this number of chunks left */ +#define PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS 64 +/* Cap the size of parallel I/O chunks to this number of blocks */ +#define PARALLEL_SEQSCAN_MAX_CHUNK_SIZE 8192 + +/* GUC variables */ +char *default_table_access_method = DEFAULT_TABLE_ACCESS_METHOD; +bool synchronize_seqscans = true; + + +/* ---------------------------------------------------------------------------- + * Slot functions. + * ---------------------------------------------------------------------------- + */ + +const TupleTableSlotOps * +table_slot_callbacks(Relation relation) +{ + const TupleTableSlotOps *tts_cb; + + if (relation->rd_tableam) + tts_cb = relation->rd_tableam->slot_callbacks(relation); + else if (relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE) + { + /* + * Historically FDWs expect to store heap tuples in slots. Continue + * handing them one, to make it less painful to adapt FDWs to new + * versions. The cost of a heap slot over a virtual slot is pretty + * small. + */ + tts_cb = &TTSOpsHeapTuple; + } + else + { + /* + * These need to be supported, as some parts of the code (like COPY) + * need to create slots for such relations too. It seems better to + * centralize the knowledge that a heap slot is the right thing in + * that case here. + */ + Assert(relation->rd_rel->relkind == RELKIND_VIEW || + relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); + tts_cb = &TTSOpsVirtual; + } + + return tts_cb; +} + +TupleTableSlot * +table_slot_create(Relation relation, List **reglist) +{ + const TupleTableSlotOps *tts_cb; + TupleTableSlot *slot; + + tts_cb = table_slot_callbacks(relation); + slot = MakeSingleTupleTableSlot(RelationGetDescr(relation), tts_cb); + + if (reglist) + *reglist = lappend(*reglist, slot); + + return slot; +} + + +/* ---------------------------------------------------------------------------- + * Table scan functions. + * ---------------------------------------------------------------------------- + */ + +TableScanDesc +table_beginscan_catalog(Relation relation, int nkeys, struct ScanKeyData *key) +{ + uint32 flags = SO_TYPE_SEQSCAN | + SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE | SO_TEMP_SNAPSHOT; + Oid relid = RelationGetRelid(relation); + Snapshot snapshot = RegisterSnapshot(GetCatalogSnapshot(relid)); + + return relation->rd_tableam->scan_begin(relation, snapshot, nkeys, key, + NULL, flags); +} + +void +table_scan_update_snapshot(TableScanDesc scan, Snapshot snapshot) +{ + Assert(IsMVCCSnapshot(snapshot)); + + RegisterSnapshot(snapshot); + scan->rs_snapshot = snapshot; + scan->rs_flags |= SO_TEMP_SNAPSHOT; +} + + +/* ---------------------------------------------------------------------------- + * Parallel table scan related functions. 
+ * ---------------------------------------------------------------------------- + */ + +Size +table_parallelscan_estimate(Relation rel, Snapshot snapshot) +{ + Size sz = 0; + + if (IsMVCCSnapshot(snapshot)) + sz = add_size(sz, EstimateSnapshotSpace(snapshot)); + else + Assert(snapshot == SnapshotAny); + + sz = add_size(sz, rel->rd_tableam->parallelscan_estimate(rel)); + + return sz; +} + +void +table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, + Snapshot snapshot) +{ + Size snapshot_off = rel->rd_tableam->parallelscan_initialize(rel, pscan); + + pscan->phs_snapshot_off = snapshot_off; + + if (IsMVCCSnapshot(snapshot)) + { + SerializeSnapshot(snapshot, (char *) pscan + pscan->phs_snapshot_off); + pscan->phs_snapshot_any = false; + } + else + { + Assert(snapshot == SnapshotAny); + pscan->phs_snapshot_any = true; + } +} + +TableScanDesc +table_beginscan_parallel(Relation relation, ParallelTableScanDesc parallel_scan) +{ + Snapshot snapshot; + uint32 flags = SO_TYPE_SEQSCAN | + SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; + + Assert(RelationGetRelid(relation) == parallel_scan->phs_relid); + + if (!parallel_scan->phs_snapshot_any) + { + /* Snapshot was serialized -- restore it */ + snapshot = RestoreSnapshot((char *) parallel_scan + + parallel_scan->phs_snapshot_off); + RegisterSnapshot(snapshot); + flags |= SO_TEMP_SNAPSHOT; + } + else + { + /* SnapshotAny passed by caller (not serialized) */ + snapshot = SnapshotAny; + } + + return relation->rd_tableam->scan_begin(relation, snapshot, 0, NULL, + parallel_scan, flags); +} + + +/* ---------------------------------------------------------------------------- + * Index scan related functions. + * ---------------------------------------------------------------------------- + */ + +/* + * To perform that check simply start an index scan, create the necessary + * slot, do the heap lookup, and shut everything down again. This could be + * optimized, but is unlikely to matter from a performance POV. If there + * frequently are live index pointers also matching a unique index key, the + * CPU overhead of this routine is unlikely to matter. + * + * Note that *tid may be modified when we return true if the AM supports + * storing multiple row versions reachable via a single index entry (like + * heap's HOT). + */ +bool +table_index_fetch_tuple_check(Relation rel, + ItemPointer tid, + Snapshot snapshot, + bool *all_dead) +{ + IndexFetchTableData *scan; + TupleTableSlot *slot; + bool call_again = false; + bool found; + + slot = table_slot_create(rel, NULL); + scan = table_index_fetch_begin(rel); + found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again, + all_dead); + table_index_fetch_end(scan); + ExecDropSingleTupleTableSlot(slot); + + return found; +} + + +/* ------------------------------------------------------------------------ + * Functions for non-modifying operations on individual tuples + * ------------------------------------------------------------------------ + */ + +void +table_tuple_get_latest_tid(TableScanDesc scan, ItemPointer tid) +{ + Relation rel = scan->rs_rd; + const TableAmRoutine *tableam = rel->rd_tableam; + + /* + * We don't expect direct calls to table_tuple_get_latest_tid with valid + * CheckXidAlive for catalog or regular tables. See detailed comments in + * xact.c where these variables are declared. 
+ */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected table_tuple_get_latest_tid call during logical decoding"); + + /* + * Since this can be called with user-supplied TID, don't trust the input + * too much. + */ + if (!tableam->tuple_tid_valid(scan, tid)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("tid (%u, %u) is not valid for relation \"%s\"", + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid), + RelationGetRelationName(rel)))); + + tableam->tuple_get_latest_tid(scan, tid); +} + + +/* ---------------------------------------------------------------------------- + * Functions to make modifications a bit simpler. + * ---------------------------------------------------------------------------- + */ + +/* + * simple_table_tuple_insert - insert a tuple + * + * Currently, this routine differs from table_tuple_insert only in supplying a + * default command ID and not allowing access to the speedup options. + */ +void +simple_table_tuple_insert(Relation rel, TupleTableSlot *slot) +{ + table_tuple_insert(rel, slot, GetCurrentCommandId(true), 0, NULL); +} + +/* + * simple_table_tuple_delete - delete a tuple + * + * This routine may be used to delete a tuple when concurrent updates of + * the target tuple are not expected (for example, because we have a lock + * on the relation associated with the tuple). Any failure is reported + * via ereport(). + */ +void +simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot) +{ + TM_Result result; + TM_FailureData tmfd; + + result = table_tuple_delete(rel, tid, + GetCurrentCommandId(true), + snapshot, InvalidSnapshot, + true /* wait for commit */ , + &tmfd, false /* changingPart */ ); + + switch (result) + { + case TM_SelfModified: + /* Tuple was already updated in current command? */ + elog(ERROR, "tuple already updated by self"); + break; + + case TM_Ok: + /* done successfully */ + break; + + case TM_Updated: + elog(ERROR, "tuple concurrently updated"); + break; + + case TM_Deleted: + elog(ERROR, "tuple concurrently deleted"); + break; + + default: + elog(ERROR, "unrecognized table_tuple_delete status: %u", result); + break; + } +} + +/* + * simple_table_tuple_update - replace a tuple + * + * This routine may be used to update a tuple when concurrent updates of + * the target tuple are not expected (for example, because we have a lock + * on the relation associated with the tuple). Any failure is reported + * via ereport(). + */ +void +simple_table_tuple_update(Relation rel, ItemPointer otid, + TupleTableSlot *slot, + Snapshot snapshot, + bool *update_indexes) +{ + TM_Result result; + TM_FailureData tmfd; + LockTupleMode lockmode; + + result = table_tuple_update(rel, otid, slot, + GetCurrentCommandId(true), + snapshot, InvalidSnapshot, + true /* wait for commit */ , + &tmfd, &lockmode, update_indexes); + + switch (result) + { + case TM_SelfModified: + /* Tuple was already updated in current command? */ + elog(ERROR, "tuple already updated by self"); + break; + + case TM_Ok: + /* done successfully */ + break; + + case TM_Updated: + elog(ERROR, "tuple concurrently updated"); + break; + + case TM_Deleted: + elog(ERROR, "tuple concurrently deleted"); + break; + + default: + elog(ERROR, "unrecognized table_tuple_update status: %u", result); + break; + } + +} + + +/* ---------------------------------------------------------------------------- + * Helper functions to implement parallel scans for block oriented AMs. 
+ * ---------------------------------------------------------------------------- + */ + +Size +table_block_parallelscan_estimate(Relation rel) +{ + return sizeof(ParallelBlockTableScanDescData); +} + +Size +table_block_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan) +{ + ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan; + + bpscan->base.phs_relid = RelationGetRelid(rel); + bpscan->phs_nblocks = RelationGetNumberOfBlocks(rel); + /* compare phs_syncscan initialization to similar logic in initscan */ + bpscan->base.phs_syncscan = synchronize_seqscans && + !RelationUsesLocalBuffers(rel) && + bpscan->phs_nblocks > NBuffers / 4; + SpinLockInit(&bpscan->phs_mutex); + bpscan->phs_startblock = InvalidBlockNumber; + pg_atomic_init_u64(&bpscan->phs_nallocated, 0); + + return sizeof(ParallelBlockTableScanDescData); +} + +void +table_block_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan) +{ + ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan; + + pg_atomic_write_u64(&bpscan->phs_nallocated, 0); +} + +/* + * find and set the scan's startblock + * + * Determine where the parallel seq scan should start. This function may be + * called many times, once by each parallel worker. We must be careful only + * to set the startblock once. + */ +void +table_block_parallelscan_startblock_init(Relation rel, + ParallelBlockTableScanWorker pbscanwork, + ParallelBlockTableScanDesc pbscan) +{ + BlockNumber sync_startpage = InvalidBlockNumber; + + /* Reset the state we use for controlling allocation size. */ + memset(pbscanwork, 0, sizeof(*pbscanwork)); + + StaticAssertStmt(MaxBlockNumber <= 0xFFFFFFFE, + "pg_nextpower2_32 may be too small for non-standard BlockNumber width"); + + /* + * We determine the chunk size based on the size of the relation. First we + * split the relation into PARALLEL_SEQSCAN_NCHUNKS chunks but we then + * take the next highest power of 2 number of the chunk size. This means + * we split the relation into somewhere between PARALLEL_SEQSCAN_NCHUNKS + * and PARALLEL_SEQSCAN_NCHUNKS / 2 chunks. + */ + pbscanwork->phsw_chunk_size = pg_nextpower2_32(Max(pbscan->phs_nblocks / + PARALLEL_SEQSCAN_NCHUNKS, 1)); + + /* + * Ensure we don't go over the maximum chunk size with larger tables. This + * means we may get much more than PARALLEL_SEQSCAN_NCHUNKS for larger + * tables. Too large a chunk size has been shown to be detrimental to + * synchronous scan performance. + */ + pbscanwork->phsw_chunk_size = Min(pbscanwork->phsw_chunk_size, + PARALLEL_SEQSCAN_MAX_CHUNK_SIZE); + +retry: + /* Grab the spinlock. */ + SpinLockAcquire(&pbscan->phs_mutex); + + /* + * If the scan's startblock has not yet been initialized, we must do so + * now. If this is not a synchronized scan, we just start at block 0, but + * if it is a synchronized scan, we must get the starting position from + * the synchronized scan machinery. We can't hold the spinlock while + * doing that, though, so release the spinlock, get the information we + * need, and retry. If nobody else has initialized the scan in the + * meantime, we'll fill in the value we fetched on the second time + * through. 
+ */ + if (pbscan->phs_startblock == InvalidBlockNumber) + { + if (!pbscan->base.phs_syncscan) + pbscan->phs_startblock = 0; + else if (sync_startpage != InvalidBlockNumber) + pbscan->phs_startblock = sync_startpage; + else + { + SpinLockRelease(&pbscan->phs_mutex); + sync_startpage = ss_get_location(rel, pbscan->phs_nblocks); + goto retry; + } + } + SpinLockRelease(&pbscan->phs_mutex); +} + +/* + * get the next page to scan + * + * Get the next page to scan. Even if there are no pages left to scan, + * another backend could have grabbed a page to scan and not yet finished + * looking at it, so it doesn't follow that the scan is done when the first + * backend gets an InvalidBlockNumber return. + */ +BlockNumber +table_block_parallelscan_nextpage(Relation rel, + ParallelBlockTableScanWorker pbscanwork, + ParallelBlockTableScanDesc pbscan) +{ + BlockNumber page; + uint64 nallocated; + + /* + * The logic below allocates block numbers out to parallel workers in a + * way that each worker will receive a set of consecutive block numbers to + * scan. Earlier versions of this would allocate the next highest block + * number to the next worker to call this function. This would generally + * result in workers never receiving consecutive block numbers. Some + * operating systems would not detect the sequential I/O pattern due to + * each backend being a different process which could result in poor + * performance due to inefficient or no readahead. To work around this + * issue, we now allocate a range of block numbers for each worker and + * when they come back for another block, we give them the next one in + * that range until the range is complete. When the worker completes the + * range of blocks we then allocate another range for it and return the + * first block number from that range. + * + * Here we name these ranges of blocks "chunks". The initial size of + * these chunks is determined in table_block_parallelscan_startblock_init + * based on the size of the relation. Towards the end of the scan, we + * start making reductions in the size of the chunks in order to attempt + * to divide the remaining work over all the workers as evenly as + * possible. + * + * Here pbscanwork is local worker memory. phsw_chunk_remaining tracks + * the number of blocks remaining in the chunk. When that reaches 0 then + * we must allocate a new chunk for the worker. + * + * phs_nallocated tracks how many blocks have been allocated to workers + * already. When phs_nallocated >= rs_nblocks, all blocks have been + * allocated. + * + * Because we use an atomic fetch-and-add to fetch the current value, the + * phs_nallocated counter will exceed rs_nblocks, because workers will + * still increment the value, when they try to allocate the next block but + * all blocks have been allocated already. The counter must be 64 bits + * wide because of that, to avoid wrapping around when rs_nblocks is close + * to 2^32. + * + * The actual block to return is calculated by adding the counter to the + * starting block number, modulo nblocks. + */ + + /* + * First check if we have any remaining blocks in a previous chunk for + * this worker. We must consume all of the blocks from that before we + * allocate a new chunk to the worker. + */ + if (pbscanwork->phsw_chunk_remaining > 0) + { + /* + * Give them the next block in the range and update the remaining + * number of blocks. 
+ */ + nallocated = ++pbscanwork->phsw_nallocated; + pbscanwork->phsw_chunk_remaining--; + } + else + { + /* + * When we've only got PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS chunks + * remaining in the scan, we half the chunk size. Since we reduce the + * chunk size here, we'll hit this again after doing + * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS at the new size. After a few + * iterations of this, we'll end up doing the last few blocks with the + * chunk size set to 1. + */ + if (pbscanwork->phsw_chunk_size > 1 && + pbscanwork->phsw_nallocated > pbscan->phs_nblocks - + (pbscanwork->phsw_chunk_size * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS)) + pbscanwork->phsw_chunk_size >>= 1; + + nallocated = pbscanwork->phsw_nallocated = + pg_atomic_fetch_add_u64(&pbscan->phs_nallocated, + pbscanwork->phsw_chunk_size); + + /* + * Set the remaining number of blocks in this chunk so that subsequent + * calls from this worker continue on with this chunk until it's done. + */ + pbscanwork->phsw_chunk_remaining = pbscanwork->phsw_chunk_size - 1; + } + + if (nallocated >= pbscan->phs_nblocks) + page = InvalidBlockNumber; /* all blocks have been allocated */ + else + page = (nallocated + pbscan->phs_startblock) % pbscan->phs_nblocks; + + /* + * Report scan location. Normally, we report the current page number. + * When we reach the end of the scan, though, we report the starting page, + * not the ending page, just so the starting positions for later scans + * doesn't slew backwards. We only report the position at the end of the + * scan once, though: subsequent callers will report nothing. + */ + if (pbscan->base.phs_syncscan) + { + if (page != InvalidBlockNumber) + ss_report_location(rel, page); + else if (nallocated == pbscan->phs_nblocks) + ss_report_location(rel, pbscan->phs_startblock); + } + + return page; +} + +/* ---------------------------------------------------------------------------- + * Helper functions to implement relation sizing for block oriented AMs. + * ---------------------------------------------------------------------------- + */ + +/* + * table_block_relation_size + * + * If a table AM uses the various relation forks as the sole place where data + * is stored, and if it uses them in the expected manner (e.g. the actual data + * is in the main fork rather than some other), it can use this implementation + * of the relation_size callback rather than implementing its own. + */ +uint64 +table_block_relation_size(Relation rel, ForkNumber forkNumber) +{ + uint64 nblocks = 0; + + /* Open it at the smgr level if not already done */ + RelationOpenSmgr(rel); + + /* InvalidForkNumber indicates returning the size for all forks */ + if (forkNumber == InvalidForkNumber) + { + for (int i = 0; i < MAX_FORKNUM; i++) + nblocks += smgrnblocks(rel->rd_smgr, i); + } + else + nblocks = smgrnblocks(rel->rd_smgr, forkNumber); + + return nblocks * BLCKSZ; +} + +/* + * table_block_relation_estimate_size + * + * This function can't be directly used as the implementation of the + * relation_estimate_size callback, because it has a few additional parameters. + * Instead, it is intended to be used as a helper function; the caller can + * pass through the arguments to its relation_estimate_size function plus the + * additional values required here. + * + * overhead_bytes_per_tuple should contain the approximate number of bytes + * of storage required to store a tuple above and beyond what is required for + * the tuple data proper. Typically, this would include things like the + * size of the tuple header and item pointer. 
This is only used for query + * planning, so a table AM where the value is not constant could choose to + * pass a "best guess". + * + * usable_bytes_per_page should contain the approximate number of bytes per + * page usable for tuple data, excluding the page header and any anticipated + * special space. + */ +void +table_block_relation_estimate_size(Relation rel, int32 *attr_widths, + BlockNumber *pages, double *tuples, + double *allvisfrac, + Size overhead_bytes_per_tuple, + Size usable_bytes_per_page) +{ + BlockNumber curpages; + BlockNumber relpages; + double reltuples; + BlockNumber relallvisible; + double density; + + /* it should have storage, so we can call the smgr */ + curpages = RelationGetNumberOfBlocks(rel); + + /* coerce values in pg_class to more desirable types */ + relpages = (BlockNumber) rel->rd_rel->relpages; + reltuples = (double) rel->rd_rel->reltuples; + relallvisible = (BlockNumber) rel->rd_rel->relallvisible; + + /* + * HACK: if the relation has never yet been vacuumed, use a minimum size + * estimate of 10 pages. The idea here is to avoid assuming a + * newly-created table is really small, even if it currently is, because + * that may not be true once some data gets loaded into it. Once a vacuum + * or analyze cycle has been done on it, it's more reasonable to believe + * the size is somewhat stable. + * + * (Note that this is only an issue if the plan gets cached and used again + * after the table has been filled. What we're trying to avoid is using a + * nestloop-type plan on a table that has grown substantially since the + * plan was made. Normally, autovacuum/autoanalyze will occur once enough + * inserts have happened and cause cached-plan invalidation; but that + * doesn't happen instantaneously, and it won't happen at all for cases + * such as temporary tables.) + * + * We test "never vacuumed" by seeing whether reltuples < 0. + * + * If the table has inheritance children, we don't apply this heuristic. + * Totally empty parent tables are quite common, so we should be willing + * to believe that they are empty. + */ + if (curpages < 10 && + reltuples < 0 && + !rel->rd_rel->relhassubclass) + curpages = 10; + + /* report estimated # pages */ + *pages = curpages; + /* quick exit if rel is clearly empty */ + if (curpages == 0) + { + *tuples = 0; + *allvisfrac = 0; + return; + } + + /* estimate number of tuples from previous tuple density */ + if (reltuples >= 0 && relpages > 0) + density = reltuples / (double) relpages; + else + { + /* + * When we have no data because the relation was never yet vacuumed, + * estimate tuple width from attribute datatypes. We assume here that + * the pages are completely full, which is OK for tables but is + * probably an overestimate for indexes. Fortunately + * get_relation_info() can clamp the overestimate to the parent + * table's size. + * + * Note: this code intentionally disregards alignment considerations, + * because (a) that would be gilding the lily considering how crude + * the estimate is, (b) it creates platform dependencies in the + * default plans which are kind of a headache for regression testing, + * and (c) different table AMs might use different padding schemes. 
+ */ + int32 tuple_width; + + tuple_width = get_rel_data_width(rel, attr_widths); + tuple_width += overhead_bytes_per_tuple; + /* note: integer division is intentional here */ + density = usable_bytes_per_page / tuple_width; + } + *tuples = rint(density * (double) curpages); + + /* + * We use relallvisible as-is, rather than scaling it up like we do for + * the pages and tuples counts, on the theory that any pages added since + * the last VACUUM are most likely not marked all-visible. But costsize.c + * wants it converted to a fraction. + */ + if (relallvisible == 0 || curpages <= 0) + *allvisfrac = 0; + else if ((double) relallvisible >= curpages) + *allvisfrac = 1; + else + *allvisfrac = (double) relallvisible / curpages; +} diff --git a/src/backend/access/table/tableamapi.c b/src/backend/access/table/tableamapi.c new file mode 100644 index 0000000..325ecdc --- /dev/null +++ b/src/backend/access/table/tableamapi.c @@ -0,0 +1,158 @@ +/*---------------------------------------------------------------------- + * + * tableamapi.c + * Support routines for API for Postgres table access methods + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/table/tableamapi.c + *---------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heapam.h" +#include "access/htup_details.h" +#include "access/tableam.h" +#include "access/xact.h" +#include "catalog/pg_am.h" +#include "catalog/pg_proc.h" +#include "commands/defrem.h" +#include "miscadmin.h" +#include "utils/fmgroids.h" +#include "utils/memutils.h" +#include "utils/syscache.h" + + +/* + * GetTableAmRoutine + * Call the specified access method handler routine to get its + * TableAmRoutine struct, which will be palloc'd in the caller's + * memory context. + */ +const TableAmRoutine * +GetTableAmRoutine(Oid amhandler) +{ + Datum datum; + const TableAmRoutine *routine; + + datum = OidFunctionCall0(amhandler); + routine = (TableAmRoutine *) DatumGetPointer(datum); + + if (routine == NULL || !IsA(routine, TableAmRoutine)) + elog(ERROR, "table access method handler %u did not return a TableAmRoutine struct", + amhandler); + + /* + * Assert that all required callbacks are present. That makes it a bit + * easier to keep AMs up to date, e.g. when forward porting them to a new + * major version. + */ + Assert(routine->scan_begin != NULL); + Assert(routine->scan_end != NULL); + Assert(routine->scan_rescan != NULL); + Assert(routine->scan_getnextslot != NULL); + + Assert(routine->parallelscan_estimate != NULL); + Assert(routine->parallelscan_initialize != NULL); + Assert(routine->parallelscan_reinitialize != NULL); + + Assert(routine->index_fetch_begin != NULL); + Assert(routine->index_fetch_reset != NULL); + Assert(routine->index_fetch_end != NULL); + Assert(routine->index_fetch_tuple != NULL); + + Assert(routine->tuple_fetch_row_version != NULL); + Assert(routine->tuple_tid_valid != NULL); + Assert(routine->tuple_get_latest_tid != NULL); + Assert(routine->tuple_satisfies_snapshot != NULL); + Assert(routine->index_delete_tuples != NULL); + + Assert(routine->tuple_insert != NULL); + + /* + * Could be made optional, but would require throwing error during + * parse-analysis. 
+ */ + Assert(routine->tuple_insert_speculative != NULL); + Assert(routine->tuple_complete_speculative != NULL); + + Assert(routine->multi_insert != NULL); + Assert(routine->tuple_delete != NULL); + Assert(routine->tuple_update != NULL); + Assert(routine->tuple_lock != NULL); + + Assert(routine->relation_set_new_filenode != NULL); + Assert(routine->relation_nontransactional_truncate != NULL); + Assert(routine->relation_copy_data != NULL); + Assert(routine->relation_copy_for_cluster != NULL); + Assert(routine->relation_vacuum != NULL); + Assert(routine->scan_analyze_next_block != NULL); + Assert(routine->scan_analyze_next_tuple != NULL); + Assert(routine->index_build_range_scan != NULL); + Assert(routine->index_validate_scan != NULL); + + Assert(routine->relation_size != NULL); + Assert(routine->relation_needs_toast_table != NULL); + + Assert(routine->relation_estimate_size != NULL); + + /* optional, but one callback implies presence of the other */ + Assert((routine->scan_bitmap_next_block == NULL) == + (routine->scan_bitmap_next_tuple == NULL)); + Assert(routine->scan_sample_next_block != NULL); + Assert(routine->scan_sample_next_tuple != NULL); + + return routine; +} + +/* check_hook: validate new default_table_access_method */ +bool +check_default_table_access_method(char **newval, void **extra, GucSource source) +{ + if (**newval == '\0') + { + GUC_check_errdetail("%s cannot be empty.", + "default_table_access_method"); + return false; + } + + if (strlen(*newval) >= NAMEDATALEN) + { + GUC_check_errdetail("%s is too long (maximum %d characters).", + "default_table_access_method", NAMEDATALEN - 1); + return false; + } + + /* + * If we aren't inside a transaction, or not connected to a database, we + * cannot do the catalog access necessary to verify the method. Must + * accept the value on faith. + */ + if (IsTransactionState() && MyDatabaseId != InvalidOid) + { + if (!OidIsValid(get_table_am_oid(*newval, true))) + { + /* + * When source == PGC_S_TEST, don't throw a hard error for a + * nonexistent table access method, only a NOTICE. See comments in + * guc.h. + */ + if (source == PGC_S_TEST) + { + ereport(NOTICE, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("table access method \"%s\" does not exist", + *newval))); + } + else + { + GUC_check_errdetail("Table access method \"%s\" does not exist.", + *newval); + return false; + } + } + } + + return true; +} diff --git a/src/backend/access/table/toast_helper.c b/src/backend/access/table/toast_helper.c new file mode 100644 index 0000000..013236b --- /dev/null +++ b/src/backend/access/table/toast_helper.c @@ -0,0 +1,337 @@ +/*------------------------------------------------------------------------- + * + * toast_helper.c + * Helper functions for table AMs implementing compressed or + * out-of-line storage of varlena attributes. + * + * Copyright (c) 2000-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/access/table/toast_helper.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/detoast.h" +#include "access/table.h" +#include "access/toast_helper.h" +#include "access/toast_internals.h" +#include "catalog/pg_type_d.h" + + +/* + * Prepare to TOAST a tuple. + * + * tupleDesc, toast_values, and toast_isnull are required parameters; they + * provide the necessary details about the tuple to be toasted. 
+ * + * toast_oldvalues and toast_oldisnull should be NULL for a newly-inserted + * tuple; for an update, they should describe the existing tuple. + * + * All of these arrays should have a length equal to tupleDesc->natts. + * + * On return, toast_flags and toast_attr will have been initialized. + * toast_flags is just a single uint8, but toast_attr is a caller-provided + * array with a length equal to tupleDesc->natts. The caller need not + * perform any initialization of the array before calling this function. + */ +void +toast_tuple_init(ToastTupleContext *ttc) +{ + TupleDesc tupleDesc = ttc->ttc_rel->rd_att; + int numAttrs = tupleDesc->natts; + int i; + + ttc->ttc_flags = 0; + + for (i = 0; i < numAttrs; i++) + { + Form_pg_attribute att = TupleDescAttr(tupleDesc, i); + struct varlena *old_value; + struct varlena *new_value; + + ttc->ttc_attr[i].tai_colflags = 0; + ttc->ttc_attr[i].tai_oldexternal = NULL; + ttc->ttc_attr[i].tai_compression = att->attcompression; + + if (ttc->ttc_oldvalues != NULL) + { + /* + * For UPDATE get the old and new values of this attribute + */ + old_value = + (struct varlena *) DatumGetPointer(ttc->ttc_oldvalues[i]); + new_value = + (struct varlena *) DatumGetPointer(ttc->ttc_values[i]); + + /* + * If the old value is stored on disk, check if it has changed so + * we have to delete it later. + */ + if (att->attlen == -1 && !ttc->ttc_oldisnull[i] && + VARATT_IS_EXTERNAL_ONDISK(old_value)) + { + if (ttc->ttc_isnull[i] || + !VARATT_IS_EXTERNAL_ONDISK(new_value) || + memcmp((char *) old_value, (char *) new_value, + VARSIZE_EXTERNAL(old_value)) != 0) + { + /* + * The old external stored value isn't needed any more + * after the update + */ + ttc->ttc_attr[i].tai_colflags |= TOASTCOL_NEEDS_DELETE_OLD; + ttc->ttc_flags |= TOAST_NEEDS_DELETE_OLD; + } + else + { + /* + * This attribute isn't changed by this update so we reuse + * the original reference to the old value in the new + * tuple. + */ + ttc->ttc_attr[i].tai_colflags |= TOASTCOL_IGNORE; + continue; + } + } + } + else + { + /* + * For INSERT simply get the new value + */ + new_value = (struct varlena *) DatumGetPointer(ttc->ttc_values[i]); + } + + /* + * Handle NULL attributes + */ + if (ttc->ttc_isnull[i]) + { + ttc->ttc_attr[i].tai_colflags |= TOASTCOL_IGNORE; + ttc->ttc_flags |= TOAST_HAS_NULLS; + continue; + } + + /* + * Now look at varlena attributes + */ + if (att->attlen == -1) + { + /* + * If the table's attribute says PLAIN always, force it so. + */ + if (att->attstorage == TYPSTORAGE_PLAIN) + ttc->ttc_attr[i].tai_colflags |= TOASTCOL_IGNORE; + + /* + * We took care of UPDATE above, so any external value we find + * still in the tuple must be someone else's that we cannot reuse + * (this includes the case of an out-of-line in-memory datum). + * Fetch it back (without decompression, unless we are forcing + * PLAIN storage). If necessary, we'll push it out as a new + * external value below. 
+ */ + if (VARATT_IS_EXTERNAL(new_value)) + { + ttc->ttc_attr[i].tai_oldexternal = new_value; + if (att->attstorage == TYPSTORAGE_PLAIN) + new_value = detoast_attr(new_value); + else + new_value = detoast_external_attr(new_value); + ttc->ttc_values[i] = PointerGetDatum(new_value); + ttc->ttc_attr[i].tai_colflags |= TOASTCOL_NEEDS_FREE; + ttc->ttc_flags |= (TOAST_NEEDS_CHANGE | TOAST_NEEDS_FREE); + } + + /* + * Remember the size of this attribute + */ + ttc->ttc_attr[i].tai_size = VARSIZE_ANY(new_value); + } + else + { + /* + * Not a varlena attribute, plain storage always + */ + ttc->ttc_attr[i].tai_colflags |= TOASTCOL_IGNORE; + } + } +} + +/* + * Find the largest varlena attribute that satisfies certain criteria. + * + * The relevant column must not be marked TOASTCOL_IGNORE, and if the + * for_compression flag is passed as true, it must also not be marked + * TOASTCOL_INCOMPRESSIBLE. + * + * The column must have attstorage EXTERNAL or EXTENDED if check_main is + * false, and must have attstorage MAIN if check_main is true. + * + * The column must have a minimum size of MAXALIGN(TOAST_POINTER_SIZE); + * if not, no benefit is to be expected by compressing it. + * + * The return value is the index of the biggest suitable column, or + * -1 if there is none. + */ +int +toast_tuple_find_biggest_attribute(ToastTupleContext *ttc, + bool for_compression, bool check_main) +{ + TupleDesc tupleDesc = ttc->ttc_rel->rd_att; + int numAttrs = tupleDesc->natts; + int biggest_attno = -1; + int32 biggest_size = MAXALIGN(TOAST_POINTER_SIZE); + int32 skip_colflags = TOASTCOL_IGNORE; + int i; + + if (for_compression) + skip_colflags |= TOASTCOL_INCOMPRESSIBLE; + + for (i = 0; i < numAttrs; i++) + { + Form_pg_attribute att = TupleDescAttr(tupleDesc, i); + + if ((ttc->ttc_attr[i].tai_colflags & skip_colflags) != 0) + continue; + if (VARATT_IS_EXTERNAL(DatumGetPointer(ttc->ttc_values[i]))) + continue; /* can't happen, toast_action would be PLAIN */ + if (for_compression && + VARATT_IS_COMPRESSED(DatumGetPointer(ttc->ttc_values[i]))) + continue; + if (check_main && att->attstorage != TYPSTORAGE_MAIN) + continue; + if (!check_main && att->attstorage != TYPSTORAGE_EXTENDED && + att->attstorage != TYPSTORAGE_EXTERNAL) + continue; + + if (ttc->ttc_attr[i].tai_size > biggest_size) + { + biggest_attno = i; + biggest_size = ttc->ttc_attr[i].tai_size; + } + } + + return biggest_attno; +} + +/* + * Try compression for an attribute. + * + * If we find that the attribute is not compressible, mark it so. + */ +void +toast_tuple_try_compression(ToastTupleContext *ttc, int attribute) +{ + Datum *value = &ttc->ttc_values[attribute]; + Datum new_value; + ToastAttrInfo *attr = &ttc->ttc_attr[attribute]; + + new_value = toast_compress_datum(*value, attr->tai_compression); + + if (DatumGetPointer(new_value) != NULL) + { + /* successful compression */ + if ((attr->tai_colflags & TOASTCOL_NEEDS_FREE) != 0) + pfree(DatumGetPointer(*value)); + *value = new_value; + attr->tai_colflags |= TOASTCOL_NEEDS_FREE; + attr->tai_size = VARSIZE(DatumGetPointer(*value)); + ttc->ttc_flags |= (TOAST_NEEDS_CHANGE | TOAST_NEEDS_FREE); + } + else + { + /* incompressible, ignore on subsequent compression passes */ + attr->tai_colflags |= TOASTCOL_INCOMPRESSIBLE; + } +} + +/* + * Move an attribute to external storage. 
+ */ +void +toast_tuple_externalize(ToastTupleContext *ttc, int attribute, int options) +{ + Datum *value = &ttc->ttc_values[attribute]; + Datum old_value = *value; + ToastAttrInfo *attr = &ttc->ttc_attr[attribute]; + + attr->tai_colflags |= TOASTCOL_IGNORE; + *value = toast_save_datum(ttc->ttc_rel, old_value, attr->tai_oldexternal, + options); + if ((attr->tai_colflags & TOASTCOL_NEEDS_FREE) != 0) + pfree(DatumGetPointer(old_value)); + attr->tai_colflags |= TOASTCOL_NEEDS_FREE; + ttc->ttc_flags |= (TOAST_NEEDS_CHANGE | TOAST_NEEDS_FREE); +} + +/* + * Perform appropriate cleanup after one tuple has been subjected to TOAST. + */ +void +toast_tuple_cleanup(ToastTupleContext *ttc) +{ + TupleDesc tupleDesc = ttc->ttc_rel->rd_att; + int numAttrs = tupleDesc->natts; + + /* + * Free allocated temp values + */ + if ((ttc->ttc_flags & TOAST_NEEDS_FREE) != 0) + { + int i; + + for (i = 0; i < numAttrs; i++) + { + ToastAttrInfo *attr = &ttc->ttc_attr[i]; + + if ((attr->tai_colflags & TOASTCOL_NEEDS_FREE) != 0) + pfree(DatumGetPointer(ttc->ttc_values[i])); + } + } + + /* + * Delete external values from the old tuple + */ + if ((ttc->ttc_flags & TOAST_NEEDS_DELETE_OLD) != 0) + { + int i; + + for (i = 0; i < numAttrs; i++) + { + ToastAttrInfo *attr = &ttc->ttc_attr[i]; + + if ((attr->tai_colflags & TOASTCOL_NEEDS_DELETE_OLD) != 0) + toast_delete_datum(ttc->ttc_rel, ttc->ttc_oldvalues[i], false); + } + } +} + +/* + * Check for external stored attributes and delete them from the secondary + * relation. + */ +void +toast_delete_external(Relation rel, Datum *values, bool *isnull, + bool is_speculative) +{ + TupleDesc tupleDesc = rel->rd_att; + int numAttrs = tupleDesc->natts; + int i; + + for (i = 0; i < numAttrs; i++) + { + if (TupleDescAttr(tupleDesc, i)->attlen == -1) + { + Datum value = values[i]; + + if (isnull[i]) + continue; + else if (VARATT_IS_EXTERNAL_ONDISK(PointerGetDatum(value))) + toast_delete_datum(rel, value, is_speculative); + } + } +} diff --git a/src/backend/access/tablesample/Makefile b/src/backend/access/tablesample/Makefile new file mode 100644 index 0000000..01641e5 --- /dev/null +++ b/src/backend/access/tablesample/Makefile @@ -0,0 +1,20 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/tablesample +# +# IDENTIFICATION +# src/backend/access/tablesample/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/tablesample +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + bernoulli.o \ + system.o \ + tablesample.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/tablesample/bernoulli.c b/src/backend/access/tablesample/bernoulli.c new file mode 100644 index 0000000..ae6e4f5 --- /dev/null +++ b/src/backend/access/tablesample/bernoulli.c @@ -0,0 +1,229 @@ +/*------------------------------------------------------------------------- + * + * bernoulli.c + * support routines for BERNOULLI tablesample method + * + * To ensure repeatability of samples, it is necessary that selection of a + * given tuple be history-independent; otherwise syncscanning would break + * repeatability, to say nothing of logically-irrelevant maintenance such + * as physical extension or shortening of the relation. 
+ * + * To achieve that, we proceed by hashing each candidate TID together with + * the active seed, and then selecting it if the hash is less than the + * cutoff value computed from the selection probability by BeginSampleScan. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/tablesample/bernoulli.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include + +#include "access/tsmapi.h" +#include "catalog/pg_type.h" +#include "common/hashfn.h" +#include "optimizer/optimizer.h" +#include "utils/builtins.h" + + +/* Private state */ +typedef struct +{ + uint64 cutoff; /* select tuples with hash less than this */ + uint32 seed; /* random seed */ + OffsetNumber lt; /* last tuple returned from current block */ +} BernoulliSamplerData; + + +static void bernoulli_samplescangetsamplesize(PlannerInfo *root, + RelOptInfo *baserel, + List *paramexprs, + BlockNumber *pages, + double *tuples); +static void bernoulli_initsamplescan(SampleScanState *node, + int eflags); +static void bernoulli_beginsamplescan(SampleScanState *node, + Datum *params, + int nparams, + uint32 seed); +static OffsetNumber bernoulli_nextsampletuple(SampleScanState *node, + BlockNumber blockno, + OffsetNumber maxoffset); + + +/* + * Create a TsmRoutine descriptor for the BERNOULLI method. + */ +Datum +tsm_bernoulli_handler(PG_FUNCTION_ARGS) +{ + TsmRoutine *tsm = makeNode(TsmRoutine); + + tsm->parameterTypes = list_make1_oid(FLOAT4OID); + tsm->repeatable_across_queries = true; + tsm->repeatable_across_scans = true; + tsm->SampleScanGetSampleSize = bernoulli_samplescangetsamplesize; + tsm->InitSampleScan = bernoulli_initsamplescan; + tsm->BeginSampleScan = bernoulli_beginsamplescan; + tsm->NextSampleBlock = NULL; + tsm->NextSampleTuple = bernoulli_nextsampletuple; + tsm->EndSampleScan = NULL; + + PG_RETURN_POINTER(tsm); +} + +/* + * Sample size estimation. + */ +static void +bernoulli_samplescangetsamplesize(PlannerInfo *root, + RelOptInfo *baserel, + List *paramexprs, + BlockNumber *pages, + double *tuples) +{ + Node *pctnode; + float4 samplefract; + + /* Try to extract an estimate for the sample percentage */ + pctnode = (Node *) linitial(paramexprs); + pctnode = estimate_expression_value(root, pctnode); + + if (IsA(pctnode, Const) && + !((Const *) pctnode)->constisnull) + { + samplefract = DatumGetFloat4(((Const *) pctnode)->constvalue); + if (samplefract >= 0 && samplefract <= 100 && !isnan(samplefract)) + samplefract /= 100.0f; + else + { + /* Default samplefract if the value is bogus */ + samplefract = 0.1f; + } + } + else + { + /* Default samplefract if we didn't obtain a non-null Const */ + samplefract = 0.1f; + } + + /* We'll visit all pages of the baserel */ + *pages = baserel->pages; + + *tuples = clamp_row_est(baserel->tuples * samplefract); +} + +/* + * Initialize during executor setup. + */ +static void +bernoulli_initsamplescan(SampleScanState *node, int eflags) +{ + node->tsm_state = palloc0(sizeof(BernoulliSamplerData)); +} + +/* + * Examine parameters and prepare for a sample scan. 
+ */ +static void +bernoulli_beginsamplescan(SampleScanState *node, + Datum *params, + int nparams, + uint32 seed) +{ + BernoulliSamplerData *sampler = (BernoulliSamplerData *) node->tsm_state; + double percent = DatumGetFloat4(params[0]); + double dcutoff; + + if (percent < 0 || percent > 100 || isnan(percent)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT), + errmsg("sample percentage must be between 0 and 100"))); + + /* + * The cutoff is sample probability times (PG_UINT32_MAX + 1); we have to + * store that as a uint64, of course. Note that this gives strictly + * correct behavior at the limits of zero or one probability. + */ + dcutoff = rint(((double) PG_UINT32_MAX + 1) * percent / 100); + sampler->cutoff = (uint64) dcutoff; + sampler->seed = seed; + sampler->lt = InvalidOffsetNumber; + + /* + * Use bulkread, since we're scanning all pages. But pagemode visibility + * checking is a win only at larger sampling fractions. The 25% cutoff + * here is based on very limited experimentation. + */ + node->use_bulkread = true; + node->use_pagemode = (percent >= 25); +} + +/* + * Select next sampled tuple in current block. + * + * It is OK here to return an offset without knowing if the tuple is visible + * (or even exists). The reason is that we do the coinflip for every tuple + * offset in the table. Since all tuples have the same probability of being + * returned, it doesn't matter if we do extra coinflips for invisible tuples. + * + * When we reach end of the block, return InvalidOffsetNumber which tells + * SampleScan to go to next block. + */ +static OffsetNumber +bernoulli_nextsampletuple(SampleScanState *node, + BlockNumber blockno, + OffsetNumber maxoffset) +{ + BernoulliSamplerData *sampler = (BernoulliSamplerData *) node->tsm_state; + OffsetNumber tupoffset = sampler->lt; + uint32 hashinput[3]; + + /* Advance to first/next tuple in block */ + if (tupoffset == InvalidOffsetNumber) + tupoffset = FirstOffsetNumber; + else + tupoffset++; + + /* + * We compute the hash by applying hash_any to an array of 3 uint32's + * containing the block, offset, and seed. This is efficient to set up, + * and with the current implementation of hash_any, it gives + * machine-independent results, which is a nice property for regression + * testing. + * + * These words in the hash input are the same throughout the block: + */ + hashinput[0] = blockno; + hashinput[2] = sampler->seed; + + /* + * Loop over tuple offsets until finding suitable TID or reaching end of + * block. + */ + for (; tupoffset <= maxoffset; tupoffset++) + { + uint32 hash; + + hashinput[1] = tupoffset; + + hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput, + (int) sizeof(hashinput))); + if (hash < sampler->cutoff) + break; + } + + if (tupoffset > maxoffset) + tupoffset = InvalidOffsetNumber; + + sampler->lt = tupoffset; + + return tupoffset; +} diff --git a/src/backend/access/tablesample/system.c b/src/backend/access/tablesample/system.c new file mode 100644 index 0000000..b0869e5 --- /dev/null +++ b/src/backend/access/tablesample/system.c @@ -0,0 +1,257 @@ +/*------------------------------------------------------------------------- + * + * system.c + * support routines for SYSTEM tablesample method + * + * To ensure repeatability of samples, it is necessary that selection of a + * given tuple be history-independent; otherwise syncscanning would break + * repeatability, to say nothing of logically-irrelevant maintenance such + * as physical extension or shortening of the relation. 
+ * + * To achieve that, we proceed by hashing each candidate block number together + * with the active seed, and then selecting it if the hash is less than the + * cutoff value computed from the selection probability by BeginSampleScan. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/tablesample/system.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include + +#include "access/relscan.h" +#include "access/tsmapi.h" +#include "catalog/pg_type.h" +#include "common/hashfn.h" +#include "optimizer/optimizer.h" +#include "utils/builtins.h" + + +/* Private state */ +typedef struct +{ + uint64 cutoff; /* select blocks with hash less than this */ + uint32 seed; /* random seed */ + BlockNumber nextblock; /* next block to consider sampling */ + OffsetNumber lt; /* last tuple returned from current block */ +} SystemSamplerData; + + +static void system_samplescangetsamplesize(PlannerInfo *root, + RelOptInfo *baserel, + List *paramexprs, + BlockNumber *pages, + double *tuples); +static void system_initsamplescan(SampleScanState *node, + int eflags); +static void system_beginsamplescan(SampleScanState *node, + Datum *params, + int nparams, + uint32 seed); +static BlockNumber system_nextsampleblock(SampleScanState *node, BlockNumber nblocks); +static OffsetNumber system_nextsampletuple(SampleScanState *node, + BlockNumber blockno, + OffsetNumber maxoffset); + + +/* + * Create a TsmRoutine descriptor for the SYSTEM method. + */ +Datum +tsm_system_handler(PG_FUNCTION_ARGS) +{ + TsmRoutine *tsm = makeNode(TsmRoutine); + + tsm->parameterTypes = list_make1_oid(FLOAT4OID); + tsm->repeatable_across_queries = true; + tsm->repeatable_across_scans = true; + tsm->SampleScanGetSampleSize = system_samplescangetsamplesize; + tsm->InitSampleScan = system_initsamplescan; + tsm->BeginSampleScan = system_beginsamplescan; + tsm->NextSampleBlock = system_nextsampleblock; + tsm->NextSampleTuple = system_nextsampletuple; + tsm->EndSampleScan = NULL; + + PG_RETURN_POINTER(tsm); +} + +/* + * Sample size estimation. + */ +static void +system_samplescangetsamplesize(PlannerInfo *root, + RelOptInfo *baserel, + List *paramexprs, + BlockNumber *pages, + double *tuples) +{ + Node *pctnode; + float4 samplefract; + + /* Try to extract an estimate for the sample percentage */ + pctnode = (Node *) linitial(paramexprs); + pctnode = estimate_expression_value(root, pctnode); + + if (IsA(pctnode, Const) && + !((Const *) pctnode)->constisnull) + { + samplefract = DatumGetFloat4(((Const *) pctnode)->constvalue); + if (samplefract >= 0 && samplefract <= 100 && !isnan(samplefract)) + samplefract /= 100.0f; + else + { + /* Default samplefract if the value is bogus */ + samplefract = 0.1f; + } + } + else + { + /* Default samplefract if we didn't obtain a non-null Const */ + samplefract = 0.1f; + } + + /* We'll visit a sample of the pages ... */ + *pages = clamp_row_est(baserel->pages * samplefract); + + /* ... and hopefully get a representative number of tuples from them */ + *tuples = clamp_row_est(baserel->tuples * samplefract); +} + +/* + * Initialize during executor setup. + */ +static void +system_initsamplescan(SampleScanState *node, int eflags) +{ + node->tsm_state = palloc0(sizeof(SystemSamplerData)); +} + +/* + * Examine parameters and prepare for a sample scan. 
+ */ +static void +system_beginsamplescan(SampleScanState *node, + Datum *params, + int nparams, + uint32 seed) +{ + SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state; + double percent = DatumGetFloat4(params[0]); + double dcutoff; + + if (percent < 0 || percent > 100 || isnan(percent)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT), + errmsg("sample percentage must be between 0 and 100"))); + + /* + * The cutoff is sample probability times (PG_UINT32_MAX + 1); we have to + * store that as a uint64, of course. Note that this gives strictly + * correct behavior at the limits of zero or one probability. + */ + dcutoff = rint(((double) PG_UINT32_MAX + 1) * percent / 100); + sampler->cutoff = (uint64) dcutoff; + sampler->seed = seed; + sampler->nextblock = 0; + sampler->lt = InvalidOffsetNumber; + + /* + * Bulkread buffer access strategy probably makes sense unless we're + * scanning a very small fraction of the table. The 1% cutoff here is a + * guess. We should use pagemode visibility checking, since we scan all + * tuples on each selected page. + */ + node->use_bulkread = (percent >= 1); + node->use_pagemode = true; +} + +/* + * Select next block to sample. + */ +static BlockNumber +system_nextsampleblock(SampleScanState *node, BlockNumber nblocks) +{ + SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state; + BlockNumber nextblock = sampler->nextblock; + uint32 hashinput[2]; + + /* + * We compute the hash by applying hash_any to an array of 2 uint32's + * containing the block number and seed. This is efficient to set up, and + * with the current implementation of hash_any, it gives + * machine-independent results, which is a nice property for regression + * testing. + * + * These words in the hash input are the same throughout the block: + */ + hashinput[1] = sampler->seed; + + /* + * Loop over block numbers until finding suitable block or reaching end of + * relation. + */ + for (; nextblock < nblocks; nextblock++) + { + uint32 hash; + + hashinput[0] = nextblock; + + hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput, + (int) sizeof(hashinput))); + if (hash < sampler->cutoff) + break; + } + + if (nextblock < nblocks) + { + /* Found a suitable block; remember where we should start next time */ + sampler->nextblock = nextblock + 1; + return nextblock; + } + + /* Done, but let's reset nextblock to 0 for safety. */ + sampler->nextblock = 0; + return InvalidBlockNumber; +} + +/* + * Select next sampled tuple in current block. + * + * In block sampling, we just want to sample all the tuples in each selected + * block. + * + * It is OK here to return an offset without knowing if the tuple is visible + * (or even exists); nodeSamplescan.c will deal with that. + * + * When we reach end of the block, return InvalidOffsetNumber which tells + * SampleScan to go to next block. + */ +static OffsetNumber +system_nextsampletuple(SampleScanState *node, + BlockNumber blockno, + OffsetNumber maxoffset) +{ + SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state; + OffsetNumber tupoffset = sampler->lt; + + /* Advance to next possible offset on page */ + if (tupoffset == InvalidOffsetNumber) + tupoffset = FirstOffsetNumber; + else + tupoffset++; + + /* Done? 
*/ + if (tupoffset > maxoffset) + tupoffset = InvalidOffsetNumber; + + sampler->lt = tupoffset; + + return tupoffset; +} diff --git a/src/backend/access/tablesample/tablesample.c b/src/backend/access/tablesample/tablesample.c new file mode 100644 index 0000000..02f2a95 --- /dev/null +++ b/src/backend/access/tablesample/tablesample.c @@ -0,0 +1,40 @@ +/*------------------------------------------------------------------------- + * + * tablesample.c + * Support functions for TABLESAMPLE feature + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/tablesample/tablesample.c + * + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/tsmapi.h" + + +/* + * GetTsmRoutine --- get a TsmRoutine struct by invoking the handler. + * + * This is a convenience routine that's just meant to check for errors. + */ +TsmRoutine * +GetTsmRoutine(Oid tsmhandler) +{ + Datum datum; + TsmRoutine *routine; + + datum = OidFunctionCall1(tsmhandler, PointerGetDatum(NULL)); + routine = (TsmRoutine *) DatumGetPointer(datum); + + if (routine == NULL || !IsA(routine, TsmRoutine)) + elog(ERROR, "tablesample handler function %u did not return a TsmRoutine struct", + tsmhandler); + + return routine; +} diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile new file mode 100644 index 0000000..595e02d --- /dev/null +++ b/src/backend/access/transam/Makefile @@ -0,0 +1,40 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/transam +# +# IDENTIFICATION +# src/backend/access/transam/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/transam +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + clog.o \ + commit_ts.o \ + generic_xlog.o \ + multixact.o \ + parallel.o \ + rmgr.o \ + slru.o \ + subtrans.o \ + timeline.o \ + transam.o \ + twophase.o \ + twophase_rmgr.o \ + varsup.o \ + xact.o \ + xlog.o \ + xlogarchive.o \ + xlogfuncs.o \ + xloginsert.o \ + xlogreader.o \ + xlogutils.o + +include $(top_srcdir)/src/backend/common.mk + +# ensure that version checks in xlog.c get recompiled when catversion.h changes +xlog.o: xlog.c $(top_srcdir)/src/include/catalog/catversion.h diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README new file mode 100644 index 0000000..1edc818 --- /dev/null +++ b/src/backend/access/transam/README @@ -0,0 +1,896 @@ +src/backend/access/transam/README + +The Transaction System +====================== + +PostgreSQL's transaction system is a three-layer system. The bottom layer +implements low-level transactions and subtransactions, on top of which rests +the mainloop's control code, which in turn implements user-visible +transactions and savepoints. + +The middle layer of code is called by postgres.c before and after the +processing of each query, or after detecting an error: + + StartTransactionCommand + CommitTransactionCommand + AbortCurrentTransaction + +Meanwhile, the user can alter the system's state by issuing the SQL commands +BEGIN, COMMIT, ROLLBACK, SAVEPOINT, ROLLBACK TO or RELEASE. 
The traffic cop +redirects these calls to the toplevel routines + + BeginTransactionBlock + EndTransactionBlock + UserAbortTransactionBlock + DefineSavepoint + RollbackToSavepoint + ReleaseSavepoint + +respectively. Depending on the current state of the system, these functions +call low level functions to activate the real transaction system: + + StartTransaction + CommitTransaction + AbortTransaction + CleanupTransaction + StartSubTransaction + CommitSubTransaction + AbortSubTransaction + CleanupSubTransaction + +Additionally, within a transaction, CommandCounterIncrement is called to +increment the command counter, which allows future commands to "see" the +effects of previous commands within the same transaction. Note that this is +done automatically by CommitTransactionCommand after each query inside a +transaction block, but some utility functions also do it internally to allow +some operations (usually in the system catalogs) to be seen by future +operations in the same utility command. (For example, in DefineRelation it is +done after creating the heap so the pg_class row is visible, to be able to +lock it.) + + +For example, consider the following sequence of user commands: + +1) BEGIN +2) SELECT * FROM foo +3) INSERT INTO foo VALUES (...) +4) COMMIT + +In the main processing loop, this results in the following function call +sequence: + + / StartTransactionCommand; + / StartTransaction; +1) < ProcessUtility; << BEGIN + \ BeginTransactionBlock; + \ CommitTransactionCommand; + + / StartTransactionCommand; +2) / PortalRunSelect; << SELECT ... + \ CommitTransactionCommand; + \ CommandCounterIncrement; + + / StartTransactionCommand; +3) / ProcessQuery; << INSERT ... + \ CommitTransactionCommand; + \ CommandCounterIncrement; + + / StartTransactionCommand; + / ProcessUtility; << COMMIT +4) < EndTransactionBlock; + \ CommitTransactionCommand; + \ CommitTransaction; + +The point of this example is to demonstrate the need for +StartTransactionCommand and CommitTransactionCommand to be state smart -- they +should call CommandCounterIncrement between the calls to BeginTransactionBlock +and EndTransactionBlock and outside these calls they need to do normal start, +commit or abort processing. + +Furthermore, suppose the "SELECT * FROM foo" caused an abort condition. In +this case AbortCurrentTransaction is called, and the transaction is put in +aborted state. In this state, any user input is ignored except for +transaction-termination statements, or ROLLBACK TO commands. + +Transaction aborts can occur in two ways: + +1) system dies from some internal cause (syntax error, etc) +2) user types ROLLBACK + +The reason we have to distinguish them is illustrated by the following two +situations: + + case 1 case 2 + ------ ------ +1) user types BEGIN 1) user types BEGIN +2) user does something 2) user does something +3) user does not like what 3) system aborts for some reason + she sees and types ABORT (syntax error, etc) + +In case 1, we want to abort the transaction and return to the default state. +In case 2, there may be more commands coming our way which are part of the +same transaction block; we have to ignore these commands until we see a COMMIT +or ROLLBACK. + +Internal aborts are handled by AbortCurrentTransaction, while user aborts are +handled by UserAbortTransactionBlock. Both of them rely on AbortTransaction +to do all the real work. 
The only difference is what state we enter after +AbortTransaction does its work: + +* AbortCurrentTransaction leaves us in TBLOCK_ABORT, +* UserAbortTransactionBlock leaves us in TBLOCK_ABORT_END + +Low-level transaction abort handling is divided in two phases: +* AbortTransaction executes as soon as we realize the transaction has + failed. It should release all shared resources (locks etc) so that we do + not delay other backends unnecessarily. +* CleanupTransaction executes when we finally see a user COMMIT + or ROLLBACK command; it cleans things up and gets us out of the transaction + completely. In particular, we mustn't destroy TopTransactionContext until + this point. + +Also, note that when a transaction is committed, we don't close it right away. +Rather it's put in TBLOCK_END state, which means that when +CommitTransactionCommand is called after the query has finished processing, +the transaction has to be closed. The distinction is subtle but important, +because it means that control will leave the xact.c code with the transaction +open, and the main loop will be able to keep processing inside the same +transaction. So, in a sense, transaction commit is also handled in two +phases, the first at EndTransactionBlock and the second at +CommitTransactionCommand (which is where CommitTransaction is actually +called). + +The rest of the code in xact.c are routines to support the creation and +finishing of transactions and subtransactions. For example, AtStart_Memory +takes care of initializing the memory subsystem at main transaction start. + + +Subtransaction Handling +----------------------- + +Subtransactions are implemented using a stack of TransactionState structures, +each of which has a pointer to its parent transaction's struct. When a new +subtransaction is to be opened, PushTransaction is called, which creates a new +TransactionState, with its parent link pointing to the current transaction. +StartSubTransaction is in charge of initializing the new TransactionState to +sane values, and properly initializing other subsystems (AtSubStart routines). + +When closing a subtransaction, either CommitSubTransaction has to be called +(if the subtransaction is committing), or AbortSubTransaction and +CleanupSubTransaction (if it's aborting). In either case, PopTransaction is +called so the system returns to the parent transaction. + +One important point regarding subtransaction handling is that several may need +to be closed in response to a single user command. That's because savepoints +have names, and we allow to commit or rollback a savepoint by name, which is +not necessarily the one that was last opened. Also a COMMIT or ROLLBACK +command must be able to close out the entire stack. We handle this by having +the utility command subroutine mark all the state stack entries as commit- +pending or abort-pending, and then when the main loop reaches +CommitTransactionCommand, the real work is done. The main point of doing +things this way is that if we get an error while popping state stack entries, +the remaining stack entries still show what we need to do to finish up. + +In the case of ROLLBACK TO , we abort all the subtransactions up +through the one identified by the savepoint name, and then re-create that +subtransaction level with the same name. So it's a completely new +subtransaction as far as the internals are concerned. + +Other subsystems are allowed to start "internal" subtransactions, which are +handled by BeginInternalSubTransaction. 
This is to allow implementing +exception handling, e.g. in PL/pgSQL. ReleaseCurrentSubTransaction and +RollbackAndReleaseCurrentSubTransaction allows the subsystem to close said +subtransactions. The main difference between this and the savepoint/release +path is that we execute the complete state transition immediately in each +subroutine, rather than deferring some work until CommitTransactionCommand. +Another difference is that BeginInternalSubTransaction is allowed when no +explicit transaction block has been established, while DefineSavepoint is not. + + +Transaction and Subtransaction Numbering +---------------------------------------- + +Transactions and subtransactions are assigned permanent XIDs only when/if +they first do something that requires one --- typically, insert/update/delete +a tuple, though there are a few other places that need an XID assigned. +If a subtransaction requires an XID, we always first assign one to its +parent. This maintains the invariant that child transactions have XIDs later +than their parents, which is assumed in a number of places. + +The subsidiary actions of obtaining a lock on the XID and entering it into +pg_subtrans and PG_PROC are done at the time it is assigned. + +A transaction that has no XID still needs to be identified for various +purposes, notably holding locks. For this purpose we assign a "virtual +transaction ID" or VXID to each top-level transaction. VXIDs are formed from +two fields, the backendID and a backend-local counter; this arrangement allows +assignment of a new VXID at transaction start without any contention for +shared memory. To ensure that a VXID isn't re-used too soon after backend +exit, we store the last local counter value into shared memory at backend +exit, and initialize it from the previous value for the same backendID slot +at backend start. All these counters go back to zero at shared memory +re-initialization, but that's OK because VXIDs never appear anywhere on-disk. + +Internally, a backend needs a way to identify subtransactions whether or not +they have XIDs; but this need only lasts as long as the parent top transaction +endures. Therefore, we have SubTransactionId, which is somewhat like +CommandId in that it's generated from a counter that we reset at the start of +each top transaction. The top-level transaction itself has SubTransactionId 1, +and subtransactions have IDs 2 and up. (Zero is reserved for +InvalidSubTransactionId.) Note that subtransactions do not have their +own VXIDs; they use the parent top transaction's VXID. + + +Interlocking Transaction Begin, Transaction End, and Snapshots +-------------------------------------------------------------- + +We try hard to minimize the amount of overhead and lock contention involved +in the frequent activities of beginning/ending a transaction and taking a +snapshot. Unfortunately, we must have some interlocking for this, because +we must ensure consistency about the commit order of transactions. +For example, suppose an UPDATE in xact A is blocked by xact B's prior +update of the same row, and xact B is doing commit while xact C gets a +snapshot. Xact A can complete and commit as soon as B releases its locks. +If xact C's GetSnapshotData sees xact B as still running, then it had +better see xact A as still running as well, or it will be able to see two +tuple versions - one deleted by xact B and one inserted by xact A. 
Another +reason why this would be bad is that C would see (in the row inserted by A) +earlier changes by B, and it would be inconsistent for C not to see any +of B's changes elsewhere in the database. + +Formally, the correctness requirement is "if a snapshot A considers +transaction X as committed, and any of transaction X's snapshots considered +transaction Y as committed, then snapshot A must consider transaction Y as +committed". + +What we actually enforce is strict serialization of commits and rollbacks +with snapshot-taking: we do not allow any transaction to exit the set of +running transactions while a snapshot is being taken. (This rule is +stronger than necessary for consistency, but is relatively simple to +enforce, and it assists with some other issues as explained below.) The +implementation of this is that GetSnapshotData takes the ProcArrayLock in +shared mode (so that multiple backends can take snapshots in parallel), +but ProcArrayEndTransaction must take the ProcArrayLock in exclusive mode +while clearing the ProcGlobal->xids[] entry at transaction end (either +commit or abort). (To reduce context switching, when multiple transactions +commit nearly simultaneously, we have one backend take ProcArrayLock and +clear the XIDs of multiple processes at once.) + +ProcArrayEndTransaction also holds the lock while advancing the shared +latestCompletedXid variable. This allows GetSnapshotData to use +latestCompletedXid + 1 as xmax for its snapshot: there can be no +transaction >= this xid value that the snapshot needs to consider as +completed. + +In short, then, the rule is that no transaction may exit the set of +currently-running transactions between the time we fetch latestCompletedXid +and the time we finish building our snapshot. However, this restriction +only applies to transactions that have an XID --- read-only transactions +can end without acquiring ProcArrayLock, since they don't affect anyone +else's snapshot nor latestCompletedXid. + +Transaction start, per se, doesn't have any interlocking with these +considerations, since we no longer assign an XID immediately at transaction +start. But when we do decide to allocate an XID, GetNewTransactionId must +store the new XID into the shared ProcArray before releasing XidGenLock. +This ensures that all top-level XIDs <= latestCompletedXid are either +present in the ProcArray, or not running anymore. (This guarantee doesn't +apply to subtransaction XIDs, because of the possibility that there's not +room for them in the subxid array; instead we guarantee that they are +present or the overflow flag is set.) If a backend released XidGenLock +before storing its XID into ProcGlobal->xids[], then it would be possible for +another backend to allocate and commit a later XID, causing latestCompletedXid +to pass the first backend's XID, before that value became visible in the +ProcArray. That would break ComputeXidHorizons, as discussed below. + +We allow GetNewTransactionId to store the XID into ProcGlobal->xids[] (or the +subxid array) without taking ProcArrayLock. This was once necessary to +avoid deadlock; while that is no longer the case, it's still beneficial for +performance. We are thereby relying on fetch/store of an XID to be atomic, +else other backends might see a partially-set XID. This also means that +readers of the ProcArray xid fields must be careful to fetch a value only +once, rather than assume they can read it multiple times and get the same +answer each time. 
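+
+As a concrete illustration, the fetch-once idiom might look like the hedged
+sketch below; the function and variable names here are invented for the
+example and are not taken from the actual procarray.c code:
+
+    /*
+     * Hedged sketch: read the shared xid slot exactly once into a local
+     * copy, then make every later test against that copy.
+     */
+    static bool
+    SharedXidMatches(volatile TransactionId *shared_xid_slot,
+                     TransactionId target)
+    {
+        TransactionId xid = *shared_xid_slot;   /* single read */
+
+        return TransactionIdIsValid(xid) && TransactionIdEquals(xid, target);
+    }
+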
(Use volatile-qualified pointers when doing this, to +ensure that the C compiler does exactly what you tell it to.) + +Another important activity that uses the shared ProcArray is +ComputeXidHorizons, which must determine a lower bound for the oldest xmin +of any active MVCC snapshot, system-wide. Each individual backend +advertises the smallest xmin of its own snapshots in MyProc->xmin, or zero +if it currently has no live snapshots (eg, if it's between transactions or +hasn't yet set a snapshot for a new transaction). ComputeXidHorizons takes +the MIN() of the valid xmin fields. It does this with only shared lock on +ProcArrayLock, which means there is a potential race condition against other +backends doing GetSnapshotData concurrently: we must be certain that a +concurrent backend that is about to set its xmin does not compute an xmin +less than what ComputeXidHorizons determines. We ensure that by including +all the active XIDs into the MIN() calculation, along with the valid xmins. +The rule that transactions can't exit without taking exclusive ProcArrayLock +ensures that concurrent holders of shared ProcArrayLock will compute the +same minimum of currently-active XIDs: no xact, in particular not the +oldest, can exit while we hold shared ProcArrayLock. So +ComputeXidHorizons's view of the minimum active XID will be the same as that +of any concurrent GetSnapshotData, and so it can't produce an overestimate. +If there is no active transaction at all, ComputeXidHorizons uses +latestCompletedXid + 1, which is a lower bound for the xmin that might +be computed by concurrent or later GetSnapshotData calls. (We know that no +XID less than this could be about to appear in the ProcArray, because of the +XidGenLock interlock discussed above.) + +As GetSnapshotData is performance critical, it does not perform an accurate +oldest-xmin calculation (it used to, until v14). The contents of a snapshot +only depend on the xids of other backends, not their xmin. As backend's xmin +changes much more often than its xid, having GetSnapshotData look at xmins +can lead to a lot of unnecessary cacheline ping-pong. Instead +GetSnapshotData updates approximate thresholds (one that guarantees that all +deleted rows older than it can be removed, another determining that deleted +rows newer than it can not be removed). GlobalVisTest* uses those threshold +to make invisibility decision, falling back to ComputeXidHorizons if +necessary. + +Note that while it is certain that two concurrent executions of +GetSnapshotData will compute the same xmin for their own snapshots, there is +no such guarantee for the horizons computed by ComputeXidHorizons. This is +because we allow XID-less transactions to clear their MyProc->xmin +asynchronously (without taking ProcArrayLock), so one execution might see +what had been the oldest xmin, and another not. This is OK since the +thresholds need only be a valid lower bound. As noted above, we are already +assuming that fetch/store of the xid fields is atomic, so assuming it for +xmin as well is no extra risk. + + +pg_xact and pg_subtrans +----------------------- + +pg_xact and pg_subtrans are permanent (on-disk) storage of transaction related +information. There is a limited number of pages of each kept in memory, so +in many cases there is no need to actually read from disk. However, if +there's a long running transaction or a backend sitting idle with an open +transaction, it may be necessary to be able to read and write this information +from disk. 
They also allow information to be permanent across server restarts. + +pg_xact records the commit status for each transaction that has been assigned +an XID. A transaction can be in progress, committed, aborted, or +"sub-committed". This last state means that it's a subtransaction that's no +longer running, but its parent has not updated its state yet. It is not +necessary to update a subtransaction's transaction status to subcommit, so we +can just defer it until main transaction commit. The main role of marking +transactions as sub-committed is to provide an atomic commit protocol when +transaction status is spread across multiple clog pages. As a result, whenever +transaction status spreads across multiple pages we must use a two-phase commit +protocol: the first phase is to mark the subtransactions as sub-committed, then +we mark the top level transaction and all its subtransactions committed (in +that order). Thus, subtransactions that have not aborted appear as in-progress +even when they have already finished, and the subcommit status appears as a +very short transitory state during main transaction commit. Subtransaction +abort is always marked in clog as soon as it occurs. When the transaction +status all fit in a single CLOG page, we atomically mark them all as committed +without bothering with the intermediate sub-commit state. + +Savepoints are implemented using subtransactions. A subtransaction is a +transaction inside a transaction; its commit or abort status is not only +dependent on whether it committed itself, but also whether its parent +transaction committed. To implement multiple savepoints in a transaction we +allow unlimited transaction nesting depth, so any particular subtransaction's +commit state is dependent on the commit status of each and every ancestor +transaction. + +The "subtransaction parent" (pg_subtrans) mechanism records, for each +transaction with an XID, the TransactionId of its parent transaction. This +information is stored as soon as the subtransaction is assigned an XID. +Top-level transactions do not have a parent, so they leave their pg_subtrans +entries set to the default value of zero (InvalidTransactionId). + +pg_subtrans is used to check whether the transaction in question is still +running --- the main Xid of a transaction is recorded in ProcGlobal->xids[], +with a copy in PGPROC->xid, but since we allow arbitrary nesting of +subtransactions, we can't fit all Xids in shared memory, so we have to store +them on disk. Note, however, that for each transaction we keep a "cache" of +Xids that are known to be part of the transaction tree, so we can skip looking +at pg_subtrans unless we know the cache has been overflowed. See +storage/ipc/procarray.c for the gory details. + +slru.c is the supporting mechanism for both pg_xact and pg_subtrans. It +implements the LRU policy for in-memory buffer pages. The high-level routines +for pg_xact are implemented in transam.c, while the low-level functions are in +clog.c. pg_subtrans is contained completely in subtrans.c. + + +Write-Ahead Log Coding +---------------------- + +The WAL subsystem (also called XLOG in the code) exists to guarantee crash +recovery. It can also be used to provide point-in-time recovery, as well as +hot-standby replication via log shipping. Here are some notes about +non-obvious aspects of its design. + +A basic assumption of a write AHEAD log is that log entries must reach stable +storage before the data-page changes they describe. 
This ensures that +replaying the log to its end will bring us to a consistent state where there +are no partially-performed transactions. To guarantee this, each data page +(either heap or index) is marked with the LSN (log sequence number --- in +practice, a WAL file location) of the latest XLOG record affecting the page. +Before the bufmgr can write out a dirty page, it must ensure that xlog has +been flushed to disk at least up to the page's LSN. This low-level +interaction improves performance by not waiting for XLOG I/O until necessary. +The LSN check exists only in the shared-buffer manager, not in the local +buffer manager used for temp tables; hence operations on temp tables must not +be WAL-logged. + +During WAL replay, we can check the LSN of a page to detect whether the change +recorded by the current log entry is already applied (it has been, if the page +LSN is >= the log entry's WAL location). + +Usually, log entries contain just enough information to redo a single +incremental update on a page (or small group of pages). This will work only +if the filesystem and hardware implement data page writes as atomic actions, +so that a page is never left in a corrupt partly-written state. Since that's +often an untenable assumption in practice, we log additional information to +allow complete reconstruction of modified pages. The first WAL record +affecting a given page after a checkpoint is made to contain a copy of the +entire page, and we implement replay by restoring that page copy instead of +redoing the update. (This is more reliable than the data storage itself would +be because we can check the validity of the WAL record's CRC.) We can detect +the "first change after checkpoint" by noting whether the page's old LSN +precedes the end of WAL as of the last checkpoint (the RedoRecPtr). + +The general schema for executing a WAL-logged action is + +1. Pin and exclusive-lock the shared buffer(s) containing the data page(s) +to be modified. + +2. START_CRIT_SECTION() (Any error during the next three steps must cause a +PANIC because the shared buffers will contain unlogged changes, which we +have to ensure don't get to disk. Obviously, you should check conditions +such as whether there's enough free space on the page before you start the +critical section.) + +3. Apply the required changes to the shared buffer(s). + +4. Mark the shared buffer(s) as dirty with MarkBufferDirty(). (This must +happen before the WAL record is inserted; see notes in SyncOneBuffer().) +Note that marking a buffer dirty with MarkBufferDirty() should only +happen iff you write a WAL record; see Writing Hints below. + +5. If the relation requires WAL-logging, build a WAL record using +XLogBeginInsert and XLogRegister* functions, and insert it. (See +"Constructing a WAL record" below). Then update the page's LSN using the +returned XLOG location. For instance, + + XLogBeginInsert(); + XLogRegisterBuffer(...) + XLogRegisterData(...) + recptr = XLogInsert(rmgr_id, info); + + PageSetLSN(dp, recptr); + +6. END_CRIT_SECTION() + +7. Unlock and unpin the buffer(s). + +Complex changes (such as a multilevel index insertion) normally need to be +described by a series of atomic-action WAL records. The intermediate states +must be self-consistent, so that if the replay is interrupted between any +two actions, the system is fully functional. 
In btree indexes, for example, +a page split requires a new page to be allocated, and an insertion of a new +key in the parent btree level, but for locking reasons this has to be +reflected by two separate WAL records. Replaying the first record, to +allocate the new page and move tuples to it, sets a flag on the page to +indicate that the key has not been inserted to the parent yet. Replaying the +second record clears the flag. This intermediate state is never seen by +other backends during normal operation, because the lock on the child page +is held across the two actions, but will be seen if the operation is +interrupted before writing the second WAL record. The search algorithm works +with the intermediate state as normal, but if an insertion encounters a page +with the incomplete-split flag set, it will finish the interrupted split by +inserting the key to the parent, before proceeding. + + +Constructing a WAL record +------------------------- + +A WAL record consists of a header common to all WAL record types, +record-specific data, and information about the data blocks modified. Each +modified data block is identified by an ID number, and can optionally have +more record-specific data associated with the block. If XLogInsert decides +that a full-page image of a block needs to be taken, the data associated +with that block is not included. + +The API for constructing a WAL record consists of five functions: +XLogBeginInsert, XLogRegisterBuffer, XLogRegisterData, XLogRegisterBufData, +and XLogInsert. First, call XLogBeginInsert(). Then register all the buffers +modified, and data needed to replay the changes, using XLogRegister* +functions. Finally, insert the constructed record to the WAL by calling +XLogInsert(). + + XLogBeginInsert(); + + /* register buffers modified as part of this WAL-logged action */ + XLogRegisterBuffer(0, lbuffer, REGBUF_STANDARD); + XLogRegisterBuffer(1, rbuffer, REGBUF_STANDARD); + + /* register data that is always included in the WAL record */ + XLogRegisterData(&xlrec, SizeOfFictionalAction); + + /* + * register data associated with a buffer. This will not be included + * in the record if a full-page image is taken. + */ + XLogRegisterBufData(0, tuple->data, tuple->len); + + /* more data associated with the buffer */ + XLogRegisterBufData(0, data2, len2); + + /* + * Ok, all the data and buffers to include in the WAL record have + * been registered. Insert the record. + */ + recptr = XLogInsert(RM_FOO_ID, XLOG_FOOBAR_DO_STUFF); + +Details of the API functions: + +void XLogBeginInsert(void) + + Must be called before XLogRegisterBuffer and XLogRegisterData. + +void XLogResetInsertion(void) + + Clear any currently registered data and buffers from the WAL record + construction workspace. This is only needed if you have already called + XLogBeginInsert(), but decide to not insert the record after all. + +void XLogEnsureRecordSpace(int max_block_id, int ndatas) + + Normally, the WAL record construction buffers have the following limits: + + * highest block ID that can be used is 4 (allowing five block references) + * Max 20 chunks of registered data + + These default limits are enough for most record types that change some + on-disk structures. For the odd case that requires more data, or needs to + modify more buffers, these limits can be raised by calling + XLogEnsureRecordSpace(). XLogEnsureRecordSpace() must be called before + XLogBeginInsert(), and outside a critical section. 
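+
+  For instance, an action that needs to register more than the default five
+  buffers might be set up roughly as follows. This is a hedged sketch:
+  RM_FOO_ID, XLOG_FOOBAR_MANY_PAGES, and the buffer array are placeholders,
+  and the usual page modification, MarkBufferDirty, and PageSetLSN steps are
+  only indicated by comments.
+
+      XLogEnsureRecordSpace(nbuffers - 1, 0);   /* block IDs 0 .. nbuffers-1 */
+
+      START_CRIT_SECTION();
+
+      /* ... modify the pages and MarkBufferDirty() each buffer here ... */
+
+      XLogBeginInsert();
+      for (i = 0; i < nbuffers; i++)
+          XLogRegisterBuffer(i, buffers[i], REGBUF_STANDARD);
+      XLogRegisterData((char *) &xlrec, SizeOfFictionalAction);
+
+      recptr = XLogInsert(RM_FOO_ID, XLOG_FOOBAR_MANY_PAGES);
+
+      /* ... PageSetLSN() each modified page to recptr ... */
+
+      END_CRIT_SECTION();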
+ +void XLogRegisterBuffer(uint8 block_id, Buffer buf, uint8 flags); + + XLogRegisterBuffer adds information about a data block to the WAL record. + block_id is an arbitrary number used to identify this page reference in + the redo routine. The information needed to re-find the page at redo - + relfilenode, fork, and block number - are included in the WAL record. + + XLogInsert will automatically include a full copy of the page contents, if + this is the first modification of the buffer since the last checkpoint. + It is important to register every buffer modified by the action with + XLogRegisterBuffer, to avoid torn-page hazards. + + The flags control when and how the buffer contents are included in the + WAL record. Normally, a full-page image is taken only if the page has not + been modified since the last checkpoint, and only if full_page_writes=on + or an online backup is in progress. The REGBUF_FORCE_IMAGE flag can be + used to force a full-page image to always be included; that is useful + e.g. for an operation that rewrites most of the page, so that tracking the + details is not worth it. For the rare case where it is not necessary to + protect from torn pages, REGBUF_NO_IMAGE flag can be used to suppress + full page image from being taken. REGBUF_WILL_INIT also suppresses a full + page image, but the redo routine must re-generate the page from scratch, + without looking at the old page contents. Re-initializing the page + protects from torn page hazards like a full page image does. + + The REGBUF_STANDARD flag can be specified together with the other flags to + indicate that the page follows the standard page layout. It causes the + area between pd_lower and pd_upper to be left out from the image, reducing + WAL volume. + + If the REGBUF_KEEP_DATA flag is given, any per-buffer data registered with + XLogRegisterBufData() is included in the WAL record even if a full-page + image is taken. + +void XLogRegisterData(char *data, int len); + + XLogRegisterData is used to include arbitrary data in the WAL record. If + XLogRegisterData() is called multiple times, the data are appended, and + will be made available to the redo routine as one contiguous chunk. + +void XLogRegisterBufData(uint8 block_id, char *data, int len); + + XLogRegisterBufData is used to include data associated with a particular + buffer that was registered earlier with XLogRegisterBuffer(). If + XLogRegisterBufData() is called multiple times with the same block ID, the + data are appended, and will be made available to the redo routine as one + contiguous chunk. + + If a full-page image of the buffer is taken at insertion, the data is not + included in the WAL record, unless the REGBUF_KEEP_DATA flag is used. + + +Writing a REDO routine +---------------------- + +A REDO routine uses the data and page references included in the WAL record +to reconstruct the new state of the page. The record decoding functions +and macros in xlogreader.c/h can be used to extract the data from the record. + +When replaying a WAL record that describes changes on multiple pages, you +must be careful to lock the pages properly to prevent concurrent Hot Standby +queries from seeing an inconsistent state. If this requires that two +or more buffer locks be held concurrently, you must lock the pages in +appropriate order, and not release the locks until all the changes are done. + +Note that we must only use PageSetLSN/PageGetLSN() when we know the action +is serialised. 
Only Startup process may modify data blocks during recovery, +so Startup process may execute PageGetLSN() without fear of serialisation +problems. All other processes must only call PageSet/GetLSN when holding +either an exclusive buffer lock or a shared lock plus buffer header lock, +or be writing the data block directly rather than through shared buffers +while holding AccessExclusiveLock on the relation. + + +Writing Hints +------------- + +In some cases, we write additional information to data blocks without +writing a preceding WAL record. This should only happen iff the data can +be reconstructed later following a crash and the action is simply a way +of optimising for performance. When a hint is written we use +MarkBufferDirtyHint() to mark the block dirty. + +If the buffer is clean and checksums are in use then MarkBufferDirtyHint() +inserts an XLOG_FPI_FOR_HINT record to ensure that we take a full page image +that includes the hint. We do this to avoid a partial page write, when we +write the dirtied page. WAL is not written during recovery, so we simply skip +dirtying blocks because of hints when in recovery. + +If you do decide to optimise away a WAL record, then any calls to +MarkBufferDirty() must be replaced by MarkBufferDirtyHint(), +otherwise you will expose the risk of partial page writes. + + +Write-Ahead Logging for Filesystem Actions +------------------------------------------ + +The previous section described how to WAL-log actions that only change page +contents within shared buffers. For that type of action it is generally +possible to check all likely error cases (such as insufficient space on the +page) before beginning to make the actual change. Therefore we can make +the change and the creation of the associated WAL log record "atomic" by +wrapping them into a critical section --- the odds of failure partway +through are low enough that PANIC is acceptable if it does happen. + +Clearly, that approach doesn't work for cases where there's a significant +probability of failure within the action to be logged, such as creation +of a new file or database. We don't want to PANIC, and we especially don't +want to PANIC after having already written a WAL record that says we did +the action --- if we did, replay of the record would probably fail again +and PANIC again, making the failure unrecoverable. This means that the +ordinary WAL rule of "write WAL before the changes it describes" doesn't +work, and we need a different design for such cases. + +There are several basic types of filesystem actions that have this +issue. Here is how we deal with each: + +1. Adding a disk page to an existing table. + +This action isn't WAL-logged at all. We extend a table by writing a page +of zeroes at its end. We must actually do this write so that we are sure +the filesystem has allocated the space. If the write fails we can just +error out normally. Once the space is known allocated, we can initialize +and fill the page via one or more normal WAL-logged actions. Because it's +possible that we crash between extending the file and writing out the WAL +entries, we have to treat discovery of an all-zeroes page in a table or +index as being a non-error condition. In such cases we can just reclaim +the space for re-use. + +2. Creating a new table, which requires a new file in the filesystem. + +We try to create the file, and if successful we make a WAL record saying +we did it. If not successful, we can just throw an error. 
Notice that +there is a window where we have created the file but not yet written any +WAL about it to disk. If we crash during this window, the file remains +on disk as an "orphan". It would be possible to clean up such orphans +by having database restart search for files that don't have any committed +entry in pg_class, but that currently isn't done because of the possibility +of deleting data that is useful for forensic analysis of the crash. +Orphan files are harmless --- at worst they waste a bit of disk space --- +because we check for on-disk collisions when allocating new relfilenode +OIDs. So cleaning up isn't really necessary. + +3. Deleting a table, which requires an unlink() that could fail. + +Our approach here is to WAL-log the operation first, but to treat failure +of the actual unlink() call as a warning rather than error condition. +Again, this can leave an orphan file behind, but that's cheap compared to +the alternatives. Since we can't actually do the unlink() until after +we've committed the DROP TABLE transaction, throwing an error would be out +of the question anyway. (It may be worth noting that the WAL entry about +the file deletion is actually part of the commit record for the dropping +transaction.) + +4. Creating and deleting databases and tablespaces, which requires creating +and deleting directories and entire directory trees. + +These cases are handled similarly to creating individual files, ie, we +try to do the action first and then write a WAL entry if it succeeded. +The potential amount of wasted disk space is rather larger, of course. +In the creation case we try to delete the directory tree again if creation +fails, so as to reduce the risk of wasted space. Failure partway through +a deletion operation results in a corrupt database: the DROP failed, but +some of the data is gone anyway. There is little we can do about that, +though, and in any case it was presumably data the user no longer wants. + +In all of these cases, if WAL replay fails to redo the original action +we must panic and abort recovery. The DBA will have to manually clean up +(for instance, free up some disk space or fix directory permissions) and +then restart recovery. This is part of the reason for not writing a WAL +entry until we've successfully done the original action. + + +Skipping WAL for New RelFileNode +-------------------------------- + +Under wal_level=minimal, if a change modifies a relfilenode that ROLLBACK +would unlink, in-tree access methods write no WAL for that change. Code that +writes WAL without calling RelationNeedsWAL() must check for this case. This +skipping is mandatory. If a WAL-writing change preceded a WAL-skipping change +for the same block, REDO could overwrite the WAL-skipping change. If a +WAL-writing change followed a WAL-skipping change for the same block, a +related problem would arise. When a WAL record contains no full-page image, +REDO expects the page to match its contents from just before record insertion. +A WAL-skipping change may not reach disk at all, violating REDO's expectation +under full_page_writes=off. For any access method, CommitTransaction() writes +and fsyncs affected blocks before recording the commit. + +Prefer to do the same in future access methods. However, two other approaches +can work. First, an access method can irreversibly transition a given fork +from WAL-skipping to WAL-writing by calling FlushRelationBuffers() and +smgrimmedsync(). Second, an access method can opt to write WAL +unconditionally for permanent relations. 
Under these approaches, the access +method callbacks must not call functions that react to RelationNeedsWAL(). + +This applies only to WAL records whose replay would modify bytes stored in the +new relfilenode. It does not apply to other records about the relfilenode, +such as XLOG_SMGR_CREATE. Because it operates at the level of individual +relfilenodes, RelationNeedsWAL() can differ for tightly-coupled relations. +Consider "CREATE TABLE t (); BEGIN; ALTER TABLE t ADD c text; ..." in which +ALTER TABLE adds a TOAST relation. The TOAST relation will skip WAL, while +the table owning it will not. ALTER TABLE SET TABLESPACE will cause a table +to skip WAL, but that won't affect its indexes. + + +Asynchronous Commit +------------------- + +As of PostgreSQL 8.3 it is possible to perform asynchronous commits - i.e., +we don't wait while the WAL record for the commit is fsync'ed. +We perform an asynchronous commit when synchronous_commit = off. Instead +of performing an XLogFlush() up to the LSN of the commit, we merely note +the LSN in shared memory. The backend then continues with other work. +We record the LSN only for an asynchronous commit, not an abort; there's +never any need to flush an abort record, since the presumption after a +crash would be that the transaction aborted anyway. + +We always force synchronous commit when the transaction is deleting +relations, to ensure the commit record is down to disk before the relations +are removed from the filesystem. Also, certain utility commands that have +non-roll-backable side effects (such as filesystem changes) force sync +commit to minimize the window in which the filesystem change has been made +but the transaction isn't guaranteed committed. + +The walwriter regularly wakes up (via wal_writer_delay) or is woken up +(via its latch, which is set by backends committing asynchronously) and +performs an XLogBackgroundFlush(). This checks the location of the last +completely filled WAL page. If that has moved forwards, then we write all +the changed buffers up to that point, so that under full load we write +only whole buffers. If there has been a break in activity and the current +WAL page is the same as before, then we find out the LSN of the most +recent asynchronous commit, and write up to that point, if required (i.e. +if it's in the current WAL page). If more than wal_writer_delay has +passed, or more than wal_writer_flush_after blocks have been written, since +the last flush, WAL is also flushed up to the current location. This +arrangement in itself would guarantee that an async commit record reaches +disk after at most two times wal_writer_delay after the transaction +completes. However, we also allow XLogFlush to write/flush full buffers +"flexibly" (ie, not wrapping around at the end of the circular WAL buffer +area), so as to minimize the number of writes issued under high load when +multiple WAL pages are filled per walwriter cycle. This makes the worst-case +delay three wal_writer_delay cycles. + +There are some other subtle points to consider with asynchronous commits. +First, for each page of CLOG we must remember the LSN of the latest commit +affecting the page, so that we can enforce the same flush-WAL-before-write +rule that we do for ordinary relation pages. Otherwise the record of the +commit might reach disk before the WAL record does. Again, abort records +need not factor into this consideration. + +In fact, we store more than one LSN for each clog page. 
This relates to +the way we set transaction status hint bits during visibility tests. +We must not set a transaction-committed hint bit on a relation page and +have that record make it to disk prior to the WAL record of the commit. +Since visibility tests are normally made while holding buffer share locks, +we do not have the option of changing the page's LSN to guarantee WAL +synchronization. Instead, we defer the setting of the hint bit if we have +not yet flushed WAL as far as the LSN associated with the transaction. +This requires tracking the LSN of each unflushed async commit. It is +convenient to associate this data with clog buffers: because we will flush +WAL before writing a clog page, we know that we do not need to remember a +transaction's LSN longer than the clog page holding its commit status +remains in memory. However, the naive approach of storing an LSN for each +clog position is unattractive: the LSNs are 32x bigger than the two-bit +commit status fields, and so we'd need 256K of additional shared memory for +each 8K clog buffer page. We choose instead to store a smaller number of +LSNs per page, where each LSN is the highest LSN associated with any +transaction commit in a contiguous range of transaction IDs on that page. +This saves storage at the price of some possibly-unnecessary delay in +setting transaction hint bits. + +How many transactions should share the same cached LSN (N)? If the +system's workload consists only of small async-commit transactions, then +it's reasonable to have N similar to the number of transactions per +walwriter cycle, since that is the granularity with which transactions will +become truly committed (and thus hintable) anyway. The worst case is where +a sync-commit xact shares a cached LSN with an async-commit xact that +commits a bit later; even though we paid to sync the first xact to disk, +we won't be able to hint its outputs until the second xact is sync'd, up to +three walwriter cycles later. This argues for keeping N (the group size) +as small as possible. For the moment we are setting the group size to 32, +which makes the LSN cache space the same size as the actual clog buffer +space (independently of BLCKSZ). + +It is useful that we can run both synchronous and asynchronous commit +transactions concurrently, but the safety of this is perhaps not +immediately obvious. Assume we have two transactions, T1 and T2. The Log +Sequence Number (LSN) is the point in the WAL sequence where a transaction +commit is recorded, so LSN1 and LSN2 are the commit records of those +transactions. If T2 can see changes made by T1 then when T2 commits it +must be true that LSN2 follows LSN1. Thus when T2 commits it is certain +that all of the changes made by T1 are also now recorded in the WAL. This +is true whether T1 was asynchronous or synchronous. As a result, it is +safe for asynchronous commits and synchronous commits to work concurrently +without endangering data written by synchronous commits. Sub-transactions +are not important here since the final write to disk only occurs at the +commit of the top level transaction. + +Changes to data blocks cannot reach disk unless WAL is flushed up to the +point of the LSN of the data blocks. Any attempt to write unsafe data to +disk will trigger a write which ensures the safety of all data written by +that and prior transactions. Data blocks and clog pages are both protected +by LSNs. 
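+
+As a concrete illustration of the flush-WAL-before-write rule on the hint-bit
+side, here is a hedged sketch of the deferral test, loosely modeled on the
+backend's SetHintBits() helper; treat the exact calls and structure as
+illustrative rather than authoritative:
+
+    /*
+     * Hedged sketch: before setting a committed hint bit for xid on a
+     * shared-buffer page, check that the commit record's WAL is flushed;
+     * if it is not, skip the hint for now rather than wait for a flush.
+     */
+    XLogRecPtr  commitLSN = TransactionIdGetCommitLSN(xid);
+
+    if (BufferIsPermanent(buffer) && XLogNeedsFlush(commitLSN))
+        return;                 /* defer: commit WAL not yet flushed */
+
+    tuple->t_infomask |= infomask;
+    MarkBufferDirtyHint(buffer, true);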
+ +Changes to a temp table are not WAL-logged, hence could reach disk in +advance of T1's commit, but we don't care since temp table contents don't +survive crashes anyway. + +Database writes that skip WAL for new relfilenodes are also safe. In these +cases it's entirely possible for the data to reach disk before T1's commit, +because T1 will fsync it down to disk without any sort of interlock. However, +all these paths are designed to write data that no other transaction can see +until after T1 commits. The situation is thus not different from ordinary +WAL-logged updates. + +Transaction Emulation during Recovery +------------------------------------- + +During Recovery we replay transaction changes in the order they occurred. +As part of this replay we emulate some transactional behaviour, so that +read only backends can take MVCC snapshots. We do this by maintaining a +list of XIDs belonging to transactions that are being replayed, so that +each transaction that has recorded WAL records for database writes exist +in the array until it commits. Further details are given in comments in +procarray.c. + +Many actions write no WAL records at all, for example read only transactions. +These have no effect on MVCC in recovery and we can pretend they never +occurred at all. Subtransaction commit does not write a WAL record either +and has very little effect, since lock waiters need to wait for the +parent transaction to complete. + +Not all transactional behaviour is emulated, for example we do not insert +a transaction entry into the lock table, nor do we maintain the transaction +stack in memory. Clog, multixact and commit_ts entries are made normally. +Subtrans is maintained during recovery but the details of the transaction +tree are ignored and all subtransactions reference the top-level TransactionId +directly. Since commit is atomic this provides correct lock wait behaviour +yet simplifies emulation of subtransactions considerably. + +Further details on locking mechanics in recovery are given in comments +with the Lock rmgr code. diff --git a/src/backend/access/transam/README.parallel b/src/backend/access/transam/README.parallel new file mode 100644 index 0000000..99c588d --- /dev/null +++ b/src/backend/access/transam/README.parallel @@ -0,0 +1,237 @@ +Overview +======== + +PostgreSQL provides some simple facilities to make writing parallel algorithms +easier. Using a data structure called a ParallelContext, you can arrange to +launch background worker processes, initialize their state to match that of +the backend which initiated parallelism, communicate with them via dynamic +shared memory, and write reasonably complex code that can run either in the +user backend or in one of the parallel workers without needing to be aware of +where it's running. + +The backend which starts a parallel operation (hereafter, the initiating +backend) starts by creating a dynamic shared memory segment which will last +for the lifetime of the parallel operation. This dynamic shared memory segment +will contain (1) a shm_mq that can be used to transport errors (and other +messages reported via elog/ereport) from the worker back to the initiating +backend; (2) serialized representations of the initiating backend's private +state, so that the worker can synchronize its state with of the initiating +backend; and (3) any other data structures which a particular user of the +ParallelContext data structure may wish to add for its own purposes. 
Once +the initiating backend has initialized the dynamic shared memory segment, it +asks the postmaster to launch the appropriate number of parallel workers. +These workers then connect to the dynamic shared memory segment, initiate +their state, and then invoke the appropriate entrypoint, as further detailed +below. + +Error Reporting +=============== + +When started, each parallel worker begins by attaching the dynamic shared +memory segment and locating the shm_mq to be used for error reporting; it +redirects all of its protocol messages to this shm_mq. Prior to this point, +any failure of the background worker will not be reported to the initiating +backend; from the point of view of the initiating backend, the worker simply +failed to start. The initiating backend must anyway be prepared to cope +with fewer parallel workers than it originally requested, so catering to +this case imposes no additional burden. + +Whenever a new message (or partial message; very large messages may wrap) is +sent to the error-reporting queue, PROCSIG_PARALLEL_MESSAGE is sent to the +initiating backend. This causes the next CHECK_FOR_INTERRUPTS() in the +initiating backend to read and rethrow the message. For the most part, this +makes error reporting in parallel mode "just work". Of course, to work +properly, it is important that the code the initiating backend is executing +CHECK_FOR_INTERRUPTS() regularly and avoid blocking interrupt processing for +long periods of time, but those are good things to do anyway. + +(A currently-unsolved problem is that some messages may get written to the +system log twice, once in the backend where the report was originally +generated, and again when the initiating backend rethrows the message. If +we decide to suppress one of these reports, it should probably be second one; +otherwise, if the worker is for some reason unable to propagate the message +back to the initiating backend, the message will be lost altogether.) + +State Sharing +============= + +It's possible to write C code which works correctly without parallelism, but +which fails when parallelism is used. No parallel infrastructure can +completely eliminate this problem, because any global variable is a risk. +There's no general mechanism for ensuring that every global variable in the +worker will have the same value that it does in the initiating backend; even +if we could ensure that, some function we're calling could update the variable +after each call, and only the backend where that update is performed will see +the new value. Similar problems can arise with any more-complex data +structure we might choose to use. For example, a pseudo-random number +generator should, given a particular seed value, produce the same predictable +series of values every time. But it does this by relying on some private +state which won't automatically be shared between cooperating backends. A +parallel-safe PRNG would need to store its state in dynamic shared memory, and +would require locking. The parallelism infrastructure has no way of knowing +whether the user intends to call code that has this sort of problem, and can't +do anything about it anyway. + +Instead, we take a more pragmatic approach. First, we try to make as many of +the operations that are safe outside of parallel mode work correctly in +parallel mode as well. Second, we try to prohibit common unsafe operations +via suitable error checks. 
These checks are intended to catch 100% of +unsafe things that a user might do from the SQL interface, but code written +in C can do unsafe things that won't trigger these checks. The error checks +are engaged via EnterParallelMode(), which should be called before creating +a parallel context, and disarmed via ExitParallelMode(), which should be +called after all parallel contexts have been destroyed. The most +significant restriction imposed by parallel mode is that all operations must +be strictly read-only; we allow no writes to the database and no DDL. We +might try to relax these restrictions in the future. + +To make as many operations as possible safe in parallel mode, we try to copy +the most important pieces of state from the initiating backend to each parallel +worker. This includes: + + - The set of libraries dynamically loaded by dfmgr.c. + + - The authenticated user ID and current database. Each parallel worker + will connect to the same database as the initiating backend, using the + same user ID. + + - The values of all GUCs. Accordingly, permanent changes to the value of + any GUC are forbidden while in parallel mode; but temporary changes, + such as entering a function with non-NULL proconfig, are OK. + + - The current subtransaction's XID, the top-level transaction's XID, and + the list of XIDs considered current (that is, they are in-progress or + subcommitted). This information is needed to ensure that tuple visibility + checks return the same results in the worker as they do in the + initiating backend. See also the section Transaction Integration, below. + + - The combo CID mappings. This is needed to ensure consistent answers to + tuple visibility checks. The need to synchronize this data structure is + a major reason why we can't support writes in parallel mode: such writes + might create new combo CIDs, and we have no way to let other workers + (or the initiating backend) know about them. + + - The transaction snapshot. + + - The active snapshot, which might be different from the transaction + snapshot. + + - The currently active user ID and security context. Note that this is + the fourth user ID we restore: the initial step of binding to the correct + database also involves restoring the authenticated user ID. When GUC + values are restored, this incidentally sets SessionUserId and OuterUserId + to the correct values. This final step restores CurrentUserId. + + - State related to pending REINDEX operations, which prevents access to + an index that is currently being rebuilt. + + - Active relmapper.c mapping state. This is needed to allow consistent + answers when fetching the current relfilenode for relation oids of + mapped relations. + +To prevent unprincipled deadlocks when running in parallel mode, this code +also arranges for the leader and all workers to participate in group +locking. See src/backend/storage/lmgr/README for more details. + +Transaction Integration +======================= + +Regardless of what the TransactionState stack looks like in the parallel +leader, each parallel worker ends up with a stack of depth 1. This stack +entry is marked with the special transaction block state +TBLOCK_PARALLEL_INPROGRESS so that it's not confused with an ordinary +toplevel transaction. The XID of this TransactionState is set to the XID of +the innermost currently-active subtransaction in the initiating backend. 
The +initiating backend's toplevel XID, and the XIDs of all current (in-progress +or subcommitted) XIDs are stored separately from the TransactionState stack, +but in such a way that GetTopTransactionId(), GetTopTransactionIdIfAny(), and +TransactionIdIsCurrentTransactionId() return the same values that they would +in the initiating backend. We could copy the entire transaction state stack, +but most of it would be useless: for example, you can't roll back to a +savepoint from within a parallel worker, and there are no resources to +associated with the memory contexts or resource owners of intermediate +subtransactions. + +No meaningful change to the transaction state can be made while in parallel +mode. No XIDs can be assigned, and no subtransactions can start or end, +because we have no way of communicating these state changes to cooperating +backends, or of synchronizing them. It's clearly unworkable for the initiating +backend to exit any transaction or subtransaction that was in progress when +parallelism was started before all parallel workers have exited; and it's even +more clearly crazy for a parallel worker to try to subcommit or subabort the +current subtransaction and execute in some other transaction context than was +present in the initiating backend. It might be practical to allow internal +sub-transactions (e.g. to implement a PL/pgSQL EXCEPTION block) to be used in +parallel mode, provided that they are XID-less, because other backends +wouldn't really need to know about those transactions or do anything +differently because of them. Right now, we don't even allow that. + +At the end of a parallel operation, which can happen either because it +completed successfully or because it was interrupted by an error, parallel +workers associated with that operation exit. In the error case, transaction +abort processing in the parallel leader kills off any remaining workers, and +the parallel leader then waits for them to die. In the case of a successful +parallel operation, the parallel leader does not send any signals, but must +wait for workers to complete and exit of their own volition. In either +case, it is very important that all workers actually exit before the +parallel leader cleans up the (sub)transaction in which they were created; +otherwise, chaos can ensue. For example, if the leader is rolling back the +transaction that created the relation being scanned by a worker, the +relation could disappear while the worker is still busy scanning it. That's +not safe. + +Generally, the cleanup performed by each worker at this point is similar to +top-level commit or abort. Each backend has its own resource owners: buffer +pins, catcache or relcache reference counts, tuple descriptors, and so on +are managed separately by each backend, and must free them before exiting. +There are, however, some important differences between parallel worker +commit or abort and a real top-level transaction commit or abort. Most +importantly: + + - No commit or abort record is written; the initiating backend is + responsible for this. + + - Cleanup of pg_temp namespaces is not done. Parallel workers cannot + safely access the initiating backend's pg_temp namespace, and should + not create one of their own. + +Coding Conventions +=================== + +Before beginning any parallel operation, call EnterParallelMode(); after all +parallel operations are completed, call ExitParallelMode(). To actually +parallelize a particular operation, use a ParallelContext. 
The basic coding +pattern looks like this: + + EnterParallelMode(); /* prohibit unsafe state changes */ + + pcxt = CreateParallelContext("library_name", "function_name", nworkers); + + /* Allow space for application-specific data here. */ + shm_toc_estimate_chunk(&pcxt->estimator, size); + shm_toc_estimate_keys(&pcxt->estimator, keys); + + InitializeParallelDSM(pcxt); /* create DSM and copy state to it */ + + /* Store the data for which we reserved space. */ + space = shm_toc_allocate(pcxt->toc, size); + shm_toc_insert(pcxt->toc, key, space); + + LaunchParallelWorkers(pcxt); + + /* do parallel stuff */ + + WaitForParallelWorkersToFinish(pcxt); + + /* read any final results from dynamic shared memory */ + + DestroyParallelContext(pcxt); + + ExitParallelMode(); + +If desired, after WaitForParallelWorkersToFinish() has been called, the +context can be reset so that workers can be launched anew using the same +parallel context. To do this, first call ReinitializeParallelDSM() to +reinitialize state managed by the parallel context machinery itself; then, +perform any other necessary resetting of state; after that, you can again +call LaunchParallelWorkers. diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c new file mode 100644 index 0000000..8575f1c --- /dev/null +++ b/src/backend/access/transam/clog.c @@ -0,0 +1,1030 @@ +/*------------------------------------------------------------------------- + * + * clog.c + * PostgreSQL transaction-commit-log manager + * + * This module replaces the old "pg_log" access code, which treated pg_log + * essentially like a relation, in that it went through the regular buffer + * manager. The problem with that was that there wasn't any good way to + * recycle storage space for transactions so old that they'll never be + * looked up again. Now we use specialized access code so that the commit + * log can be broken into relatively small, independent segments. + * + * XLOG interactions: this module generates an XLOG record whenever a new + * CLOG page is initialized to zeroes. Other writes of CLOG come from + * recording of transaction commit or abort in xact.c, which generates its + * own XLOG records for these events and will re-perform the status update + * on redo; so we need make no additional XLOG entry here. For synchronous + * transaction commits, the XLOG is guaranteed flushed through the XLOG commit + * record before we are called to log a commit, so the WAL rule "write xlog + * before data" is satisfied automatically. However, for async commits we + * must track the latest LSN affecting each CLOG page, so that we can flush + * XLOG that far and satisfy the WAL rule. We don't have to worry about this + * for aborts (whether sync or async), since the post-crash assumption would + * be that such transactions failed anyway. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/clog.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/clog.h" +#include "access/slru.h" +#include "access/transam.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "access/xlogutils.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "pgstat.h" +#include "storage/proc.h" +#include "storage/sync.h" + +/* + * Defines for CLOG page sizes. A page is the same BLCKSZ as is used + * everywhere else in Postgres. 
+ * + * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF, + * CLOG page numbering also wraps around at 0xFFFFFFFF/CLOG_XACTS_PER_PAGE, + * and CLOG segment numbering at + * 0xFFFFFFFF/CLOG_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need take no + * explicit notice of that fact in this module, except when comparing segment + * and page numbers in TruncateCLOG (see CLOGPagePrecedes). + */ + +/* We need two bits per xact, so four xacts fit in a byte */ +#define CLOG_BITS_PER_XACT 2 +#define CLOG_XACTS_PER_BYTE 4 +#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE) +#define CLOG_XACT_BITMASK ((1 << CLOG_BITS_PER_XACT) - 1) + +#define TransactionIdToPage(xid) ((xid) / (TransactionId) CLOG_XACTS_PER_PAGE) +#define TransactionIdToPgIndex(xid) ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) +#define TransactionIdToByte(xid) (TransactionIdToPgIndex(xid) / CLOG_XACTS_PER_BYTE) +#define TransactionIdToBIndex(xid) ((xid) % (TransactionId) CLOG_XACTS_PER_BYTE) + +/* We store the latest async LSN for each group of transactions */ +#define CLOG_XACTS_PER_LSN_GROUP 32 /* keep this a power of 2 */ +#define CLOG_LSNS_PER_PAGE (CLOG_XACTS_PER_PAGE / CLOG_XACTS_PER_LSN_GROUP) + +#define GetLSNIndex(slotno, xid) ((slotno) * CLOG_LSNS_PER_PAGE + \ + ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_LSN_GROUP) + +/* + * The number of subtransactions below which we consider to apply clog group + * update optimization. Testing reveals that the number higher than this can + * hurt performance. + */ +#define THRESHOLD_SUBTRANS_CLOG_OPT 5 + +/* + * Link to shared-memory data structures for CLOG control + */ +static SlruCtlData XactCtlData; + +#define XactCtl (&XactCtlData) + + +static int ZeroCLOGPage(int pageno, bool writeXlog); +static bool CLOGPagePrecedes(int page1, int page2); +static void WriteZeroPageXlogRec(int pageno); +static void WriteTruncateXlogRec(int pageno, TransactionId oldestXact, + Oid oldestXactDb); +static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids, + TransactionId *subxids, XidStatus status, + XLogRecPtr lsn, int pageno, + bool all_xact_same_page); +static void TransactionIdSetStatusBit(TransactionId xid, XidStatus status, + XLogRecPtr lsn, int slotno); +static void set_status_by_pages(int nsubxids, TransactionId *subxids, + XidStatus status, XLogRecPtr lsn); +static bool TransactionGroupUpdateXidStatus(TransactionId xid, + XidStatus status, XLogRecPtr lsn, int pageno); +static void TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids, + TransactionId *subxids, XidStatus status, + XLogRecPtr lsn, int pageno); + + +/* + * TransactionIdSetTreeStatus + * + * Record the final state of transaction entries in the commit log for + * a transaction and its subtransaction tree. Take care to ensure this is + * efficient, and as atomic as possible. + * + * xid is a single xid to set status for. This will typically be + * the top level transactionid for a top level commit or abort. It can + * also be a subtransaction when we record transaction aborts. + * + * subxids is an array of xids of length nsubxids, representing subtransactions + * in the tree of xid. In various cases nsubxids may be zero. + * + * lsn must be the WAL location of the commit record when recording an async + * commit. For a synchronous commit it can be InvalidXLogRecPtr, since the + * caller guarantees the commit record is already flushed in that case. It + * should be InvalidXLogRecPtr for abort cases, too. 
+ * + * In the commit case, atomicity is limited by whether all the subxids are in + * the same CLOG page as xid. If they all are, then the lock will be grabbed + * only once, and the status will be set to committed directly. Otherwise + * we must + * 1. set sub-committed all subxids that are not on the same page as the + * main xid + * 2. atomically set committed the main xid and the subxids on the same page + * 3. go over the first bunch again and set them committed + * Note that as far as concurrent checkers are concerned, main transaction + * commit as a whole is still atomic. + * + * Example: + * TransactionId t commits and has subxids t1, t2, t3, t4 + * t is on page p1, t1 is also on p1, t2 and t3 are on p2, t4 is on p3 + * 1. update pages2-3: + * page2: set t2,t3 as sub-committed + * page3: set t4 as sub-committed + * 2. update page1: + * set t1 as sub-committed, + * then set t as committed, + then set t1 as committed + * 3. update pages2-3: + * page2: set t2,t3 as committed + * page3: set t4 as committed + * + * NB: this is a low-level routine and is NOT the preferred entry point + * for most uses; functions in transam.c are the intended callers. + * + * XXX Think about issuing POSIX_FADV_WILLNEED on pages that we will need, + * but aren't yet in cache, as well as hinting pages not to fall out of + * cache yet. + */ +void +TransactionIdSetTreeStatus(TransactionId xid, int nsubxids, + TransactionId *subxids, XidStatus status, XLogRecPtr lsn) +{ + int pageno = TransactionIdToPage(xid); /* get page of parent */ + int i; + + Assert(status == TRANSACTION_STATUS_COMMITTED || + status == TRANSACTION_STATUS_ABORTED); + + /* + * See how many subxids, if any, are on the same page as the parent, if + * any. + */ + for (i = 0; i < nsubxids; i++) + { + if (TransactionIdToPage(subxids[i]) != pageno) + break; + } + + /* + * Do all items fit on a single page? + */ + if (i == nsubxids) + { + /* + * Set the parent and all subtransactions in a single call + */ + TransactionIdSetPageStatus(xid, nsubxids, subxids, status, lsn, + pageno, true); + } + else + { + int nsubxids_on_first_page = i; + + /* + * If this is a commit then we care about doing this correctly (i.e. + * using the subcommitted intermediate status). By here, we know + * we're updating more than one page of clog, so we must mark entries + * that are *not* on the first page so that they show as subcommitted + * before we then return to update the status to fully committed. + * + * To avoid touching the first page twice, skip marking subcommitted + * for the subxids on that first page. + */ + if (status == TRANSACTION_STATUS_COMMITTED) + set_status_by_pages(nsubxids - nsubxids_on_first_page, + subxids + nsubxids_on_first_page, + TRANSACTION_STATUS_SUB_COMMITTED, lsn); + + /* + * Now set the parent and subtransactions on same page as the parent, + * if any + */ + pageno = TransactionIdToPage(xid); + TransactionIdSetPageStatus(xid, nsubxids_on_first_page, subxids, status, + lsn, pageno, false); + + /* + * Now work through the rest of the subxids one clog page at a time, + * starting from the second page onwards, like we did above. + */ + set_status_by_pages(nsubxids - nsubxids_on_first_page, + subxids + nsubxids_on_first_page, + status, lsn); + } +} + +/* + * Helper for TransactionIdSetTreeStatus: set the status for a bunch of + * transactions, chunking in the separate CLOG pages involved. We never + * pass the whole transaction tree to this function, only subtransactions + * that are on different pages to the top level transaction id. 
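+ *
+ * (For example, if the remaining subxids fall on pages p2, p2 and p3, the
+ * loop below makes two TransactionIdSetPageStatus calls: one covering the
+ * two subxids on p2, then one covering the subxid on p3.)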
+ */ +static void +set_status_by_pages(int nsubxids, TransactionId *subxids, + XidStatus status, XLogRecPtr lsn) +{ + int pageno = TransactionIdToPage(subxids[0]); + int offset = 0; + int i = 0; + + Assert(nsubxids > 0); /* else the pageno fetch above is unsafe */ + + while (i < nsubxids) + { + int num_on_page = 0; + int nextpageno; + + do + { + nextpageno = TransactionIdToPage(subxids[i]); + if (nextpageno != pageno) + break; + num_on_page++; + i++; + } while (i < nsubxids); + + TransactionIdSetPageStatus(InvalidTransactionId, + num_on_page, subxids + offset, + status, lsn, pageno, false); + offset = i; + pageno = nextpageno; + } +} + +/* + * Record the final state of transaction entries in the commit log for all + * entries on a single page. Atomic only on this page. + */ +static void +TransactionIdSetPageStatus(TransactionId xid, int nsubxids, + TransactionId *subxids, XidStatus status, + XLogRecPtr lsn, int pageno, + bool all_xact_same_page) +{ + /* Can't use group update when PGPROC overflows. */ + StaticAssertStmt(THRESHOLD_SUBTRANS_CLOG_OPT <= PGPROC_MAX_CACHED_SUBXIDS, + "group clog threshold less than PGPROC cached subxids"); + + /* + * When there is contention on XactSLRULock, we try to group multiple + * updates; a single leader process will perform transaction status + * updates for multiple backends so that the number of times XactSLRULock + * needs to be acquired is reduced. + * + * For this optimization to be safe, the XID and subxids in MyProc must be + * the same as the ones for which we're setting the status. Check that + * this is the case. + * + * For this optimization to be efficient, we shouldn't have too many + * sub-XIDs and all of the XIDs for which we're adjusting clog should be + * on the same page. Check those conditions, too. + */ + if (all_xact_same_page && xid == MyProc->xid && + nsubxids <= THRESHOLD_SUBTRANS_CLOG_OPT && + nsubxids == MyProc->subxidStatus.count && + (nsubxids == 0 || + memcmp(subxids, MyProc->subxids.xids, + nsubxids * sizeof(TransactionId)) == 0)) + { + /* + * If we can immediately acquire XactSLRULock, we update the status of + * our own XID and release the lock. If not, try use group XID + * update. If that doesn't work out, fall back to waiting for the + * lock to perform an update for this transaction only. + */ + if (LWLockConditionalAcquire(XactSLRULock, LW_EXCLUSIVE)) + { + /* Got the lock without waiting! Do the update. */ + TransactionIdSetPageStatusInternal(xid, nsubxids, subxids, status, + lsn, pageno); + LWLockRelease(XactSLRULock); + return; + } + else if (TransactionGroupUpdateXidStatus(xid, status, lsn, pageno)) + { + /* Group update mechanism has done the work. */ + return; + } + + /* Fall through only if update isn't done yet. */ + } + + /* Group update not applicable, or couldn't accept this page number. */ + LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); + TransactionIdSetPageStatusInternal(xid, nsubxids, subxids, status, + lsn, pageno); + LWLockRelease(XactSLRULock); +} + +/* + * Record the final state of transaction entry in the commit log + * + * We don't do any locking here; caller must handle that. 
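+ *
+ * (Callers are expected to bracket this with the SLRU lock, roughly:
+ *		LWLockAcquire(XactSLRULock, LW_EXCLUSIVE);
+ *		TransactionIdSetPageStatusInternal(xid, nsubxids, subxids, status,
+ *										   lsn, pageno);
+ *		LWLockRelease(XactSLRULock);
+ * as the call sites in this file do.)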
+ */ +static void +TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids, + TransactionId *subxids, XidStatus status, + XLogRecPtr lsn, int pageno) +{ + int slotno; + int i; + + Assert(status == TRANSACTION_STATUS_COMMITTED || + status == TRANSACTION_STATUS_ABORTED || + (status == TRANSACTION_STATUS_SUB_COMMITTED && !TransactionIdIsValid(xid))); + Assert(LWLockHeldByMeInMode(XactSLRULock, LW_EXCLUSIVE)); + + /* + * If we're doing an async commit (ie, lsn is valid), then we must wait + * for any active write on the page slot to complete. Otherwise our + * update could reach disk in that write, which will not do since we + * mustn't let it reach disk until we've done the appropriate WAL flush. + * But when lsn is invalid, it's OK to scribble on a page while it is + * write-busy, since we don't care if the update reaches disk sooner than + * we think. + */ + slotno = SimpleLruReadPage(XactCtl, pageno, XLogRecPtrIsInvalid(lsn), xid); + + /* + * Set the main transaction id, if any. + * + * If we update more than one xid on this page while it is being written + * out, we might find that some of the bits go to disk and others don't. + * If we are updating commits on the page with the top-level xid that + * could break atomicity, so we subcommit the subxids first before we mark + * the top-level commit. + */ + if (TransactionIdIsValid(xid)) + { + /* Subtransactions first, if needed ... */ + if (status == TRANSACTION_STATUS_COMMITTED) + { + for (i = 0; i < nsubxids; i++) + { + Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i])); + TransactionIdSetStatusBit(subxids[i], + TRANSACTION_STATUS_SUB_COMMITTED, + lsn, slotno); + } + } + + /* ... then the main transaction */ + TransactionIdSetStatusBit(xid, status, lsn, slotno); + } + + /* Set the subtransactions */ + for (i = 0; i < nsubxids; i++) + { + Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i])); + TransactionIdSetStatusBit(subxids[i], status, lsn, slotno); + } + + XactCtl->shared->page_dirty[slotno] = true; +} + +/* + * When we cannot immediately acquire XactSLRULock in exclusive mode at + * commit time, add ourselves to a list of processes that need their XIDs + * status update. The first process to add itself to the list will acquire + * XactSLRULock in exclusive mode and set transaction status as required + * on behalf of all group members. This avoids a great deal of contention + * around XactSLRULock when many processes are trying to commit at once, + * since the lock need not be repeatedly handed off from one committing + * process to the next. + * + * Returns true when transaction status has been updated in clog; returns + * false if we decided against applying the optimization because the page + * number we need to update differs from those processes already waiting. + */ +static bool +TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status, + XLogRecPtr lsn, int pageno) +{ + volatile PROC_HDR *procglobal = ProcGlobal; + PGPROC *proc = MyProc; + uint32 nextidx; + uint32 wakeidx; + + /* We should definitely have an XID whose status needs to be updated. */ + Assert(TransactionIdIsValid(xid)); + + /* + * Add ourselves to the list of processes needing a group XID status + * update. 
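+ *
+ * (This is the usual lock-free list push: read the current head of
+ * clogGroupFirst, point our own clogGroupNext at it, and compare-and-swap
+ * the head to our pgprocno, retrying if another backend moved the head
+ * underneath us.)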
+ */ + proc->clogGroupMember = true; + proc->clogGroupMemberXid = xid; + proc->clogGroupMemberXidStatus = status; + proc->clogGroupMemberPage = pageno; + proc->clogGroupMemberLsn = lsn; + + nextidx = pg_atomic_read_u32(&procglobal->clogGroupFirst); + + while (true) + { + /* + * Add the proc to list, if the clog page where we need to update the + * current transaction status is same as group leader's clog page. + * + * There is a race condition here, which is that after doing the below + * check and before adding this proc's clog update to a group, the + * group leader might have already finished the group update for this + * page and becomes group leader of another group. This will lead to a + * situation where a single group can have different clog page + * updates. This isn't likely and will still work, just maybe a bit + * less efficiently. + */ + if (nextidx != INVALID_PGPROCNO && + ProcGlobal->allProcs[nextidx].clogGroupMemberPage != proc->clogGroupMemberPage) + { + /* + * Ensure that this proc is not a member of any clog group that + * needs an XID status update. + */ + proc->clogGroupMember = false; + pg_atomic_write_u32(&proc->clogGroupNext, INVALID_PGPROCNO); + return false; + } + + pg_atomic_write_u32(&proc->clogGroupNext, nextidx); + + if (pg_atomic_compare_exchange_u32(&procglobal->clogGroupFirst, + &nextidx, + (uint32) proc->pgprocno)) + break; + } + + /* + * If the list was not empty, the leader will update the status of our + * XID. It is impossible to have followers without a leader because the + * first process that has added itself to the list will always have + * nextidx as INVALID_PGPROCNO. + */ + if (nextidx != INVALID_PGPROCNO) + { + int extraWaits = 0; + + /* Sleep until the leader updates our XID status. */ + pgstat_report_wait_start(WAIT_EVENT_XACT_GROUP_UPDATE); + for (;;) + { + /* acts as a read barrier */ + PGSemaphoreLock(proc->sem); + if (!proc->clogGroupMember) + break; + extraWaits++; + } + pgstat_report_wait_end(); + + Assert(pg_atomic_read_u32(&proc->clogGroupNext) == INVALID_PGPROCNO); + + /* Fix semaphore count for any absorbed wakeups */ + while (extraWaits-- > 0) + PGSemaphoreUnlock(proc->sem); + return true; + } + + /* We are the leader. Acquire the lock on behalf of everyone. */ + LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); + + /* + * Now that we've got the lock, clear the list of processes waiting for + * group XID status update, saving a pointer to the head of the list. + * Trying to pop elements one at a time could lead to an ABA problem. + */ + nextidx = pg_atomic_exchange_u32(&procglobal->clogGroupFirst, + INVALID_PGPROCNO); + + /* Remember head of list so we can perform wakeups after dropping lock. */ + wakeidx = nextidx; + + /* Walk the list and update the status of all XIDs. */ + while (nextidx != INVALID_PGPROCNO) + { + PGPROC *proc = &ProcGlobal->allProcs[nextidx]; + + /* + * Transactions with more than THRESHOLD_SUBTRANS_CLOG_OPT sub-XIDs + * should not use group XID status update mechanism. + */ + Assert(proc->subxidStatus.count <= THRESHOLD_SUBTRANS_CLOG_OPT); + + TransactionIdSetPageStatusInternal(proc->clogGroupMemberXid, + proc->subxidStatus.count, + proc->subxids.xids, + proc->clogGroupMemberXidStatus, + proc->clogGroupMemberLsn, + proc->clogGroupMemberPage); + + /* Move to next proc in list. */ + nextidx = pg_atomic_read_u32(&proc->clogGroupNext); + } + + /* We're done with the lock now. */ + LWLockRelease(XactSLRULock); + + /* + * Now that we've released the lock, go back and wake everybody up. 
We + * don't do this under the lock so as to keep lock hold times to a + * minimum. + */ + while (wakeidx != INVALID_PGPROCNO) + { + PGPROC *proc = &ProcGlobal->allProcs[wakeidx]; + + wakeidx = pg_atomic_read_u32(&proc->clogGroupNext); + pg_atomic_write_u32(&proc->clogGroupNext, INVALID_PGPROCNO); + + /* ensure all previous writes are visible before follower continues. */ + pg_write_barrier(); + + proc->clogGroupMember = false; + + if (proc != MyProc) + PGSemaphoreUnlock(proc->sem); + } + + return true; +} + +/* + * Sets the commit status of a single transaction. + * + * Must be called with XactSLRULock held + */ +static void +TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, int slotno) +{ + int byteno = TransactionIdToByte(xid); + int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT; + char *byteptr; + char byteval; + char curval; + + byteptr = XactCtl->shared->page_buffer[slotno] + byteno; + curval = (*byteptr >> bshift) & CLOG_XACT_BITMASK; + + /* + * When replaying transactions during recovery we still need to perform + * the two phases of subcommit and then commit. However, some transactions + * are already correctly marked, so we just treat those as a no-op which + * allows us to keep the following Assert as restrictive as possible. + */ + if (InRecovery && status == TRANSACTION_STATUS_SUB_COMMITTED && + curval == TRANSACTION_STATUS_COMMITTED) + return; + + /* + * Current state change should be from 0 or subcommitted to target state + * or we should already be there when replaying changes during recovery. + */ + Assert(curval == 0 || + (curval == TRANSACTION_STATUS_SUB_COMMITTED && + status != TRANSACTION_STATUS_IN_PROGRESS) || + curval == status); + + /* note this assumes exclusive access to the clog page */ + byteval = *byteptr; + byteval &= ~(((1 << CLOG_BITS_PER_XACT) - 1) << bshift); + byteval |= (status << bshift); + *byteptr = byteval; + + /* + * Update the group LSN if the transaction completion LSN is higher. + * + * Note: lsn will be invalid when supplied during InRecovery processing, + * so we don't need to do anything special to avoid LSN updates during + * recovery. After recovery completes the next clog change will set the + * LSN correctly. + */ + if (!XLogRecPtrIsInvalid(lsn)) + { + int lsnindex = GetLSNIndex(slotno, xid); + + if (XactCtl->shared->group_lsn[lsnindex] < lsn) + XactCtl->shared->group_lsn[lsnindex] = lsn; + } +} + +/* + * Interrogate the state of a transaction in the commit log. + * + * Aside from the actual commit status, this function returns (into *lsn) + * an LSN that is late enough to be able to guarantee that if we flush up to + * that LSN then we will have flushed the transaction's commit record to disk. + * The result is not necessarily the exact LSN of the transaction's commit + * record! For example, for long-past transactions (those whose clog pages + * already migrated to disk), we'll return InvalidXLogRecPtr. Also, because + * we group transactions on the same clog page to conserve storage, we might + * return the LSN of a later transaction that falls into the same group. + * + * NB: this is a low-level routine and is NOT the preferred entry point + * for most uses; TransactionLogFetch() in transam.c is the intended caller. 
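+ *
+ * (A hypothetical caller that needs the commit to be durable before acting
+ * on it might do, roughly:
+ *		XLogRecPtr	commitLSN;
+ *		XidStatus	st = TransactionIdGetStatus(xid, &commitLSN);
+ *
+ *		if (st == TRANSACTION_STATUS_COMMITTED &&
+ *			!XLogRecPtrIsInvalid(commitLSN))
+ *			XLogFlush(commitLSN);
+ * where the variable names are illustrative only.)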
+ */ +XidStatus +TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn) +{ + int pageno = TransactionIdToPage(xid); + int byteno = TransactionIdToByte(xid); + int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT; + int slotno; + int lsnindex; + char *byteptr; + XidStatus status; + + /* lock is acquired by SimpleLruReadPage_ReadOnly */ + + slotno = SimpleLruReadPage_ReadOnly(XactCtl, pageno, xid); + byteptr = XactCtl->shared->page_buffer[slotno] + byteno; + + status = (*byteptr >> bshift) & CLOG_XACT_BITMASK; + + lsnindex = GetLSNIndex(slotno, xid); + *lsn = XactCtl->shared->group_lsn[lsnindex]; + + LWLockRelease(XactSLRULock); + + return status; +} + +/* + * Number of shared CLOG buffers. + * + * On larger multi-processor systems, it is possible to have many CLOG page + * requests in flight at one time which could lead to disk access for CLOG + * page if the required page is not found in memory. Testing revealed that we + * can get the best performance by having 128 CLOG buffers, more than that it + * doesn't improve performance. + * + * Unconditionally keeping the number of CLOG buffers to 128 did not seem like + * a good idea, because it would increase the minimum amount of shared memory + * required to start, which could be a problem for people running very small + * configurations. The following formula seems to represent a reasonable + * compromise: people with very low values for shared_buffers will get fewer + * CLOG buffers as well, and everyone else will get 128. + */ +Size +CLOGShmemBuffers(void) +{ + return Min(128, Max(4, NBuffers / 512)); +} + +/* + * Initialization of shared memory for CLOG + */ +Size +CLOGShmemSize(void) +{ + return SimpleLruShmemSize(CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE); +} + +void +CLOGShmemInit(void) +{ + XactCtl->PagePrecedes = CLOGPagePrecedes; + SimpleLruInit(XactCtl, "Xact", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE, + XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER, + SYNC_HANDLER_CLOG); + SlruPagePrecedesUnitTests(XactCtl, CLOG_XACTS_PER_PAGE); +} + +/* + * This func must be called ONCE on system install. It creates + * the initial CLOG segment. (The CLOG directory is assumed to + * have been created by initdb, and CLOGShmemInit must have been + * called already.) + */ +void +BootStrapCLOG(void) +{ + int slotno; + + LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); + + /* Create and zero the first page of the commit log */ + slotno = ZeroCLOGPage(0, false); + + /* Make sure it's written out */ + SimpleLruWritePage(XactCtl, slotno); + Assert(!XactCtl->shared->page_dirty[slotno]); + + LWLockRelease(XactSLRULock); +} + +/* + * Initialize (or reinitialize) a page of CLOG to zeroes. + * If writeXlog is true, also emit an XLOG record saying we did this. + * + * The page is not actually written, just set up in shared memory. + * The slot number of the new page is returned. + * + * Control lock must be held at entry, and will be held at exit. + */ +static int +ZeroCLOGPage(int pageno, bool writeXlog) +{ + int slotno; + + slotno = SimpleLruZeroPage(XactCtl, pageno); + + if (writeXlog) + WriteZeroPageXlogRec(pageno); + + return slotno; +} + +/* + * This must be called ONCE during postmaster or standalone-backend startup, + * after StartupXLOG has initialized ShmemVariableCache->nextXid. + */ +void +StartupCLOG(void) +{ + TransactionId xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + int pageno = TransactionIdToPage(xid); + + LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); + + /* + * Initialize our idea of the latest page number. 
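+ *
+ * (latest_page_number is used by the SLRU machinery, e.g. as a sanity
+ * check when truncating, so it must reflect the page that nextXid falls
+ * on; it is kept current afterwards as new pages are zeroed.)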
+ */ + XactCtl->shared->latest_page_number = pageno; + + LWLockRelease(XactSLRULock); +} + +/* + * This must be called ONCE at the end of startup/recovery. + */ +void +TrimCLOG(void) +{ + TransactionId xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + int pageno = TransactionIdToPage(xid); + + LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); + + /* + * Zero out the remainder of the current clog page. Under normal + * circumstances it should be zeroes already, but it seems at least + * theoretically possible that XLOG replay will have settled on a nextXID + * value that is less than the last XID actually used and marked by the + * previous database lifecycle (since subtransaction commit writes clog + * but makes no WAL entry). Let's just be safe. (We need not worry about + * pages beyond the current one, since those will be zeroed when first + * used. For the same reason, there is no need to do anything when + * nextXid is exactly at a page boundary; and it's likely that the + * "current" page doesn't exist yet in that case.) + */ + if (TransactionIdToPgIndex(xid) != 0) + { + int byteno = TransactionIdToByte(xid); + int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT; + int slotno; + char *byteptr; + + slotno = SimpleLruReadPage(XactCtl, pageno, false, xid); + byteptr = XactCtl->shared->page_buffer[slotno] + byteno; + + /* Zero so-far-unused positions in the current byte */ + *byteptr &= (1 << bshift) - 1; + /* Zero the rest of the page */ + MemSet(byteptr + 1, 0, BLCKSZ - byteno - 1); + + XactCtl->shared->page_dirty[slotno] = true; + } + + LWLockRelease(XactSLRULock); +} + +/* + * Perform a checkpoint --- either during shutdown, or on-the-fly + */ +void +CheckPointCLOG(void) +{ + /* + * Write dirty CLOG pages to disk. This may result in sync requests + * queued for later handling by ProcessSyncRequests(), as part of the + * checkpoint. + */ + TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(true); + SimpleLruWriteAll(XactCtl, true); + TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true); +} + + +/* + * Make sure that CLOG has room for a newly-allocated XID. + * + * NB: this is called while holding XidGenLock. We want it to be very fast + * most of the time; even when it's not so fast, no actual I/O need happen + * unless we're forced to write out a dirty clog or xlog page to make room + * in shared memory. + */ +void +ExtendCLOG(TransactionId newestXact) +{ + int pageno; + + /* + * No work except at first XID of a page. But beware: just after + * wraparound, the first XID of page zero is FirstNormalTransactionId. + */ + if (TransactionIdToPgIndex(newestXact) != 0 && + !TransactionIdEquals(newestXact, FirstNormalTransactionId)) + return; + + pageno = TransactionIdToPage(newestXact); + + LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); + + /* Zero the page and make an XLOG entry about it */ + ZeroCLOGPage(pageno, true); + + LWLockRelease(XactSLRULock); +} + + +/* + * Remove all CLOG segments before the one holding the passed transaction ID + * + * Before removing any CLOG data, we must flush XLOG to disk, to ensure + * that any recently-emitted FREEZE_PAGE records have reached disk; otherwise + * a crash and restart might leave us with some unfrozen tuples referencing + * removed CLOG data. We choose to emit a special TRUNCATE XLOG record too. + * Replaying the deletion from XLOG is not critical, since the files could + * just as well be removed later, but doing so prevents a long-running hot + * standby server from acquiring an unreasonably bloated CLOG directory. 
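+ *
+ * (For scale, assuming the default BLCKSZ: one segment file spans
+ * SLRU_PAGES_PER_SEGMENT * CLOG_XACTS_PER_PAGE = 32 * 32768 transaction
+ * IDs, i.e. roughly a million transactions in a 256kB file.)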
+ * + * Since CLOG segments hold a large number of transactions, the opportunity to + * actually remove a segment is fairly rare, and so it seems best not to do + * the XLOG flush unless we have confirmed that there is a removable segment. + */ +void +TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid) +{ + int cutoffPage; + + /* + * The cutoff point is the start of the segment containing oldestXact. We + * pass the *page* containing oldestXact to SimpleLruTruncate. + */ + cutoffPage = TransactionIdToPage(oldestXact); + + /* Check to see if there's any files that could be removed */ + if (!SlruScanDirectory(XactCtl, SlruScanDirCbReportPresence, &cutoffPage)) + return; /* nothing to remove */ + + /* + * Advance oldestClogXid before truncating clog, so concurrent xact status + * lookups can ensure they don't attempt to access truncated-away clog. + * + * It's only necessary to do this if we will actually truncate away clog + * pages. + */ + AdvanceOldestClogXid(oldestXact); + + /* + * Write XLOG record and flush XLOG to disk. We record the oldest xid + * we're keeping information about here so we can ensure that it's always + * ahead of clog truncation in case we crash, and so a standby finds out + * the new valid xid before the next checkpoint. + */ + WriteTruncateXlogRec(cutoffPage, oldestXact, oldestxid_datoid); + + /* Now we can remove the old CLOG segment(s) */ + SimpleLruTruncate(XactCtl, cutoffPage); +} + + +/* + * Decide whether a CLOG page number is "older" for truncation purposes. + * + * We need to use comparison of TransactionIds here in order to do the right + * thing with wraparound XID arithmetic. However, TransactionIdPrecedes() + * would get weird about permanent xact IDs. So, offset both such that xid1, + * xid2, and xid2 + CLOG_XACTS_PER_PAGE - 1 are all normal XIDs; this offset + * is relevant to page 0 and to the page preceding page 0. + * + * The page containing oldestXact-2^31 is the important edge case. The + * portion of that page equaling or following oldestXact-2^31 is expendable, + * but the portion preceding oldestXact-2^31 is not. When oldestXact-2^31 is + * the first XID of a page and segment, the entire page and segment is + * expendable, and we could truncate the segment. Recognizing that case would + * require making oldestXact, not just the page containing oldestXact, + * available to this callback. The benefit would be rare and small, so we + * don't optimize that edge case. + */ +static bool +CLOGPagePrecedes(int page1, int page2) +{ + TransactionId xid1; + TransactionId xid2; + + xid1 = ((TransactionId) page1) * CLOG_XACTS_PER_PAGE; + xid1 += FirstNormalTransactionId + 1; + xid2 = ((TransactionId) page2) * CLOG_XACTS_PER_PAGE; + xid2 += FirstNormalTransactionId + 1; + + return (TransactionIdPrecedes(xid1, xid2) && + TransactionIdPrecedes(xid1, xid2 + CLOG_XACTS_PER_PAGE - 1)); +} + + +/* + * Write a ZEROPAGE xlog record + */ +static void +WriteZeroPageXlogRec(int pageno) +{ + XLogBeginInsert(); + XLogRegisterData((char *) (&pageno), sizeof(int)); + (void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE); +} + +/* + * Write a TRUNCATE xlog record + * + * We must flush the xlog record to disk before returning --- see notes + * in TruncateCLOG(). 
+ */ +static void +WriteTruncateXlogRec(int pageno, TransactionId oldestXact, Oid oldestXactDb) +{ + XLogRecPtr recptr; + xl_clog_truncate xlrec; + + xlrec.pageno = pageno; + xlrec.oldestXact = oldestXact; + xlrec.oldestXactDb = oldestXactDb; + + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), sizeof(xl_clog_truncate)); + recptr = XLogInsert(RM_CLOG_ID, CLOG_TRUNCATE); + XLogFlush(recptr); +} + +/* + * CLOG resource manager's routines + */ +void +clog_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + /* Backup blocks are not used in clog records */ + Assert(!XLogRecHasAnyBlockRefs(record)); + + if (info == CLOG_ZEROPAGE) + { + int pageno; + int slotno; + + memcpy(&pageno, XLogRecGetData(record), sizeof(int)); + + LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); + + slotno = ZeroCLOGPage(pageno, false); + SimpleLruWritePage(XactCtl, slotno); + Assert(!XactCtl->shared->page_dirty[slotno]); + + LWLockRelease(XactSLRULock); + } + else if (info == CLOG_TRUNCATE) + { + xl_clog_truncate xlrec; + + memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_clog_truncate)); + + AdvanceOldestClogXid(xlrec.oldestXact); + + SimpleLruTruncate(XactCtl, xlrec.pageno); + } + else + elog(PANIC, "clog_redo: unknown op code %u", info); +} + +/* + * Entrypoint for sync.c to sync clog files. + */ +int +clogsyncfiletag(const FileTag *ftag, char *path) +{ + return SlruSyncFileTag(XactCtl, ftag, path); +} diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c new file mode 100644 index 0000000..edbe3cf --- /dev/null +++ b/src/backend/access/transam/commit_ts.c @@ -0,0 +1,1032 @@ +/*------------------------------------------------------------------------- + * + * commit_ts.c + * PostgreSQL commit timestamp manager + * + * This module is a pg_xact-like system that stores the commit timestamp + * for each transaction. + * + * XLOG interactions: this module generates an XLOG record whenever a new + * CommitTs page is initialized to zeroes. Also, one XLOG record is + * generated for setting of values when the caller requests it; this allows + * us to support values coming from places other than transaction commit. + * Other writes of CommitTS come from recording of transaction commit in + * xact.c, which generates its own XLOG records for these events and will + * re-perform the status update on redo; so we need make no additional XLOG + * entry here. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/commit_ts.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/commit_ts.h" +#include "access/htup_details.h" +#include "access/slru.h" +#include "access/transam.h" +#include "catalog/pg_type.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "storage/shmem.h" +#include "utils/builtins.h" +#include "utils/snapmgr.h" +#include "utils/timestamp.h" + +/* + * Defines for CommitTs page sizes. A page is the same BLCKSZ as is used + * everywhere else in Postgres. + * + * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF, + * CommitTs page numbering also wraps around at + * 0xFFFFFFFF/COMMIT_TS_XACTS_PER_PAGE, and CommitTs segment numbering at + * 0xFFFFFFFF/COMMIT_TS_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. 
We need take no + * explicit notice of that fact in this module, except when comparing segment + * and page numbers in TruncateCommitTs (see CommitTsPagePrecedes). + */ + +/* + * We need 8+2 bytes per xact. Note that enlarging this struct might mean + * the largest possible file name is more than 5 chars long; see + * SlruScanDirectory. + */ +typedef struct CommitTimestampEntry +{ + TimestampTz time; + RepOriginId nodeid; +} CommitTimestampEntry; + +#define SizeOfCommitTimestampEntry (offsetof(CommitTimestampEntry, nodeid) + \ + sizeof(RepOriginId)) + +#define COMMIT_TS_XACTS_PER_PAGE \ + (BLCKSZ / SizeOfCommitTimestampEntry) + +#define TransactionIdToCTsPage(xid) \ + ((xid) / (TransactionId) COMMIT_TS_XACTS_PER_PAGE) +#define TransactionIdToCTsEntry(xid) \ + ((xid) % (TransactionId) COMMIT_TS_XACTS_PER_PAGE) + +/* + * Link to shared-memory data structures for CommitTs control + */ +static SlruCtlData CommitTsCtlData; + +#define CommitTsCtl (&CommitTsCtlData) + +/* + * We keep a cache of the last value set in shared memory. + * + * This is also good place to keep the activation status. We keep this + * separate from the GUC so that the standby can activate the module if the + * primary has it active independently of the value of the GUC. + * + * This is protected by CommitTsLock. In some places, we use commitTsActive + * without acquiring the lock; where this happens, a comment explains the + * rationale for it. + */ +typedef struct CommitTimestampShared +{ + TransactionId xidLastCommit; + CommitTimestampEntry dataLastCommit; + bool commitTsActive; +} CommitTimestampShared; + +CommitTimestampShared *commitTsShared; + + +/* GUC variable */ +bool track_commit_timestamp; + +static void SetXidCommitTsInPage(TransactionId xid, int nsubxids, + TransactionId *subxids, TimestampTz ts, + RepOriginId nodeid, int pageno); +static void TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts, + RepOriginId nodeid, int slotno); +static void error_commit_ts_disabled(void); +static int ZeroCommitTsPage(int pageno, bool writeXlog); +static bool CommitTsPagePrecedes(int page1, int page2); +static void ActivateCommitTs(void); +static void DeactivateCommitTs(void); +static void WriteZeroPageXlogRec(int pageno); +static void WriteTruncateXlogRec(int pageno, TransactionId oldestXid); + +/* + * TransactionTreeSetCommitTsData + * + * Record the final commit timestamp of transaction entries in the commit log + * for a transaction and its subtransaction tree, as efficiently as possible. + * + * xid is the top level transaction id. + * + * subxids is an array of xids of length nsubxids, representing subtransactions + * in the tree of xid. In various cases nsubxids may be zero. + * The reason why tracking just the parent xid commit timestamp is not enough + * is that the subtrans SLRU does not stay valid across crashes (it's not + * permanent) so we need to keep the information about them here. If the + * subtrans implementation changes in the future, we might want to revisit the + * decision of storing timestamp info for each subxid. + */ +void +TransactionTreeSetCommitTsData(TransactionId xid, int nsubxids, + TransactionId *subxids, TimestampTz timestamp, + RepOriginId nodeid) +{ + int i; + TransactionId headxid; + TransactionId newestXact; + + /* + * No-op if the module is not active. 
+ * + * An unlocked read here is fine, because in a standby (the only place + * where the flag can change in flight) this routine is only called by the + * recovery process, which is also the only process which can change the + * flag. + */ + if (!commitTsShared->commitTsActive) + return; + + /* + * Figure out the latest Xid in this batch: either the last subxid if + * there's any, otherwise the parent xid. + */ + if (nsubxids > 0) + newestXact = subxids[nsubxids - 1]; + else + newestXact = xid; + + /* + * We split the xids to set the timestamp to in groups belonging to the + * same SLRU page; the first element in each such set is its head. The + * first group has the main XID as the head; subsequent sets use the first + * subxid not on the previous page as head. This way, we only have to + * lock/modify each SLRU page once. + */ + headxid = xid; + i = 0; + for (;;) + { + int pageno = TransactionIdToCTsPage(headxid); + int j; + + for (j = i; j < nsubxids; j++) + { + if (TransactionIdToCTsPage(subxids[j]) != pageno) + break; + } + /* subxids[i..j] are on the same page as the head */ + + SetXidCommitTsInPage(headxid, j - i, subxids + i, timestamp, nodeid, + pageno); + + /* if we wrote out all subxids, we're done. */ + if (j >= nsubxids) + break; + + /* + * Set the new head and skip over it, as well as over the subxids we + * just wrote. + */ + headxid = subxids[j]; + i = j + 1; + } + + /* update the cached value in shared memory */ + LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); + commitTsShared->xidLastCommit = xid; + commitTsShared->dataLastCommit.time = timestamp; + commitTsShared->dataLastCommit.nodeid = nodeid; + + /* and move forwards our endpoint, if needed */ + if (TransactionIdPrecedes(ShmemVariableCache->newestCommitTsXid, newestXact)) + ShmemVariableCache->newestCommitTsXid = newestXact; + LWLockRelease(CommitTsLock); +} + +/* + * Record the commit timestamp of transaction entries in the commit log for all + * entries on a single page. Atomic only on this page. + */ +static void +SetXidCommitTsInPage(TransactionId xid, int nsubxids, + TransactionId *subxids, TimestampTz ts, + RepOriginId nodeid, int pageno) +{ + int slotno; + int i; + + LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); + + slotno = SimpleLruReadPage(CommitTsCtl, pageno, true, xid); + + TransactionIdSetCommitTs(xid, ts, nodeid, slotno); + for (i = 0; i < nsubxids; i++) + TransactionIdSetCommitTs(subxids[i], ts, nodeid, slotno); + + CommitTsCtl->shared->page_dirty[slotno] = true; + + LWLockRelease(CommitTsSLRULock); +} + +/* + * Sets the commit timestamp of a single transaction. + * + * Must be called with CommitTsSLRULock held + */ +static void +TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts, + RepOriginId nodeid, int slotno) +{ + int entryno = TransactionIdToCTsEntry(xid); + CommitTimestampEntry entry; + + Assert(TransactionIdIsNormal(xid)); + + entry.time = ts; + entry.nodeid = nodeid; + + memcpy(CommitTsCtl->shared->page_buffer[slotno] + + SizeOfCommitTimestampEntry * entryno, + &entry, SizeOfCommitTimestampEntry); +} + +/* + * Interrogate the commit timestamp of a transaction. + * + * The return value indicates whether a commit timestamp record was found for + * the given xid. The timestamp value is returned in *ts (which may not be + * null), and the origin node for the Xid is returned in *nodeid, if it's not + * null. 
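+ *
+ * (The SQL-callable wrappers further down in this file,
+ * pg_xact_commit_timestamp() and pg_xact_commit_timestamp_origin(), are
+ * thin wrappers around this function.)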
+ */ +bool +TransactionIdGetCommitTsData(TransactionId xid, TimestampTz *ts, + RepOriginId *nodeid) +{ + int pageno = TransactionIdToCTsPage(xid); + int entryno = TransactionIdToCTsEntry(xid); + int slotno; + CommitTimestampEntry entry; + TransactionId oldestCommitTsXid; + TransactionId newestCommitTsXid; + + if (!TransactionIdIsValid(xid)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("cannot retrieve commit timestamp for transaction %u", xid))); + else if (!TransactionIdIsNormal(xid)) + { + /* frozen and bootstrap xids are always committed far in the past */ + *ts = 0; + if (nodeid) + *nodeid = 0; + return false; + } + + LWLockAcquire(CommitTsLock, LW_SHARED); + + /* Error if module not enabled */ + if (!commitTsShared->commitTsActive) + error_commit_ts_disabled(); + + /* + * If we're asked for the cached value, return that. Otherwise, fall + * through to read from SLRU. + */ + if (commitTsShared->xidLastCommit == xid) + { + *ts = commitTsShared->dataLastCommit.time; + if (nodeid) + *nodeid = commitTsShared->dataLastCommit.nodeid; + + LWLockRelease(CommitTsLock); + return *ts != 0; + } + + oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid; + newestCommitTsXid = ShmemVariableCache->newestCommitTsXid; + /* neither is invalid, or both are */ + Assert(TransactionIdIsValid(oldestCommitTsXid) == TransactionIdIsValid(newestCommitTsXid)); + LWLockRelease(CommitTsLock); + + /* + * Return empty if the requested value is outside our valid range. + */ + if (!TransactionIdIsValid(oldestCommitTsXid) || + TransactionIdPrecedes(xid, oldestCommitTsXid) || + TransactionIdPrecedes(newestCommitTsXid, xid)) + { + *ts = 0; + if (nodeid) + *nodeid = InvalidRepOriginId; + return false; + } + + /* lock is acquired by SimpleLruReadPage_ReadOnly */ + slotno = SimpleLruReadPage_ReadOnly(CommitTsCtl, pageno, xid); + memcpy(&entry, + CommitTsCtl->shared->page_buffer[slotno] + + SizeOfCommitTimestampEntry * entryno, + SizeOfCommitTimestampEntry); + + *ts = entry.time; + if (nodeid) + *nodeid = entry.nodeid; + + LWLockRelease(CommitTsSLRULock); + return *ts != 0; +} + +/* + * Return the Xid of the latest committed transaction. (As far as this module + * is concerned, anyway; it's up to the caller to ensure the value is useful + * for its purposes.) + * + * ts and nodeid are filled with the corresponding data; they can be passed + * as NULL if not wanted. + */ +TransactionId +GetLatestCommitTsData(TimestampTz *ts, RepOriginId *nodeid) +{ + TransactionId xid; + + LWLockAcquire(CommitTsLock, LW_SHARED); + + /* Error if module not enabled */ + if (!commitTsShared->commitTsActive) + error_commit_ts_disabled(); + + xid = commitTsShared->xidLastCommit; + if (ts) + *ts = commitTsShared->dataLastCommit.time; + if (nodeid) + *nodeid = commitTsShared->dataLastCommit.nodeid; + LWLockRelease(CommitTsLock); + + return xid; +} + +static void +error_commit_ts_disabled(void) +{ + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not get commit timestamp data"), + RecoveryInProgress() ? 
+ errhint("Make sure the configuration parameter \"%s\" is set on the primary server.", + "track_commit_timestamp") : + errhint("Make sure the configuration parameter \"%s\" is set.", + "track_commit_timestamp"))); +} + +/* + * SQL-callable wrapper to obtain commit time of a transaction + */ +Datum +pg_xact_commit_timestamp(PG_FUNCTION_ARGS) +{ + TransactionId xid = PG_GETARG_TRANSACTIONID(0); + TimestampTz ts; + bool found; + + found = TransactionIdGetCommitTsData(xid, &ts, NULL); + + if (!found) + PG_RETURN_NULL(); + + PG_RETURN_TIMESTAMPTZ(ts); +} + + +/* + * pg_last_committed_xact + * + * SQL-callable wrapper to obtain some information about the latest + * committed transaction: transaction ID, timestamp and replication + * origin. + */ +Datum +pg_last_committed_xact(PG_FUNCTION_ARGS) +{ + TransactionId xid; + RepOriginId nodeid; + TimestampTz ts; + Datum values[3]; + bool nulls[3]; + TupleDesc tupdesc; + HeapTuple htup; + + /* and construct a tuple with our data */ + xid = GetLatestCommitTsData(&ts, &nodeid); + + /* + * Construct a tuple descriptor for the result row. This must match this + * function's pg_proc entry! + */ + tupdesc = CreateTemplateTupleDesc(3); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "xid", + XIDOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "timestamp", + TIMESTAMPTZOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "roident", + OIDOID, -1, 0); + tupdesc = BlessTupleDesc(tupdesc); + + if (!TransactionIdIsNormal(xid)) + { + memset(nulls, true, sizeof(nulls)); + } + else + { + values[0] = TransactionIdGetDatum(xid); + nulls[0] = false; + + values[1] = TimestampTzGetDatum(ts); + nulls[1] = false; + + values[2] = ObjectIdGetDatum((Oid) nodeid); + nulls[2] = false; + } + + htup = heap_form_tuple(tupdesc, values, nulls); + + PG_RETURN_DATUM(HeapTupleGetDatum(htup)); +} + +/* + * pg_xact_commit_timestamp_origin + * + * SQL-callable wrapper to obtain commit timestamp and replication origin + * of a given transaction. + */ +Datum +pg_xact_commit_timestamp_origin(PG_FUNCTION_ARGS) +{ + TransactionId xid = PG_GETARG_TRANSACTIONID(0); + RepOriginId nodeid; + TimestampTz ts; + Datum values[2]; + bool nulls[2]; + TupleDesc tupdesc; + HeapTuple htup; + bool found; + + found = TransactionIdGetCommitTsData(xid, &ts, &nodeid); + + /* + * Construct a tuple descriptor for the result row. This must match this + * function's pg_proc entry! + */ + tupdesc = CreateTemplateTupleDesc(2); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "timestamp", + TIMESTAMPTZOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "roident", + OIDOID, -1, 0); + tupdesc = BlessTupleDesc(tupdesc); + + if (!found) + { + memset(nulls, true, sizeof(nulls)); + } + else + { + values[0] = TimestampTzGetDatum(ts); + nulls[0] = false; + + values[1] = ObjectIdGetDatum((Oid) nodeid); + nulls[1] = false; + } + + htup = heap_form_tuple(tupdesc, values, nulls); + + PG_RETURN_DATUM(HeapTupleGetDatum(htup)); +} + +/* + * Number of shared CommitTS buffers. + * + * We use a very similar logic as for the number of CLOG buffers; see comments + * in CLOGShmemBuffers. 
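+ *
+ * (Worked example, assuming the default 8kB block size: shared_buffers of
+ * 128MB means NBuffers = 16384, and 16384 / 1024 = 16, which is already the
+ * cap; very small configurations bottom out at 4 buffers.)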
+ */ +Size +CommitTsShmemBuffers(void) +{ + return Min(16, Max(4, NBuffers / 1024)); +} + +/* + * Shared memory sizing for CommitTs + */ +Size +CommitTsShmemSize(void) +{ + return SimpleLruShmemSize(CommitTsShmemBuffers(), 0) + + sizeof(CommitTimestampShared); +} + +/* + * Initialize CommitTs at system startup (postmaster start or standalone + * backend) + */ +void +CommitTsShmemInit(void) +{ + bool found; + + CommitTsCtl->PagePrecedes = CommitTsPagePrecedes; + SimpleLruInit(CommitTsCtl, "CommitTs", CommitTsShmemBuffers(), 0, + CommitTsSLRULock, "pg_commit_ts", + LWTRANCHE_COMMITTS_BUFFER, + SYNC_HANDLER_COMMIT_TS); + SlruPagePrecedesUnitTests(CommitTsCtl, COMMIT_TS_XACTS_PER_PAGE); + + commitTsShared = ShmemInitStruct("CommitTs shared", + sizeof(CommitTimestampShared), + &found); + + if (!IsUnderPostmaster) + { + Assert(!found); + + commitTsShared->xidLastCommit = InvalidTransactionId; + TIMESTAMP_NOBEGIN(commitTsShared->dataLastCommit.time); + commitTsShared->dataLastCommit.nodeid = InvalidRepOriginId; + commitTsShared->commitTsActive = false; + } + else + Assert(found); +} + +/* + * This function must be called ONCE on system install. + * + * (The CommitTs directory is assumed to have been created by initdb, and + * CommitTsShmemInit must have been called already.) + */ +void +BootStrapCommitTs(void) +{ + /* + * Nothing to do here at present, unlike most other SLRU modules; segments + * are created when the server is started with this module enabled. See + * ActivateCommitTs. + */ +} + +/* + * Initialize (or reinitialize) a page of CommitTs to zeroes. + * If writeXlog is true, also emit an XLOG record saying we did this. + * + * The page is not actually written, just set up in shared memory. + * The slot number of the new page is returned. + * + * Control lock must be held at entry, and will be held at exit. + */ +static int +ZeroCommitTsPage(int pageno, bool writeXlog) +{ + int slotno; + + slotno = SimpleLruZeroPage(CommitTsCtl, pageno); + + if (writeXlog) + WriteZeroPageXlogRec(pageno); + + return slotno; +} + +/* + * This must be called ONCE during postmaster or standalone-backend startup, + * after StartupXLOG has initialized ShmemVariableCache->nextXid. + */ +void +StartupCommitTs(void) +{ + ActivateCommitTs(); +} + +/* + * This must be called ONCE during postmaster or standalone-backend startup, + * after recovery has finished. + */ +void +CompleteCommitTsInitialization(void) +{ + /* + * If the feature is not enabled, turn it off for good. This also removes + * any leftover data. + * + * Conversely, we activate the module if the feature is enabled. This is + * necessary for primary and standby as the activation depends on the + * control file contents at the beginning of recovery or when a + * XLOG_PARAMETER_CHANGE is replayed. + */ + if (!track_commit_timestamp) + DeactivateCommitTs(); + else + ActivateCommitTs(); +} + +/* + * Activate or deactivate CommitTs' upon reception of a XLOG_PARAMETER_CHANGE + * XLog record during recovery. + */ +void +CommitTsParameterChange(bool newvalue, bool oldvalue) +{ + /* + * If the commit_ts module is disabled in this server and we get word from + * the primary server that it is enabled there, activate it so that we can + * replay future WAL records involving it; also mark it as active on + * pg_control. If the old value was already set, we already did this, so + * don't do anything. + * + * If the module is disabled in the primary, disable it here too, unless + * the module is enabled locally. 
+ * + * Note this only runs in the recovery process, so an unlocked read is + * fine. + */ + if (newvalue) + { + if (!commitTsShared->commitTsActive) + ActivateCommitTs(); + } + else if (commitTsShared->commitTsActive) + DeactivateCommitTs(); +} + +/* + * Activate this module whenever necessary. + * This must happen during postmaster or standalone-backend startup, + * or during WAL replay anytime the track_commit_timestamp setting is + * changed in the primary. + * + * The reason why this SLRU needs separate activation/deactivation functions is + * that it can be enabled/disabled during start and the activation/deactivation + * on the primary is propagated to the standby via replay. Other SLRUs don't + * have this property and they can be just initialized during normal startup. + * + * This is in charge of creating the currently active segment, if it's not + * already there. The reason for this is that the server might have been + * running with this module disabled for a while and thus might have skipped + * the normal creation point. + */ +static void +ActivateCommitTs(void) +{ + TransactionId xid; + int pageno; + + /* If we've done this already, there's nothing to do */ + LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); + if (commitTsShared->commitTsActive) + { + LWLockRelease(CommitTsLock); + return; + } + LWLockRelease(CommitTsLock); + + xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + pageno = TransactionIdToCTsPage(xid); + + /* + * Re-Initialize our idea of the latest page number. + */ + LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); + CommitTsCtl->shared->latest_page_number = pageno; + LWLockRelease(CommitTsSLRULock); + + /* + * If CommitTs is enabled, but it wasn't in the previous server run, we + * need to set the oldest and newest values to the next Xid; that way, we + * will not try to read data that might not have been set. + * + * XXX does this have a problem if a server is started with commitTs + * enabled, then started with commitTs disabled, then restarted with it + * enabled again? It doesn't look like it does, because there should be a + * checkpoint that sets the value to InvalidTransactionId at end of + * recovery; and so any chance of injecting new transactions without + * CommitTs values would occur after the oldestCommitTsXid has been set to + * Invalid temporarily. + */ + LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); + if (ShmemVariableCache->oldestCommitTsXid == InvalidTransactionId) + { + ShmemVariableCache->oldestCommitTsXid = + ShmemVariableCache->newestCommitTsXid = ReadNextTransactionId(); + } + LWLockRelease(CommitTsLock); + + /* Create the current segment file, if necessary */ + if (!SimpleLruDoesPhysicalPageExist(CommitTsCtl, pageno)) + { + int slotno; + + LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); + slotno = ZeroCommitTsPage(pageno, false); + SimpleLruWritePage(CommitTsCtl, slotno); + Assert(!CommitTsCtl->shared->page_dirty[slotno]); + LWLockRelease(CommitTsSLRULock); + } + + /* Change the activation status in shared memory. */ + LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); + commitTsShared->commitTsActive = true; + LWLockRelease(CommitTsLock); +} + +/* + * Deactivate this module. + * + * This must be called when the track_commit_timestamp parameter is turned off. + * This happens during postmaster or standalone-backend startup, or during WAL + * replay. + * + * Resets CommitTs into invalid state to make sure we don't hand back + * possibly-invalid data; also removes segments of old data. 
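+ *
+ * (Besides the XLOG_PARAMETER_CHANGE path above, this is also reached from
+ * CompleteCommitTsInitialization() at the end of recovery when
+ * track_commit_timestamp is off.)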
+ */ +static void +DeactivateCommitTs(void) +{ + /* + * Cleanup the status in the shared memory. + * + * We reset everything in the commitTsShared record to prevent user from + * getting confusing data about last committed transaction on the standby + * when the module was activated repeatedly on the primary. + */ + LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); + + commitTsShared->commitTsActive = false; + commitTsShared->xidLastCommit = InvalidTransactionId; + TIMESTAMP_NOBEGIN(commitTsShared->dataLastCommit.time); + commitTsShared->dataLastCommit.nodeid = InvalidRepOriginId; + + ShmemVariableCache->oldestCommitTsXid = InvalidTransactionId; + ShmemVariableCache->newestCommitTsXid = InvalidTransactionId; + + LWLockRelease(CommitTsLock); + + /* + * Remove *all* files. This is necessary so that there are no leftover + * files; in the case where this feature is later enabled after running + * with it disabled for some time there may be a gap in the file sequence. + * (We can probably tolerate out-of-sequence files, as they are going to + * be overwritten anyway when we wrap around, but it seems better to be + * tidy.) + */ + LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); + (void) SlruScanDirectory(CommitTsCtl, SlruScanDirCbDeleteAll, NULL); + LWLockRelease(CommitTsSLRULock); +} + +/* + * Perform a checkpoint --- either during shutdown, or on-the-fly + */ +void +CheckPointCommitTs(void) +{ + /* + * Write dirty CommitTs pages to disk. This may result in sync requests + * queued for later handling by ProcessSyncRequests(), as part of the + * checkpoint. + */ + SimpleLruWriteAll(CommitTsCtl, true); +} + +/* + * Make sure that CommitTs has room for a newly-allocated XID. + * + * NB: this is called while holding XidGenLock. We want it to be very fast + * most of the time; even when it's not so fast, no actual I/O need happen + * unless we're forced to write out a dirty CommitTs or xlog page to make room + * in shared memory. + * + * NB: the current implementation relies on track_commit_timestamp being + * PGC_POSTMASTER. + */ +void +ExtendCommitTs(TransactionId newestXact) +{ + int pageno; + + /* + * Nothing to do if module not enabled. Note we do an unlocked read of + * the flag here, which is okay because this routine is only called from + * GetNewTransactionId, which is never called in a standby. + */ + Assert(!InRecovery); + if (!commitTsShared->commitTsActive) + return; + + /* + * No work except at first XID of a page. But beware: just after + * wraparound, the first XID of page zero is FirstNormalTransactionId. + */ + if (TransactionIdToCTsEntry(newestXact) != 0 && + !TransactionIdEquals(newestXact, FirstNormalTransactionId)) + return; + + pageno = TransactionIdToCTsPage(newestXact); + + LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); + + /* Zero the page and make an XLOG entry about it */ + ZeroCommitTsPage(pageno, !InRecovery); + + LWLockRelease(CommitTsSLRULock); +} + +/* + * Remove all CommitTs segments before the one holding the passed + * transaction ID. + * + * Note that we don't need to flush XLOG here. + */ +void +TruncateCommitTs(TransactionId oldestXact) +{ + int cutoffPage; + + /* + * The cutoff point is the start of the segment containing oldestXact. We + * pass the *page* containing oldestXact to SimpleLruTruncate. 
+ */ + cutoffPage = TransactionIdToCTsPage(oldestXact); + + /* Check to see if there's any files that could be removed */ + if (!SlruScanDirectory(CommitTsCtl, SlruScanDirCbReportPresence, + &cutoffPage)) + return; /* nothing to remove */ + + /* Write XLOG record */ + WriteTruncateXlogRec(cutoffPage, oldestXact); + + /* Now we can remove the old CommitTs segment(s) */ + SimpleLruTruncate(CommitTsCtl, cutoffPage); +} + +/* + * Set the limit values between which commit TS can be consulted. + */ +void +SetCommitTsLimit(TransactionId oldestXact, TransactionId newestXact) +{ + /* + * Be careful not to overwrite values that are either further into the + * "future" or signal a disabled committs. + */ + LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); + if (ShmemVariableCache->oldestCommitTsXid != InvalidTransactionId) + { + if (TransactionIdPrecedes(ShmemVariableCache->oldestCommitTsXid, oldestXact)) + ShmemVariableCache->oldestCommitTsXid = oldestXact; + if (TransactionIdPrecedes(newestXact, ShmemVariableCache->newestCommitTsXid)) + ShmemVariableCache->newestCommitTsXid = newestXact; + } + else + { + Assert(ShmemVariableCache->newestCommitTsXid == InvalidTransactionId); + ShmemVariableCache->oldestCommitTsXid = oldestXact; + ShmemVariableCache->newestCommitTsXid = newestXact; + } + LWLockRelease(CommitTsLock); +} + +/* + * Move forwards the oldest commitTS value that can be consulted + */ +void +AdvanceOldestCommitTsXid(TransactionId oldestXact) +{ + LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); + if (ShmemVariableCache->oldestCommitTsXid != InvalidTransactionId && + TransactionIdPrecedes(ShmemVariableCache->oldestCommitTsXid, oldestXact)) + ShmemVariableCache->oldestCommitTsXid = oldestXact; + LWLockRelease(CommitTsLock); +} + + +/* + * Decide whether a commitTS page number is "older" for truncation purposes. + * Analogous to CLOGPagePrecedes(). + * + * At default BLCKSZ, (1 << 31) % COMMIT_TS_XACTS_PER_PAGE == 128. This + * introduces differences compared to CLOG and the other SLRUs having (1 << + * 31) % per_page == 0. This function never tests exactly + * TransactionIdPrecedes(x-2^31, x). When the system reaches xidStopLimit, + * there are two possible counts of page boundaries between oldestXact and the + * latest XID assigned, depending on whether oldestXact is within the first + * 128 entries of its page. Since this function doesn't know the location of + * oldestXact within page2, it returns false for one page that actually is + * expendable. This is a wider (yet still negligible) version of the + * truncation opportunity that CLOGPagePrecedes() cannot recognize. + * + * For the sake of a worked example, number entries with decimal values such + * that page1==1 entries range from 1.0 to 1.999. Let N+0.15 be the number of + * pages that 2^31 entries will span (N is an integer). If oldestXact=N+2.1, + * then the final safe XID assignment leaves newestXact=1.95. We keep page 2, + * because entry=2.85 is the border that toggles whether entries precede the + * last entry of the oldestXact page. While page 2 is expendable at + * oldestXact=N+2.1, it would be precious at oldestXact=N+2.9. 
+ */ +static bool +CommitTsPagePrecedes(int page1, int page2) +{ + TransactionId xid1; + TransactionId xid2; + + xid1 = ((TransactionId) page1) * COMMIT_TS_XACTS_PER_PAGE; + xid1 += FirstNormalTransactionId + 1; + xid2 = ((TransactionId) page2) * COMMIT_TS_XACTS_PER_PAGE; + xid2 += FirstNormalTransactionId + 1; + + return (TransactionIdPrecedes(xid1, xid2) && + TransactionIdPrecedes(xid1, xid2 + COMMIT_TS_XACTS_PER_PAGE - 1)); +} + + +/* + * Write a ZEROPAGE xlog record + */ +static void +WriteZeroPageXlogRec(int pageno) +{ + XLogBeginInsert(); + XLogRegisterData((char *) (&pageno), sizeof(int)); + (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE); +} + +/* + * Write a TRUNCATE xlog record + */ +static void +WriteTruncateXlogRec(int pageno, TransactionId oldestXid) +{ + xl_commit_ts_truncate xlrec; + + xlrec.pageno = pageno; + xlrec.oldestXid = oldestXid; + + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), SizeOfCommitTsTruncate); + (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_TRUNCATE); +} + +/* + * CommitTS resource manager's routines + */ +void +commit_ts_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + /* Backup blocks are not used in commit_ts records */ + Assert(!XLogRecHasAnyBlockRefs(record)); + + if (info == COMMIT_TS_ZEROPAGE) + { + int pageno; + int slotno; + + memcpy(&pageno, XLogRecGetData(record), sizeof(int)); + + LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); + + slotno = ZeroCommitTsPage(pageno, false); + SimpleLruWritePage(CommitTsCtl, slotno); + Assert(!CommitTsCtl->shared->page_dirty[slotno]); + + LWLockRelease(CommitTsSLRULock); + } + else if (info == COMMIT_TS_TRUNCATE) + { + xl_commit_ts_truncate *trunc = (xl_commit_ts_truncate *) XLogRecGetData(record); + + AdvanceOldestCommitTsXid(trunc->oldestXid); + + /* + * During XLOG replay, latest_page_number isn't set up yet; insert a + * suitable value to bypass the sanity test in SimpleLruTruncate. + */ + CommitTsCtl->shared->latest_page_number = trunc->pageno; + + SimpleLruTruncate(CommitTsCtl, trunc->pageno); + } + else + elog(PANIC, "commit_ts_redo: unknown op code %u", info); +} + +/* + * Entrypoint for sync.c to sync commit_ts files. + */ +int +committssyncfiletag(const FileTag *ftag, char *path) +{ + return SlruSyncFileTag(CommitTsCtl, ftag, path); +} diff --git a/src/backend/access/transam/generic_xlog.c b/src/backend/access/transam/generic_xlog.c new file mode 100644 index 0000000..63301a1 --- /dev/null +++ b/src/backend/access/transam/generic_xlog.c @@ -0,0 +1,544 @@ +/*------------------------------------------------------------------------- + * + * generic_xlog.c + * Implementation of generic xlog records. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/generic_xlog.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/bufmask.h" +#include "access/generic_xlog.h" +#include "access/xlogutils.h" +#include "miscadmin.h" +#include "utils/memutils.h" + +/*------------------------------------------------------------------------- + * Internally, a delta between pages consists of a set of fragments. Each + * fragment represents changes made in a given region of a page. 
A fragment + * is made up as follows: + * + * - offset of page region (OffsetNumber) + * - length of page region (OffsetNumber) + * - data - the data to place into the region ('length' number of bytes) + * + * Unchanged regions of a page are not represented in its delta. As a result, + * a delta can be more compact than the full page image. But having an + * unchanged region between two fragments that is smaller than the fragment + * header (offset+length) does not pay off in terms of the overall size of + * the delta. For this reason, we merge adjacent fragments if the unchanged + * region between them is <= MATCH_THRESHOLD bytes. + * + * We do not bother to merge fragments across the "lower" and "upper" parts + * of a page; it's very seldom the case that pd_lower and pd_upper are within + * MATCH_THRESHOLD bytes of each other, and handling that infrequent case + * would complicate and slow down the delta-computation code unduly. + * Therefore, the worst-case delta size includes two fragment headers plus + * a full page's worth of data. + *------------------------------------------------------------------------- + */ +#define FRAGMENT_HEADER_SIZE (2 * sizeof(OffsetNumber)) +#define MATCH_THRESHOLD FRAGMENT_HEADER_SIZE +#define MAX_DELTA_SIZE (BLCKSZ + 2 * FRAGMENT_HEADER_SIZE) + +/* Struct of generic xlog data for single page */ +typedef struct +{ + Buffer buffer; /* registered buffer */ + int flags; /* flags for this buffer */ + int deltaLen; /* space consumed in delta field */ + char *image; /* copy of page image for modification, do not + * do it in-place to have aligned memory chunk */ + char delta[MAX_DELTA_SIZE]; /* delta between page images */ +} PageData; + +/* State of generic xlog record construction */ +struct GenericXLogState +{ + /* Info about each page, see above */ + PageData pages[MAX_GENERIC_XLOG_PAGES]; + bool isLogged; + /* Page images (properly aligned) */ + PGAlignedBlock images[MAX_GENERIC_XLOG_PAGES]; +}; + +static void writeFragment(PageData *pageData, OffsetNumber offset, + OffsetNumber len, const char *data); +static void computeRegionDelta(PageData *pageData, + const char *curpage, const char *targetpage, + int targetStart, int targetEnd, + int validStart, int validEnd); +static void computeDelta(PageData *pageData, Page curpage, Page targetpage); +static void applyPageRedo(Page page, const char *delta, Size deltaSize); + + +/* + * Write next fragment into pageData's delta. + * + * The fragment has the given offset and length, and data points to the + * actual data (of length length). + */ +static void +writeFragment(PageData *pageData, OffsetNumber offset, OffsetNumber length, + const char *data) +{ + char *ptr = pageData->delta + pageData->deltaLen; + + /* Verify we have enough space */ + Assert(pageData->deltaLen + sizeof(offset) + + sizeof(length) + length <= sizeof(pageData->delta)); + + /* Write fragment data */ + memcpy(ptr, &offset, sizeof(offset)); + ptr += sizeof(offset); + memcpy(ptr, &length, sizeof(length)); + ptr += sizeof(length); + memcpy(ptr, data, length); + ptr += length; + + pageData->deltaLen = ptr - pageData->delta; +} + +/* + * Compute the XLOG fragments needed to transform a region of curpage into the + * corresponding region of targetpage, and append them to pageData's delta + * field. The region to transform runs from targetStart to targetEnd-1. + * Bytes in curpage outside the range validStart to validEnd-1 should be + * considered invalid, and always overwritten with target data. 
+ * + * This function is a hot spot, so it's worth being as tense as possible + * about the data-matching loops. + */ +static void +computeRegionDelta(PageData *pageData, + const char *curpage, const char *targetpage, + int targetStart, int targetEnd, + int validStart, int validEnd) +{ + int i, + loopEnd, + fragmentBegin = -1, + fragmentEnd = -1; + + /* Deal with any invalid start region by including it in first fragment */ + if (validStart > targetStart) + { + fragmentBegin = targetStart; + targetStart = validStart; + } + + /* We'll deal with any invalid end region after the main loop */ + loopEnd = Min(targetEnd, validEnd); + + /* Examine all the potentially matchable bytes */ + i = targetStart; + while (i < loopEnd) + { + if (curpage[i] != targetpage[i]) + { + /* On unmatched byte, start new fragment if not already in one */ + if (fragmentBegin < 0) + fragmentBegin = i; + /* Mark unmatched-data endpoint as uncertain */ + fragmentEnd = -1; + /* Extend the fragment as far as possible in a tight loop */ + i++; + while (i < loopEnd && curpage[i] != targetpage[i]) + i++; + if (i >= loopEnd) + break; + } + + /* Found a matched byte, so remember end of unmatched fragment */ + fragmentEnd = i; + + /* + * Extend the match as far as possible in a tight loop. (On typical + * workloads, this inner loop is the bulk of this function's runtime.) + */ + i++; + while (i < loopEnd && curpage[i] == targetpage[i]) + i++; + + /* + * There are several possible cases at this point: + * + * 1. We have no unwritten fragment (fragmentBegin < 0). There's + * nothing to write; and it doesn't matter what fragmentEnd is. + * + * 2. We found more than MATCH_THRESHOLD consecutive matching bytes. + * Dump out the unwritten fragment, stopping at fragmentEnd. + * + * 3. The match extends to loopEnd. We'll do nothing here, exit the + * loop, and then dump the unwritten fragment, after merging it with + * the invalid end region if any. If we don't so merge, fragmentEnd + * establishes how much the final writeFragment call needs to write. + * + * 4. We found an unmatched byte before loopEnd. The loop will repeat + * and will enter the unmatched-byte stanza above. So in this case + * also, it doesn't matter what fragmentEnd is. The matched bytes + * will get merged into the continuing unmatched fragment. + * + * Only in case 3 do we reach the bottom of the loop with a meaningful + * fragmentEnd value, which is why it's OK that we unconditionally + * assign "fragmentEnd = i" above. + */ + if (fragmentBegin >= 0 && i - fragmentEnd > MATCH_THRESHOLD) + { + writeFragment(pageData, fragmentBegin, + fragmentEnd - fragmentBegin, + targetpage + fragmentBegin); + fragmentBegin = -1; + fragmentEnd = -1; /* not really necessary */ + } + } + + /* Deal with any invalid end region by including it in final fragment */ + if (loopEnd < targetEnd) + { + if (fragmentBegin < 0) + fragmentBegin = loopEnd; + fragmentEnd = targetEnd; + } + + /* Write final fragment if any */ + if (fragmentBegin >= 0) + { + if (fragmentEnd < 0) + fragmentEnd = targetEnd; + writeFragment(pageData, fragmentBegin, + fragmentEnd - fragmentBegin, + targetpage + fragmentBegin); + } +} + +/* + * Compute the XLOG delta record needed to transform curpage into targetpage, + * and store it in pageData's delta field. 
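/*
 * Simplified standalone model (illustrative only) of what the delta boils
 * down to: scan two equal-length buffers and record each differing run as an
 * (offset, length) fragment.  The real code above additionally merges runs
 * separated by at most MATCH_THRESHOLD unchanged bytes (the cost of one
 * fragment header) and treats bytes outside the valid region as always
 * needing to be written.
 */
#include <assert.h>
#include <stddef.h>

static int
naive_delta(const char *a, const char *b, size_t len,
			size_t *offs, size_t *lens, int max_frags)
{
	int			n = 0;
	size_t		i = 0;

	while (i < len)
	{
		if (a[i] != b[i])
		{
			size_t		start = i;

			while (i < len && a[i] != b[i])
				i++;
			if (n < max_frags)
			{
				offs[n] = start;
				lens[n] = i - start;
			}
			n++;
		}
		else
			i++;
	}
	return n;
}

int
main(void)
{
	const char	a[] = "aaaaaaaaaa";
	const char	b[] = "aaXXaaaaYa";
	size_t		offs[4];
	size_t		lens[4];
	int			n = naive_delta(a, b, 10, offs, lens, 4);

	assert(n == 2);
	assert(offs[0] == 2 && lens[0] == 2);
	assert(offs[1] == 8 && lens[1] == 1);
	return 0;
}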
+ */ +static void +computeDelta(PageData *pageData, Page curpage, Page targetpage) +{ + int targetLower = ((PageHeader) targetpage)->pd_lower, + targetUpper = ((PageHeader) targetpage)->pd_upper, + curLower = ((PageHeader) curpage)->pd_lower, + curUpper = ((PageHeader) curpage)->pd_upper; + + pageData->deltaLen = 0; + + /* Compute delta records for lower part of page ... */ + computeRegionDelta(pageData, curpage, targetpage, + 0, targetLower, + 0, curLower); + /* ... and for upper part, ignoring what's between */ + computeRegionDelta(pageData, curpage, targetpage, + targetUpper, BLCKSZ, + curUpper, BLCKSZ); + + /* + * If xlog debug is enabled, then check produced delta. Result of delta + * application to curpage should be equivalent to targetpage. + */ +#ifdef WAL_DEBUG + if (XLOG_DEBUG) + { + PGAlignedBlock tmp; + + memcpy(tmp.data, curpage, BLCKSZ); + applyPageRedo(tmp.data, pageData->delta, pageData->deltaLen); + if (memcmp(tmp.data, targetpage, targetLower) != 0 || + memcmp(tmp.data + targetUpper, targetpage + targetUpper, + BLCKSZ - targetUpper) != 0) + elog(ERROR, "result of generic xlog apply does not match"); + } +#endif +} + +/* + * Start new generic xlog record for modifications to specified relation. + */ +GenericXLogState * +GenericXLogStart(Relation relation) +{ + GenericXLogState *state; + int i; + + state = (GenericXLogState *) palloc(sizeof(GenericXLogState)); + state->isLogged = RelationNeedsWAL(relation); + + for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++) + { + state->pages[i].image = state->images[i].data; + state->pages[i].buffer = InvalidBuffer; + } + + return state; +} + +/* + * Register new buffer for generic xlog record. + * + * Returns pointer to the page's image in the GenericXLogState, which + * is what the caller should modify. + * + * If the buffer is already registered, just return its existing entry. + * (It's not very clear what to do with the flags in such a case, but + * for now we stay with the original flags.) + */ +Page +GenericXLogRegisterBuffer(GenericXLogState *state, Buffer buffer, int flags) +{ + int block_id; + + /* Search array for existing entry or first unused slot */ + for (block_id = 0; block_id < MAX_GENERIC_XLOG_PAGES; block_id++) + { + PageData *page = &state->pages[block_id]; + + if (BufferIsInvalid(page->buffer)) + { + /* Empty slot, so use it (there cannot be a match later) */ + page->buffer = buffer; + page->flags = flags; + memcpy(page->image, BufferGetPage(buffer), BLCKSZ); + return (Page) page->image; + } + else if (page->buffer == buffer) + { + /* + * Buffer is already registered. Just return the image, which is + * already prepared. + */ + return (Page) page->image; + } + } + + elog(ERROR, "maximum number %d of generic xlog buffers is exceeded", + MAX_GENERIC_XLOG_PAGES); + /* keep compiler quiet */ + return NULL; +} + +/* + * Apply changes represented by GenericXLogState to the actual buffers, + * and emit a generic xlog record. + */ +XLogRecPtr +GenericXLogFinish(GenericXLogState *state) +{ + XLogRecPtr lsn; + int i; + + if (state->isLogged) + { + /* Logged relation: make xlog record in critical section. 
*/ + XLogBeginInsert(); + + START_CRIT_SECTION(); + + for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++) + { + PageData *pageData = &state->pages[i]; + Page page; + PageHeader pageHeader; + + if (BufferIsInvalid(pageData->buffer)) + continue; + + page = BufferGetPage(pageData->buffer); + pageHeader = (PageHeader) pageData->image; + + if (pageData->flags & GENERIC_XLOG_FULL_IMAGE) + { + /* + * A full-page image does not require us to supply any xlog + * data. Just apply the image, being careful to zero the + * "hole" between pd_lower and pd_upper in order to avoid + * divergence between actual page state and what replay would + * produce. + */ + memcpy(page, pageData->image, pageHeader->pd_lower); + memset(page + pageHeader->pd_lower, 0, + pageHeader->pd_upper - pageHeader->pd_lower); + memcpy(page + pageHeader->pd_upper, + pageData->image + pageHeader->pd_upper, + BLCKSZ - pageHeader->pd_upper); + + XLogRegisterBuffer(i, pageData->buffer, + REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + } + else + { + /* + * In normal mode, calculate delta and write it as xlog data + * associated with this page. + */ + computeDelta(pageData, page, (Page) pageData->image); + + /* Apply the image, with zeroed "hole" as above */ + memcpy(page, pageData->image, pageHeader->pd_lower); + memset(page + pageHeader->pd_lower, 0, + pageHeader->pd_upper - pageHeader->pd_lower); + memcpy(page + pageHeader->pd_upper, + pageData->image + pageHeader->pd_upper, + BLCKSZ - pageHeader->pd_upper); + + XLogRegisterBuffer(i, pageData->buffer, REGBUF_STANDARD); + XLogRegisterBufData(i, pageData->delta, pageData->deltaLen); + } + } + + /* Insert xlog record */ + lsn = XLogInsert(RM_GENERIC_ID, 0); + + /* Set LSN and mark buffers dirty */ + for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++) + { + PageData *pageData = &state->pages[i]; + + if (BufferIsInvalid(pageData->buffer)) + continue; + PageSetLSN(BufferGetPage(pageData->buffer), lsn); + MarkBufferDirty(pageData->buffer); + } + END_CRIT_SECTION(); + } + else + { + /* Unlogged relation: skip xlog-related stuff */ + START_CRIT_SECTION(); + for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++) + { + PageData *pageData = &state->pages[i]; + + if (BufferIsInvalid(pageData->buffer)) + continue; + memcpy(BufferGetPage(pageData->buffer), + pageData->image, + BLCKSZ); + /* We don't worry about zeroing the "hole" in this case */ + MarkBufferDirty(pageData->buffer); + } + END_CRIT_SECTION(); + /* We don't have a LSN to return, in this case */ + lsn = InvalidXLogRecPtr; + } + + pfree(state); + + return lsn; +} + +/* + * Abort generic xlog record construction. No changes are applied to buffers. + * + * Note: caller is responsible for releasing locks/pins on buffers, if needed. + */ +void +GenericXLogAbort(GenericXLogState *state) +{ + pfree(state); +} + +/* + * Apply delta to given page image. + */ +static void +applyPageRedo(Page page, const char *delta, Size deltaSize) +{ + const char *ptr = delta; + const char *end = delta + deltaSize; + + while (ptr < end) + { + OffsetNumber offset, + length; + + memcpy(&offset, ptr, sizeof(offset)); + ptr += sizeof(offset); + memcpy(&length, ptr, sizeof(length)); + ptr += sizeof(length); + + memcpy(page + offset, ptr, length); + + ptr += length; + } +} + +/* + * Redo function for generic xlog record. 
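/*
 * Sketch of how a hypothetical caller (an extension or index AM; the function
 * name and surrounding setup are assumptions, not part of this file) would
 * drive the API above.  The caller must hold an exclusive lock on the buffer
 * and must modify only the page copy returned by GenericXLogRegisterBuffer,
 * never the shared buffer directly.
 */
#include "postgres.h"

#include "access/generic_xlog.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
sketch_update_page(Relation rel, Buffer buf)
{
	GenericXLogState *state;
	Page		page;

	/* buffer is assumed to be pinned and exclusively locked by the caller */
	state = GenericXLogStart(rel);
	page = GenericXLogRegisterBuffer(state, buf, 0);

	/* ... modify the returned page image here, e.g. with PageAddItem() ... */
	(void) page;

	/* apply the changes to the real buffer and emit one WAL record */
	GenericXLogFinish(state);
}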
+ */ +void +generic_redo(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + Buffer buffers[MAX_GENERIC_XLOG_PAGES]; + uint8 block_id; + + /* Protect limited size of buffers[] array */ + Assert(record->max_block_id < MAX_GENERIC_XLOG_PAGES); + + /* Iterate over blocks */ + for (block_id = 0; block_id <= record->max_block_id; block_id++) + { + XLogRedoAction action; + + if (!XLogRecHasBlockRef(record, block_id)) + { + buffers[block_id] = InvalidBuffer; + continue; + } + + action = XLogReadBufferForRedo(record, block_id, &buffers[block_id]); + + /* Apply redo to given block if needed */ + if (action == BLK_NEEDS_REDO) + { + Page page; + PageHeader pageHeader; + char *blockDelta; + Size blockDeltaSize; + + page = BufferGetPage(buffers[block_id]); + blockDelta = XLogRecGetBlockData(record, block_id, &blockDeltaSize); + applyPageRedo(page, blockDelta, blockDeltaSize); + + /* + * Since the delta contains no information about what's in the + * "hole" between pd_lower and pd_upper, set that to zero to + * ensure we produce the same page state that application of the + * logged action by GenericXLogFinish did. + */ + pageHeader = (PageHeader) page; + memset(page + pageHeader->pd_lower, 0, + pageHeader->pd_upper - pageHeader->pd_lower); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffers[block_id]); + } + } + + /* Changes are done: unlock and release all buffers */ + for (block_id = 0; block_id <= record->max_block_id; block_id++) + { + if (BufferIsValid(buffers[block_id])) + UnlockReleaseBuffer(buffers[block_id]); + } +} + +/* + * Mask a generic page before performing consistency checks on it. + */ +void +generic_mask(char *page, BlockNumber blkno) +{ + mask_page_lsn_and_checksum(page); + + mask_unused_space(page); +} diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c new file mode 100644 index 0000000..b643564 --- /dev/null +++ b/src/backend/access/transam/multixact.c @@ -0,0 +1,3427 @@ +/*------------------------------------------------------------------------- + * + * multixact.c + * PostgreSQL multi-transaction-log manager + * + * The pg_multixact manager is a pg_xact-like manager that stores an array of + * MultiXactMember for each MultiXactId. It is a fundamental part of the + * shared-row-lock implementation. Each MultiXactMember is comprised of a + * TransactionId and a set of flag bits. The name is a bit historical: + * originally, a MultiXactId consisted of more than one TransactionId (except + * in rare corner cases), hence "multi". Nowadays, however, it's perfectly + * legitimate to have MultiXactIds that only include a single Xid. + * + * The meaning of the flag bits is opaque to this module, but they are mostly + * used in heapam.c to identify lock modes that each of the member transactions + * is holding on any given tuple. This module just contains support to store + * and retrieve the arrays. + * + * We use two SLRU areas, one for storing the offsets at which the data + * starts for each MultiXactId in the other one. This trick allows us to + * store variable length arrays of TransactionIds. (We could alternatively + * use one area containing counts and TransactionIds, with valid MultiXactId + * values pointing at slots containing counts; but that way seems less robust + * since it would get completely confused if someone inquired about a bogus + * MultiXactId that pointed to an intermediate slot containing an XID.) 
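/*
 * Standalone toy model (illustrative only) of the two-area scheme described
 * above: the "offsets" area records, for each MultiXactId, where its member
 * array starts in the "members" area, so the member count falls out as the
 * difference from the next MultiXactId's start.  (The real members area also
 * stores flag bits per xid; that is omitted here.)
 */
#include <assert.h>
#include <stdint.h>

int
main(void)
{
	/* offsets[m] = starting slot of multi m; slot 0 is left unused */
	uint32_t	offsets[] = {0, 1, 3, 6};
	uint32_t	members[] = {0, 101, 102, 103, 104, 105};

	uint32_t	multi = 2;
	uint32_t	start = offsets[multi];
	uint32_t	nmembers = offsets[multi + 1] - start;

	assert(nmembers == 3);
	assert(members[start] == 103);
	return 0;
}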
+ * + * XLOG interactions: this module generates a record whenever a new OFFSETs or + * MEMBERs page is initialized to zeroes, as well as an + * XLOG_MULTIXACT_CREATE_ID record whenever a new MultiXactId is defined. + * This module ignores the WAL rule "write xlog before data," because it + * suffices that actions recording a MultiXactId in a heap xmax do follow that + * rule. The only way for the MXID to be referenced from any data page is for + * heap_lock_tuple() or heap_update() to have put it there, and each generates + * an XLOG record that must follow ours. The normal LSN interlock between the + * data page and that XLOG record will ensure that our XLOG record reaches + * disk first. If the SLRU members/offsets data reaches disk sooner than the + * XLOG records, we do not care; after recovery, no xmax will refer to it. On + * the flip side, to ensure that all referenced entries _do_ reach disk, this + * module's XLOG records completely rebuild the data entered since the last + * checkpoint. We flush and sync all dirty OFFSETs and MEMBERs pages to disk + * before each checkpoint is considered complete. + * + * Like clog.c, and unlike subtrans.c, we have to preserve state across + * crashes and ensure that MXID and offset numbering increases monotonically + * across a crash. We do this in the same way as it's done for transaction + * IDs: the WAL record is guaranteed to contain evidence of every MXID we + * could need to worry about, and we just make sure that at the end of + * replay, the next-MXID and next-offset counters are at least as large as + * anything we saw during replay. + * + * We are able to remove segments no longer necessary by carefully tracking + * each table's used values: during vacuum, any multixact older than a certain + * value is removed; the cutoff value is stored in pg_class. The minimum value + * across all tables in each database is stored in pg_database, and the global + * minimum across all databases is part of pg_control and is kept in shared + * memory. Whenever that minimum is advanced, the SLRUs are truncated. + * + * When new multixactid values are to be created, care is taken that the + * counter does not fall within the wraparound horizon considering the global + * minimum value. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/multixact.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/multixact.h" +#include "access/slru.h" +#include "access/transam.h" +#include "access/twophase.h" +#include "access/twophase_rmgr.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "catalog/pg_type.h" +#include "commands/dbcommands.h" +#include "funcapi.h" +#include "lib/ilist.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "postmaster/autovacuum.h" +#include "storage/lmgr.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "utils/builtins.h" +#include "utils/memutils.h" +#include "utils/snapmgr.h" + + +/* + * Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is + * used everywhere else in Postgres. 
+ * + * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF, + * MultiXact page numbering also wraps around at + * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at + * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need + * take no explicit notice of that fact in this module, except when comparing + * segment and page numbers in TruncateMultiXact (see + * MultiXactOffsetPagePrecedes). + */ + +/* We need four bytes per offset */ +#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset)) + +#define MultiXactIdToOffsetPage(xid) \ + ((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE) +#define MultiXactIdToOffsetEntry(xid) \ + ((xid) % (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE) +#define MultiXactIdToOffsetSegment(xid) (MultiXactIdToOffsetPage(xid) / SLRU_PAGES_PER_SEGMENT) + +/* + * The situation for members is a bit more complex: we store one byte of + * additional flag bits for each TransactionId. To do this without getting + * into alignment issues, we store four bytes of flags, and then the + * corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and + * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups + * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and + * performance) trumps space efficiency here. + * + * Note that the "offset" macros work with byte offset, not array indexes, so + * arithmetic must be done using "char *" pointers. + */ +/* We need eight bits per xact, so one xact fits in a byte */ +#define MXACT_MEMBER_BITS_PER_XACT 8 +#define MXACT_MEMBER_FLAGS_PER_BYTE 1 +#define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) + +/* how many full bytes of flags are there in a group? */ +#define MULTIXACT_FLAGBYTES_PER_GROUP 4 +#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \ + (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE) +/* size in bytes of a complete group */ +#define MULTIXACT_MEMBERGROUP_SIZE \ + (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP) +#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE) +#define MULTIXACT_MEMBERS_PER_PAGE \ + (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP) + +/* + * Because the number of items per page is not a divisor of the last item + * number (member 0xFFFFFFFF), the last segment does not use the maximum number + * of pages, and moreover the last used page therein does not use the same + * number of items as previous pages. (Another way to say it is that the + * 0xFFFFFFFF member is somewhere in the middle of the last page, so the page + * has some empty space after that item.) + * + * This constant is the number of members in the last page of the last segment. 
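/*
 * Standalone sketch (illustrative; assumes 8 kB pages and 4-byte
 * TransactionIds) of the group addressing described above: 4 flag bytes
 * followed by 4 xids per 20-byte group, 409 groups per page.  It mirrors the
 * MULTIXACT_MEMBERGROUP_* constants and the MXOffsetTo* macros nearby.
 */
#include <assert.h>
#include <stdint.h>

#define SK_BLCKSZ				8192
#define SK_FLAGBYTES_PER_GROUP	4
#define SK_MEMBERS_PER_GROUP	4
#define SK_GROUP_SIZE			(sizeof(uint32_t) * SK_MEMBERS_PER_GROUP + SK_FLAGBYTES_PER_GROUP)
#define SK_GROUPS_PER_PAGE		(SK_BLCKSZ / SK_GROUP_SIZE)
#define SK_MEMBERS_PER_PAGE		(SK_GROUPS_PER_PAGE * SK_MEMBERS_PER_GROUP)

int
main(void)
{
	uint32_t	offset = 5000;	/* some member number */
	uint32_t	page = offset / SK_MEMBERS_PER_PAGE;
	uint32_t	group_on_page = (offset / SK_MEMBERS_PER_GROUP) % SK_GROUPS_PER_PAGE;
	uint32_t	flags_byte = group_on_page * SK_GROUP_SIZE;
	uint32_t	member_byte = flags_byte + SK_FLAGBYTES_PER_GROUP +
				(offset % SK_MEMBERS_PER_GROUP) * sizeof(uint32_t);

	assert(SK_GROUP_SIZE == 20);
	assert(SK_GROUPS_PER_PAGE == 409 && SK_MEMBERS_PER_PAGE == 1636);
	assert(page == 3 && flags_byte == 460 && member_byte == 464);
	return 0;
}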
+ */ +#define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \ + ((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1)) + +/* page in which a member is to be found */ +#define MXOffsetToMemberPage(xid) ((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_PAGE) +#define MXOffsetToMemberSegment(xid) (MXOffsetToMemberPage(xid) / SLRU_PAGES_PER_SEGMENT) + +/* Location (byte offset within page) of flag word for a given member */ +#define MXOffsetToFlagsOffset(xid) \ + ((((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) % \ + (TransactionId) MULTIXACT_MEMBERGROUPS_PER_PAGE) * \ + (TransactionId) MULTIXACT_MEMBERGROUP_SIZE) +#define MXOffsetToFlagsBitShift(xid) \ + (((xid) % (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) * \ + MXACT_MEMBER_BITS_PER_XACT) + +/* Location (byte offset within page) of TransactionId of given member */ +#define MXOffsetToMemberOffset(xid) \ + (MXOffsetToFlagsOffset(xid) + MULTIXACT_FLAGBYTES_PER_GROUP + \ + ((xid) % MULTIXACT_MEMBERS_PER_MEMBERGROUP) * sizeof(TransactionId)) + +/* Multixact members wraparound thresholds. */ +#define MULTIXACT_MEMBER_SAFE_THRESHOLD (MaxMultiXactOffset / 2) +#define MULTIXACT_MEMBER_DANGER_THRESHOLD \ + (MaxMultiXactOffset - MaxMultiXactOffset / 4) + +#define PreviousMultiXactId(xid) \ + ((xid) == FirstMultiXactId ? MaxMultiXactId : (xid) - 1) + +/* + * Links to shared-memory data structures for MultiXact control + */ +static SlruCtlData MultiXactOffsetCtlData; +static SlruCtlData MultiXactMemberCtlData; + +#define MultiXactOffsetCtl (&MultiXactOffsetCtlData) +#define MultiXactMemberCtl (&MultiXactMemberCtlData) + +/* + * MultiXact state shared across all backends. All this state is protected + * by MultiXactGenLock. (We also use MultiXactOffsetSLRULock and + * MultiXactMemberSLRULock to guard accesses to the two sets of SLRU + * buffers. For concurrency's sake, we avoid holding more than one of these + * locks at a time.) + */ +typedef struct MultiXactStateData +{ + /* next-to-be-assigned MultiXactId */ + MultiXactId nextMXact; + + /* next-to-be-assigned offset */ + MultiXactOffset nextOffset; + + /* Have we completed multixact startup? */ + bool finishedStartup; + + /* + * Oldest multixact that is still potentially referenced by a relation. + * Anything older than this should not be consulted. These values are + * updated by vacuum. + */ + MultiXactId oldestMultiXactId; + Oid oldestMultiXactDB; + + /* + * Oldest multixact offset that is potentially referenced by a multixact + * referenced by a relation. We don't always know this value, so there's + * a flag here to indicate whether or not we currently do. + */ + MultiXactOffset oldestOffset; + bool oldestOffsetKnown; + + /* support for anti-wraparound measures */ + MultiXactId multiVacLimit; + MultiXactId multiWarnLimit; + MultiXactId multiStopLimit; + MultiXactId multiWrapLimit; + + /* support for members anti-wraparound measures */ + MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */ + + /* + * Per-backend data starts here. We have two arrays stored in the area + * immediately following the MultiXactStateData struct. Each is indexed by + * BackendId. + * + * In both arrays, there's a slot for all normal backends (1..MaxBackends) + * followed by a slot for max_prepared_xacts prepared transactions. Valid + * BackendIds start from 1; element zero of each array is never used. 
+ * + * OldestMemberMXactId[k] is the oldest MultiXactId each backend's current + * transaction(s) could possibly be a member of, or InvalidMultiXactId + * when the backend has no live transaction that could possibly be a + * member of a MultiXact. Each backend sets its entry to the current + * nextMXact counter just before first acquiring a shared lock in a given + * transaction, and clears it at transaction end. (This works because only + * during or after acquiring a shared lock could an XID possibly become a + * member of a MultiXact, and that MultiXact would have to be created + * during or after the lock acquisition.) + * + * OldestVisibleMXactId[k] is the oldest MultiXactId each backend's + * current transaction(s) think is potentially live, or InvalidMultiXactId + * when not in a transaction or not in a transaction that's paid any + * attention to MultiXacts yet. This is computed when first needed in a + * given transaction, and cleared at transaction end. We can compute it + * as the minimum of the valid OldestMemberMXactId[] entries at the time + * we compute it (using nextMXact if none are valid). Each backend is + * required not to attempt to access any SLRU data for MultiXactIds older + * than its own OldestVisibleMXactId[] setting; this is necessary because + * the checkpointer could truncate away such data at any instant. + * + * The oldest valid value among all of the OldestMemberMXactId[] and + * OldestVisibleMXactId[] entries is considered by vacuum as the earliest + * possible value still having any live member transaction. Subtracting + * vacuum_multixact_freeze_min_age from that value we obtain the freezing + * point for multixacts for that table. Any value older than that is + * removed from tuple headers (or "frozen"; see FreezeMultiXactId. Note + * that multis that have member xids that are older than the cutoff point + * for xids must also be frozen, even if the multis themselves are newer + * than the multixid cutoff point). Whenever a full table vacuum happens, + * the freezing point so computed is used as the new pg_class.relminmxid + * value. The minimum of all those values in a database is stored as + * pg_database.datminmxid. In turn, the minimum of all of those values is + * stored in pg_control and used as truncation point for pg_multixact. At + * checkpoint or restartpoint, unneeded segments are removed. + */ + MultiXactId perBackendXactIds[FLEXIBLE_ARRAY_MEMBER]; +} MultiXactStateData; + +/* + * Last element of OldestMemberMXactId and OldestVisibleMXactId arrays. + * Valid elements are (1..MaxOldestSlot); element 0 is never used. + */ +#define MaxOldestSlot (MaxBackends + max_prepared_xacts) + +/* Pointers to the state data in shared memory */ +static MultiXactStateData *MultiXactState; +static MultiXactId *OldestMemberMXactId; +static MultiXactId *OldestVisibleMXactId; + + +/* + * Definitions for the backend-local MultiXactId cache. + * + * We use this cache to store known MultiXacts, so we don't need to go to + * SLRU areas every time. + * + * The cache lasts for the duration of a single transaction, the rationale + * for this being that most entries will contain our own TransactionId and + * so they will be uninteresting by the time our next transaction starts. + * (XXX not clear that this is correct --- other members of the MultiXact + * could hang around longer than we did. However, it's not clear what a + * better policy for flushing old cache entries would be.) FIXME actually + * this is plain wrong now that multixact's may contain update Xids. 
+ * + * We allocate the cache entries in a memory context that is deleted at + * transaction end, so we don't need to do retail freeing of entries. + */ +typedef struct mXactCacheEnt +{ + MultiXactId multi; + int nmembers; + dlist_node node; + MultiXactMember members[FLEXIBLE_ARRAY_MEMBER]; +} mXactCacheEnt; + +#define MAX_CACHE_ENTRIES 256 +static dlist_head MXactCache = DLIST_STATIC_INIT(MXactCache); +static int MXactCacheMembers = 0; +static MemoryContext MXactContext = NULL; + +#ifdef MULTIXACT_DEBUG +#define debug_elog2(a,b) elog(a,b) +#define debug_elog3(a,b,c) elog(a,b,c) +#define debug_elog4(a,b,c,d) elog(a,b,c,d) +#define debug_elog5(a,b,c,d,e) elog(a,b,c,d,e) +#define debug_elog6(a,b,c,d,e,f) elog(a,b,c,d,e,f) +#else +#define debug_elog2(a,b) +#define debug_elog3(a,b,c) +#define debug_elog4(a,b,c,d) +#define debug_elog5(a,b,c,d,e) +#define debug_elog6(a,b,c,d,e,f) +#endif + +/* internal MultiXactId management */ +static void MultiXactIdSetOldestVisible(void); +static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, + int nmembers, MultiXactMember *members); +static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset); + +/* MultiXact cache management */ +static int mxactMemberComparator(const void *arg1, const void *arg2); +static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members); +static int mXactCacheGetById(MultiXactId multi, MultiXactMember **members); +static void mXactCachePut(MultiXactId multi, int nmembers, + MultiXactMember *members); + +static char *mxstatus_to_string(MultiXactStatus status); + +/* management of SLRU infrastructure */ +static int ZeroMultiXactOffsetPage(int pageno, bool writeXlog); +static int ZeroMultiXactMemberPage(int pageno, bool writeXlog); +static bool MultiXactOffsetPagePrecedes(int page1, int page2); +static bool MultiXactMemberPagePrecedes(int page1, int page2); +static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, + MultiXactOffset offset2); +static void ExtendMultiXactOffset(MultiXactId multi); +static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers); +static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary, + MultiXactOffset start, uint32 distance); +static bool SetOffsetVacuumLimit(bool is_startup); +static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result); +static void WriteMZeroPageXlogRec(int pageno, uint8 info); +static void WriteMTruncateXlogRec(Oid oldestMultiDB, + MultiXactId startTruncOff, + MultiXactId endTruncOff, + MultiXactOffset startTruncMemb, + MultiXactOffset endTruncMemb); + + +/* + * MultiXactIdCreate + * Construct a MultiXactId representing two TransactionIds. + * + * The two XIDs must be different, or be requesting different statuses. + * + * NB - we don't worry about our local MultiXactId cache here, because that + * is handled by the lower-level routines. + */ +MultiXactId +MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1, + TransactionId xid2, MultiXactStatus status2) +{ + MultiXactId newMulti; + MultiXactMember members[2]; + + AssertArg(TransactionIdIsValid(xid1)); + AssertArg(TransactionIdIsValid(xid2)); + + Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2)); + + /* MultiXactIdSetOldestMember() must have been called already. */ + Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId])); + + /* + * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs + * are still running. 
In typical usage, xid2 will be our own XID and the + * caller just did a check on xid1, so it'd be wasted effort. + */ + + members[0].xid = xid1; + members[0].status = status1; + members[1].xid = xid2; + members[1].status = status2; + + newMulti = MultiXactIdCreateFromMembers(2, members); + + debug_elog3(DEBUG2, "Create: %s", + mxid_to_string(newMulti, 2, members)); + + return newMulti; +} + +/* + * MultiXactIdExpand + * Add a TransactionId to a pre-existing MultiXactId. + * + * If the TransactionId is already a member of the passed MultiXactId with the + * same status, just return it as-is. + * + * Note that we do NOT actually modify the membership of a pre-existing + * MultiXactId; instead we create a new one. This is necessary to avoid + * a race condition against code trying to wait for one MultiXactId to finish; + * see notes in heapam.c. + * + * NB - we don't worry about our local MultiXactId cache here, because that + * is handled by the lower-level routines. + * + * Note: It is critical that MultiXactIds that come from an old cluster (i.e. + * one upgraded by pg_upgrade from a cluster older than this feature) are not + * passed in. + */ +MultiXactId +MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status) +{ + MultiXactId newMulti; + MultiXactMember *members; + MultiXactMember *newMembers; + int nmembers; + int i; + int j; + + AssertArg(MultiXactIdIsValid(multi)); + AssertArg(TransactionIdIsValid(xid)); + + /* MultiXactIdSetOldestMember() must have been called already. */ + Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId])); + + debug_elog5(DEBUG2, "Expand: received multi %u, xid %u status %s", + multi, xid, mxstatus_to_string(status)); + + /* + * Note: we don't allow for old multis here. The reason is that the only + * caller of this function does a check that the multixact is no longer + * running. + */ + nmembers = GetMultiXactIdMembers(multi, &members, false, false); + + if (nmembers < 0) + { + MultiXactMember member; + + /* + * The MultiXactId is obsolete. This can only happen if all the + * MultiXactId members stop running between the caller checking and + * passing it to us. It would be better to return that fact to the + * caller, but it would complicate the API and it's unlikely to happen + * too often, so just deal with it by creating a singleton MultiXact. + */ + member.xid = xid; + member.status = status; + newMulti = MultiXactIdCreateFromMembers(1, &member); + + debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u", + multi, newMulti); + return newMulti; + } + + /* + * If the TransactionId is already a member of the MultiXactId with the + * same status, just return the existing MultiXactId. + */ + for (i = 0; i < nmembers; i++) + { + if (TransactionIdEquals(members[i].xid, xid) && + (members[i].status == status)) + { + debug_elog4(DEBUG2, "Expand: %u is already a member of %u", + xid, multi); + pfree(members); + return multi; + } + } + + /* + * Determine which of the members of the MultiXactId are still of + * interest. This is any running transaction, and also any transaction + * that grabbed something stronger than just a lock and was committed. (An + * update that aborted is of no interest here; and having more than one + * update Xid in a multixact would cause errors elsewhere.) + * + * Removing dead members is not just an optimization: freezing of tuples + * whose Xmax are multis depends on this behavior. + * + * Note we have the same race condition here as above: j could be 0 at the + * end of the loop. 
+ */ + newMembers = (MultiXactMember *) + palloc(sizeof(MultiXactMember) * (nmembers + 1)); + + for (i = 0, j = 0; i < nmembers; i++) + { + if (TransactionIdIsInProgress(members[i].xid) || + (ISUPDATE_from_mxstatus(members[i].status) && + TransactionIdDidCommit(members[i].xid))) + { + newMembers[j].xid = members[i].xid; + newMembers[j++].status = members[i].status; + } + } + + newMembers[j].xid = xid; + newMembers[j++].status = status; + newMulti = MultiXactIdCreateFromMembers(j, newMembers); + + pfree(members); + pfree(newMembers); + + debug_elog3(DEBUG2, "Expand: returning new multi %u", newMulti); + + return newMulti; +} + +/* + * MultiXactIdIsRunning + * Returns whether a MultiXactId is "running". + * + * We return true if at least one member of the given MultiXactId is still + * running. Note that a "false" result is certain not to change, + * because it is not legal to add members to an existing MultiXactId. + * + * Caller is expected to have verified that the multixact does not come from + * a pg_upgraded share-locked tuple. + */ +bool +MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly) +{ + MultiXactMember *members; + int nmembers; + int i; + + debug_elog3(DEBUG2, "IsRunning %u?", multi); + + /* + * "false" here means we assume our callers have checked that the given + * multi cannot possibly come from a pg_upgraded database. + */ + nmembers = GetMultiXactIdMembers(multi, &members, false, isLockOnly); + + if (nmembers <= 0) + { + debug_elog2(DEBUG2, "IsRunning: no members"); + return false; + } + + /* + * Checking for myself is cheap compared to looking in shared memory; + * return true if any live subtransaction of the current top-level + * transaction is a member. + * + * This is not needed for correctness, it's just a fast path. + */ + for (i = 0; i < nmembers; i++) + { + if (TransactionIdIsCurrentTransactionId(members[i].xid)) + { + debug_elog3(DEBUG2, "IsRunning: I (%d) am running!", i); + pfree(members); + return true; + } + } + + /* + * This could be made faster by having another entry point in procarray.c, + * walking the PGPROC array only once for all the members. But in most + * cases nmembers should be small enough that it doesn't much matter. + */ + for (i = 0; i < nmembers; i++) + { + if (TransactionIdIsInProgress(members[i].xid)) + { + debug_elog4(DEBUG2, "IsRunning: member %d (%u) is running", + i, members[i].xid); + pfree(members); + return true; + } + } + + pfree(members); + + debug_elog3(DEBUG2, "IsRunning: %u is not running", multi); + + return false; +} + +/* + * MultiXactIdSetOldestMember + * Save the oldest MultiXactId this transaction could be a member of. + * + * We set the OldestMemberMXactId for a given transaction the first time it's + * going to do some operation that might require a MultiXactId (tuple lock, + * update or delete). We need to do this even if we end up using a + * TransactionId instead of a MultiXactId, because there is a chance that + * another transaction would add our XID to a MultiXactId. + * + * The value to set is the next-to-be-assigned MultiXactId, so this is meant to + * be called just before doing any such possibly-MultiXactId-able operation. + */ +void +MultiXactIdSetOldestMember(void) +{ + if (!MultiXactIdIsValid(OldestMemberMXactId[MyBackendId])) + { + MultiXactId nextMXact; + + /* + * You might think we don't need to acquire a lock here, since + * fetching and storing of TransactionIds is probably atomic, but in + * fact we do: suppose we pick up nextMXact and then lose the CPU for + * a long time. 
Someone else could advance nextMXact, and then + * another someone else could compute an OldestVisibleMXactId that + * would be after the value we are going to store when we get control + * back. Which would be wrong. + * + * Note that a shared lock is sufficient, because it's enough to stop + * someone from advancing nextMXact; and nobody else could be trying + * to write to our OldestMember entry, only reading (and we assume + * storing it is atomic.) + */ + LWLockAcquire(MultiXactGenLock, LW_SHARED); + + /* + * We have to beware of the possibility that nextMXact is in the + * wrapped-around state. We don't fix the counter itself here, but we + * must be sure to store a valid value in our array entry. + */ + nextMXact = MultiXactState->nextMXact; + if (nextMXact < FirstMultiXactId) + nextMXact = FirstMultiXactId; + + OldestMemberMXactId[MyBackendId] = nextMXact; + + LWLockRelease(MultiXactGenLock); + + debug_elog4(DEBUG2, "MultiXact: setting OldestMember[%d] = %u", + MyBackendId, nextMXact); + } +} + +/* + * MultiXactIdSetOldestVisible + * Save the oldest MultiXactId this transaction considers possibly live. + * + * We set the OldestVisibleMXactId for a given transaction the first time + * it's going to inspect any MultiXactId. Once we have set this, we are + * guaranteed that the checkpointer won't truncate off SLRU data for + * MultiXactIds at or after our OldestVisibleMXactId. + * + * The value to set is the oldest of nextMXact and all the valid per-backend + * OldestMemberMXactId[] entries. Because of the locking we do, we can be + * certain that no subsequent call to MultiXactIdSetOldestMember can set + * an OldestMemberMXactId[] entry older than what we compute here. Therefore + * there is no live transaction, now or later, that can be a member of any + * MultiXactId older than the OldestVisibleMXactId we compute here. + */ +static void +MultiXactIdSetOldestVisible(void) +{ + if (!MultiXactIdIsValid(OldestVisibleMXactId[MyBackendId])) + { + MultiXactId oldestMXact; + int i; + + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + + /* + * We have to beware of the possibility that nextMXact is in the + * wrapped-around state. We don't fix the counter itself here, but we + * must be sure to store a valid value in our array entry. + */ + oldestMXact = MultiXactState->nextMXact; + if (oldestMXact < FirstMultiXactId) + oldestMXact = FirstMultiXactId; + + for (i = 1; i <= MaxOldestSlot; i++) + { + MultiXactId thisoldest = OldestMemberMXactId[i]; + + if (MultiXactIdIsValid(thisoldest) && + MultiXactIdPrecedes(thisoldest, oldestMXact)) + oldestMXact = thisoldest; + } + + OldestVisibleMXactId[MyBackendId] = oldestMXact; + + LWLockRelease(MultiXactGenLock); + + debug_elog4(DEBUG2, "MultiXact: setting OldestVisible[%d] = %u", + MyBackendId, oldestMXact); + } +} + +/* + * ReadNextMultiXactId + * Return the next MultiXactId to be assigned, but don't allocate it + */ +MultiXactId +ReadNextMultiXactId(void) +{ + MultiXactId mxid; + + /* XXX we could presumably do this without a lock. */ + LWLockAcquire(MultiXactGenLock, LW_SHARED); + mxid = MultiXactState->nextMXact; + LWLockRelease(MultiXactGenLock); + + if (mxid < FirstMultiXactId) + mxid = FirstMultiXactId; + + return mxid; +} + +/* + * ReadMultiXactIdRange + * Get the range of IDs that may still be referenced by a relation. 
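/*
 * Tiny standalone sketch (illustrative) of the normalization used repeatedly
 * in this file: once the 32-bit counter wraps, values below the first valid
 * MultiXactId are skipped over rather than handed out or stored.
 */
#include <assert.h>
#include <stdint.h>

#define SKETCH_FIRST_VALID_MULTI 1	/* stands in for FirstMultiXactId */

static uint32_t
sketch_normalize(uint32_t mxid)
{
	return (mxid < SKETCH_FIRST_VALID_MULTI) ? SKETCH_FIRST_VALID_MULTI : mxid;
}

int
main(void)
{
	assert(sketch_normalize(0) == SKETCH_FIRST_VALID_MULTI);	/* wrapped around */
	assert(sketch_normalize(42) == 42);
	return 0;
}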
+ */ +void +ReadMultiXactIdRange(MultiXactId *oldest, MultiXactId *next) +{ + LWLockAcquire(MultiXactGenLock, LW_SHARED); + *oldest = MultiXactState->oldestMultiXactId; + *next = MultiXactState->nextMXact; + LWLockRelease(MultiXactGenLock); + + if (*oldest < FirstMultiXactId) + *oldest = FirstMultiXactId; + if (*next < FirstMultiXactId) + *next = FirstMultiXactId; +} + + +/* + * MultiXactIdCreateFromMembers + * Make a new MultiXactId from the specified set of members + * + * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the + * given TransactionIds as members. Returns the newly created MultiXactId. + * + * NB: the passed members[] array will be sorted in-place. + */ +MultiXactId +MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members) +{ + MultiXactId multi; + MultiXactOffset offset; + xl_multixact_create xlrec; + + debug_elog3(DEBUG2, "Create: %s", + mxid_to_string(InvalidMultiXactId, nmembers, members)); + + /* + * See if the same set of members already exists in our cache; if so, just + * re-use that MultiXactId. (Note: it might seem that looking in our + * cache is insufficient, and we ought to search disk to see if a + * duplicate definition already exists. But since we only ever create + * MultiXacts containing our own XID, in most cases any such MultiXacts + * were in fact created by us, and so will be in our cache. There are + * corner cases where someone else added us to a MultiXact without our + * knowledge, but it's not worth checking for.) + */ + multi = mXactCacheGetBySet(nmembers, members); + if (MultiXactIdIsValid(multi)) + { + debug_elog2(DEBUG2, "Create: in cache!"); + return multi; + } + + /* Verify that there is a single update Xid among the given members. */ + { + int i; + bool has_update = false; + + for (i = 0; i < nmembers; i++) + { + if (ISUPDATE_from_mxstatus(members[i].status)) + { + if (has_update) + elog(ERROR, "new multixact has more than one updating member"); + has_update = true; + } + } + } + + /* + * Assign the MXID and offsets range to use, and make sure there is space + * in the OFFSETs and MEMBERs files. NB: this routine does + * START_CRIT_SECTION(). + * + * Note: unlike MultiXactIdCreate and MultiXactIdExpand, we do not check + * that we've called MultiXactIdSetOldestMember here. This is because + * this routine is used in some places to create new MultiXactIds of which + * the current backend is not a member, notably during freezing of multis + * in vacuum. During vacuum, in particular, it would be unacceptable to + * keep OldestMulti set, in case it runs for long. + */ + multi = GetNewMultiXactId(nmembers, &offset); + + /* Make an XLOG entry describing the new MXID. */ + xlrec.mid = multi; + xlrec.moff = offset; + xlrec.nmembers = nmembers; + + /* + * XXX Note: there's a lot of padding space in MultiXactMember. We could + * find a more compact representation of this Xlog record -- perhaps all + * the status flags in one XLogRecData, then all the xids in another one? + * Not clear that it's worth the trouble though. 
+ */ + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), SizeOfMultiXactCreate); + XLogRegisterData((char *) members, nmembers * sizeof(MultiXactMember)); + + (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID); + + /* Now enter the information into the OFFSETs and MEMBERs logs */ + RecordNewMultiXact(multi, offset, nmembers, members); + + /* Done with critical section */ + END_CRIT_SECTION(); + + /* Store the new MultiXactId in the local cache, too */ + mXactCachePut(multi, nmembers, members); + + debug_elog2(DEBUG2, "Create: all done"); + + return multi; +} + +/* + * RecordNewMultiXact + * Write info about a new multixact into the offsets and members files + * + * This is broken out of MultiXactIdCreateFromMembers so that xlog replay can + * use it. + */ +static void +RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, + int nmembers, MultiXactMember *members) +{ + int pageno; + int prev_pageno; + int entryno; + int slotno; + MultiXactOffset *offptr; + int i; + + LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + + pageno = MultiXactIdToOffsetPage(multi); + entryno = MultiXactIdToOffsetEntry(multi); + + /* + * Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction" + * to complain about if there's any I/O error. This is kinda bogus, but + * since the errors will always give the full pathname, it should be clear + * enough that a MultiXactId is really involved. Perhaps someday we'll + * take the trouble to generalize the slru.c error reporting code. + */ + slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi); + offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + offptr += entryno; + + *offptr = offset; + + MultiXactOffsetCtl->shared->page_dirty[slotno] = true; + + /* Exchange our lock */ + LWLockRelease(MultiXactOffsetSLRULock); + + LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); + + prev_pageno = -1; + + for (i = 0; i < nmembers; i++, offset++) + { + TransactionId *memberptr; + uint32 *flagsptr; + uint32 flagsval; + int bshift; + int flagsoff; + int memberoff; + + Assert(members[i].status <= MultiXactStatusUpdate); + + pageno = MXOffsetToMemberPage(offset); + memberoff = MXOffsetToMemberOffset(offset); + flagsoff = MXOffsetToFlagsOffset(offset); + bshift = MXOffsetToFlagsBitShift(offset); + + if (pageno != prev_pageno) + { + slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi); + prev_pageno = pageno; + } + + memberptr = (TransactionId *) + (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); + + *memberptr = members[i].xid; + + flagsptr = (uint32 *) + (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff); + + flagsval = *flagsptr; + flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift); + flagsval |= (members[i].status << bshift); + *flagsptr = flagsval; + + MultiXactMemberCtl->shared->page_dirty[slotno] = true; + } + + LWLockRelease(MultiXactMemberSLRULock); +} + +/* + * GetNewMultiXactId + * Get the next MultiXactId. + * + * Also, reserve the needed amount of space in the "members" area. The + * starting offset of the reserved space is returned in *offset. + * + * This may generate XLOG records for expansion of the offsets and/or members + * files. Unfortunately, we have to do that while holding MultiXactGenLock + * to avoid race conditions --- the XLOG record for zeroing a page must appear + * before any backend can possibly try to store data in that page! + * + * We start a critical section before advancing the shared counters. 
The + * caller must end the critical section after writing SLRU data. + */ +static MultiXactId +GetNewMultiXactId(int nmembers, MultiXactOffset *offset) +{ + MultiXactId result; + MultiXactOffset nextOffset; + + debug_elog3(DEBUG2, "GetNew: for %d xids", nmembers); + + /* safety check, we should never get this far in a HS standby */ + if (RecoveryInProgress()) + elog(ERROR, "cannot assign MultiXactIds during recovery"); + + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + + /* Handle wraparound of the nextMXact counter */ + if (MultiXactState->nextMXact < FirstMultiXactId) + MultiXactState->nextMXact = FirstMultiXactId; + + /* Assign the MXID */ + result = MultiXactState->nextMXact; + + /*---------- + * Check to see if it's safe to assign another MultiXactId. This protects + * against catastrophic data loss due to multixact wraparound. The basic + * rules are: + * + * If we're past multiVacLimit or the safe threshold for member storage + * space, or we don't know what the safe threshold for member storage is, + * start trying to force autovacuum cycles. + * If we're past multiWarnLimit, start issuing warnings. + * If we're past multiStopLimit, refuse to create new MultiXactIds. + * + * Note these are pretty much the same protections in GetNewTransactionId. + *---------- + */ + if (!MultiXactIdPrecedes(result, MultiXactState->multiVacLimit)) + { + /* + * For safety's sake, we release MultiXactGenLock while sending + * signals, warnings, etc. This is not so much because we care about + * preserving concurrency in this situation, as to avoid any + * possibility of deadlock while doing get_database_name(). First, + * copy all the shared values we'll need in this path. + */ + MultiXactId multiWarnLimit = MultiXactState->multiWarnLimit; + MultiXactId multiStopLimit = MultiXactState->multiStopLimit; + MultiXactId multiWrapLimit = MultiXactState->multiWrapLimit; + Oid oldest_datoid = MultiXactState->oldestMultiXactDB; + + LWLockRelease(MultiXactGenLock); + + if (IsUnderPostmaster && + !MultiXactIdPrecedes(result, multiStopLimit)) + { + char *oldest_datname = get_database_name(oldest_datoid); + + /* + * Immediately kick autovacuum into action as we're already in + * ERROR territory. + */ + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); + + /* complain even if that DB has disappeared */ + if (oldest_datname) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("database is not accepting commands that generate new MultiXactIds to avoid wraparound data loss in database \"%s\"", + oldest_datname), + errhint("Execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + else + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("database is not accepting commands that generate new MultiXactIds to avoid wraparound data loss in database with OID %u", + oldest_datoid), + errhint("Execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + } + + /* + * To avoid swamping the postmaster with signals, we issue the autovac + * request only once per 64K multis generated. This still gives + * plenty of chances before we get into real trouble. 
+ */ + if (IsUnderPostmaster && (result % 65536) == 0) + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); + + if (!MultiXactIdPrecedes(result, multiWarnLimit)) + { + char *oldest_datname = get_database_name(oldest_datoid); + + /* complain even if that DB has disappeared */ + if (oldest_datname) + ereport(WARNING, + (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used", + "database \"%s\" must be vacuumed before %u more MultiXactIds are used", + multiWrapLimit - result, + oldest_datname, + multiWrapLimit - result), + errhint("Execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + else + ereport(WARNING, + (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used", + "database with OID %u must be vacuumed before %u more MultiXactIds are used", + multiWrapLimit - result, + oldest_datoid, + multiWrapLimit - result), + errhint("Execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + } + + /* Re-acquire lock and start over */ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + result = MultiXactState->nextMXact; + if (result < FirstMultiXactId) + result = FirstMultiXactId; + } + + /* Make sure there is room for the MXID in the file. */ + ExtendMultiXactOffset(result); + + /* + * Reserve the members space, similarly to above. Also, be careful not to + * return zero as the starting offset for any multixact. See + * GetMultiXactIdMembers() for motivation. + */ + nextOffset = MultiXactState->nextOffset; + if (nextOffset == 0) + { + *offset = 1; + nmembers++; /* allocate member slot 0 too */ + } + else + *offset = nextOffset; + + /*---------- + * Protect against overrun of the members space as well, with the + * following rules: + * + * If we're past offsetStopLimit, refuse to generate more multis. + * If we're close to offsetStopLimit, emit a warning. + * + * Arbitrarily, we start emitting warnings when we're 20 segments or less + * from offsetStopLimit. + * + * Note we haven't updated the shared state yet, so if we fail at this + * point, the multixact ID we grabbed can still be used by the next guy. + * + * Note that there is no point in forcing autovacuum runs here: the + * multixact freeze settings would have to be reduced for that to have any + * effect. 
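/*
 * Back-of-the-envelope sketch (illustrative; assumes 8 kB pages, hence 1636
 * members per page, and the stock SLRU segment size of 32 pages) quantifying
 * the warning window used below: 20 segments' worth of member slots.
 */
#include <assert.h>

int
main(void)
{
	int			members_per_page = 1636;	/* 409 groups * 4 members */
	int			pages_per_segment = 32;		/* SLRU_PAGES_PER_SEGMENT */
	int			warn_segments = 20;			/* OFFSET_WARN_SEGMENTS */

	/* one segment is roughly the "50k members" mentioned below */
	assert(members_per_page * pages_per_segment == 52352);

	/* so warnings begin a bit over a million member slots before the stop limit */
	assert(members_per_page * pages_per_segment * warn_segments == 1047040);
	return 0;
}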
+ *---------- + */ +#define OFFSET_WARN_SEGMENTS 20 + if (MultiXactState->oldestOffsetKnown && + MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, nextOffset, + nmembers)) + { + /* see comment in the corresponding offsets wraparound case */ + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); + + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("multixact \"members\" limit exceeded"), + errdetail_plural("This command would create a multixact with %u members, but the remaining space is only enough for %u member.", + "This command would create a multixact with %u members, but the remaining space is only enough for %u members.", + MultiXactState->offsetStopLimit - nextOffset - 1, + nmembers, + MultiXactState->offsetStopLimit - nextOffset - 1), + errhint("Execute a database-wide VACUUM in database with OID %u with reduced vacuum_multixact_freeze_min_age and vacuum_multixact_freeze_table_age settings.", + MultiXactState->oldestMultiXactDB))); + } + + /* + * Check whether we should kick autovacuum into action, to prevent members + * wraparound. NB we use a much larger window to trigger autovacuum than + * just the warning limit. The warning is just a measure of last resort - + * this is in line with GetNewTransactionId's behaviour. + */ + if (!MultiXactState->oldestOffsetKnown || + (MultiXactState->nextOffset - MultiXactState->oldestOffset + > MULTIXACT_MEMBER_SAFE_THRESHOLD)) + { + /* + * To avoid swamping the postmaster with signals, we issue the autovac + * request only when crossing a segment boundary. With default + * compilation settings that's roughly after 50k members. This still + * gives plenty of chances before we get into real trouble. + */ + if ((MXOffsetToMemberPage(nextOffset) / SLRU_PAGES_PER_SEGMENT) != + (MXOffsetToMemberPage(nextOffset + nmembers) / SLRU_PAGES_PER_SEGMENT)) + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); + } + + if (MultiXactState->oldestOffsetKnown && + MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, + nextOffset, + nmembers + MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT * OFFSET_WARN_SEGMENTS)) + ereport(WARNING, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg_plural("database with OID %u must be vacuumed before %d more multixact member is used", + "database with OID %u must be vacuumed before %d more multixact members are used", + MultiXactState->offsetStopLimit - nextOffset + nmembers, + MultiXactState->oldestMultiXactDB, + MultiXactState->offsetStopLimit - nextOffset + nmembers), + errhint("Execute a database-wide VACUUM in that database with reduced vacuum_multixact_freeze_min_age and vacuum_multixact_freeze_table_age settings."))); + + ExtendMultiXactMember(nextOffset, nmembers); + + /* + * Critical section from here until caller has written the data into the + * just-reserved SLRU space; we don't want to error out with a partly + * written MultiXact structure. (In particular, failing to write our + * start offset after advancing nextMXact would effectively corrupt the + * previous MultiXact.) + */ + START_CRIT_SECTION(); + + /* + * Advance counters. As in GetNewTransactionId(), this must not happen + * until after file extension has succeeded! + * + * We don't care about MultiXactId wraparound here; it will be handled by + * the next iteration. But note that nextMXact may be InvalidMultiXactId + * or the first value on a segment-beginning page after this routine + * exits, so anyone else looking at the variable must be prepared to deal + * with either case. 
Similarly, nextOffset may be zero, but we won't use
+ * that as the actual start offset of the next multixact.
+ */
+ (MultiXactState->nextMXact)++;
+
+ MultiXactState->nextOffset += nmembers;
+
+ LWLockRelease(MultiXactGenLock);
+
+ debug_elog4(DEBUG2, "GetNew: returning %u offset %u", result, *offset);
+ return result;
+}
+
+/*
+ * GetMultiXactIdMembers
+ * Return the set of MultiXactMembers that make up a MultiXactId
+ *
+ * Return value is the number of members found, or -1 if there are none,
+ * and *members is set to a newly palloc'ed array of members. It's the
+ * caller's responsibility to free it when done with it.
+ *
+ * from_pgupgrade must be passed as true if and only if the multixact
+ * corresponds to a value from a tuple that was locked in a 9.2-or-older
+ * installation and later pg_upgrade'd (that is, the infomask is
+ * HEAP_LOCKED_UPGRADED). In this case, we know for certain that no members
+ * can still be running, so we return -1 just like for an empty multixact
+ * without any further checking. It would be wrong to try to resolve such a
+ * multixact: either the multixact is within the current valid multixact
+ * range, in which case the returned result would be bogus, or outside that
+ * range, in which case an error would be raised.
+ *
+ * In all other cases, the passed multixact must be within the known valid
+ * range, that is, greater than or equal to oldestMultiXactId, and less than
+ * nextMXact. Otherwise, an error is raised.
+ *
+ * onlyLock must be set to true if caller is certain that the given multi
+ * is used only to lock tuples; can be false without loss of correctness,
+ * but passing true means we can return quickly without checking for
+ * old updates.
+ */
+int
+GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
+ bool from_pgupgrade, bool onlyLock)
+{
+ int pageno;
+ int prev_pageno;
+ int entryno;
+ int slotno;
+ MultiXactOffset *offptr;
+ MultiXactOffset offset;
+ int length;
+ int truelength;
+ int i;
+ MultiXactId oldestMXact;
+ MultiXactId nextMXact;
+ MultiXactId tmpMXact;
+ MultiXactOffset nextOffset;
+ MultiXactMember *ptr;
+
+ debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);
+
+ if (!MultiXactIdIsValid(multi) || from_pgupgrade)
+ {
+ *members = NULL;
+ return -1;
+ }
+
+ /* See if the MultiXactId is in the local cache */
+ length = mXactCacheGetById(multi, members);
+ if (length >= 0)
+ {
+ debug_elog3(DEBUG2, "GetMembers: found %s in the cache",
+ mxid_to_string(multi, length, *members));
+ return length;
+ }
+
+ /* Set our OldestVisibleMXactId[] entry if we didn't already */
+ MultiXactIdSetOldestVisible();
+
+ /*
+ * If we know the multi is used only for locking and not for updates, then
+ * we can skip checking if the value is older than our oldest visible
+ * multi. It cannot possibly still be running.
+ */
+ if (onlyLock &&
+ MultiXactIdPrecedes(multi, OldestVisibleMXactId[MyBackendId]))
+ {
+ debug_elog2(DEBUG2, "GetMembers: a locker-only multi is too old");
+ *members = NULL;
+ return -1;
+ }
+
+ /*
+ * We check known limits on MultiXact before resorting to the SLRU area.
+ *
+ * An ID older than MultiXactState->oldestMultiXactId cannot possibly be
+ * useful; it has already been removed, or will be removed shortly, by
+ * truncation. If one is passed, an error is raised.
+ *
+ * Also, an ID >= nextMXact shouldn't ever be seen here; if it is seen, it
+ * implies undetected ID wraparound has occurred. This raises a hard
+ * error.
+ * + * Shared lock is enough here since we aren't modifying any global state. + * Acquire it just long enough to grab the current counter values. We may + * need both nextMXact and nextOffset; see below. + */ + LWLockAcquire(MultiXactGenLock, LW_SHARED); + + oldestMXact = MultiXactState->oldestMultiXactId; + nextMXact = MultiXactState->nextMXact; + nextOffset = MultiXactState->nextOffset; + + LWLockRelease(MultiXactGenLock); + + if (MultiXactIdPrecedes(multi, oldestMXact)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("MultiXactId %u does no longer exist -- apparent wraparound", + multi))); + + if (!MultiXactIdPrecedes(multi, nextMXact)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("MultiXactId %u has not been created yet -- apparent wraparound", + multi))); + + /* + * Find out the offset at which we need to start reading MultiXactMembers + * and the number of members in the multixact. We determine the latter as + * the difference between this multixact's starting offset and the next + * one's. However, there are some corner cases to worry about: + * + * 1. This multixact may be the latest one created, in which case there is + * no next one to look at. In this case the nextOffset value we just + * saved is the correct endpoint. + * + * 2. The next multixact may still be in process of being filled in: that + * is, another process may have done GetNewMultiXactId but not yet written + * the offset entry for that ID. In that scenario, it is guaranteed that + * the offset entry for that multixact exists (because GetNewMultiXactId + * won't release MultiXactGenLock until it does) but contains zero + * (because we are careful to pre-zero offset pages). Because + * GetNewMultiXactId will never return zero as the starting offset for a + * multixact, when we read zero as the next multixact's offset, we know we + * have this case. We sleep for a bit and try again. + * + * 3. Because GetNewMultiXactId increments offset zero to offset one to + * handle case #2, there is an ambiguity near the point of offset + * wraparound. If we see next multixact's offset is one, is that our + * multixact's actual endpoint, or did it end at zero with a subsequent + * increment? We handle this using the knowledge that if the zero'th + * member slot wasn't filled, it'll contain zero, and zero isn't a valid + * transaction ID so it can't be a multixact member. Therefore, if we + * read a zero from the members array, just ignore it. + * + * This is all pretty messy, but the mess occurs only in infrequent corner + * cases, so it seems better than holding the MultiXactGenLock for a long + * time on every multixact creation. + */ +retry: + LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + + pageno = MultiXactIdToOffsetPage(multi); + entryno = MultiXactIdToOffsetEntry(multi); + + slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi); + offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + offptr += entryno; + offset = *offptr; + + Assert(offset != 0); + + /* + * Use the same increment rule as GetNewMultiXactId(), that is, don't + * handle wraparound explicitly until needed. 
+ */ + tmpMXact = multi + 1; + + if (nextMXact == tmpMXact) + { + /* Corner case 1: there is no next multixact */ + length = nextOffset - offset; + } + else + { + MultiXactOffset nextMXOffset; + + /* handle wraparound if needed */ + if (tmpMXact < FirstMultiXactId) + tmpMXact = FirstMultiXactId; + + prev_pageno = pageno; + + pageno = MultiXactIdToOffsetPage(tmpMXact); + entryno = MultiXactIdToOffsetEntry(tmpMXact); + + if (pageno != prev_pageno) + slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact); + + offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + offptr += entryno; + nextMXOffset = *offptr; + + if (nextMXOffset == 0) + { + /* Corner case 2: next multixact is still being filled in */ + LWLockRelease(MultiXactOffsetSLRULock); + CHECK_FOR_INTERRUPTS(); + pg_usleep(1000L); + goto retry; + } + + length = nextMXOffset - offset; + } + + LWLockRelease(MultiXactOffsetSLRULock); + + ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember)); + + /* Now get the members themselves. */ + LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); + + truelength = 0; + prev_pageno = -1; + for (i = 0; i < length; i++, offset++) + { + TransactionId *xactptr; + uint32 *flagsptr; + int flagsoff; + int bshift; + int memberoff; + + pageno = MXOffsetToMemberPage(offset); + memberoff = MXOffsetToMemberOffset(offset); + + if (pageno != prev_pageno) + { + slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi); + prev_pageno = pageno; + } + + xactptr = (TransactionId *) + (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); + + if (!TransactionIdIsValid(*xactptr)) + { + /* Corner case 3: we must be looking at unused slot zero */ + Assert(offset == 0); + continue; + } + + flagsoff = MXOffsetToFlagsOffset(offset); + bshift = MXOffsetToFlagsBitShift(offset); + flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff); + + ptr[truelength].xid = *xactptr; + ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK; + truelength++; + } + + LWLockRelease(MultiXactMemberSLRULock); + + /* A multixid with zero members should not happen */ + Assert(truelength > 0); + + /* + * Copy the result into the local cache. + */ + mXactCachePut(multi, truelength, ptr); + + debug_elog3(DEBUG2, "GetMembers: no cache for %s", + mxid_to_string(multi, truelength, ptr)); + *members = ptr; + return truelength; +} + +/* + * mxactMemberComparator + * qsort comparison function for MultiXactMember + * + * We can't use wraparound comparison for XIDs because that does not respect + * the triangle inequality! Any old sort order will do. + */ +static int +mxactMemberComparator(const void *arg1, const void *arg2) +{ + MultiXactMember member1 = *(const MultiXactMember *) arg1; + MultiXactMember member2 = *(const MultiXactMember *) arg2; + + if (member1.xid > member2.xid) + return 1; + if (member1.xid < member2.xid) + return -1; + if (member1.status > member2.status) + return 1; + if (member1.status < member2.status) + return -1; + return 0; +} + +/* + * mXactCacheGetBySet + * returns a MultiXactId from the cache based on the set of + * TransactionIds that compose it, or InvalidMultiXactId if + * none matches. + * + * This is helpful, for example, if two transactions want to lock a huge + * table. By using the cache, the second will use the same MultiXactId + * for the majority of tuples, thus keeping MultiXactId usage low (saving + * both I/O and wraparound issues). + * + * NB: the passed members array will be sorted in-place. 
+ */ +static MultiXactId +mXactCacheGetBySet(int nmembers, MultiXactMember *members) +{ + dlist_iter iter; + + debug_elog3(DEBUG2, "CacheGet: looking for %s", + mxid_to_string(InvalidMultiXactId, nmembers, members)); + + /* sort the array so comparison is easy */ + qsort(members, nmembers, sizeof(MultiXactMember), mxactMemberComparator); + + dlist_foreach(iter, &MXactCache) + { + mXactCacheEnt *entry = dlist_container(mXactCacheEnt, node, iter.cur); + + if (entry->nmembers != nmembers) + continue; + + /* + * We assume the cache entries are sorted, and that the unused bits in + * "status" are zeroed. + */ + if (memcmp(members, entry->members, nmembers * sizeof(MultiXactMember)) == 0) + { + debug_elog3(DEBUG2, "CacheGet: found %u", entry->multi); + dlist_move_head(&MXactCache, iter.cur); + return entry->multi; + } + } + + debug_elog2(DEBUG2, "CacheGet: not found :-("); + return InvalidMultiXactId; +} + +/* + * mXactCacheGetById + * returns the composing MultiXactMember set from the cache for a + * given MultiXactId, if present. + * + * If successful, *xids is set to the address of a palloc'd copy of the + * MultiXactMember set. Return value is number of members, or -1 on failure. + */ +static int +mXactCacheGetById(MultiXactId multi, MultiXactMember **members) +{ + dlist_iter iter; + + debug_elog3(DEBUG2, "CacheGet: looking for %u", multi); + + dlist_foreach(iter, &MXactCache) + { + mXactCacheEnt *entry = dlist_container(mXactCacheEnt, node, iter.cur); + + if (entry->multi == multi) + { + MultiXactMember *ptr; + Size size; + + size = sizeof(MultiXactMember) * entry->nmembers; + ptr = (MultiXactMember *) palloc(size); + + memcpy(ptr, entry->members, size); + + debug_elog3(DEBUG2, "CacheGet: found %s", + mxid_to_string(multi, + entry->nmembers, + entry->members)); + + /* + * Note we modify the list while not using a modifiable iterator. + * This is acceptable only because we exit the iteration + * immediately afterwards. + */ + dlist_move_head(&MXactCache, iter.cur); + + *members = ptr; + return entry->nmembers; + } + } + + debug_elog2(DEBUG2, "CacheGet: not found"); + return -1; +} + +/* + * mXactCachePut + * Add a new MultiXactId and its composing set into the local cache. 
+ */ +static void +mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members) +{ + mXactCacheEnt *entry; + + debug_elog3(DEBUG2, "CachePut: storing %s", + mxid_to_string(multi, nmembers, members)); + + if (MXactContext == NULL) + { + /* The cache only lives as long as the current transaction */ + debug_elog2(DEBUG2, "CachePut: initializing memory context"); + MXactContext = AllocSetContextCreate(TopTransactionContext, + "MultiXact cache context", + ALLOCSET_SMALL_SIZES); + } + + entry = (mXactCacheEnt *) + MemoryContextAlloc(MXactContext, + offsetof(mXactCacheEnt, members) + + nmembers * sizeof(MultiXactMember)); + + entry->multi = multi; + entry->nmembers = nmembers; + memcpy(entry->members, members, nmembers * sizeof(MultiXactMember)); + + /* mXactCacheGetBySet assumes the entries are sorted, so sort them */ + qsort(entry->members, nmembers, sizeof(MultiXactMember), mxactMemberComparator); + + dlist_push_head(&MXactCache, &entry->node); + if (MXactCacheMembers++ >= MAX_CACHE_ENTRIES) + { + dlist_node *node; + mXactCacheEnt *entry; + + node = dlist_tail_node(&MXactCache); + dlist_delete(node); + MXactCacheMembers--; + + entry = dlist_container(mXactCacheEnt, node, node); + debug_elog3(DEBUG2, "CachePut: pruning cached multi %u", + entry->multi); + + pfree(entry); + } +} + +static char * +mxstatus_to_string(MultiXactStatus status) +{ + switch (status) + { + case MultiXactStatusForKeyShare: + return "keysh"; + case MultiXactStatusForShare: + return "sh"; + case MultiXactStatusForNoKeyUpdate: + return "fornokeyupd"; + case MultiXactStatusForUpdate: + return "forupd"; + case MultiXactStatusNoKeyUpdate: + return "nokeyupd"; + case MultiXactStatusUpdate: + return "upd"; + default: + elog(ERROR, "unrecognized multixact status %d", status); + return ""; + } +} + +char * +mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members) +{ + static char *str = NULL; + StringInfoData buf; + int i; + + if (str != NULL) + pfree(str); + + initStringInfo(&buf); + + appendStringInfo(&buf, "%u %d[%u (%s)", multi, nmembers, members[0].xid, + mxstatus_to_string(members[0].status)); + + for (i = 1; i < nmembers; i++) + appendStringInfo(&buf, ", %u (%s)", members[i].xid, + mxstatus_to_string(members[i].status)); + + appendStringInfoChar(&buf, ']'); + str = MemoryContextStrdup(TopMemoryContext, buf.data); + pfree(buf.data); + return str; +} + +/* + * AtEOXact_MultiXact + * Handle transaction end for MultiXact + * + * This is called at top transaction commit or abort (we don't care which). + */ +void +AtEOXact_MultiXact(void) +{ + /* + * Reset our OldestMemberMXactId and OldestVisibleMXactId values, both of + * which should only be valid while within a transaction. + * + * We assume that storing a MultiXactId is atomic and so we need not take + * MultiXactGenLock to do this. + */ + OldestMemberMXactId[MyBackendId] = InvalidMultiXactId; + OldestVisibleMXactId[MyBackendId] = InvalidMultiXactId; + + /* + * Discard the local MultiXactId cache. Since MXactContext was created as + * a child of TopTransactionContext, we needn't delete it explicitly. + */ + MXactContext = NULL; + dlist_init(&MXactCache); + MXactCacheMembers = 0; +} + +/* + * AtPrepare_MultiXact + * Save multixact state at 2PC transaction prepare + * + * In this phase, we only store our OldestMemberMXactId value in the two-phase + * state file. 
+ */ +void +AtPrepare_MultiXact(void) +{ + MultiXactId myOldestMember = OldestMemberMXactId[MyBackendId]; + + if (MultiXactIdIsValid(myOldestMember)) + RegisterTwoPhaseRecord(TWOPHASE_RM_MULTIXACT_ID, 0, + &myOldestMember, sizeof(MultiXactId)); +} + +/* + * PostPrepare_MultiXact + * Clean up after successful PREPARE TRANSACTION + */ +void +PostPrepare_MultiXact(TransactionId xid) +{ + MultiXactId myOldestMember; + + /* + * Transfer our OldestMemberMXactId value to the slot reserved for the + * prepared transaction. + */ + myOldestMember = OldestMemberMXactId[MyBackendId]; + if (MultiXactIdIsValid(myOldestMember)) + { + BackendId dummyBackendId = TwoPhaseGetDummyBackendId(xid, false); + + /* + * Even though storing MultiXactId is atomic, acquire lock to make + * sure others see both changes, not just the reset of the slot of the + * current backend. Using a volatile pointer might suffice, but this + * isn't a hot spot. + */ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + + OldestMemberMXactId[dummyBackendId] = myOldestMember; + OldestMemberMXactId[MyBackendId] = InvalidMultiXactId; + + LWLockRelease(MultiXactGenLock); + } + + /* + * We don't need to transfer OldestVisibleMXactId value, because the + * transaction is not going to be looking at any more multixacts once it's + * prepared. + * + * We assume that storing a MultiXactId is atomic and so we need not take + * MultiXactGenLock to do this. + */ + OldestVisibleMXactId[MyBackendId] = InvalidMultiXactId; + + /* + * Discard the local MultiXactId cache like in AtEOXact_MultiXact. + */ + MXactContext = NULL; + dlist_init(&MXactCache); + MXactCacheMembers = 0; +} + +/* + * multixact_twophase_recover + * Recover the state of a prepared transaction at startup + */ +void +multixact_twophase_recover(TransactionId xid, uint16 info, + void *recdata, uint32 len) +{ + BackendId dummyBackendId = TwoPhaseGetDummyBackendId(xid, false); + MultiXactId oldestMember; + + /* + * Get the oldest member XID from the state file record, and set it in the + * OldestMemberMXactId slot reserved for this prepared transaction. + */ + Assert(len == sizeof(MultiXactId)); + oldestMember = *((MultiXactId *) recdata); + + OldestMemberMXactId[dummyBackendId] = oldestMember; +} + +/* + * multixact_twophase_postcommit + * Similar to AtEOXact_MultiXact but for COMMIT PREPARED + */ +void +multixact_twophase_postcommit(TransactionId xid, uint16 info, + void *recdata, uint32 len) +{ + BackendId dummyBackendId = TwoPhaseGetDummyBackendId(xid, true); + + Assert(len == sizeof(MultiXactId)); + + OldestMemberMXactId[dummyBackendId] = InvalidMultiXactId; +} + +/* + * multixact_twophase_postabort + * This is actually just the same as the COMMIT case. + */ +void +multixact_twophase_postabort(TransactionId xid, uint16 info, + void *recdata, uint32 len) +{ + multixact_twophase_postcommit(xid, info, recdata, len); +} + +/* + * Initialization of shared memory for MultiXact. We use two SLRU areas, + * thus double memory. Also, reserve space for the shared MultiXactState + * struct and the per-backend MultiXactId arrays (two of those, too). 
+ */ +Size +MultiXactShmemSize(void) +{ + Size size; + + /* We need 2*MaxOldestSlot + 1 perBackendXactIds[] entries */ +#define SHARED_MULTIXACT_STATE_SIZE \ + add_size(offsetof(MultiXactStateData, perBackendXactIds) + sizeof(MultiXactId), \ + mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot)) + + size = SHARED_MULTIXACT_STATE_SIZE; + size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTOFFSET_BUFFERS, 0)); + size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTMEMBER_BUFFERS, 0)); + + return size; +} + +void +MultiXactShmemInit(void) +{ + bool found; + + debug_elog2(DEBUG2, "Shared Memory Init for MultiXact"); + + MultiXactOffsetCtl->PagePrecedes = MultiXactOffsetPagePrecedes; + MultiXactMemberCtl->PagePrecedes = MultiXactMemberPagePrecedes; + + SimpleLruInit(MultiXactOffsetCtl, + "MultiXactOffset", NUM_MULTIXACTOFFSET_BUFFERS, 0, + MultiXactOffsetSLRULock, "pg_multixact/offsets", + LWTRANCHE_MULTIXACTOFFSET_BUFFER, + SYNC_HANDLER_MULTIXACT_OFFSET); + SlruPagePrecedesUnitTests(MultiXactOffsetCtl, MULTIXACT_OFFSETS_PER_PAGE); + SimpleLruInit(MultiXactMemberCtl, + "MultiXactMember", NUM_MULTIXACTMEMBER_BUFFERS, 0, + MultiXactMemberSLRULock, "pg_multixact/members", + LWTRANCHE_MULTIXACTMEMBER_BUFFER, + SYNC_HANDLER_MULTIXACT_MEMBER); + /* doesn't call SimpleLruTruncate() or meet criteria for unit tests */ + + /* Initialize our shared state struct */ + MultiXactState = ShmemInitStruct("Shared MultiXact State", + SHARED_MULTIXACT_STATE_SIZE, + &found); + if (!IsUnderPostmaster) + { + Assert(!found); + + /* Make sure we zero out the per-backend state */ + MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE); + } + else + Assert(found); + + /* + * Set up array pointers. Note that perBackendXactIds[0] is wasted space + * since we only use indexes 1..MaxOldestSlot in each array. + */ + OldestMemberMXactId = MultiXactState->perBackendXactIds; + OldestVisibleMXactId = OldestMemberMXactId + MaxOldestSlot; +} + +/* + * This func must be called ONCE on system install. It creates the initial + * MultiXact segments. (The MultiXacts directories are assumed to have been + * created by initdb, and MultiXactShmemInit must have been called already.) + */ +void +BootStrapMultiXact(void) +{ + int slotno; + + LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + + /* Create and zero the first page of the offsets log */ + slotno = ZeroMultiXactOffsetPage(0, false); + + /* Make sure it's written out */ + SimpleLruWritePage(MultiXactOffsetCtl, slotno); + Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); + + LWLockRelease(MultiXactOffsetSLRULock); + + LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); + + /* Create and zero the first page of the members log */ + slotno = ZeroMultiXactMemberPage(0, false); + + /* Make sure it's written out */ + SimpleLruWritePage(MultiXactMemberCtl, slotno); + Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); + + LWLockRelease(MultiXactMemberSLRULock); +} + +/* + * Initialize (or reinitialize) a page of MultiXactOffset to zeroes. + * If writeXlog is true, also emit an XLOG record saying we did this. + * + * The page is not actually written, just set up in shared memory. + * The slot number of the new page is returned. + * + * Control lock must be held at entry, and will be held at exit. 
+ */ +static int +ZeroMultiXactOffsetPage(int pageno, bool writeXlog) +{ + int slotno; + + slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno); + + if (writeXlog) + WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE); + + return slotno; +} + +/* + * Ditto, for MultiXactMember + */ +static int +ZeroMultiXactMemberPage(int pageno, bool writeXlog) +{ + int slotno; + + slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno); + + if (writeXlog) + WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE); + + return slotno; +} + +/* + * MaybeExtendOffsetSlru + * Extend the offsets SLRU area, if necessary + * + * After a binary upgrade from <= 9.2, the pg_multixact/offsets SLRU area might + * contain files that are shorter than necessary; this would occur if the old + * installation had used multixacts beyond the first page (files cannot be + * copied, because the on-disk representation is different). pg_upgrade would + * update pg_control to set the next offset value to be at that position, so + * that tuples marked as locked by such MultiXacts would be seen as visible + * without having to consult multixact. However, trying to create and use a + * new MultiXactId would result in an error because the page on which the new + * value would reside does not exist. This routine is in charge of creating + * such pages. + */ +static void +MaybeExtendOffsetSlru(void) +{ + int pageno; + + pageno = MultiXactIdToOffsetPage(MultiXactState->nextMXact); + + LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + + if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno)) + { + int slotno; + + /* + * Fortunately for us, SimpleLruWritePage is already prepared to deal + * with creating a new segment file even if the page we're writing is + * not the first in it, so this is enough. + */ + slotno = ZeroMultiXactOffsetPage(pageno, false); + SimpleLruWritePage(MultiXactOffsetCtl, slotno); + } + + LWLockRelease(MultiXactOffsetSLRULock); +} + +/* + * This must be called ONCE during postmaster or standalone-backend startup. + * + * StartupXLOG has already established nextMXact/nextOffset by calling + * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact, and the oldestMulti + * info from pg_control and/or MultiXactAdvanceOldest, but we haven't yet + * replayed WAL. + */ +void +StartupMultiXact(void) +{ + MultiXactId multi = MultiXactState->nextMXact; + MultiXactOffset offset = MultiXactState->nextOffset; + int pageno; + + /* + * Initialize offset's idea of the latest page number. + */ + pageno = MultiXactIdToOffsetPage(multi); + MultiXactOffsetCtl->shared->latest_page_number = pageno; + + /* + * Initialize member's idea of the latest page number. + */ + pageno = MXOffsetToMemberPage(offset); + MultiXactMemberCtl->shared->latest_page_number = pageno; +} + +/* + * This must be called ONCE at the end of startup/recovery. + */ +void +TrimMultiXact(void) +{ + MultiXactId nextMXact; + MultiXactOffset offset; + MultiXactId oldestMXact; + Oid oldestMXactDB; + int pageno; + int entryno; + int flagsoff; + + LWLockAcquire(MultiXactGenLock, LW_SHARED); + nextMXact = MultiXactState->nextMXact; + offset = MultiXactState->nextOffset; + oldestMXact = MultiXactState->oldestMultiXactId; + oldestMXactDB = MultiXactState->oldestMultiXactDB; + LWLockRelease(MultiXactGenLock); + + /* Clean up offsets state */ + LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + + /* + * (Re-)Initialize our idea of the latest page number for offsets. 
+ */ + pageno = MultiXactIdToOffsetPage(nextMXact); + MultiXactOffsetCtl->shared->latest_page_number = pageno; + + /* + * Zero out the remainder of the current offsets page. See notes in + * TrimCLOG() for background. Unlike CLOG, some WAL record covers every + * pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL + * rule "write xlog before data," nextMXact successors may carry obsolete, + * nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers() + * operates normally. + */ + entryno = MultiXactIdToOffsetEntry(nextMXact); + if (entryno != 0) + { + int slotno; + MultiXactOffset *offptr; + + slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact); + offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + offptr += entryno; + + MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset))); + + MultiXactOffsetCtl->shared->page_dirty[slotno] = true; + } + + LWLockRelease(MultiXactOffsetSLRULock); + + /* And the same for members */ + LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); + + /* + * (Re-)Initialize our idea of the latest page number for members. + */ + pageno = MXOffsetToMemberPage(offset); + MultiXactMemberCtl->shared->latest_page_number = pageno; + + /* + * Zero out the remainder of the current members page. See notes in + * TrimCLOG() for motivation. + */ + flagsoff = MXOffsetToFlagsOffset(offset); + if (flagsoff != 0) + { + int slotno; + TransactionId *xidptr; + int memberoff; + + memberoff = MXOffsetToMemberOffset(offset); + slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset); + xidptr = (TransactionId *) + (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); + + MemSet(xidptr, 0, BLCKSZ - memberoff); + + /* + * Note: we don't need to zero out the flag bits in the remaining + * members of the current group, because they are always reset before + * writing. + */ + + MultiXactMemberCtl->shared->page_dirty[slotno] = true; + } + + LWLockRelease(MultiXactMemberSLRULock); + + /* signal that we're officially up */ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + MultiXactState->finishedStartup = true; + LWLockRelease(MultiXactGenLock); + + /* Now compute how far away the next members wraparound is. */ + SetMultiXactIdLimit(oldestMXact, oldestMXactDB, true); +} + +/* + * Get the MultiXact data to save in a checkpoint record + */ +void +MultiXactGetCheckptMulti(bool is_shutdown, + MultiXactId *nextMulti, + MultiXactOffset *nextMultiOffset, + MultiXactId *oldestMulti, + Oid *oldestMultiDB) +{ + LWLockAcquire(MultiXactGenLock, LW_SHARED); + *nextMulti = MultiXactState->nextMXact; + *nextMultiOffset = MultiXactState->nextOffset; + *oldestMulti = MultiXactState->oldestMultiXactId; + *oldestMultiDB = MultiXactState->oldestMultiXactDB; + LWLockRelease(MultiXactGenLock); + + debug_elog6(DEBUG2, + "MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u", + *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB); +} + +/* + * Perform a checkpoint --- either during shutdown, or on-the-fly + */ +void +CheckPointMultiXact(void) +{ + TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true); + + /* + * Write dirty MultiXact pages to disk. This may result in sync requests + * queued for later handling by ProcessSyncRequests(), as part of the + * checkpoint. 
+ */ + SimpleLruWriteAll(MultiXactOffsetCtl, true); + SimpleLruWriteAll(MultiXactMemberCtl, true); + + TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true); +} + +/* + * Set the next-to-be-assigned MultiXactId and offset + * + * This is used when we can determine the correct next ID/offset exactly + * from a checkpoint record. Although this is only called during bootstrap + * and XLog replay, we take the lock in case any hot-standby backends are + * examining the values. + */ +void +MultiXactSetNextMXact(MultiXactId nextMulti, + MultiXactOffset nextMultiOffset) +{ + debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %u", + nextMulti, nextMultiOffset); + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + MultiXactState->nextMXact = nextMulti; + MultiXactState->nextOffset = nextMultiOffset; + LWLockRelease(MultiXactGenLock); + + /* + * During a binary upgrade, make sure that the offsets SLRU is large + * enough to contain the next value that would be created. + * + * We need to do this pretty early during the first startup in binary + * upgrade mode: before StartupMultiXact() in fact, because this routine + * is called even before that by StartupXLOG(). And we can't do it + * earlier than at this point, because during that first call of this + * routine we determine the MultiXactState->nextMXact value that + * MaybeExtendOffsetSlru needs. + */ + if (IsBinaryUpgrade) + MaybeExtendOffsetSlru(); +} + +/* + * Determine the last safe MultiXactId to allocate given the currently oldest + * datminmxid (ie, the oldest MultiXactId that might exist in any database + * of our cluster), and the OID of the (or a) database with that value. + * + * is_startup is true when we are just starting the cluster, false when we + * are updating state in a running cluster. This only affects log messages. + */ +void +SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid, + bool is_startup) +{ + MultiXactId multiVacLimit; + MultiXactId multiWarnLimit; + MultiXactId multiStopLimit; + MultiXactId multiWrapLimit; + MultiXactId curMulti; + bool needs_offset_vacuum; + + Assert(MultiXactIdIsValid(oldest_datminmxid)); + + /* + * We pretend that a wrap will happen halfway through the multixact ID + * space, but that's not really true, because multixacts wrap differently + * from transaction IDs. Note that, separately from any concern about + * multixact IDs wrapping, we must ensure that multixact members do not + * wrap. Limits for that are set in SetOffsetVacuumLimit, not here. + */ + multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> 1); + if (multiWrapLimit < FirstMultiXactId) + multiWrapLimit += FirstMultiXactId; + + /* + * We'll refuse to continue assigning MultiXactIds once we get within 3M + * multi of data loss. See SetTransactionIdLimit. + */ + multiStopLimit = multiWrapLimit - 3000000; + if (multiStopLimit < FirstMultiXactId) + multiStopLimit -= FirstMultiXactId; + + /* + * We'll start complaining loudly when we get within 40M multis of data + * loss. This is kind of arbitrary, but if you let your gas gauge get + * down to 2% of full, would you be looking for the next gas station? We + * need to be fairly liberal about this number because there are lots of + * scenarios where most transactions are done by automatic clients that + * won't pay attention to warnings. (No, we're not gonna make this + * configurable. If you know enough to configure it, you know enough to + * not get in this kind of trouble in the first place.) 
+ */ + multiWarnLimit = multiWrapLimit - 40000000; + if (multiWarnLimit < FirstMultiXactId) + multiWarnLimit -= FirstMultiXactId; + + /* + * We'll start trying to force autovacuums when oldest_datminmxid gets to + * be more than autovacuum_multixact_freeze_max_age mxids old. + * + * Note: autovacuum_multixact_freeze_max_age is a PGC_POSTMASTER parameter + * so that we don't have to worry about dealing with on-the-fly changes in + * its value. See SetTransactionIdLimit. + */ + multiVacLimit = oldest_datminmxid + autovacuum_multixact_freeze_max_age; + if (multiVacLimit < FirstMultiXactId) + multiVacLimit += FirstMultiXactId; + + /* Grab lock for just long enough to set the new limit values */ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + MultiXactState->oldestMultiXactId = oldest_datminmxid; + MultiXactState->oldestMultiXactDB = oldest_datoid; + MultiXactState->multiVacLimit = multiVacLimit; + MultiXactState->multiWarnLimit = multiWarnLimit; + MultiXactState->multiStopLimit = multiStopLimit; + MultiXactState->multiWrapLimit = multiWrapLimit; + curMulti = MultiXactState->nextMXact; + LWLockRelease(MultiXactGenLock); + + /* Log the info */ + ereport(DEBUG1, + (errmsg_internal("MultiXactId wrap limit is %u, limited by database with OID %u", + multiWrapLimit, oldest_datoid))); + + /* + * Computing the actual limits is only possible once the data directory is + * in a consistent state. There's no need to compute the limits while + * still replaying WAL - no decisions about new multis are made even + * though multixact creations might be replayed. So we'll only do further + * checks after TrimMultiXact() has been called. + */ + if (!MultiXactState->finishedStartup) + return; + + Assert(!InRecovery); + + /* Set limits for offset vacuum. */ + needs_offset_vacuum = SetOffsetVacuumLimit(is_startup); + + /* + * If past the autovacuum force point, immediately signal an autovac + * request. The reason for this is that autovac only processes one + * database per invocation. Once it's finished cleaning up the oldest + * database, it'll call here, and we'll signal the postmaster to start + * another iteration immediately if there are still any old databases. + */ + if ((MultiXactIdPrecedes(multiVacLimit, curMulti) || + needs_offset_vacuum) && IsUnderPostmaster) + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); + + /* Give an immediate warning if past the wrap warn point */ + if (MultiXactIdPrecedes(multiWarnLimit, curMulti)) + { + char *oldest_datname; + + /* + * We can be called when not inside a transaction, for example during + * StartupXLOG(). In such a case we cannot do database access, so we + * must just report the oldest DB's OID. + * + * Note: it's also possible that get_database_name fails and returns + * NULL, for example because the database just got dropped. We'll + * still warn, even though the warning might now be unnecessary. 
+ */ + if (IsTransactionState()) + oldest_datname = get_database_name(oldest_datoid); + else + oldest_datname = NULL; + + if (oldest_datname) + ereport(WARNING, + (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used", + "database \"%s\" must be vacuumed before %u more MultiXactIds are used", + multiWrapLimit - curMulti, + oldest_datname, + multiWrapLimit - curMulti), + errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + else + ereport(WARNING, + (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used", + "database with OID %u must be vacuumed before %u more MultiXactIds are used", + multiWrapLimit - curMulti, + oldest_datoid, + multiWrapLimit - curMulti), + errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + } +} + +/* + * Ensure the next-to-be-assigned MultiXactId is at least minMulti, + * and similarly nextOffset is at least minMultiOffset. + * + * This is used when we can determine minimum safe values from an XLog + * record (either an on-line checkpoint or an mxact creation log entry). + * Although this is only called during XLog replay, we take the lock in case + * any hot-standby backends are examining the values. + */ +void +MultiXactAdvanceNextMXact(MultiXactId minMulti, + MultiXactOffset minMultiOffset) +{ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + if (MultiXactIdPrecedes(MultiXactState->nextMXact, minMulti)) + { + debug_elog3(DEBUG2, "MultiXact: setting next multi to %u", minMulti); + MultiXactState->nextMXact = minMulti; + } + if (MultiXactOffsetPrecedes(MultiXactState->nextOffset, minMultiOffset)) + { + debug_elog3(DEBUG2, "MultiXact: setting next offset to %u", + minMultiOffset); + MultiXactState->nextOffset = minMultiOffset; + } + LWLockRelease(MultiXactGenLock); +} + +/* + * Update our oldestMultiXactId value, but only if it's more recent than what + * we had. + * + * This may only be called during WAL replay. + */ +void +MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB) +{ + Assert(InRecovery); + + if (MultiXactIdPrecedes(MultiXactState->oldestMultiXactId, oldestMulti)) + SetMultiXactIdLimit(oldestMulti, oldestMultiDB, false); +} + +/* + * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId. + * + * NB: this is called while holding MultiXactGenLock. We want it to be very + * fast most of the time; even when it's not so fast, no actual I/O need + * happen unless we're forced to write out a dirty log or xlog page to make + * room in shared memory. + */ +static void +ExtendMultiXactOffset(MultiXactId multi) +{ + int pageno; + + /* + * No work except at first MultiXactId of a page. But beware: just after + * wraparound, the first MultiXactId of page zero is FirstMultiXactId. + */ + if (MultiXactIdToOffsetEntry(multi) != 0 && + multi != FirstMultiXactId) + return; + + pageno = MultiXactIdToOffsetPage(multi); + + LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + + /* Zero the page and make an XLOG entry about it */ + ZeroMultiXactOffsetPage(pageno, true); + + LWLockRelease(MultiXactOffsetSLRULock); +} + +/* + * Make sure that MultiXactMember has room for the members of a newly- + * allocated MultiXactId. 
+ * + * Like the above routine, this is called while holding MultiXactGenLock; + * same comments apply. + */ +static void +ExtendMultiXactMember(MultiXactOffset offset, int nmembers) +{ + /* + * It's possible that the members span more than one page of the members + * file, so we loop to ensure we consider each page. The coding is not + * optimal if the members span several pages, but that seems unusual + * enough to not worry much about. + */ + while (nmembers > 0) + { + int flagsoff; + int flagsbit; + uint32 difference; + + /* + * Only zero when at first entry of a page. + */ + flagsoff = MXOffsetToFlagsOffset(offset); + flagsbit = MXOffsetToFlagsBitShift(offset); + if (flagsoff == 0 && flagsbit == 0) + { + int pageno; + + pageno = MXOffsetToMemberPage(offset); + + LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); + + /* Zero the page and make an XLOG entry about it */ + ZeroMultiXactMemberPage(pageno, true); + + LWLockRelease(MultiXactMemberSLRULock); + } + + /* + * Compute the number of items till end of current page. Careful: if + * addition of unsigned ints wraps around, we're at the last page of + * the last segment; since that page holds a different number of items + * than other pages, we need to do it differently. + */ + if (offset + MAX_MEMBERS_IN_LAST_MEMBERS_PAGE < offset) + { + /* + * This is the last page of the last segment; we can compute the + * number of items left to allocate in it without modulo + * arithmetic. + */ + difference = MaxMultiXactOffset - offset + 1; + } + else + difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE; + + /* + * Advance to next page, taking care to properly handle the wraparound + * case. OK if nmembers goes negative. + */ + nmembers -= difference; + offset += difference; + } +} + +/* + * GetOldestMultiXactId + * + * Return the oldest MultiXactId that's still possibly still seen as live by + * any running transaction. Older ones might still exist on disk, but they no + * longer have any running member transaction. + * + * It's not safe to truncate MultiXact SLRU segments on the value returned by + * this function; however, it can be used by a full-table vacuum to set the + * point at which it will be possible to truncate SLRU for that table. + */ +MultiXactId +GetOldestMultiXactId(void) +{ + MultiXactId oldestMXact; + MultiXactId nextMXact; + int i; + + /* + * This is the oldest valid value among all the OldestMemberMXactId[] and + * OldestVisibleMXactId[] entries, or nextMXact if none are valid. + */ + LWLockAcquire(MultiXactGenLock, LW_SHARED); + + /* + * We have to beware of the possibility that nextMXact is in the + * wrapped-around state. We don't fix the counter itself here, but we + * must be sure to use a valid value in our calculation. + */ + nextMXact = MultiXactState->nextMXact; + if (nextMXact < FirstMultiXactId) + nextMXact = FirstMultiXactId; + + oldestMXact = nextMXact; + for (i = 1; i <= MaxOldestSlot; i++) + { + MultiXactId thisoldest; + + thisoldest = OldestMemberMXactId[i]; + if (MultiXactIdIsValid(thisoldest) && + MultiXactIdPrecedes(thisoldest, oldestMXact)) + oldestMXact = thisoldest; + thisoldest = OldestVisibleMXactId[i]; + if (MultiXactIdIsValid(thisoldest) && + MultiXactIdPrecedes(thisoldest, oldestMXact)) + oldestMXact = thisoldest; + } + + LWLockRelease(MultiXactGenLock); + + return oldestMXact; +} + +/* + * Determine how aggressively we need to vacuum in order to prevent member + * wraparound. 
+ *
+ * To do so, determine the oldest member offset and install the limit
+ * info in MultiXactState, where it can be used to prevent overrun of old data
+ * in the members SLRU area.
+ *
+ * The return value is true if emergency autovacuum is required and false
+ * otherwise.
+ */
+static bool
+SetOffsetVacuumLimit(bool is_startup)
+{
+ MultiXactId oldestMultiXactId;
+ MultiXactId nextMXact;
+ MultiXactOffset oldestOffset = 0; /* placate compiler */
+ MultiXactOffset prevOldestOffset;
+ MultiXactOffset nextOffset;
+ bool oldestOffsetKnown = false;
+ bool prevOldestOffsetKnown;
+ MultiXactOffset offsetStopLimit = 0;
+ MultiXactOffset prevOffsetStopLimit;
+
+ /*
+ * NB: Have to prevent concurrent truncation; we might otherwise try to
+ * look up an oldestMulti that's concurrently getting truncated away.
+ */
+ LWLockAcquire(MultiXactTruncationLock, LW_SHARED);
+
+ /* Read relevant fields from shared memory. */
+ LWLockAcquire(MultiXactGenLock, LW_SHARED);
+ oldestMultiXactId = MultiXactState->oldestMultiXactId;
+ nextMXact = MultiXactState->nextMXact;
+ nextOffset = MultiXactState->nextOffset;
+ prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown;
+ prevOldestOffset = MultiXactState->oldestOffset;
+ prevOffsetStopLimit = MultiXactState->offsetStopLimit;
+ Assert(MultiXactState->finishedStartup);
+ LWLockRelease(MultiXactGenLock);
+
+ /*
+ * Determine the offset of the oldest multixact. Normally, we can read
+ * the offset from the multixact itself, but there's an important special
+ * case: if there are no multixacts in existence at all, oldestMXact
+ * obviously can't point to one. It will instead point to the multixact
+ * ID that will be assigned the next time one is needed.
+ */
+ if (oldestMultiXactId == nextMXact)
+ {
+ /*
+ * When the next multixact gets created, it will be stored at the next
+ * offset.
+ */
+ oldestOffset = nextOffset;
+ oldestOffsetKnown = true;
+ }
+ else
+ {
+ /*
+ * Figure out where the oldest existing multixact's offsets are
+ * stored. Due to bugs in early releases of PostgreSQL 9.3.X and 9.4.X,
+ * the supposedly-earliest multixact might not really exist. We are
+ * careful not to fail in that case.
+ */
+ oldestOffsetKnown =
+ find_multixact_start(oldestMultiXactId, &oldestOffset);
+
+ if (oldestOffsetKnown)
+ ereport(DEBUG1,
+ (errmsg_internal("oldest MultiXactId member is at offset %u",
+ oldestOffset)));
+ else
+ ereport(LOG,
+ (errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk",
+ oldestMultiXactId)));
+ }
+
+ LWLockRelease(MultiXactTruncationLock);
+
+ /*
+ * If we can, compute limits (and install them in MultiXactState) to prevent
+ * overrun of old data in the members SLRU area. We can only do so if the
+ * oldest offset is known though.
+ */ + if (oldestOffsetKnown) + { + /* move back to start of the corresponding segment */ + offsetStopLimit = oldestOffset - (oldestOffset % + (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT)); + + /* always leave one segment before the wraparound point */ + offsetStopLimit -= (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT); + + if (!prevOldestOffsetKnown && !is_startup) + ereport(LOG, + (errmsg("MultiXact member wraparound protections are now enabled"))); + + ereport(DEBUG1, + (errmsg_internal("MultiXact member stop limit is now %u based on MultiXact %u", + offsetStopLimit, oldestMultiXactId))); + } + else if (prevOldestOffsetKnown) + { + /* + * If we failed to get the oldest offset this time, but we have a + * value from a previous pass through this function, use the old + * values rather than automatically forcing an emergency autovacuum + * cycle again. + */ + oldestOffset = prevOldestOffset; + oldestOffsetKnown = true; + offsetStopLimit = prevOffsetStopLimit; + } + + /* Install the computed values */ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + MultiXactState->oldestOffset = oldestOffset; + MultiXactState->oldestOffsetKnown = oldestOffsetKnown; + MultiXactState->offsetStopLimit = offsetStopLimit; + LWLockRelease(MultiXactGenLock); + + /* + * Do we need an emergency autovacuum? If we're not sure, assume yes. + */ + return !oldestOffsetKnown || + (nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD); +} + +/* + * Return whether adding "distance" to "start" would move past "boundary". + * + * We use this to determine whether the addition is "wrapping around" the + * boundary point, hence the name. The reason we don't want to use the regular + * 2^31-modulo arithmetic here is that we want to be able to use the whole of + * the 2^32-1 space here, allowing for more multixacts than would fit + * otherwise. + */ +static bool +MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start, + uint32 distance) +{ + MultiXactOffset finish; + + /* + * Note that offset number 0 is not used (see GetMultiXactIdMembers), so + * if the addition wraps around the UINT_MAX boundary, skip that value. + */ + finish = start + distance; + if (finish < start) + finish++; + + /*----------------------------------------------------------------------- + * When the boundary is numerically greater than the starting point, any + * value numerically between the two is not wrapped: + * + * <----S----B----> + * [---) = F wrapped past B (and UINT_MAX) + * [---) = F not wrapped + * [----] = F wrapped past B + * + * When the boundary is numerically less than the starting point (i.e. the + * UINT_MAX wraparound occurs somewhere in between) then all values in + * between are wrapped: + * + * <----B----S----> + * [---) = F not wrapped past B (but wrapped past UINT_MAX) + * [---) = F wrapped past B (and UINT_MAX) + * [----] = F not wrapped + *----------------------------------------------------------------------- + */ + if (start < boundary) + return finish >= boundary || finish < start; + else + return finish >= boundary && finish < start; +} + +/* + * Find the starting offset of the given MultiXactId. + * + * Returns false if the file containing the multi does not exist on disk. + * Otherwise, returns true and sets *result to the starting member offset. + * + * This function does not prevent concurrent truncation, so if that's + * required, the caller has to protect against that. 
+ */
+static bool
+find_multixact_start(MultiXactId multi, MultiXactOffset *result)
+{
+ MultiXactOffset offset;
+ int pageno;
+ int entryno;
+ int slotno;
+ MultiXactOffset *offptr;
+
+ Assert(MultiXactState->finishedStartup);
+
+ pageno = MultiXactIdToOffsetPage(multi);
+ entryno = MultiXactIdToOffsetEntry(multi);
+
+ /*
+ * Write out dirty data, so PhysicalPageExists can work correctly.
+ */
+ SimpleLruWriteAll(MultiXactOffsetCtl, true);
+ SimpleLruWriteAll(MultiXactMemberCtl, true);
+
+ if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno))
+ return false;
+
+ /* lock is acquired by SimpleLruReadPage_ReadOnly */
+ slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi);
+ offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr += entryno;
+ offset = *offptr;
+ LWLockRelease(MultiXactOffsetSLRULock);
+
+ *result = offset;
+ return true;
+}
+
+/*
+ * Determine how many multixacts, and how many multixact members, currently
+ * exist. Return false if unable to determine.
+ */
+static bool
+ReadMultiXactCounts(uint32 *multixacts, MultiXactOffset *members)
+{
+ MultiXactOffset nextOffset;
+ MultiXactOffset oldestOffset;
+ MultiXactId oldestMultiXactId;
+ MultiXactId nextMultiXactId;
+ bool oldestOffsetKnown;
+
+ LWLockAcquire(MultiXactGenLock, LW_SHARED);
+ nextOffset = MultiXactState->nextOffset;
+ oldestMultiXactId = MultiXactState->oldestMultiXactId;
+ nextMultiXactId = MultiXactState->nextMXact;
+ oldestOffset = MultiXactState->oldestOffset;
+ oldestOffsetKnown = MultiXactState->oldestOffsetKnown;
+ LWLockRelease(MultiXactGenLock);
+
+ if (!oldestOffsetKnown)
+ return false;
+
+ *members = nextOffset - oldestOffset;
+ *multixacts = nextMultiXactId - oldestMultiXactId;
+ return true;
+}
+
+/*
+ * Multixact members can be removed once the multixacts that refer to them
+ * are older than every datminmxid. autovacuum_multixact_freeze_max_age and
+ * vacuum_multixact_freeze_table_age work together to make sure we never have
+ * too many multixacts; we hope that, at least under normal circumstances,
+ * this will also be sufficient to keep us from using too many offsets.
+ * However, if the average multixact has many members, we might exhaust the
+ * members space while still using few enough members that these limits fail
+ * to trigger full table scans for relminmxid advancement. At that point,
+ * we'd have no choice but to start failing multixact-creating operations
+ * with an error.
+ *
+ * To prevent that, if more than a threshold portion of the members space is
+ * used, we effectively reduce autovacuum_multixact_freeze_max_age
+ * to a value just less than the number of multixacts in use. We hope that
+ * this will quickly trigger autovacuuming on the table or tables with the
+ * oldest relminmxid, thus allowing datminmxid values to advance and removing
+ * some members.
+ *
+ * As the fraction of the member space currently in use grows, we become
+ * more aggressive in clamping this value. That not only causes autovacuum
+ * to ramp up, but also makes any manual vacuums the user issues more
+ * aggressive. This happens because vacuum_set_xid_limits() clamps the
+ * freeze table and the minimum freeze age based on the effective
+ * autovacuum_multixact_freeze_max_age this function returns. In the worst
+ * case, we'll clamp the freeze_max_age to zero, and every vacuum of any
+ * table will try to freeze every multixact.
+ *
+ * It's possible that these thresholds should be user-tunable, but for now
+ * we keep it simple.
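+ *
+ * Worked example, with made-up figures that are purely illustrative (only
+ * the variable names fraction and victim_multixacts below come from the
+ * code): if the members in use sit one quarter of the way from
+ * MULTIXACT_MEMBER_SAFE_THRESHOLD to MULTIXACT_MEMBER_DANGER_THRESHOLD and
+ * 1,000,000 multixacts currently exist, then fraction is 0.25, about
+ * 250,000 multixacts become victim_multixacts, and the effective freeze
+ * age returned is roughly 750,000.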
+ */ +int +MultiXactMemberFreezeThreshold(void) +{ + MultiXactOffset members; + uint32 multixacts; + uint32 victim_multixacts; + double fraction; + + /* If we can't determine member space utilization, assume the worst. */ + if (!ReadMultiXactCounts(&multixacts, &members)) + return 0; + + /* If member space utilization is low, no special action is required. */ + if (members <= MULTIXACT_MEMBER_SAFE_THRESHOLD) + return autovacuum_multixact_freeze_max_age; + + /* + * Compute a target for relminmxid advancement. The number of multixacts + * we try to eliminate from the system is based on how far we are past + * MULTIXACT_MEMBER_SAFE_THRESHOLD. + */ + fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD) / + (MULTIXACT_MEMBER_DANGER_THRESHOLD - MULTIXACT_MEMBER_SAFE_THRESHOLD); + victim_multixacts = multixacts * fraction; + + /* fraction could be > 1.0, but lowest possible freeze age is zero */ + if (victim_multixacts > multixacts) + return 0; + return multixacts - victim_multixacts; +} + +typedef struct mxtruncinfo +{ + int earliestExistingPage; +} mxtruncinfo; + +/* + * SlruScanDirectory callback + * This callback determines the earliest existing page number. + */ +static bool +SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int segpage, void *data) +{ + mxtruncinfo *trunc = (mxtruncinfo *) data; + + if (trunc->earliestExistingPage == -1 || + ctl->PagePrecedes(segpage, trunc->earliestExistingPage)) + { + trunc->earliestExistingPage = segpage; + } + + return false; /* keep going */ +} + + +/* + * Delete members segments [oldest, newOldest) + * + * The members SLRU can, in contrast to the offsets one, be filled to almost + * the full range at once. This means SimpleLruTruncate() can't trivially be + * used - instead the to-be-deleted range is computed using the offsets + * SLRU. C.f. TruncateMultiXact(). + */ +static void +PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset) +{ + const int maxsegment = MXOffsetToMemberSegment(MaxMultiXactOffset); + int startsegment = MXOffsetToMemberSegment(oldestOffset); + int endsegment = MXOffsetToMemberSegment(newOldestOffset); + int segment = startsegment; + + /* + * Delete all the segments but the last one. The last segment can still + * contain, possibly partially, valid data. + */ + while (segment != endsegment) + { + elog(DEBUG2, "truncating multixact members segment %x", segment); + SlruDeleteSegment(MultiXactMemberCtl, segment); + + /* move to next segment, handling wraparound correctly */ + if (segment == maxsegment) + segment = 0; + else + segment += 1; + } +} + +/* + * Delete offsets segments [oldest, newOldest) + */ +static void +PerformOffsetsTruncation(MultiXactId oldestMulti, MultiXactId newOldestMulti) +{ + /* + * We step back one multixact to avoid passing a cutoff page that hasn't + * been created yet in the rare case that oldestMulti would be the first + * item on a page and oldestMulti == nextMulti. In that case, if we + * didn't subtract one, we'd trigger SimpleLruTruncate's wraparound + * detection. + */ + SimpleLruTruncate(MultiXactOffsetCtl, + MultiXactIdToOffsetPage(PreviousMultiXactId(newOldestMulti))); +} + +/* + * Remove all MultiXactOffset and MultiXactMember segments before the oldest + * ones still of interest. + * + * This is only called on a primary as part of vacuum (via + * vac_truncate_clog()). During recovery truncation is done by replaying + * truncation WAL records logged here. 
+ * + * newOldestMulti is the oldest currently required multixact, newOldestMultiDB + * is one of the databases preventing newOldestMulti from increasing. + */ +void +TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) +{ + MultiXactId oldestMulti; + MultiXactId nextMulti; + MultiXactOffset newOldestOffset; + MultiXactOffset oldestOffset; + MultiXactOffset nextOffset; + mxtruncinfo trunc; + MultiXactId earliest; + + Assert(!RecoveryInProgress()); + Assert(MultiXactState->finishedStartup); + + /* + * We can only allow one truncation to happen at once. Otherwise parts of + * members might vanish while we're doing lookups or similar. There's no + * need to have an interlock with creating new multis or such, since those + * are constrained by the limits (which only grow, never shrink). + */ + LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE); + + LWLockAcquire(MultiXactGenLock, LW_SHARED); + nextMulti = MultiXactState->nextMXact; + nextOffset = MultiXactState->nextOffset; + oldestMulti = MultiXactState->oldestMultiXactId; + LWLockRelease(MultiXactGenLock); + Assert(MultiXactIdIsValid(oldestMulti)); + + /* + * Make sure to only attempt truncation if there's values to truncate + * away. In normal processing values shouldn't go backwards, but there's + * some corner cases (due to bugs) where that's possible. + */ + if (MultiXactIdPrecedesOrEquals(newOldestMulti, oldestMulti)) + { + LWLockRelease(MultiXactTruncationLock); + return; + } + + /* + * Note we can't just plow ahead with the truncation; it's possible that + * there are no segments to truncate, which is a problem because we are + * going to attempt to read the offsets page to determine where to + * truncate the members SLRU. So we first scan the directory to determine + * the earliest offsets page number that we can read without error. + * + * When nextMXact is less than one segment away from multiWrapLimit, + * SlruScanDirCbFindEarliest can find some early segment other than the + * actual earliest. (MultiXactOffsetPagePrecedes(EARLIEST, LATEST) + * returns false, because not all pairs of entries have the same answer.) + * That can also arise when an earlier truncation attempt failed unlink() + * or returned early from this function. The only consequence is + * returning early, which wastes space that we could have liberated. + * + * NB: It's also possible that the page that oldestMulti is on has already + * been truncated away, and we crashed before updating oldestMulti. + */ + trunc.earliestExistingPage = -1; + SlruScanDirectory(MultiXactOffsetCtl, SlruScanDirCbFindEarliest, &trunc); + earliest = trunc.earliestExistingPage * MULTIXACT_OFFSETS_PER_PAGE; + if (earliest < FirstMultiXactId) + earliest = FirstMultiXactId; + + /* If there's nothing to remove, we can bail out early. */ + if (MultiXactIdPrecedes(oldestMulti, earliest)) + { + LWLockRelease(MultiXactTruncationLock); + return; + } + + /* + * First, compute the safe truncation point for MultiXactMember. This is + * the starting offset of the oldest multixact. + * + * Hopefully, find_multixact_start will always work here, because we've + * already checked that it doesn't precede the earliest MultiXact on disk. + * But if it fails, don't truncate anything, and log a message. 
+ */ + if (oldestMulti == nextMulti) + { + /* there are NO MultiXacts */ + oldestOffset = nextOffset; + } + else if (!find_multixact_start(oldestMulti, &oldestOffset)) + { + ereport(LOG, + (errmsg("oldest MultiXact %u not found, earliest MultiXact %u, skipping truncation", + oldestMulti, earliest))); + LWLockRelease(MultiXactTruncationLock); + return; + } + + /* + * Secondly compute up to where to truncate. Lookup the corresponding + * member offset for newOldestMulti for that. + */ + if (newOldestMulti == nextMulti) + { + /* there are NO MultiXacts */ + newOldestOffset = nextOffset; + } + else if (!find_multixact_start(newOldestMulti, &newOldestOffset)) + { + ereport(LOG, + (errmsg("cannot truncate up to MultiXact %u because it does not exist on disk, skipping truncation", + newOldestMulti))); + LWLockRelease(MultiXactTruncationLock); + return; + } + + elog(DEBUG1, "performing multixact truncation: " + "offsets [%u, %u), offsets segments [%x, %x), " + "members [%u, %u), members segments [%x, %x)", + oldestMulti, newOldestMulti, + MultiXactIdToOffsetSegment(oldestMulti), + MultiXactIdToOffsetSegment(newOldestMulti), + oldestOffset, newOldestOffset, + MXOffsetToMemberSegment(oldestOffset), + MXOffsetToMemberSegment(newOldestOffset)); + + /* + * Do truncation, and the WAL logging of the truncation, in a critical + * section. That way offsets/members cannot get out of sync anymore, i.e. + * once consistent the newOldestMulti will always exist in members, even + * if we crashed in the wrong moment. + */ + START_CRIT_SECTION(); + + /* + * Prevent checkpoints from being scheduled concurrently. This is critical + * because otherwise a truncation record might not be replayed after a + * crash/basebackup, even though the state of the data directory would + * require it. + */ + Assert(!MyProc->delayChkpt); + MyProc->delayChkpt = true; + + /* WAL log truncation */ + WriteMTruncateXlogRec(newOldestMultiDB, + oldestMulti, newOldestMulti, + oldestOffset, newOldestOffset); + + /* + * Update in-memory limits before performing the truncation, while inside + * the critical section: Have to do it before truncation, to prevent + * concurrent lookups of those values. Has to be inside the critical + * section as otherwise a future call to this function would error out, + * while looking up the oldest member in offsets, if our caller crashes + * before updating the limits. + */ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + MultiXactState->oldestMultiXactId = newOldestMulti; + MultiXactState->oldestMultiXactDB = newOldestMultiDB; + LWLockRelease(MultiXactGenLock); + + /* First truncate members */ + PerformMembersTruncation(oldestOffset, newOldestOffset); + + /* Then offsets */ + PerformOffsetsTruncation(oldestMulti, newOldestMulti); + + MyProc->delayChkpt = false; + + END_CRIT_SECTION(); + LWLockRelease(MultiXactTruncationLock); +} + +/* + * Decide whether a MultiXactOffset page number is "older" for truncation + * purposes. Analogous to CLOGPagePrecedes(). + * + * Offsetting the values is optional, because MultiXactIdPrecedes() has + * translational symmetry. 
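+ *
+ * The comparison below checks a representative multixact from page1
+ * against both the start and the end of page2's range.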
+ */ +static bool +MultiXactOffsetPagePrecedes(int page1, int page2) +{ + MultiXactId multi1; + MultiXactId multi2; + + multi1 = ((MultiXactId) page1) * MULTIXACT_OFFSETS_PER_PAGE; + multi1 += FirstMultiXactId + 1; + multi2 = ((MultiXactId) page2) * MULTIXACT_OFFSETS_PER_PAGE; + multi2 += FirstMultiXactId + 1; + + return (MultiXactIdPrecedes(multi1, multi2) && + MultiXactIdPrecedes(multi1, + multi2 + MULTIXACT_OFFSETS_PER_PAGE - 1)); +} + +/* + * Decide whether a MultiXactMember page number is "older" for truncation + * purposes. There is no "invalid offset number" so use the numbers verbatim. + */ +static bool +MultiXactMemberPagePrecedes(int page1, int page2) +{ + MultiXactOffset offset1; + MultiXactOffset offset2; + + offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE; + offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE; + + return (MultiXactOffsetPrecedes(offset1, offset2) && + MultiXactOffsetPrecedes(offset1, + offset2 + MULTIXACT_MEMBERS_PER_PAGE - 1)); +} + +/* + * Decide which of two MultiXactIds is earlier. + * + * XXX do we need to do something special for InvalidMultiXactId? + * (Doesn't look like it.) + */ +bool +MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2) +{ + int32 diff = (int32) (multi1 - multi2); + + return (diff < 0); +} + +/* + * MultiXactIdPrecedesOrEquals -- is multi1 logically <= multi2? + * + * XXX do we need to do something special for InvalidMultiXactId? + * (Doesn't look like it.) + */ +bool +MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2) +{ + int32 diff = (int32) (multi1 - multi2); + + return (diff <= 0); +} + + +/* + * Decide which of two offsets is earlier. + */ +static bool +MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2) +{ + int32 diff = (int32) (offset1 - offset2); + + return (diff < 0); +} + +/* + * Write an xlog record reflecting the zeroing of either a MEMBERs or + * OFFSETs page (info shows which) + */ +static void +WriteMZeroPageXlogRec(int pageno, uint8 info) +{ + XLogBeginInsert(); + XLogRegisterData((char *) (&pageno), sizeof(int)); + (void) XLogInsert(RM_MULTIXACT_ID, info); +} + +/* + * Write a TRUNCATE xlog record + * + * We must flush the xlog record to disk before returning --- see notes in + * TruncateCLOG(). 
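+ * (The XLogFlush() call below provides that.)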
+ */ +static void +WriteMTruncateXlogRec(Oid oldestMultiDB, + MultiXactId startTruncOff, MultiXactId endTruncOff, + MultiXactOffset startTruncMemb, MultiXactOffset endTruncMemb) +{ + XLogRecPtr recptr; + xl_multixact_truncate xlrec; + + xlrec.oldestMultiDB = oldestMultiDB; + + xlrec.startTruncOff = startTruncOff; + xlrec.endTruncOff = endTruncOff; + + xlrec.startTruncMemb = startTruncMemb; + xlrec.endTruncMemb = endTruncMemb; + + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), SizeOfMultiXactTruncate); + recptr = XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_TRUNCATE_ID); + XLogFlush(recptr); +} + +/* + * MULTIXACT resource manager's routines + */ +void +multixact_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + /* Backup blocks are not used in multixact records */ + Assert(!XLogRecHasAnyBlockRefs(record)); + + if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE) + { + int pageno; + int slotno; + + memcpy(&pageno, XLogRecGetData(record), sizeof(int)); + + LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + + slotno = ZeroMultiXactOffsetPage(pageno, false); + SimpleLruWritePage(MultiXactOffsetCtl, slotno); + Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); + + LWLockRelease(MultiXactOffsetSLRULock); + } + else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE) + { + int pageno; + int slotno; + + memcpy(&pageno, XLogRecGetData(record), sizeof(int)); + + LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); + + slotno = ZeroMultiXactMemberPage(pageno, false); + SimpleLruWritePage(MultiXactMemberCtl, slotno); + Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); + + LWLockRelease(MultiXactMemberSLRULock); + } + else if (info == XLOG_MULTIXACT_CREATE_ID) + { + xl_multixact_create *xlrec = + (xl_multixact_create *) XLogRecGetData(record); + TransactionId max_xid; + int i; + + /* Store the data back into the SLRU files */ + RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers, + xlrec->members); + + /* Make sure nextMXact/nextOffset are beyond what this record has */ + MultiXactAdvanceNextMXact(xlrec->mid + 1, + xlrec->moff + xlrec->nmembers); + + /* + * Make sure nextXid is beyond any XID mentioned in the record. This + * should be unnecessary, since any XID found here ought to have other + * evidence in the XLOG, but let's be safe. + */ + max_xid = XLogRecGetXid(record); + for (i = 0; i < xlrec->nmembers; i++) + { + if (TransactionIdPrecedes(max_xid, xlrec->members[i].xid)) + max_xid = xlrec->members[i].xid; + } + + AdvanceNextFullTransactionIdPastXid(max_xid); + } + else if (info == XLOG_MULTIXACT_TRUNCATE_ID) + { + xl_multixact_truncate xlrec; + int pageno; + + memcpy(&xlrec, XLogRecGetData(record), + SizeOfMultiXactTruncate); + + elog(DEBUG1, "replaying multixact truncation: " + "offsets [%u, %u), offsets segments [%x, %x), " + "members [%u, %u), members segments [%x, %x)", + xlrec.startTruncOff, xlrec.endTruncOff, + MultiXactIdToOffsetSegment(xlrec.startTruncOff), + MultiXactIdToOffsetSegment(xlrec.endTruncOff), + xlrec.startTruncMemb, xlrec.endTruncMemb, + MXOffsetToMemberSegment(xlrec.startTruncMemb), + MXOffsetToMemberSegment(xlrec.endTruncMemb)); + + /* should not be required, but more than cheap enough */ + LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE); + + /* + * Advance the horizon values, so they're current at the end of + * recovery. 
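+ * Then, as on the primary, members segments are removed before the
+ * offsets are truncated.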
+ */ + SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB, false); + + PerformMembersTruncation(xlrec.startTruncMemb, xlrec.endTruncMemb); + + /* + * During XLOG replay, latest_page_number isn't necessarily set up + * yet; insert a suitable value to bypass the sanity test in + * SimpleLruTruncate. + */ + pageno = MultiXactIdToOffsetPage(xlrec.endTruncOff); + MultiXactOffsetCtl->shared->latest_page_number = pageno; + PerformOffsetsTruncation(xlrec.startTruncOff, xlrec.endTruncOff); + + LWLockRelease(MultiXactTruncationLock); + } + else + elog(PANIC, "multixact_redo: unknown op code %u", info); +} + +Datum +pg_get_multixact_members(PG_FUNCTION_ARGS) +{ + typedef struct + { + MultiXactMember *members; + int nmembers; + int iter; + } mxact; + MultiXactId mxid = PG_GETARG_TRANSACTIONID(0); + mxact *multi; + FuncCallContext *funccxt; + + if (mxid < FirstMultiXactId) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid MultiXactId: %u", mxid))); + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcxt; + TupleDesc tupdesc; + + funccxt = SRF_FIRSTCALL_INIT(); + oldcxt = MemoryContextSwitchTo(funccxt->multi_call_memory_ctx); + + multi = palloc(sizeof(mxact)); + /* no need to allow for old values here */ + multi->nmembers = GetMultiXactIdMembers(mxid, &multi->members, false, + false); + multi->iter = 0; + + tupdesc = CreateTemplateTupleDesc(2); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "xid", + XIDOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "mode", + TEXTOID, -1, 0); + + funccxt->attinmeta = TupleDescGetAttInMetadata(tupdesc); + funccxt->user_fctx = multi; + + MemoryContextSwitchTo(oldcxt); + } + + funccxt = SRF_PERCALL_SETUP(); + multi = (mxact *) funccxt->user_fctx; + + while (multi->iter < multi->nmembers) + { + HeapTuple tuple; + char *values[2]; + + values[0] = psprintf("%u", multi->members[multi->iter].xid); + values[1] = mxstatus_to_string(multi->members[multi->iter].status); + + tuple = BuildTupleFromCStrings(funccxt->attinmeta, values); + + multi->iter++; + pfree(values[0]); + SRF_RETURN_NEXT(funccxt, HeapTupleGetDatum(tuple)); + } + + SRF_RETURN_DONE(funccxt); +} + +/* + * Entrypoint for sync.c to sync offsets files. + */ +int +multixactoffsetssyncfiletag(const FileTag *ftag, char *path) +{ + return SlruSyncFileTag(MultiXactOffsetCtl, ftag, path); +} + +/* + * Entrypoint for sync.c to sync members files. 
+ */ +int +multixactmemberssyncfiletag(const FileTag *ftag, char *path) +{ + return SlruSyncFileTag(MultiXactMemberCtl, ftag, path); +} diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c new file mode 100644 index 0000000..8ce95ab --- /dev/null +++ b/src/backend/access/transam/parallel.c @@ -0,0 +1,1585 @@ +/*------------------------------------------------------------------------- + * + * parallel.c + * Infrastructure for launching parallel workers + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/transam/parallel.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/heapam.h" +#include "access/nbtree.h" +#include "access/parallel.h" +#include "access/session.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "catalog/index.h" +#include "catalog/namespace.h" +#include "catalog/pg_enum.h" +#include "catalog/storage.h" +#include "commands/async.h" +#include "executor/execParallel.h" +#include "libpq/libpq.h" +#include "libpq/pqformat.h" +#include "libpq/pqmq.h" +#include "miscadmin.h" +#include "optimizer/optimizer.h" +#include "pgstat.h" +#include "storage/ipc.h" +#include "storage/predicate.h" +#include "storage/sinval.h" +#include "storage/spin.h" +#include "tcop/tcopprot.h" +#include "utils/combocid.h" +#include "utils/guc.h" +#include "utils/inval.h" +#include "utils/memutils.h" +#include "utils/relmapper.h" +#include "utils/snapmgr.h" +#include "utils/typcache.h" + +/* + * We don't want to waste a lot of memory on an error queue which, most of + * the time, will process only a handful of small messages. However, it is + * desirable to make it large enough that a typical ErrorResponse can be sent + * without blocking. That way, a worker that errors out can write the whole + * message into the queue and terminate without waiting for the user backend. + */ +#define PARALLEL_ERROR_QUEUE_SIZE 16384 + +/* Magic number for parallel context TOC. */ +#define PARALLEL_MAGIC 0x50477c7c + +/* + * Magic numbers for per-context parallel state sharing. Higher-level code + * should use smaller values, leaving these very large ones for use by this + * module. + */ +#define PARALLEL_KEY_FIXED UINT64CONST(0xFFFFFFFFFFFF0001) +#define PARALLEL_KEY_ERROR_QUEUE UINT64CONST(0xFFFFFFFFFFFF0002) +#define PARALLEL_KEY_LIBRARY UINT64CONST(0xFFFFFFFFFFFF0003) +#define PARALLEL_KEY_GUC UINT64CONST(0xFFFFFFFFFFFF0004) +#define PARALLEL_KEY_COMBO_CID UINT64CONST(0xFFFFFFFFFFFF0005) +#define PARALLEL_KEY_TRANSACTION_SNAPSHOT UINT64CONST(0xFFFFFFFFFFFF0006) +#define PARALLEL_KEY_ACTIVE_SNAPSHOT UINT64CONST(0xFFFFFFFFFFFF0007) +#define PARALLEL_KEY_TRANSACTION_STATE UINT64CONST(0xFFFFFFFFFFFF0008) +#define PARALLEL_KEY_ENTRYPOINT UINT64CONST(0xFFFFFFFFFFFF0009) +#define PARALLEL_KEY_SESSION_DSM UINT64CONST(0xFFFFFFFFFFFF000A) +#define PARALLEL_KEY_PENDING_SYNCS UINT64CONST(0xFFFFFFFFFFFF000B) +#define PARALLEL_KEY_REINDEX_STATE UINT64CONST(0xFFFFFFFFFFFF000C) +#define PARALLEL_KEY_RELMAPPER_STATE UINT64CONST(0xFFFFFFFFFFFF000D) +#define PARALLEL_KEY_UNCOMMITTEDENUMS UINT64CONST(0xFFFFFFFFFFFF000E) + +/* Fixed-size parallel state. */ +typedef struct FixedParallelState +{ + /* Fixed-size state that workers must restore. 
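+ * These fields are filled in by InitializeParallelDSM() and applied on
+ * the worker side in ParallelWorkerMain().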
*/ + Oid database_id; + Oid authenticated_user_id; + Oid current_user_id; + Oid outer_user_id; + Oid temp_namespace_id; + Oid temp_toast_namespace_id; + int sec_context; + bool is_superuser; + PGPROC *parallel_leader_pgproc; + pid_t parallel_leader_pid; + BackendId parallel_leader_backend_id; + TimestampTz xact_ts; + TimestampTz stmt_ts; + SerializableXactHandle serializable_xact_handle; + + /* Mutex protects remaining fields. */ + slock_t mutex; + + /* Maximum XactLastRecEnd of any worker. */ + XLogRecPtr last_xlog_end; +} FixedParallelState; + +/* + * Our parallel worker number. We initialize this to -1, meaning that we are + * not a parallel worker. In parallel workers, it will be set to a value >= 0 + * and < the number of workers before any user code is invoked; each parallel + * worker will get a different parallel worker number. + */ +int ParallelWorkerNumber = -1; + +/* Is there a parallel message pending which we need to receive? */ +volatile bool ParallelMessagePending = false; + +/* Are we initializing a parallel worker? */ +bool InitializingParallelWorker = false; + +/* Pointer to our fixed parallel state. */ +static FixedParallelState *MyFixedParallelState; + +/* List of active parallel contexts. */ +static dlist_head pcxt_list = DLIST_STATIC_INIT(pcxt_list); + +/* Backend-local copy of data from FixedParallelState. */ +static pid_t ParallelLeaderPid; + +/* + * List of internal parallel worker entry points. We need this for + * reasons explained in LookupParallelWorkerFunction(), below. + */ +static const struct +{ + const char *fn_name; + parallel_worker_main_type fn_addr; +} InternalParallelWorkers[] = + +{ + { + "ParallelQueryMain", ParallelQueryMain + }, + { + "_bt_parallel_build_main", _bt_parallel_build_main + }, + { + "parallel_vacuum_main", parallel_vacuum_main + } +}; + +/* Private functions. */ +static void HandleParallelMessage(ParallelContext *pcxt, int i, StringInfo msg); +static void WaitForParallelWorkersToExit(ParallelContext *pcxt); +static parallel_worker_main_type LookupParallelWorkerFunction(const char *libraryname, const char *funcname); +static void ParallelWorkerShutdown(int code, Datum arg); + + +/* + * Establish a new parallel context. This should be done after entering + * parallel mode, and (unless there is an error) the context should be + * destroyed before exiting the current subtransaction. + */ +ParallelContext * +CreateParallelContext(const char *library_name, const char *function_name, + int nworkers) +{ + MemoryContext oldcontext; + ParallelContext *pcxt; + + /* It is unsafe to create a parallel context if not in parallel mode. */ + Assert(IsInParallelMode()); + + /* Number of workers should be non-negative. */ + Assert(nworkers >= 0); + + /* We might be running in a short-lived memory context. */ + oldcontext = MemoryContextSwitchTo(TopTransactionContext); + + /* Initialize a new ParallelContext. */ + pcxt = palloc0(sizeof(ParallelContext)); + pcxt->subid = GetCurrentSubTransactionId(); + pcxt->nworkers = nworkers; + pcxt->nworkers_to_launch = nworkers; + pcxt->library_name = pstrdup(library_name); + pcxt->function_name = pstrdup(function_name); + pcxt->error_context_stack = error_context_stack; + shm_toc_initialize_estimator(&pcxt->estimator); + dlist_push_head(&pcxt_list, &pcxt->node); + + /* Restore previous memory context. 
*/ + MemoryContextSwitchTo(oldcontext); + + return pcxt; +} + +/* + * Establish the dynamic shared memory segment for a parallel context and + * copy state and other bookkeeping information that will be needed by + * parallel workers into it. + */ +void +InitializeParallelDSM(ParallelContext *pcxt) +{ + MemoryContext oldcontext; + Size library_len = 0; + Size guc_len = 0; + Size combocidlen = 0; + Size tsnaplen = 0; + Size asnaplen = 0; + Size tstatelen = 0; + Size pendingsyncslen = 0; + Size reindexlen = 0; + Size relmapperlen = 0; + Size uncommittedenumslen = 0; + Size segsize = 0; + int i; + FixedParallelState *fps; + dsm_handle session_dsm_handle = DSM_HANDLE_INVALID; + Snapshot transaction_snapshot = GetTransactionSnapshot(); + Snapshot active_snapshot = GetActiveSnapshot(); + + /* We might be running in a very short-lived memory context. */ + oldcontext = MemoryContextSwitchTo(TopTransactionContext); + + /* Allow space to store the fixed-size parallel state. */ + shm_toc_estimate_chunk(&pcxt->estimator, sizeof(FixedParallelState)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* + * Normally, the user will have requested at least one worker process, but + * if by chance they have not, we can skip a bunch of things here. + */ + if (pcxt->nworkers > 0) + { + /* Get (or create) the per-session DSM segment's handle. */ + session_dsm_handle = GetSessionDsmHandle(); + + /* + * If we weren't able to create a per-session DSM segment, then we can + * continue but we can't safely launch any workers because their + * record typmods would be incompatible so they couldn't exchange + * tuples. + */ + if (session_dsm_handle == DSM_HANDLE_INVALID) + pcxt->nworkers = 0; + } + + if (pcxt->nworkers > 0) + { + /* Estimate space for various kinds of state sharing. */ + library_len = EstimateLibraryStateSpace(); + shm_toc_estimate_chunk(&pcxt->estimator, library_len); + guc_len = EstimateGUCStateSpace(); + shm_toc_estimate_chunk(&pcxt->estimator, guc_len); + combocidlen = EstimateComboCIDStateSpace(); + shm_toc_estimate_chunk(&pcxt->estimator, combocidlen); + if (IsolationUsesXactSnapshot()) + { + tsnaplen = EstimateSnapshotSpace(transaction_snapshot); + shm_toc_estimate_chunk(&pcxt->estimator, tsnaplen); + } + asnaplen = EstimateSnapshotSpace(active_snapshot); + shm_toc_estimate_chunk(&pcxt->estimator, asnaplen); + tstatelen = EstimateTransactionStateSpace(); + shm_toc_estimate_chunk(&pcxt->estimator, tstatelen); + shm_toc_estimate_chunk(&pcxt->estimator, sizeof(dsm_handle)); + pendingsyncslen = EstimatePendingSyncsSpace(); + shm_toc_estimate_chunk(&pcxt->estimator, pendingsyncslen); + reindexlen = EstimateReindexStateSpace(); + shm_toc_estimate_chunk(&pcxt->estimator, reindexlen); + relmapperlen = EstimateRelationMapSpace(); + shm_toc_estimate_chunk(&pcxt->estimator, relmapperlen); + uncommittedenumslen = EstimateUncommittedEnumsSpace(); + shm_toc_estimate_chunk(&pcxt->estimator, uncommittedenumslen); + /* If you add more chunks here, you probably need to add keys. */ + shm_toc_estimate_keys(&pcxt->estimator, 11); + + /* Estimate space need for error queues. */ + StaticAssertStmt(BUFFERALIGN(PARALLEL_ERROR_QUEUE_SIZE) == + PARALLEL_ERROR_QUEUE_SIZE, + "parallel error queue size not buffer-aligned"); + shm_toc_estimate_chunk(&pcxt->estimator, + mul_size(PARALLEL_ERROR_QUEUE_SIZE, + pcxt->nworkers)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* Estimate how much we'll need for the entrypoint info. 
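+ * That is, the library name and the function name laid out back to
+ * back, each with its terminating NUL byte; the matching
+ * shm_toc_allocate() appears further down.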
*/ + shm_toc_estimate_chunk(&pcxt->estimator, strlen(pcxt->library_name) + + strlen(pcxt->function_name) + 2); + shm_toc_estimate_keys(&pcxt->estimator, 1); + } + + /* + * Create DSM and initialize with new table of contents. But if the user + * didn't request any workers, then don't bother creating a dynamic shared + * memory segment; instead, just use backend-private memory. + * + * Also, if we can't create a dynamic shared memory segment because the + * maximum number of segments have already been created, then fall back to + * backend-private memory, and plan not to use any workers. We hope this + * won't happen very often, but it's better to abandon the use of + * parallelism than to fail outright. + */ + segsize = shm_toc_estimate(&pcxt->estimator); + if (pcxt->nworkers > 0) + pcxt->seg = dsm_create(segsize, DSM_CREATE_NULL_IF_MAXSEGMENTS); + if (pcxt->seg != NULL) + pcxt->toc = shm_toc_create(PARALLEL_MAGIC, + dsm_segment_address(pcxt->seg), + segsize); + else + { + pcxt->nworkers = 0; + pcxt->private_memory = MemoryContextAlloc(TopMemoryContext, segsize); + pcxt->toc = shm_toc_create(PARALLEL_MAGIC, pcxt->private_memory, + segsize); + } + + /* Initialize fixed-size state in shared memory. */ + fps = (FixedParallelState *) + shm_toc_allocate(pcxt->toc, sizeof(FixedParallelState)); + fps->database_id = MyDatabaseId; + fps->authenticated_user_id = GetAuthenticatedUserId(); + fps->outer_user_id = GetCurrentRoleId(); + fps->is_superuser = session_auth_is_superuser; + GetUserIdAndSecContext(&fps->current_user_id, &fps->sec_context); + GetTempNamespaceState(&fps->temp_namespace_id, + &fps->temp_toast_namespace_id); + fps->parallel_leader_pgproc = MyProc; + fps->parallel_leader_pid = MyProcPid; + fps->parallel_leader_backend_id = MyBackendId; + fps->xact_ts = GetCurrentTransactionStartTimestamp(); + fps->stmt_ts = GetCurrentStatementStartTimestamp(); + fps->serializable_xact_handle = ShareSerializableXact(); + SpinLockInit(&fps->mutex); + fps->last_xlog_end = 0; + shm_toc_insert(pcxt->toc, PARALLEL_KEY_FIXED, fps); + + /* We can skip the rest of this if we're not budgeting for any workers. */ + if (pcxt->nworkers > 0) + { + char *libraryspace; + char *gucspace; + char *combocidspace; + char *tsnapspace; + char *asnapspace; + char *tstatespace; + char *pendingsyncsspace; + char *reindexspace; + char *relmapperspace; + char *error_queue_space; + char *session_dsm_handle_space; + char *entrypointstate; + char *uncommittedenumsspace; + Size lnamelen; + + /* Serialize shared libraries we have loaded. */ + libraryspace = shm_toc_allocate(pcxt->toc, library_len); + SerializeLibraryState(library_len, libraryspace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_LIBRARY, libraryspace); + + /* Serialize GUC settings. */ + gucspace = shm_toc_allocate(pcxt->toc, guc_len); + SerializeGUCState(guc_len, gucspace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_GUC, gucspace); + + /* Serialize combo CID state. */ + combocidspace = shm_toc_allocate(pcxt->toc, combocidlen); + SerializeComboCIDState(combocidlen, combocidspace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_COMBO_CID, combocidspace); + + /* + * Serialize the transaction snapshot if the transaction + * isolation-level uses a transaction snapshot. + */ + if (IsolationUsesXactSnapshot()) + { + tsnapspace = shm_toc_allocate(pcxt->toc, tsnaplen); + SerializeSnapshot(transaction_snapshot, tsnapspace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_TRANSACTION_SNAPSHOT, + tsnapspace); + } + + /* Serialize the active snapshot. 
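+ * Unlike the transaction snapshot above, the active snapshot is always
+ * serialized; a worker that finds no serialized transaction snapshot
+ * reuses the active one instead (see ParallelWorkerMain()).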
*/ + asnapspace = shm_toc_allocate(pcxt->toc, asnaplen); + SerializeSnapshot(active_snapshot, asnapspace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_ACTIVE_SNAPSHOT, asnapspace); + + /* Provide the handle for per-session segment. */ + session_dsm_handle_space = shm_toc_allocate(pcxt->toc, + sizeof(dsm_handle)); + *(dsm_handle *) session_dsm_handle_space = session_dsm_handle; + shm_toc_insert(pcxt->toc, PARALLEL_KEY_SESSION_DSM, + session_dsm_handle_space); + + /* Serialize transaction state. */ + tstatespace = shm_toc_allocate(pcxt->toc, tstatelen); + SerializeTransactionState(tstatelen, tstatespace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_TRANSACTION_STATE, tstatespace); + + /* Serialize pending syncs. */ + pendingsyncsspace = shm_toc_allocate(pcxt->toc, pendingsyncslen); + SerializePendingSyncs(pendingsyncslen, pendingsyncsspace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_PENDING_SYNCS, + pendingsyncsspace); + + /* Serialize reindex state. */ + reindexspace = shm_toc_allocate(pcxt->toc, reindexlen); + SerializeReindexState(reindexlen, reindexspace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_REINDEX_STATE, reindexspace); + + /* Serialize relmapper state. */ + relmapperspace = shm_toc_allocate(pcxt->toc, relmapperlen); + SerializeRelationMap(relmapperlen, relmapperspace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_RELMAPPER_STATE, + relmapperspace); + + /* Serialize uncommitted enum state. */ + uncommittedenumsspace = shm_toc_allocate(pcxt->toc, + uncommittedenumslen); + SerializeUncommittedEnums(uncommittedenumsspace, uncommittedenumslen); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_UNCOMMITTEDENUMS, + uncommittedenumsspace); + + /* Allocate space for worker information. */ + pcxt->worker = palloc0(sizeof(ParallelWorkerInfo) * pcxt->nworkers); + + /* + * Establish error queues in dynamic shared memory. + * + * These queues should be used only for transmitting ErrorResponse, + * NoticeResponse, and NotifyResponse protocol messages. Tuple data + * should be transmitted via separate (possibly larger?) queues. + */ + error_queue_space = + shm_toc_allocate(pcxt->toc, + mul_size(PARALLEL_ERROR_QUEUE_SIZE, + pcxt->nworkers)); + for (i = 0; i < pcxt->nworkers; ++i) + { + char *start; + shm_mq *mq; + + start = error_queue_space + i * PARALLEL_ERROR_QUEUE_SIZE; + mq = shm_mq_create(start, PARALLEL_ERROR_QUEUE_SIZE); + shm_mq_set_receiver(mq, MyProc); + pcxt->worker[i].error_mqh = shm_mq_attach(mq, pcxt->seg, NULL); + } + shm_toc_insert(pcxt->toc, PARALLEL_KEY_ERROR_QUEUE, error_queue_space); + + /* + * Serialize entrypoint information. It's unsafe to pass function + * pointers across processes, as the function pointer may be different + * in each process in EXEC_BACKEND builds, so we always pass library + * and function name. (We use library name "postgres" for functions + * in the core backend.) + */ + lnamelen = strlen(pcxt->library_name); + entrypointstate = shm_toc_allocate(pcxt->toc, lnamelen + + strlen(pcxt->function_name) + 2); + strcpy(entrypointstate, pcxt->library_name); + strcpy(entrypointstate + lnamelen + 1, pcxt->function_name); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_ENTRYPOINT, entrypointstate); + } + + /* Restore previous memory context. */ + MemoryContextSwitchTo(oldcontext); +} + +/* + * Reinitialize the dynamic shared memory segment for a parallel context such + * that we could launch workers for it again. + */ +void +ReinitializeParallelDSM(ParallelContext *pcxt) +{ + FixedParallelState *fps; + + /* Wait for any old workers to exit. 
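+ * We wait both for their final messages (WaitForParallelWorkersToFinish)
+ * and for the processes themselves to exit (WaitForParallelWorkersToExit)
+ * before the error queues are recreated below.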
*/ + if (pcxt->nworkers_launched > 0) + { + WaitForParallelWorkersToFinish(pcxt); + WaitForParallelWorkersToExit(pcxt); + pcxt->nworkers_launched = 0; + if (pcxt->known_attached_workers) + { + pfree(pcxt->known_attached_workers); + pcxt->known_attached_workers = NULL; + pcxt->nknown_attached_workers = 0; + } + } + + /* Reset a few bits of fixed parallel state to a clean state. */ + fps = shm_toc_lookup(pcxt->toc, PARALLEL_KEY_FIXED, false); + fps->last_xlog_end = 0; + + /* Recreate error queues (if they exist). */ + if (pcxt->nworkers > 0) + { + char *error_queue_space; + int i; + + error_queue_space = + shm_toc_lookup(pcxt->toc, PARALLEL_KEY_ERROR_QUEUE, false); + for (i = 0; i < pcxt->nworkers; ++i) + { + char *start; + shm_mq *mq; + + start = error_queue_space + i * PARALLEL_ERROR_QUEUE_SIZE; + mq = shm_mq_create(start, PARALLEL_ERROR_QUEUE_SIZE); + shm_mq_set_receiver(mq, MyProc); + pcxt->worker[i].error_mqh = shm_mq_attach(mq, pcxt->seg, NULL); + } + } +} + +/* + * Reinitialize parallel workers for a parallel context such that we could + * launch a different number of workers. This is required for cases where + * we need to reuse the same DSM segment, but the number of workers can + * vary from run-to-run. + */ +void +ReinitializeParallelWorkers(ParallelContext *pcxt, int nworkers_to_launch) +{ + /* + * The number of workers that need to be launched must be less than the + * number of workers with which the parallel context is initialized. + */ + Assert(pcxt->nworkers >= nworkers_to_launch); + pcxt->nworkers_to_launch = nworkers_to_launch; +} + +/* + * Launch parallel workers. + */ +void +LaunchParallelWorkers(ParallelContext *pcxt) +{ + MemoryContext oldcontext; + BackgroundWorker worker; + int i; + bool any_registrations_failed = false; + + /* Skip this if we have no workers. */ + if (pcxt->nworkers == 0 || pcxt->nworkers_to_launch == 0) + return; + + /* We need to be a lock group leader. */ + BecomeLockGroupLeader(); + + /* If we do have workers, we'd better have a DSM segment. */ + Assert(pcxt->seg != NULL); + + /* We might be running in a short-lived memory context. */ + oldcontext = MemoryContextSwitchTo(TopTransactionContext); + + /* Configure a worker. */ + memset(&worker, 0, sizeof(worker)); + snprintf(worker.bgw_name, BGW_MAXLEN, "parallel worker for PID %d", + MyProcPid); + snprintf(worker.bgw_type, BGW_MAXLEN, "parallel worker"); + worker.bgw_flags = + BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION + | BGWORKER_CLASS_PARALLEL; + worker.bgw_start_time = BgWorkerStart_ConsistentState; + worker.bgw_restart_time = BGW_NEVER_RESTART; + sprintf(worker.bgw_library_name, "postgres"); + sprintf(worker.bgw_function_name, "ParallelWorkerMain"); + worker.bgw_main_arg = UInt32GetDatum(dsm_segment_handle(pcxt->seg)); + worker.bgw_notify_pid = MyProcPid; + + /* + * Start workers. + * + * The caller must be able to tolerate ending up with fewer workers than + * expected, so there is no need to throw an error here if registration + * fails. It wouldn't help much anyway, because registering the worker in + * no way guarantees that it will start up and initialize successfully. 
+ */ + for (i = 0; i < pcxt->nworkers_to_launch; ++i) + { + memcpy(worker.bgw_extra, &i, sizeof(int)); + if (!any_registrations_failed && + RegisterDynamicBackgroundWorker(&worker, + &pcxt->worker[i].bgwhandle)) + { + shm_mq_set_handle(pcxt->worker[i].error_mqh, + pcxt->worker[i].bgwhandle); + pcxt->nworkers_launched++; + } + else + { + /* + * If we weren't able to register the worker, then we've bumped up + * against the max_worker_processes limit, and future + * registrations will probably fail too, so arrange to skip them. + * But we still have to execute this code for the remaining slots + * to make sure that we forget about the error queues we budgeted + * for those workers. Otherwise, we'll wait for them to start, + * but they never will. + */ + any_registrations_failed = true; + pcxt->worker[i].bgwhandle = NULL; + shm_mq_detach(pcxt->worker[i].error_mqh); + pcxt->worker[i].error_mqh = NULL; + } + } + + /* + * Now that nworkers_launched has taken its final value, we can initialize + * known_attached_workers. + */ + if (pcxt->nworkers_launched > 0) + { + pcxt->known_attached_workers = + palloc0(sizeof(bool) * pcxt->nworkers_launched); + pcxt->nknown_attached_workers = 0; + } + + /* Restore previous memory context. */ + MemoryContextSwitchTo(oldcontext); +} + +/* + * Wait for all workers to attach to their error queues, and throw an error if + * any worker fails to do this. + * + * Callers can assume that if this function returns successfully, then the + * number of workers given by pcxt->nworkers_launched have initialized and + * attached to their error queues. Whether or not these workers are guaranteed + * to still be running depends on what code the caller asked them to run; + * this function does not guarantee that they have not exited. However, it + * does guarantee that any workers which exited must have done so cleanly and + * after successfully performing the work with which they were tasked. + * + * If this function is not called, then some of the workers that were launched + * may not have been started due to a fork() failure, or may have exited during + * early startup prior to attaching to the error queue, so nworkers_launched + * cannot be viewed as completely reliable. It will never be less than the + * number of workers which actually started, but it might be more. Any workers + * that failed to start will still be discovered by + * WaitForParallelWorkersToFinish and an error will be thrown at that time, + * provided that function is eventually reached. + * + * In general, the leader process should do as much work as possible before + * calling this function. fork() failures and other early-startup failures + * are very uncommon, and having the leader sit idle when it could be doing + * useful work is undesirable. However, if the leader needs to wait for + * all of its workers or for a specific worker, it may want to call this + * function before doing so. If not, it must make some other provision for + * the failure-to-start case, lest it wait forever. On the other hand, a + * leader which never waits for a worker that might not be started yet, or + * at least never does so prior to WaitForParallelWorkersToFinish(), need not + * call this function at all. + */ +void +WaitForParallelWorkersToAttach(ParallelContext *pcxt) +{ + int i; + + /* Skip this if we have no launched workers. */ + if (pcxt->nworkers_launched == 0) + return; + + for (;;) + { + /* + * This will process any parallel messages that are pending and it may + * also throw an error propagated from a worker. 
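+ * (Parallel messages are handled by HandleParallelMessages(), below,
+ * which is reached from ProcessInterrupts().)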
+ */ + CHECK_FOR_INTERRUPTS(); + + for (i = 0; i < pcxt->nworkers_launched; ++i) + { + BgwHandleStatus status; + shm_mq *mq; + int rc; + pid_t pid; + + if (pcxt->known_attached_workers[i]) + continue; + + /* + * If error_mqh is NULL, then the worker has already exited + * cleanly. + */ + if (pcxt->worker[i].error_mqh == NULL) + { + pcxt->known_attached_workers[i] = true; + ++pcxt->nknown_attached_workers; + continue; + } + + status = GetBackgroundWorkerPid(pcxt->worker[i].bgwhandle, &pid); + if (status == BGWH_STARTED) + { + /* Has the worker attached to the error queue? */ + mq = shm_mq_get_queue(pcxt->worker[i].error_mqh); + if (shm_mq_get_sender(mq) != NULL) + { + /* Yes, so it is known to be attached. */ + pcxt->known_attached_workers[i] = true; + ++pcxt->nknown_attached_workers; + } + } + else if (status == BGWH_STOPPED) + { + /* + * If the worker stopped without attaching to the error queue, + * throw an error. + */ + mq = shm_mq_get_queue(pcxt->worker[i].error_mqh); + if (shm_mq_get_sender(mq) == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("parallel worker failed to initialize"), + errhint("More details may be available in the server log."))); + + pcxt->known_attached_workers[i] = true; + ++pcxt->nknown_attached_workers; + } + else + { + /* + * Worker not yet started, so we must wait. The postmaster + * will notify us if the worker's state changes. Our latch + * might also get set for some other reason, but if so we'll + * just end up waiting for the same worker again. + */ + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, + -1, WAIT_EVENT_BGWORKER_STARTUP); + + if (rc & WL_LATCH_SET) + ResetLatch(MyLatch); + } + } + + /* If all workers are known to have started, we're done. */ + if (pcxt->nknown_attached_workers >= pcxt->nworkers_launched) + { + Assert(pcxt->nknown_attached_workers == pcxt->nworkers_launched); + break; + } + } +} + +/* + * Wait for all workers to finish computing. + * + * Even if the parallel operation seems to have completed successfully, it's + * important to call this function afterwards. We must not miss any errors + * the workers may have thrown during the parallel operation, or any that they + * may yet throw while shutting down. + * + * Also, we want to update our notion of XactLastRecEnd based on worker + * feedback. + */ +void +WaitForParallelWorkersToFinish(ParallelContext *pcxt) +{ + for (;;) + { + bool anyone_alive = false; + int nfinished = 0; + int i; + + /* + * This will process any parallel messages that are pending, which may + * change the outcome of the loop that follows. It may also throw an + * error propagated from a worker. + */ + CHECK_FOR_INTERRUPTS(); + + for (i = 0; i < pcxt->nworkers_launched; ++i) + { + /* + * If error_mqh is NULL, then the worker has already exited + * cleanly. If we have received a message through error_mqh from + * the worker, we know it started up cleanly, and therefore we're + * certain to be notified when it exits. + */ + if (pcxt->worker[i].error_mqh == NULL) + ++nfinished; + else if (pcxt->known_attached_workers[i]) + { + anyone_alive = true; + break; + } + } + + if (!anyone_alive) + { + /* If all workers are known to have finished, we're done. */ + if (nfinished >= pcxt->nworkers_launched) + { + Assert(nfinished == pcxt->nworkers_launched); + break; + } + + /* + * We didn't detect any living workers, but not all workers are + * known to have exited cleanly. 
Either not all workers have + * launched yet, or maybe some of them failed to start or + * terminated abnormally. + */ + for (i = 0; i < pcxt->nworkers_launched; ++i) + { + pid_t pid; + shm_mq *mq; + + /* + * If the worker is BGWH_NOT_YET_STARTED or BGWH_STARTED, we + * should just keep waiting. If it is BGWH_STOPPED, then + * further investigation is needed. + */ + if (pcxt->worker[i].error_mqh == NULL || + pcxt->worker[i].bgwhandle == NULL || + GetBackgroundWorkerPid(pcxt->worker[i].bgwhandle, + &pid) != BGWH_STOPPED) + continue; + + /* + * Check whether the worker ended up stopped without ever + * attaching to the error queue. If so, the postmaster was + * unable to fork the worker or it exited without initializing + * properly. We must throw an error, since the caller may + * have been expecting the worker to do some work before + * exiting. + */ + mq = shm_mq_get_queue(pcxt->worker[i].error_mqh); + if (shm_mq_get_sender(mq) == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("parallel worker failed to initialize"), + errhint("More details may be available in the server log."))); + + /* + * The worker is stopped, but is attached to the error queue. + * Unless there's a bug somewhere, this will only happen when + * the worker writes messages and terminates after the + * CHECK_FOR_INTERRUPTS() near the top of this function and + * before the call to GetBackgroundWorkerPid(). In that case, + * or latch should have been set as well and the right things + * will happen on the next pass through the loop. + */ + } + } + + (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, -1, + WAIT_EVENT_PARALLEL_FINISH); + ResetLatch(MyLatch); + } + + if (pcxt->toc != NULL) + { + FixedParallelState *fps; + + fps = shm_toc_lookup(pcxt->toc, PARALLEL_KEY_FIXED, false); + if (fps->last_xlog_end > XactLastRecEnd) + XactLastRecEnd = fps->last_xlog_end; + } +} + +/* + * Wait for all workers to exit. + * + * This function ensures that workers have been completely shutdown. The + * difference between WaitForParallelWorkersToFinish and this function is + * that the former just ensures that last message sent by a worker backend is + * received by the leader backend whereas this ensures the complete shutdown. + */ +static void +WaitForParallelWorkersToExit(ParallelContext *pcxt) +{ + int i; + + /* Wait until the workers actually die. */ + for (i = 0; i < pcxt->nworkers_launched; ++i) + { + BgwHandleStatus status; + + if (pcxt->worker == NULL || pcxt->worker[i].bgwhandle == NULL) + continue; + + status = WaitForBackgroundWorkerShutdown(pcxt->worker[i].bgwhandle); + + /* + * If the postmaster kicked the bucket, we have no chance of cleaning + * up safely -- we won't be able to tell when our workers are actually + * dead. This doesn't necessitate a PANIC since they will all abort + * eventually, but we can't safely continue this session. + */ + if (status == BGWH_POSTMASTER_DIED) + ereport(FATAL, + (errcode(ERRCODE_ADMIN_SHUTDOWN), + errmsg("postmaster exited during a parallel transaction"))); + + /* Release memory. */ + pfree(pcxt->worker[i].bgwhandle); + pcxt->worker[i].bgwhandle = NULL; + } +} + +/* + * Destroy a parallel context. + * + * If expecting a clean exit, you should use WaitForParallelWorkersToFinish() + * first, before calling this function. When this function is invoked, any + * remaining workers are forcibly killed; the dynamic shared memory segment + * is unmapped; and we then wait (uninterruptibly) for the workers to exit. 
+ */ +void +DestroyParallelContext(ParallelContext *pcxt) +{ + int i; + + /* + * Be careful about order of operations here! We remove the parallel + * context from the list before we do anything else; otherwise, if an + * error occurs during a subsequent step, we might try to nuke it again + * from AtEOXact_Parallel or AtEOSubXact_Parallel. + */ + dlist_delete(&pcxt->node); + + /* Kill each worker in turn, and forget their error queues. */ + if (pcxt->worker != NULL) + { + for (i = 0; i < pcxt->nworkers_launched; ++i) + { + if (pcxt->worker[i].error_mqh != NULL) + { + TerminateBackgroundWorker(pcxt->worker[i].bgwhandle); + + shm_mq_detach(pcxt->worker[i].error_mqh); + pcxt->worker[i].error_mqh = NULL; + } + } + } + + /* + * If we have allocated a shared memory segment, detach it. This will + * implicitly detach the error queues, and any other shared memory queues, + * stored there. + */ + if (pcxt->seg != NULL) + { + dsm_detach(pcxt->seg); + pcxt->seg = NULL; + } + + /* + * If this parallel context is actually in backend-private memory rather + * than shared memory, free that memory instead. + */ + if (pcxt->private_memory != NULL) + { + pfree(pcxt->private_memory); + pcxt->private_memory = NULL; + } + + /* + * We can't finish transaction commit or abort until all of the workers + * have exited. This means, in particular, that we can't respond to + * interrupts at this stage. + */ + HOLD_INTERRUPTS(); + WaitForParallelWorkersToExit(pcxt); + RESUME_INTERRUPTS(); + + /* Free the worker array itself. */ + if (pcxt->worker != NULL) + { + pfree(pcxt->worker); + pcxt->worker = NULL; + } + + /* Free memory. */ + pfree(pcxt->library_name); + pfree(pcxt->function_name); + pfree(pcxt); +} + +/* + * Are there any parallel contexts currently active? + */ +bool +ParallelContextActive(void) +{ + return !dlist_is_empty(&pcxt_list); +} + +/* + * Handle receipt of an interrupt indicating a parallel worker message. + * + * Note: this is called within a signal handler! All we can do is set + * a flag that will cause the next CHECK_FOR_INTERRUPTS() to invoke + * HandleParallelMessages(). + */ +void +HandleParallelMessageInterrupt(void) +{ + InterruptPending = true; + ParallelMessagePending = true; + SetLatch(MyLatch); +} + +/* + * Handle any queued protocol messages received from parallel workers. + */ +void +HandleParallelMessages(void) +{ + dlist_iter iter; + MemoryContext oldcontext; + + static MemoryContext hpm_context = NULL; + + /* + * This is invoked from ProcessInterrupts(), and since some of the + * functions it calls contain CHECK_FOR_INTERRUPTS(), there is a potential + * for recursive calls if more signals are received while this runs. It's + * unclear that recursive entry would be safe, and it doesn't seem useful + * even if it is safe, so let's block interrupts until done. + */ + HOLD_INTERRUPTS(); + + /* + * Moreover, CurrentMemoryContext might be pointing almost anywhere. We + * don't want to risk leaking data into long-lived contexts, so let's do + * our work here in a private context that we can reset on each use. + */ + if (hpm_context == NULL) /* first time through? */ + hpm_context = AllocSetContextCreate(TopMemoryContext, + "HandleParallelMessages", + ALLOCSET_DEFAULT_SIZES); + else + MemoryContextReset(hpm_context); + + oldcontext = MemoryContextSwitchTo(hpm_context); + + /* OK to process messages. Reset the flag saying there are more to do. 
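+ * Clearing the flag before draining the queues means that an interrupt
+ * arriving while we work simply sets it again, so the new message will
+ * be picked up on a later pass.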
*/ + ParallelMessagePending = false; + + dlist_foreach(iter, &pcxt_list) + { + ParallelContext *pcxt; + int i; + + pcxt = dlist_container(ParallelContext, node, iter.cur); + if (pcxt->worker == NULL) + continue; + + for (i = 0; i < pcxt->nworkers_launched; ++i) + { + /* + * Read as many messages as we can from each worker, but stop when + * either (1) the worker's error queue goes away, which can happen + * if we receive a Terminate message from the worker; or (2) no + * more messages can be read from the worker without blocking. + */ + while (pcxt->worker[i].error_mqh != NULL) + { + shm_mq_result res; + Size nbytes; + void *data; + + res = shm_mq_receive(pcxt->worker[i].error_mqh, &nbytes, + &data, true); + if (res == SHM_MQ_WOULD_BLOCK) + break; + else if (res == SHM_MQ_SUCCESS) + { + StringInfoData msg; + + initStringInfo(&msg); + appendBinaryStringInfo(&msg, data, nbytes); + HandleParallelMessage(pcxt, i, &msg); + pfree(msg.data); + } + else + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("lost connection to parallel worker"))); + } + } + } + + MemoryContextSwitchTo(oldcontext); + + /* Might as well clear the context on our way out */ + MemoryContextReset(hpm_context); + + RESUME_INTERRUPTS(); +} + +/* + * Handle a single protocol message received from a single parallel worker. + */ +static void +HandleParallelMessage(ParallelContext *pcxt, int i, StringInfo msg) +{ + char msgtype; + + if (pcxt->known_attached_workers != NULL && + !pcxt->known_attached_workers[i]) + { + pcxt->known_attached_workers[i] = true; + pcxt->nknown_attached_workers++; + } + + msgtype = pq_getmsgbyte(msg); + + switch (msgtype) + { + case 'K': /* BackendKeyData */ + { + int32 pid = pq_getmsgint(msg, 4); + + (void) pq_getmsgint(msg, 4); /* discard cancel key */ + (void) pq_getmsgend(msg); + pcxt->worker[i].pid = pid; + break; + } + + case 'E': /* ErrorResponse */ + case 'N': /* NoticeResponse */ + { + ErrorData edata; + ErrorContextCallback *save_error_context_stack; + + /* Parse ErrorResponse or NoticeResponse. */ + pq_parse_errornotice(msg, &edata); + + /* Death of a worker isn't enough justification for suicide. */ + edata.elevel = Min(edata.elevel, ERROR); + + /* + * If desired, add a context line to show that this is a + * message propagated from a parallel worker. Otherwise, it + * can sometimes be confusing to understand what actually + * happened. (We don't do this in FORCE_PARALLEL_REGRESS mode + * because it causes test-result instability depending on + * whether a parallel worker is actually used or not.) + */ + if (force_parallel_mode != FORCE_PARALLEL_REGRESS) + { + if (edata.context) + edata.context = psprintf("%s\n%s", edata.context, + _("parallel worker")); + else + edata.context = pstrdup(_("parallel worker")); + } + + /* + * Context beyond that should use the error context callbacks + * that were in effect when the ParallelContext was created, + * not the current ones. + */ + save_error_context_stack = error_context_stack; + error_context_stack = pcxt->error_context_stack; + + /* Rethrow error or print notice. */ + ThrowErrorData(&edata); + + /* Not an error, so restore previous context stack. */ + error_context_stack = save_error_context_stack; + + break; + } + + case 'A': /* NotifyResponse */ + { + /* Propagate NotifyResponse. 
*/ + int32 pid; + const char *channel; + const char *payload; + + pid = pq_getmsgint(msg, 4); + channel = pq_getmsgrawstring(msg); + payload = pq_getmsgrawstring(msg); + pq_endmessage(msg); + + NotifyMyFrontEnd(channel, payload, pid); + + break; + } + + case 'X': /* Terminate, indicating clean exit */ + { + shm_mq_detach(pcxt->worker[i].error_mqh); + pcxt->worker[i].error_mqh = NULL; + break; + } + + default: + { + elog(ERROR, "unrecognized message type received from parallel worker: %c (message length %d bytes)", + msgtype, msg->len); + } + } +} + +/* + * End-of-subtransaction cleanup for parallel contexts. + * + * Currently, it's forbidden to enter or leave a subtransaction while + * parallel mode is in effect, so we could just blow away everything. But + * we may want to relax that restriction in the future, so this code + * contemplates that there may be multiple subtransaction IDs in pcxt_list. + */ +void +AtEOSubXact_Parallel(bool isCommit, SubTransactionId mySubId) +{ + while (!dlist_is_empty(&pcxt_list)) + { + ParallelContext *pcxt; + + pcxt = dlist_head_element(ParallelContext, node, &pcxt_list); + if (pcxt->subid != mySubId) + break; + if (isCommit) + elog(WARNING, "leaked parallel context"); + DestroyParallelContext(pcxt); + } +} + +/* + * End-of-transaction cleanup for parallel contexts. + */ +void +AtEOXact_Parallel(bool isCommit) +{ + while (!dlist_is_empty(&pcxt_list)) + { + ParallelContext *pcxt; + + pcxt = dlist_head_element(ParallelContext, node, &pcxt_list); + if (isCommit) + elog(WARNING, "leaked parallel context"); + DestroyParallelContext(pcxt); + } +} + +/* + * Main entrypoint for parallel workers. + */ +void +ParallelWorkerMain(Datum main_arg) +{ + dsm_segment *seg; + shm_toc *toc; + FixedParallelState *fps; + char *error_queue_space; + shm_mq *mq; + shm_mq_handle *mqh; + char *libraryspace; + char *entrypointstate; + char *library_name; + char *function_name; + parallel_worker_main_type entrypt; + char *gucspace; + char *combocidspace; + char *tsnapspace; + char *asnapspace; + char *tstatespace; + char *pendingsyncsspace; + char *reindexspace; + char *relmapperspace; + char *uncommittedenumsspace; + StringInfoData msgbuf; + char *session_dsm_handle_space; + Snapshot tsnapshot; + Snapshot asnapshot; + + /* Set flag to indicate that we're initializing a parallel worker. */ + InitializingParallelWorker = true; + + /* Establish signal handlers. */ + pqsignal(SIGTERM, die); + BackgroundWorkerUnblockSignals(); + + /* Determine and set our parallel worker number. */ + Assert(ParallelWorkerNumber == -1); + memcpy(&ParallelWorkerNumber, MyBgworkerEntry->bgw_extra, sizeof(int)); + + /* Set up a memory context to work in, just for cleanliness. */ + CurrentMemoryContext = AllocSetContextCreate(TopMemoryContext, + "Parallel worker", + ALLOCSET_DEFAULT_SIZES); + + /* + * Attach to the dynamic shared memory segment for the parallel query, and + * find its table of contents. + * + * Note: at this point, we have not created any ResourceOwner in this + * process. This will result in our DSM mapping surviving until process + * exit, which is fine. If there were a ResourceOwner, it would acquire + * ownership of the mapping, but we have no need for that. 
+ */ + seg = dsm_attach(DatumGetUInt32(main_arg)); + if (seg == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not map dynamic shared memory segment"))); + toc = shm_toc_attach(PARALLEL_MAGIC, dsm_segment_address(seg)); + if (toc == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid magic number in dynamic shared memory segment"))); + + /* Look up fixed parallel state. */ + fps = shm_toc_lookup(toc, PARALLEL_KEY_FIXED, false); + MyFixedParallelState = fps; + + /* Arrange to signal the leader if we exit. */ + ParallelLeaderPid = fps->parallel_leader_pid; + ParallelLeaderBackendId = fps->parallel_leader_backend_id; + on_shmem_exit(ParallelWorkerShutdown, (Datum) 0); + + /* + * Now we can find and attach to the error queue provided for us. That's + * good, because until we do that, any errors that happen here will not be + * reported back to the process that requested that this worker be + * launched. + */ + error_queue_space = shm_toc_lookup(toc, PARALLEL_KEY_ERROR_QUEUE, false); + mq = (shm_mq *) (error_queue_space + + ParallelWorkerNumber * PARALLEL_ERROR_QUEUE_SIZE); + shm_mq_set_sender(mq, MyProc); + mqh = shm_mq_attach(mq, seg, NULL); + pq_redirect_to_shm_mq(seg, mqh); + pq_set_parallel_leader(fps->parallel_leader_pid, + fps->parallel_leader_backend_id); + + /* + * Send a BackendKeyData message to the process that initiated parallelism + * so that it has access to our PID before it receives any other messages + * from us. Our cancel key is sent, too, since that's the way the + * protocol message is defined, but it won't actually be used for anything + * in this case. + */ + pq_beginmessage(&msgbuf, 'K'); + pq_sendint32(&msgbuf, (int32) MyProcPid); + pq_sendint32(&msgbuf, (int32) MyCancelKey); + pq_endmessage(&msgbuf); + + /* + * Hooray! Primary initialization is complete. Now, we need to set up our + * backend-local state to match the original backend. + */ + + /* + * Join locking group. We must do this before anything that could try to + * acquire a heavyweight lock, because any heavyweight locks acquired to + * this point could block either directly against the parallel group + * leader or against some process which in turn waits for a lock that + * conflicts with the parallel group leader, causing an undetected + * deadlock. (If we can't join the lock group, the leader has gone away, + * so just exit quietly.) + */ + if (!BecomeLockGroupMember(fps->parallel_leader_pgproc, + fps->parallel_leader_pid)) + return; + + /* + * Restore transaction and statement start-time timestamps. This must + * happen before anything that would start a transaction, else asserts in + * xact.c will fire. + */ + SetParallelStartTimestamps(fps->xact_ts, fps->stmt_ts); + + /* + * Identify the entry point to be called. In theory this could result in + * loading an additional library, though most likely the entry point is in + * the core backend or in a library we just loaded. + */ + entrypointstate = shm_toc_lookup(toc, PARALLEL_KEY_ENTRYPOINT, false); + library_name = entrypointstate; + function_name = entrypointstate + strlen(library_name) + 1; + + entrypt = LookupParallelWorkerFunction(library_name, function_name); + + /* Restore database connection. */ + BackgroundWorkerInitializeConnectionByOid(fps->database_id, + fps->authenticated_user_id, + 0); + + /* + * Set the client encoding to the database encoding, since that is what + * the leader will expect. 
+ */ + SetClientEncoding(GetDatabaseEncoding()); + + /* + * Load libraries that were loaded by original backend. We want to do + * this before restoring GUCs, because the libraries might define custom + * variables. + */ + libraryspace = shm_toc_lookup(toc, PARALLEL_KEY_LIBRARY, false); + StartTransactionCommand(); + RestoreLibraryState(libraryspace); + + /* Restore GUC values from launching backend. */ + gucspace = shm_toc_lookup(toc, PARALLEL_KEY_GUC, false); + RestoreGUCState(gucspace); + CommitTransactionCommand(); + + /* Crank up a transaction state appropriate to a parallel worker. */ + tstatespace = shm_toc_lookup(toc, PARALLEL_KEY_TRANSACTION_STATE, false); + StartParallelWorkerTransaction(tstatespace); + + /* Restore combo CID state. */ + combocidspace = shm_toc_lookup(toc, PARALLEL_KEY_COMBO_CID, false); + RestoreComboCIDState(combocidspace); + + /* Attach to the per-session DSM segment and contained objects. */ + session_dsm_handle_space = + shm_toc_lookup(toc, PARALLEL_KEY_SESSION_DSM, false); + AttachSession(*(dsm_handle *) session_dsm_handle_space); + + /* + * If the transaction isolation level is REPEATABLE READ or SERIALIZABLE, + * the leader has serialized the transaction snapshot and we must restore + * it. At lower isolation levels, there is no transaction-lifetime + * snapshot, but we need TransactionXmin to get set to a value which is + * less than or equal to the xmin of every snapshot that will be used by + * this worker. The easiest way to accomplish that is to install the + * active snapshot as the transaction snapshot. Code running in this + * parallel worker might take new snapshots via GetTransactionSnapshot() + * or GetLatestSnapshot(), but it shouldn't have any way of acquiring a + * snapshot older than the active snapshot. + */ + asnapspace = shm_toc_lookup(toc, PARALLEL_KEY_ACTIVE_SNAPSHOT, false); + tsnapspace = shm_toc_lookup(toc, PARALLEL_KEY_TRANSACTION_SNAPSHOT, true); + asnapshot = RestoreSnapshot(asnapspace); + tsnapshot = tsnapspace ? RestoreSnapshot(tsnapspace) : asnapshot; + RestoreTransactionSnapshot(tsnapshot, + fps->parallel_leader_pgproc); + PushActiveSnapshot(asnapshot); + + /* + * We've changed which tuples we can see, and must therefore invalidate + * system caches. + */ + InvalidateSystemCaches(); + + /* + * Restore current role id. Skip verifying whether session user is + * allowed to become this role and blindly restore the leader's state for + * current role. + */ + SetCurrentRoleId(fps->outer_user_id, fps->is_superuser); + + /* Restore user ID and security context. */ + SetUserIdAndSecContext(fps->current_user_id, fps->sec_context); + + /* Restore temp-namespace state to ensure search path matches leader's. */ + SetTempNamespaceState(fps->temp_namespace_id, + fps->temp_toast_namespace_id); + + /* Restore pending syncs. */ + pendingsyncsspace = shm_toc_lookup(toc, PARALLEL_KEY_PENDING_SYNCS, + false); + RestorePendingSyncs(pendingsyncsspace); + + /* Restore reindex state. */ + reindexspace = shm_toc_lookup(toc, PARALLEL_KEY_REINDEX_STATE, false); + RestoreReindexState(reindexspace); + + /* Restore relmapper state. */ + relmapperspace = shm_toc_lookup(toc, PARALLEL_KEY_RELMAPPER_STATE, false); + RestoreRelationMap(relmapperspace); + + /* Restore uncommitted enums. */ + uncommittedenumsspace = shm_toc_lookup(toc, PARALLEL_KEY_UNCOMMITTEDENUMS, + false); + RestoreUncommittedEnums(uncommittedenumsspace); + + /* Attach to the leader's serializable transaction, if SERIALIZABLE. 
*/ + AttachSerializableXact(fps->serializable_xact_handle); + + /* + * We've initialized all of our state now; nothing should change + * hereafter. + */ + InitializingParallelWorker = false; + EnterParallelMode(); + + /* + * Time to do the real work: invoke the caller-supplied code. + */ + entrypt(seg, toc); + + /* Must exit parallel mode to pop active snapshot. */ + ExitParallelMode(); + + /* Must pop active snapshot so snapmgr.c doesn't complain. */ + PopActiveSnapshot(); + + /* Shut down the parallel-worker transaction. */ + EndParallelWorkerTransaction(); + + /* Detach from the per-session DSM segment. */ + DetachSession(); + + /* Report success. */ + pq_putmessage('X', NULL, 0); +} + +/* + * Update shared memory with the ending location of the last WAL record we + * wrote, if it's greater than the value already stored there. + */ +void +ParallelWorkerReportLastRecEnd(XLogRecPtr last_xlog_end) +{ + FixedParallelState *fps = MyFixedParallelState; + + Assert(fps != NULL); + SpinLockAcquire(&fps->mutex); + if (fps->last_xlog_end < last_xlog_end) + fps->last_xlog_end = last_xlog_end; + SpinLockRelease(&fps->mutex); +} + +/* + * Make sure the leader tries to read from our error queue one more time. + * This guards against the case where we exit uncleanly without sending an + * ErrorResponse to the leader, for example because some code calls proc_exit + * directly. + */ +static void +ParallelWorkerShutdown(int code, Datum arg) +{ + SendProcSignal(ParallelLeaderPid, + PROCSIG_PARALLEL_MESSAGE, + ParallelLeaderBackendId); +} + +/* + * Look up (and possibly load) a parallel worker entry point function. + * + * For functions contained in the core code, we use library name "postgres" + * and consult the InternalParallelWorkers array. External functions are + * looked up, and loaded if necessary, using load_external_function(). + * + * The point of this is to pass function names as strings across process + * boundaries. We can't pass actual function addresses because of the + * possibility that the function has been loaded at a different address + * in a different process. This is obviously a hazard for functions in + * loadable libraries, but it can happen even for functions in the core code + * on platforms using EXEC_BACKEND (e.g., Windows). + * + * At some point it might be worthwhile to get rid of InternalParallelWorkers[] + * in favor of applying load_external_function() for core functions too; + * but that raises portability issues that are not worth addressing now. + */ +static parallel_worker_main_type +LookupParallelWorkerFunction(const char *libraryname, const char *funcname) +{ + /* + * If the function is to be loaded from postgres itself, search the + * InternalParallelWorkers array. + */ + if (strcmp(libraryname, "postgres") == 0) + { + int i; + + for (i = 0; i < lengthof(InternalParallelWorkers); i++) + { + if (strcmp(InternalParallelWorkers[i].fn_name, funcname) == 0) + return InternalParallelWorkers[i].fn_addr; + } + + /* We can only reach this by programming error. */ + elog(ERROR, "internal function \"%s\" not found", funcname); + } + + /* Otherwise load from external library. 
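+	 * An external entry point must match parallel_worker_main_type, i.e. it
+	 * takes the worker's dsm_segment and shm_toc, just like the call made in
+	 * ParallelWorkerMain above.  A minimal, purely illustrative worker in a
+	 * hypothetical extension (names invented here) would look like
+	 *
+	 *		void
+	 *		my_worker_main(dsm_segment *seg, shm_toc *toc)
+	 *		{
+	 *			... look up leader-provided state in the toc, then do work ...
+	 *		}
+	 *
+	 * and is identified across the process boundary only by its library and
+	 * function name strings.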
*/ + return (parallel_worker_main_type) + load_external_function(libraryname, funcname, true, NULL); +} diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c new file mode 100644 index 0000000..58091f6 --- /dev/null +++ b/src/backend/access/transam/rmgr.c @@ -0,0 +1,38 @@ +/* + * rmgr.c + * + * Resource managers definition + * + * src/backend/access/transam/rmgr.c + */ +#include "postgres.h" + +#include "access/brin_xlog.h" +#include "access/clog.h" +#include "access/commit_ts.h" +#include "access/generic_xlog.h" +#include "access/ginxlog.h" +#include "access/gistxlog.h" +#include "access/hash_xlog.h" +#include "access/heapam_xlog.h" +#include "access/multixact.h" +#include "access/nbtxlog.h" +#include "access/spgxlog.h" +#include "access/xact.h" +#include "access/xlog_internal.h" +#include "catalog/storage_xlog.h" +#include "commands/dbcommands_xlog.h" +#include "commands/sequence.h" +#include "commands/tablespace.h" +#include "replication/message.h" +#include "replication/origin.h" +#include "storage/standby.h" +#include "utils/relmapper.h" + +/* must be kept in sync with RmgrData definition in xlog_internal.h */ +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \ + { name, redo, desc, identify, startup, cleanup, mask }, + +const RmgrData RmgrTable[RM_MAX_ID + 1] = { +#include "access/rmgrlist.h" +}; diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c new file mode 100644 index 0000000..82149ad --- /dev/null +++ b/src/backend/access/transam/slru.c @@ -0,0 +1,1611 @@ +/*------------------------------------------------------------------------- + * + * slru.c + * Simple LRU buffering for transaction status logfiles + * + * We use a simple least-recently-used scheme to manage a pool of page + * buffers. Under ordinary circumstances we expect that write + * traffic will occur mostly to the latest page (and to the just-prior + * page, soon after a page transition). Read traffic will probably touch + * a larger span of pages, but in any case a fairly small number of page + * buffers should be sufficient. So, we just search the buffers using plain + * linear search; there's no need for a hashtable or anything fancy. + * The management algorithm is straight LRU except that we will never swap + * out the latest page (since we know it's going to be hit again eventually). + * + * We use a control LWLock to protect the shared data structures, plus + * per-buffer LWLocks that synchronize I/O for each buffer. The control lock + * must be held to examine or modify any shared state. A process that is + * reading in or writing out a page buffer does not hold the control lock, + * only the per-buffer lock for the buffer it is working on. + * + * "Holding the control lock" means exclusive lock in all cases except for + * SimpleLruReadPage_ReadOnly(); see comments for SlruRecentlyUsed() for + * the implications of that. + * + * When initiating I/O on a buffer, we acquire the per-buffer lock exclusively + * before releasing the control lock. The per-buffer lock is released after + * completing the I/O, re-acquiring the control lock, and updating the shared + * state. (Deadlock is not possible here, because we never try to initiate + * I/O when someone else is already doing I/O on the same buffer.) 
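+ *
+ * In outline, initiating I/O therefore looks like this (an illustrative
+ * sketch only; SimpleLruReadPage and SlruInternalWritePage below are the
+ * real implementations):
+ *
+ *		LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);
+ *		LWLockRelease(shared->ControlLock);
+ *		... perform the physical read or write ...
+ *		LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
+ *		... update page_status and page_dirty for the slot ...
+ *		LWLockRelease(&shared->buffer_locks[slotno].lock);
+ *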
+ * To wait for I/O to complete, release the control lock, acquire the
+ * per-buffer lock in shared mode, immediately release the per-buffer lock,
+ * reacquire the control lock, and then recheck state (since arbitrary things
+ * could have happened while we didn't have the lock).
+ *
+ * As with the regular buffer manager, it is possible for another process
+ * to re-dirty a page that is currently being written out. This is handled
+ * by re-setting the page's page_dirty flag.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/slru.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "access/slru.h"
+#include "access/transam.h"
+#include "access/xlog.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/fd.h"
+#include "storage/shmem.h"
+
+#define SlruFileName(ctl, path, seg) \
+	snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg)
+
+/*
+ * During SimpleLruWriteAll(), we will usually not need to write more than one
+ * or two physical files, but we may need to write several pages per file. We
+ * can consolidate the I/O requests by leaving files open until control returns
+ * to SimpleLruWriteAll(). This data structure remembers which files are open.
+ */
+#define MAX_WRITEALL_BUFFERS	16
+
+typedef struct SlruWriteAllData
+{
+	int			num_files;		/* # files actually open */
+	int			fd[MAX_WRITEALL_BUFFERS];	/* their FD's */
+	int			segno[MAX_WRITEALL_BUFFERS];	/* their log seg#s */
+} SlruWriteAllData;
+
+typedef struct SlruWriteAllData *SlruWriteAll;
+
+/*
+ * Populate a file tag describing a segment file. We only use the segment
+ * number, since we can derive everything else we need by having separate
+ * sync handler functions for clog, multixact etc.
+ */
+#define INIT_SLRUFILETAG(a,xx_handler,xx_segno) \
+( \
+	memset(&(a), 0, sizeof(FileTag)), \
+	(a).handler = (xx_handler), \
+	(a).segno = (xx_segno) \
+)
+
+/*
+ * Macro to mark a buffer slot "most recently used". Note multiple evaluation
+ * of arguments!
+ *
+ * The reason for the if-test is that there are often many consecutive
+ * accesses to the same page (particularly the latest page). By suppressing
+ * useless increments of cur_lru_count, we reduce the probability that old
+ * pages' counts will "wrap around" and make them appear recently used.
+ *
+ * We allow this code to be executed concurrently by multiple processes within
+ * SimpleLruReadPage_ReadOnly(). As long as int reads and writes are atomic,
+ * this should not cause any completely-bogus values to enter the computation.
+ * However, it is possible for either cur_lru_count or individual
+ * page_lru_count entries to be "reset" to lower values than they should have,
+ * in case a process is delayed while it executes this macro. With care in
+ * SlruSelectLRUPage(), this does little harm, and in any case the absolute
+ * worst possible consequence is a nonoptimal choice of page to evict. The
+ * gain from allowing concurrent reads of SLRU pages seems worth it.
+ */ +#define SlruRecentlyUsed(shared, slotno) \ + do { \ + int new_lru_count = (shared)->cur_lru_count; \ + if (new_lru_count != (shared)->page_lru_count[slotno]) { \ + (shared)->cur_lru_count = ++new_lru_count; \ + (shared)->page_lru_count[slotno] = new_lru_count; \ + } \ + } while (0) + +/* Saved info for SlruReportIOError */ +typedef enum +{ + SLRU_OPEN_FAILED, + SLRU_SEEK_FAILED, + SLRU_READ_FAILED, + SLRU_WRITE_FAILED, + SLRU_FSYNC_FAILED, + SLRU_CLOSE_FAILED +} SlruErrorCause; + +static SlruErrorCause slru_errcause; +static int slru_errno; + + +static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno); +static void SimpleLruWaitIO(SlruCtl ctl, int slotno); +static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata); +static bool SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno); +static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, + SlruWriteAll fdata); +static void SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid); +static int SlruSelectLRUPage(SlruCtl ctl, int pageno); + +static bool SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, + int segpage, void *data); +static void SlruInternalDeleteSegment(SlruCtl ctl, int segno); + +/* + * Initialization of shared memory + */ + +Size +SimpleLruShmemSize(int nslots, int nlsns) +{ + Size sz; + + /* we assume nslots isn't so large as to risk overflow */ + sz = MAXALIGN(sizeof(SlruSharedData)); + sz += MAXALIGN(nslots * sizeof(char *)); /* page_buffer[] */ + sz += MAXALIGN(nslots * sizeof(SlruPageStatus)); /* page_status[] */ + sz += MAXALIGN(nslots * sizeof(bool)); /* page_dirty[] */ + sz += MAXALIGN(nslots * sizeof(int)); /* page_number[] */ + sz += MAXALIGN(nslots * sizeof(int)); /* page_lru_count[] */ + sz += MAXALIGN(nslots * sizeof(LWLockPadded)); /* buffer_locks[] */ + + if (nlsns > 0) + sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */ + + return BUFFERALIGN(sz) + BLCKSZ * nslots; +} + +/* + * Initialize, or attach to, a simple LRU cache in shared memory. + * + * ctl: address of local (unshared) control structure. + * name: name of SLRU. (This is user-visible, pick with care!) + * nslots: number of page slots to use. + * nlsns: number of LSN groups per page (set to zero if not relevant). + * ctllock: LWLock to use to control access to the shared control structure. + * subdir: PGDATA-relative subdirectory that will contain the files. + * tranche_id: LWLock tranche ID to use for the SLRU's per-buffer LWLocks. 
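+ * sync_handler: sync.c handler used for this SLRU's fsync requests, or
+ * SYNC_HANDLER_NONE if its files do not need to be fsync'd.
+ *
+ * A minimal illustrative call, using invented names (compare the real
+ * SUBTRANSShmemInit() in subtrans.c):
+ *
+ *		MyCtl->PagePrecedes = MyPagePrecedes;
+ *		SimpleLruInit(MyCtl, "MySlru", NUM_MY_BUFFERS, 0,
+ *					  MySLRULock, "pg_myslru",
+ *					  LWTRANCHE_MY_BUFFER, SYNC_HANDLER_NONE);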
+ */ +void +SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, + LWLock *ctllock, const char *subdir, int tranche_id, + SyncRequestHandler sync_handler) +{ + SlruShared shared; + bool found; + + shared = (SlruShared) ShmemInitStruct(name, + SimpleLruShmemSize(nslots, nlsns), + &found); + + if (!IsUnderPostmaster) + { + /* Initialize locks and shared memory area */ + char *ptr; + Size offset; + int slotno; + + Assert(!found); + + memset(shared, 0, sizeof(SlruSharedData)); + + shared->ControlLock = ctllock; + + shared->num_slots = nslots; + shared->lsn_groups_per_page = nlsns; + + shared->cur_lru_count = 0; + + /* shared->latest_page_number will be set later */ + + shared->slru_stats_idx = pgstat_slru_index(name); + + ptr = (char *) shared; + offset = MAXALIGN(sizeof(SlruSharedData)); + shared->page_buffer = (char **) (ptr + offset); + offset += MAXALIGN(nslots * sizeof(char *)); + shared->page_status = (SlruPageStatus *) (ptr + offset); + offset += MAXALIGN(nslots * sizeof(SlruPageStatus)); + shared->page_dirty = (bool *) (ptr + offset); + offset += MAXALIGN(nslots * sizeof(bool)); + shared->page_number = (int *) (ptr + offset); + offset += MAXALIGN(nslots * sizeof(int)); + shared->page_lru_count = (int *) (ptr + offset); + offset += MAXALIGN(nslots * sizeof(int)); + + /* Initialize LWLocks */ + shared->buffer_locks = (LWLockPadded *) (ptr + offset); + offset += MAXALIGN(nslots * sizeof(LWLockPadded)); + + if (nlsns > 0) + { + shared->group_lsn = (XLogRecPtr *) (ptr + offset); + offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); + } + + ptr += BUFFERALIGN(offset); + for (slotno = 0; slotno < nslots; slotno++) + { + LWLockInitialize(&shared->buffer_locks[slotno].lock, + tranche_id); + + shared->page_buffer[slotno] = ptr; + shared->page_status[slotno] = SLRU_PAGE_EMPTY; + shared->page_dirty[slotno] = false; + shared->page_lru_count[slotno] = 0; + ptr += BLCKSZ; + } + + /* Should fit to estimated shmem size */ + Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns)); + } + else + Assert(found); + + /* + * Initialize the unshared control struct, including directory path. We + * assume caller set PagePrecedes. + */ + ctl->shared = shared; + ctl->sync_handler = sync_handler; + strlcpy(ctl->Dir, subdir, sizeof(ctl->Dir)); +} + +/* + * Initialize (or reinitialize) a page to zeroes. + * + * The page is not actually written, just set up in shared memory. + * The slot number of the new page is returned. + * + * Control lock must be held at entry, and will be held at exit. 
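+ *
+ * A typical bootstrap sequence, with invented names (compare the real
+ * BootStrapSUBTRANS() in subtrans.c), zeroes the first page and then forces
+ * it out to disk:
+ *
+ *		LWLockAcquire(MySLRULock, LW_EXCLUSIVE);
+ *		slotno = SimpleLruZeroPage(MyCtl, 0);
+ *		SimpleLruWritePage(MyCtl, slotno);
+ *		LWLockRelease(MySLRULock);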
+ */ +int +SimpleLruZeroPage(SlruCtl ctl, int pageno) +{ + SlruShared shared = ctl->shared; + int slotno; + + /* Find a suitable buffer slot for the page */ + slotno = SlruSelectLRUPage(ctl, pageno); + Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY || + (shared->page_status[slotno] == SLRU_PAGE_VALID && + !shared->page_dirty[slotno]) || + shared->page_number[slotno] == pageno); + + /* Mark the slot as containing this page */ + shared->page_number[slotno] = pageno; + shared->page_status[slotno] = SLRU_PAGE_VALID; + shared->page_dirty[slotno] = true; + SlruRecentlyUsed(shared, slotno); + + /* Set the buffer to zeroes */ + MemSet(shared->page_buffer[slotno], 0, BLCKSZ); + + /* Set the LSNs for this new page to zero */ + SimpleLruZeroLSNs(ctl, slotno); + + /* Assume this page is now the latest active page */ + shared->latest_page_number = pageno; + + /* update the stats counter of zeroed pages */ + pgstat_count_slru_page_zeroed(shared->slru_stats_idx); + + return slotno; +} + +/* + * Zero all the LSNs we store for this slru page. + * + * This should be called each time we create a new page, and each time we read + * in a page from disk into an existing buffer. (Such an old page cannot + * have any interesting LSNs, since we'd have flushed them before writing + * the page in the first place.) + * + * This assumes that InvalidXLogRecPtr is bitwise-all-0. + */ +static void +SimpleLruZeroLSNs(SlruCtl ctl, int slotno) +{ + SlruShared shared = ctl->shared; + + if (shared->lsn_groups_per_page > 0) + MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0, + shared->lsn_groups_per_page * sizeof(XLogRecPtr)); +} + +/* + * Wait for any active I/O on a page slot to finish. (This does not + * guarantee that new I/O hasn't been started before we return, though. + * In fact the slot might not even contain the same page anymore.) + * + * Control lock must be held at entry, and will be held at exit. + */ +static void +SimpleLruWaitIO(SlruCtl ctl, int slotno) +{ + SlruShared shared = ctl->shared; + + /* See notes at top of file */ + LWLockRelease(shared->ControlLock); + LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED); + LWLockRelease(&shared->buffer_locks[slotno].lock); + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); + + /* + * If the slot is still in an io-in-progress state, then either someone + * already started a new I/O on the slot, or a previous I/O failed and + * neglected to reset the page state. That shouldn't happen, really, but + * it seems worth a few extra cycles to check and recover from it. We can + * cheaply test for failure by seeing if the buffer lock is still held (we + * assume that transaction abort would release the lock). + */ + if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS || + shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS) + { + if (LWLockConditionalAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED)) + { + /* indeed, the I/O must have failed */ + if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS) + shared->page_status[slotno] = SLRU_PAGE_EMPTY; + else /* write_in_progress */ + { + shared->page_status[slotno] = SLRU_PAGE_VALID; + shared->page_dirty[slotno] = true; + } + LWLockRelease(&shared->buffer_locks[slotno].lock); + } + } +} + +/* + * Find a page in a shared buffer, reading it in if necessary. + * The page number must correspond to an already-initialized page. 
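+ *
+ * A typical read-modify-write caller, with invented names (compare the real
+ * SubTransSetParent() in subtrans.c), looks like this:
+ *
+ *		LWLockAcquire(MySLRULock, LW_EXCLUSIVE);
+ *		slotno = SimpleLruReadPage(MyCtl, pageno, true, xid);
+ *		entries = (MyEntry *) MyCtl->shared->page_buffer[slotno];
+ *		entries[entryno] = newvalue;
+ *		MyCtl->shared->page_dirty[slotno] = true;
+ *		LWLockRelease(MySLRULock);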
+ * + * If write_ok is true then it is OK to return a page that is in + * WRITE_IN_PROGRESS state; it is the caller's responsibility to be sure + * that modification of the page is safe. If write_ok is false then we + * will not return the page until it is not undergoing active I/O. + * + * The passed-in xid is used only for error reporting, and may be + * InvalidTransactionId if no specific xid is associated with the action. + * + * Return value is the shared-buffer slot number now holding the page. + * The buffer's LRU access info is updated. + * + * Control lock must be held at entry, and will be held at exit. + */ +int +SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok, + TransactionId xid) +{ + SlruShared shared = ctl->shared; + + /* Outer loop handles restart if we must wait for someone else's I/O */ + for (;;) + { + int slotno; + bool ok; + + /* See if page already is in memory; if not, pick victim slot */ + slotno = SlruSelectLRUPage(ctl, pageno); + + /* Did we find the page in memory? */ + if (shared->page_number[slotno] == pageno && + shared->page_status[slotno] != SLRU_PAGE_EMPTY) + { + /* + * If page is still being read in, we must wait for I/O. Likewise + * if the page is being written and the caller said that's not OK. + */ + if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS || + (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS && + !write_ok)) + { + SimpleLruWaitIO(ctl, slotno); + /* Now we must recheck state from the top */ + continue; + } + /* Otherwise, it's ready to use */ + SlruRecentlyUsed(shared, slotno); + + /* update the stats counter of pages found in the SLRU */ + pgstat_count_slru_page_hit(shared->slru_stats_idx); + + return slotno; + } + + /* We found no match; assert we selected a freeable slot */ + Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY || + (shared->page_status[slotno] == SLRU_PAGE_VALID && + !shared->page_dirty[slotno])); + + /* Mark the slot read-busy */ + shared->page_number[slotno] = pageno; + shared->page_status[slotno] = SLRU_PAGE_READ_IN_PROGRESS; + shared->page_dirty[slotno] = false; + + /* Acquire per-buffer lock (cannot deadlock, see notes at top) */ + LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE); + + /* Release control lock while doing I/O */ + LWLockRelease(shared->ControlLock); + + /* Do the read */ + ok = SlruPhysicalReadPage(ctl, pageno, slotno); + + /* Set the LSNs for this newly read-in page to zero */ + SimpleLruZeroLSNs(ctl, slotno); + + /* Re-acquire control lock and update page state */ + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); + + Assert(shared->page_number[slotno] == pageno && + shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS && + !shared->page_dirty[slotno]); + + shared->page_status[slotno] = ok ? SLRU_PAGE_VALID : SLRU_PAGE_EMPTY; + + LWLockRelease(&shared->buffer_locks[slotno].lock); + + /* Now it's okay to ereport if we failed */ + if (!ok) + SlruReportIOError(ctl, pageno, xid); + + SlruRecentlyUsed(shared, slotno); + + /* update the stats counter of pages not found in SLRU */ + pgstat_count_slru_page_read(shared->slru_stats_idx); + + return slotno; + } +} + +/* + * Find a page in a shared buffer, reading it in if necessary. + * The page number must correspond to an already-initialized page. + * The caller must intend only read-only access to the page. + * + * The passed-in xid is used only for error reporting, and may be + * InvalidTransactionId if no specific xid is associated with the action. 
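+ *
+ * A typical read-only caller, with invented names (compare the real
+ * SubTransGetParent() in subtrans.c), lets this function take the control
+ * lock and releases it itself afterwards:
+ *
+ *		slotno = SimpleLruReadPage_ReadOnly(MyCtl, pageno, xid);
+ *		value = ((MyEntry *) MyCtl->shared->page_buffer[slotno])[entryno];
+ *		LWLockRelease(MySLRULock);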
+ * + * Return value is the shared-buffer slot number now holding the page. + * The buffer's LRU access info is updated. + * + * Control lock must NOT be held at entry, but will be held at exit. + * It is unspecified whether the lock will be shared or exclusive. + */ +int +SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid) +{ + SlruShared shared = ctl->shared; + int slotno; + + /* Try to find the page while holding only shared lock */ + LWLockAcquire(shared->ControlLock, LW_SHARED); + + /* See if page is already in a buffer */ + for (slotno = 0; slotno < shared->num_slots; slotno++) + { + if (shared->page_number[slotno] == pageno && + shared->page_status[slotno] != SLRU_PAGE_EMPTY && + shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS) + { + /* See comments for SlruRecentlyUsed macro */ + SlruRecentlyUsed(shared, slotno); + + /* update the stats counter of pages found in the SLRU */ + pgstat_count_slru_page_hit(shared->slru_stats_idx); + + return slotno; + } + } + + /* No luck, so switch to normal exclusive lock and do regular read */ + LWLockRelease(shared->ControlLock); + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); + + return SimpleLruReadPage(ctl, pageno, true, xid); +} + +/* + * Write a page from a shared buffer, if necessary. + * Does nothing if the specified slot is not dirty. + * + * NOTE: only one write attempt is made here. Hence, it is possible that + * the page is still dirty at exit (if someone else re-dirtied it during + * the write). However, we *do* attempt a fresh write even if the page + * is already being written; this is for checkpoints. + * + * Control lock must be held at entry, and will be held at exit. + */ +static void +SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata) +{ + SlruShared shared = ctl->shared; + int pageno = shared->page_number[slotno]; + bool ok; + + /* If a write is in progress, wait for it to finish */ + while (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS && + shared->page_number[slotno] == pageno) + { + SimpleLruWaitIO(ctl, slotno); + } + + /* + * Do nothing if page is not dirty, or if buffer no longer contains the + * same page we were called for. + */ + if (!shared->page_dirty[slotno] || + shared->page_status[slotno] != SLRU_PAGE_VALID || + shared->page_number[slotno] != pageno) + return; + + /* + * Mark the slot write-busy, and clear the dirtybit. After this point, a + * transaction status update on this page will mark it dirty again. 
+ */ + shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS; + shared->page_dirty[slotno] = false; + + /* Acquire per-buffer lock (cannot deadlock, see notes at top) */ + LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE); + + /* Release control lock while doing I/O */ + LWLockRelease(shared->ControlLock); + + /* Do the write */ + ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata); + + /* If we failed, and we're in a flush, better close the files */ + if (!ok && fdata) + { + int i; + + for (i = 0; i < fdata->num_files; i++) + CloseTransientFile(fdata->fd[i]); + } + + /* Re-acquire control lock and update page state */ + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); + + Assert(shared->page_number[slotno] == pageno && + shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS); + + /* If we failed to write, mark the page dirty again */ + if (!ok) + shared->page_dirty[slotno] = true; + + shared->page_status[slotno] = SLRU_PAGE_VALID; + + LWLockRelease(&shared->buffer_locks[slotno].lock); + + /* Now it's okay to ereport if we failed */ + if (!ok) + SlruReportIOError(ctl, pageno, InvalidTransactionId); + + /* If part of a checkpoint, count this as a buffer written. */ + if (fdata) + CheckpointStats.ckpt_bufs_written++; +} + +/* + * Wrapper of SlruInternalWritePage, for external callers. + * fdata is always passed a NULL here. + */ +void +SimpleLruWritePage(SlruCtl ctl, int slotno) +{ + SlruInternalWritePage(ctl, slotno, NULL); +} + +/* + * Return whether the given page exists on disk. + * + * A false return means that either the file does not exist, or that it's not + * large enough to contain the given page. + */ +bool +SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno) +{ + int segno = pageno / SLRU_PAGES_PER_SEGMENT; + int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; + int offset = rpageno * BLCKSZ; + char path[MAXPGPATH]; + int fd; + bool result; + off_t endpos; + + /* update the stats counter of checked pages */ + pgstat_count_slru_page_exists(ctl->shared->slru_stats_idx); + + SlruFileName(ctl, path, segno); + + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + if (fd < 0) + { + /* expected: file doesn't exist */ + if (errno == ENOENT) + return false; + + /* report error normally */ + slru_errcause = SLRU_OPEN_FAILED; + slru_errno = errno; + SlruReportIOError(ctl, pageno, 0); + } + + if ((endpos = lseek(fd, 0, SEEK_END)) < 0) + { + slru_errcause = SLRU_SEEK_FAILED; + slru_errno = errno; + SlruReportIOError(ctl, pageno, 0); + } + + result = endpos >= (off_t) (offset + BLCKSZ); + + if (CloseTransientFile(fd) != 0) + { + slru_errcause = SLRU_CLOSE_FAILED; + slru_errno = errno; + return false; + } + + return result; +} + +/* + * Physical read of a (previously existing) page into a buffer slot + * + * On failure, we cannot just ereport(ERROR) since caller has put state in + * shared memory that must be undone. So, we return false and save enough + * info in static variables to let SlruReportIOError make the report. + * + * For now, assume it's not worth keeping a file pointer open across + * read/write operations. We could cache one virtual file pointer ... 
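+ *
+ * The on-disk layout is simple: page "pageno" lives in segment file number
+ * pageno / SLRU_PAGES_PER_SEGMENT (named by SlruFileName as hex digits) at
+ * byte offset (pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ.  For example,
+ * taking the usual 32 pages per segment and the default 8192-byte BLCKSZ,
+ * page 131 would be read from offset 3 * 8192 = 24576 of file "0004".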
+ */ +static bool +SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) +{ + SlruShared shared = ctl->shared; + int segno = pageno / SLRU_PAGES_PER_SEGMENT; + int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; + off_t offset = rpageno * BLCKSZ; + char path[MAXPGPATH]; + int fd; + + SlruFileName(ctl, path, segno); + + /* + * In a crash-and-restart situation, it's possible for us to receive + * commands to set the commit status of transactions whose bits are in + * already-truncated segments of the commit log (see notes in + * SlruPhysicalWritePage). Hence, if we are InRecovery, allow the case + * where the file doesn't exist, and return zeroes instead. + */ + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + if (fd < 0) + { + if (errno != ENOENT || !InRecovery) + { + slru_errcause = SLRU_OPEN_FAILED; + slru_errno = errno; + return false; + } + + ereport(LOG, + (errmsg("file \"%s\" doesn't exist, reading as zeroes", + path))); + MemSet(shared->page_buffer[slotno], 0, BLCKSZ); + return true; + } + + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_SLRU_READ); + if (pg_pread(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ) + { + pgstat_report_wait_end(); + slru_errcause = SLRU_READ_FAILED; + slru_errno = errno; + CloseTransientFile(fd); + return false; + } + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + { + slru_errcause = SLRU_CLOSE_FAILED; + slru_errno = errno; + return false; + } + + return true; +} + +/* + * Physical write of a page from a buffer slot + * + * On failure, we cannot just ereport(ERROR) since caller has put state in + * shared memory that must be undone. So, we return false and save enough + * info in static variables to let SlruReportIOError make the report. + * + * For now, assume it's not worth keeping a file pointer open across + * independent read/write operations. We do batch operations during + * SimpleLruWriteAll, though. + * + * fdata is NULL for a standalone write, pointer to open-file info during + * SimpleLruWriteAll. + */ +static bool +SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruWriteAll fdata) +{ + SlruShared shared = ctl->shared; + int segno = pageno / SLRU_PAGES_PER_SEGMENT; + int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; + off_t offset = rpageno * BLCKSZ; + char path[MAXPGPATH]; + int fd = -1; + + /* update the stats counter of written pages */ + pgstat_count_slru_page_written(shared->slru_stats_idx); + + /* + * Honor the write-WAL-before-data rule, if appropriate, so that we do not + * write out data before associated WAL records. This is the same action + * performed during FlushBuffer() in the main buffer manager. + */ + if (shared->group_lsn != NULL) + { + /* + * We must determine the largest async-commit LSN for the page. This + * is a bit tedious, but since this entire function is a slow path + * anyway, it seems better to do this here than to maintain a per-page + * LSN variable (which'd need an extra comparison in the + * transaction-commit path). + */ + XLogRecPtr max_lsn; + int lsnindex, + lsnoff; + + lsnindex = slotno * shared->lsn_groups_per_page; + max_lsn = shared->group_lsn[lsnindex++]; + for (lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++) + { + XLogRecPtr this_lsn = shared->group_lsn[lsnindex++]; + + if (max_lsn < this_lsn) + max_lsn = this_lsn; + } + + if (!XLogRecPtrIsInvalid(max_lsn)) + { + /* + * As noted above, elog(ERROR) is not acceptable here, so if + * XLogFlush were to fail, we must PANIC. 
This isn't much of a + * restriction because XLogFlush is just about all critical + * section anyway, but let's make sure. + */ + START_CRIT_SECTION(); + XLogFlush(max_lsn); + END_CRIT_SECTION(); + } + } + + /* + * During a WriteAll, we may already have the desired file open. + */ + if (fdata) + { + int i; + + for (i = 0; i < fdata->num_files; i++) + { + if (fdata->segno[i] == segno) + { + fd = fdata->fd[i]; + break; + } + } + } + + if (fd < 0) + { + /* + * If the file doesn't already exist, we should create it. It is + * possible for this to need to happen when writing a page that's not + * first in its segment; we assume the OS can cope with that. (Note: + * it might seem that it'd be okay to create files only when + * SimpleLruZeroPage is called for the first page of a segment. + * However, if after a crash and restart the REDO logic elects to + * replay the log from a checkpoint before the latest one, then it's + * possible that we will get commands to set transaction status of + * transactions that have already been truncated from the commit log. + * Easiest way to deal with that is to accept references to + * nonexistent files here and in SlruPhysicalReadPage.) + * + * Note: it is possible for more than one backend to be executing this + * code simultaneously for different pages of the same file. Hence, + * don't use O_EXCL or O_TRUNC or anything like that. + */ + SlruFileName(ctl, path, segno); + fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY); + if (fd < 0) + { + slru_errcause = SLRU_OPEN_FAILED; + slru_errno = errno; + return false; + } + + if (fdata) + { + if (fdata->num_files < MAX_WRITEALL_BUFFERS) + { + fdata->fd[fdata->num_files] = fd; + fdata->segno[fdata->num_files] = segno; + fdata->num_files++; + } + else + { + /* + * In the unlikely event that we exceed MAX_FLUSH_BUFFERS, + * fall back to treating it as a standalone write. + */ + fdata = NULL; + } + } + } + + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE); + if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ) + { + pgstat_report_wait_end(); + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + slru_errcause = SLRU_WRITE_FAILED; + slru_errno = errno; + if (!fdata) + CloseTransientFile(fd); + return false; + } + pgstat_report_wait_end(); + + /* Queue up a sync request for the checkpointer. */ + if (ctl->sync_handler != SYNC_HANDLER_NONE) + { + FileTag tag; + + INIT_SLRUFILETAG(tag, ctl->sync_handler, segno); + if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false)) + { + /* No space to enqueue sync request. Do it synchronously. */ + pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC); + if (pg_fsync(fd) != 0) + { + pgstat_report_wait_end(); + slru_errcause = SLRU_FSYNC_FAILED; + slru_errno = errno; + CloseTransientFile(fd); + return false; + } + pgstat_report_wait_end(); + } + } + + /* Close file, unless part of flush request. */ + if (!fdata) + { + if (CloseTransientFile(fd) != 0) + { + slru_errcause = SLRU_CLOSE_FAILED; + slru_errno = errno; + return false; + } + } + + return true; +} + +/* + * Issue the error message after failure of SlruPhysicalReadPage or + * SlruPhysicalWritePage. Call this after cleaning up shared-memory state. 
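+ *
+ * The physical I/O routines follow this convention (illustrative outline):
+ *
+ *		slru_errcause = SLRU_READ_FAILED;	(or another SlruErrorCause)
+ *		slru_errno = errno;
+ *		return false;
+ *
+ * and the caller, once shared-memory state is consistent again, calls
+ * SlruReportIOError(ctl, pageno, xid) to raise the error.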
+ */ +static void +SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid) +{ + int segno = pageno / SLRU_PAGES_PER_SEGMENT; + int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; + int offset = rpageno * BLCKSZ; + char path[MAXPGPATH]; + + SlruFileName(ctl, path, segno); + errno = slru_errno; + switch (slru_errcause) + { + case SLRU_OPEN_FAILED: + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not access status of transaction %u", xid), + errdetail("Could not open file \"%s\": %m.", path))); + break; + case SLRU_SEEK_FAILED: + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not access status of transaction %u", xid), + errdetail("Could not seek in file \"%s\" to offset %u: %m.", + path, offset))); + break; + case SLRU_READ_FAILED: + if (errno) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not access status of transaction %u", xid), + errdetail("Could not read from file \"%s\" at offset %u: %m.", + path, offset))); + else + ereport(ERROR, + (errmsg("could not access status of transaction %u", xid), + errdetail("Could not read from file \"%s\" at offset %u: read too few bytes.", path, offset))); + break; + case SLRU_WRITE_FAILED: + if (errno) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not access status of transaction %u", xid), + errdetail("Could not write to file \"%s\" at offset %u: %m.", + path, offset))); + else + ereport(ERROR, + (errmsg("could not access status of transaction %u", xid), + errdetail("Could not write to file \"%s\" at offset %u: wrote too few bytes.", + path, offset))); + break; + case SLRU_FSYNC_FAILED: + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not access status of transaction %u", xid), + errdetail("Could not fsync file \"%s\": %m.", + path))); + break; + case SLRU_CLOSE_FAILED: + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not access status of transaction %u", xid), + errdetail("Could not close file \"%s\": %m.", + path))); + break; + default: + /* can't get here, we trust */ + elog(ERROR, "unrecognized SimpleLru error cause: %d", + (int) slru_errcause); + break; + } +} + +/* + * Select the slot to re-use when we need a free slot. + * + * The target page number is passed because we need to consider the + * possibility that some other process reads in the target page while + * we are doing I/O to free a slot. Hence, check or recheck to see if + * any slot already holds the target page, and return that slot if so. + * Thus, the returned slot is *either* a slot already holding the pageno + * (could be any state except EMPTY), *or* a freeable slot (state EMPTY + * or CLEAN). + * + * Control lock must be held at entry, and will be held at exit. + */ +static int +SlruSelectLRUPage(SlruCtl ctl, int pageno) +{ + SlruShared shared = ctl->shared; + + /* Outer loop handles restart after I/O */ + for (;;) + { + int slotno; + int cur_count; + int bestvalidslot = 0; /* keep compiler quiet */ + int best_valid_delta = -1; + int best_valid_page_number = 0; /* keep compiler quiet */ + int bestinvalidslot = 0; /* keep compiler quiet */ + int best_invalid_delta = -1; + int best_invalid_page_number = 0; /* keep compiler quiet */ + + /* See if page already has a buffer assigned */ + for (slotno = 0; slotno < shared->num_slots; slotno++) + { + if (shared->page_number[slotno] == pageno && + shared->page_status[slotno] != SLRU_PAGE_EMPTY) + return slotno; + } + + /* + * If we find any EMPTY slot, just select that one. Else choose a + * victim page to replace. 
We normally take the least recently used + * valid page, but we will never take the slot containing + * latest_page_number, even if it appears least recently used. We + * will select a slot that is already I/O busy only if there is no + * other choice: a read-busy slot will not be least recently used once + * the read finishes, and waiting for an I/O on a write-busy slot is + * inferior to just picking some other slot. Testing shows the slot + * we pick instead will often be clean, allowing us to begin a read at + * once. + * + * Normally the page_lru_count values will all be different and so + * there will be a well-defined LRU page. But since we allow + * concurrent execution of SlruRecentlyUsed() within + * SimpleLruReadPage_ReadOnly(), it is possible that multiple pages + * acquire the same lru_count values. In that case we break ties by + * choosing the furthest-back page. + * + * Notice that this next line forcibly advances cur_lru_count to a + * value that is certainly beyond any value that will be in the + * page_lru_count array after the loop finishes. This ensures that + * the next execution of SlruRecentlyUsed will mark the page newly + * used, even if it's for a page that has the current counter value. + * That gets us back on the path to having good data when there are + * multiple pages with the same lru_count. + */ + cur_count = (shared->cur_lru_count)++; + for (slotno = 0; slotno < shared->num_slots; slotno++) + { + int this_delta; + int this_page_number; + + if (shared->page_status[slotno] == SLRU_PAGE_EMPTY) + return slotno; + this_delta = cur_count - shared->page_lru_count[slotno]; + if (this_delta < 0) + { + /* + * Clean up in case shared updates have caused cur_count + * increments to get "lost". We back off the page counts, + * rather than trying to increase cur_count, to avoid any + * question of infinite loops or failure in the presence of + * wrapped-around counts. + */ + shared->page_lru_count[slotno] = cur_count; + this_delta = 0; + } + this_page_number = shared->page_number[slotno]; + if (this_page_number == shared->latest_page_number) + continue; + if (shared->page_status[slotno] == SLRU_PAGE_VALID) + { + if (this_delta > best_valid_delta || + (this_delta == best_valid_delta && + ctl->PagePrecedes(this_page_number, + best_valid_page_number))) + { + bestvalidslot = slotno; + best_valid_delta = this_delta; + best_valid_page_number = this_page_number; + } + } + else + { + if (this_delta > best_invalid_delta || + (this_delta == best_invalid_delta && + ctl->PagePrecedes(this_page_number, + best_invalid_page_number))) + { + bestinvalidslot = slotno; + best_invalid_delta = this_delta; + best_invalid_page_number = this_page_number; + } + } + } + + /* + * If all pages (except possibly the latest one) are I/O busy, we'll + * have to wait for an I/O to complete and then retry. In that + * unhappy case, we choose to wait for the I/O on the least recently + * used slot, on the assumption that it was likely initiated first of + * all the I/Os in progress and may therefore finish first. + */ + if (best_valid_delta < 0) + { + SimpleLruWaitIO(ctl, bestinvalidslot); + continue; + } + + /* + * If the selected page is clean, we're set. + */ + if (!shared->page_dirty[bestvalidslot]) + return bestvalidslot; + + /* + * Write the page. + */ + SlruInternalWritePage(ctl, bestvalidslot, NULL); + + /* + * Now loop back and try again. This is the easiest way of dealing + * with corner cases such as the victim page being re-dirtied while we + * wrote it. 
+ */ + } +} + +/* + * Write dirty pages to disk during checkpoint or database shutdown. Flushing + * is deferred until the next call to ProcessSyncRequests(), though we do fsync + * the containing directory here to make sure that newly created directory + * entries are on disk. + */ +void +SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied) +{ + SlruShared shared = ctl->shared; + SlruWriteAllData fdata; + int slotno; + int pageno = 0; + int i; + bool ok; + + /* update the stats counter of flushes */ + pgstat_count_slru_flush(shared->slru_stats_idx); + + /* + * Find and write dirty pages + */ + fdata.num_files = 0; + + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); + + for (slotno = 0; slotno < shared->num_slots; slotno++) + { + SlruInternalWritePage(ctl, slotno, &fdata); + + /* + * In some places (e.g. checkpoints), we cannot assert that the slot + * is clean now, since another process might have re-dirtied it + * already. That's okay. + */ + Assert(allow_redirtied || + shared->page_status[slotno] == SLRU_PAGE_EMPTY || + (shared->page_status[slotno] == SLRU_PAGE_VALID && + !shared->page_dirty[slotno])); + } + + LWLockRelease(shared->ControlLock); + + /* + * Now close any files that were open + */ + ok = true; + for (i = 0; i < fdata.num_files; i++) + { + if (CloseTransientFile(fdata.fd[i]) != 0) + { + slru_errcause = SLRU_CLOSE_FAILED; + slru_errno = errno; + pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT; + ok = false; + } + } + if (!ok) + SlruReportIOError(ctl, pageno, InvalidTransactionId); + + /* Ensure that directory entries for new files are on disk. */ + if (ctl->sync_handler != SYNC_HANDLER_NONE) + fsync_fname(ctl->Dir, true); +} + +/* + * Remove all segments before the one holding the passed page number + * + * All SLRUs prevent concurrent calls to this function, either with an LWLock + * or by calling it only as part of a checkpoint. Mutual exclusion must begin + * before computing cutoffPage. Mutual exclusion must end after any limit + * update that would permit other backends to write fresh data into the + * segment immediately preceding the one containing cutoffPage. Otherwise, + * when the SLRU is quite full, SimpleLruTruncate() might delete that segment + * after it has accrued freshly-written data. + */ +void +SimpleLruTruncate(SlruCtl ctl, int cutoffPage) +{ + SlruShared shared = ctl->shared; + int slotno; + + /* update the stats counter of truncates */ + pgstat_count_slru_truncate(shared->slru_stats_idx); + + /* + * Scan shared memory and remove any pages preceding the cutoff page, to + * ensure we won't rewrite them later. (Since this is normally called in + * or just after a checkpoint, any dirty pages should have been flushed + * already ... we're just being extra careful here.) + */ + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); + +restart:; + + /* + * While we are holding the lock, make an important safety check: the + * current endpoint page must not be eligible for removal. + */ + if (ctl->PagePrecedes(shared->latest_page_number, cutoffPage)) + { + LWLockRelease(shared->ControlLock); + ereport(LOG, + (errmsg("could not truncate directory \"%s\": apparent wraparound", + ctl->Dir))); + return; + } + + for (slotno = 0; slotno < shared->num_slots; slotno++) + { + if (shared->page_status[slotno] == SLRU_PAGE_EMPTY) + continue; + if (!ctl->PagePrecedes(shared->page_number[slotno], cutoffPage)) + continue; + + /* + * If page is clean, just change state to EMPTY (expected case). 
+ */ + if (shared->page_status[slotno] == SLRU_PAGE_VALID && + !shared->page_dirty[slotno]) + { + shared->page_status[slotno] = SLRU_PAGE_EMPTY; + continue; + } + + /* + * Hmm, we have (or may have) I/O operations acting on the page, so + * we've got to wait for them to finish and then start again. This is + * the same logic as in SlruSelectLRUPage. (XXX if page is dirty, + * wouldn't it be OK to just discard it without writing it? + * SlruMayDeleteSegment() uses a stricter qualification, so we might + * not delete this page in the end; even if we don't delete it, we + * won't have cause to read its data again. For now, keep the logic + * the same as it was.) + */ + if (shared->page_status[slotno] == SLRU_PAGE_VALID) + SlruInternalWritePage(ctl, slotno, NULL); + else + SimpleLruWaitIO(ctl, slotno); + goto restart; + } + + LWLockRelease(shared->ControlLock); + + /* Now we can remove the old segment(s) */ + (void) SlruScanDirectory(ctl, SlruScanDirCbDeleteCutoff, &cutoffPage); +} + +/* + * Delete an individual SLRU segment. + * + * NB: This does not touch the SLRU buffers themselves, callers have to ensure + * they either can't yet contain anything, or have already been cleaned out. + */ +static void +SlruInternalDeleteSegment(SlruCtl ctl, int segno) +{ + char path[MAXPGPATH]; + + /* Forget any fsync requests queued for this segment. */ + if (ctl->sync_handler != SYNC_HANDLER_NONE) + { + FileTag tag; + + INIT_SLRUFILETAG(tag, ctl->sync_handler, segno); + RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true); + } + + /* Unlink the file. */ + SlruFileName(ctl, path, segno); + ereport(DEBUG2, (errmsg_internal("removing file \"%s\"", path))); + unlink(path); +} + +/* + * Delete an individual SLRU segment, identified by the segment number. + */ +void +SlruDeleteSegment(SlruCtl ctl, int segno) +{ + SlruShared shared = ctl->shared; + int slotno; + bool did_write; + + /* Clean out any possibly existing references to the segment. */ + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); +restart: + did_write = false; + for (slotno = 0; slotno < shared->num_slots; slotno++) + { + int pagesegno = shared->page_number[slotno] / SLRU_PAGES_PER_SEGMENT; + + if (shared->page_status[slotno] == SLRU_PAGE_EMPTY) + continue; + + /* not the segment we're looking for */ + if (pagesegno != segno) + continue; + + /* If page is clean, just change state to EMPTY (expected case). */ + if (shared->page_status[slotno] == SLRU_PAGE_VALID && + !shared->page_dirty[slotno]) + { + shared->page_status[slotno] = SLRU_PAGE_EMPTY; + continue; + } + + /* Same logic as SimpleLruTruncate() */ + if (shared->page_status[slotno] == SLRU_PAGE_VALID) + SlruInternalWritePage(ctl, slotno, NULL); + else + SimpleLruWaitIO(ctl, slotno); + + did_write = true; + } + + /* + * Be extra careful and re-check. The IO functions release the control + * lock, so new pages could have been read in. + */ + if (did_write) + goto restart; + + SlruInternalDeleteSegment(ctl, segno); + + LWLockRelease(shared->ControlLock); +} + +/* + * Determine whether a segment is okay to delete. + * + * segpage is the first page of the segment, and cutoffPage is the oldest (in + * PagePrecedes order) page in the SLRU containing still-useful data. 
Since
+ * every core PagePrecedes callback implements "wrap around", check the
+ * segment's first and last pages:
+ *
+ *	first<cutoff  && last<cutoff:  yes
+ *	first<cutoff  && last>=cutoff: no; cutoff falls inside this segment
+ *	first>=cutoff && last<cutoff:  no; wrap point falls inside this segment
+ *	first>=cutoff && last>=cutoff: no; every page of this segment is too young
+ */
+static bool
+SlruMayDeleteSegment(SlruCtl ctl, int segpage, int cutoffPage)
+{
+	int			seg_last_page = segpage + SLRU_PAGES_PER_SEGMENT - 1;
+
+	Assert(segpage % SLRU_PAGES_PER_SEGMENT == 0);
+
+	return (ctl->PagePrecedes(segpage, cutoffPage) &&
+			ctl->PagePrecedes(seg_last_page, cutoffPage));
+}
+
+#ifdef USE_ASSERT_CHECKING
+static void
+SlruPagePrecedesTestOffset(SlruCtl ctl, int per_page, uint32 offset)
+{
+	TransactionId lhs,
+				rhs;
+	int			newestPage,
+				oldestPage;
+	TransactionId newestXact,
+				oldestXact;
+
+	/*
+	 * Compare an XID pair having undefined order (see RFC 1982), a pair at
+	 * "opposite ends" of the XID space. TransactionIdPrecedes() treats each
+	 * as preceding the other. If RHS is oldestXact, LHS is the first XID we
+	 * must not assign.
+	 */
+	lhs = per_page + offset;	/* skip first page to avoid non-normal XIDs */
+	rhs = lhs + (1U << 31);
+	Assert(TransactionIdPrecedes(lhs, rhs));
+	Assert(TransactionIdPrecedes(rhs, lhs));
+	Assert(!TransactionIdPrecedes(lhs - 1, rhs));
+	Assert(TransactionIdPrecedes(rhs, lhs - 1));
+	Assert(TransactionIdPrecedes(lhs + 1, rhs));
+	Assert(!TransactionIdPrecedes(rhs, lhs + 1));
+	Assert(!TransactionIdFollowsOrEquals(lhs, rhs));
+	Assert(!TransactionIdFollowsOrEquals(rhs, lhs));
+	Assert(!ctl->PagePrecedes(lhs / per_page, lhs / per_page));
+	Assert(!ctl->PagePrecedes(lhs / per_page, rhs / per_page));
+	Assert(!ctl->PagePrecedes(rhs / per_page, lhs / per_page));
+	Assert(!ctl->PagePrecedes((lhs - per_page) / per_page, rhs / per_page));
+	Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 3 * per_page) / per_page));
+	Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 2 * per_page) / per_page));
+	Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 1 * per_page) / per_page)
+		   || (1U << 31) % per_page != 0);	/* See CommitTsPagePrecedes() */
+	Assert(ctl->PagePrecedes((lhs + 1 * per_page) / per_page, rhs / per_page)
+		   || (1U << 31) % per_page != 0);
+	Assert(ctl->PagePrecedes((lhs + 2 * per_page) / per_page, rhs / per_page));
+	Assert(ctl->PagePrecedes((lhs + 3 * per_page) / per_page, rhs / per_page));
+	Assert(!ctl->PagePrecedes(rhs / per_page, (lhs + per_page) / per_page));
+
+	/*
+	 * GetNewTransactionId() has assigned the last XID it can safely use, and
+	 * that XID is in the *LAST* page of the second segment. We must not
+	 * delete that segment.
+	 */
+	newestPage = 2 * SLRU_PAGES_PER_SEGMENT - 1;
+	newestXact = newestPage * per_page + offset;
+	Assert(newestXact / per_page == newestPage);
+	oldestXact = newestXact + 1;
+	oldestXact -= 1U << 31;
+	oldestPage = oldestXact / per_page;
+	Assert(!SlruMayDeleteSegment(ctl,
+								 (newestPage -
+								  newestPage % SLRU_PAGES_PER_SEGMENT),
+								 oldestPage));
+
+	/*
+	 * GetNewTransactionId() has assigned the last XID it can safely use, and
+	 * that XID is in the *FIRST* page of the second segment. We must not
+	 * delete that segment.
+	 */
+	newestPage = SLRU_PAGES_PER_SEGMENT;
+	newestXact = newestPage * per_page + offset;
+	Assert(newestXact / per_page == newestPage);
+	oldestXact = newestXact + 1;
+	oldestXact -= 1U << 31;
+	oldestPage = oldestXact / per_page;
+	Assert(!SlruMayDeleteSegment(ctl,
+								 (newestPage -
+								  newestPage % SLRU_PAGES_PER_SEGMENT),
+								 oldestPage));
+}
+
+/*
+ * Unit-test a PagePrecedes function.
+ * + * This assumes every uint32 >= FirstNormalTransactionId is a valid key. It + * assumes each value occupies a contiguous, fixed-size region of SLRU bytes. + * (MultiXactMemberCtl separates flags from XIDs. AsyncCtl has + * variable-length entries, no keys, and no random access. These unit tests + * do not apply to them.) + */ +void +SlruPagePrecedesUnitTests(SlruCtl ctl, int per_page) +{ + /* Test first, middle and last entries of a page. */ + SlruPagePrecedesTestOffset(ctl, per_page, 0); + SlruPagePrecedesTestOffset(ctl, per_page, per_page / 2); + SlruPagePrecedesTestOffset(ctl, per_page, per_page - 1); +} +#endif + +/* + * SlruScanDirectory callback + * This callback reports true if there's any segment wholly prior to the + * one containing the page passed as "data". + */ +bool +SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, int segpage, void *data) +{ + int cutoffPage = *(int *) data; + + if (SlruMayDeleteSegment(ctl, segpage, cutoffPage)) + return true; /* found one; don't iterate any more */ + + return false; /* keep going */ +} + +/* + * SlruScanDirectory callback. + * This callback deletes segments prior to the one passed in as "data". + */ +static bool +SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int segpage, void *data) +{ + int cutoffPage = *(int *) data; + + if (SlruMayDeleteSegment(ctl, segpage, cutoffPage)) + SlruInternalDeleteSegment(ctl, segpage / SLRU_PAGES_PER_SEGMENT); + + return false; /* keep going */ +} + +/* + * SlruScanDirectory callback. + * This callback deletes all segments. + */ +bool +SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int segpage, void *data) +{ + SlruInternalDeleteSegment(ctl, segpage / SLRU_PAGES_PER_SEGMENT); + + return false; /* keep going */ +} + +/* + * Scan the SimpleLru directory and apply a callback to each file found in it. + * + * If the callback returns true, the scan is stopped. The last return value + * from the callback is returned. + * + * The callback receives the following arguments: 1. the SlruCtl struct for the + * slru being truncated; 2. the filename being considered; 3. the page number + * for the first page of that file; 4. a pointer to the opaque data given to us + * by the caller. + * + * Note that the ordering in which the directory is scanned is not guaranteed. + * + * Note that no locking is applied. + */ +bool +SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data) +{ + bool retval = false; + DIR *cldir; + struct dirent *clde; + int segno; + int segpage; + + cldir = AllocateDir(ctl->Dir); + while ((clde = ReadDir(cldir, ctl->Dir)) != NULL) + { + size_t len; + + len = strlen(clde->d_name); + + if ((len == 4 || len == 5 || len == 6) && + strspn(clde->d_name, "0123456789ABCDEF") == len) + { + segno = (int) strtol(clde->d_name, NULL, 16); + segpage = segno * SLRU_PAGES_PER_SEGMENT; + + elog(DEBUG2, "SlruScanDirectory invoking callback on %s/%s", + ctl->Dir, clde->d_name); + retval = callback(ctl, clde->d_name, segpage, data); + if (retval) + break; + } + } + FreeDir(cldir); + + return retval; +} + +/* + * Individual SLRUs (clog, ...) have to provide a sync.c handler function so + * that they can provide the correct "SlruCtl" (otherwise we don't know how to + * build the path), but they just forward to this common implementation that + * performs the fsync. 
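+ *
+ * For illustration, such a per-SLRU handler is typically just a one-line
+ * wrapper; a hypothetical SLRU with control struct MySlruCtl might provide
+ *
+ *		int
+ *		mysyncfiletag(const FileTag *ftag, char *path)
+ *		{
+ *			return SlruSyncFileTag(MySlruCtl, ftag, path);
+ *		}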
+ */ +int +SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path) +{ + int fd; + int save_errno; + int result; + + SlruFileName(ctl, path, ftag->segno); + + fd = OpenTransientFile(path, O_RDWR | PG_BINARY); + if (fd < 0) + return -1; + + result = pg_fsync(fd); + save_errno = errno; + + CloseTransientFile(fd); + + errno = save_errno; + return result; +} diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c new file mode 100644 index 0000000..6a8e521 --- /dev/null +++ b/src/backend/access/transam/subtrans.c @@ -0,0 +1,374 @@ +/*------------------------------------------------------------------------- + * + * subtrans.c + * PostgreSQL subtransaction-log manager + * + * The pg_subtrans manager is a pg_xact-like manager that stores the parent + * transaction Id for each transaction. It is a fundamental part of the + * nested transactions implementation. A main transaction has a parent + * of InvalidTransactionId, and each subtransaction has its immediate parent. + * The tree can easily be walked from child to parent, but not in the + * opposite direction. + * + * This code is based on xact.c, but the robustness requirements + * are completely different from pg_xact, because we only need to remember + * pg_subtrans information for currently-open transactions. Thus, there is + * no need to preserve data over a crash and restart. + * + * There are no XLOG interactions since we do not care about preserving + * data across crashes. During database startup, we simply force the + * currently-active page of SUBTRANS to zeroes. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/subtrans.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/slru.h" +#include "access/subtrans.h" +#include "access/transam.h" +#include "pg_trace.h" +#include "utils/snapmgr.h" + + +/* + * Defines for SubTrans page sizes. A page is the same BLCKSZ as is used + * everywhere else in Postgres. + * + * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF, + * SubTrans page numbering also wraps around at + * 0xFFFFFFFF/SUBTRANS_XACTS_PER_PAGE, and segment numbering at + * 0xFFFFFFFF/SUBTRANS_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need take no + * explicit notice of that fact in this module, except when comparing segment + * and page numbers in TruncateSUBTRANS (see SubTransPagePrecedes) and zeroing + * them in StartupSUBTRANS. + */ + +/* We need four bytes per xact */ +#define SUBTRANS_XACTS_PER_PAGE (BLCKSZ / sizeof(TransactionId)) + +#define TransactionIdToPage(xid) ((xid) / (TransactionId) SUBTRANS_XACTS_PER_PAGE) +#define TransactionIdToEntry(xid) ((xid) % (TransactionId) SUBTRANS_XACTS_PER_PAGE) + + +/* + * Link to shared-memory data structures for SUBTRANS control + */ +static SlruCtlData SubTransCtlData; + +#define SubTransCtl (&SubTransCtlData) + + +static int ZeroSUBTRANSPage(int pageno); +static bool SubTransPagePrecedes(int page1, int page2); + + +/* + * Record the parent of a subtransaction in the subtrans log. 
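+ *
+ * Each xact gets a 4-byte parent entry, so with the default 8192-byte BLCKSZ
+ * there are 2048 entries per page; for example, xid 100000 is then stored on
+ * page 100000 / 2048 = 48 at entry 100000 % 2048 = 1696.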
+ */ +void +SubTransSetParent(TransactionId xid, TransactionId parent) +{ + int pageno = TransactionIdToPage(xid); + int entryno = TransactionIdToEntry(xid); + int slotno; + TransactionId *ptr; + + Assert(TransactionIdIsValid(parent)); + Assert(TransactionIdFollows(xid, parent)); + + LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE); + + slotno = SimpleLruReadPage(SubTransCtl, pageno, true, xid); + ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno]; + ptr += entryno; + + /* + * It's possible we'll try to set the parent xid multiple times but we + * shouldn't ever be changing the xid from one valid xid to another valid + * xid, which would corrupt the data structure. + */ + if (*ptr != parent) + { + Assert(*ptr == InvalidTransactionId); + *ptr = parent; + SubTransCtl->shared->page_dirty[slotno] = true; + } + + LWLockRelease(SubtransSLRULock); +} + +/* + * Interrogate the parent of a transaction in the subtrans log. + */ +TransactionId +SubTransGetParent(TransactionId xid) +{ + int pageno = TransactionIdToPage(xid); + int entryno = TransactionIdToEntry(xid); + int slotno; + TransactionId *ptr; + TransactionId parent; + + /* Can't ask about stuff that might not be around anymore */ + Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin)); + + /* Bootstrap and frozen XIDs have no parent */ + if (!TransactionIdIsNormal(xid)) + return InvalidTransactionId; + + /* lock is acquired by SimpleLruReadPage_ReadOnly */ + + slotno = SimpleLruReadPage_ReadOnly(SubTransCtl, pageno, xid); + ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno]; + ptr += entryno; + + parent = *ptr; + + LWLockRelease(SubtransSLRULock); + + return parent; +} + +/* + * SubTransGetTopmostTransaction + * + * Returns the topmost transaction of the given transaction id. + * + * Because we cannot look back further than TransactionXmin, it is possible + * that this function will lie and return an intermediate subtransaction ID + * instead of the true topmost parent ID. This is OK, because in practice + * we only care about detecting whether the topmost parent is still running + * or is part of a current snapshot's list of still-running transactions. + * Therefore, any XID before TransactionXmin is as good as any other. + */ +TransactionId +SubTransGetTopmostTransaction(TransactionId xid) +{ + TransactionId parentXid = xid, + previousXid = xid; + + /* Can't ask about stuff that might not be around anymore */ + Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin)); + + while (TransactionIdIsValid(parentXid)) + { + previousXid = parentXid; + if (TransactionIdPrecedes(parentXid, TransactionXmin)) + break; + parentXid = SubTransGetParent(parentXid); + + /* + * By convention the parent xid gets allocated first, so should always + * precede the child xid. Anything else points to a corrupted data + * structure that could lead to an infinite loop, so exit. 
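+ * (Illustration: with parent links 102 -> 101 -> 100, parentXid strictly
+ * precedes previousXid at each step; a link such as 101 -> 102 could only
+ * come from corruption and would otherwise make this loop spin forever.)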
+ */ + if (!TransactionIdPrecedes(parentXid, previousXid)) + elog(ERROR, "pg_subtrans contains invalid entry: xid %u points to parent xid %u", + previousXid, parentXid); + } + + Assert(TransactionIdIsValid(previousXid)); + + return previousXid; +} + + +/* + * Initialization of shared memory for SUBTRANS + */ +Size +SUBTRANSShmemSize(void) +{ + return SimpleLruShmemSize(NUM_SUBTRANS_BUFFERS, 0); +} + +void +SUBTRANSShmemInit(void) +{ + SubTransCtl->PagePrecedes = SubTransPagePrecedes; + SimpleLruInit(SubTransCtl, "Subtrans", NUM_SUBTRANS_BUFFERS, 0, + SubtransSLRULock, "pg_subtrans", + LWTRANCHE_SUBTRANS_BUFFER, SYNC_HANDLER_NONE); + SlruPagePrecedesUnitTests(SubTransCtl, SUBTRANS_XACTS_PER_PAGE); +} + +/* + * This func must be called ONCE on system install. It creates + * the initial SUBTRANS segment. (The SUBTRANS directory is assumed to + * have been created by the initdb shell script, and SUBTRANSShmemInit + * must have been called already.) + * + * Note: it's not really necessary to create the initial segment now, + * since slru.c would create it on first write anyway. But we may as well + * do it to be sure the directory is set up correctly. + */ +void +BootStrapSUBTRANS(void) +{ + int slotno; + + LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE); + + /* Create and zero the first page of the subtrans log */ + slotno = ZeroSUBTRANSPage(0); + + /* Make sure it's written out */ + SimpleLruWritePage(SubTransCtl, slotno); + Assert(!SubTransCtl->shared->page_dirty[slotno]); + + LWLockRelease(SubtransSLRULock); +} + +/* + * Initialize (or reinitialize) a page of SUBTRANS to zeroes. + * + * The page is not actually written, just set up in shared memory. + * The slot number of the new page is returned. + * + * Control lock must be held at entry, and will be held at exit. + */ +static int +ZeroSUBTRANSPage(int pageno) +{ + return SimpleLruZeroPage(SubTransCtl, pageno); +} + +/* + * This must be called ONCE during postmaster or standalone-backend startup, + * after StartupXLOG has initialized ShmemVariableCache->nextXid. + * + * oldestActiveXID is the oldest XID of any prepared transaction, or nextXid + * if there are none. + */ +void +StartupSUBTRANS(TransactionId oldestActiveXID) +{ + FullTransactionId nextXid; + int startPage; + int endPage; + + /* + * Since we don't expect pg_subtrans to be valid across crashes, we + * initialize the currently-active page(s) to zeroes during startup. + * Whenever we advance into a new page, ExtendSUBTRANS will likewise zero + * the new page without regard to whatever was previously on disk. + */ + LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE); + + startPage = TransactionIdToPage(oldestActiveXID); + nextXid = ShmemVariableCache->nextXid; + endPage = TransactionIdToPage(XidFromFullTransactionId(nextXid)); + + while (startPage != endPage) + { + (void) ZeroSUBTRANSPage(startPage); + startPage++; + /* must account for wraparound */ + if (startPage > TransactionIdToPage(MaxTransactionId)) + startPage = 0; + } + (void) ZeroSUBTRANSPage(startPage); + + LWLockRelease(SubtransSLRULock); +} + +/* + * Perform a checkpoint --- either during shutdown, or on-the-fly + */ +void +CheckPointSUBTRANS(void) +{ + /* + * Write dirty SUBTRANS pages to disk + * + * This is not actually necessary from a correctness point of view. We do + * it merely to improve the odds that writing of dirty pages is done by + * the checkpoint process and not by backends. 
+ */ + TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_START(true); + SimpleLruWriteAll(SubTransCtl, true); + TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(true); +} + + +/* + * Make sure that SUBTRANS has room for a newly-allocated XID. + * + * NB: this is called while holding XidGenLock. We want it to be very fast + * most of the time; even when it's not so fast, no actual I/O need happen + * unless we're forced to write out a dirty subtrans page to make room + * in shared memory. + */ +void +ExtendSUBTRANS(TransactionId newestXact) +{ + int pageno; + + /* + * No work except at first XID of a page. But beware: just after + * wraparound, the first XID of page zero is FirstNormalTransactionId. + */ + if (TransactionIdToEntry(newestXact) != 0 && + !TransactionIdEquals(newestXact, FirstNormalTransactionId)) + return; + + pageno = TransactionIdToPage(newestXact); + + LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE); + + /* Zero the page */ + ZeroSUBTRANSPage(pageno); + + LWLockRelease(SubtransSLRULock); +} + + +/* + * Remove all SUBTRANS segments before the one holding the passed transaction ID + * + * oldestXact is the oldest TransactionXmin of any running transaction. This + * is called only during checkpoint. + */ +void +TruncateSUBTRANS(TransactionId oldestXact) +{ + int cutoffPage; + + /* + * The cutoff point is the start of the segment containing oldestXact. We + * pass the *page* containing oldestXact to SimpleLruTruncate. We step + * back one transaction to avoid passing a cutoff page that hasn't been + * created yet in the rare case that oldestXact would be the first item on + * a page and oldestXact == next XID. In that case, if we didn't subtract + * one, we'd trigger SimpleLruTruncate's wraparound detection. + */ + TransactionIdRetreat(oldestXact); + cutoffPage = TransactionIdToPage(oldestXact); + + SimpleLruTruncate(SubTransCtl, cutoffPage); +} + + +/* + * Decide whether a SUBTRANS page number is "older" for truncation purposes. + * Analogous to CLOGPagePrecedes(). + */ +static bool +SubTransPagePrecedes(int page1, int page2) +{ + TransactionId xid1; + TransactionId xid2; + + xid1 = ((TransactionId) page1) * SUBTRANS_XACTS_PER_PAGE; + xid1 += FirstNormalTransactionId + 1; + xid2 = ((TransactionId) page2) * SUBTRANS_XACTS_PER_PAGE; + xid2 += FirstNormalTransactionId + 1; + + return (TransactionIdPrecedes(xid1, xid2) && + TransactionIdPrecedes(xid1, xid2 + SUBTRANS_XACTS_PER_PAGE - 1)); +} diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c new file mode 100644 index 0000000..8d0903c --- /dev/null +++ b/src/backend/access/transam/timeline.c @@ -0,0 +1,600 @@ +/*------------------------------------------------------------------------- + * + * timeline.c + * Functions for reading and writing timeline history files. + * + * A timeline history file lists the timeline changes of the timeline, in + * a simple text format. They are archived along with the WAL segments. + * + * The files are named like "<tli>.history". For example, if the database + * starts up and switches to timeline 5, the timeline history file would be + * called "00000005.history". + * + * Each line in the file represents a timeline switch: + * + * <parentTLI> <switchpoint> <reason> + * + * parentTLI ID of the parent timeline + * switchpoint XLogRecPtr of the WAL location where the switch happened + * reason human-readable explanation of why the timeline was changed + * + * The fields are separated by tabs. Lines beginning with # are comments, and + * are ignored. Empty lines are also ignored.
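+ *
+ * As a purely illustrative example (LSNs and reasons made up), the file
+ * 00000003.history for a server that switched from timeline 1 to 2 and then
+ * from 2 to 3 might contain:
+ *
+ *	1	0/09D0F2E0	no recovery target specified
+ *	2	0/0BC0A910	at restore point "before_upgrade"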
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/timeline.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <sys/stat.h> +#include <unistd.h> + +#include "access/timeline.h" +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "access/xlogarchive.h" +#include "access/xlogdefs.h" +#include "pgstat.h" +#include "storage/fd.h" + +/* + * Copies all timeline history files with id's between 'begin' and 'end' + * from archive to pg_wal. + */ +void +restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end) +{ + char path[MAXPGPATH]; + char histfname[MAXFNAMELEN]; + TimeLineID tli; + + for (tli = begin; tli < end; tli++) + { + if (tli == 1) + continue; + + TLHistoryFileName(histfname, tli); + if (RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0, false)) + KeepFileRestoredFromArchive(path, histfname); + } +} + +/* + * Try to read a timeline's history file. + * + * If successful, return the list of component TLIs (the given TLI followed by + * its ancestor TLIs). If we can't find the history file, assume that the + * timeline has no parents, and return a list of just the specified timeline + * ID. + */ +List * +readTimeLineHistory(TimeLineID targetTLI) +{ + List *result; + char path[MAXPGPATH]; + char histfname[MAXFNAMELEN]; + FILE *fd; + TimeLineHistoryEntry *entry; + TimeLineID lasttli = 0; + XLogRecPtr prevend; + bool fromArchive = false; + + /* Timeline 1 does not have a history file, so no need to check */ + if (targetTLI == 1) + { + entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry)); + entry->tli = targetTLI; + entry->begin = entry->end = InvalidXLogRecPtr; + return list_make1(entry); + } + + if (ArchiveRecoveryRequested) + { + TLHistoryFileName(histfname, targetTLI); + fromArchive = + RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0, false); + } + else + TLHistoryFilePath(path, targetTLI); + + fd = AllocateFile(path, "r"); + if (fd == NULL) + { + if (errno != ENOENT) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + /* Not there, so assume no parents */ + entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry)); + entry->tli = targetTLI; + entry->begin = entry->end = InvalidXLogRecPtr; + return list_make1(entry); + } + + result = NIL; + + /* + * Parse the file...
+ */ + prevend = InvalidXLogRecPtr; + for (;;) + { + char fline[MAXPGPATH]; + char *res; + char *ptr; + TimeLineID tli; + uint32 switchpoint_hi; + uint32 switchpoint_lo; + int nfields; + + pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_READ); + res = fgets(fline, sizeof(fline), fd); + pgstat_report_wait_end(); + if (res == NULL) + { + if (ferror(fd)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", path))); + + break; + } + + /* skip leading whitespace and check for # comment */ + for (ptr = fline; *ptr; ptr++) + { + if (!isspace((unsigned char) *ptr)) + break; + } + if (*ptr == '\0' || *ptr == '#') + continue; + + nfields = sscanf(fline, "%u\t%X/%X", &tli, &switchpoint_hi, &switchpoint_lo); + + if (nfields < 1) + { + /* expect a numeric timeline ID as first field of line */ + ereport(FATAL, + (errmsg("syntax error in history file: %s", fline), + errhint("Expected a numeric timeline ID."))); + } + if (nfields != 3) + ereport(FATAL, + (errmsg("syntax error in history file: %s", fline), + errhint("Expected a write-ahead log switchpoint location."))); + + if (result && tli <= lasttli) + ereport(FATAL, + (errmsg("invalid data in history file: %s", fline), + errhint("Timeline IDs must be in increasing sequence."))); + + lasttli = tli; + + entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry)); + entry->tli = tli; + entry->begin = prevend; + entry->end = ((uint64) (switchpoint_hi)) << 32 | (uint64) switchpoint_lo; + prevend = entry->end; + + /* Build list with newest item first */ + result = lcons(entry, result); + + /* we ignore the remainder of each line */ + } + + FreeFile(fd); + + if (result && targetTLI <= lasttli) + ereport(FATAL, + (errmsg("invalid data in history file \"%s\"", path), + errhint("Timeline IDs must be less than child timeline's ID."))); + + /* + * Create one more entry for the "tip" of the timeline, which has no entry + * in the history file. + */ + entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry)); + entry->tli = targetTLI; + entry->begin = prevend; + entry->end = InvalidXLogRecPtr; + + result = lcons(entry, result); + + /* + * If the history file was fetched from archive, save it in pg_wal for + * future reference. + */ + if (fromArchive) + KeepFileRestoredFromArchive(path, histfname); + + return result; +} + +/* + * Probe whether a timeline history file exists for the given timeline ID + */ +bool +existsTimeLineHistory(TimeLineID probeTLI) +{ + char path[MAXPGPATH]; + char histfname[MAXFNAMELEN]; + FILE *fd; + + /* Timeline 1 does not have a history file, so no need to check */ + if (probeTLI == 1) + return false; + + if (ArchiveRecoveryRequested) + { + TLHistoryFileName(histfname, probeTLI); + RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0, false); + } + else + TLHistoryFilePath(path, probeTLI); + + fd = AllocateFile(path, "r"); + if (fd != NULL) + { + FreeFile(fd); + return true; + } + else + { + if (errno != ENOENT) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + return false; + } +} + +/* + * Find the newest existing timeline, assuming that startTLI exists. + * + * Note: while this is somewhat heuristic, it does positively guarantee + * that (result + 1) is not a known timeline, and therefore it should + * be safe to assign that ID to a new timeline. 
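+ *
+ * (Example: if findNewestTimeLine(2) sees that 00000003.history exists but
+ * 00000004.history does not, it returns 3, and the caller may then create
+ * timeline 4.)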
+ */ +TimeLineID +findNewestTimeLine(TimeLineID startTLI) +{ + TimeLineID newestTLI; + TimeLineID probeTLI; + + /* + * The algorithm is just to probe for the existence of timeline history + * files. XXX is it useful to allow gaps in the sequence? + */ + newestTLI = startTLI; + + for (probeTLI = startTLI + 1;; probeTLI++) + { + if (existsTimeLineHistory(probeTLI)) + { + newestTLI = probeTLI; /* probeTLI exists */ + } + else + { + /* doesn't exist, assume we're done */ + break; + } + } + + return newestTLI; +} + +/* + * Create a new timeline history file. + * + * newTLI: ID of the new timeline + * parentTLI: ID of its immediate parent + * switchpoint: WAL location where the system switched to the new timeline + * reason: human-readable explanation of why the timeline was switched + * + * Currently this is only used at the end recovery, and so there are no locking + * considerations. But we should be just as tense as XLogFileInit to avoid + * emplacing a bogus file. + */ +void +writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, + XLogRecPtr switchpoint, char *reason) +{ + char path[MAXPGPATH]; + char tmppath[MAXPGPATH]; + char histfname[MAXFNAMELEN]; + char buffer[BLCKSZ]; + int srcfd; + int fd; + int nbytes; + + Assert(newTLI > parentTLI); /* else bad selection of newTLI */ + + /* + * Write into a temp file name. + */ + snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid()); + + unlink(tmppath); + + /* do not use get_sync_bit() here --- want to fsync only at end of fill */ + fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", tmppath))); + + /* + * If a history file exists for the parent, copy it verbatim + */ + if (ArchiveRecoveryRequested) + { + TLHistoryFileName(histfname, parentTLI); + RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0, false); + } + else + TLHistoryFilePath(path, parentTLI); + + srcfd = OpenTransientFile(path, O_RDONLY); + if (srcfd < 0) + { + if (errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + /* Not there, so assume parent has no parents */ + } + else + { + for (;;) + { + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_READ); + nbytes = (int) read(srcfd, buffer, sizeof(buffer)); + pgstat_report_wait_end(); + if (nbytes < 0 || errno != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", path))); + if (nbytes == 0) + break; + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_WRITE); + if ((int) write(fd, buffer, nbytes) != nbytes) + { + int save_errno = errno; + + /* + * If we fail to make the file, delete it to release disk + * space + */ + unlink(tmppath); + + /* + * if write didn't set errno, assume problem is no disk space + */ + errno = save_errno ? save_errno : ENOSPC; + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tmppath))); + } + pgstat_report_wait_end(); + } + + if (CloseTransientFile(srcfd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); + } + + /* + * Append one line with the details of this timeline split. + * + * If we did have a parent file, insert an extra newline just in case the + * parent file failed to end with one. + */ + snprintf(buffer, sizeof(buffer), + "%s%u\t%X/%X\t%s\n", + (srcfd < 0) ? 
"" : "\n", + parentTLI, + LSN_FORMAT_ARGS(switchpoint), + reason); + + nbytes = strlen(buffer); + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_WRITE); + if ((int) write(fd, buffer, nbytes) != nbytes) + { + int save_errno = errno; + + /* + * If we fail to make the file, delete it to release disk space + */ + unlink(tmppath); + /* if write didn't set errno, assume problem is no disk space */ + errno = save_errno ? save_errno : ENOSPC; + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tmppath))); + } + pgstat_report_wait_end(); + + pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_SYNC); + if (pg_fsync(fd) != 0) + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", tmppath))); + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", tmppath))); + + /* + * Now move the completed history file into place with its final name. + */ + TLHistoryFilePath(path, newTLI); + + /* + * Perform the rename using link if available, paranoidly trying to avoid + * overwriting an existing file (there shouldn't be one). + */ + durable_rename_excl(tmppath, path, ERROR); + + /* The history file can be archived immediately. */ + if (XLogArchivingActive()) + { + TLHistoryFileName(histfname, newTLI); + XLogArchiveNotify(histfname); + } +} + +/* + * Writes a history file for given timeline and contents. + * + * Currently this is only used in the walreceiver process, and so there are + * no locking considerations. But we should be just as tense as XLogFileInit + * to avoid emplacing a bogus file. + */ +void +writeTimeLineHistoryFile(TimeLineID tli, char *content, int size) +{ + char path[MAXPGPATH]; + char tmppath[MAXPGPATH]; + int fd; + + /* + * Write into a temp file name. + */ + snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid()); + + unlink(tmppath); + + /* do not use get_sync_bit() here --- want to fsync only at end of fill */ + fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", tmppath))); + + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_FILE_WRITE); + if ((int) write(fd, content, size) != size) + { + int save_errno = errno; + + /* + * If we fail to make the file, delete it to release disk space + */ + unlink(tmppath); + /* if write didn't set errno, assume problem is no disk space */ + errno = save_errno ? save_errno : ENOSPC; + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tmppath))); + } + pgstat_report_wait_end(); + + pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_FILE_SYNC); + if (pg_fsync(fd) != 0) + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", tmppath))); + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", tmppath))); + + /* + * Now move the completed history file into place with its final name. + */ + TLHistoryFilePath(path, tli); + + /* + * Perform the rename using link if available, paranoidly trying to avoid + * overwriting an existing file (there shouldn't be one). 
+ */ + durable_rename_excl(tmppath, path, ERROR); +} + +/* + * Returns true if 'expectedTLEs' contains a timeline with id 'tli' + */ +bool +tliInHistory(TimeLineID tli, List *expectedTLEs) +{ + ListCell *cell; + + foreach(cell, expectedTLEs) + { + if (((TimeLineHistoryEntry *) lfirst(cell))->tli == tli) + return true; + } + + return false; +} + +/* + * Returns the ID of the timeline in use at a particular point in time, in + * the given timeline history. + */ +TimeLineID +tliOfPointInHistory(XLogRecPtr ptr, List *history) +{ + ListCell *cell; + + foreach(cell, history) + { + TimeLineHistoryEntry *tle = (TimeLineHistoryEntry *) lfirst(cell); + + if ((XLogRecPtrIsInvalid(tle->begin) || tle->begin <= ptr) && + (XLogRecPtrIsInvalid(tle->end) || ptr < tle->end)) + { + /* found it */ + return tle->tli; + } + } + + /* shouldn't happen. */ + elog(ERROR, "timeline history was not contiguous"); + return 0; /* keep compiler quiet */ +} + +/* + * Returns the point in history where we branched off the given timeline, + * and the timeline we branched to (*nextTLI). Returns InvalidXLogRecPtr if + * the timeline is current, ie. we have not branched off from it, and throws + * an error if the timeline is not part of this server's history. + */ +XLogRecPtr +tliSwitchPoint(TimeLineID tli, List *history, TimeLineID *nextTLI) +{ + ListCell *cell; + + if (nextTLI) + *nextTLI = 0; + foreach(cell, history) + { + TimeLineHistoryEntry *tle = (TimeLineHistoryEntry *) lfirst(cell); + + if (tle->tli == tli) + return tle->end; + if (nextTLI) + *nextTLI = tle->tli; + } + + ereport(ERROR, + (errmsg("requested timeline %u is not in this server's history", + tli))); + return InvalidXLogRecPtr; /* keep compiler quiet */ +} diff --git a/src/backend/access/transam/transam.c b/src/backend/access/transam/transam.c new file mode 100644 index 0000000..e9e0ef7 --- /dev/null +++ b/src/backend/access/transam/transam.c @@ -0,0 +1,430 @@ +/*------------------------------------------------------------------------- + * + * transam.c + * postgres transaction (commit) log interface routines + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/transam/transam.c + * + * NOTES + * This file contains the high level access-method interface to the + * transaction system. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/clog.h" +#include "access/subtrans.h" +#include "access/transam.h" +#include "utils/snapmgr.h" + +/* + * Single-item cache for results of TransactionLogFetch. It's worth having + * such a cache because we frequently find ourselves repeatedly checking the + * same XID, for example when scanning a table just after a bulk insert, + * update, or delete. 
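+ * (To make that concrete: after a single large COPY, every tuple in the new
+ * table carries the same xmin, so once the first visibility check has gone
+ * to clog, each later TransactionLogFetch() during that scan is satisfied
+ * from this cache.)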
+ */ +static TransactionId cachedFetchXid = InvalidTransactionId; +static XidStatus cachedFetchXidStatus; +static XLogRecPtr cachedCommitLSN; + +/* Local functions */ +static XidStatus TransactionLogFetch(TransactionId transactionId); + + +/* ---------------------------------------------------------------- + * Postgres log access method interface + * + * TransactionLogFetch + * ---------------------------------------------------------------- + */ + +/* + * TransactionLogFetch --- fetch commit status of specified transaction id + */ +static XidStatus +TransactionLogFetch(TransactionId transactionId) +{ + XidStatus xidstatus; + XLogRecPtr xidlsn; + + /* + * Before going to the commit log manager, check our single item cache to + * see if we didn't just check the transaction status a moment ago. + */ + if (TransactionIdEquals(transactionId, cachedFetchXid)) + return cachedFetchXidStatus; + + /* + * Also, check to see if the transaction ID is a permanent one. + */ + if (!TransactionIdIsNormal(transactionId)) + { + if (TransactionIdEquals(transactionId, BootstrapTransactionId)) + return TRANSACTION_STATUS_COMMITTED; + if (TransactionIdEquals(transactionId, FrozenTransactionId)) + return TRANSACTION_STATUS_COMMITTED; + return TRANSACTION_STATUS_ABORTED; + } + + /* + * Get the transaction status. + */ + xidstatus = TransactionIdGetStatus(transactionId, &xidlsn); + + /* + * Cache it, but DO NOT cache status for unfinished or sub-committed + * transactions! We only cache status that is guaranteed not to change. + */ + if (xidstatus != TRANSACTION_STATUS_IN_PROGRESS && + xidstatus != TRANSACTION_STATUS_SUB_COMMITTED) + { + cachedFetchXid = transactionId; + cachedFetchXidStatus = xidstatus; + cachedCommitLSN = xidlsn; + } + + return xidstatus; +} + +/* ---------------------------------------------------------------- + * Interface functions + * + * TransactionIdDidCommit + * TransactionIdDidAbort + * ======== + * these functions test the transaction status of + * a specified transaction id. + * + * TransactionIdCommitTree + * TransactionIdAsyncCommitTree + * TransactionIdAbortTree + * ======== + * these functions set the transaction status of the specified + * transaction tree. + * + * See also TransactionIdIsInProgress, which once was in this module + * but now lives in procarray.c. + * ---------------------------------------------------------------- + */ + +/* + * TransactionIdDidCommit + * True iff transaction associated with the identifier did commit. + * + * Note: + * Assumes transaction identifier is valid and exists in clog. + */ +bool /* true if given transaction committed */ +TransactionIdDidCommit(TransactionId transactionId) +{ + XidStatus xidstatus; + + xidstatus = TransactionLogFetch(transactionId); + + /* + * If it's marked committed, it's committed. + */ + if (xidstatus == TRANSACTION_STATUS_COMMITTED) + return true; + + /* + * If it's marked subcommitted, we have to check the parent recursively. + * However, if it's older than TransactionXmin, we can't look at + * pg_subtrans; instead assume that the parent crashed without cleaning up + * its children. + * + * Originally we Assert'ed that the result of SubTransGetParent was not + * zero. However with the introduction of prepared transactions, there can + * be a window just after database startup where we do not have complete + * knowledge in pg_subtrans of the transactions after TransactionXmin. + * StartupSUBTRANS() has ensured that any missing information will be + * zeroed. 
Since this case should not happen under normal conditions, it + * seems reasonable to emit a WARNING for it. + */ + if (xidstatus == TRANSACTION_STATUS_SUB_COMMITTED) + { + TransactionId parentXid; + + if (TransactionIdPrecedes(transactionId, TransactionXmin)) + return false; + parentXid = SubTransGetParent(transactionId); + if (!TransactionIdIsValid(parentXid)) + { + elog(WARNING, "no pg_subtrans entry for subcommitted XID %u", + transactionId); + return false; + } + return TransactionIdDidCommit(parentXid); + } + + /* + * It's not committed. + */ + return false; +} + +/* + * TransactionIdDidAbort + * True iff transaction associated with the identifier did abort. + * + * Note: + * Assumes transaction identifier is valid and exists in clog. + */ +bool /* true if given transaction aborted */ +TransactionIdDidAbort(TransactionId transactionId) +{ + XidStatus xidstatus; + + xidstatus = TransactionLogFetch(transactionId); + + /* + * If it's marked aborted, it's aborted. + */ + if (xidstatus == TRANSACTION_STATUS_ABORTED) + return true; + + /* + * If it's marked subcommitted, we have to check the parent recursively. + * However, if it's older than TransactionXmin, we can't look at + * pg_subtrans; instead assume that the parent crashed without cleaning up + * its children. + */ + if (xidstatus == TRANSACTION_STATUS_SUB_COMMITTED) + { + TransactionId parentXid; + + if (TransactionIdPrecedes(transactionId, TransactionXmin)) + return true; + parentXid = SubTransGetParent(transactionId); + if (!TransactionIdIsValid(parentXid)) + { + /* see notes in TransactionIdDidCommit */ + elog(WARNING, "no pg_subtrans entry for subcommitted XID %u", + transactionId); + return true; + } + return TransactionIdDidAbort(parentXid); + } + + /* + * It's not aborted. + */ + return false; +} + +/* + * TransactionIdIsKnownCompleted + * True iff transaction associated with the identifier is currently + * known to have either committed or aborted. + * + * This does NOT look into pg_xact but merely probes our local cache + * (and so it's not named TransactionIdDidComplete, which would be the + * appropriate name for a function that worked that way). + * + * NB: This is unused, and will be removed in v15. This was used to + * short-circuit TransactionIdIsInProgress, but that was wrong for a + * transaction that was known to be marked as committed in CLOG but not + * yet removed from the proc array. This is kept in backbranches just in + * case it is still used by extensions. However, extensions doing + * something similar to tuple visibility checks should also be careful to + * check the proc array first! + * + * Note: + * Assumes transaction identifier is valid. + */ +bool +TransactionIdIsKnownCompleted(TransactionId transactionId) +{ + if (TransactionIdEquals(transactionId, cachedFetchXid)) + { + /* If it's in the cache at all, it must be completed. */ + return true; + } + + return false; +} + +/* + * TransactionIdCommitTree + * Marks the given transaction and children as committed + * + * "xid" is a toplevel transaction commit, and the xids array contains its + * committed subtransactions. + * + * This commit operation is not guaranteed to be atomic, but if not, subxids + * are correctly marked subcommit first. + */ +void +TransactionIdCommitTree(TransactionId xid, int nxids, TransactionId *xids) +{ + TransactionIdSetTreeStatus(xid, nxids, xids, + TRANSACTION_STATUS_COMMITTED, + InvalidXLogRecPtr); +} + +/* + * TransactionIdAsyncCommitTree + * Same as above, but for async commits. The commit record LSN is needed. 
+ */ +void +TransactionIdAsyncCommitTree(TransactionId xid, int nxids, TransactionId *xids, + XLogRecPtr lsn) +{ + TransactionIdSetTreeStatus(xid, nxids, xids, + TRANSACTION_STATUS_COMMITTED, lsn); +} + +/* + * TransactionIdAbortTree + * Marks the given transaction and children as aborted. + * + * "xid" is a toplevel transaction commit, and the xids array contains its + * committed subtransactions. + * + * We don't need to worry about the non-atomic behavior, since any onlookers + * will consider all the xacts as not-yet-committed anyway. + */ +void +TransactionIdAbortTree(TransactionId xid, int nxids, TransactionId *xids) +{ + TransactionIdSetTreeStatus(xid, nxids, xids, + TRANSACTION_STATUS_ABORTED, InvalidXLogRecPtr); +} + +/* + * TransactionIdPrecedes --- is id1 logically < id2? + */ +bool +TransactionIdPrecedes(TransactionId id1, TransactionId id2) +{ + /* + * If either ID is a permanent XID then we can just do unsigned + * comparison. If both are normal, do a modulo-2^32 comparison. + */ + int32 diff; + + if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2)) + return (id1 < id2); + + diff = (int32) (id1 - id2); + return (diff < 0); +} + +/* + * TransactionIdPrecedesOrEquals --- is id1 logically <= id2? + */ +bool +TransactionIdPrecedesOrEquals(TransactionId id1, TransactionId id2) +{ + int32 diff; + + if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2)) + return (id1 <= id2); + + diff = (int32) (id1 - id2); + return (diff <= 0); +} + +/* + * TransactionIdFollows --- is id1 logically > id2? + */ +bool +TransactionIdFollows(TransactionId id1, TransactionId id2) +{ + int32 diff; + + if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2)) + return (id1 > id2); + + diff = (int32) (id1 - id2); + return (diff > 0); +} + +/* + * TransactionIdFollowsOrEquals --- is id1 logically >= id2? + */ +bool +TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2) +{ + int32 diff; + + if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2)) + return (id1 >= id2); + + diff = (int32) (id1 - id2); + return (diff >= 0); +} + + +/* + * TransactionIdLatest --- get latest XID among a main xact and its children + */ +TransactionId +TransactionIdLatest(TransactionId mainxid, + int nxids, const TransactionId *xids) +{ + TransactionId result; + + /* + * In practice it is highly likely that the xids[] array is sorted, and so + * we could save some cycles by just taking the last child XID, but this + * probably isn't so performance-critical that it's worth depending on + * that assumption. But just to show we're not totally stupid, scan the + * array back-to-front to avoid useless assignments. + */ + result = mainxid; + while (--nxids >= 0) + { + if (TransactionIdPrecedes(result, xids[nxids])) + result = xids[nxids]; + } + return result; +} + + +/* + * TransactionIdGetCommitLSN + * + * This function returns an LSN that is late enough to be able + * to guarantee that if we flush up to the LSN returned then we + * will have flushed the transaction's commit record to disk. + * + * The result is not necessarily the exact LSN of the transaction's + * commit record! For example, for long-past transactions (those whose + * clog pages already migrated to disk), we'll return InvalidXLogRecPtr. + * Also, because we group transactions on the same clog page to conserve + * storage, we might return the LSN of a later transaction that falls into + * the same group. 
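+ * (In clog.c that grouping is CLOG_XACTS_PER_LSN_GROUP, currently 32
+ * consecutive transactions sharing one LSN slot.)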
+ */ +XLogRecPtr +TransactionIdGetCommitLSN(TransactionId xid) +{ + XLogRecPtr result; + + /* + * Currently, all uses of this function are for xids that were just + * reported to be committed by TransactionLogFetch, so we expect that + * checking TransactionLogFetch's cache will usually succeed and avoid an + * extra trip to shared memory. + */ + if (TransactionIdEquals(xid, cachedFetchXid)) + return cachedCommitLSN; + + /* Special XIDs are always known committed */ + if (!TransactionIdIsNormal(xid)) + return InvalidXLogRecPtr; + + /* + * Get the transaction status. + */ + (void) TransactionIdGetStatus(xid, &result); + + return result; +} diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c new file mode 100644 index 0000000..b0aaad6 --- /dev/null +++ b/src/backend/access/transam/twophase.c @@ -0,0 +1,2527 @@ +/*------------------------------------------------------------------------- + * + * twophase.c + * Two-phase commit support functions. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/transam/twophase.c + * + * NOTES + * Each global transaction is associated with a global transaction + * identifier (GID). The client assigns a GID to a postgres + * transaction with the PREPARE TRANSACTION command. + * + * We keep all active global transactions in a shared memory array. + * When the PREPARE TRANSACTION command is issued, the GID is + * reserved for the transaction in the array. This is done before + * a WAL entry is made, because the reservation checks for duplicate + * GIDs and aborts the transaction if there already is a global + * transaction in prepared state with the same GID. + * + * A global transaction (gxact) also has dummy PGPROC; this is what keeps + * the XID considered running by TransactionIdIsInProgress. It is also + * convenient as a PGPROC to hook the gxact's locks to. + * + * Information to recover prepared transactions in case of crash is + * now stored in WAL for the common case. In some cases there will be + * an extended period between preparing a GXACT and commit/abort, in + * which case we need to separately record prepared transaction data + * in permanent storage. This includes locking information, pending + * notifications etc. All that state information is written to the + * per-transaction state file in the pg_twophase directory. + * All prepared transactions will be written prior to shutdown. + * + * Life track of state data is following: + * + * * On PREPARE TRANSACTION backend writes state data only to the WAL and + * stores pointer to the start of the WAL record in + * gxact->prepare_start_lsn. + * * If COMMIT occurs before checkpoint then backend reads data from WAL + * using prepare_start_lsn. + * * On checkpoint state data copied to files in pg_twophase directory and + * fsynced + * * If COMMIT happens after checkpoint then backend reads state data from + * files + * + * During replay and replication, TwoPhaseState also holds information + * about active prepared transactions that haven't been moved to disk yet. + * + * Replay of twophase records happens by the following rules: + * + * * At the beginning of recovery, pg_twophase is scanned once, filling + * TwoPhaseState with entries marked with gxact->inredo and + * gxact->ondisk. Two-phase file data older than the XID horizon of + * the redo position are discarded. 
+ * * On PREPARE redo, the transaction is added to TwoPhaseState->prepXacts. + * gxact->inredo is set to true for such entries. + * * On Checkpoint we iterate through TwoPhaseState->prepXacts entries + * that have gxact->inredo set and are behind the redo_horizon. We + * save them to disk and then switch gxact->ondisk to true. + * * On COMMIT/ABORT we delete the entry from TwoPhaseState->prepXacts. + * If gxact->ondisk is true, the corresponding entry from the disk + * is additionally deleted. + * * RecoverPreparedTransactions(), StandbyRecoverPreparedTransactions() + * and PrescanPreparedTransactions() have been modified to go through + * gxact->inredo entries that have not made it to disk. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <fcntl.h> +#include <sys/stat.h> +#include <time.h> +#include <unistd.h> + +#include "access/commit_ts.h" +#include "access/htup_details.h" +#include "access/subtrans.h" +#include "access/transam.h" +#include "access/twophase.h" +#include "access/twophase_rmgr.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "access/xlogreader.h" +#include "access/xlogutils.h" +#include "catalog/pg_type.h" +#include "catalog/storage.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "pgstat.h" +#include "replication/origin.h" +#include "replication/syncrep.h" +#include "replication/walsender.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/md.h" +#include "storage/predicate.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/sinvaladt.h" +#include "storage/smgr.h" +#include "utils/builtins.h" +#include "utils/memutils.h" +#include "utils/timestamp.h" + +/* + * Directory where Two-phase commit files reside within PGDATA + */ +#define TWOPHASE_DIR "pg_twophase" + +/* GUC variable, can't be changed after startup */ +int max_prepared_xacts = 0; + +/* + * This struct describes one global transaction that is in prepared state + * or attempting to become prepared. + * + * The lifecycle of a global transaction is: + * + * 1. After checking that the requested GID is not in use, set up an entry in + * the TwoPhaseState->prepXacts array with the correct GID and valid = false, + * and mark it as locked by my backend. + * + * 2. After successfully completing prepare, set valid = true and enter the + * referenced PGPROC into the global ProcArray. + * + * 3. To begin COMMIT PREPARED or ROLLBACK PREPARED, check that the entry is + * valid and not locked, then mark the entry as locked by storing my current + * backend ID into locking_backend. This prevents concurrent attempts to + * commit or rollback the same prepared xact. + * + * 4. On completion of COMMIT PREPARED or ROLLBACK PREPARED, remove the entry + * from the ProcArray and the TwoPhaseState->prepXacts array and return it to + * the freelist. + * + * Note that if the preparing transaction fails between steps 1 and 2, the + * entry must be removed so that the GID and the GlobalTransaction struct + * can be reused. See AtAbort_Twophase(). + * + * typedef struct GlobalTransactionData *GlobalTransaction appears in + * twophase.h + */ + +typedef struct GlobalTransactionData +{ + GlobalTransaction next; /* list link for free list */ + int pgprocno; /* ID of associated dummy PGPROC */ + BackendId dummyBackendId; /* similar to backend id for backends */ + TimestampTz prepared_at; /* time of preparation */ + + /* + * Note that we need to keep track of two LSNs for each GXACT.
We keep + * track of the start LSN because this is the address we must use to read + * state data back from WAL when committing a prepared GXACT. We keep + * track of the end LSN because that is the LSN we need to wait for prior + * to commit. + */ + XLogRecPtr prepare_start_lsn; /* XLOG offset of prepare record start */ + XLogRecPtr prepare_end_lsn; /* XLOG offset of prepare record end */ + TransactionId xid; /* The GXACT id */ + + Oid owner; /* ID of user that executed the xact */ + BackendId locking_backend; /* backend currently working on the xact */ + bool valid; /* true if PGPROC entry is in proc array */ + bool ondisk; /* true if prepare state file is on disk */ + bool inredo; /* true if entry was added via xlog_redo */ + char gid[GIDSIZE]; /* The GID assigned to the prepared xact */ +} GlobalTransactionData; + +/* + * Two Phase Commit shared state. Access to this struct is protected + * by TwoPhaseStateLock. + */ +typedef struct TwoPhaseStateData +{ + /* Head of linked list of free GlobalTransactionData structs */ + GlobalTransaction freeGXacts; + + /* Number of valid prepXacts entries. */ + int numPrepXacts; + + /* There are max_prepared_xacts items in this array */ + GlobalTransaction prepXacts[FLEXIBLE_ARRAY_MEMBER]; +} TwoPhaseStateData; + +static TwoPhaseStateData *TwoPhaseState; + +/* + * Global transaction entry currently locked by us, if any. Note that any + * access to the entry pointed to by this variable must be protected by + * TwoPhaseStateLock, though obviously the pointer itself doesn't need to be + * (since it's just local memory). + */ +static GlobalTransaction MyLockedGxact = NULL; + +static bool twophaseExitRegistered = false; + +static void RecordTransactionCommitPrepared(TransactionId xid, + int nchildren, + TransactionId *children, + int nrels, + RelFileNode *rels, + int ninvalmsgs, + SharedInvalidationMessage *invalmsgs, + bool initfileinval, + const char *gid); +static void RecordTransactionAbortPrepared(TransactionId xid, + int nchildren, + TransactionId *children, + int nrels, + RelFileNode *rels, + const char *gid); +static void ProcessRecords(char *bufptr, TransactionId xid, + const TwoPhaseCallback callbacks[]); +static void RemoveGXact(GlobalTransaction gxact); + +static void XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len); +static char *ProcessTwoPhaseBuffer(TransactionId xid, + XLogRecPtr prepare_start_lsn, + bool fromdisk, bool setParent, bool setNextXid); +static void MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, + const char *gid, TimestampTz prepared_at, Oid owner, + Oid databaseid); +static void RemoveTwoPhaseFile(TransactionId xid, bool giveWarning); +static void RecreateTwoPhaseFile(TransactionId xid, void *content, int len); + +/* + * Initialization of shared memory + */ +Size +TwoPhaseShmemSize(void) +{ + Size size; + + /* Need the fixed struct, the array of pointers, and the GTD structs */ + size = offsetof(TwoPhaseStateData, prepXacts); + size = add_size(size, mul_size(max_prepared_xacts, + sizeof(GlobalTransaction))); + size = MAXALIGN(size); + size = add_size(size, mul_size(max_prepared_xacts, + sizeof(GlobalTransactionData))); + + return size; +} + +void +TwoPhaseShmemInit(void) +{ + bool found; + + TwoPhaseState = ShmemInitStruct("Prepared Transaction Table", + TwoPhaseShmemSize(), + &found); + if (!IsUnderPostmaster) + { + GlobalTransaction gxacts; + int i; + + Assert(!found); + TwoPhaseState->freeGXacts = NULL; + TwoPhaseState->numPrepXacts = 0; + + /* + * Initialize the linked list of free 
GlobalTransactionData structs + */ + gxacts = (GlobalTransaction) + ((char *) TwoPhaseState + + MAXALIGN(offsetof(TwoPhaseStateData, prepXacts) + + sizeof(GlobalTransaction) * max_prepared_xacts)); + for (i = 0; i < max_prepared_xacts; i++) + { + /* insert into linked list */ + gxacts[i].next = TwoPhaseState->freeGXacts; + TwoPhaseState->freeGXacts = &gxacts[i]; + + /* associate it with a PGPROC assigned by InitProcGlobal */ + gxacts[i].pgprocno = PreparedXactProcs[i].pgprocno; + + /* + * Assign a unique ID for each dummy proc, so that the range of + * dummy backend IDs immediately follows the range of normal + * backend IDs. We don't dare to assign a real backend ID to dummy + * procs, because prepared transactions don't take part in cache + * invalidation like a real backend ID would imply, but having a + * unique ID for them is nevertheless handy. This arrangement + * allows you to allocate an array of size (MaxBackends + + * max_prepared_xacts + 1), and have a slot for every backend and + * prepared transaction. Currently multixact.c uses that + * technique. + */ + gxacts[i].dummyBackendId = MaxBackends + 1 + i; + } + } + else + Assert(found); +} + +/* + * Exit hook to unlock the global transaction entry we're working on. + */ +static void +AtProcExit_Twophase(int code, Datum arg) +{ + /* same logic as abort */ + AtAbort_Twophase(); +} + +/* + * Abort hook to unlock the global transaction entry we're working on. + */ +void +AtAbort_Twophase(void) +{ + if (MyLockedGxact == NULL) + return; + + /* + * What to do with the locked global transaction entry? If we were in the + * process of preparing the transaction, but haven't written the WAL + * record and state file yet, the transaction must not be considered as + * prepared. Likewise, if we are in the process of finishing an + * already-prepared transaction, and fail after having already written the + * 2nd phase commit or rollback record to the WAL, the transaction should + * not be considered as prepared anymore. In those cases, just remove the + * entry from shared memory. + * + * Otherwise, the entry must be left in place so that the transaction can + * be finished later, so just unlock it. + * + * If we abort during prepare, after having written the WAL record, we + * might not have transferred all locks and other state to the prepared + * transaction yet. Likewise, if we abort during commit or rollback, + * after having written the WAL record, we might not have released all the + * resources held by the transaction yet. In those cases, the in-memory + * state can be wrong, but it's too late to back out. + */ + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + if (!MyLockedGxact->valid) + RemoveGXact(MyLockedGxact); + else + MyLockedGxact->locking_backend = InvalidBackendId; + LWLockRelease(TwoPhaseStateLock); + + MyLockedGxact = NULL; +} + +/* + * This is called after we have finished transferring state to the prepared + * PGPROC entry. + */ +void +PostPrepare_Twophase(void) +{ + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + MyLockedGxact->locking_backend = InvalidBackendId; + LWLockRelease(TwoPhaseStateLock); + + MyLockedGxact = NULL; +} + + +/* + * MarkAsPreparing + * Reserve the GID for the given transaction. 
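+ *
+ * (For instance, PREPARE TRANSACTION 'foo' reaches this function from
+ * PrepareTransaction() in xact.c, with gid = "foo".)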
+ */ +GlobalTransaction +MarkAsPreparing(TransactionId xid, const char *gid, + TimestampTz prepared_at, Oid owner, Oid databaseid) +{ + GlobalTransaction gxact; + int i; + + if (strlen(gid) >= GIDSIZE) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("transaction identifier \"%s\" is too long", + gid))); + + /* fail immediately if feature is disabled */ + if (max_prepared_xacts == 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("prepared transactions are disabled"), + errhint("Set max_prepared_transactions to a nonzero value."))); + + /* on first call, register the exit hook */ + if (!twophaseExitRegistered) + { + before_shmem_exit(AtProcExit_Twophase, 0); + twophaseExitRegistered = true; + } + + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + + /* Check for conflicting GID */ + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + gxact = TwoPhaseState->prepXacts[i]; + if (strcmp(gxact->gid, gid) == 0) + { + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("transaction identifier \"%s\" is already in use", + gid))); + } + } + + /* Get a free gxact from the freelist */ + if (TwoPhaseState->freeGXacts == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("maximum number of prepared transactions reached"), + errhint("Increase max_prepared_transactions (currently %d).", + max_prepared_xacts))); + gxact = TwoPhaseState->freeGXacts; + TwoPhaseState->freeGXacts = gxact->next; + + MarkAsPreparingGuts(gxact, xid, gid, prepared_at, owner, databaseid); + + gxact->ondisk = false; + + /* And insert it into the active array */ + Assert(TwoPhaseState->numPrepXacts < max_prepared_xacts); + TwoPhaseState->prepXacts[TwoPhaseState->numPrepXacts++] = gxact; + + LWLockRelease(TwoPhaseStateLock); + + return gxact; +} + +/* + * MarkAsPreparingGuts + * + * This uses a gxact struct and puts it into the active array. + * NOTE: this is also used when reloading a gxact after a crash; so avoid + * assuming that we can use very much backend context. + * + * Note: This function should be called with appropriate locks held. 
+ */ +static void +MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, + TimestampTz prepared_at, Oid owner, Oid databaseid) +{ + PGPROC *proc; + int i; + + Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE)); + + Assert(gxact != NULL); + proc = &ProcGlobal->allProcs[gxact->pgprocno]; + + /* Initialize the PGPROC entry */ + MemSet(proc, 0, sizeof(PGPROC)); + proc->pgprocno = gxact->pgprocno; + SHMQueueElemInit(&(proc->links)); + proc->waitStatus = PROC_WAIT_STATUS_OK; + if (LocalTransactionIdIsValid(MyProc->lxid)) + { + /* clone VXID, for TwoPhaseGetXidByVirtualXID() to find */ + proc->lxid = MyProc->lxid; + proc->backendId = MyBackendId; + } + else + { + Assert(AmStartupProcess() || !IsPostmasterEnvironment); + /* GetLockConflicts() uses this to specify a wait on the XID */ + proc->lxid = xid; + proc->backendId = InvalidBackendId; + } + proc->xid = xid; + Assert(proc->xmin == InvalidTransactionId); + proc->delayChkpt = false; + proc->statusFlags = 0; + proc->delayChkptEnd = false; + proc->pid = 0; + proc->databaseId = databaseid; + proc->roleId = owner; + proc->tempNamespaceId = InvalidOid; + proc->isBackgroundWorker = false; + proc->lwWaiting = false; + proc->lwWaitMode = 0; + proc->waitLock = NULL; + proc->waitProcLock = NULL; + pg_atomic_init_u64(&proc->waitStart, 0); + for (i = 0; i < NUM_LOCK_PARTITIONS; i++) + SHMQueueInit(&(proc->myProcLocks[i])); + /* subxid data must be filled later by GXactLoadSubxactData */ + proc->subxidStatus.overflowed = false; + proc->subxidStatus.count = 0; + + gxact->prepared_at = prepared_at; + gxact->xid = xid; + gxact->owner = owner; + gxact->locking_backend = MyBackendId; + gxact->valid = false; + gxact->inredo = false; + strcpy(gxact->gid, gid); + + /* + * Remember that we have this GlobalTransaction entry locked for us. If we + * abort after this, we must release it. + */ + MyLockedGxact = gxact; +} + +/* + * GXactLoadSubxactData + * + * If the transaction being persisted had any subtransactions, this must + * be called before MarkAsPrepared() to load information into the dummy + * PGPROC. + */ +static void +GXactLoadSubxactData(GlobalTransaction gxact, int nsubxacts, + TransactionId *children) +{ + PGPROC *proc = &ProcGlobal->allProcs[gxact->pgprocno]; + + /* We need no extra lock since the GXACT isn't valid yet */ + if (nsubxacts > PGPROC_MAX_CACHED_SUBXIDS) + { + proc->subxidStatus.overflowed = true; + nsubxacts = PGPROC_MAX_CACHED_SUBXIDS; + } + if (nsubxacts > 0) + { + memcpy(proc->subxids.xids, children, + nsubxacts * sizeof(TransactionId)); + proc->subxidStatus.count = nsubxacts; + } +} + +/* + * MarkAsPrepared + * Mark the GXACT as fully valid, and enter it into the global ProcArray. + * + * lock_held indicates whether caller already holds TwoPhaseStateLock. + */ +static void +MarkAsPrepared(GlobalTransaction gxact, bool lock_held) +{ + /* Lock here may be overkill, but I'm not convinced of that ... */ + if (!lock_held) + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + Assert(!gxact->valid); + gxact->valid = true; + if (!lock_held) + LWLockRelease(TwoPhaseStateLock); + + /* + * Put it into the global ProcArray so TransactionIdIsInProgress considers + * the XID as still running. + */ + ProcArrayAdd(&ProcGlobal->allProcs[gxact->pgprocno]); +} + +/* + * LockGXact + * Locate the prepared transaction and mark it busy for COMMIT or PREPARE. 
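+ *
+ * (For instance, COMMIT PREPARED 'foo' gets here via
+ * FinishPreparedTransaction("foo", true); ROLLBACK PREPARED takes the same
+ * path with isCommit = false.)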
+ */ +static GlobalTransaction +LockGXact(const char *gid, Oid user) +{ + int i; + + /* on first call, register the exit hook */ + if (!twophaseExitRegistered) + { + before_shmem_exit(AtProcExit_Twophase, 0); + twophaseExitRegistered = true; + } + + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + PGPROC *proc = &ProcGlobal->allProcs[gxact->pgprocno]; + + /* Ignore not-yet-valid GIDs */ + if (!gxact->valid) + continue; + if (strcmp(gxact->gid, gid) != 0) + continue; + + /* Found it, but has someone else got it locked? */ + if (gxact->locking_backend != InvalidBackendId) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("prepared transaction with identifier \"%s\" is busy", + gid))); + + if (user != gxact->owner && !superuser_arg(user)) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to finish prepared transaction"), + errhint("Must be superuser or the user that prepared the transaction."))); + + /* + * Note: it probably would be possible to allow committing from + * another database; but at the moment NOTIFY is known not to work and + * there may be some other issues as well. Hence disallow until + * someone gets motivated to make it work. + */ + if (MyDatabaseId != proc->databaseId) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("prepared transaction belongs to another database"), + errhint("Connect to the database where the transaction was prepared to finish it."))); + + /* OK for me to lock it */ + gxact->locking_backend = MyBackendId; + MyLockedGxact = gxact; + + LWLockRelease(TwoPhaseStateLock); + + return gxact; + } + + LWLockRelease(TwoPhaseStateLock); + + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("prepared transaction with identifier \"%s\" does not exist", + gid))); + + /* NOTREACHED */ + return NULL; +} + +/* + * RemoveGXact + * Remove the prepared transaction from the shared memory array. + * + * NB: caller should have already removed it from ProcArray + */ +static void +RemoveGXact(GlobalTransaction gxact) +{ + int i; + + Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE)); + + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + if (gxact == TwoPhaseState->prepXacts[i]) + { + /* remove from the active array */ + TwoPhaseState->numPrepXacts--; + TwoPhaseState->prepXacts[i] = TwoPhaseState->prepXacts[TwoPhaseState->numPrepXacts]; + + /* and put it back in the freelist */ + gxact->next = TwoPhaseState->freeGXacts; + TwoPhaseState->freeGXacts = gxact; + + return; + } + } + + elog(ERROR, "failed to find %p in GlobalTransaction array", gxact); +} + +/* + * Returns an array of all prepared transactions for the user-level + * function pg_prepared_xact. + * + * The returned array and all its elements are copies of internal data + * structures, to minimize the time we need to hold the TwoPhaseStateLock. + * + * WARNING -- we return even those transactions that are not fully prepared + * yet. The caller should filter them out if he doesn't want them. + * + * The returned array is palloc'd. 
+ */ +static int +GetPreparedTransactionList(GlobalTransaction *gxacts) +{ + GlobalTransaction array; + int num; + int i; + + LWLockAcquire(TwoPhaseStateLock, LW_SHARED); + + if (TwoPhaseState->numPrepXacts == 0) + { + LWLockRelease(TwoPhaseStateLock); + + *gxacts = NULL; + return 0; + } + + num = TwoPhaseState->numPrepXacts; + array = (GlobalTransaction) palloc(sizeof(GlobalTransactionData) * num); + *gxacts = array; + for (i = 0; i < num; i++) + memcpy(array + i, TwoPhaseState->prepXacts[i], + sizeof(GlobalTransactionData)); + + LWLockRelease(TwoPhaseStateLock); + + return num; +} + + +/* Working status for pg_prepared_xact */ +typedef struct +{ + GlobalTransaction array; + int ngxacts; + int currIdx; +} Working_State; + +/* + * pg_prepared_xact + * Produce a view with one row per prepared transaction. + * + * This function is here so we don't have to export the + * GlobalTransactionData struct definition. + */ +Datum +pg_prepared_xact(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + Working_State *status; + + if (SRF_IS_FIRSTCALL()) + { + TupleDesc tupdesc; + MemoryContext oldcontext; + + /* create a function context for cross-call persistence */ + funcctx = SRF_FIRSTCALL_INIT(); + + /* + * Switch to memory context appropriate for multiple function calls + */ + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + /* build tupdesc for result tuples */ + /* this had better match pg_prepared_xacts view in system_views.sql */ + tupdesc = CreateTemplateTupleDesc(5); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "transaction", + XIDOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "gid", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "prepared", + TIMESTAMPTZOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "ownerid", + OIDOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 5, "dbid", + OIDOID, -1, 0); + + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + /* + * Collect all the 2PC status information that we will format and send + * out as a result set. + */ + status = (Working_State *) palloc(sizeof(Working_State)); + funcctx->user_fctx = (void *) status; + + status->ngxacts = GetPreparedTransactionList(&status->array); + status->currIdx = 0; + + MemoryContextSwitchTo(oldcontext); + } + + funcctx = SRF_PERCALL_SETUP(); + status = (Working_State *) funcctx->user_fctx; + + while (status->array != NULL && status->currIdx < status->ngxacts) + { + GlobalTransaction gxact = &status->array[status->currIdx++]; + PGPROC *proc = &ProcGlobal->allProcs[gxact->pgprocno]; + Datum values[5]; + bool nulls[5]; + HeapTuple tuple; + Datum result; + + if (!gxact->valid) + continue; + + /* + * Form tuple with appropriate data. + */ + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + values[0] = TransactionIdGetDatum(proc->xid); + values[1] = CStringGetTextDatum(gxact->gid); + values[2] = TimestampTzGetDatum(gxact->prepared_at); + values[3] = ObjectIdGetDatum(gxact->owner); + values[4] = ObjectIdGetDatum(proc->databaseId); + + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + result = HeapTupleGetDatum(tuple); + SRF_RETURN_NEXT(funcctx, result); + } + + SRF_RETURN_DONE(funcctx); +} + +/* + * TwoPhaseGetGXact + * Get the GlobalTransaction struct for a prepared transaction + * specified by XID + * + * If lock_held is set to true, TwoPhaseStateLock will not be taken, so the + * caller had better hold it. 
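pg_prepared_xact above follows the standard funcapi.h protocol for a multi-call set-returning function: one-time setup under SRF_IS_FIRSTCALL(), with cross-call state allocated in multi_call_memory_ctx, then SRF_PERCALL_SETUP() and SRF_RETURN_NEXT() on every call until SRF_RETURN_DONE(). A stripped-down sketch of that protocol, with the two-phase specifics removed, is shown below; the function name count_to_n and its single int4 argument are invented for illustration and are not part of this patch.

#include "postgres.h"

#include "fmgr.h"
#include "funcapi.h"

PG_MODULE_MAGIC;

PG_FUNCTION_INFO_V1(count_to_n);

/*
 * count_to_n(n int4) returns the integers 1..n, one per call, using the
 * same SRF macros as pg_prepared_xact.
 */
Datum
count_to_n(PG_FUNCTION_ARGS)
{
	FuncCallContext *funcctx;

	if (SRF_IS_FIRSTCALL())
	{
		MemoryContext oldcontext;
		int32	   *limit;

		funcctx = SRF_FIRSTCALL_INIT();

		/* cross-call state must live in the multi-call memory context */
		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
		limit = (int32 *) palloc(sizeof(int32));
		*limit = PG_GETARG_INT32(0);
		funcctx->user_fctx = limit;
		MemoryContextSwitchTo(oldcontext);
	}

	funcctx = SRF_PERCALL_SETUP();

	if ((int32) funcctx->call_cntr < *((int32 *) funcctx->user_fctx))
		SRF_RETURN_NEXT(funcctx, Int32GetDatum((int32) funcctx->call_cntr + 1));

	SRF_RETURN_DONE(funcctx);
}

Built as an extension and declared with CREATE FUNCTION ... RETURNS SETOF integer ... LANGUAGE C, SELECT * FROM count_to_n(3) would return three rows; pg_prepared_xact drives the pg_prepared_xacts system view in the same way.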
+ */ +static GlobalTransaction +TwoPhaseGetGXact(TransactionId xid, bool lock_held) +{ + GlobalTransaction result = NULL; + int i; + + static TransactionId cached_xid = InvalidTransactionId; + static GlobalTransaction cached_gxact = NULL; + + Assert(!lock_held || LWLockHeldByMe(TwoPhaseStateLock)); + + /* + * During a recovery, COMMIT PREPARED, or ABORT PREPARED, we'll be called + * repeatedly for the same XID. We can save work with a simple cache. + */ + if (xid == cached_xid) + return cached_gxact; + + if (!lock_held) + LWLockAcquire(TwoPhaseStateLock, LW_SHARED); + + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + + if (gxact->xid == xid) + { + result = gxact; + break; + } + } + + if (!lock_held) + LWLockRelease(TwoPhaseStateLock); + + if (result == NULL) /* should not happen */ + elog(ERROR, "failed to find GlobalTransaction for xid %u", xid); + + cached_xid = xid; + cached_gxact = result; + + return result; +} + +/* + * TwoPhaseGetXidByVirtualXID + * Lookup VXID among xacts prepared since last startup. + * + * (This won't find recovered xacts.) If more than one matches, return any + * and set "have_more" to true. To witness multiple matches, a single + * BackendId must consume 2^32 LXIDs, with no intervening database restart. + */ +TransactionId +TwoPhaseGetXidByVirtualXID(VirtualTransactionId vxid, + bool *have_more) +{ + int i; + TransactionId result = InvalidTransactionId; + + Assert(VirtualTransactionIdIsValid(vxid)); + LWLockAcquire(TwoPhaseStateLock, LW_SHARED); + + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + PGPROC *proc; + VirtualTransactionId proc_vxid; + + if (!gxact->valid) + continue; + proc = &ProcGlobal->allProcs[gxact->pgprocno]; + GET_VXID_FROM_PGPROC(proc_vxid, *proc); + if (VirtualTransactionIdEquals(vxid, proc_vxid)) + { + /* Startup process sets proc->backendId to InvalidBackendId. */ + Assert(!gxact->inredo); + + if (result != InvalidTransactionId) + { + *have_more = true; + break; + } + result = gxact->xid; + } + } + + LWLockRelease(TwoPhaseStateLock); + + return result; +} + +/* + * TwoPhaseGetDummyBackendId + * Get the dummy backend ID for prepared transaction specified by XID + * + * Dummy backend IDs are similar to real backend IDs of real backends. + * They start at MaxBackends + 1, and are unique across all currently active + * real backends and prepared transactions. If lock_held is set to true, + * TwoPhaseStateLock will not be taken, so the caller had better hold it. + */ +BackendId +TwoPhaseGetDummyBackendId(TransactionId xid, bool lock_held) +{ + GlobalTransaction gxact = TwoPhaseGetGXact(xid, lock_held); + + return gxact->dummyBackendId; +} + +/* + * TwoPhaseGetDummyProc + * Get the PGPROC that represents a prepared transaction specified by XID + * + * If lock_held is set to true, TwoPhaseStateLock will not be taken, so the + * caller had better hold it. + */ +PGPROC * +TwoPhaseGetDummyProc(TransactionId xid, bool lock_held) +{ + GlobalTransaction gxact = TwoPhaseGetGXact(xid, lock_held); + + return &ProcGlobal->allProcs[gxact->pgprocno]; +} + +/************************************************************************/ +/* State file support */ +/************************************************************************/ + +#define TwoPhaseFilePath(path, xid) \ + snprintf(path, MAXPGPATH, TWOPHASE_DIR "/%08X", xid) + +/* + * 2PC state file format: + * + * 1. TwoPhaseFileHeader + * 2. 
TransactionId[] (subtransactions) + * 3. RelFileNode[] (files to be deleted at commit) + * 4. RelFileNode[] (files to be deleted at abort) + * 5. SharedInvalidationMessage[] (inval messages to be sent at commit) + * 6. TwoPhaseRecordOnDisk + * 7. ... + * 8. TwoPhaseRecordOnDisk (end sentinel, rmid == TWOPHASE_RM_END_ID) + * 9. checksum (CRC-32C) + * + * Each segment except the final checksum is MAXALIGN'd. + */ + +/* + * Header for a 2PC state file + */ +#define TWOPHASE_MAGIC 0x57F94534 /* format identifier */ + +typedef xl_xact_prepare TwoPhaseFileHeader; + +/* + * Header for each record in a state file + * + * NOTE: len counts only the rmgr data, not the TwoPhaseRecordOnDisk header. + * The rmgr data will be stored starting on a MAXALIGN boundary. + */ +typedef struct TwoPhaseRecordOnDisk +{ + uint32 len; /* length of rmgr data */ + TwoPhaseRmgrId rmid; /* resource manager for this record */ + uint16 info; /* flag bits for use by rmgr */ +} TwoPhaseRecordOnDisk; + +/* + * During prepare, the state file is assembled in memory before writing it + * to WAL and the actual state file. We use a chain of StateFileChunk blocks + * for that. + */ +typedef struct StateFileChunk +{ + char *data; + uint32 len; + struct StateFileChunk *next; +} StateFileChunk; + +static struct xllist +{ + StateFileChunk *head; /* first data block in the chain */ + StateFileChunk *tail; /* last block in chain */ + uint32 num_chunks; + uint32 bytes_free; /* free bytes left in tail block */ + uint32 total_len; /* total data bytes in chain */ +} records; + + +/* + * Append a block of data to records data structure. + * + * NB: each block is padded to a MAXALIGN multiple. This must be + * accounted for when the file is later read! + * + * The data is copied, so the caller is free to modify it afterwards. + */ +static void +save_state_data(const void *data, uint32 len) +{ + uint32 padlen = MAXALIGN(len); + + if (padlen > records.bytes_free) + { + records.tail->next = palloc0(sizeof(StateFileChunk)); + records.tail = records.tail->next; + records.tail->len = 0; + records.tail->next = NULL; + records.num_chunks++; + + records.bytes_free = Max(padlen, 512); + records.tail->data = palloc(records.bytes_free); + } + + memcpy(((char *) records.tail->data) + records.tail->len, data, len); + records.tail->len += padlen; + records.bytes_free -= padlen; + records.total_len += padlen; +} + +/* + * Start preparing a state file. + * + * Initializes data structure and inserts the 2PC file header record. 
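save_state_data rounds every chunk up to a MAXALIGN boundary, and the reader in FinishPreparedTransaction steps over exactly the same padding, so it helps to see what the macro does to a few lengths. The standalone model below assumes 8-byte alignment (the real value is MAXIMUM_ALIGNOF, determined by configure) and mirrors the TYPEALIGN arithmetic from c.h under hypothetical MY_* names.

#include <stdio.h>
#include <stdint.h>

/* simplified stand-ins for MAXIMUM_ALIGNOF / TYPEALIGN / MAXALIGN */
#define MY_MAXIMUM_ALIGNOF	8
#define MY_TYPEALIGN(ALIGNVAL, LEN) \
	(((uintptr_t) (LEN) + ((ALIGNVAL) - 1)) & ~((uintptr_t) ((ALIGNVAL) - 1)))
#define MY_MAXALIGN(LEN)	MY_TYPEALIGN(MY_MAXIMUM_ALIGNOF, (LEN))

int
main(void)
{
	uint32_t	lens[] = {1, 5, 8, 13, 16, 17};

	/* each saved chunk occupies MAXALIGN(len) bytes in the state data */
	for (int i = 0; i < 6; i++)
		printf("len = %2u  padded = %2u\n",
			   lens[i], (uint32_t) MY_MAXALIGN(lens[i]));
	return 0;
}

With 8-byte alignment this prints 1->8, 5->8, 8->8, 13->16, 16->16 and 17->24, which is why each segment lands on a MAXALIGN boundary and the trailing CRC offset is itself MAXALIGN'd, as ReadTwoPhaseFile later verifies.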
+ */ +void +StartPrepare(GlobalTransaction gxact) +{ + PGPROC *proc = &ProcGlobal->allProcs[gxact->pgprocno]; + TransactionId xid = gxact->xid; + TwoPhaseFileHeader hdr; + TransactionId *children; + RelFileNode *commitrels; + RelFileNode *abortrels; + SharedInvalidationMessage *invalmsgs; + + /* Initialize linked list */ + records.head = palloc0(sizeof(StateFileChunk)); + records.head->len = 0; + records.head->next = NULL; + + records.bytes_free = Max(sizeof(TwoPhaseFileHeader), 512); + records.head->data = palloc(records.bytes_free); + + records.tail = records.head; + records.num_chunks = 1; + + records.total_len = 0; + + /* Create header */ + hdr.magic = TWOPHASE_MAGIC; + hdr.total_len = 0; /* EndPrepare will fill this in */ + hdr.xid = xid; + hdr.database = proc->databaseId; + hdr.prepared_at = gxact->prepared_at; + hdr.owner = gxact->owner; + hdr.nsubxacts = xactGetCommittedChildren(&children); + hdr.ncommitrels = smgrGetPendingDeletes(true, &commitrels); + hdr.nabortrels = smgrGetPendingDeletes(false, &abortrels); + hdr.ninvalmsgs = xactGetCommittedInvalidationMessages(&invalmsgs, + &hdr.initfileinval); + hdr.gidlen = strlen(gxact->gid) + 1; /* Include '\0' */ + + save_state_data(&hdr, sizeof(TwoPhaseFileHeader)); + save_state_data(gxact->gid, hdr.gidlen); + + /* + * Add the additional info about subxacts, deletable files and cache + * invalidation messages. + */ + if (hdr.nsubxacts > 0) + { + save_state_data(children, hdr.nsubxacts * sizeof(TransactionId)); + /* While we have the child-xact data, stuff it in the gxact too */ + GXactLoadSubxactData(gxact, hdr.nsubxacts, children); + } + if (hdr.ncommitrels > 0) + { + save_state_data(commitrels, hdr.ncommitrels * sizeof(RelFileNode)); + pfree(commitrels); + } + if (hdr.nabortrels > 0) + { + save_state_data(abortrels, hdr.nabortrels * sizeof(RelFileNode)); + pfree(abortrels); + } + if (hdr.ninvalmsgs > 0) + { + save_state_data(invalmsgs, + hdr.ninvalmsgs * sizeof(SharedInvalidationMessage)); + pfree(invalmsgs); + } +} + +/* + * Finish preparing state data and writing it to WAL. + */ +void +EndPrepare(GlobalTransaction gxact) +{ + TwoPhaseFileHeader *hdr; + StateFileChunk *record; + bool replorigin; + + /* Add the end sentinel to the list of 2PC records */ + RegisterTwoPhaseRecord(TWOPHASE_RM_END_ID, 0, + NULL, 0); + + /* Go back and fill in total_len in the file header record */ + hdr = (TwoPhaseFileHeader *) records.head->data; + Assert(hdr->magic == TWOPHASE_MAGIC); + hdr->total_len = records.total_len + sizeof(pg_crc32c); + + replorigin = (replorigin_session_origin != InvalidRepOriginId && + replorigin_session_origin != DoNotReplicateId); + + if (replorigin) + { + hdr->origin_lsn = replorigin_session_origin_lsn; + hdr->origin_timestamp = replorigin_session_origin_timestamp; + } + else + { + hdr->origin_lsn = InvalidXLogRecPtr; + hdr->origin_timestamp = 0; + } + + /* + * If the data size exceeds MaxAllocSize, we won't be able to read it in + * ReadTwoPhaseFile. Check for that now, rather than fail in the case + * where we write data to file and then re-read at commit time. + */ + if (hdr->total_len > MaxAllocSize) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("two-phase state file maximum length exceeded"))); + + /* + * Now writing 2PC state data to WAL. We let the WAL's CRC protection + * cover us, so no need to calculate a separate CRC. 
+ * + * We have to set delayChkpt here, too; otherwise a checkpoint starting + * immediately after the WAL record is inserted could complete without + * fsync'ing our state file. (This is essentially the same kind of race + * condition as the COMMIT-to-clog-write case that RecordTransactionCommit + * uses delayChkpt for; see notes there.) + * + * We save the PREPARE record's location in the gxact for later use by + * CheckPointTwoPhase. + */ + XLogEnsureRecordSpace(0, records.num_chunks); + + START_CRIT_SECTION(); + + Assert(!MyProc->delayChkpt); + MyProc->delayChkpt = true; + + XLogBeginInsert(); + for (record = records.head; record != NULL; record = record->next) + XLogRegisterData(record->data, record->len); + + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); + + gxact->prepare_end_lsn = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE); + + if (replorigin) + { + /* Move LSNs forward for this replication origin */ + replorigin_session_advance(replorigin_session_origin_lsn, + gxact->prepare_end_lsn); + } + + XLogFlush(gxact->prepare_end_lsn); + + /* If we crash now, we have prepared: WAL replay will fix things */ + + /* Store record's start location to read that later on Commit */ + gxact->prepare_start_lsn = ProcLastRecPtr; + + /* + * Mark the prepared transaction as valid. As soon as xact.c marks MyProc + * as not running our XID (which it will do immediately after this + * function returns), others can commit/rollback the xact. + * + * NB: a side effect of this is to make a dummy ProcArray entry for the + * prepared XID. This must happen before we clear the XID from MyProc / + * ProcGlobal->xids[], else there is a window where the XID is not running + * according to TransactionIdIsInProgress, and onlookers would be entitled + * to assume the xact crashed. Instead we have a window where the same + * XID appears twice in ProcArray, which is OK. + */ + MarkAsPrepared(gxact, false); + + /* + * Now we can mark ourselves as out of the commit critical section: a + * checkpoint starting after this will certainly see the gxact as a + * candidate for fsyncing. + */ + MyProc->delayChkpt = false; + + /* + * Remember that we have this GlobalTransaction entry locked for us. If + * we crash after this point, it's too late to abort, but we must unlock + * it so that the prepared transaction can be committed or rolled back. + */ + MyLockedGxact = gxact; + + END_CRIT_SECTION(); + + /* + * Wait for synchronous replication, if required. + * + * Note that at this stage we have marked the prepare, but still show as + * running in the procarray (twice!) and continue to hold locks. + */ + SyncRepWaitForLSN(gxact->prepare_end_lsn, false); + + records.tail = records.head = NULL; + records.num_chunks = 0; +} + +/* + * Register a 2PC record to be written to state file. + */ +void +RegisterTwoPhaseRecord(TwoPhaseRmgrId rmid, uint16 info, + const void *data, uint32 len) +{ + TwoPhaseRecordOnDisk record; + + record.rmid = rmid; + record.info = info; + record.len = len; + save_state_data(&record, sizeof(TwoPhaseRecordOnDisk)); + if (len > 0) + save_state_data(data, len); +} + + +/* + * Read and validate the state file for xid. + * + * If it looks OK (has a valid magic number and CRC), return the palloc'd + * contents of the file, issuing an error when finding corrupted data. If + * missing_ok is true, which indicates that missing files can be safely + * ignored, then return NULL. This state can be reached when doing recovery. 
+ */ +static char * +ReadTwoPhaseFile(TransactionId xid, bool missing_ok) +{ + char path[MAXPGPATH]; + char *buf; + TwoPhaseFileHeader *hdr; + int fd; + struct stat stat; + uint32 crc_offset; + pg_crc32c calc_crc, + file_crc; + int r; + + TwoPhaseFilePath(path, xid); + + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + if (fd < 0) + { + if (missing_ok && errno == ENOENT) + return NULL; + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + } + + /* + * Check file length. We can determine a lower bound pretty easily. We + * set an upper bound to avoid palloc() failure on a corrupt file, though + * we can't guarantee that we won't get an out of memory error anyway, + * even on a valid file. + */ + if (fstat(fd, &stat)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", path))); + + if (stat.st_size < (MAXALIGN(sizeof(TwoPhaseFileHeader)) + + MAXALIGN(sizeof(TwoPhaseRecordOnDisk)) + + sizeof(pg_crc32c)) || + stat.st_size > MaxAllocSize) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_plural("incorrect size of file \"%s\": %lld byte", + "incorrect size of file \"%s\": %lld bytes", + (long long int) stat.st_size, path, + (long long int) stat.st_size))); + + crc_offset = stat.st_size - sizeof(pg_crc32c); + if (crc_offset != MAXALIGN(crc_offset)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("incorrect alignment of CRC offset for file \"%s\"", + path))); + + /* + * OK, slurp in the file. + */ + buf = (char *) palloc(stat.st_size); + + pgstat_report_wait_start(WAIT_EVENT_TWOPHASE_FILE_READ); + r = read(fd, buf, stat.st_size); + if (r != stat.st_size) + { + if (r < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", path))); + else + ereport(ERROR, + (errmsg("could not read file \"%s\": read %d of %lld", + path, r, (long long int) stat.st_size))); + } + + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); + + hdr = (TwoPhaseFileHeader *) buf; + if (hdr->magic != TWOPHASE_MAGIC) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("invalid magic number stored in file \"%s\"", + path))); + + if (hdr->total_len != stat.st_size) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("invalid size stored in file \"%s\"", + path))); + + INIT_CRC32C(calc_crc); + COMP_CRC32C(calc_crc, buf, crc_offset); + FIN_CRC32C(calc_crc); + + file_crc = *((pg_crc32c *) (buf + crc_offset)); + + if (!EQ_CRC32C(calc_crc, file_crc)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("calculated CRC checksum does not match value stored in file \"%s\"", + path))); + + return buf; +} + + +/* + * Reads 2PC data from xlog. During checkpoint this data will be moved to + * twophase files and ReadTwoPhaseFile should be used instead. + * + * Note clearly that this function can access WAL during normal operation, + * similarly to the way WALSender or Logical Decoding would do. While + * accessing WAL, read_local_xlog_page() may change ThisTimeLineID, + * particularly if this routine is called for the end-of-recovery checkpoint + * in the checkpointer itself, so save the current timeline number value + * and restore it once done. 
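ReadTwoPhaseFile above treats the last four bytes of the file as a CRC-32C computed over everything before them. The standalone sketch below builds a small buffer, appends such a checksum, and then validates it the same way; the bitwise CRC-32C used here (reflected polynomial 0x82F63B78, initial and final value 0xFFFFFFFF) is expected to agree with the backend's optimized pg_crc32c routines, but that equivalence is an assumption of this example, not something the patch itself states.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* bitwise CRC-32C (Castagnoli), reflected, init/final 0xFFFFFFFF */
static uint32_t
crc32c(const void *data, size_t len)
{
	const uint8_t *p = data;
	uint32_t	crc = 0xFFFFFFFF;

	for (size_t i = 0; i < len; i++)
	{
		crc ^= p[i];
		for (int k = 0; k < 8; k++)
			crc = (crc & 1) ? (crc >> 1) ^ 0x82F63B78 : (crc >> 1);
	}
	return crc ^ 0xFFFFFFFF;
}

int
main(void)
{
	uint8_t		file[64];
	size_t		payload_len = 24;	/* stands in for the serialized 2PC state */
	size_t		file_len;
	uint32_t	stored,
				calc;

	/* "write": payload followed by its CRC, as EndPrepare/RecreateTwoPhaseFile do */
	memset(file, 0xAB, payload_len);
	calc = crc32c(file, payload_len);
	memcpy(file + payload_len, &calc, sizeof(calc));
	file_len = payload_len + sizeof(calc);

	/* "read": recompute over everything before the trailing CRC and compare */
	memcpy(&stored, file + file_len - sizeof(stored), sizeof(stored));
	calc = crc32c(file, file_len - sizeof(stored));
	printf("checksum %s\n", calc == stored ? "matches" : "does not match");
	return 0;
}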
+ */ +static void +XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len) +{ + XLogRecord *record; + XLogReaderState *xlogreader; + char *errormsg; + TimeLineID save_currtli = ThisTimeLineID; + + xlogreader = XLogReaderAllocate(wal_segment_size, NULL, + XL_ROUTINE(.page_read = &read_local_xlog_page, + .segment_open = &wal_segment_open, + .segment_close = &wal_segment_close), + NULL); + if (!xlogreader) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed while allocating a WAL reading processor."))); + + XLogBeginRead(xlogreader, lsn); + record = XLogReadRecord(xlogreader, &errormsg); + + /* + * Restore immediately the timeline where it was previously, as + * read_local_xlog_page() could have changed it if the record was read + * while recovery was finishing or if the timeline has jumped in-between. + */ + ThisTimeLineID = save_currtli; + + if (record == NULL) + { + if (errormsg) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read two-phase state from WAL at %X/%X: %s", + LSN_FORMAT_ARGS(lsn), errormsg))); + else + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read two-phase state from WAL at %X/%X", + LSN_FORMAT_ARGS(lsn)))); + } + + if (XLogRecGetRmid(xlogreader) != RM_XACT_ID || + (XLogRecGetInfo(xlogreader) & XLOG_XACT_OPMASK) != XLOG_XACT_PREPARE) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("expected two-phase state data is not present in WAL at %X/%X", + LSN_FORMAT_ARGS(lsn)))); + + if (len != NULL) + *len = XLogRecGetDataLen(xlogreader); + + *buf = palloc(sizeof(char) * XLogRecGetDataLen(xlogreader)); + memcpy(*buf, XLogRecGetData(xlogreader), sizeof(char) * XLogRecGetDataLen(xlogreader)); + + XLogReaderFree(xlogreader); +} + + +/* + * Confirms an xid is prepared, during recovery + */ +bool +StandbyTransactionIdIsPrepared(TransactionId xid) +{ + char *buf; + TwoPhaseFileHeader *hdr; + bool result; + + Assert(TransactionIdIsValid(xid)); + + if (max_prepared_xacts <= 0) + return false; /* nothing to do */ + + /* Read and validate file */ + buf = ReadTwoPhaseFile(xid, true); + if (buf == NULL) + return false; + + /* Check header also */ + hdr = (TwoPhaseFileHeader *) buf; + result = TransactionIdEquals(hdr->xid, xid); + pfree(buf); + + return result; +} + +/* + * FinishPreparedTransaction: execute COMMIT PREPARED or ROLLBACK PREPARED + */ +void +FinishPreparedTransaction(const char *gid, bool isCommit) +{ + GlobalTransaction gxact; + PGPROC *proc; + TransactionId xid; + char *buf; + char *bufptr; + TwoPhaseFileHeader *hdr; + TransactionId latestXid; + TransactionId *children; + RelFileNode *commitrels; + RelFileNode *abortrels; + RelFileNode *delrels; + int ndelrels; + SharedInvalidationMessage *invalmsgs; + + /* + * Validate the GID, and lock the GXACT to ensure that two backends do not + * try to commit the same GID at once. + */ + gxact = LockGXact(gid, GetUserId()); + proc = &ProcGlobal->allProcs[gxact->pgprocno]; + xid = gxact->xid; + + /* + * Read and validate 2PC state data. State data will typically be stored + * in WAL files if the LSN is after the last checkpoint record, or moved + * to disk if for some reason they have lived for a long time. 
+ */ + if (gxact->ondisk) + buf = ReadTwoPhaseFile(xid, false); + else + XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, NULL); + + + /* + * Disassemble the header area + */ + hdr = (TwoPhaseFileHeader *) buf; + Assert(TransactionIdEquals(hdr->xid, xid)); + bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader)); + bufptr += MAXALIGN(hdr->gidlen); + children = (TransactionId *) bufptr; + bufptr += MAXALIGN(hdr->nsubxacts * sizeof(TransactionId)); + commitrels = (RelFileNode *) bufptr; + bufptr += MAXALIGN(hdr->ncommitrels * sizeof(RelFileNode)); + abortrels = (RelFileNode *) bufptr; + bufptr += MAXALIGN(hdr->nabortrels * sizeof(RelFileNode)); + invalmsgs = (SharedInvalidationMessage *) bufptr; + bufptr += MAXALIGN(hdr->ninvalmsgs * sizeof(SharedInvalidationMessage)); + + /* compute latestXid among all children */ + latestXid = TransactionIdLatest(xid, hdr->nsubxacts, children); + + /* Prevent cancel/die interrupt while cleaning up */ + HOLD_INTERRUPTS(); + + /* + * The order of operations here is critical: make the XLOG entry for + * commit or abort, then mark the transaction committed or aborted in + * pg_xact, then remove its PGPROC from the global ProcArray (which means + * TransactionIdIsInProgress will stop saying the prepared xact is in + * progress), then run the post-commit or post-abort callbacks. The + * callbacks will release the locks the transaction held. + */ + if (isCommit) + RecordTransactionCommitPrepared(xid, + hdr->nsubxacts, children, + hdr->ncommitrels, commitrels, + hdr->ninvalmsgs, invalmsgs, + hdr->initfileinval, gid); + else + RecordTransactionAbortPrepared(xid, + hdr->nsubxacts, children, + hdr->nabortrels, abortrels, + gid); + + ProcArrayRemove(proc, latestXid); + + /* + * In case we fail while running the callbacks, mark the gxact invalid so + * no one else will try to commit/rollback, and so it will be recycled if + * we fail after this point. It is still locked by our backend so it + * won't go away yet. + * + * (We assume it's safe to do this without taking TwoPhaseStateLock.) + */ + gxact->valid = false; + + /* + * We have to remove any files that were supposed to be dropped. For + * consistency with the regular xact.c code paths, must do this before + * releasing locks, so do it before running the callbacks. + * + * NB: this code knows that we couldn't be dropping any temp rels ... + */ + if (isCommit) + { + delrels = commitrels; + ndelrels = hdr->ncommitrels; + } + else + { + delrels = abortrels; + ndelrels = hdr->nabortrels; + } + + /* Make sure files supposed to be dropped are dropped */ + DropRelationFiles(delrels, ndelrels, false); + + /* + * Handle cache invalidation messages. + * + * Relcache init file invalidation requires processing both before and + * after we send the SI messages. See AtEOXact_Inval() + */ + if (hdr->initfileinval) + RelationCacheInitFilePreInvalidate(); + SendSharedInvalidMessages(invalmsgs, hdr->ninvalmsgs); + if (hdr->initfileinval) + RelationCacheInitFilePostInvalidate(); + + /* + * Acquire the two-phase lock. We want to work on the two-phase callbacks + * while holding it to avoid potential conflicts with other transactions + * attempting to use the same GID, so the lock is released once the shared + * memory state is cleared. 
+ */ + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + + /* And now do the callbacks */ + if (isCommit) + ProcessRecords(bufptr, xid, twophase_postcommit_callbacks); + else + ProcessRecords(bufptr, xid, twophase_postabort_callbacks); + + PredicateLockTwoPhaseFinish(xid, isCommit); + + /* Clear shared memory state */ + RemoveGXact(gxact); + + /* + * Release the lock as all callbacks are called and shared memory cleanup + * is done. + */ + LWLockRelease(TwoPhaseStateLock); + + /* Count the prepared xact as committed or aborted */ + AtEOXact_PgStat(isCommit, false); + + /* + * And now we can clean up any files we may have left. + */ + if (gxact->ondisk) + RemoveTwoPhaseFile(xid, true); + + MyLockedGxact = NULL; + + RESUME_INTERRUPTS(); + + pfree(buf); +} + +/* + * Scan 2PC state data in memory and call the indicated callbacks for each 2PC record. + */ +static void +ProcessRecords(char *bufptr, TransactionId xid, + const TwoPhaseCallback callbacks[]) +{ + for (;;) + { + TwoPhaseRecordOnDisk *record = (TwoPhaseRecordOnDisk *) bufptr; + + Assert(record->rmid <= TWOPHASE_RM_MAX_ID); + if (record->rmid == TWOPHASE_RM_END_ID) + break; + + bufptr += MAXALIGN(sizeof(TwoPhaseRecordOnDisk)); + + if (callbacks[record->rmid] != NULL) + callbacks[record->rmid] (xid, record->info, + (void *) bufptr, record->len); + + bufptr += MAXALIGN(record->len); + } +} + +/* + * Remove the 2PC file for the specified XID. + * + * If giveWarning is false, do not complain about file-not-present; + * this is an expected case during WAL replay. + */ +static void +RemoveTwoPhaseFile(TransactionId xid, bool giveWarning) +{ + char path[MAXPGPATH]; + + TwoPhaseFilePath(path, xid); + if (unlink(path)) + if (errno != ENOENT || giveWarning) + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", path))); +} + +/* + * Recreates a state file. This is used in WAL replay and during + * checkpoint creation. + * + * Note: content and len don't include CRC. + */ +static void +RecreateTwoPhaseFile(TransactionId xid, void *content, int len) +{ + char path[MAXPGPATH]; + pg_crc32c statefile_crc; + int fd; + + /* Recompute CRC */ + INIT_CRC32C(statefile_crc); + COMP_CRC32C(statefile_crc, content, len); + FIN_CRC32C(statefile_crc); + + TwoPhaseFilePath(path, xid); + + fd = OpenTransientFile(path, + O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not recreate file \"%s\": %m", path))); + + /* Write content and CRC */ + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_TWOPHASE_FILE_WRITE); + if (write(fd, content, len) != len) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", path))); + } + if (write(fd, &statefile_crc, sizeof(pg_crc32c)) != sizeof(pg_crc32c)) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", path))); + } + pgstat_report_wait_end(); + + /* + * We must fsync the file because the end-of-replay checkpoint will not do + * so, there being no GXACT in shared memory yet to tell it to. 
+ */ + pgstat_report_wait_start(WAIT_EVENT_TWOPHASE_FILE_SYNC); + if (pg_fsync(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", path))); + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); +} + +/* + * CheckPointTwoPhase -- handle 2PC component of checkpointing. + * + * We must fsync the state file of any GXACT that is valid or has been + * generated during redo and has a PREPARE LSN <= the checkpoint's redo + * horizon. (If the gxact isn't valid yet, has not been generated in + * redo, or has a later LSN, this checkpoint is not responsible for + * fsyncing it.) + * + * This is deliberately run as late as possible in the checkpoint sequence, + * because GXACTs ordinarily have short lifespans, and so it is quite + * possible that GXACTs that were valid at checkpoint start will no longer + * exist if we wait a little bit. With typical checkpoint settings this + * will be about 3 minutes for an online checkpoint, so as a result we + * expect that there will be no GXACTs that need to be copied to disk. + * + * If a GXACT remains valid across multiple checkpoints, it will already + * be on disk so we don't bother to repeat that write. + */ +void +CheckPointTwoPhase(XLogRecPtr redo_horizon) +{ + int i; + int serialized_xacts = 0; + + if (max_prepared_xacts <= 0) + return; /* nothing to do */ + + TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_START(); + + /* + * We are expecting there to be zero GXACTs that need to be copied to + * disk, so we perform all I/O while holding TwoPhaseStateLock for + * simplicity. This prevents any new xacts from preparing while this + * occurs, which shouldn't be a problem since the presence of long-lived + * prepared xacts indicates the transaction manager isn't active. + * + * It's also possible to move I/O out of the lock, but on every error we + * should check whether somebody committed our transaction in different + * backend. Let's leave this optimization for future, if somebody will + * spot that this place cause bottleneck. + * + * Note that it isn't possible for there to be a GXACT with a + * prepare_end_lsn set prior to the last checkpoint yet is marked invalid, + * because of the efforts with delayChkpt. + */ + LWLockAcquire(TwoPhaseStateLock, LW_SHARED); + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + /* + * Note that we are using gxact not PGPROC so this works in recovery + * also + */ + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + + if ((gxact->valid || gxact->inredo) && + !gxact->ondisk && + gxact->prepare_end_lsn <= redo_horizon) + { + char *buf; + int len; + + XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, &len); + RecreateTwoPhaseFile(gxact->xid, buf, len); + gxact->ondisk = true; + gxact->prepare_start_lsn = InvalidXLogRecPtr; + gxact->prepare_end_lsn = InvalidXLogRecPtr; + pfree(buf); + serialized_xacts++; + } + } + LWLockRelease(TwoPhaseStateLock); + + /* + * Flush unconditionally the parent directory to make any information + * durable on disk. Two-phase files could have been removed and those + * removals need to be made persistent as well as any files newly created + * previously since the last checkpoint. 
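Both the pg_fsync above (in RecreateTwoPhaseFile) and the fsync_fname(TWOPHASE_DIR, true) call just below (in CheckPointTwoPhase) follow the usual two-step durability rule: flush the file itself, then flush the directory that holds it so that the creation or removal of the directory entry survives a crash too. A standalone sketch of that rule, using plain POSIX calls rather than the backend's OpenTransientFile/pg_fsync/fsync_fname wrappers (the helper name durable_create is invented), looks like this:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/*
 * Durably create dir/name with the given contents: write and fsync the
 * file, then fsync the directory so the new entry itself is persistent.
 * Returns 0 on success, -1 on failure.
 */
static int
durable_create(const char *dir, const char *name, const void *data, size_t len)
{
	char		path[1024];
	int			fd;
	int			dirfd;

	snprintf(path, sizeof(path), "%s/%s", dir, name);

	fd = open(path, O_CREAT | O_TRUNC | O_WRONLY, 0600);
	if (fd < 0)
		return -1;
	if (write(fd, data, len) != (ssize_t) len || fsync(fd) != 0)
	{
		close(fd);
		return -1;
	}
	if (close(fd) != 0)
		return -1;

	/* make the directory entry durable as well */
	dirfd = open(dir, O_RDONLY);
	if (dirfd < 0)
		return -1;
	if (fsync(dirfd) != 0)
	{
		close(dirfd);
		return -1;
	}
	return close(dirfd);
}

int
main(void)
{
	const char *payload = "example 2PC state";

	return durable_create(".", "00000203", payload, strlen(payload)) == 0 ? 0 : 1;
}

Opening a directory read-only and fsync'ing it works on Linux and most Unixes, which is the environment this sketch assumes.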
+ */ + fsync_fname(TWOPHASE_DIR, true); + + TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_DONE(); + + if (log_checkpoints && serialized_xacts > 0) + ereport(LOG, + (errmsg_plural("%u two-phase state file was written " + "for a long-running prepared transaction", + "%u two-phase state files were written " + "for long-running prepared transactions", + serialized_xacts, + serialized_xacts))); +} + +/* + * restoreTwoPhaseData + * + * Scan pg_twophase and fill TwoPhaseState depending on the on-disk data. + * This is called once at the beginning of recovery, saving any extra + * lookups in the future. Two-phase files that are newer than the + * minimum XID horizon are discarded on the way. + */ +void +restoreTwoPhaseData(void) +{ + DIR *cldir; + struct dirent *clde; + + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + cldir = AllocateDir(TWOPHASE_DIR); + while ((clde = ReadDir(cldir, TWOPHASE_DIR)) != NULL) + { + if (strlen(clde->d_name) == 8 && + strspn(clde->d_name, "0123456789ABCDEF") == 8) + { + TransactionId xid; + char *buf; + + xid = (TransactionId) strtoul(clde->d_name, NULL, 16); + + buf = ProcessTwoPhaseBuffer(xid, InvalidXLogRecPtr, + true, false, false); + if (buf == NULL) + continue; + + PrepareRedoAdd(buf, InvalidXLogRecPtr, + InvalidXLogRecPtr, InvalidRepOriginId); + } + } + LWLockRelease(TwoPhaseStateLock); + FreeDir(cldir); +} + +/* + * PrescanPreparedTransactions + * + * Scan the shared memory entries of TwoPhaseState and determine the range + * of valid XIDs present. This is run during database startup, after we + * have completed reading WAL. ShmemVariableCache->nextXid has been set to + * one more than the highest XID for which evidence exists in WAL. + * + * We throw away any prepared xacts with main XID beyond nextXid --- if any + * are present, it suggests that the DBA has done a PITR recovery to an + * earlier point in time without cleaning out pg_twophase. We dare not + * try to recover such prepared xacts since they likely depend on database + * state that doesn't exist now. + * + * However, we will advance nextXid beyond any subxact XIDs belonging to + * valid prepared xacts. We need to do this since subxact commit doesn't + * write a WAL entry, and so there might be no evidence in WAL of those + * subxact XIDs. + * + * On corrupted two-phase files, fail immediately. Keeping around broken + * entries and let replay continue causes harm on the system, and a new + * backup should be rolled in. + * + * Our other responsibility is to determine and return the oldest valid XID + * among the prepared xacts (if none, return ShmemVariableCache->nextXid). + * This is needed to synchronize pg_subtrans startup properly. + * + * If xids_p and nxids_p are not NULL, pointer to a palloc'd array of all + * top-level xids is stored in *xids_p. The number of entries in the array + * is returned in *nxids_p. 
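restoreTwoPhaseData above only accepts pg_twophase entries whose names are exactly eight upper-case hex digits, the inverse of the "%08X" format produced by TwoPhaseFilePath. A standalone round trip of that naming convention follows; the helper names xid_to_filename and filename_to_xid are invented for illustration.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

typedef uint32_t TransactionId;	/* stand-in for the backend typedef */

/* format a state-file name the way TwoPhaseFilePath does */
static void
xid_to_filename(TransactionId xid, char *name, size_t len)
{
	snprintf(name, len, "%08X", xid);
}

/* accept only names restoreTwoPhaseData would: exactly 8 upper-case hex digits */
static int
filename_to_xid(const char *name, TransactionId *xid)
{
	if (strlen(name) != 8 || strspn(name, "0123456789ABCDEF") != 8)
		return 0;
	*xid = (TransactionId) strtoul(name, NULL, 16);
	return 1;
}

int
main(void)
{
	char		name[16];
	TransactionId parsed;

	xid_to_filename(727, name, sizeof(name));	/* "000002D7" */
	if (filename_to_xid(name, &parsed))
		printf("file %s holds state for xid %u\n", name, (unsigned) parsed);
	return 0;
}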
+ */ +TransactionId +PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p) +{ + FullTransactionId nextXid = ShmemVariableCache->nextXid; + TransactionId origNextXid = XidFromFullTransactionId(nextXid); + TransactionId result = origNextXid; + TransactionId *xids = NULL; + int nxids = 0; + int allocsize = 0; + int i; + + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + TransactionId xid; + char *buf; + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + + Assert(gxact->inredo); + + xid = gxact->xid; + + buf = ProcessTwoPhaseBuffer(xid, + gxact->prepare_start_lsn, + gxact->ondisk, false, true); + + if (buf == NULL) + continue; + + /* + * OK, we think this file is valid. Incorporate xid into the + * running-minimum result. + */ + if (TransactionIdPrecedes(xid, result)) + result = xid; + + if (xids_p) + { + if (nxids == allocsize) + { + if (nxids == 0) + { + allocsize = 10; + xids = palloc(allocsize * sizeof(TransactionId)); + } + else + { + allocsize = allocsize * 2; + xids = repalloc(xids, allocsize * sizeof(TransactionId)); + } + } + xids[nxids++] = xid; + } + + pfree(buf); + } + LWLockRelease(TwoPhaseStateLock); + + if (xids_p) + { + *xids_p = xids; + *nxids_p = nxids; + } + + return result; +} + +/* + * StandbyRecoverPreparedTransactions + * + * Scan the shared memory entries of TwoPhaseState and setup all the required + * information to allow standby queries to treat prepared transactions as still + * active. + * + * This is never called at the end of recovery - we use + * RecoverPreparedTransactions() at that point. + * + * The lack of calls to SubTransSetParent() calls here is by design; + * those calls are made by RecoverPreparedTransactions() at the end of recovery + * for those xacts that need this. + */ +void +StandbyRecoverPreparedTransactions(void) +{ + int i; + + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + TransactionId xid; + char *buf; + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + + Assert(gxact->inredo); + + xid = gxact->xid; + + buf = ProcessTwoPhaseBuffer(xid, + gxact->prepare_start_lsn, + gxact->ondisk, false, false); + if (buf != NULL) + pfree(buf); + } + LWLockRelease(TwoPhaseStateLock); +} + +/* + * RecoverPreparedTransactions + * + * Scan the shared memory entries of TwoPhaseState and reload the state for + * each prepared transaction (reacquire locks, etc). + * + * This is run at the end of recovery, but before we allow backends to write + * WAL. + * + * At the end of recovery the way we take snapshots will change. We now need + * to mark all running transactions with their full SubTransSetParent() info + * to allow normal snapshots to work correctly if snapshots overflow. + * We do this here because by definition prepared transactions are the only + * type of write transaction still running, so this is necessary and + * complete. + */ +void +RecoverPreparedTransactions(void) +{ + int i; + + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + TransactionId xid; + char *buf; + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + char *bufptr; + TwoPhaseFileHeader *hdr; + TransactionId *subxids; + const char *gid; + + xid = gxact->xid; + + /* + * Reconstruct subtrans state for the transaction --- needed because + * pg_subtrans is not preserved over a restart. 
Note that we are + * linking all the subtransactions directly to the top-level XID; + * there may originally have been a more complex hierarchy, but + * there's no need to restore that exactly. It's possible that + * SubTransSetParent has been set before, if the prepared transaction + * generated xid assignment records. + */ + buf = ProcessTwoPhaseBuffer(xid, + gxact->prepare_start_lsn, + gxact->ondisk, true, false); + if (buf == NULL) + continue; + + ereport(LOG, + (errmsg("recovering prepared transaction %u from shared memory", xid))); + + hdr = (TwoPhaseFileHeader *) buf; + Assert(TransactionIdEquals(hdr->xid, xid)); + bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader)); + gid = (const char *) bufptr; + bufptr += MAXALIGN(hdr->gidlen); + subxids = (TransactionId *) bufptr; + bufptr += MAXALIGN(hdr->nsubxacts * sizeof(TransactionId)); + bufptr += MAXALIGN(hdr->ncommitrels * sizeof(RelFileNode)); + bufptr += MAXALIGN(hdr->nabortrels * sizeof(RelFileNode)); + bufptr += MAXALIGN(hdr->ninvalmsgs * sizeof(SharedInvalidationMessage)); + + /* + * Recreate its GXACT and dummy PGPROC. But, check whether it was + * added in redo and already has a shmem entry for it. + */ + MarkAsPreparingGuts(gxact, xid, gid, + hdr->prepared_at, + hdr->owner, hdr->database); + + /* recovered, so reset the flag for entries generated by redo */ + gxact->inredo = false; + + GXactLoadSubxactData(gxact, hdr->nsubxacts, subxids); + MarkAsPrepared(gxact, true); + + LWLockRelease(TwoPhaseStateLock); + + /* + * Recover other state (notably locks) using resource managers. + */ + ProcessRecords(bufptr, xid, twophase_recover_callbacks); + + /* + * Release locks held by the standby process after we process each + * prepared transaction. As a result, we don't need too many + * additional locks at any one time. + */ + if (InHotStandby) + StandbyReleaseLockTree(xid, hdr->nsubxacts, subxids); + + /* + * We're done with recovering this transaction. Clear MyLockedGxact, + * like we do in PrepareTransaction() during normal operation. + */ + PostPrepare_Twophase(); + + pfree(buf); + + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + } + + LWLockRelease(TwoPhaseStateLock); +} + +/* + * ProcessTwoPhaseBuffer + * + * Given a transaction id, read it either from disk or read it directly + * via shmem xlog record pointer using the provided "prepare_start_lsn". + * + * If setParent is true, set up subtransaction parent linkages. + * + * If setNextXid is true, set ShmemVariableCache->nextXid to the newest + * value scanned. + */ +static char * +ProcessTwoPhaseBuffer(TransactionId xid, + XLogRecPtr prepare_start_lsn, + bool fromdisk, + bool setParent, bool setNextXid) +{ + FullTransactionId nextXid = ShmemVariableCache->nextXid; + TransactionId origNextXid = XidFromFullTransactionId(nextXid); + TransactionId *subxids; + char *buf; + TwoPhaseFileHeader *hdr; + int i; + + Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE)); + + if (!fromdisk) + Assert(prepare_start_lsn != InvalidXLogRecPtr); + + /* Already processed? 
*/ + if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid)) + { + if (fromdisk) + { + ereport(WARNING, + (errmsg("removing stale two-phase state file for transaction %u", + xid))); + RemoveTwoPhaseFile(xid, true); + } + else + { + ereport(WARNING, + (errmsg("removing stale two-phase state from memory for transaction %u", + xid))); + PrepareRedoRemove(xid, true); + } + return NULL; + } + + /* Reject XID if too new */ + if (TransactionIdFollowsOrEquals(xid, origNextXid)) + { + if (fromdisk) + { + ereport(WARNING, + (errmsg("removing future two-phase state file for transaction %u", + xid))); + RemoveTwoPhaseFile(xid, true); + } + else + { + ereport(WARNING, + (errmsg("removing future two-phase state from memory for transaction %u", + xid))); + PrepareRedoRemove(xid, true); + } + return NULL; + } + + if (fromdisk) + { + /* Read and validate file */ + buf = ReadTwoPhaseFile(xid, false); + } + else + { + /* Read xlog data */ + XlogReadTwoPhaseData(prepare_start_lsn, &buf, NULL); + } + + /* Deconstruct header */ + hdr = (TwoPhaseFileHeader *) buf; + if (!TransactionIdEquals(hdr->xid, xid)) + { + if (fromdisk) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted two-phase state file for transaction %u", + xid))); + else + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted two-phase state in memory for transaction %u", + xid))); + } + + /* + * Examine subtransaction XIDs ... they should all follow main XID, and + * they may force us to advance nextXid. + */ + subxids = (TransactionId *) (buf + + MAXALIGN(sizeof(TwoPhaseFileHeader)) + + MAXALIGN(hdr->gidlen)); + for (i = 0; i < hdr->nsubxacts; i++) + { + TransactionId subxid = subxids[i]; + + Assert(TransactionIdFollows(subxid, xid)); + + /* update nextXid if needed */ + if (setNextXid) + AdvanceNextFullTransactionIdPastXid(subxid); + + if (setParent) + SubTransSetParent(subxid, xid); + } + + return buf; +} + + +/* + * RecordTransactionCommitPrepared + * + * This is basically the same as RecordTransactionCommit (q.v. if you change + * this function): in particular, we must set the delayChkpt flag to avoid a + * race condition. + * + * We know the transaction made at least one XLOG entry (its PREPARE), + * so it is never possible to optimize out the commit record. + */ +static void +RecordTransactionCommitPrepared(TransactionId xid, + int nchildren, + TransactionId *children, + int nrels, + RelFileNode *rels, + int ninvalmsgs, + SharedInvalidationMessage *invalmsgs, + bool initfileinval, + const char *gid) +{ + XLogRecPtr recptr; + TimestampTz committs = GetCurrentTimestamp(); + bool replorigin; + + /* + * Are we using the replication origins feature? Or, in other words, are + * we replaying remote actions? + */ + replorigin = (replorigin_session_origin != InvalidRepOriginId && + replorigin_session_origin != DoNotReplicateId); + + START_CRIT_SECTION(); + + /* See notes in RecordTransactionCommit */ + Assert(!MyProc->delayChkpt); + MyProc->delayChkpt = true; + + /* + * Emit the XLOG commit record. Note that we mark 2PC commits as + * potentially having AccessExclusiveLocks since we don't know whether or + * not they do. + */ + recptr = XactLogCommitRecord(committs, + nchildren, children, nrels, rels, + ninvalmsgs, invalmsgs, + initfileinval, + MyXactFlags | XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK, + xid, gid); + + + if (replorigin) + /* Move LSNs forward for this replication origin */ + replorigin_session_advance(replorigin_session_origin_lsn, + XactLastRecEnd); + + /* + * Record commit timestamp. 
The value comes from plain commit timestamp + * if replorigin is not enabled, or replorigin already set a value for us + * in replorigin_session_origin_timestamp otherwise. + * + * We don't need to WAL-log anything here, as the commit record written + * above already contains the data. + */ + if (!replorigin || replorigin_session_origin_timestamp == 0) + replorigin_session_origin_timestamp = committs; + + TransactionTreeSetCommitTsData(xid, nchildren, children, + replorigin_session_origin_timestamp, + replorigin_session_origin); + + /* + * We don't currently try to sleep before flush here ... nor is there any + * support for async commit of a prepared xact (the very idea is probably + * a contradiction) + */ + + /* Flush XLOG to disk */ + XLogFlush(recptr); + + /* Mark the transaction committed in pg_xact */ + TransactionIdCommitTree(xid, nchildren, children); + + /* Checkpoint can proceed now */ + MyProc->delayChkpt = false; + + END_CRIT_SECTION(); + + /* + * Wait for synchronous replication, if required. + * + * Note that at this stage we have marked clog, but still show as running + * in the procarray and continue to hold locks. + */ + SyncRepWaitForLSN(recptr, true); +} + +/* + * RecordTransactionAbortPrepared + * + * This is basically the same as RecordTransactionAbort. + * + * We know the transaction made at least one XLOG entry (its PREPARE), + * so it is never possible to optimize out the abort record. + */ +static void +RecordTransactionAbortPrepared(TransactionId xid, + int nchildren, + TransactionId *children, + int nrels, + RelFileNode *rels, + const char *gid) +{ + XLogRecPtr recptr; + bool replorigin; + + /* + * Are we using the replication origins feature? Or, in other words, are + * we replaying remote actions? + */ + replorigin = (replorigin_session_origin != InvalidRepOriginId && + replorigin_session_origin != DoNotReplicateId); + + /* + * Catch the scenario where we aborted partway through + * RecordTransactionCommitPrepared ... + */ + if (TransactionIdDidCommit(xid)) + elog(PANIC, "cannot abort transaction %u, it was already committed", + xid); + + START_CRIT_SECTION(); + + /* + * Emit the XLOG commit record. Note that we mark 2PC aborts as + * potentially having AccessExclusiveLocks since we don't know whether or + * not they do. + */ + recptr = XactLogAbortRecord(GetCurrentTimestamp(), + nchildren, children, + nrels, rels, + MyXactFlags | XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK, + xid, gid); + + if (replorigin) + /* Move LSNs forward for this replication origin */ + replorigin_session_advance(replorigin_session_origin_lsn, + XactLastRecEnd); + + /* Always flush, since we're about to remove the 2PC state file */ + XLogFlush(recptr); + + /* + * Mark the transaction aborted in clog. This is not absolutely necessary + * but we may as well do it while we are here. + */ + TransactionIdAbortTree(xid, nchildren, children); + + END_CRIT_SECTION(); + + /* + * Wait for synchronous replication, if required. + * + * Note that at this stage we have marked clog, but still show as running + * in the procarray and continue to hold locks. + */ + SyncRepWaitForLSN(recptr, false); +} + +/* + * PrepareRedoAdd + * + * Store pointers to the start/end of the WAL record along with the xid in + * a gxact entry in shared memory TwoPhaseState structure. If caller + * specifies InvalidXLogRecPtr as WAL location to fetch the two-phase + * data, the entry is marked as located on disk. 
+ */ +void +PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, + XLogRecPtr end_lsn, RepOriginId origin_id) +{ + TwoPhaseFileHeader *hdr = (TwoPhaseFileHeader *) buf; + char *bufptr; + const char *gid; + GlobalTransaction gxact; + + Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE)); + Assert(RecoveryInProgress()); + + bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader)); + gid = (const char *) bufptr; + + /* + * Reserve the GID for the given transaction in the redo code path. + * + * This creates a gxact struct and puts it into the active array. + * + * In redo, this struct is mainly used to track PREPARE/COMMIT entries in + * shared memory. Hence, we only fill up the bare minimum contents here. + * The gxact also gets marked with gxact->inredo set to true to indicate + * that it got added in the redo phase + */ + + /* Get a free gxact from the freelist */ + if (TwoPhaseState->freeGXacts == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("maximum number of prepared transactions reached"), + errhint("Increase max_prepared_transactions (currently %d).", + max_prepared_xacts))); + gxact = TwoPhaseState->freeGXacts; + TwoPhaseState->freeGXacts = gxact->next; + + gxact->prepared_at = hdr->prepared_at; + gxact->prepare_start_lsn = start_lsn; + gxact->prepare_end_lsn = end_lsn; + gxact->xid = hdr->xid; + gxact->owner = hdr->owner; + gxact->locking_backend = InvalidBackendId; + gxact->valid = false; + gxact->ondisk = XLogRecPtrIsInvalid(start_lsn); + gxact->inredo = true; /* yes, added in redo */ + strcpy(gxact->gid, gid); + + /* And insert it into the active array */ + Assert(TwoPhaseState->numPrepXacts < max_prepared_xacts); + TwoPhaseState->prepXacts[TwoPhaseState->numPrepXacts++] = gxact; + + if (origin_id != InvalidRepOriginId) + { + /* recover apply progress */ + replorigin_advance(origin_id, hdr->origin_lsn, end_lsn, + false /* backward */ , false /* WAL */ ); + } + + elog(DEBUG2, "added 2PC data in shared memory for transaction %u", gxact->xid); +} + +/* + * PrepareRedoRemove + * + * Remove the corresponding gxact entry from TwoPhaseState. Also remove + * the 2PC file if a prepared transaction was saved via an earlier checkpoint. + * + * Caller must hold TwoPhaseStateLock in exclusive mode, because TwoPhaseState + * is updated. + */ +void +PrepareRedoRemove(TransactionId xid, bool giveWarning) +{ + GlobalTransaction gxact = NULL; + int i; + bool found = false; + + Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE)); + Assert(RecoveryInProgress()); + + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + gxact = TwoPhaseState->prepXacts[i]; + + if (gxact->xid == xid) + { + Assert(gxact->inredo); + found = true; + break; + } + } + + /* + * Just leave if there is nothing, this is expected during WAL replay. + */ + if (!found) + return; + + /* + * And now we can clean up any files we may have left. 
+ */ + elog(DEBUG2, "removing 2PC data for transaction %u", xid); + if (gxact->ondisk) + RemoveTwoPhaseFile(xid, giveWarning); + RemoveGXact(gxact); +} diff --git a/src/backend/access/transam/twophase_rmgr.c b/src/backend/access/transam/twophase_rmgr.c new file mode 100644 index 0000000..1fd7855 --- /dev/null +++ b/src/backend/access/transam/twophase_rmgr.c @@ -0,0 +1,58 @@ +/*------------------------------------------------------------------------- + * + * twophase_rmgr.c + * Two-phase-commit resource managers tables + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/transam/twophase_rmgr.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/multixact.h" +#include "access/twophase_rmgr.h" +#include "pgstat.h" +#include "storage/lock.h" +#include "storage/predicate.h" + + +const TwoPhaseCallback twophase_recover_callbacks[TWOPHASE_RM_MAX_ID + 1] = +{ + NULL, /* END ID */ + lock_twophase_recover, /* Lock */ + NULL, /* pgstat */ + multixact_twophase_recover, /* MultiXact */ + predicatelock_twophase_recover /* PredicateLock */ +}; + +const TwoPhaseCallback twophase_postcommit_callbacks[TWOPHASE_RM_MAX_ID + 1] = +{ + NULL, /* END ID */ + lock_twophase_postcommit, /* Lock */ + pgstat_twophase_postcommit, /* pgstat */ + multixact_twophase_postcommit, /* MultiXact */ + NULL /* PredicateLock */ +}; + +const TwoPhaseCallback twophase_postabort_callbacks[TWOPHASE_RM_MAX_ID + 1] = +{ + NULL, /* END ID */ + lock_twophase_postabort, /* Lock */ + pgstat_twophase_postabort, /* pgstat */ + multixact_twophase_postabort, /* MultiXact */ + NULL /* PredicateLock */ +}; + +const TwoPhaseCallback twophase_standby_recover_callbacks[TWOPHASE_RM_MAX_ID + 1] = +{ + NULL, /* END ID */ + lock_twophase_standby_recover, /* Lock */ + NULL, /* pgstat */ + NULL, /* MultiXact */ + NULL /* PredicateLock */ +}; diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c new file mode 100644 index 0000000..a22bf37 --- /dev/null +++ b/src/backend/access/transam/varsup.c @@ -0,0 +1,637 @@ +/*------------------------------------------------------------------------- + * + * varsup.c + * postgres OID & XID variables support routines + * + * Copyright (c) 2000-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/access/transam/varsup.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/clog.h" +#include "access/commit_ts.h" +#include "access/subtrans.h" +#include "access/transam.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "commands/dbcommands.h" +#include "miscadmin.h" +#include "postmaster/autovacuum.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "utils/syscache.h" + + +/* Number of OIDs to prefetch (preallocate) per XLOG write */ +#define VAR_OID_PREFETCH 8192 + +/* pointer to "variable cache" in shared memory (set up by shmem.c) */ +VariableCache ShmemVariableCache = NULL; + + +/* + * Allocate the next FullTransactionId for a new transaction or + * subtransaction. + * + * The new XID is also stored into MyProc->xid/ProcGlobal->xids[] before + * returning. + * + * Note: when this is called, we are actually already inside a valid + * transaction, since XIDs are now not allocated until the transaction + * does something. 
So it is safe to do a database lookup if we want to + * issue a warning about XID wrap. + */ +FullTransactionId +GetNewTransactionId(bool isSubXact) +{ + FullTransactionId full_xid; + TransactionId xid; + + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for new XIDs after that point. + */ + if (IsInParallelMode()) + elog(ERROR, "cannot assign TransactionIds during a parallel operation"); + + /* + * During bootstrap initialization, we return the special bootstrap + * transaction id. + */ + if (IsBootstrapProcessingMode()) + { + Assert(!isSubXact); + MyProc->xid = BootstrapTransactionId; + ProcGlobal->xids[MyProc->pgxactoff] = BootstrapTransactionId; + return FullTransactionIdFromEpochAndXid(0, BootstrapTransactionId); + } + + /* safety check, we should never get this far in a HS standby */ + if (RecoveryInProgress()) + elog(ERROR, "cannot assign TransactionIds during recovery"); + + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + + full_xid = ShmemVariableCache->nextXid; + xid = XidFromFullTransactionId(full_xid); + + /*---------- + * Check to see if it's safe to assign another XID. This protects against + * catastrophic data loss due to XID wraparound. The basic rules are: + * + * If we're past xidVacLimit, start trying to force autovacuum cycles. + * If we're past xidWarnLimit, start issuing warnings. + * If we're past xidStopLimit, refuse to execute transactions, unless + * we are running in single-user mode (which gives an escape hatch + * to the DBA who somehow got past the earlier defenses). + * + * Note that this coding also appears in GetNewMultiXactId. + *---------- + */ + if (TransactionIdFollowsOrEquals(xid, ShmemVariableCache->xidVacLimit)) + { + /* + * For safety's sake, we release XidGenLock while sending signals, + * warnings, etc. This is not so much because we care about + * preserving concurrency in this situation, as to avoid any + * possibility of deadlock while doing get_database_name(). First, + * copy all the shared values we'll need in this path. + */ + TransactionId xidWarnLimit = ShmemVariableCache->xidWarnLimit; + TransactionId xidStopLimit = ShmemVariableCache->xidStopLimit; + TransactionId xidWrapLimit = ShmemVariableCache->xidWrapLimit; + Oid oldest_datoid = ShmemVariableCache->oldestXidDB; + + LWLockRelease(XidGenLock); + + /* + * To avoid swamping the postmaster with signals, we issue the autovac + * request only once per 64K transaction starts. This still gives + * plenty of chances before we get into real trouble. 
+ */ + if (IsUnderPostmaster && (xid % 65536) == 0) + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); + + if (IsUnderPostmaster && + TransactionIdFollowsOrEquals(xid, xidStopLimit)) + { + char *oldest_datname = get_database_name(oldest_datoid); + + /* complain even if that DB has disappeared */ + if (oldest_datname) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("database is not accepting commands to avoid wraparound data loss in database \"%s\"", + oldest_datname), + errhint("Stop the postmaster and vacuum that database in single-user mode.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + else + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("database is not accepting commands to avoid wraparound data loss in database with OID %u", + oldest_datoid), + errhint("Stop the postmaster and vacuum that database in single-user mode.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + } + else if (TransactionIdFollowsOrEquals(xid, xidWarnLimit)) + { + char *oldest_datname = get_database_name(oldest_datoid); + + /* complain even if that DB has disappeared */ + if (oldest_datname) + ereport(WARNING, + (errmsg("database \"%s\" must be vacuumed within %u transactions", + oldest_datname, + xidWrapLimit - xid), + errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + else + ereport(WARNING, + (errmsg("database with OID %u must be vacuumed within %u transactions", + oldest_datoid, + xidWrapLimit - xid), + errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + } + + /* Re-acquire lock and start over */ + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + full_xid = ShmemVariableCache->nextXid; + xid = XidFromFullTransactionId(full_xid); + } + + /* + * If we are allocating the first XID of a new page of the commit log, + * zero out that commit-log page before returning. We must do this while + * holding XidGenLock, else another xact could acquire and commit a later + * XID before we zero the page. Fortunately, a page of the commit log + * holds 32K or more transactions, so we don't have to do this very often. + * + * Extend pg_subtrans and pg_commit_ts too. + */ + ExtendCLOG(xid); + ExtendCommitTs(xid); + ExtendSUBTRANS(xid); + + /* + * Now advance the nextXid counter. This must not happen until after we + * have successfully completed ExtendCLOG() --- if that routine fails, we + * want the next incoming transaction to try it again. We cannot assign + * more XIDs until there is CLOG space for them. + */ + FullTransactionIdAdvance(&ShmemVariableCache->nextXid); + + /* + * We must store the new XID into the shared ProcArray before releasing + * XidGenLock. This ensures that every active XID older than + * latestCompletedXid is present in the ProcArray, which is essential for + * correct OldestXmin tracking; see src/backend/access/transam/README. + * + * Note that readers of ProcGlobal->xids/PGPROC->xid should be careful to + * fetch the value for each proc only once, rather than assume they can + * read a value multiple times and get the same answer each time. Note we + * are assuming that TransactionId and int fetch/store are atomic. 
+ * + * The same comments apply to the subxact xid count and overflow fields. + * + * Use of a write barrier prevents dangerous code rearrangement in this + * function; other backends could otherwise e.g. be examining my subxids + * info concurrently, and we don't want them to see an invalid + * intermediate state, such as an incremented nxids before the array entry + * is filled. + * + * Other processes that read nxids should do so before reading xids + * elements with a pg_read_barrier() in between, so that they can be sure + * not to read an uninitialized array element; see + * src/backend/storage/lmgr/README.barrier. + * + * If there's no room to fit a subtransaction XID into PGPROC, set the + * cache-overflowed flag instead. This forces readers to look in + * pg_subtrans to map subtransaction XIDs up to top-level XIDs. There is a + * race-condition window, in that the new XID will not appear as running + * until its parent link has been placed into pg_subtrans. However, that + * will happen before anyone could possibly have a reason to inquire about + * the status of the XID, so it seems OK. (Snapshots taken during this + * window *will* include the parent XID, so they will deliver the correct + * answer later on when someone does have a reason to inquire.) + */ + if (!isSubXact) + { + Assert(ProcGlobal->subxidStates[MyProc->pgxactoff].count == 0); + Assert(!ProcGlobal->subxidStates[MyProc->pgxactoff].overflowed); + Assert(MyProc->subxidStatus.count == 0); + Assert(!MyProc->subxidStatus.overflowed); + + /* LWLockRelease acts as barrier */ + MyProc->xid = xid; + ProcGlobal->xids[MyProc->pgxactoff] = xid; + } + else + { + XidCacheStatus *substat = &ProcGlobal->subxidStates[MyProc->pgxactoff]; + int nxids = MyProc->subxidStatus.count; + + Assert(substat->count == MyProc->subxidStatus.count); + Assert(substat->overflowed == MyProc->subxidStatus.overflowed); + + if (nxids < PGPROC_MAX_CACHED_SUBXIDS) + { + MyProc->subxids.xids[nxids] = xid; + pg_write_barrier(); + MyProc->subxidStatus.count = substat->count = nxids + 1; + } + else + MyProc->subxidStatus.overflowed = substat->overflowed = true; + } + + LWLockRelease(XidGenLock); + + return full_xid; +} + +/* + * Read nextXid but don't allocate it. + */ +FullTransactionId +ReadNextFullTransactionId(void) +{ + FullTransactionId fullXid; + + LWLockAcquire(XidGenLock, LW_SHARED); + fullXid = ShmemVariableCache->nextXid; + LWLockRelease(XidGenLock); + + return fullXid; +} + +/* + * Advance nextXid to the value after a given xid. The epoch is inferred. + * This must only be called during recovery or from two-phase start-up code. + */ +void +AdvanceNextFullTransactionIdPastXid(TransactionId xid) +{ + FullTransactionId newNextFullXid; + TransactionId next_xid; + uint32 epoch; + + /* + * It is safe to read nextXid without a lock, because this is only called + * from the startup process or single-process mode, meaning that no other + * process can modify it. + */ + Assert(AmStartupProcess() || !IsUnderPostmaster); + + /* Fast return if this isn't an xid high enough to move the needle. */ + next_xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + if (!TransactionIdFollowsOrEquals(xid, next_xid)) + return; + + /* + * Compute the FullTransactionId that comes after the given xid. To do + * this, we preserve the existing epoch, but detect when we've wrapped + * into a new epoch. This is necessary because WAL records and 2PC state + * currently contain 32 bit xids. 
The wrap logic is safe in those cases + * because the span of active xids cannot exceed one epoch at any given + * point in the WAL stream. + */ + TransactionIdAdvance(xid); + epoch = EpochFromFullTransactionId(ShmemVariableCache->nextXid); + if (unlikely(xid < next_xid)) + ++epoch; + newNextFullXid = FullTransactionIdFromEpochAndXid(epoch, xid); + + /* + * We still need to take a lock to modify the value when there are + * concurrent readers. + */ + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + ShmemVariableCache->nextXid = newNextFullXid; + LWLockRelease(XidGenLock); +} + +/* + * Advance the cluster-wide value for the oldest valid clog entry. + * + * We must acquire XactTruncationLock to advance the oldestClogXid. It's not + * necessary to hold the lock during the actual clog truncation, only when we + * advance the limit, as code looking up arbitrary xids is required to hold + * XactTruncationLock from when it tests oldestClogXid through to when it + * completes the clog lookup. + */ +void +AdvanceOldestClogXid(TransactionId oldest_datfrozenxid) +{ + LWLockAcquire(XactTruncationLock, LW_EXCLUSIVE); + if (TransactionIdPrecedes(ShmemVariableCache->oldestClogXid, + oldest_datfrozenxid)) + { + ShmemVariableCache->oldestClogXid = oldest_datfrozenxid; + } + LWLockRelease(XactTruncationLock); +} + +/* + * Determine the last safe XID to allocate using the currently oldest + * datfrozenxid (ie, the oldest XID that might exist in any database + * of our cluster), and the OID of the (or a) database with that value. + */ +void +SetTransactionIdLimit(TransactionId oldest_datfrozenxid, Oid oldest_datoid) +{ + TransactionId xidVacLimit; + TransactionId xidWarnLimit; + TransactionId xidStopLimit; + TransactionId xidWrapLimit; + TransactionId curXid; + + Assert(TransactionIdIsNormal(oldest_datfrozenxid)); + + /* + * The place where we actually get into deep trouble is halfway around + * from the oldest potentially-existing XID. (This calculation is + * probably off by one or two counts, because the special XIDs reduce the + * size of the loop a little bit. But we throw in plenty of slop below, + * so it doesn't matter.) + */ + xidWrapLimit = oldest_datfrozenxid + (MaxTransactionId >> 1); + if (xidWrapLimit < FirstNormalTransactionId) + xidWrapLimit += FirstNormalTransactionId; + + /* + * We'll refuse to continue assigning XIDs in interactive mode once we get + * within 3M transactions of data loss. This leaves lots of room for the + * DBA to fool around fixing things in a standalone backend, while not + * being significant compared to total XID space. (VACUUM requires an XID + * if it truncates at wal_level!=minimal. "VACUUM (ANALYZE)", which a DBA + * might do by reflex, assigns an XID. Hence, we had better be sure + * there's lots of XIDs left...) Also, at default BLCKSZ, this leaves two + * completely-idle segments. In the event of edge-case bugs involving + * page or segment arithmetic, idle segments render the bugs unreachable + * outside of single-user mode. + */ + xidStopLimit = xidWrapLimit - 3000000; + if (xidStopLimit < FirstNormalTransactionId) + xidStopLimit -= FirstNormalTransactionId; + + /* + * We'll start complaining loudly when we get within 40M transactions of + * data loss. This is kind of arbitrary, but if you let your gas gauge + * get down to 2% of full, would you be looking for the next gas station? + * We need to be fairly liberal about this number because there are lots + * of scenarios where most transactions are done by automatic clients that + * won't pay attention to warnings. 
(No, we're not gonna make this + * configurable. If you know enough to configure it, you know enough to + * not get in this kind of trouble in the first place.) + */ + xidWarnLimit = xidWrapLimit - 40000000; + if (xidWarnLimit < FirstNormalTransactionId) + xidWarnLimit -= FirstNormalTransactionId; + + /* + * We'll start trying to force autovacuums when oldest_datfrozenxid gets + * to be more than autovacuum_freeze_max_age transactions old. + * + * Note: guc.c ensures that autovacuum_freeze_max_age is in a sane range, + * so that xidVacLimit will be well before xidWarnLimit. + * + * Note: autovacuum_freeze_max_age is a PGC_POSTMASTER parameter so that + * we don't have to worry about dealing with on-the-fly changes in its + * value. It doesn't look practical to update shared state from a GUC + * assign hook (too many processes would try to execute the hook, + * resulting in race conditions as well as crashes of those not connected + * to shared memory). Perhaps this can be improved someday. See also + * SetMultiXactIdLimit. + */ + xidVacLimit = oldest_datfrozenxid + autovacuum_freeze_max_age; + if (xidVacLimit < FirstNormalTransactionId) + xidVacLimit += FirstNormalTransactionId; + + /* Grab lock for just long enough to set the new limit values */ + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + ShmemVariableCache->oldestXid = oldest_datfrozenxid; + ShmemVariableCache->xidVacLimit = xidVacLimit; + ShmemVariableCache->xidWarnLimit = xidWarnLimit; + ShmemVariableCache->xidStopLimit = xidStopLimit; + ShmemVariableCache->xidWrapLimit = xidWrapLimit; + ShmemVariableCache->oldestXidDB = oldest_datoid; + curXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + LWLockRelease(XidGenLock); + + /* Log the info */ + ereport(DEBUG1, + (errmsg_internal("transaction ID wrap limit is %u, limited by database with OID %u", + xidWrapLimit, oldest_datoid))); + + /* + * If past the autovacuum force point, immediately signal an autovac + * request. The reason for this is that autovac only processes one + * database per invocation. Once it's finished cleaning up the oldest + * database, it'll call here, and we'll signal the postmaster to start + * another iteration immediately if there are still any old databases. + */ + if (TransactionIdFollowsOrEquals(curXid, xidVacLimit) && + IsUnderPostmaster && !InRecovery) + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); + + /* Give an immediate warning if past the wrap warn point */ + if (TransactionIdFollowsOrEquals(curXid, xidWarnLimit) && !InRecovery) + { + char *oldest_datname; + + /* + * We can be called when not inside a transaction, for example during + * StartupXLOG(). In such a case we cannot do database access, so we + * must just report the oldest DB's OID. + * + * Note: it's also possible that get_database_name fails and returns + * NULL, for example because the database just got dropped. We'll + * still warn, even though the warning might now be unnecessary. 
+ */ + if (IsTransactionState()) + oldest_datname = get_database_name(oldest_datoid); + else + oldest_datname = NULL; + + if (oldest_datname) + ereport(WARNING, + (errmsg("database \"%s\" must be vacuumed within %u transactions", + oldest_datname, + xidWrapLimit - curXid), + errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + else + ereport(WARNING, + (errmsg("database with OID %u must be vacuumed within %u transactions", + oldest_datoid, + xidWrapLimit - curXid), + errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + } +} + + +/* + * ForceTransactionIdLimitUpdate -- does the XID wrap-limit data need updating? + * + * We primarily check whether oldestXidDB is valid. The cases we have in + * mind are that that database was dropped, or the field was reset to zero + * by pg_resetwal. In either case we should force recalculation of the + * wrap limit. Also do it if oldestXid is old enough to be forcing + * autovacuums or other actions; this ensures we update our state as soon + * as possible once extra overhead is being incurred. + */ +bool +ForceTransactionIdLimitUpdate(void) +{ + TransactionId nextXid; + TransactionId xidVacLimit; + TransactionId oldestXid; + Oid oldestXidDB; + + /* Locking is probably not really necessary, but let's be careful */ + LWLockAcquire(XidGenLock, LW_SHARED); + nextXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + xidVacLimit = ShmemVariableCache->xidVacLimit; + oldestXid = ShmemVariableCache->oldestXid; + oldestXidDB = ShmemVariableCache->oldestXidDB; + LWLockRelease(XidGenLock); + + if (!TransactionIdIsNormal(oldestXid)) + return true; /* shouldn't happen, but just in case */ + if (!TransactionIdIsValid(xidVacLimit)) + return true; /* this shouldn't happen anymore either */ + if (TransactionIdFollowsOrEquals(nextXid, xidVacLimit)) + return true; /* past xidVacLimit, don't delay updating */ + if (!SearchSysCacheExists1(DATABASEOID, ObjectIdGetDatum(oldestXidDB))) + return true; /* could happen, per comments above */ + return false; +} + + +/* + * GetNewObjectId -- allocate a new OID + * + * OIDs are generated by a cluster-wide counter. Since they are only 32 bits + * wide, counter wraparound will occur eventually, and therefore it is unwise + * to assume they are unique unless precautions are taken to make them so. + * Hence, this routine should generally not be used directly. The only direct + * callers should be GetNewOidWithIndex() and GetNewRelFileNode() in + * catalog/catalog.c. + */ +Oid +GetNewObjectId(void) +{ + Oid result; + + /* safety check, we should never get this far in a HS standby */ + if (RecoveryInProgress()) + elog(ERROR, "cannot assign OIDs during recovery"); + + LWLockAcquire(OidGenLock, LW_EXCLUSIVE); + + /* + * Check for wraparound of the OID counter. We *must* not return 0 + * (InvalidOid), and in normal operation we mustn't return anything below + * FirstNormalObjectId since that range is reserved for initdb (see + * IsCatalogRelationOid()). Note we are relying on unsigned comparison. + * + * During initdb, we start the OID generator at FirstBootstrapObjectId, so + * we only wrap if before that point when in bootstrap or standalone mode. 
+ * The first time through this routine after normal postmaster start, the + * counter will be forced up to FirstNormalObjectId. This mechanism + * leaves the OIDs between FirstBootstrapObjectId and FirstNormalObjectId + * available for automatic assignment during initdb, while ensuring they + * will never conflict with user-assigned OIDs. + */ + if (ShmemVariableCache->nextOid < ((Oid) FirstNormalObjectId)) + { + if (IsPostmasterEnvironment) + { + /* wraparound, or first post-initdb assignment, in normal mode */ + ShmemVariableCache->nextOid = FirstNormalObjectId; + ShmemVariableCache->oidCount = 0; + } + else + { + /* we may be bootstrapping, so don't enforce the full range */ + if (ShmemVariableCache->nextOid < ((Oid) FirstBootstrapObjectId)) + { + /* wraparound in standalone mode (unlikely but possible) */ + ShmemVariableCache->nextOid = FirstNormalObjectId; + ShmemVariableCache->oidCount = 0; + } + } + } + + /* If we run out of logged for use oids then we must log more */ + if (ShmemVariableCache->oidCount == 0) + { + XLogPutNextOid(ShmemVariableCache->nextOid + VAR_OID_PREFETCH); + ShmemVariableCache->oidCount = VAR_OID_PREFETCH; + } + + result = ShmemVariableCache->nextOid; + + (ShmemVariableCache->nextOid)++; + (ShmemVariableCache->oidCount)--; + + LWLockRelease(OidGenLock); + + return result; +} + + +#ifdef USE_ASSERT_CHECKING + +/* + * Assert that xid is between [oldestXid, nextXid], which is the range we + * expect XIDs coming from tables etc to be in. + * + * As ShmemVariableCache->oldestXid could change just after this call without + * further precautions, and as a wrapped-around xid could again fall within + * the valid range, this assertion can only detect if something is definitely + * wrong, but not establish correctness. + * + * This intentionally does not expose a return value, to avoid code being + * introduced that depends on the return value. + */ +void +AssertTransactionIdInAllowableRange(TransactionId xid) +{ + TransactionId oldest_xid; + TransactionId next_xid; + + Assert(TransactionIdIsValid(xid)); + + /* we may see bootstrap / frozen */ + if (!TransactionIdIsNormal(xid)) + return; + + /* + * We can't acquire XidGenLock, as this may be called with XidGenLock + * already held (or with other locks that don't allow XidGenLock to be + * nested). That's ok for our purposes though, since we already rely on + * 32bit reads to be atomic. While nextXid is 64 bit, we only look at the + * lower 32bit, so a skewed read doesn't hurt. + * + * There's no increased danger of falling outside [oldest, next] by + * accessing them without a lock. xid needs to have been created with + * GetNewTransactionId() in the originating session, and the locks there + * pair with the memory barrier below. We do however accept xid to be <= + * to next_xid, instead of just <, as xid could be from the procarray, + * before we see the updated nextXid value. 
+ */ + pg_memory_barrier(); + oldest_xid = ShmemVariableCache->oldestXid; + next_xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + + Assert(TransactionIdFollowsOrEquals(xid, oldest_xid) || + TransactionIdPrecedesOrEquals(xid, next_xid)); +} +#endif diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c new file mode 100644 index 0000000..477ca9c --- /dev/null +++ b/src/backend/access/transam/xact.c @@ -0,0 +1,6169 @@ +/*------------------------------------------------------------------------- + * + * xact.c + * top level transaction system support routines + * + * See src/backend/access/transam/README for more information. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/transam/xact.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include +#include + +#include "access/commit_ts.h" +#include "access/multixact.h" +#include "access/parallel.h" +#include "access/subtrans.h" +#include "access/transam.h" +#include "access/twophase.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "access/xlogutils.h" +#include "catalog/index.h" +#include "catalog/namespace.h" +#include "catalog/pg_enum.h" +#include "catalog/storage.h" +#include "commands/async.h" +#include "commands/tablecmds.h" +#include "commands/trigger.h" +#include "executor/spi.h" +#include "libpq/be-fsstubs.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "pgstat.h" +#include "replication/logical.h" +#include "replication/logicallauncher.h" +#include "replication/origin.h" +#include "replication/snapbuild.h" +#include "replication/syncrep.h" +#include "replication/walsender.h" +#include "storage/condition_variable.h" +#include "storage/fd.h" +#include "storage/lmgr.h" +#include "storage/md.h" +#include "storage/predicate.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/sinvaladt.h" +#include "storage/smgr.h" +#include "utils/builtins.h" +#include "utils/catcache.h" +#include "utils/combocid.h" +#include "utils/guc.h" +#include "utils/inval.h" +#include "utils/memutils.h" +#include "utils/relmapper.h" +#include "utils/snapmgr.h" +#include "utils/timeout.h" +#include "utils/timestamp.h" + +/* + * User-tweakable parameters + */ +int DefaultXactIsoLevel = XACT_READ_COMMITTED; +int XactIsoLevel; + +bool DefaultXactReadOnly = false; +bool XactReadOnly; + +bool DefaultXactDeferrable = false; +bool XactDeferrable; + +int synchronous_commit = SYNCHRONOUS_COMMIT_ON; + +/* + * CheckXidAlive is a xid value pointing to a possibly ongoing (sub) + * transaction. Currently, it is used in logical decoding. It's possible + * that such transactions can get aborted while the decoding is ongoing in + * which case we skip decoding that particular transaction. To ensure that we + * check whether the CheckXidAlive is aborted after fetching the tuple from + * system tables. We also ensure that during logical decoding we never + * directly access the tableam or heap APIs because we are checking for the + * concurrent aborts only in systable_* APIs. 
+ */ +TransactionId CheckXidAlive = InvalidTransactionId; +bool bsysscan = false; + +/* + * When running as a parallel worker, we place only a single + * TransactionStateData on the parallel worker's state stack, and the XID + * reflected there will be that of the *innermost* currently-active + * subtransaction in the backend that initiated parallelism. However, + * GetTopTransactionId() and TransactionIdIsCurrentTransactionId() + * need to return the same answers in the parallel worker as they would have + * in the user backend, so we need some additional bookkeeping. + * + * XactTopFullTransactionId stores the XID of our toplevel transaction, which + * will be the same as TopTransactionStateData.fullTransactionId in an + * ordinary backend; but in a parallel backend, which does not have the entire + * transaction state, it will instead be copied from the backend that started + * the parallel operation. + * + * nParallelCurrentXids will be 0 and ParallelCurrentXids NULL in an ordinary + * backend, but in a parallel backend, nParallelCurrentXids will contain the + * number of XIDs that need to be considered current, and ParallelCurrentXids + * will contain the XIDs themselves. This includes all XIDs that were current + * or sub-committed in the parent at the time the parallel operation began. + * The XIDs are stored sorted in numerical order (not logical order) to make + * lookups as fast as possible. + */ +FullTransactionId XactTopFullTransactionId = {InvalidTransactionId}; +int nParallelCurrentXids = 0; +TransactionId *ParallelCurrentXids; + +/* + * Miscellaneous flag bits to record events which occur on the top level + * transaction. These flags are only persisted in MyXactFlags and are intended + * so we remember to do certain things later on in the transaction. This is + * globally accessible, so can be set from anywhere in the code that requires + * recording flags. + */ +int MyXactFlags; + +/* + * transaction states - transaction state from server perspective + */ +typedef enum TransState +{ + TRANS_DEFAULT, /* idle */ + TRANS_START, /* transaction starting */ + TRANS_INPROGRESS, /* inside a valid transaction */ + TRANS_COMMIT, /* commit in progress */ + TRANS_ABORT, /* abort in progress */ + TRANS_PREPARE /* prepare in progress */ +} TransState; + +/* + * transaction block states - transaction state of client queries + * + * Note: the subtransaction states are used only for non-topmost + * transactions; the others appear only in the topmost transaction. 
+ */ +typedef enum TBlockState +{ + /* not-in-transaction-block states */ + TBLOCK_DEFAULT, /* idle */ + TBLOCK_STARTED, /* running single-query transaction */ + + /* transaction block states */ + TBLOCK_BEGIN, /* starting transaction block */ + TBLOCK_INPROGRESS, /* live transaction */ + TBLOCK_IMPLICIT_INPROGRESS, /* live transaction after implicit BEGIN */ + TBLOCK_PARALLEL_INPROGRESS, /* live transaction inside parallel worker */ + TBLOCK_END, /* COMMIT received */ + TBLOCK_ABORT, /* failed xact, awaiting ROLLBACK */ + TBLOCK_ABORT_END, /* failed xact, ROLLBACK received */ + TBLOCK_ABORT_PENDING, /* live xact, ROLLBACK received */ + TBLOCK_PREPARE, /* live xact, PREPARE received */ + + /* subtransaction states */ + TBLOCK_SUBBEGIN, /* starting a subtransaction */ + TBLOCK_SUBINPROGRESS, /* live subtransaction */ + TBLOCK_SUBRELEASE, /* RELEASE received */ + TBLOCK_SUBCOMMIT, /* COMMIT received while TBLOCK_SUBINPROGRESS */ + TBLOCK_SUBABORT, /* failed subxact, awaiting ROLLBACK */ + TBLOCK_SUBABORT_END, /* failed subxact, ROLLBACK received */ + TBLOCK_SUBABORT_PENDING, /* live subxact, ROLLBACK received */ + TBLOCK_SUBRESTART, /* live subxact, ROLLBACK TO received */ + TBLOCK_SUBABORT_RESTART /* failed subxact, ROLLBACK TO received */ +} TBlockState; + +/* + * transaction state structure + */ +typedef struct TransactionStateData +{ + FullTransactionId fullTransactionId; /* my FullTransactionId */ + SubTransactionId subTransactionId; /* my subxact ID */ + char *name; /* savepoint name, if any */ + int savepointLevel; /* savepoint level */ + TransState state; /* low-level state */ + TBlockState blockState; /* high-level state */ + int nestingLevel; /* transaction nesting depth */ + int gucNestLevel; /* GUC context nesting depth */ + MemoryContext curTransactionContext; /* my xact-lifetime context */ + ResourceOwner curTransactionOwner; /* my query resources */ + TransactionId *childXids; /* subcommitted child XIDs, in XID order */ + int nChildXids; /* # of subcommitted child XIDs */ + int maxChildXids; /* allocated size of childXids[] */ + Oid prevUser; /* previous CurrentUserId setting */ + int prevSecContext; /* previous SecurityRestrictionContext */ + bool prevXactReadOnly; /* entry-time xact r/o state */ + bool startedInRecovery; /* did we start in recovery? */ + bool didLogXid; /* has xid been included in WAL record? */ + int parallelModeLevel; /* Enter/ExitParallelMode counter */ + bool chain; /* start a new block after this one */ + bool assigned; /* assigned to top-level XID */ + struct TransactionStateData *parent; /* back link to parent */ +} TransactionStateData; + +typedef TransactionStateData *TransactionState; + +/* + * Serialized representation used to transmit transaction state to parallel + * workers through shared memory. + */ +typedef struct SerializedTransactionState +{ + int xactIsoLevel; + bool xactDeferrable; + FullTransactionId topFullTransactionId; + FullTransactionId currentFullTransactionId; + CommandId currentCommandId; + int nParallelCurrentXids; + TransactionId parallelCurrentXids[FLEXIBLE_ARRAY_MEMBER]; +} SerializedTransactionState; + +/* The size of SerializedTransactionState, not including the final array. */ +#define SerializedTransactionStateHeaderSize \ + offsetof(SerializedTransactionState, parallelCurrentXids) + +/* + * CurrentTransactionState always points to the current transaction state + * block. It will point to TopTransactionStateData when not in a + * transaction at all, or when in a top-level transaction. 
+ */ +static TransactionStateData TopTransactionStateData = { + .state = TRANS_DEFAULT, + .blockState = TBLOCK_DEFAULT, + .assigned = false, +}; + +/* + * unreportedXids holds XIDs of all subtransactions that have not yet been + * reported in an XLOG_XACT_ASSIGNMENT record. + */ +static int nUnreportedXids; +static TransactionId unreportedXids[PGPROC_MAX_CACHED_SUBXIDS]; + +static TransactionState CurrentTransactionState = &TopTransactionStateData; + +/* + * The subtransaction ID and command ID assignment counters are global + * to a whole transaction, so we do not keep them in the state stack. + */ +static SubTransactionId currentSubTransactionId; +static CommandId currentCommandId; +static bool currentCommandIdUsed; + +/* + * xactStartTimestamp is the value of transaction_timestamp(). + * stmtStartTimestamp is the value of statement_timestamp(). + * xactStopTimestamp is the time at which we log a commit or abort WAL record. + * These do not change as we enter and exit subtransactions, so we don't + * keep them inside the TransactionState stack. + */ +static TimestampTz xactStartTimestamp; +static TimestampTz stmtStartTimestamp; +static TimestampTz xactStopTimestamp; + +/* + * GID to be used for preparing the current transaction. This is also + * global to a whole transaction, so we don't keep it in the state stack. + */ +static char *prepareGID; + +/* + * Some commands want to force synchronous commit. + */ +static bool forceSyncCommit = false; + +/* Flag for logging statements in a transaction. */ +bool xact_is_sampled = false; + +/* + * Private context for transaction-abort work --- we reserve space for this + * at startup to ensure that AbortTransaction and AbortSubTransaction can work + * when we've run out of memory. + */ +static MemoryContext TransactionAbortContext = NULL; + +/* + * List of add-on start- and end-of-xact callbacks + */ +typedef struct XactCallbackItem +{ + struct XactCallbackItem *next; + XactCallback callback; + void *arg; +} XactCallbackItem; + +static XactCallbackItem *Xact_callbacks = NULL; + +/* + * List of add-on start- and end-of-subxact callbacks + */ +typedef struct SubXactCallbackItem +{ + struct SubXactCallbackItem *next; + SubXactCallback callback; + void *arg; +} SubXactCallbackItem; + +static SubXactCallbackItem *SubXact_callbacks = NULL; + + +/* local function prototypes */ +static void AssignTransactionId(TransactionState s); +static void AbortTransaction(void); +static void AtAbort_Memory(void); +static void AtCleanup_Memory(void); +static void AtAbort_ResourceOwner(void); +static void AtCCI_LocalCache(void); +static void AtCommit_Memory(void); +static void AtStart_Cache(void); +static void AtStart_Memory(void); +static void AtStart_ResourceOwner(void); +static void CallXactCallbacks(XactEvent event); +static void CallSubXactCallbacks(SubXactEvent event, + SubTransactionId mySubid, + SubTransactionId parentSubid); +static void CleanupTransaction(void); +static void CheckTransactionBlock(bool isTopLevel, bool throwError, + const char *stmtType); +static void CommitTransaction(void); +static TransactionId RecordTransactionAbort(bool isSubXact); +static void StartTransaction(void); + +static void StartSubTransaction(void); +static void CommitSubTransaction(void); +static void AbortSubTransaction(void); +static void CleanupSubTransaction(void); +static void PushTransaction(void); +static void PopTransaction(void); + +static void AtSubAbort_Memory(void); +static void AtSubCleanup_Memory(void); +static void AtSubAbort_ResourceOwner(void); +static void 
AtSubCommit_Memory(void); +static void AtSubStart_Memory(void); +static void AtSubStart_ResourceOwner(void); + +static void ShowTransactionState(const char *str); +static void ShowTransactionStateRec(const char *str, TransactionState state); +static const char *BlockStateAsString(TBlockState blockState); +static const char *TransStateAsString(TransState state); + + +/* ---------------------------------------------------------------- + * transaction state accessors + * ---------------------------------------------------------------- + */ + +/* + * IsTransactionState + * + * This returns true if we are inside a valid transaction; that is, + * it is safe to initiate database access, take heavyweight locks, etc. + */ +bool +IsTransactionState(void) +{ + TransactionState s = CurrentTransactionState; + + /* + * TRANS_DEFAULT and TRANS_ABORT are obviously unsafe states. However, we + * also reject the startup/shutdown states TRANS_START, TRANS_COMMIT, + * TRANS_PREPARE since it might be too soon or too late within those + * transition states to do anything interesting. Hence, the only "valid" + * state is TRANS_INPROGRESS. + */ + return (s->state == TRANS_INPROGRESS); +} + +/* + * IsAbortedTransactionBlockState + * + * This returns true if we are within an aborted transaction block. + */ +bool +IsAbortedTransactionBlockState(void) +{ + TransactionState s = CurrentTransactionState; + + if (s->blockState == TBLOCK_ABORT || + s->blockState == TBLOCK_SUBABORT) + return true; + + return false; +} + + +/* + * GetTopTransactionId + * + * This will return the XID of the main transaction, assigning one if + * it's not yet set. Be careful to call this only inside a valid xact. + */ +TransactionId +GetTopTransactionId(void) +{ + if (!FullTransactionIdIsValid(XactTopFullTransactionId)) + AssignTransactionId(&TopTransactionStateData); + return XidFromFullTransactionId(XactTopFullTransactionId); +} + +/* + * GetTopTransactionIdIfAny + * + * This will return the XID of the main transaction, if one is assigned. + * It will return InvalidTransactionId if we are not currently inside a + * transaction, or inside a transaction that hasn't yet been assigned an XID. + */ +TransactionId +GetTopTransactionIdIfAny(void) +{ + return XidFromFullTransactionId(XactTopFullTransactionId); +} + +/* + * GetCurrentTransactionId + * + * This will return the XID of the current transaction (main or sub + * transaction), assigning one if it's not yet set. Be careful to call this + * only inside a valid xact. + */ +TransactionId +GetCurrentTransactionId(void) +{ + TransactionState s = CurrentTransactionState; + + if (!FullTransactionIdIsValid(s->fullTransactionId)) + AssignTransactionId(s); + return XidFromFullTransactionId(s->fullTransactionId); +} + +/* + * GetCurrentTransactionIdIfAny + * + * This will return the XID of the current sub xact, if one is assigned. + * It will return InvalidTransactionId if we are not currently inside a + * transaction, or inside a transaction that hasn't been assigned an XID yet. + */ +TransactionId +GetCurrentTransactionIdIfAny(void) +{ + return XidFromFullTransactionId(CurrentTransactionState->fullTransactionId); +} + +/* + * GetTopFullTransactionId + * + * This will return the FullTransactionId of the main transaction, assigning + * one if it's not yet set. Be careful to call this only inside a valid xact. 
+ */ +FullTransactionId +GetTopFullTransactionId(void) +{ + if (!FullTransactionIdIsValid(XactTopFullTransactionId)) + AssignTransactionId(&TopTransactionStateData); + return XactTopFullTransactionId; +} + +/* + * GetTopFullTransactionIdIfAny + * + * This will return the FullTransactionId of the main transaction, if one is + * assigned. It will return InvalidFullTransactionId if we are not currently + * inside a transaction, or inside a transaction that hasn't yet been assigned + * one. + */ +FullTransactionId +GetTopFullTransactionIdIfAny(void) +{ + return XactTopFullTransactionId; +} + +/* + * GetCurrentFullTransactionId + * + * This will return the FullTransactionId of the current transaction (main or + * sub transaction), assigning one if it's not yet set. Be careful to call + * this only inside a valid xact. + */ +FullTransactionId +GetCurrentFullTransactionId(void) +{ + TransactionState s = CurrentTransactionState; + + if (!FullTransactionIdIsValid(s->fullTransactionId)) + AssignTransactionId(s); + return s->fullTransactionId; +} + +/* + * GetCurrentFullTransactionIdIfAny + * + * This will return the FullTransactionId of the current sub xact, if one is + * assigned. It will return InvalidFullTransactionId if we are not currently + * inside a transaction, or inside a transaction that hasn't been assigned one + * yet. + */ +FullTransactionId +GetCurrentFullTransactionIdIfAny(void) +{ + return CurrentTransactionState->fullTransactionId; +} + +/* + * MarkCurrentTransactionIdLoggedIfAny + * + * Remember that the current xid - if it is assigned - now has been wal logged. + */ +void +MarkCurrentTransactionIdLoggedIfAny(void) +{ + if (FullTransactionIdIsValid(CurrentTransactionState->fullTransactionId)) + CurrentTransactionState->didLogXid = true; +} + + +/* + * GetStableLatestTransactionId + * + * Get the transaction's XID if it has one, else read the next-to-be-assigned + * XID. Once we have a value, return that same value for the remainder of the + * current transaction. This is meant to provide the reference point for the + * age(xid) function, but might be useful for other maintenance tasks as well. + */ +TransactionId +GetStableLatestTransactionId(void) +{ + static LocalTransactionId lxid = InvalidLocalTransactionId; + static TransactionId stablexid = InvalidTransactionId; + + if (lxid != MyProc->lxid) + { + lxid = MyProc->lxid; + stablexid = GetTopTransactionIdIfAny(); + if (!TransactionIdIsValid(stablexid)) + stablexid = ReadNextTransactionId(); + } + + Assert(TransactionIdIsValid(stablexid)); + + return stablexid; +} + +/* + * AssignTransactionId + * + * Assigns a new permanent FullTransactionId to the given TransactionState. + * We do not assign XIDs to transactions until/unless this is called. + * Also, any parent TransactionStates that don't yet have XIDs are assigned + * one; this maintains the invariant that a child transaction has an XID + * following its parent's. + */ +static void +AssignTransactionId(TransactionState s) +{ + bool isSubXact = (s->parent != NULL); + ResourceOwner currentOwner; + bool log_unknown_top = false; + + /* Assert that caller didn't screw up */ + Assert(!FullTransactionIdIsValid(s->fullTransactionId)); + Assert(s->state == TRANS_INPROGRESS); + + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for new XIDs at this point. 
+ */ + if (IsInParallelMode() || IsParallelWorker()) + elog(ERROR, "cannot assign XIDs during a parallel operation"); + + /* + * Ensure parent(s) have XIDs, so that a child always has an XID later + * than its parent. Mustn't recurse here, or we might get a stack + * overflow if we're at the bottom of a huge stack of subtransactions none + * of which have XIDs yet. + */ + if (isSubXact && !FullTransactionIdIsValid(s->parent->fullTransactionId)) + { + TransactionState p = s->parent; + TransactionState *parents; + size_t parentOffset = 0; + + parents = palloc(sizeof(TransactionState) * s->nestingLevel); + while (p != NULL && !FullTransactionIdIsValid(p->fullTransactionId)) + { + parents[parentOffset++] = p; + p = p->parent; + } + + /* + * This is technically a recursive call, but the recursion will never + * be more than one layer deep. + */ + while (parentOffset != 0) + AssignTransactionId(parents[--parentOffset]); + + pfree(parents); + } + + /* + * When wal_level=logical, guarantee that a subtransaction's xid can only + * be seen in the WAL stream if its toplevel xid has been logged before. + * If necessary we log an xact_assignment record with fewer than + * PGPROC_MAX_CACHED_SUBXIDS. Note that it is fine if didLogXid isn't set + * for a transaction even though it appears in a WAL record, we just might + * superfluously log something. That can happen when an xid is included + * somewhere inside a wal record, but not in XLogRecord->xl_xid, like in + * xl_standby_locks. + */ + if (isSubXact && XLogLogicalInfoActive() && + !TopTransactionStateData.didLogXid) + log_unknown_top = true; + + /* + * Generate a new FullTransactionId and record its xid in PG_PROC and + * pg_subtrans. + * + * NB: we must make the subtrans entry BEFORE the Xid appears anywhere in + * shared storage other than PG_PROC; because if there's no room for it in + * PG_PROC, the subtrans entry is needed to ensure that other backends see + * the Xid as "running". See GetNewTransactionId. + */ + s->fullTransactionId = GetNewTransactionId(isSubXact); + if (!isSubXact) + XactTopFullTransactionId = s->fullTransactionId; + + if (isSubXact) + SubTransSetParent(XidFromFullTransactionId(s->fullTransactionId), + XidFromFullTransactionId(s->parent->fullTransactionId)); + + /* + * If it's a top-level transaction, the predicate locking system needs to + * be told about it too. + */ + if (!isSubXact) + RegisterPredicateLockingXid(XidFromFullTransactionId(s->fullTransactionId)); + + /* + * Acquire lock on the transaction XID. (We assume this cannot block.) We + * have to ensure that the lock is assigned to the transaction's own + * ResourceOwner. + */ + currentOwner = CurrentResourceOwner; + CurrentResourceOwner = s->curTransactionOwner; + + XactLockTableInsert(XidFromFullTransactionId(s->fullTransactionId)); + + CurrentResourceOwner = currentOwner; + + /* + * Every PGPROC_MAX_CACHED_SUBXIDS assigned transaction ids within each + * top-level transaction we issue a WAL record for the assignment. We + * include the top-level xid and all the subxids that have not yet been + * reported using XLOG_XACT_ASSIGNMENT records. + * + * This is required to limit the amount of shared memory required in a hot + * standby server to keep track of in-progress XIDs. See notes for + * RecordKnownAssignedTransactionIds(). + * + * We don't keep track of the immediate parent of each subxid, only the + * top-level transaction that each subxact belongs to. This is correct in + * recovery only because aborted subtransactions are separately WAL + * logged. 
+ * + * This is correct even for the case where several levels above us didn't + * have an xid assigned as we recursed up to them beforehand. + */ + if (isSubXact && XLogStandbyInfoActive()) + { + unreportedXids[nUnreportedXids] = XidFromFullTransactionId(s->fullTransactionId); + nUnreportedXids++; + + /* + * ensure this test matches similar one in + * RecoverPreparedTransactions() + */ + if (nUnreportedXids >= PGPROC_MAX_CACHED_SUBXIDS || + log_unknown_top) + { + xl_xact_assignment xlrec; + + /* + * xtop is always set by now because we recurse up transaction + * stack to the highest unassigned xid and then come back down + */ + xlrec.xtop = GetTopTransactionId(); + Assert(TransactionIdIsValid(xlrec.xtop)); + xlrec.nsubxacts = nUnreportedXids; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, MinSizeOfXactAssignment); + XLogRegisterData((char *) unreportedXids, + nUnreportedXids * sizeof(TransactionId)); + + (void) XLogInsert(RM_XACT_ID, XLOG_XACT_ASSIGNMENT); + + nUnreportedXids = 0; + /* mark top, not current xact as having been logged */ + TopTransactionStateData.didLogXid = true; + } + } +} + +/* + * GetCurrentSubTransactionId + */ +SubTransactionId +GetCurrentSubTransactionId(void) +{ + TransactionState s = CurrentTransactionState; + + return s->subTransactionId; +} + +/* + * SubTransactionIsActive + * + * Test if the specified subxact ID is still active. Note caller is + * responsible for checking whether this ID is relevant to the current xact. + */ +bool +SubTransactionIsActive(SubTransactionId subxid) +{ + TransactionState s; + + for (s = CurrentTransactionState; s != NULL; s = s->parent) + { + if (s->state == TRANS_ABORT) + continue; + if (s->subTransactionId == subxid) + return true; + } + return false; +} + + +/* + * GetCurrentCommandId + * + * "used" must be true if the caller intends to use the command ID to mark + * inserted/updated/deleted tuples. false means the ID is being fetched + * for read-only purposes (ie, as a snapshot validity cutoff). See + * CommandCounterIncrement() for discussion. + */ +CommandId +GetCurrentCommandId(bool used) +{ + /* this is global to a transaction, not subtransaction-local */ + if (used) + { + /* + * Forbid setting currentCommandIdUsed in a parallel worker, because + * we have no provision for communicating this back to the leader. We + * could relax this restriction when currentCommandIdUsed was already + * true at the start of the parallel operation. + */ + Assert(!IsParallelWorker()); + currentCommandIdUsed = true; + } + return currentCommandId; +} + +/* + * SetParallelStartTimestamps + * + * In a parallel worker, we should inherit the parent transaction's + * timestamps rather than setting our own. The parallel worker + * infrastructure must call this to provide those values before + * calling StartTransaction() or SetCurrentStatementStartTimestamp(). + */ +void +SetParallelStartTimestamps(TimestampTz xact_ts, TimestampTz stmt_ts) +{ + Assert(IsParallelWorker()); + xactStartTimestamp = xact_ts; + stmtStartTimestamp = stmt_ts; +} + +/* + * GetCurrentTransactionStartTimestamp + */ +TimestampTz +GetCurrentTransactionStartTimestamp(void) +{ + return xactStartTimestamp; +} + +/* + * GetCurrentStatementStartTimestamp + */ +TimestampTz +GetCurrentStatementStartTimestamp(void) +{ + return stmtStartTimestamp; +} + +/* + * GetCurrentTransactionStopTimestamp + * + * We return current time if the transaction stop time hasn't been set + * (which can happen if we decide we don't need to log an XLOG record). 
+ */ +TimestampTz +GetCurrentTransactionStopTimestamp(void) +{ + if (xactStopTimestamp != 0) + return xactStopTimestamp; + return GetCurrentTimestamp(); +} + +/* + * SetCurrentStatementStartTimestamp + * + * In a parallel worker, this should already have been provided by a call + * to SetParallelStartTimestamps(). + */ +void +SetCurrentStatementStartTimestamp(void) +{ + if (!IsParallelWorker()) + stmtStartTimestamp = GetCurrentTimestamp(); + else + Assert(stmtStartTimestamp != 0); +} + +/* + * SetCurrentTransactionStopTimestamp + */ +static inline void +SetCurrentTransactionStopTimestamp(void) +{ + xactStopTimestamp = GetCurrentTimestamp(); +} + +/* + * GetCurrentTransactionNestLevel + * + * Note: this will return zero when not inside any transaction, one when + * inside a top-level transaction, etc. + */ +int +GetCurrentTransactionNestLevel(void) +{ + TransactionState s = CurrentTransactionState; + + return s->nestingLevel; +} + + +/* + * TransactionIdIsCurrentTransactionId + */ +bool +TransactionIdIsCurrentTransactionId(TransactionId xid) +{ + TransactionState s; + + /* + * We always say that BootstrapTransactionId is "not my transaction ID" + * even when it is (ie, during bootstrap). Along with the fact that + * transam.c always treats BootstrapTransactionId as already committed, + * this causes the heapam_visibility.c routines to see all tuples as + * committed, which is what we need during bootstrap. (Bootstrap mode + * only inserts tuples, it never updates or deletes them, so all tuples + * can be presumed good immediately.) + * + * Likewise, InvalidTransactionId and FrozenTransactionId are certainly + * not my transaction ID, so we can just return "false" immediately for + * any non-normal XID. + */ + if (!TransactionIdIsNormal(xid)) + return false; + + if (TransactionIdEquals(xid, GetTopTransactionIdIfAny())) + return true; + + /* + * In parallel workers, the XIDs we must consider as current are stored in + * ParallelCurrentXids rather than the transaction-state stack. Note that + * the XIDs in this array are sorted numerically rather than according to + * transactionIdPrecedes order. + */ + if (nParallelCurrentXids > 0) + { + int low, + high; + + low = 0; + high = nParallelCurrentXids - 1; + while (low <= high) + { + int middle; + TransactionId probe; + + middle = low + (high - low) / 2; + probe = ParallelCurrentXids[middle]; + if (probe == xid) + return true; + else if (probe < xid) + low = middle + 1; + else + high = middle - 1; + } + return false; + } + + /* + * We will return true for the Xid of the current subtransaction, any of + * its subcommitted children, any of its parents, or any of their + * previously subcommitted children. However, a transaction being aborted + * is no longer "current", even though it may still have an entry on the + * state stack. 
+ */ + for (s = CurrentTransactionState; s != NULL; s = s->parent) + { + int low, + high; + + if (s->state == TRANS_ABORT) + continue; + if (!FullTransactionIdIsValid(s->fullTransactionId)) + continue; /* it can't have any child XIDs either */ + if (TransactionIdEquals(xid, XidFromFullTransactionId(s->fullTransactionId))) + return true; + /* As the childXids array is ordered, we can use binary search */ + low = 0; + high = s->nChildXids - 1; + while (low <= high) + { + int middle; + TransactionId probe; + + middle = low + (high - low) / 2; + probe = s->childXids[middle]; + if (TransactionIdEquals(probe, xid)) + return true; + else if (TransactionIdPrecedes(probe, xid)) + low = middle + 1; + else + high = middle - 1; + } + } + + return false; +} + +/* + * TransactionStartedDuringRecovery + * + * Returns true if the current transaction started while recovery was still + * in progress. Recovery might have ended since so RecoveryInProgress() might + * return false already. + */ +bool +TransactionStartedDuringRecovery(void) +{ + return CurrentTransactionState->startedInRecovery; +} + +/* + * EnterParallelMode + */ +void +EnterParallelMode(void) +{ + TransactionState s = CurrentTransactionState; + + Assert(s->parallelModeLevel >= 0); + + ++s->parallelModeLevel; +} + +/* + * ExitParallelMode + */ +void +ExitParallelMode(void) +{ + TransactionState s = CurrentTransactionState; + + Assert(s->parallelModeLevel > 0); + Assert(s->parallelModeLevel > 1 || !ParallelContextActive()); + + --s->parallelModeLevel; +} + +/* + * IsInParallelMode + * + * Are we in a parallel operation, as either the leader or a worker? Check + * this to prohibit operations that change backend-local state expected to + * match across all workers. Mere caches usually don't require such a + * restriction. State modified in a strict push/pop fashion, such as the + * active snapshot stack, is often fine. + */ +bool +IsInParallelMode(void) +{ + return CurrentTransactionState->parallelModeLevel != 0; +} + +/* + * CommandCounterIncrement + */ +void +CommandCounterIncrement(void) +{ + /* + * If the current value of the command counter hasn't been "used" to mark + * tuples, we need not increment it, since there's no need to distinguish + * a read-only command from others. This helps postpone command counter + * overflow, and keeps no-op CommandCounterIncrement operations cheap. + */ + if (currentCommandIdUsed) + { + /* + * Workers synchronize transaction state at the beginning of each + * parallel operation, so we can't account for new commands after that + * point. + */ + if (IsInParallelMode() || IsParallelWorker()) + elog(ERROR, "cannot start commands during a parallel operation"); + + currentCommandId += 1; + if (currentCommandId == InvalidCommandId) + { + currentCommandId -= 1; + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot have more than 2^32-2 commands in a transaction"))); + } + currentCommandIdUsed = false; + + /* Propagate new command ID into static snapshots */ + SnapshotSetCommandId(currentCommandId); + + /* + * Make any catalog changes done by the just-completed command visible + * in the local syscache. We obviously don't need to do this after a + * read-only command. (But see hacks in inval.c to make real sure we + * don't think a command that queued inval messages was read-only.) + */ + AtCCI_LocalCache(); + } +} + +/* + * ForceSyncCommit + * + * Interface routine to allow commands to force a synchronous commit of the + * current top-level transaction. 
Currently, two-phase commit does not + * persist and restore this variable. So long as all callers use + * PreventInTransactionBlock(), that omission has no consequences. + */ +void +ForceSyncCommit(void) +{ + forceSyncCommit = true; +} + + +/* ---------------------------------------------------------------- + * StartTransaction stuff + * ---------------------------------------------------------------- + */ + +/* + * AtStart_Cache + */ +static void +AtStart_Cache(void) +{ + AcceptInvalidationMessages(); +} + +/* + * AtStart_Memory + */ +static void +AtStart_Memory(void) +{ + TransactionState s = CurrentTransactionState; + + /* + * If this is the first time through, create a private context for + * AbortTransaction to work in. By reserving some space now, we can + * insulate AbortTransaction from out-of-memory scenarios. Like + * ErrorContext, we set it up with slow growth rate and a nonzero minimum + * size, so that space will be reserved immediately. + */ + if (TransactionAbortContext == NULL) + TransactionAbortContext = + AllocSetContextCreate(TopMemoryContext, + "TransactionAbortContext", + 32 * 1024, + 32 * 1024, + 32 * 1024); + + /* + * We shouldn't have a transaction context already. + */ + Assert(TopTransactionContext == NULL); + + /* + * Create a toplevel context for the transaction. + */ + TopTransactionContext = + AllocSetContextCreate(TopMemoryContext, + "TopTransactionContext", + ALLOCSET_DEFAULT_SIZES); + + /* + * In a top-level transaction, CurTransactionContext is the same as + * TopTransactionContext. + */ + CurTransactionContext = TopTransactionContext; + s->curTransactionContext = CurTransactionContext; + + /* Make the CurTransactionContext active. */ + MemoryContextSwitchTo(CurTransactionContext); +} + +/* + * AtStart_ResourceOwner + */ +static void +AtStart_ResourceOwner(void) +{ + TransactionState s = CurrentTransactionState; + + /* + * We shouldn't have a transaction resource owner already. + */ + Assert(TopTransactionResourceOwner == NULL); + + /* + * Create a toplevel resource owner for the transaction. + */ + s->curTransactionOwner = ResourceOwnerCreate(NULL, "TopTransaction"); + + TopTransactionResourceOwner = s->curTransactionOwner; + CurTransactionResourceOwner = s->curTransactionOwner; + CurrentResourceOwner = s->curTransactionOwner; +} + +/* ---------------------------------------------------------------- + * StartSubTransaction stuff + * ---------------------------------------------------------------- + */ + +/* + * AtSubStart_Memory + */ +static void +AtSubStart_Memory(void) +{ + TransactionState s = CurrentTransactionState; + + Assert(CurTransactionContext != NULL); + + /* + * Create a CurTransactionContext, which will be used to hold data that + * survives subtransaction commit but disappears on subtransaction abort. + * We make it a child of the immediate parent's CurTransactionContext. + */ + CurTransactionContext = AllocSetContextCreate(CurTransactionContext, + "CurTransactionContext", + ALLOCSET_DEFAULT_SIZES); + s->curTransactionContext = CurTransactionContext; + + /* Make the CurTransactionContext active. */ + MemoryContextSwitchTo(CurTransactionContext); +} + +/* + * AtSubStart_ResourceOwner + */ +static void +AtSubStart_ResourceOwner(void) +{ + TransactionState s = CurrentTransactionState; + + Assert(s->parent != NULL); + + /* + * Create a resource owner for the subtransaction. We make it a child of + * the immediate parent's resource owner. 
+ */ + s->curTransactionOwner = + ResourceOwnerCreate(s->parent->curTransactionOwner, + "SubTransaction"); + + CurTransactionResourceOwner = s->curTransactionOwner; + CurrentResourceOwner = s->curTransactionOwner; +} + +/* ---------------------------------------------------------------- + * CommitTransaction stuff + * ---------------------------------------------------------------- + */ + +/* + * RecordTransactionCommit + * + * Returns latest XID among xact and its children, or InvalidTransactionId + * if the xact has no XID. (We compute that here just because it's easier.) + * + * If you change this function, see RecordTransactionCommitPrepared also. + */ +static TransactionId +RecordTransactionCommit(void) +{ + TransactionId xid = GetTopTransactionIdIfAny(); + bool markXidCommitted = TransactionIdIsValid(xid); + TransactionId latestXid = InvalidTransactionId; + int nrels; + RelFileNode *rels; + int nchildren; + TransactionId *children; + int nmsgs = 0; + SharedInvalidationMessage *invalMessages = NULL; + bool RelcacheInitFileInval = false; + bool wrote_xlog; + + /* + * Log pending invalidations for logical decoding of in-progress + * transactions. Normally for DDLs, we log this at each command end, + * however, for certain cases where we directly update the system table + * without a transaction block, the invalidations are not logged till this + * time. + */ + if (XLogLogicalInfoActive()) + LogLogicalInvalidations(); + + /* Get data needed for commit record */ + nrels = smgrGetPendingDeletes(true, &rels); + nchildren = xactGetCommittedChildren(&children); + if (XLogStandbyInfoActive()) + nmsgs = xactGetCommittedInvalidationMessages(&invalMessages, + &RelcacheInitFileInval); + wrote_xlog = (XactLastRecEnd != 0); + + /* + * If we haven't been assigned an XID yet, we neither can, nor do we want + * to write a COMMIT record. + */ + if (!markXidCommitted) + { + /* + * We expect that every RelationDropStorage is followed by a catalog + * update, and hence XID assignment, so we shouldn't get here with any + * pending deletes. Use a real test not just an Assert to check this, + * since it's a bit fragile. + */ + if (nrels != 0) + elog(ERROR, "cannot commit a transaction that deleted files but has no xid"); + + /* Can't have child XIDs either; AssignTransactionId enforces this */ + Assert(nchildren == 0); + + /* + * Transactions without an assigned xid can contain invalidation + * messages (e.g. explicit relcache invalidations or catcache + * invalidations for inplace updates); standbys need to process those. + * We can't emit a commit record without an xid, and we don't want to + * force assigning an xid, because that'd be problematic for e.g. + * vacuum. Hence we emit a bespoke record for the invalidations. We + * don't want to use that in case a commit record is emitted, so they + * happen synchronously with commits (besides not wanting to emit more + * WAL records). + */ + if (nmsgs != 0) + { + LogStandbyInvalidations(nmsgs, invalMessages, + RelcacheInitFileInval); + wrote_xlog = true; /* not strictly necessary */ + } + + /* + * If we didn't create XLOG entries, we're done here; otherwise we + * should trigger flushing those entries the same as a commit record + * would. This will primarily happen for HOT pruning and the like; we + * want these to be flushed to disk in due time. + */ + if (!wrote_xlog) + goto cleanup; + } + else + { + bool replorigin; + + /* + * Are we using the replication origins feature? Or, in other words, + * are we replaying remote actions? 
+ */ + replorigin = (replorigin_session_origin != InvalidRepOriginId && + replorigin_session_origin != DoNotReplicateId); + + /* + * Begin commit critical section and insert the commit XLOG record. + */ + /* Tell bufmgr and smgr to prepare for commit */ + BufmgrCommit(); + + /* + * Mark ourselves as within our "commit critical section". This + * forces any concurrent checkpoint to wait until we've updated + * pg_xact. Without this, it is possible for the checkpoint to set + * REDO after the XLOG record but fail to flush the pg_xact update to + * disk, leading to loss of the transaction commit if the system + * crashes a little later. + * + * Note: we could, but don't bother to, set this flag in + * RecordTransactionAbort. That's because loss of a transaction abort + * is noncritical; the presumption would be that it aborted, anyway. + * + * It's safe to change the delayChkpt flag of our own backend without + * holding the ProcArrayLock, since we're the only one modifying it. + * This makes checkpoint's determination of which xacts are delayChkpt + * a bit fuzzy, but it doesn't matter. + */ + Assert(!MyProc->delayChkpt); + START_CRIT_SECTION(); + MyProc->delayChkpt = true; + + SetCurrentTransactionStopTimestamp(); + + XactLogCommitRecord(xactStopTimestamp, + nchildren, children, nrels, rels, + nmsgs, invalMessages, + RelcacheInitFileInval, + MyXactFlags, + InvalidTransactionId, NULL /* plain commit */ ); + + if (replorigin) + /* Move LSNs forward for this replication origin */ + replorigin_session_advance(replorigin_session_origin_lsn, + XactLastRecEnd); + + /* + * Record commit timestamp. The value comes from plain commit + * timestamp if there's no replication origin; otherwise, the + * timestamp was already set in replorigin_session_origin_timestamp by + * replication. + * + * We don't need to WAL-log anything here, as the commit record + * written above already contains the data. + */ + + if (!replorigin || replorigin_session_origin_timestamp == 0) + replorigin_session_origin_timestamp = xactStopTimestamp; + + TransactionTreeSetCommitTsData(xid, nchildren, children, + replorigin_session_origin_timestamp, + replorigin_session_origin); + } + + /* + * Check if we want to commit asynchronously. We can allow the XLOG flush + * to happen asynchronously if synchronous_commit=off, or if the current + * transaction has not performed any WAL-logged operation or didn't assign + * an xid. The transaction can end up not writing any WAL, even if it has + * an xid, if it only wrote to temporary and/or unlogged tables. It can + * end up having written WAL without an xid if it did HOT pruning. In + * case of a crash, the loss of such a transaction will be irrelevant; + * temp tables will be lost anyway, unlogged tables will be truncated and + * HOT pruning will be done again later. (Given the foregoing, you might + * think that it would be unnecessary to emit the XLOG record at all in + * this case, but we don't currently try to do that. It would certainly + * cause problems at least in Hot Standby mode, where the + * KnownAssignedXids machinery requires tracking every XID assignment. It + * might be OK to skip it only when wal_level < replica, but for now we + * don't.) + * + * However, if we're doing cleanup of any non-temp rels or committing any + * command that wanted to force sync commit, then we must flush XLOG + * immediately. 
(We must not allow asynchronous commit if there are any + * non-temp tables to be deleted, because we might delete the files before + * the COMMIT record is flushed to disk. We do allow asynchronous commit + * if all to-be-deleted tables are temporary though, since they are lost + * anyway if we crash.) + */ + if ((wrote_xlog && markXidCommitted && + synchronous_commit > SYNCHRONOUS_COMMIT_OFF) || + forceSyncCommit || nrels > 0) + { + XLogFlush(XactLastRecEnd); + + /* + * Now we may update the CLOG, if we wrote a COMMIT record above + */ + if (markXidCommitted) + TransactionIdCommitTree(xid, nchildren, children); + } + else + { + /* + * Asynchronous commit case: + * + * This enables possible committed transaction loss in the case of a + * postmaster crash because WAL buffers are left unwritten. Ideally we + * could issue the WAL write without the fsync, but some + * wal_sync_methods do not allow separate write/fsync. + * + * Report the latest async commit LSN, so that the WAL writer knows to + * flush this commit. + */ + XLogSetAsyncXactLSN(XactLastRecEnd); + + /* + * We must not immediately update the CLOG, since we didn't flush the + * XLOG. Instead, we store the LSN up to which the XLOG must be + * flushed before the CLOG may be updated. + */ + if (markXidCommitted) + TransactionIdAsyncCommitTree(xid, nchildren, children, XactLastRecEnd); + } + + /* + * If we entered a commit critical section, leave it now, and let + * checkpoints proceed. + */ + if (markXidCommitted) + { + MyProc->delayChkpt = false; + END_CRIT_SECTION(); + } + + /* Compute latestXid while we have the child XIDs handy */ + latestXid = TransactionIdLatest(xid, nchildren, children); + + /* + * Wait for synchronous replication, if required. Similar to the decision + * above about using committing asynchronously we only want to wait if + * this backend assigned an xid and wrote WAL. No need to wait if an xid + * was assigned due to temporary/unlogged tables or due to HOT pruning. + * + * Note that at this stage we have marked clog, but still show as running + * in the procarray and continue to hold locks. + */ + if (wrote_xlog && markXidCommitted) + SyncRepWaitForLSN(XactLastRecEnd, true); + + /* remember end of last commit record */ + XactLastCommitEnd = XactLastRecEnd; + + /* Reset XactLastRecEnd until the next transaction writes something */ + XactLastRecEnd = 0; +cleanup: + /* Clean up local data */ + if (rels) + pfree(rels); + + return latestXid; +} + + +/* + * AtCCI_LocalCache + */ +static void +AtCCI_LocalCache(void) +{ + /* + * Make any pending relation map changes visible. We must do this before + * processing local sinval messages, so that the map changes will get + * reflected into the relcache when relcache invals are processed. + */ + AtCCI_RelationMap(); + + /* + * Make catalog changes visible to me for the next command. + */ + CommandEndInvalidationMessages(); +} + +/* + * AtCommit_Memory + */ +static void +AtCommit_Memory(void) +{ + /* + * Now that we're "out" of a transaction, have the system allocate things + * in the top memory context instead of per-transaction contexts. + */ + MemoryContextSwitchTo(TopMemoryContext); + + /* + * Release all transaction-local memory. 
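The flush-or-defer choice that RecordTransactionCommit makes above reduces to a single predicate over a handful of inputs. The stand-alone sketch below restates that predicate with invented stand-ins (SyncCommitLevel, SYNC_COMMIT_OFF, must_commit_synchronously); it is an illustrative model of the logic shown above, not the backend's actual types or API.

/*
 * Illustrative stand-alone model of the flush decision in
 * RecordTransactionCommit above; all names here are invented stand-ins.
 */
#include <stdbool.h>
#include <stdio.h>

typedef enum
{
    SYNC_COMMIT_OFF,            /* models synchronous_commit = off */
    SYNC_COMMIT_ON              /* models any stricter setting */
} SyncCommitLevel;

/*
 * True  -> flush WAL now and update CLOG immediately.
 * False -> hand the LSN to the WAL writer and defer the CLOG update
 *          until the commit record has reached disk.
 */
static bool
must_commit_synchronously(bool wrote_xlog,
                          bool mark_xid_committed,
                          SyncCommitLevel sync_commit,
                          bool force_sync_commit,
                          int  n_pending_deletes)
{
    if (wrote_xlog && mark_xid_committed && sync_commit > SYNC_COMMIT_OFF)
        return true;

    /* deleting non-temp files requires the commit record on disk first */
    if (force_sync_commit || n_pending_deletes > 0)
        return true;

    return false;
}

int
main(void)
{
    /* async case: WAL written, xid assigned, but synchronous_commit=off */
    printf("%d\n", must_commit_synchronously(true, true, SYNC_COMMIT_OFF,
                                             false, 0));   /* prints 0 */
    /* a pending non-temp file deletion forces a flush regardless */
    printf("%d\n", must_commit_synchronously(true, true, SYNC_COMMIT_OFF,
                                             false, 1));   /* prints 1 */
    return 0;
}

When the model returns false, the corresponding branch above merely calls XLogSetAsyncXactLSN() and defers the CLOG update through TransactionIdAsyncCommitTree().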
+ */ + Assert(TopTransactionContext != NULL); + MemoryContextDelete(TopTransactionContext); + TopTransactionContext = NULL; + CurTransactionContext = NULL; + CurrentTransactionState->curTransactionContext = NULL; +} + +/* ---------------------------------------------------------------- + * CommitSubTransaction stuff + * ---------------------------------------------------------------- + */ + +/* + * AtSubCommit_Memory + */ +static void +AtSubCommit_Memory(void) +{ + TransactionState s = CurrentTransactionState; + + Assert(s->parent != NULL); + + /* Return to parent transaction level's memory context. */ + CurTransactionContext = s->parent->curTransactionContext; + MemoryContextSwitchTo(CurTransactionContext); + + /* + * Ordinarily we cannot throw away the child's CurTransactionContext, + * since the data it contains will be needed at upper commit. However, if + * there isn't actually anything in it, we can throw it away. This avoids + * a small memory leak in the common case of "trivial" subxacts. + */ + if (MemoryContextIsEmpty(s->curTransactionContext)) + { + MemoryContextDelete(s->curTransactionContext); + s->curTransactionContext = NULL; + } +} + +/* + * AtSubCommit_childXids + * + * Pass my own XID and my child XIDs up to my parent as committed children. + */ +static void +AtSubCommit_childXids(void) +{ + TransactionState s = CurrentTransactionState; + int new_nChildXids; + + Assert(s->parent != NULL); + + /* + * The parent childXids array will need to hold my XID and all my + * childXids, in addition to the XIDs already there. + */ + new_nChildXids = s->parent->nChildXids + s->nChildXids + 1; + + /* Allocate or enlarge the parent array if necessary */ + if (s->parent->maxChildXids < new_nChildXids) + { + int new_maxChildXids; + TransactionId *new_childXids; + + /* + * Make it 2x what's needed right now, to avoid having to enlarge it + * repeatedly. But we can't go above MaxAllocSize. (The latter limit + * is what ensures that we don't need to worry about integer overflow + * here or in the calculation of new_nChildXids.) + */ + new_maxChildXids = Min(new_nChildXids * 2, + (int) (MaxAllocSize / sizeof(TransactionId))); + + if (new_maxChildXids < new_nChildXids) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("maximum number of committed subtransactions (%d) exceeded", + (int) (MaxAllocSize / sizeof(TransactionId))))); + + /* + * We keep the child-XID arrays in TopTransactionContext; this avoids + * setting up child-transaction contexts for what might be just a few + * bytes of grandchild XIDs. + */ + if (s->parent->childXids == NULL) + new_childXids = + MemoryContextAlloc(TopTransactionContext, + new_maxChildXids * sizeof(TransactionId)); + else + new_childXids = repalloc(s->parent->childXids, + new_maxChildXids * sizeof(TransactionId)); + + s->parent->childXids = new_childXids; + s->parent->maxChildXids = new_maxChildXids; + } + + /* + * Copy all my XIDs to parent's array. + * + * Note: We rely on the fact that the XID of a child always follows that + * of its parent. By copying the XID of this subtransaction before the + * XIDs of its children, we ensure that the array stays ordered. Likewise, + * all XIDs already in the array belong to subtransactions started and + * subcommitted before us, so their XIDs must precede ours. 
+ */ + s->parent->childXids[s->parent->nChildXids] = XidFromFullTransactionId(s->fullTransactionId); + + if (s->nChildXids > 0) + memcpy(&s->parent->childXids[s->parent->nChildXids + 1], + s->childXids, + s->nChildXids * sizeof(TransactionId)); + + s->parent->nChildXids = new_nChildXids; + + /* Release child's array to avoid leakage */ + if (s->childXids != NULL) + pfree(s->childXids); + /* We must reset these to avoid double-free if fail later in commit */ + s->childXids = NULL; + s->nChildXids = 0; + s->maxChildXids = 0; +} + +/* ---------------------------------------------------------------- + * AbortTransaction stuff + * ---------------------------------------------------------------- + */ + +/* + * RecordTransactionAbort + * + * Returns latest XID among xact and its children, or InvalidTransactionId + * if the xact has no XID. (We compute that here just because it's easier.) + */ +static TransactionId +RecordTransactionAbort(bool isSubXact) +{ + TransactionId xid = GetCurrentTransactionIdIfAny(); + TransactionId latestXid; + int nrels; + RelFileNode *rels; + int nchildren; + TransactionId *children; + TimestampTz xact_time; + + /* + * If we haven't been assigned an XID, nobody will care whether we aborted + * or not. Hence, we're done in that case. It does not matter if we have + * rels to delete (note that this routine is not responsible for actually + * deleting 'em). We cannot have any child XIDs, either. + */ + if (!TransactionIdIsValid(xid)) + { + /* Reset XactLastRecEnd until the next transaction writes something */ + if (!isSubXact) + XactLastRecEnd = 0; + return InvalidTransactionId; + } + + /* + * We have a valid XID, so we should write an ABORT record for it. + * + * We do not flush XLOG to disk here, since the default assumption after a + * crash would be that we aborted, anyway. For the same reason, we don't + * need to worry about interlocking against checkpoint start. + */ + + /* + * Check that we haven't aborted halfway through RecordTransactionCommit. + */ + if (TransactionIdDidCommit(xid)) + elog(PANIC, "cannot abort transaction %u, it was already committed", + xid); + + /* Fetch the data we need for the abort record */ + nrels = smgrGetPendingDeletes(false, &rels); + nchildren = xactGetCommittedChildren(&children); + + /* XXX do we really need a critical section here? */ + START_CRIT_SECTION(); + + /* Write the ABORT record */ + if (isSubXact) + xact_time = GetCurrentTimestamp(); + else + { + SetCurrentTransactionStopTimestamp(); + xact_time = xactStopTimestamp; + } + + XactLogAbortRecord(xact_time, + nchildren, children, + nrels, rels, + MyXactFlags, InvalidTransactionId, + NULL); + + /* + * Report the latest async abort LSN, so that the WAL writer knows to + * flush this abort. There's nothing to be gained by delaying this, since + * WALWriter may as well do this when it can. This is important with + * streaming replication because if we don't flush WAL regularly we will + * find that large aborts leave us with a long backlog for when commits + * occur after the abort, increasing our window of data loss should + * problems occur at that point. + */ + if (!isSubXact) + XLogSetAsyncXactLSN(XactLastRecEnd); + + /* + * Mark the transaction aborted in clog. This is not absolutely necessary + * but we may as well do it while we are here; also, in the subxact case + * it is helpful because XactLockTableWait makes use of it to avoid + * waiting for already-aborted subtransactions. 
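AtSubCommit_childXids, completed just above, grows the parent's childXids array by doubling the size it needs while capping the allocation at MaxAllocSize worth of TransactionIds. The following stand-alone sketch restates that sizing rule; MAX_XID_SLOTS and new_child_xid_capacity() are invented stand-ins for the real cap and code.

/*
 * Stand-alone model of the array-sizing rule in AtSubCommit_childXids
 * above.  MAX_XID_SLOTS is an illustrative cap, not the real
 * MaxAllocSize / sizeof(TransactionId) value.
 */
#include <stdio.h>

#define MAX_XID_SLOTS (0x3fffffff / 4)  /* illustrative cap only */

/*
 * Return the capacity the parent's childXids array should have after
 * absorbing "needed" entries, or -1 for the error path where even the
 * capped size is too small.
 */
static int
new_child_xid_capacity(int current_max, int needed)
{
    int new_max;

    if (needed <= current_max)
        return current_max;         /* no growth required */

    /*
     * Double what is needed right now so we don't enlarge repeatedly, but
     * never exceed the cap.  (The real code leans on the MaxAllocSize cap
     * to keep this arithmetic overflow-free; the extra guard below just
     * keeps the stand-alone model safe.)
     */
    if (needed > MAX_XID_SLOTS / 2)
        new_max = MAX_XID_SLOTS;
    else
        new_max = needed * 2;

    if (new_max < needed)
        return -1;                  /* corresponds to the ereport(ERROR) */

    return new_max;
}

int
main(void)
{
    printf("%d\n", new_child_xid_capacity(0, 3));   /* prints 6 */
    printf("%d\n", new_child_xid_capacity(6, 4));   /* prints 6 */
    return 0;
}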
It is OK to do it without + * having flushed the ABORT record to disk, because in event of a crash + * we'd be assumed to have aborted anyway. + */ + TransactionIdAbortTree(xid, nchildren, children); + + END_CRIT_SECTION(); + + /* Compute latestXid while we have the child XIDs handy */ + latestXid = TransactionIdLatest(xid, nchildren, children); + + /* + * If we're aborting a subtransaction, we can immediately remove failed + * XIDs from PGPROC's cache of running child XIDs. We do that here for + * subxacts, because we already have the child XID array at hand. For + * main xacts, the equivalent happens just after this function returns. + */ + if (isSubXact) + XidCacheRemoveRunningXids(xid, nchildren, children, latestXid); + + /* Reset XactLastRecEnd until the next transaction writes something */ + if (!isSubXact) + XactLastRecEnd = 0; + + /* And clean up local data */ + if (rels) + pfree(rels); + + return latestXid; +} + +/* + * AtAbort_Memory + */ +static void +AtAbort_Memory(void) +{ + /* + * Switch into TransactionAbortContext, which should have some free space + * even if nothing else does. We'll work in this context until we've + * finished cleaning up. + * + * It is barely possible to get here when we've not been able to create + * TransactionAbortContext yet; if so use TopMemoryContext. + */ + if (TransactionAbortContext != NULL) + MemoryContextSwitchTo(TransactionAbortContext); + else + MemoryContextSwitchTo(TopMemoryContext); +} + +/* + * AtSubAbort_Memory + */ +static void +AtSubAbort_Memory(void) +{ + Assert(TransactionAbortContext != NULL); + + MemoryContextSwitchTo(TransactionAbortContext); +} + + +/* + * AtAbort_ResourceOwner + */ +static void +AtAbort_ResourceOwner(void) +{ + /* + * Make sure we have a valid ResourceOwner, if possible (else it will be + * NULL, which is OK) + */ + CurrentResourceOwner = TopTransactionResourceOwner; +} + +/* + * AtSubAbort_ResourceOwner + */ +static void +AtSubAbort_ResourceOwner(void) +{ + TransactionState s = CurrentTransactionState; + + /* Make sure we have a valid ResourceOwner */ + CurrentResourceOwner = s->curTransactionOwner; +} + + +/* + * AtSubAbort_childXids + */ +static void +AtSubAbort_childXids(void) +{ + TransactionState s = CurrentTransactionState; + + /* + * We keep the child-XID arrays in TopTransactionContext (see + * AtSubCommit_childXids). This means we'd better free the array + * explicitly at abort to avoid leakage. + */ + if (s->childXids != NULL) + pfree(s->childXids); + s->childXids = NULL; + s->nChildXids = 0; + s->maxChildXids = 0; + + /* + * We could prune the unreportedXids array here. But we don't bother. That + * would potentially reduce number of XLOG_XACT_ASSIGNMENT records but it + * would likely introduce more CPU time into the more common paths, so we + * choose not to do that. + */ +} + +/* ---------------------------------------------------------------- + * CleanupTransaction stuff + * ---------------------------------------------------------------- + */ + +/* + * AtCleanup_Memory + */ +static void +AtCleanup_Memory(void) +{ + Assert(CurrentTransactionState->parent == NULL); + + /* + * Now that we're "out" of a transaction, have the system allocate things + * in the top memory context instead of per-transaction contexts. + */ + MemoryContextSwitchTo(TopMemoryContext); + + /* + * Clear the special abort context for next time. + */ + if (TransactionAbortContext != NULL) + MemoryContextResetAndDeleteChildren(TransactionAbortContext); + + /* + * Release all transaction-local memory. 
+ */ + if (TopTransactionContext != NULL) + MemoryContextDelete(TopTransactionContext); + TopTransactionContext = NULL; + CurTransactionContext = NULL; + CurrentTransactionState->curTransactionContext = NULL; +} + + +/* ---------------------------------------------------------------- + * CleanupSubTransaction stuff + * ---------------------------------------------------------------- + */ + +/* + * AtSubCleanup_Memory + */ +static void +AtSubCleanup_Memory(void) +{ + TransactionState s = CurrentTransactionState; + + Assert(s->parent != NULL); + + /* Make sure we're not in an about-to-be-deleted context */ + MemoryContextSwitchTo(s->parent->curTransactionContext); + CurTransactionContext = s->parent->curTransactionContext; + + /* + * Clear the special abort context for next time. + */ + if (TransactionAbortContext != NULL) + MemoryContextResetAndDeleteChildren(TransactionAbortContext); + + /* + * Delete the subxact local memory contexts. Its CurTransactionContext can + * go too (note this also kills CurTransactionContexts from any children + * of the subxact). + */ + if (s->curTransactionContext) + MemoryContextDelete(s->curTransactionContext); + s->curTransactionContext = NULL; +} + +/* ---------------------------------------------------------------- + * interface routines + * ---------------------------------------------------------------- + */ + +/* + * StartTransaction + */ +static void +StartTransaction(void) +{ + TransactionState s; + VirtualTransactionId vxid; + + /* + * Let's just make sure the state stack is empty + */ + s = &TopTransactionStateData; + CurrentTransactionState = s; + + Assert(!FullTransactionIdIsValid(XactTopFullTransactionId)); + + /* check the current transaction state */ + Assert(s->state == TRANS_DEFAULT); + + /* + * Set the current transaction state information appropriately during + * start processing. Note that once the transaction status is switched + * this process cannot fail until the user ID and the security context + * flags are fetched below. + */ + s->state = TRANS_START; + s->fullTransactionId = InvalidFullTransactionId; /* until assigned */ + + /* Determine if statements are logged in this transaction */ + xact_is_sampled = log_xact_sample_rate != 0 && + (log_xact_sample_rate == 1 || + random() <= log_xact_sample_rate * MAX_RANDOM_VALUE); + + /* + * initialize current transaction state fields + * + * note: prevXactReadOnly is not used at the outermost level + */ + s->nestingLevel = 1; + s->gucNestLevel = 1; + s->childXids = NULL; + s->nChildXids = 0; + s->maxChildXids = 0; + + /* + * Once the current user ID and the security context flags are fetched, + * both will be properly reset even if transaction startup fails. + */ + GetUserIdAndSecContext(&s->prevUser, &s->prevSecContext); + + /* SecurityRestrictionContext should never be set outside a transaction */ + Assert(s->prevSecContext == 0); + + /* + * Make sure we've reset xact state variables + * + * If recovery is still in progress, mark this transaction as read-only. + * We have lower level defences in XLogInsert and elsewhere to stop us + * from modifying data during recovery, but this gives the normal + * indication to the user that the transaction is read-only. 
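The memory routines above (AtStart_Memory, AtSubStart_Memory, AtCommit_Memory, AtSubCommit_Memory, AtCleanup_Memory, AtSubCleanup_Memory) all rely on the same parent/child context discipline: allocate into a context dedicated to the current (sub)transaction, then reclaim everything at once by deleting that context. A hedged sketch of the pattern as a hypothetical extension function follows; work_in_scratch_context() is invented, while the MemoryContext calls are the same ones used above.

/*
 * Hedged sketch of the parent/child context discipline, written as a
 * hypothetical extension function.
 */
#include "postgres.h"
#include "utils/memutils.h"

static void
work_in_scratch_context(void)
{
    MemoryContext scratch;
    MemoryContext oldcxt;

    /*
     * Child of whatever context is current, just as AtSubStart_Memory
     * hangs a new CurTransactionContext off the parent's context.
     */
    scratch = AllocSetContextCreate(CurrentMemoryContext,
                                    "ScratchContext",
                                    ALLOCSET_DEFAULT_SIZES);
    oldcxt = MemoryContextSwitchTo(scratch);

    /* temporary allocations land in, and die with, the scratch context */
    (void) palloc(128);

    MemoryContextSwitchTo(oldcxt);

    /*
     * One MemoryContextDelete() reclaims everything at once, the same way
     * AtSubCleanup_Memory discards a failed subtransaction's context.
     */
    MemoryContextDelete(scratch);
}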
+ */ + if (RecoveryInProgress()) + { + s->startedInRecovery = true; + XactReadOnly = true; + } + else + { + s->startedInRecovery = false; + XactReadOnly = DefaultXactReadOnly; + } + XactDeferrable = DefaultXactDeferrable; + XactIsoLevel = DefaultXactIsoLevel; + forceSyncCommit = false; + MyXactFlags = 0; + + /* + * reinitialize within-transaction counters + */ + s->subTransactionId = TopSubTransactionId; + currentSubTransactionId = TopSubTransactionId; + currentCommandId = FirstCommandId; + currentCommandIdUsed = false; + + /* + * initialize reported xid accounting + */ + nUnreportedXids = 0; + s->didLogXid = false; + + /* + * must initialize resource-management stuff first + */ + AtStart_Memory(); + AtStart_ResourceOwner(); + + /* + * Assign a new LocalTransactionId, and combine it with the backendId to + * form a virtual transaction id. + */ + vxid.backendId = MyBackendId; + vxid.localTransactionId = GetNextLocalTransactionId(); + + /* + * Lock the virtual transaction id before we announce it in the proc array + */ + VirtualXactLockTableInsert(vxid); + + /* + * Advertise it in the proc array. We assume assignment of + * localTransactionId is atomic, and the backendId should be set already. + */ + Assert(MyProc->backendId == vxid.backendId); + MyProc->lxid = vxid.localTransactionId; + + TRACE_POSTGRESQL_TRANSACTION_START(vxid.localTransactionId); + + /* + * set transaction_timestamp() (a/k/a now()). Normally, we want this to + * be the same as the first command's statement_timestamp(), so don't do a + * fresh GetCurrentTimestamp() call (which'd be expensive anyway). But + * for transactions started inside procedures (i.e., nonatomic SPI + * contexts), we do need to advance the timestamp. Also, in a parallel + * worker, the timestamp should already have been provided by a call to + * SetParallelStartTimestamps(). + */ + if (!IsParallelWorker()) + { + if (!SPI_inside_nonatomic_context()) + xactStartTimestamp = stmtStartTimestamp; + else + xactStartTimestamp = GetCurrentTimestamp(); + } + else + Assert(xactStartTimestamp != 0); + pgstat_report_xact_timestamp(xactStartTimestamp); + /* Mark xactStopTimestamp as unset. */ + xactStopTimestamp = 0; + + /* + * initialize other subsystems for new transaction + */ + AtStart_GUC(); + AtStart_Cache(); + AfterTriggerBeginXact(); + + /* + * done with start processing, set current transaction state to "in + * progress" + */ + s->state = TRANS_INPROGRESS; + + ShowTransactionState("StartTransaction"); +} + + +/* + * CommitTransaction + * + * NB: if you change this routine, better look at PrepareTransaction too! + */ +static void +CommitTransaction(void) +{ + TransactionState s = CurrentTransactionState; + TransactionId latestXid; + bool is_parallel_worker; + + is_parallel_worker = (s->blockState == TBLOCK_PARALLEL_INPROGRESS); + + /* Enforce parallel mode restrictions during parallel worker commit. */ + if (is_parallel_worker) + EnterParallelMode(); + + ShowTransactionState("CommitTransaction"); + + /* + * check the current transaction state + */ + if (s->state != TRANS_INPROGRESS) + elog(WARNING, "CommitTransaction while in %s state", + TransStateAsString(s->state)); + Assert(s->parent == NULL); + + /* + * Do pre-commit processing that involves calling user-defined code, such + * as triggers. SECURITY_RESTRICTED_OPERATION contexts must not queue an + * action that would run here, because that would bypass the sandbox. 
+ * Since closing cursors could queue trigger actions, triggers could open + * cursors, etc, we have to keep looping until there's nothing left to do. + */ + for (;;) + { + /* + * Fire all currently pending deferred triggers. + */ + AfterTriggerFireDeferred(); + + /* + * Close open portals (converting holdable ones into static portals). + * If there weren't any, we are done ... otherwise loop back to check + * if they queued deferred triggers. Lather, rinse, repeat. + */ + if (!PreCommit_Portals(false)) + break; + } + + /* + * The remaining actions cannot call any user-defined code, so it's safe + * to start shutting down within-transaction services. But note that most + * of this stuff could still throw an error, which would switch us into + * the transaction-abort path. + */ + + CallXactCallbacks(is_parallel_worker ? XACT_EVENT_PARALLEL_PRE_COMMIT + : XACT_EVENT_PRE_COMMIT); + + /* If we might have parallel workers, clean them up now. */ + if (IsInParallelMode()) + AtEOXact_Parallel(true); + + /* Shut down the deferred-trigger manager */ + AfterTriggerEndXact(true); + + /* + * Let ON COMMIT management do its thing (must happen after closing + * cursors, to avoid dangling-reference problems) + */ + PreCommit_on_commit_actions(); + + /* + * Synchronize files that are created and not WAL-logged during this + * transaction. This must happen before AtEOXact_RelationMap(), so that we + * don't see committed-but-broken files after a crash. + */ + smgrDoPendingSyncs(true, is_parallel_worker); + + /* close large objects before lower-level cleanup */ + AtEOXact_LargeObject(true); + + /* + * Insert notifications sent by NOTIFY commands into the queue. This + * should be late in the pre-commit sequence to minimize time spent + * holding the notify-insertion lock. However, this could result in + * creating a snapshot, so we must do it before serializable cleanup. + */ + PreCommit_Notify(); + + /* + * Mark serializable transaction as complete for predicate locking + * purposes. This should be done as late as we can put it and still allow + * errors to be raised for failure patterns found at commit. This is not + * appropriate in a parallel worker however, because we aren't committing + * the leader's transaction and its serializable state will live on. + */ + if (!is_parallel_worker) + PreCommit_CheckForSerializationFailure(); + + /* Prevent cancel/die interrupt while cleaning up */ + HOLD_INTERRUPTS(); + + /* Commit updates to the relation map --- do this as late as possible */ + AtEOXact_RelationMap(true, is_parallel_worker); + + /* + * set the current transaction state information appropriately during + * commit processing + */ + s->state = TRANS_COMMIT; + s->parallelModeLevel = 0; + + if (!is_parallel_worker) + { + /* + * We need to mark our XIDs as committed in pg_xact. This is where we + * durably commit. + */ + latestXid = RecordTransactionCommit(); + } + else + { + /* + * We must not mark our XID committed; the parallel leader is + * responsible for that. + */ + latestXid = InvalidTransactionId; + + /* + * Make sure the leader will know about any WAL we wrote before it + * commits. + */ + ParallelWorkerReportLastRecEnd(XactLastRecEnd); + } + + TRACE_POSTGRESQL_TRANSACTION_COMMIT(MyProc->lxid); + + /* + * Let others know about no transaction in progress by me. Note that this + * must be done _before_ releasing locks we hold and _after_ + * RecordTransactionCommit. + */ + ProcArrayEndTransaction(MyProc, latestXid); + + /* + * This is all post-commit cleanup. 
Note that if an error is raised here, + * it's too late to abort the transaction. This should be just + * noncritical resource releasing. + * + * The ordering of operations is not entirely random. The idea is: + * release resources visible to other backends (eg, files, buffer pins); + * then release locks; then release backend-local resources. We want to + * release locks at the point where any backend waiting for us will see + * our transaction as being fully cleaned up. + * + * Resources that can be associated with individual queries are handled by + * the ResourceOwner mechanism. The other calls here are for backend-wide + * state. + */ + + CallXactCallbacks(is_parallel_worker ? XACT_EVENT_PARALLEL_COMMIT + : XACT_EVENT_COMMIT); + + ResourceOwnerRelease(TopTransactionResourceOwner, + RESOURCE_RELEASE_BEFORE_LOCKS, + true, true); + + /* Check we've released all buffer pins */ + AtEOXact_Buffers(true); + + /* Clean up the relation cache */ + AtEOXact_RelationCache(true); + + /* + * Make catalog changes visible to all backends. This has to happen after + * relcache references are dropped (see comments for + * AtEOXact_RelationCache), but before locks are released (if anyone is + * waiting for lock on a relation we've modified, we want them to know + * about the catalog change before they start using the relation). + */ + AtEOXact_Inval(true); + + AtEOXact_MultiXact(); + + ResourceOwnerRelease(TopTransactionResourceOwner, + RESOURCE_RELEASE_LOCKS, + true, true); + ResourceOwnerRelease(TopTransactionResourceOwner, + RESOURCE_RELEASE_AFTER_LOCKS, + true, true); + + /* + * Likewise, dropping of files deleted during the transaction is best done + * after releasing relcache and buffer pins. (This is not strictly + * necessary during commit, since such pins should have been released + * already, but this ordering is definitely critical during abort.) Since + * this may take many seconds, also delay until after releasing locks. + * Other backends will observe the attendant catalog changes and not + * attempt to access affected files. + */ + smgrDoPendingDeletes(true); + + /* + * Send out notification signals to other backends (and do other + * post-commit NOTIFY cleanup). This must not happen until after our + * transaction is fully done from the viewpoint of other backends. + */ + AtCommit_Notify(); + + /* + * Everything after this should be purely internal-to-this-backend + * cleanup. 
+ */ + AtEOXact_GUC(true, 1); + AtEOXact_SPI(true); + AtEOXact_Enum(); + AtEOXact_on_commit_actions(true); + AtEOXact_Namespace(true, is_parallel_worker); + AtEOXact_SMgr(); + AtEOXact_Files(true); + AtEOXact_ComboCid(); + AtEOXact_HashTables(true); + AtEOXact_PgStat(true, is_parallel_worker); + AtEOXact_Snapshot(true, false); + AtEOXact_ApplyLauncher(true); + pgstat_report_xact_timestamp(0); + + CurrentResourceOwner = NULL; + ResourceOwnerDelete(TopTransactionResourceOwner); + s->curTransactionOwner = NULL; + CurTransactionResourceOwner = NULL; + TopTransactionResourceOwner = NULL; + + AtCommit_Memory(); + + s->fullTransactionId = InvalidFullTransactionId; + s->subTransactionId = InvalidSubTransactionId; + s->nestingLevel = 0; + s->gucNestLevel = 0; + s->childXids = NULL; + s->nChildXids = 0; + s->maxChildXids = 0; + + XactTopFullTransactionId = InvalidFullTransactionId; + nParallelCurrentXids = 0; + + /* + * done with commit processing, set current transaction state back to + * default + */ + s->state = TRANS_DEFAULT; + + RESUME_INTERRUPTS(); +} + + +/* + * PrepareTransaction + * + * NB: if you change this routine, better look at CommitTransaction too! + */ +static void +PrepareTransaction(void) +{ + TransactionState s = CurrentTransactionState; + TransactionId xid = GetCurrentTransactionId(); + GlobalTransaction gxact; + TimestampTz prepared_at; + + Assert(!IsInParallelMode()); + + ShowTransactionState("PrepareTransaction"); + + /* + * check the current transaction state + */ + if (s->state != TRANS_INPROGRESS) + elog(WARNING, "PrepareTransaction while in %s state", + TransStateAsString(s->state)); + Assert(s->parent == NULL); + + /* + * Do pre-commit processing that involves calling user-defined code, such + * as triggers. Since closing cursors could queue trigger actions, + * triggers could open cursors, etc, we have to keep looping until there's + * nothing left to do. + */ + for (;;) + { + /* + * Fire all currently pending deferred triggers. + */ + AfterTriggerFireDeferred(); + + /* + * Close open portals (converting holdable ones into static portals). + * If there weren't any, we are done ... otherwise loop back to check + * if they queued deferred triggers. Lather, rinse, repeat. + */ + if (!PreCommit_Portals(true)) + break; + } + + CallXactCallbacks(XACT_EVENT_PRE_PREPARE); + + /* + * The remaining actions cannot call any user-defined code, so it's safe + * to start shutting down within-transaction services. But note that most + * of this stuff could still throw an error, which would switch us into + * the transaction-abort path. + */ + + /* Shut down the deferred-trigger manager */ + AfterTriggerEndXact(true); + + /* + * Let ON COMMIT management do its thing (must happen after closing + * cursors, to avoid dangling-reference problems) + */ + PreCommit_on_commit_actions(); + + /* + * Synchronize files that are created and not WAL-logged during this + * transaction. This must happen before EndPrepare(), so that we don't see + * committed-but-broken files after a crash and COMMIT PREPARED. + */ + smgrDoPendingSyncs(true, false); + + /* close large objects before lower-level cleanup */ + AtEOXact_LargeObject(true); + + /* NOTIFY requires no work at this point */ + + /* + * Mark serializable transaction as complete for predicate locking + * purposes. This should be done as late as we can put it and still allow + * errors to be raised for failure patterns found at commit. 
+ */ + PreCommit_CheckForSerializationFailure(); + + /* + * Don't allow PREPARE TRANSACTION if we've accessed a temporary table in + * this transaction. Having the prepared xact hold locks on another + * backend's temp table seems a bad idea --- for instance it would prevent + * the backend from exiting. There are other problems too, such as how to + * clean up the source backend's local buffers and ON COMMIT state if the + * prepared xact includes a DROP of a temp table. + * + * Other objects types, like functions, operators or extensions, share the + * same restriction as they should not be created, locked or dropped as + * this can mess up with this session or even a follow-up session trying + * to use the same temporary namespace. + * + * We must check this after executing any ON COMMIT actions, because they + * might still access a temp relation. + * + * XXX In principle this could be relaxed to allow some useful special + * cases, such as a temp table created and dropped all within the + * transaction. That seems to require much more bookkeeping though. + */ + if ((MyXactFlags & XACT_FLAGS_ACCESSEDTEMPNAMESPACE)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot PREPARE a transaction that has operated on temporary objects"))); + + /* + * Likewise, don't allow PREPARE after pg_export_snapshot. This could be + * supported if we added cleanup logic to twophase.c, but for now it + * doesn't seem worth the trouble. + */ + if (XactHasExportedSnapshots()) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot PREPARE a transaction that has exported snapshots"))); + + /* Prevent cancel/die interrupt while cleaning up */ + HOLD_INTERRUPTS(); + + /* + * set the current transaction state information appropriately during + * prepare processing + */ + s->state = TRANS_PREPARE; + + prepared_at = GetCurrentTimestamp(); + + /* Tell bufmgr and smgr to prepare for commit */ + BufmgrCommit(); + + /* + * Reserve the GID for this transaction. This could fail if the requested + * GID is invalid or already in use. + */ + gxact = MarkAsPreparing(xid, prepareGID, prepared_at, + GetUserId(), MyDatabaseId); + prepareGID = NULL; + + /* + * Collect data for the 2PC state file. Note that in general, no actual + * state change should happen in the called modules during this step, + * since it's still possible to fail before commit, and in that case we + * want transaction abort to be able to clean up. (In particular, the + * AtPrepare routines may error out if they find cases they cannot + * handle.) State cleanup should happen in the PostPrepare routines + * below. However, some modules can go ahead and clear state here because + * they wouldn't do anything with it during abort anyway. + * + * Note: because the 2PC state file records will be replayed in the same + * order they are made, the order of these calls has to match the order in + * which we want things to happen during COMMIT PREPARED or ROLLBACK + * PREPARED; in particular, pay attention to whether things should happen + * before or after releasing the transaction's locks. + */ + StartPrepare(gxact); + + AtPrepare_Notify(); + AtPrepare_Locks(); + AtPrepare_PredicateLocks(); + AtPrepare_PgStat(); + AtPrepare_MultiXact(); + AtPrepare_RelationMap(); + + /* + * Here is where we really truly prepare. + * + * We have to record transaction prepares even if we didn't make any + * updates, because the transaction manager might get confused if we lose + * a global transaction. 
+ */ + EndPrepare(gxact); + + /* + * Now we clean up backend-internal state and release internal resources. + */ + + /* Reset XactLastRecEnd until the next transaction writes something */ + XactLastRecEnd = 0; + + /* + * Transfer our locks to a dummy PGPROC. This has to be done before + * ProcArrayClearTransaction(). Otherwise, a GetLockConflicts() would + * conclude "xact already committed or aborted" for our locks. + */ + PostPrepare_Locks(xid); + + /* + * Let others know about no transaction in progress by me. This has to be + * done *after* the prepared transaction has been marked valid, else + * someone may think it is unlocked and recyclable. + */ + ProcArrayClearTransaction(MyProc); + + /* + * In normal commit-processing, this is all non-critical post-transaction + * cleanup. When the transaction is prepared, however, it's important + * that the locks and other per-backend resources are transferred to the + * prepared transaction's PGPROC entry. Note that if an error is raised + * here, it's too late to abort the transaction. XXX: This probably should + * be in a critical section, to force a PANIC if any of this fails, but + * that cure could be worse than the disease. + */ + + CallXactCallbacks(XACT_EVENT_PREPARE); + + ResourceOwnerRelease(TopTransactionResourceOwner, + RESOURCE_RELEASE_BEFORE_LOCKS, + true, true); + + /* Check we've released all buffer pins */ + AtEOXact_Buffers(true); + + /* Clean up the relation cache */ + AtEOXact_RelationCache(true); + + /* notify doesn't need a postprepare call */ + + PostPrepare_PgStat(); + + PostPrepare_Inval(); + + PostPrepare_smgr(); + + PostPrepare_MultiXact(xid); + + PostPrepare_PredicateLocks(xid); + + ResourceOwnerRelease(TopTransactionResourceOwner, + RESOURCE_RELEASE_LOCKS, + true, true); + ResourceOwnerRelease(TopTransactionResourceOwner, + RESOURCE_RELEASE_AFTER_LOCKS, + true, true); + + /* + * Allow another backend to finish the transaction. After + * PostPrepare_Twophase(), the transaction is completely detached from our + * backend. The rest is just non-critical cleanup of backend-local state. 
+ */ + PostPrepare_Twophase(); + + /* PREPARE acts the same as COMMIT as far as GUC is concerned */ + AtEOXact_GUC(true, 1); + AtEOXact_SPI(true); + AtEOXact_Enum(); + AtEOXact_on_commit_actions(true); + AtEOXact_Namespace(true, false); + AtEOXact_SMgr(); + AtEOXact_Files(true); + AtEOXact_ComboCid(); + AtEOXact_HashTables(true); + /* don't call AtEOXact_PgStat here; we fixed pgstat state above */ + AtEOXact_Snapshot(true, true); + pgstat_report_xact_timestamp(0); + + CurrentResourceOwner = NULL; + ResourceOwnerDelete(TopTransactionResourceOwner); + s->curTransactionOwner = NULL; + CurTransactionResourceOwner = NULL; + TopTransactionResourceOwner = NULL; + + AtCommit_Memory(); + + s->fullTransactionId = InvalidFullTransactionId; + s->subTransactionId = InvalidSubTransactionId; + s->nestingLevel = 0; + s->gucNestLevel = 0; + s->childXids = NULL; + s->nChildXids = 0; + s->maxChildXids = 0; + + XactTopFullTransactionId = InvalidFullTransactionId; + nParallelCurrentXids = 0; + + /* + * done with 1st phase commit processing, set current transaction state + * back to default + */ + s->state = TRANS_DEFAULT; + + RESUME_INTERRUPTS(); +} + + +/* + * AbortTransaction + */ +static void +AbortTransaction(void) +{ + TransactionState s = CurrentTransactionState; + TransactionId latestXid; + bool is_parallel_worker; + + /* Prevent cancel/die interrupt while cleaning up */ + HOLD_INTERRUPTS(); + + /* Make sure we have a valid memory context and resource owner */ + AtAbort_Memory(); + AtAbort_ResourceOwner(); + + /* + * Release any LW locks we might be holding as quickly as possible. + * (Regular locks, however, must be held till we finish aborting.) + * Releasing LW locks is critical since we might try to grab them again + * while cleaning up! + */ + LWLockReleaseAll(); + + /* Clear wait information and command progress indicator */ + pgstat_report_wait_end(); + pgstat_progress_end_command(); + + /* Clean up buffer I/O and buffer context locks, too */ + AbortBufferIO(); + UnlockBuffers(); + + /* Reset WAL record construction state */ + XLogResetInsertion(); + + /* Cancel condition variable sleep */ + ConditionVariableCancelSleep(); + + /* + * Also clean up any open wait for lock, since the lock manager will choke + * if we try to wait for another lock before doing this. + */ + LockErrorCleanup(); + + /* + * If any timeout events are still active, make sure the timeout interrupt + * is scheduled. This covers possible loss of a timeout interrupt due to + * longjmp'ing out of the SIGINT handler (see notes in handle_sig_alarm). + * We delay this till after LockErrorCleanup so that we don't uselessly + * reschedule lock or deadlock check timeouts. + */ + reschedule_timeouts(); + + /* + * Re-enable signals, in case we got here by longjmp'ing out of a signal + * handler. We do this fairly early in the sequence so that the timeout + * infrastructure will be functional if needed while aborting. + */ + PG_SETMASK(&UnBlockSig); + + /* + * check the current transaction state + */ + is_parallel_worker = (s->blockState == TBLOCK_PARALLEL_INPROGRESS); + if (s->state != TRANS_INPROGRESS && s->state != TRANS_PREPARE) + elog(WARNING, "AbortTransaction while in %s state", + TransStateAsString(s->state)); + Assert(s->parent == NULL); + + /* + * set the current transaction state information appropriately during the + * abort processing + */ + s->state = TRANS_ABORT; + + /* + * Reset user ID which might have been changed transiently. 
We need this + * to clean up in case control escaped out of a SECURITY DEFINER function + * or other local change of CurrentUserId; therefore, the prior value of + * SecurityRestrictionContext also needs to be restored. + * + * (Note: it is not necessary to restore session authorization or role + * settings here because those can only be changed via GUC, and GUC will + * take care of rolling them back if need be.) + */ + SetUserIdAndSecContext(s->prevUser, s->prevSecContext); + + /* Forget about any active REINDEX. */ + ResetReindexState(s->nestingLevel); + + /* Reset logical streaming state. */ + ResetLogicalStreamingState(); + + /* Reset snapshot export state. */ + SnapBuildResetExportedSnapshotState(); + + /* If in parallel mode, clean up workers and exit parallel mode. */ + if (IsInParallelMode()) + { + AtEOXact_Parallel(false); + s->parallelModeLevel = 0; + } + + /* + * do abort processing + */ + AfterTriggerEndXact(false); /* 'false' means it's abort */ + AtAbort_Portals(); + smgrDoPendingSyncs(false, is_parallel_worker); + AtEOXact_LargeObject(false); + AtAbort_Notify(); + AtEOXact_RelationMap(false, is_parallel_worker); + AtAbort_Twophase(); + + /* + * Advertise the fact that we aborted in pg_xact (assuming that we got as + * far as assigning an XID to advertise). But if we're inside a parallel + * worker, skip this; the user backend must be the one to write the abort + * record. + */ + if (!is_parallel_worker) + latestXid = RecordTransactionAbort(false); + else + { + latestXid = InvalidTransactionId; + + /* + * Since the parallel leader won't get our value of XactLastRecEnd in + * this case, we nudge WAL-writer ourselves in this case. See related + * comments in RecordTransactionAbort for why this matters. + */ + XLogSetAsyncXactLSN(XactLastRecEnd); + } + + TRACE_POSTGRESQL_TRANSACTION_ABORT(MyProc->lxid); + + /* + * Let others know about no transaction in progress by me. Note that this + * must be done _before_ releasing locks we hold and _after_ + * RecordTransactionAbort. + */ + ProcArrayEndTransaction(MyProc, latestXid); + + /* + * Post-abort cleanup. See notes in CommitTransaction() concerning + * ordering. We can skip all of it if the transaction failed before + * creating a resource owner. + */ + if (TopTransactionResourceOwner != NULL) + { + if (is_parallel_worker) + CallXactCallbacks(XACT_EVENT_PARALLEL_ABORT); + else + CallXactCallbacks(XACT_EVENT_ABORT); + + ResourceOwnerRelease(TopTransactionResourceOwner, + RESOURCE_RELEASE_BEFORE_LOCKS, + false, true); + AtEOXact_Buffers(false); + AtEOXact_RelationCache(false); + AtEOXact_Inval(false); + AtEOXact_MultiXact(); + ResourceOwnerRelease(TopTransactionResourceOwner, + RESOURCE_RELEASE_LOCKS, + false, true); + ResourceOwnerRelease(TopTransactionResourceOwner, + RESOURCE_RELEASE_AFTER_LOCKS, + false, true); + smgrDoPendingDeletes(false); + + AtEOXact_GUC(false, 1); + AtEOXact_SPI(false); + AtEOXact_Enum(); + AtEOXact_on_commit_actions(false); + AtEOXact_Namespace(false, is_parallel_worker); + AtEOXact_SMgr(); + AtEOXact_Files(false); + AtEOXact_ComboCid(); + AtEOXact_HashTables(false); + AtEOXact_PgStat(false, is_parallel_worker); + AtEOXact_ApplyLauncher(false); + pgstat_report_xact_timestamp(0); + } + + /* + * State remains TRANS_ABORT until CleanupTransaction(). + */ + RESUME_INTERRUPTS(); +} + +/* + * CleanupTransaction + */ +static void +CleanupTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + /* + * State should still be TRANS_ABORT from AbortTransaction(). 
+ */ + if (s->state != TRANS_ABORT) + elog(FATAL, "CleanupTransaction: unexpected state %s", + TransStateAsString(s->state)); + + /* + * do abort cleanup processing + */ + AtCleanup_Portals(); /* now safe to release portal memory */ + AtEOXact_Snapshot(false, true); /* and release the transaction's snapshots */ + + CurrentResourceOwner = NULL; /* and resource owner */ + if (TopTransactionResourceOwner) + ResourceOwnerDelete(TopTransactionResourceOwner); + s->curTransactionOwner = NULL; + CurTransactionResourceOwner = NULL; + TopTransactionResourceOwner = NULL; + + AtCleanup_Memory(); /* and transaction memory */ + + s->fullTransactionId = InvalidFullTransactionId; + s->subTransactionId = InvalidSubTransactionId; + s->nestingLevel = 0; + s->gucNestLevel = 0; + s->childXids = NULL; + s->nChildXids = 0; + s->maxChildXids = 0; + s->parallelModeLevel = 0; + + XactTopFullTransactionId = InvalidFullTransactionId; + nParallelCurrentXids = 0; + + /* + * done with abort processing, set current transaction state back to + * default + */ + s->state = TRANS_DEFAULT; +} + +/* + * StartTransactionCommand + */ +void +StartTransactionCommand(void) +{ + TransactionState s = CurrentTransactionState; + + switch (s->blockState) + { + /* + * if we aren't in a transaction block, we just do our usual start + * transaction. + */ + case TBLOCK_DEFAULT: + StartTransaction(); + s->blockState = TBLOCK_STARTED; + break; + + /* + * We are somewhere in a transaction block or subtransaction and + * about to start a new command. For now we do nothing, but + * someday we may do command-local resource initialization. (Note + * that any needed CommandCounterIncrement was done by the + * previous CommitTransactionCommand.) + */ + case TBLOCK_INPROGRESS: + case TBLOCK_IMPLICIT_INPROGRESS: + case TBLOCK_SUBINPROGRESS: + break; + + /* + * Here we are in a failed transaction block (one of the commands + * caused an abort) so we do nothing but remain in the abort + * state. Eventually we will get a ROLLBACK command which will + * get us out of this state. (It is up to other code to ensure + * that no commands other than ROLLBACK will be processed in these + * states.) + */ + case TBLOCK_ABORT: + case TBLOCK_SUBABORT: + break; + + /* These cases are invalid. */ + case TBLOCK_STARTED: + case TBLOCK_BEGIN: + case TBLOCK_PARALLEL_INPROGRESS: + case TBLOCK_SUBBEGIN: + case TBLOCK_END: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + case TBLOCK_PREPARE: + elog(ERROR, "StartTransactionCommand: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + } + + /* + * We must switch to CurTransactionContext before returning. This is + * already done if we called StartTransaction, otherwise not. + */ + Assert(CurTransactionContext != NULL); + MemoryContextSwitchTo(CurTransactionContext); +} + + +/* + * Simple system for saving and restoring transaction characteristics + * (isolation level, read only, deferrable). We need this for transaction + * chaining, so that we can set the characteristics of the new transaction to + * be the same as the previous one. (We need something like this because the + * GUC system resets the characteristics at transaction end, so for example + * just skipping the reset in StartTransaction() won't work.) 
+ */ +static int save_XactIsoLevel; +static bool save_XactReadOnly; +static bool save_XactDeferrable; + +void +SaveTransactionCharacteristics(void) +{ + save_XactIsoLevel = XactIsoLevel; + save_XactReadOnly = XactReadOnly; + save_XactDeferrable = XactDeferrable; +} + +void +RestoreTransactionCharacteristics(void) +{ + XactIsoLevel = save_XactIsoLevel; + XactReadOnly = save_XactReadOnly; + XactDeferrable = save_XactDeferrable; +} + + +/* + * CommitTransactionCommand + */ +void +CommitTransactionCommand(void) +{ + TransactionState s = CurrentTransactionState; + + if (s->chain) + SaveTransactionCharacteristics(); + + switch (s->blockState) + { + /* + * These shouldn't happen. TBLOCK_DEFAULT means the previous + * StartTransactionCommand didn't set the STARTED state + * appropriately, while TBLOCK_PARALLEL_INPROGRESS should be ended + * by EndParallelWorkerTransaction(), not this function. + */ + case TBLOCK_DEFAULT: + case TBLOCK_PARALLEL_INPROGRESS: + elog(FATAL, "CommitTransactionCommand: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + + /* + * If we aren't in a transaction block, just do our usual + * transaction commit, and return to the idle state. + */ + case TBLOCK_STARTED: + CommitTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + + /* + * We are completing a "BEGIN TRANSACTION" command, so we change + * to the "transaction block in progress" state and return. (We + * assume the BEGIN did nothing to the database, so we need no + * CommandCounterIncrement.) + */ + case TBLOCK_BEGIN: + s->blockState = TBLOCK_INPROGRESS; + break; + + /* + * This is the case when we have finished executing a command + * someplace within a transaction block. We increment the command + * counter and return. + */ + case TBLOCK_INPROGRESS: + case TBLOCK_IMPLICIT_INPROGRESS: + case TBLOCK_SUBINPROGRESS: + CommandCounterIncrement(); + break; + + /* + * We are completing a "COMMIT" command. Do it and return to the + * idle state. + */ + case TBLOCK_END: + CommitTransaction(); + s->blockState = TBLOCK_DEFAULT; + if (s->chain) + { + StartTransaction(); + s->blockState = TBLOCK_INPROGRESS; + s->chain = false; + RestoreTransactionCharacteristics(); + } + break; + + /* + * Here we are in the middle of a transaction block but one of the + * commands caused an abort so we do nothing but remain in the + * abort state. Eventually we will get a ROLLBACK command. + */ + case TBLOCK_ABORT: + case TBLOCK_SUBABORT: + break; + + /* + * Here we were in an aborted transaction block and we just got + * the ROLLBACK command from the user, so clean up the + * already-aborted transaction and return to the idle state. + */ + case TBLOCK_ABORT_END: + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + if (s->chain) + { + StartTransaction(); + s->blockState = TBLOCK_INPROGRESS; + s->chain = false; + RestoreTransactionCharacteristics(); + } + break; + + /* + * Here we were in a perfectly good transaction block but the user + * told us to ROLLBACK anyway. We have to abort the transaction + * and then clean up. + */ + case TBLOCK_ABORT_PENDING: + AbortTransaction(); + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + if (s->chain) + { + StartTransaction(); + s->blockState = TBLOCK_INPROGRESS; + s->chain = false; + RestoreTransactionCharacteristics(); + } + break; + + /* + * We are completing a "PREPARE TRANSACTION" command. Do it and + * return to the idle state. 
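The save/restore pair above exists to support COMMIT AND CHAIN and ROLLBACK AND CHAIN: the TBLOCK_END and ABORT cases just shown save the characteristics before tearing the old transaction down and restore them into the freshly started one, because the GUC system would otherwise reset them at transaction end. Below is a simplified stand-alone model of that sequence; all variables and functions are invented stand-ins, not the real backend state.

/*
 * Simplified stand-alone model of COMMIT AND CHAIN characteristic
 * handling; stand-in globals only.
 */
#include <stdbool.h>
#include <stdio.h>

/* stand-ins for XactIsoLevel / XactReadOnly / XactDeferrable */
static int  xact_iso_level = 1;     /* e.g. REPEATABLE READ */
static bool xact_read_only = true;
static bool xact_deferrable = false;

static int  save_iso_level;
static bool save_read_only;
static bool save_deferrable;

static void
save_characteristics(void)
{
    save_iso_level = xact_iso_level;
    save_read_only = xact_read_only;
    save_deferrable = xact_deferrable;
}

static void
restore_characteristics(void)
{
    xact_iso_level = save_iso_level;
    xact_read_only = save_read_only;
    xact_deferrable = save_deferrable;
}

/* model of the end-of-transaction reset the GUC system performs */
static void
reset_to_session_defaults(void)
{
    xact_iso_level = 0;             /* e.g. READ COMMITTED */
    xact_read_only = false;
    xact_deferrable = false;
}

int
main(void)
{
    /* COMMIT AND CHAIN, as in the TBLOCK_END case above */
    save_characteristics();         /* before the commit tears things down */
    reset_to_session_defaults();    /* CommitTransaction + StartTransaction */
    restore_characteristics();      /* chain: carry old settings forward */

    printf("iso=%d read_only=%d deferrable=%d\n",
           xact_iso_level, (int) xact_read_only, (int) xact_deferrable);
    return 0;
}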
+ */ + case TBLOCK_PREPARE: + PrepareTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + + /* + * We were just issued a SAVEPOINT inside a transaction block. + * Start a subtransaction. (DefineSavepoint already did + * PushTransaction, so as to have someplace to put the SUBBEGIN + * state.) + */ + case TBLOCK_SUBBEGIN: + StartSubTransaction(); + s->blockState = TBLOCK_SUBINPROGRESS; + break; + + /* + * We were issued a RELEASE command, so we end the current + * subtransaction and return to the parent transaction. The parent + * might be ended too, so repeat till we find an INPROGRESS + * transaction or subtransaction. + */ + case TBLOCK_SUBRELEASE: + do + { + CommitSubTransaction(); + s = CurrentTransactionState; /* changed by pop */ + } while (s->blockState == TBLOCK_SUBRELEASE); + + Assert(s->blockState == TBLOCK_INPROGRESS || + s->blockState == TBLOCK_SUBINPROGRESS); + break; + + /* + * We were issued a COMMIT, so we end the current subtransaction + * hierarchy and perform final commit. We do this by rolling up + * any subtransactions into their parent, which leads to O(N^2) + * operations with respect to resource owners - this isn't that + * bad until we approach a thousands of savepoints but is + * necessary for correctness should after triggers create new + * resource owners. + */ + case TBLOCK_SUBCOMMIT: + do + { + CommitSubTransaction(); + s = CurrentTransactionState; /* changed by pop */ + } while (s->blockState == TBLOCK_SUBCOMMIT); + /* If we had a COMMIT command, finish off the main xact too */ + if (s->blockState == TBLOCK_END) + { + Assert(s->parent == NULL); + CommitTransaction(); + s->blockState = TBLOCK_DEFAULT; + if (s->chain) + { + StartTransaction(); + s->blockState = TBLOCK_INPROGRESS; + s->chain = false; + RestoreTransactionCharacteristics(); + } + } + else if (s->blockState == TBLOCK_PREPARE) + { + Assert(s->parent == NULL); + PrepareTransaction(); + s->blockState = TBLOCK_DEFAULT; + } + else + elog(ERROR, "CommitTransactionCommand: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + + /* + * The current already-failed subtransaction is ending due to a + * ROLLBACK or ROLLBACK TO command, so pop it and recursively + * examine the parent (which could be in any of several states). + */ + case TBLOCK_SUBABORT_END: + CleanupSubTransaction(); + CommitTransactionCommand(); + break; + + /* + * As above, but it's not dead yet, so abort first. + */ + case TBLOCK_SUBABORT_PENDING: + AbortSubTransaction(); + CleanupSubTransaction(); + CommitTransactionCommand(); + break; + + /* + * The current subtransaction is the target of a ROLLBACK TO + * command. Abort and pop it, then start a new subtransaction + * with the same name. + */ + case TBLOCK_SUBRESTART: + { + char *name; + int savepointLevel; + + /* save name and keep Cleanup from freeing it */ + name = s->name; + s->name = NULL; + savepointLevel = s->savepointLevel; + + AbortSubTransaction(); + CleanupSubTransaction(); + + DefineSavepoint(NULL); + s = CurrentTransactionState; /* changed by push */ + s->name = name; + s->savepointLevel = savepointLevel; + + /* This is the same as TBLOCK_SUBBEGIN case */ + AssertState(s->blockState == TBLOCK_SUBBEGIN); + StartSubTransaction(); + s->blockState = TBLOCK_SUBINPROGRESS; + } + break; + + /* + * Same as above, but the subtransaction had already failed, so we + * don't need AbortSubTransaction. 
+ */ + case TBLOCK_SUBABORT_RESTART: + { + char *name; + int savepointLevel; + + /* save name and keep Cleanup from freeing it */ + name = s->name; + s->name = NULL; + savepointLevel = s->savepointLevel; + + CleanupSubTransaction(); + + DefineSavepoint(NULL); + s = CurrentTransactionState; /* changed by push */ + s->name = name; + s->savepointLevel = savepointLevel; + + /* This is the same as TBLOCK_SUBBEGIN case */ + AssertState(s->blockState == TBLOCK_SUBBEGIN); + StartSubTransaction(); + s->blockState = TBLOCK_SUBINPROGRESS; + } + break; + } +} + +/* + * AbortCurrentTransaction + */ +void +AbortCurrentTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + switch (s->blockState) + { + case TBLOCK_DEFAULT: + if (s->state == TRANS_DEFAULT) + { + /* we are idle, so nothing to do */ + } + else + { + /* + * We can get here after an error during transaction start + * (state will be TRANS_START). Need to clean up the + * incompletely started transaction. First, adjust the + * low-level state to suppress warning message from + * AbortTransaction. + */ + if (s->state == TRANS_START) + s->state = TRANS_INPROGRESS; + AbortTransaction(); + CleanupTransaction(); + } + break; + + /* + * If we aren't in a transaction block, we just do the basic abort + * & cleanup transaction. For this purpose, we treat an implicit + * transaction block as if it were a simple statement. + */ + case TBLOCK_STARTED: + case TBLOCK_IMPLICIT_INPROGRESS: + AbortTransaction(); + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + + /* + * If we are in TBLOCK_BEGIN it means something screwed up right + * after reading "BEGIN TRANSACTION". We assume that the user + * will interpret the error as meaning the BEGIN failed to get him + * into a transaction block, so we should abort and return to idle + * state. + */ + case TBLOCK_BEGIN: + AbortTransaction(); + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + + /* + * We are somewhere in a transaction block and we've gotten a + * failure, so we abort the transaction and set up the persistent + * ABORT state. We will stay in ABORT until we get a ROLLBACK. + */ + case TBLOCK_INPROGRESS: + case TBLOCK_PARALLEL_INPROGRESS: + AbortTransaction(); + s->blockState = TBLOCK_ABORT; + /* CleanupTransaction happens when we exit TBLOCK_ABORT_END */ + break; + + /* + * Here, we failed while trying to COMMIT. Clean up the + * transaction and return to idle state (we do not want to stay in + * the transaction). + */ + case TBLOCK_END: + AbortTransaction(); + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + + /* + * Here, we are already in an aborted transaction state and are + * waiting for a ROLLBACK, but for some reason we failed again! So + * we just remain in the abort state. + */ + case TBLOCK_ABORT: + case TBLOCK_SUBABORT: + break; + + /* + * We are in a failed transaction and we got the ROLLBACK command. + * We have already aborted, we just need to cleanup and go to idle + * state. + */ + case TBLOCK_ABORT_END: + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + + /* + * We are in a live transaction and we got a ROLLBACK command. + * Abort, cleanup, go to idle state. + */ + case TBLOCK_ABORT_PENDING: + AbortTransaction(); + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + + /* + * Here, we failed while trying to PREPARE. Clean up the + * transaction and return to idle state (we do not want to stay in + * the transaction). 
+ */ + case TBLOCK_PREPARE: + AbortTransaction(); + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + + /* + * We got an error inside a subtransaction. Abort just the + * subtransaction, and go to the persistent SUBABORT state until + * we get ROLLBACK. + */ + case TBLOCK_SUBINPROGRESS: + AbortSubTransaction(); + s->blockState = TBLOCK_SUBABORT; + break; + + /* + * If we failed while trying to create a subtransaction, clean up + * the broken subtransaction and abort the parent. The same + * applies if we get a failure while ending a subtransaction. + */ + case TBLOCK_SUBBEGIN: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + AbortSubTransaction(); + CleanupSubTransaction(); + AbortCurrentTransaction(); + break; + + /* + * Same as above, except the Abort() was already done. + */ + case TBLOCK_SUBABORT_END: + case TBLOCK_SUBABORT_RESTART: + CleanupSubTransaction(); + AbortCurrentTransaction(); + break; + } +} + +/* + * PreventInTransactionBlock + * + * This routine is to be called by statements that must not run inside + * a transaction block, typically because they have non-rollback-able + * side effects or do internal commits. + * + * If we have already started a transaction block, issue an error; also issue + * an error if we appear to be running inside a user-defined function (which + * could issue more commands and possibly cause a failure after the statement + * completes). Subtransactions are verboten too. + * + * We must also set XACT_FLAGS_NEEDIMMEDIATECOMMIT in MyXactFlags, to ensure + * that postgres.c follows through by committing after the statement is done. + * + * isTopLevel: passed down from ProcessUtility to determine whether we are + * inside a function. (We will always fail if this is false, but it's + * convenient to centralize the check here instead of making callers do it.) + * stmtType: statement type name, for error messages. + */ +void +PreventInTransactionBlock(bool isTopLevel, const char *stmtType) +{ + /* + * xact block already started? + */ + if (IsTransactionBlock()) + ereport(ERROR, + (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s cannot run inside a transaction block", + stmtType))); + + /* + * subtransaction? + */ + if (IsSubTransaction()) + ereport(ERROR, + (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s cannot run inside a subtransaction", + stmtType))); + + /* + * inside a function call? + */ + if (!isTopLevel) + ereport(ERROR, + (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s cannot be executed from a function", stmtType))); + + /* If we got past IsTransactionBlock test, should be in default state */ + if (CurrentTransactionState->blockState != TBLOCK_DEFAULT && + CurrentTransactionState->blockState != TBLOCK_STARTED) + elog(FATAL, "cannot prevent transaction chain"); + + /* All okay. Set the flag to make sure the right thing happens later. */ + MyXactFlags |= XACT_FLAGS_NEEDIMMEDIATECOMMIT; +} + +/* + * WarnNoTransactionBlock + * RequireTransactionBlock + * + * These two functions allow for warnings or errors if a command is executed + * outside of a transaction block. This is useful for commands that have no + * effects that persist past transaction end (and so calling them outside a + * transaction block is presumably an error). DECLARE CURSOR is an example. 
+ * While top-level transaction control commands (BEGIN/COMMIT/ABORT) and SET + * that have no effect issue warnings, all other no-effect commands generate + * errors. + * + * If we appear to be running inside a user-defined function, we do not + * issue anything, since the function could issue more commands that make + * use of the current statement's results. Likewise subtransactions. + * Thus these are inverses for PreventInTransactionBlock. + * + * isTopLevel: passed down from ProcessUtility to determine whether we are + * inside a function. + * stmtType: statement type name, for warning or error messages. + */ +void +WarnNoTransactionBlock(bool isTopLevel, const char *stmtType) +{ + CheckTransactionBlock(isTopLevel, false, stmtType); +} + +void +RequireTransactionBlock(bool isTopLevel, const char *stmtType) +{ + CheckTransactionBlock(isTopLevel, true, stmtType); +} + +/* + * This is the implementation of the above two. + */ +static void +CheckTransactionBlock(bool isTopLevel, bool throwError, const char *stmtType) +{ + /* + * xact block already started? + */ + if (IsTransactionBlock()) + return; + + /* + * subtransaction? + */ + if (IsSubTransaction()) + return; + + /* + * inside a function call? + */ + if (!isTopLevel) + return; + + ereport(throwError ? ERROR : WARNING, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s can only be used in transaction blocks", + stmtType))); +} + +/* + * IsInTransactionBlock + * + * This routine is for statements that need to behave differently inside + * a transaction block than when running as single commands. ANALYZE is + * currently the only example. + * + * isTopLevel: passed down from ProcessUtility to determine whether we are + * inside a function. + */ +bool +IsInTransactionBlock(bool isTopLevel) +{ + /* + * Return true on same conditions that would make + * PreventInTransactionBlock error out + */ + if (IsTransactionBlock()) + return true; + + if (IsSubTransaction()) + return true; + + if (!isTopLevel) + return true; + + if (CurrentTransactionState->blockState != TBLOCK_DEFAULT && + CurrentTransactionState->blockState != TBLOCK_STARTED) + return true; + + /* + * If we tell the caller we're not in a transaction block, then inform + * postgres.c that it had better commit when the statement is done. + * Otherwise our report could be a lie. + */ + MyXactFlags |= XACT_FLAGS_NEEDIMMEDIATECOMMIT; + + return false; +} + + +/* + * Register or deregister callback functions for start- and end-of-xact + * operations. + * + * These functions are intended for use by dynamically loaded modules. + * For built-in modules we generally just hardwire the appropriate calls + * (mainly because it's easier to control the order that way, where needed). + * + * At transaction end, the callback occurs post-commit or post-abort, so the + * callback functions can only do noncritical cleanup. 
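+ *
+ * For illustration only, a sketch of how a loadable module might hook in
+ * (my_xact_callback and cleanup_my_module_state are hypothetical names;
+ * registration would normally happen in the module's _PG_init):
+ *
+ *     static void
+ *     my_xact_callback(XactEvent event, void *arg)
+ *     {
+ *         if (event == XACT_EVENT_COMMIT || event == XACT_EVENT_ABORT)
+ *             cleanup_my_module_state();
+ *     }
+ *
+ *     RegisterXactCallback(my_xact_callback, NULL);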
+ */ +void +RegisterXactCallback(XactCallback callback, void *arg) +{ + XactCallbackItem *item; + + item = (XactCallbackItem *) + MemoryContextAlloc(TopMemoryContext, sizeof(XactCallbackItem)); + item->callback = callback; + item->arg = arg; + item->next = Xact_callbacks; + Xact_callbacks = item; +} + +void +UnregisterXactCallback(XactCallback callback, void *arg) +{ + XactCallbackItem *item; + XactCallbackItem *prev; + + prev = NULL; + for (item = Xact_callbacks; item; prev = item, item = item->next) + { + if (item->callback == callback && item->arg == arg) + { + if (prev) + prev->next = item->next; + else + Xact_callbacks = item->next; + pfree(item); + break; + } + } +} + +static void +CallXactCallbacks(XactEvent event) +{ + XactCallbackItem *item; + + for (item = Xact_callbacks; item; item = item->next) + item->callback(event, item->arg); +} + + +/* + * Register or deregister callback functions for start- and end-of-subxact + * operations. + * + * Pretty much same as above, but for subtransaction events. + * + * At subtransaction end, the callback occurs post-subcommit or post-subabort, + * so the callback functions can only do noncritical cleanup. At + * subtransaction start, the callback is called when the subtransaction has + * finished initializing. + */ +void +RegisterSubXactCallback(SubXactCallback callback, void *arg) +{ + SubXactCallbackItem *item; + + item = (SubXactCallbackItem *) + MemoryContextAlloc(TopMemoryContext, sizeof(SubXactCallbackItem)); + item->callback = callback; + item->arg = arg; + item->next = SubXact_callbacks; + SubXact_callbacks = item; +} + +void +UnregisterSubXactCallback(SubXactCallback callback, void *arg) +{ + SubXactCallbackItem *item; + SubXactCallbackItem *prev; + + prev = NULL; + for (item = SubXact_callbacks; item; prev = item, item = item->next) + { + if (item->callback == callback && item->arg == arg) + { + if (prev) + prev->next = item->next; + else + SubXact_callbacks = item->next; + pfree(item); + break; + } + } +} + +static void +CallSubXactCallbacks(SubXactEvent event, + SubTransactionId mySubid, + SubTransactionId parentSubid) +{ + SubXactCallbackItem *item; + + for (item = SubXact_callbacks; item; item = item->next) + item->callback(event, mySubid, parentSubid, item->arg); +} + + +/* ---------------------------------------------------------------- + * transaction block support + * ---------------------------------------------------------------- + */ + +/* + * BeginTransactionBlock + * This executes a BEGIN command. + */ +void +BeginTransactionBlock(void) +{ + TransactionState s = CurrentTransactionState; + + switch (s->blockState) + { + /* + * We are not inside a transaction block, so allow one to begin. + */ + case TBLOCK_STARTED: + s->blockState = TBLOCK_BEGIN; + break; + + /* + * BEGIN converts an implicit transaction block to a regular one. + * (Note that we allow this even if we've already done some + * commands, which is a bit odd but matches historical practice.) + */ + case TBLOCK_IMPLICIT_INPROGRESS: + s->blockState = TBLOCK_BEGIN; + break; + + /* + * Already a transaction block in progress. + */ + case TBLOCK_INPROGRESS: + case TBLOCK_PARALLEL_INPROGRESS: + case TBLOCK_SUBINPROGRESS: + case TBLOCK_ABORT: + case TBLOCK_SUBABORT: + ereport(WARNING, + (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), + errmsg("there is already a transaction in progress"))); + break; + + /* These cases are invalid. 
*/ + case TBLOCK_DEFAULT: + case TBLOCK_BEGIN: + case TBLOCK_SUBBEGIN: + case TBLOCK_END: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + case TBLOCK_PREPARE: + elog(FATAL, "BeginTransactionBlock: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + } +} + +/* + * PrepareTransactionBlock + * This executes a PREPARE command. + * + * Since PREPARE may actually do a ROLLBACK, the result indicates what + * happened: true for PREPARE, false for ROLLBACK. + * + * Note that we don't actually do anything here except change blockState. + * The real work will be done in the upcoming PrepareTransaction(). + * We do it this way because it's not convenient to change memory context, + * resource owner, etc while executing inside a Portal. + */ +bool +PrepareTransactionBlock(const char *gid) +{ + TransactionState s; + bool result; + + /* Set up to commit the current transaction */ + result = EndTransactionBlock(false); + + /* If successful, change outer tblock state to PREPARE */ + if (result) + { + s = CurrentTransactionState; + + while (s->parent != NULL) + s = s->parent; + + if (s->blockState == TBLOCK_END) + { + /* Save GID where PrepareTransaction can find it again */ + prepareGID = MemoryContextStrdup(TopTransactionContext, gid); + + s->blockState = TBLOCK_PREPARE; + } + else + { + /* + * ignore case where we are not in a transaction; + * EndTransactionBlock already issued a warning. + */ + Assert(s->blockState == TBLOCK_STARTED || + s->blockState == TBLOCK_IMPLICIT_INPROGRESS); + /* Don't send back a PREPARE result tag... */ + result = false; + } + } + + return result; +} + +/* + * EndTransactionBlock + * This executes a COMMIT command. + * + * Since COMMIT may actually do a ROLLBACK, the result indicates what + * happened: true for COMMIT, false for ROLLBACK. + * + * Note that we don't actually do anything here except change blockState. + * The real work will be done in the upcoming CommitTransactionCommand(). + * We do it this way because it's not convenient to change memory context, + * resource owner, etc while executing inside a Portal. + */ +bool +EndTransactionBlock(bool chain) +{ + TransactionState s = CurrentTransactionState; + bool result = false; + + switch (s->blockState) + { + /* + * We are in a transaction block, so tell CommitTransactionCommand + * to COMMIT. + */ + case TBLOCK_INPROGRESS: + s->blockState = TBLOCK_END; + result = true; + break; + + /* + * We are in an implicit transaction block. If AND CHAIN was + * specified, error. Otherwise commit, but issue a warning + * because there was no explicit BEGIN before this. + */ + case TBLOCK_IMPLICIT_INPROGRESS: + if (chain) + ereport(ERROR, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s can only be used in transaction blocks", + "COMMIT AND CHAIN"))); + else + ereport(WARNING, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + errmsg("there is no transaction in progress"))); + s->blockState = TBLOCK_END; + result = true; + break; + + /* + * We are in a failed transaction block. Tell + * CommitTransactionCommand it's time to exit the block. + */ + case TBLOCK_ABORT: + s->blockState = TBLOCK_ABORT_END; + break; + + /* + * We are in a live subtransaction block. Set up to subcommit all + * open subtransactions and then commit the main transaction. 
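+ *
+ * For example (a sketch): with savepoints a and b still open, COMMIT marks
+ * both subtransactions TBLOCK_SUBCOMMIT and the top level TBLOCK_END;
+ * CommitTransactionCommand then commits them innermost-first before
+ * committing the main transaction.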
+ */ + case TBLOCK_SUBINPROGRESS: + while (s->parent != NULL) + { + if (s->blockState == TBLOCK_SUBINPROGRESS) + s->blockState = TBLOCK_SUBCOMMIT; + else + elog(FATAL, "EndTransactionBlock: unexpected state %s", + BlockStateAsString(s->blockState)); + s = s->parent; + } + if (s->blockState == TBLOCK_INPROGRESS) + s->blockState = TBLOCK_END; + else + elog(FATAL, "EndTransactionBlock: unexpected state %s", + BlockStateAsString(s->blockState)); + result = true; + break; + + /* + * Here we are inside an aborted subtransaction. Treat the COMMIT + * as ROLLBACK: set up to abort everything and exit the main + * transaction. + */ + case TBLOCK_SUBABORT: + while (s->parent != NULL) + { + if (s->blockState == TBLOCK_SUBINPROGRESS) + s->blockState = TBLOCK_SUBABORT_PENDING; + else if (s->blockState == TBLOCK_SUBABORT) + s->blockState = TBLOCK_SUBABORT_END; + else + elog(FATAL, "EndTransactionBlock: unexpected state %s", + BlockStateAsString(s->blockState)); + s = s->parent; + } + if (s->blockState == TBLOCK_INPROGRESS) + s->blockState = TBLOCK_ABORT_PENDING; + else if (s->blockState == TBLOCK_ABORT) + s->blockState = TBLOCK_ABORT_END; + else + elog(FATAL, "EndTransactionBlock: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + + /* + * The user issued COMMIT when not inside a transaction. For + * COMMIT without CHAIN, issue a WARNING, staying in + * TBLOCK_STARTED state. The upcoming call to + * CommitTransactionCommand() will then close the transaction and + * put us back into the default state. For COMMIT AND CHAIN, + * error. + */ + case TBLOCK_STARTED: + if (chain) + ereport(ERROR, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s can only be used in transaction blocks", + "COMMIT AND CHAIN"))); + else + ereport(WARNING, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + errmsg("there is no transaction in progress"))); + result = true; + break; + + /* + * The user issued a COMMIT that somehow ran inside a parallel + * worker. We can't cope with that. + */ + case TBLOCK_PARALLEL_INPROGRESS: + ereport(FATAL, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot commit during a parallel operation"))); + break; + + /* These cases are invalid. */ + case TBLOCK_DEFAULT: + case TBLOCK_BEGIN: + case TBLOCK_SUBBEGIN: + case TBLOCK_END: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + case TBLOCK_PREPARE: + elog(FATAL, "EndTransactionBlock: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + } + + Assert(s->blockState == TBLOCK_STARTED || + s->blockState == TBLOCK_END || + s->blockState == TBLOCK_ABORT_END || + s->blockState == TBLOCK_ABORT_PENDING); + + s->chain = chain; + + return result; +} + +/* + * UserAbortTransactionBlock + * This executes a ROLLBACK command. + * + * As above, we don't actually do anything here except change blockState. + */ +void +UserAbortTransactionBlock(bool chain) +{ + TransactionState s = CurrentTransactionState; + + switch (s->blockState) + { + /* + * We are inside a transaction block and we got a ROLLBACK command + * from the user, so tell CommitTransactionCommand to abort and + * exit the transaction block. + */ + case TBLOCK_INPROGRESS: + s->blockState = TBLOCK_ABORT_PENDING; + break; + + /* + * We are inside a failed transaction block and we got a ROLLBACK + * command from the user. 
Abort processing is already done, so + * CommitTransactionCommand just has to cleanup and go back to + * idle state. + */ + case TBLOCK_ABORT: + s->blockState = TBLOCK_ABORT_END; + break; + + /* + * We are inside a subtransaction. Mark everything up to top + * level as exitable. + */ + case TBLOCK_SUBINPROGRESS: + case TBLOCK_SUBABORT: + while (s->parent != NULL) + { + if (s->blockState == TBLOCK_SUBINPROGRESS) + s->blockState = TBLOCK_SUBABORT_PENDING; + else if (s->blockState == TBLOCK_SUBABORT) + s->blockState = TBLOCK_SUBABORT_END; + else + elog(FATAL, "UserAbortTransactionBlock: unexpected state %s", + BlockStateAsString(s->blockState)); + s = s->parent; + } + if (s->blockState == TBLOCK_INPROGRESS) + s->blockState = TBLOCK_ABORT_PENDING; + else if (s->blockState == TBLOCK_ABORT) + s->blockState = TBLOCK_ABORT_END; + else + elog(FATAL, "UserAbortTransactionBlock: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + + /* + * The user issued ABORT when not inside a transaction. For + * ROLLBACK without CHAIN, issue a WARNING and go to abort state. + * The upcoming call to CommitTransactionCommand() will then put + * us back into the default state. For ROLLBACK AND CHAIN, error. + * + * We do the same thing with ABORT inside an implicit transaction, + * although in this case we might be rolling back actual database + * state changes. (It's debatable whether we should issue a + * WARNING in this case, but we have done so historically.) + */ + case TBLOCK_STARTED: + case TBLOCK_IMPLICIT_INPROGRESS: + if (chain) + ereport(ERROR, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s can only be used in transaction blocks", + "ROLLBACK AND CHAIN"))); + else + ereport(WARNING, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + errmsg("there is no transaction in progress"))); + s->blockState = TBLOCK_ABORT_PENDING; + break; + + /* + * The user issued an ABORT that somehow ran inside a parallel + * worker. We can't cope with that. + */ + case TBLOCK_PARALLEL_INPROGRESS: + ereport(FATAL, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot abort during a parallel operation"))); + break; + + /* These cases are invalid. */ + case TBLOCK_DEFAULT: + case TBLOCK_BEGIN: + case TBLOCK_SUBBEGIN: + case TBLOCK_END: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + case TBLOCK_PREPARE: + elog(FATAL, "UserAbortTransactionBlock: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + } + + Assert(s->blockState == TBLOCK_ABORT_END || + s->blockState == TBLOCK_ABORT_PENDING); + + s->chain = chain; +} + +/* + * BeginImplicitTransactionBlock + * Start an implicit transaction block if we're not already in one. + * + * Unlike BeginTransactionBlock, this is called directly from the main loop + * in postgres.c, not within a Portal. So we can just change blockState + * without a lot of ceremony. We do not expect caller to do + * CommitTransactionCommand/StartTransactionCommand. + */ +void +BeginImplicitTransactionBlock(void) +{ + TransactionState s = CurrentTransactionState; + + /* + * If we are in STARTED state (that is, no transaction block is open), + * switch to IMPLICIT_INPROGRESS state, creating an implicit transaction + * block. 
+ * + * For caller convenience, we consider all other transaction states as + * legal here; otherwise the caller would need its own state check, which + * seems rather pointless. + */ + if (s->blockState == TBLOCK_STARTED) + s->blockState = TBLOCK_IMPLICIT_INPROGRESS; +} + +/* + * EndImplicitTransactionBlock + * End an implicit transaction block, if we're in one. + * + * Like EndTransactionBlock, we just make any needed blockState change here. + * The real work will be done in the upcoming CommitTransactionCommand(). + */ +void +EndImplicitTransactionBlock(void) +{ + TransactionState s = CurrentTransactionState; + + /* + * If we are in IMPLICIT_INPROGRESS state, switch back to STARTED state, + * allowing CommitTransactionCommand to commit whatever happened during + * the implicit transaction block as though it were a single statement. + * + * For caller convenience, we consider all other transaction states as + * legal here; otherwise the caller would need its own state check, which + * seems rather pointless. + */ + if (s->blockState == TBLOCK_IMPLICIT_INPROGRESS) + s->blockState = TBLOCK_STARTED; +} + +/* + * DefineSavepoint + * This executes a SAVEPOINT command. + */ +void +DefineSavepoint(const char *name) +{ + TransactionState s = CurrentTransactionState; + + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for new subtransactions after that + * point. (Note that this check will certainly error out if s->blockState + * is TBLOCK_PARALLEL_INPROGRESS, so we can treat that as an invalid case + * below.) + */ + if (IsInParallelMode()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot define savepoints during a parallel operation"))); + + switch (s->blockState) + { + case TBLOCK_INPROGRESS: + case TBLOCK_SUBINPROGRESS: + /* Normal subtransaction start */ + PushTransaction(); + s = CurrentTransactionState; /* changed by push */ + + /* + * Savepoint names, like the TransactionState block itself, live + * in TopTransactionContext. + */ + if (name) + s->name = MemoryContextStrdup(TopTransactionContext, name); + break; + + /* + * We disallow savepoint commands in implicit transaction blocks. + * There would be no great difficulty in allowing them so far as + * this module is concerned, but a savepoint seems inconsistent + * with exec_simple_query's behavior of abandoning the whole query + * string upon error. Also, the point of an implicit transaction + * block (as opposed to a regular one) is to automatically close + * after an error, so it's hard to see how a savepoint would fit + * into that. + * + * The error messages for this are phrased as if there were no + * active transaction block at all, which is historical but + * perhaps could be improved. + */ + case TBLOCK_IMPLICIT_INPROGRESS: + ereport(ERROR, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s can only be used in transaction blocks", + "SAVEPOINT"))); + break; + + /* These cases are invalid. 
*/ + case TBLOCK_DEFAULT: + case TBLOCK_STARTED: + case TBLOCK_BEGIN: + case TBLOCK_PARALLEL_INPROGRESS: + case TBLOCK_SUBBEGIN: + case TBLOCK_END: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_ABORT: + case TBLOCK_SUBABORT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + case TBLOCK_PREPARE: + elog(FATAL, "DefineSavepoint: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + } +} + +/* + * ReleaseSavepoint + * This executes a RELEASE command. + * + * As above, we don't actually do anything here except change blockState. + */ +void +ReleaseSavepoint(const char *name) +{ + TransactionState s = CurrentTransactionState; + TransactionState target, + xact; + + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for transaction state change after that + * point. (Note that this check will certainly error out if s->blockState + * is TBLOCK_PARALLEL_INPROGRESS, so we can treat that as an invalid case + * below.) + */ + if (IsInParallelMode()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot release savepoints during a parallel operation"))); + + switch (s->blockState) + { + /* + * We can't release a savepoint if there is no savepoint defined. + */ + case TBLOCK_INPROGRESS: + ereport(ERROR, + (errcode(ERRCODE_S_E_INVALID_SPECIFICATION), + errmsg("savepoint \"%s\" does not exist", name))); + break; + + case TBLOCK_IMPLICIT_INPROGRESS: + /* See comment about implicit transactions in DefineSavepoint */ + ereport(ERROR, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s can only be used in transaction blocks", + "RELEASE SAVEPOINT"))); + break; + + /* + * We are in a non-aborted subtransaction. This is the only valid + * case. + */ + case TBLOCK_SUBINPROGRESS: + break; + + /* These cases are invalid. */ + case TBLOCK_DEFAULT: + case TBLOCK_STARTED: + case TBLOCK_BEGIN: + case TBLOCK_PARALLEL_INPROGRESS: + case TBLOCK_SUBBEGIN: + case TBLOCK_END: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_ABORT: + case TBLOCK_SUBABORT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + case TBLOCK_PREPARE: + elog(FATAL, "ReleaseSavepoint: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + } + + for (target = s; PointerIsValid(target); target = target->parent) + { + if (PointerIsValid(target->name) && strcmp(target->name, name) == 0) + break; + } + + if (!PointerIsValid(target)) + ereport(ERROR, + (errcode(ERRCODE_S_E_INVALID_SPECIFICATION), + errmsg("savepoint \"%s\" does not exist", name))); + + /* disallow crossing savepoint level boundaries */ + if (target->savepointLevel != s->savepointLevel) + ereport(ERROR, + (errcode(ERRCODE_S_E_INVALID_SPECIFICATION), + errmsg("savepoint \"%s\" does not exist within current savepoint level", name))); + + /* + * Mark "commit pending" all subtransactions up to the target + * subtransaction. The actual commits will happen when control gets to + * CommitTransactionCommand. 
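+ *
+ * For example (a sketch): after SAVEPOINT a; SAVEPOINT b; issuing
+ * RELEASE SAVEPOINT a marks both b and a as TBLOCK_SUBRELEASE, and
+ * CommitTransactionCommand then commits the chain innermost-first.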
+ */ + xact = CurrentTransactionState; + for (;;) + { + Assert(xact->blockState == TBLOCK_SUBINPROGRESS); + xact->blockState = TBLOCK_SUBRELEASE; + if (xact == target) + break; + xact = xact->parent; + Assert(PointerIsValid(xact)); + } +} + +/* + * RollbackToSavepoint + * This executes a ROLLBACK TO command. + * + * As above, we don't actually do anything here except change blockState. + */ +void +RollbackToSavepoint(const char *name) +{ + TransactionState s = CurrentTransactionState; + TransactionState target, + xact; + + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for transaction state change after that + * point. (Note that this check will certainly error out if s->blockState + * is TBLOCK_PARALLEL_INPROGRESS, so we can treat that as an invalid case + * below.) + */ + if (IsInParallelMode()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot rollback to savepoints during a parallel operation"))); + + switch (s->blockState) + { + /* + * We can't rollback to a savepoint if there is no savepoint + * defined. + */ + case TBLOCK_INPROGRESS: + case TBLOCK_ABORT: + ereport(ERROR, + (errcode(ERRCODE_S_E_INVALID_SPECIFICATION), + errmsg("savepoint \"%s\" does not exist", name))); + break; + + case TBLOCK_IMPLICIT_INPROGRESS: + /* See comment about implicit transactions in DefineSavepoint */ + ereport(ERROR, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s can only be used in transaction blocks", + "ROLLBACK TO SAVEPOINT"))); + break; + + /* + * There is at least one savepoint, so proceed. + */ + case TBLOCK_SUBINPROGRESS: + case TBLOCK_SUBABORT: + break; + + /* These cases are invalid. */ + case TBLOCK_DEFAULT: + case TBLOCK_STARTED: + case TBLOCK_BEGIN: + case TBLOCK_PARALLEL_INPROGRESS: + case TBLOCK_SUBBEGIN: + case TBLOCK_END: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + case TBLOCK_PREPARE: + elog(FATAL, "RollbackToSavepoint: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + } + + for (target = s; PointerIsValid(target); target = target->parent) + { + if (PointerIsValid(target->name) && strcmp(target->name, name) == 0) + break; + } + + if (!PointerIsValid(target)) + ereport(ERROR, + (errcode(ERRCODE_S_E_INVALID_SPECIFICATION), + errmsg("savepoint \"%s\" does not exist", name))); + + /* disallow crossing savepoint level boundaries */ + if (target->savepointLevel != s->savepointLevel) + ereport(ERROR, + (errcode(ERRCODE_S_E_INVALID_SPECIFICATION), + errmsg("savepoint \"%s\" does not exist within current savepoint level", name))); + + /* + * Mark "abort pending" all subtransactions up to the target + * subtransaction. The actual aborts will happen when control gets to + * CommitTransactionCommand. 
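+ *
+ * For example (a sketch): after SAVEPOINT a; SAVEPOINT b; issuing
+ * ROLLBACK TO SAVEPOINT a marks b as TBLOCK_SUBABORT_PENDING (or
+ * TBLOCK_SUBABORT_END if it had already failed) and marks a itself as
+ * TBLOCK_SUBRESTART (or TBLOCK_SUBABORT_RESTART), so a is aborted and
+ * then re-established under the same name.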
+ */ + xact = CurrentTransactionState; + for (;;) + { + if (xact == target) + break; + if (xact->blockState == TBLOCK_SUBINPROGRESS) + xact->blockState = TBLOCK_SUBABORT_PENDING; + else if (xact->blockState == TBLOCK_SUBABORT) + xact->blockState = TBLOCK_SUBABORT_END; + else + elog(FATAL, "RollbackToSavepoint: unexpected state %s", + BlockStateAsString(xact->blockState)); + xact = xact->parent; + Assert(PointerIsValid(xact)); + } + + /* And mark the target as "restart pending" */ + if (xact->blockState == TBLOCK_SUBINPROGRESS) + xact->blockState = TBLOCK_SUBRESTART; + else if (xact->blockState == TBLOCK_SUBABORT) + xact->blockState = TBLOCK_SUBABORT_RESTART; + else + elog(FATAL, "RollbackToSavepoint: unexpected state %s", + BlockStateAsString(xact->blockState)); +} + +/* + * BeginInternalSubTransaction + * This is the same as DefineSavepoint except it allows TBLOCK_STARTED, + * TBLOCK_IMPLICIT_INPROGRESS, TBLOCK_END, and TBLOCK_PREPARE states, + * and therefore it can safely be used in functions that might be called + * when not inside a BEGIN block or when running deferred triggers at + * COMMIT/PREPARE time. Also, it automatically does + * CommitTransactionCommand/StartTransactionCommand instead of expecting + * the caller to do it. + */ +void +BeginInternalSubTransaction(const char *name) +{ + TransactionState s = CurrentTransactionState; + + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for new subtransactions after that + * point. We might be able to make an exception for the type of + * subtransaction established by this function, which is typically used in + * contexts where we're going to release or roll back the subtransaction + * before proceeding further, so that no enduring change to the + * transaction state occurs. For now, however, we prohibit this case along + * with all the others. + */ + if (IsInParallelMode()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot start subtransactions during a parallel operation"))); + + switch (s->blockState) + { + case TBLOCK_STARTED: + case TBLOCK_INPROGRESS: + case TBLOCK_IMPLICIT_INPROGRESS: + case TBLOCK_END: + case TBLOCK_PREPARE: + case TBLOCK_SUBINPROGRESS: + /* Normal subtransaction start */ + PushTransaction(); + s = CurrentTransactionState; /* changed by push */ + + /* + * Savepoint names, like the TransactionState block itself, live + * in TopTransactionContext. + */ + if (name) + s->name = MemoryContextStrdup(TopTransactionContext, name); + break; + + /* These cases are invalid. */ + case TBLOCK_DEFAULT: + case TBLOCK_BEGIN: + case TBLOCK_PARALLEL_INPROGRESS: + case TBLOCK_SUBBEGIN: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_ABORT: + case TBLOCK_SUBABORT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + elog(FATAL, "BeginInternalSubTransaction: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + } + + CommitTransactionCommand(); + StartTransactionCommand(); +} + +/* + * ReleaseCurrentSubTransaction + * + * RELEASE (ie, commit) the innermost subtransaction, regardless of its + * savepoint name (if any). + * NB: do NOT use CommitTransactionCommand/StartTransactionCommand with this. 
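+ *
+ * A typical caller pattern (a sketch only, with error handling and
+ * memory-context bookkeeping elided) pairs this with
+ * BeginInternalSubTransaction and RollbackAndReleaseCurrentSubTransaction:
+ *
+ *     BeginInternalSubTransaction(NULL);
+ *     PG_TRY();
+ *     {
+ *         ... do work ...
+ *         ReleaseCurrentSubTransaction();
+ *     }
+ *     PG_CATCH();
+ *     {
+ *         RollbackAndReleaseCurrentSubTransaction();
+ *         ... handle or re-throw the error ...
+ *     }
+ *     PG_END_TRY();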
+ */ +void +ReleaseCurrentSubTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for commit of subtransactions after that + * point. This should not happen anyway. Code calling this would + * typically have called BeginInternalSubTransaction() first, failing + * there. + */ + if (IsInParallelMode()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot commit subtransactions during a parallel operation"))); + + if (s->blockState != TBLOCK_SUBINPROGRESS) + elog(ERROR, "ReleaseCurrentSubTransaction: unexpected state %s", + BlockStateAsString(s->blockState)); + Assert(s->state == TRANS_INPROGRESS); + MemoryContextSwitchTo(CurTransactionContext); + CommitSubTransaction(); + s = CurrentTransactionState; /* changed by pop */ + Assert(s->state == TRANS_INPROGRESS); +} + +/* + * RollbackAndReleaseCurrentSubTransaction + * + * ROLLBACK and RELEASE (ie, abort) the innermost subtransaction, regardless + * of its savepoint name (if any). + * NB: do NOT use CommitTransactionCommand/StartTransactionCommand with this. + */ +void +RollbackAndReleaseCurrentSubTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + /* + * Unlike ReleaseCurrentSubTransaction(), this is nominally permitted + * during parallel operations. That's because we may be in the leader, + * recovering from an error thrown while we were in parallel mode. We + * won't reach here in a worker, because BeginInternalSubTransaction() + * will have failed. + */ + + switch (s->blockState) + { + /* Must be in a subtransaction */ + case TBLOCK_SUBINPROGRESS: + case TBLOCK_SUBABORT: + break; + + /* These cases are invalid. */ + case TBLOCK_DEFAULT: + case TBLOCK_STARTED: + case TBLOCK_BEGIN: + case TBLOCK_IMPLICIT_INPROGRESS: + case TBLOCK_PARALLEL_INPROGRESS: + case TBLOCK_SUBBEGIN: + case TBLOCK_INPROGRESS: + case TBLOCK_END: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_ABORT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + case TBLOCK_PREPARE: + elog(FATAL, "RollbackAndReleaseCurrentSubTransaction: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + } + + /* + * Abort the current subtransaction, if needed. + */ + if (s->blockState == TBLOCK_SUBINPROGRESS) + AbortSubTransaction(); + + /* And clean it up, too */ + CleanupSubTransaction(); + + s = CurrentTransactionState; /* changed by pop */ + AssertState(s->blockState == TBLOCK_SUBINPROGRESS || + s->blockState == TBLOCK_INPROGRESS || + s->blockState == TBLOCK_IMPLICIT_INPROGRESS || + s->blockState == TBLOCK_STARTED); +} + +/* + * AbortOutOfAnyTransaction + * + * This routine is provided for error recovery purposes. It aborts any + * active transaction or transaction block, leaving the system in a known + * idle state. + */ +void +AbortOutOfAnyTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + /* Ensure we're not running in a doomed memory context */ + AtAbort_Memory(); + + /* + * Get out of any transaction or nested transaction + */ + do + { + switch (s->blockState) + { + case TBLOCK_DEFAULT: + if (s->state == TRANS_DEFAULT) + { + /* Not in a transaction, do nothing */ + } + else + { + /* + * We can get here after an error during transaction start + * (state will be TRANS_START). 
Need to clean up the + * incompletely started transaction. First, adjust the + * low-level state to suppress warning message from + * AbortTransaction. + */ + if (s->state == TRANS_START) + s->state = TRANS_INPROGRESS; + AbortTransaction(); + CleanupTransaction(); + } + break; + case TBLOCK_STARTED: + case TBLOCK_BEGIN: + case TBLOCK_INPROGRESS: + case TBLOCK_IMPLICIT_INPROGRESS: + case TBLOCK_PARALLEL_INPROGRESS: + case TBLOCK_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_PREPARE: + /* In a transaction, so clean up */ + AbortTransaction(); + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + case TBLOCK_ABORT: + case TBLOCK_ABORT_END: + + /* + * AbortTransaction is already done, still need Cleanup. + * However, if we failed partway through running ROLLBACK, + * there will be an active portal running that command, which + * we need to shut down before doing CleanupTransaction. + */ + AtAbort_Portals(); + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + + /* + * In a subtransaction, so clean it up and abort parent too + */ + case TBLOCK_SUBBEGIN: + case TBLOCK_SUBINPROGRESS: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + AbortSubTransaction(); + CleanupSubTransaction(); + s = CurrentTransactionState; /* changed by pop */ + break; + + case TBLOCK_SUBABORT: + case TBLOCK_SUBABORT_END: + case TBLOCK_SUBABORT_RESTART: + /* As above, but AbortSubTransaction already done */ + if (s->curTransactionOwner) + { + /* As in TBLOCK_ABORT, might have a live portal to zap */ + AtSubAbort_Portals(s->subTransactionId, + s->parent->subTransactionId, + s->curTransactionOwner, + s->parent->curTransactionOwner); + } + CleanupSubTransaction(); + s = CurrentTransactionState; /* changed by pop */ + break; + } + } while (s->blockState != TBLOCK_DEFAULT); + + /* Should be out of all subxacts now */ + Assert(s->parent == NULL); + + /* If we didn't actually have anything to do, revert to TopMemoryContext */ + AtCleanup_Memory(); +} + +/* + * IsTransactionBlock --- are we within a transaction block? + */ +bool +IsTransactionBlock(void) +{ + TransactionState s = CurrentTransactionState; + + if (s->blockState == TBLOCK_DEFAULT || s->blockState == TBLOCK_STARTED) + return false; + + return true; +} + +/* + * IsTransactionOrTransactionBlock --- are we within either a transaction + * or a transaction block? (The backend is only really "idle" when this + * returns false.) + * + * This should match up with IsTransactionBlock and IsTransactionState. 
+ */ +bool +IsTransactionOrTransactionBlock(void) +{ + TransactionState s = CurrentTransactionState; + + if (s->blockState == TBLOCK_DEFAULT) + return false; + + return true; +} + +/* + * TransactionBlockStatusCode - return status code to send in ReadyForQuery + */ +char +TransactionBlockStatusCode(void) +{ + TransactionState s = CurrentTransactionState; + + switch (s->blockState) + { + case TBLOCK_DEFAULT: + case TBLOCK_STARTED: + return 'I'; /* idle --- not in transaction */ + case TBLOCK_BEGIN: + case TBLOCK_SUBBEGIN: + case TBLOCK_INPROGRESS: + case TBLOCK_IMPLICIT_INPROGRESS: + case TBLOCK_PARALLEL_INPROGRESS: + case TBLOCK_SUBINPROGRESS: + case TBLOCK_END: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_PREPARE: + return 'T'; /* in transaction */ + case TBLOCK_ABORT: + case TBLOCK_SUBABORT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + return 'E'; /* in failed transaction */ + } + + /* should never get here */ + elog(FATAL, "invalid transaction block state: %s", + BlockStateAsString(s->blockState)); + return 0; /* keep compiler quiet */ +} + +/* + * IsSubTransaction + */ +bool +IsSubTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + if (s->nestingLevel >= 2) + return true; + + return false; +} + +/* + * StartSubTransaction + * + * If you're wondering why this is separate from PushTransaction: it's because + * we can't conveniently do this stuff right inside DefineSavepoint. The + * SAVEPOINT utility command will be executed inside a Portal, and if we + * muck with CurrentMemoryContext or CurrentResourceOwner then exit from + * the Portal will undo those settings. So we make DefineSavepoint just + * push a dummy transaction block, and when control returns to the main + * idle loop, CommitTransactionCommand will be called, and we'll come here + * to finish starting the subtransaction. + */ +static void +StartSubTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + if (s->state != TRANS_DEFAULT) + elog(WARNING, "StartSubTransaction while in %s state", + TransStateAsString(s->state)); + + s->state = TRANS_START; + + /* + * Initialize subsystems for new subtransaction + * + * must initialize resource-management stuff first + */ + AtSubStart_Memory(); + AtSubStart_ResourceOwner(); + AfterTriggerBeginSubXact(); + + s->state = TRANS_INPROGRESS; + + /* + * Call start-of-subxact callbacks + */ + CallSubXactCallbacks(SUBXACT_EVENT_START_SUB, s->subTransactionId, + s->parent->subTransactionId); + + ShowTransactionState("StartSubTransaction"); +} + +/* + * CommitSubTransaction + * + * The caller has to make sure to always reassign CurrentTransactionState + * if it has a local pointer to it after calling this function. + */ +static void +CommitSubTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + ShowTransactionState("CommitSubTransaction"); + + if (s->state != TRANS_INPROGRESS) + elog(WARNING, "CommitSubTransaction while in %s state", + TransStateAsString(s->state)); + + /* Pre-commit processing goes here */ + + CallSubXactCallbacks(SUBXACT_EVENT_PRE_COMMIT_SUB, s->subTransactionId, + s->parent->subTransactionId); + + /* If in parallel mode, clean up workers and exit parallel mode. 
*/ + if (IsInParallelMode()) + { + AtEOSubXact_Parallel(true, s->subTransactionId); + s->parallelModeLevel = 0; + } + + /* Do the actual "commit", such as it is */ + s->state = TRANS_COMMIT; + + /* Must CCI to ensure commands of subtransaction are seen as done */ + CommandCounterIncrement(); + + /* + * Prior to 8.4 we marked subcommit in clog at this point. We now only + * perform that step, if required, as part of the atomic update of the + * whole transaction tree at top level commit or abort. + */ + + /* Post-commit cleanup */ + if (FullTransactionIdIsValid(s->fullTransactionId)) + AtSubCommit_childXids(); + AfterTriggerEndSubXact(true); + AtSubCommit_Portals(s->subTransactionId, + s->parent->subTransactionId, + s->parent->nestingLevel, + s->parent->curTransactionOwner); + AtEOSubXact_LargeObject(true, s->subTransactionId, + s->parent->subTransactionId); + AtSubCommit_Notify(); + + CallSubXactCallbacks(SUBXACT_EVENT_COMMIT_SUB, s->subTransactionId, + s->parent->subTransactionId); + + ResourceOwnerRelease(s->curTransactionOwner, + RESOURCE_RELEASE_BEFORE_LOCKS, + true, false); + AtEOSubXact_RelationCache(true, s->subTransactionId, + s->parent->subTransactionId); + AtEOSubXact_Inval(true); + AtSubCommit_smgr(); + + /* + * The only lock we actually release here is the subtransaction XID lock. + */ + CurrentResourceOwner = s->curTransactionOwner; + if (FullTransactionIdIsValid(s->fullTransactionId)) + XactLockTableDelete(XidFromFullTransactionId(s->fullTransactionId)); + + /* + * Other locks should get transferred to their parent resource owner. + */ + ResourceOwnerRelease(s->curTransactionOwner, + RESOURCE_RELEASE_LOCKS, + true, false); + ResourceOwnerRelease(s->curTransactionOwner, + RESOURCE_RELEASE_AFTER_LOCKS, + true, false); + + AtEOXact_GUC(true, s->gucNestLevel); + AtEOSubXact_SPI(true, s->subTransactionId); + AtEOSubXact_on_commit_actions(true, s->subTransactionId, + s->parent->subTransactionId); + AtEOSubXact_Namespace(true, s->subTransactionId, + s->parent->subTransactionId); + AtEOSubXact_Files(true, s->subTransactionId, + s->parent->subTransactionId); + AtEOSubXact_HashTables(true, s->nestingLevel); + AtEOSubXact_PgStat(true, s->nestingLevel); + AtSubCommit_Snapshot(s->nestingLevel); + + /* + * We need to restore the upper transaction's read-only state, in case the + * upper is read-write while the child is read-only; GUC will incorrectly + * think it should leave the child state in place. + */ + XactReadOnly = s->prevXactReadOnly; + + CurrentResourceOwner = s->parent->curTransactionOwner; + CurTransactionResourceOwner = s->parent->curTransactionOwner; + ResourceOwnerDelete(s->curTransactionOwner); + s->curTransactionOwner = NULL; + + AtSubCommit_Memory(); + + s->state = TRANS_DEFAULT; + + PopTransaction(); +} + +/* + * AbortSubTransaction + */ +static void +AbortSubTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + /* Prevent cancel/die interrupt while cleaning up */ + HOLD_INTERRUPTS(); + + /* Make sure we have a valid memory context and resource owner */ + AtSubAbort_Memory(); + AtSubAbort_ResourceOwner(); + + /* + * Release any LW locks we might be holding as quickly as possible. + * (Regular locks, however, must be held till we finish aborting.) + * Releasing LW locks is critical since we might try to grab them again + * while cleaning up! + * + * FIXME This may be incorrect --- Are there some locks we should keep? + * Buffer locks, for example? I don't think so but I'm not sure. 
+ */ + LWLockReleaseAll(); + + pgstat_report_wait_end(); + pgstat_progress_end_command(); + AbortBufferIO(); + UnlockBuffers(); + + /* Reset WAL record construction state */ + XLogResetInsertion(); + + /* Cancel condition variable sleep */ + ConditionVariableCancelSleep(); + + /* + * Also clean up any open wait for lock, since the lock manager will choke + * if we try to wait for another lock before doing this. + */ + LockErrorCleanup(); + + /* + * If any timeout events are still active, make sure the timeout interrupt + * is scheduled. This covers possible loss of a timeout interrupt due to + * longjmp'ing out of the SIGINT handler (see notes in handle_sig_alarm). + * We delay this till after LockErrorCleanup so that we don't uselessly + * reschedule lock or deadlock check timeouts. + */ + reschedule_timeouts(); + + /* + * Re-enable signals, in case we got here by longjmp'ing out of a signal + * handler. We do this fairly early in the sequence so that the timeout + * infrastructure will be functional if needed while aborting. + */ + PG_SETMASK(&UnBlockSig); + + /* + * check the current transaction state + */ + ShowTransactionState("AbortSubTransaction"); + + if (s->state != TRANS_INPROGRESS) + elog(WARNING, "AbortSubTransaction while in %s state", + TransStateAsString(s->state)); + + s->state = TRANS_ABORT; + + /* + * Reset user ID which might have been changed transiently. (See notes in + * AbortTransaction.) + */ + SetUserIdAndSecContext(s->prevUser, s->prevSecContext); + + /* Forget about any active REINDEX. */ + ResetReindexState(s->nestingLevel); + + /* Reset logical streaming state. */ + ResetLogicalStreamingState(); + + /* + * No need for SnapBuildResetExportedSnapshotState() here, snapshot + * exports are not supported in subtransactions. + */ + + /* Exit from parallel mode, if necessary. */ + if (IsInParallelMode()) + { + AtEOSubXact_Parallel(false, s->subTransactionId); + s->parallelModeLevel = 0; + } + + /* + * We can skip all this stuff if the subxact failed before creating a + * ResourceOwner... + */ + if (s->curTransactionOwner) + { + AfterTriggerEndSubXact(false); + AtSubAbort_Portals(s->subTransactionId, + s->parent->subTransactionId, + s->curTransactionOwner, + s->parent->curTransactionOwner); + AtEOSubXact_LargeObject(false, s->subTransactionId, + s->parent->subTransactionId); + AtSubAbort_Notify(); + + /* Advertise the fact that we aborted in pg_xact. 
*/ + (void) RecordTransactionAbort(true); + + /* Post-abort cleanup */ + if (FullTransactionIdIsValid(s->fullTransactionId)) + AtSubAbort_childXids(); + + CallSubXactCallbacks(SUBXACT_EVENT_ABORT_SUB, s->subTransactionId, + s->parent->subTransactionId); + + ResourceOwnerRelease(s->curTransactionOwner, + RESOURCE_RELEASE_BEFORE_LOCKS, + false, false); + AtEOSubXact_RelationCache(false, s->subTransactionId, + s->parent->subTransactionId); + AtEOSubXact_Inval(false); + ResourceOwnerRelease(s->curTransactionOwner, + RESOURCE_RELEASE_LOCKS, + false, false); + ResourceOwnerRelease(s->curTransactionOwner, + RESOURCE_RELEASE_AFTER_LOCKS, + false, false); + AtSubAbort_smgr(); + + AtEOXact_GUC(false, s->gucNestLevel); + AtEOSubXact_SPI(false, s->subTransactionId); + AtEOSubXact_on_commit_actions(false, s->subTransactionId, + s->parent->subTransactionId); + AtEOSubXact_Namespace(false, s->subTransactionId, + s->parent->subTransactionId); + AtEOSubXact_Files(false, s->subTransactionId, + s->parent->subTransactionId); + AtEOSubXact_HashTables(false, s->nestingLevel); + AtEOSubXact_PgStat(false, s->nestingLevel); + AtSubAbort_Snapshot(s->nestingLevel); + } + + /* + * Restore the upper transaction's read-only state, too. This should be + * redundant with GUC's cleanup but we may as well do it for consistency + * with the commit case. + */ + XactReadOnly = s->prevXactReadOnly; + + RESUME_INTERRUPTS(); +} + +/* + * CleanupSubTransaction + * + * The caller has to make sure to always reassign CurrentTransactionState + * if it has a local pointer to it after calling this function. + */ +static void +CleanupSubTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + ShowTransactionState("CleanupSubTransaction"); + + if (s->state != TRANS_ABORT) + elog(WARNING, "CleanupSubTransaction while in %s state", + TransStateAsString(s->state)); + + AtSubCleanup_Portals(s->subTransactionId); + + CurrentResourceOwner = s->parent->curTransactionOwner; + CurTransactionResourceOwner = s->parent->curTransactionOwner; + if (s->curTransactionOwner) + ResourceOwnerDelete(s->curTransactionOwner); + s->curTransactionOwner = NULL; + + AtSubCleanup_Memory(); + + s->state = TRANS_DEFAULT; + + PopTransaction(); +} + +/* + * PushTransaction + * Create transaction state stack entry for a subtransaction + * + * The caller has to make sure to always reassign CurrentTransactionState + * if it has a local pointer to it after calling this function. + */ +static void +PushTransaction(void) +{ + TransactionState p = CurrentTransactionState; + TransactionState s; + + /* + * We keep subtransaction state nodes in TopTransactionContext. + */ + s = (TransactionState) + MemoryContextAllocZero(TopTransactionContext, + sizeof(TransactionStateData)); + + /* + * Assign a subtransaction ID, watching out for counter wraparound. + */ + currentSubTransactionId += 1; + if (currentSubTransactionId == InvalidSubTransactionId) + { + currentSubTransactionId -= 1; + pfree(s); + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot have more than 2^32-1 subtransactions in a transaction"))); + } + + /* + * We can now stack a minimally valid subtransaction without fear of + * failure. 
+ */ + s->fullTransactionId = InvalidFullTransactionId; /* until assigned */ + s->subTransactionId = currentSubTransactionId; + s->parent = p; + s->nestingLevel = p->nestingLevel + 1; + s->gucNestLevel = NewGUCNestLevel(); + s->savepointLevel = p->savepointLevel; + s->state = TRANS_DEFAULT; + s->blockState = TBLOCK_SUBBEGIN; + GetUserIdAndSecContext(&s->prevUser, &s->prevSecContext); + s->prevXactReadOnly = XactReadOnly; + s->parallelModeLevel = 0; + s->assigned = false; + + CurrentTransactionState = s; + + /* + * AbortSubTransaction and CleanupSubTransaction have to be able to cope + * with the subtransaction from here on out; in particular they should not + * assume that it necessarily has a transaction context, resource owner, + * or XID. + */ +} + +/* + * PopTransaction + * Pop back to parent transaction state + * + * The caller has to make sure to always reassign CurrentTransactionState + * if it has a local pointer to it after calling this function. + */ +static void +PopTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + if (s->state != TRANS_DEFAULT) + elog(WARNING, "PopTransaction while in %s state", + TransStateAsString(s->state)); + + if (s->parent == NULL) + elog(FATAL, "PopTransaction with no parent"); + + CurrentTransactionState = s->parent; + + /* Let's just make sure CurTransactionContext is good */ + CurTransactionContext = s->parent->curTransactionContext; + MemoryContextSwitchTo(CurTransactionContext); + + /* Ditto for ResourceOwner links */ + CurTransactionResourceOwner = s->parent->curTransactionOwner; + CurrentResourceOwner = s->parent->curTransactionOwner; + + /* Free the old child structure */ + if (s->name) + pfree(s->name); + pfree(s); +} + +/* + * EstimateTransactionStateSpace + * Estimate the amount of space that will be needed by + * SerializeTransactionState. It would be OK to overestimate slightly, + * but it's simple for us to work out the precise value, so we do. + */ +Size +EstimateTransactionStateSpace(void) +{ + TransactionState s; + Size nxids = 0; + Size size = SerializedTransactionStateHeaderSize; + + for (s = CurrentTransactionState; s != NULL; s = s->parent) + { + if (FullTransactionIdIsValid(s->fullTransactionId)) + nxids = add_size(nxids, 1); + nxids = add_size(nxids, s->nChildXids); + } + + return add_size(size, mul_size(sizeof(TransactionId), nxids)); +} + +/* + * SerializeTransactionState + * Write out relevant details of our transaction state that will be + * needed by a parallel worker. + * + * We need to save and restore XactDeferrable, XactIsoLevel, and the XIDs + * associated with this transaction. These are serialized into a + * caller-supplied buffer big enough to hold the number of bytes reported by + * EstimateTransactionStateSpace(). We emit the XIDs in sorted order for the + * convenience of the receiving process. + */ +void +SerializeTransactionState(Size maxsize, char *start_address) +{ + TransactionState s; + Size nxids = 0; + Size i = 0; + TransactionId *workspace; + SerializedTransactionState *result; + + result = (SerializedTransactionState *) start_address; + + result->xactIsoLevel = XactIsoLevel; + result->xactDeferrable = XactDeferrable; + result->topFullTransactionId = XactTopFullTransactionId; + result->currentFullTransactionId = + CurrentTransactionState->fullTransactionId; + result->currentCommandId = currentCommandId; + + /* + * If we're running in a parallel worker and launching a parallel worker + * of our own, we can just pass along the information that was passed to + * us. 
+ */ + if (nParallelCurrentXids > 0) + { + result->nParallelCurrentXids = nParallelCurrentXids; + memcpy(&result->parallelCurrentXids[0], ParallelCurrentXids, + nParallelCurrentXids * sizeof(TransactionId)); + return; + } + + /* + * OK, we need to generate a sorted list of XIDs that our workers should + * view as current. First, figure out how many there are. + */ + for (s = CurrentTransactionState; s != NULL; s = s->parent) + { + if (FullTransactionIdIsValid(s->fullTransactionId)) + nxids = add_size(nxids, 1); + nxids = add_size(nxids, s->nChildXids); + } + Assert(SerializedTransactionStateHeaderSize + nxids * sizeof(TransactionId) + <= maxsize); + + /* Copy them to our scratch space. */ + workspace = palloc(nxids * sizeof(TransactionId)); + for (s = CurrentTransactionState; s != NULL; s = s->parent) + { + if (FullTransactionIdIsValid(s->fullTransactionId)) + workspace[i++] = XidFromFullTransactionId(s->fullTransactionId); + if (s->nChildXids > 0) + memcpy(&workspace[i], s->childXids, + s->nChildXids * sizeof(TransactionId)); + i += s->nChildXids; + } + Assert(i == nxids); + + /* Sort them. */ + qsort(workspace, nxids, sizeof(TransactionId), xidComparator); + + /* Copy data into output area. */ + result->nParallelCurrentXids = nxids; + memcpy(&result->parallelCurrentXids[0], workspace, + nxids * sizeof(TransactionId)); +} + +/* + * StartParallelWorkerTransaction + * Start a parallel worker transaction, restoring the relevant + * transaction state serialized by SerializeTransactionState. + */ +void +StartParallelWorkerTransaction(char *tstatespace) +{ + SerializedTransactionState *tstate; + + Assert(CurrentTransactionState->blockState == TBLOCK_DEFAULT); + StartTransaction(); + + tstate = (SerializedTransactionState *) tstatespace; + XactIsoLevel = tstate->xactIsoLevel; + XactDeferrable = tstate->xactDeferrable; + XactTopFullTransactionId = tstate->topFullTransactionId; + CurrentTransactionState->fullTransactionId = + tstate->currentFullTransactionId; + currentCommandId = tstate->currentCommandId; + nParallelCurrentXids = tstate->nParallelCurrentXids; + ParallelCurrentXids = &tstate->parallelCurrentXids[0]; + + CurrentTransactionState->blockState = TBLOCK_PARALLEL_INPROGRESS; +} + +/* + * EndParallelWorkerTransaction + * End a parallel worker transaction. + */ +void +EndParallelWorkerTransaction(void) +{ + Assert(CurrentTransactionState->blockState == TBLOCK_PARALLEL_INPROGRESS); + CommitTransaction(); + CurrentTransactionState->blockState = TBLOCK_DEFAULT; +} + +/* + * ShowTransactionState + * Debug support + */ +static void +ShowTransactionState(const char *str) +{ + /* skip work if message will definitely not be printed */ + if (message_level_is_interesting(DEBUG5)) + ShowTransactionStateRec(str, CurrentTransactionState); +} + +/* + * ShowTransactionStateRec + * Recursive subroutine for ShowTransactionState + */ +static void +ShowTransactionStateRec(const char *str, TransactionState s) +{ + StringInfoData buf; + + initStringInfo(&buf); + + if (s->nChildXids > 0) + { + int i; + + appendStringInfo(&buf, ", children: %u", s->childXids[0]); + for (i = 1; i < s->nChildXids; i++) + appendStringInfo(&buf, " %u", s->childXids[i]); + } + + if (s->parent) + ShowTransactionStateRec(str, s->parent); + + ereport(DEBUG5, + (errmsg_internal("%s(%d) name: %s; blockState: %s; state: %s, xid/subid/cid: %u/%u/%u%s%s", + str, s->nestingLevel, + PointerIsValid(s->name) ? 
s->name : "unnamed", + BlockStateAsString(s->blockState), + TransStateAsString(s->state), + (unsigned int) XidFromFullTransactionId(s->fullTransactionId), + (unsigned int) s->subTransactionId, + (unsigned int) currentCommandId, + currentCommandIdUsed ? " (used)" : "", + buf.data))); + + pfree(buf.data); +} + +/* + * BlockStateAsString + * Debug support + */ +static const char * +BlockStateAsString(TBlockState blockState) +{ + switch (blockState) + { + case TBLOCK_DEFAULT: + return "DEFAULT"; + case TBLOCK_STARTED: + return "STARTED"; + case TBLOCK_BEGIN: + return "BEGIN"; + case TBLOCK_INPROGRESS: + return "INPROGRESS"; + case TBLOCK_IMPLICIT_INPROGRESS: + return "IMPLICIT_INPROGRESS"; + case TBLOCK_PARALLEL_INPROGRESS: + return "PARALLEL_INPROGRESS"; + case TBLOCK_END: + return "END"; + case TBLOCK_ABORT: + return "ABORT"; + case TBLOCK_ABORT_END: + return "ABORT_END"; + case TBLOCK_ABORT_PENDING: + return "ABORT_PENDING"; + case TBLOCK_PREPARE: + return "PREPARE"; + case TBLOCK_SUBBEGIN: + return "SUBBEGIN"; + case TBLOCK_SUBINPROGRESS: + return "SUBINPROGRESS"; + case TBLOCK_SUBRELEASE: + return "SUBRELEASE"; + case TBLOCK_SUBCOMMIT: + return "SUBCOMMIT"; + case TBLOCK_SUBABORT: + return "SUBABORT"; + case TBLOCK_SUBABORT_END: + return "SUBABORT_END"; + case TBLOCK_SUBABORT_PENDING: + return "SUBABORT_PENDING"; + case TBLOCK_SUBRESTART: + return "SUBRESTART"; + case TBLOCK_SUBABORT_RESTART: + return "SUBABORT_RESTART"; + } + return "UNRECOGNIZED"; +} + +/* + * TransStateAsString + * Debug support + */ +static const char * +TransStateAsString(TransState state) +{ + switch (state) + { + case TRANS_DEFAULT: + return "DEFAULT"; + case TRANS_START: + return "START"; + case TRANS_INPROGRESS: + return "INPROGRESS"; + case TRANS_COMMIT: + return "COMMIT"; + case TRANS_ABORT: + return "ABORT"; + case TRANS_PREPARE: + return "PREPARE"; + } + return "UNRECOGNIZED"; +} + +/* + * xactGetCommittedChildren + * + * Gets the list of committed children of the current transaction. The return + * value is the number of child transactions. *ptr is set to point to an + * array of TransactionIds. The array is allocated in TopTransactionContext; + * the caller should *not* pfree() it (this is a change from pre-8.4 code!). + * If there are no subxacts, *ptr is set to NULL. + */ +int +xactGetCommittedChildren(TransactionId **ptr) +{ + TransactionState s = CurrentTransactionState; + + if (s->nChildXids == 0) + *ptr = NULL; + else + *ptr = s->childXids; + + return s->nChildXids; +} + +/* + * XLOG support routines + */ + + +/* + * Log the commit record for a plain or twophase transaction commit. + * + * A 2pc commit will be emitted when twophase_xid is valid, a plain one + * otherwise. 
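For orientation, a plain-commit caller invokes this roughly as sketched below; nrels, rels, nmsgs, invalMessages and relcacheInitFileInval are placeholders for whatever the caller has collected, not names taken from this file:

    XLogRecPtr      recptr;
    TransactionId  *children;
    int             nchildren = xactGetCommittedChildren(&children);

    /* the real caller runs this inside a critical section */
    recptr = XactLogCommitRecord(GetCurrentTimestamp(),
                                 nchildren, children,
                                 nrels, rels,
                                 nmsgs, invalMessages,
                                 relcacheInitFileInval,
                                 MyXactFlags,
                                 InvalidTransactionId,  /* plain commit, not 2PC */
                                 NULL);
    XLogFlush(recptr);          /* a synchronous commit waits for this flush */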
+ */ +XLogRecPtr +XactLogCommitRecord(TimestampTz commit_time, + int nsubxacts, TransactionId *subxacts, + int nrels, RelFileNode *rels, + int nmsgs, SharedInvalidationMessage *msgs, + bool relcacheInval, + int xactflags, TransactionId twophase_xid, + const char *twophase_gid) +{ + xl_xact_commit xlrec; + xl_xact_xinfo xl_xinfo; + xl_xact_dbinfo xl_dbinfo; + xl_xact_subxacts xl_subxacts; + xl_xact_relfilenodes xl_relfilenodes; + xl_xact_invals xl_invals; + xl_xact_twophase xl_twophase; + xl_xact_origin xl_origin; + uint8 info; + + Assert(CritSectionCount > 0); + + xl_xinfo.xinfo = 0; + + /* decide between a plain and 2pc commit */ + if (!TransactionIdIsValid(twophase_xid)) + info = XLOG_XACT_COMMIT; + else + info = XLOG_XACT_COMMIT_PREPARED; + + /* First figure out and collect all the information needed */ + + xlrec.xact_time = commit_time; + + if (relcacheInval) + xl_xinfo.xinfo |= XACT_COMPLETION_UPDATE_RELCACHE_FILE; + if (forceSyncCommit) + xl_xinfo.xinfo |= XACT_COMPLETION_FORCE_SYNC_COMMIT; + if ((xactflags & XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK)) + xl_xinfo.xinfo |= XACT_XINFO_HAS_AE_LOCKS; + + /* + * Check if the caller would like to ask standbys for immediate feedback + * once this commit is applied. + */ + if (synchronous_commit >= SYNCHRONOUS_COMMIT_REMOTE_APPLY) + xl_xinfo.xinfo |= XACT_COMPLETION_APPLY_FEEDBACK; + + /* + * Relcache invalidations requires information about the current database + * and so does logical decoding. + */ + if (nmsgs > 0 || XLogLogicalInfoActive()) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_DBINFO; + xl_dbinfo.dbId = MyDatabaseId; + xl_dbinfo.tsId = MyDatabaseTableSpace; + } + + if (nsubxacts > 0) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_SUBXACTS; + xl_subxacts.nsubxacts = nsubxacts; + } + + if (nrels > 0) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_RELFILENODES; + xl_relfilenodes.nrels = nrels; + info |= XLR_SPECIAL_REL_UPDATE; + } + + if (nmsgs > 0) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_INVALS; + xl_invals.nmsgs = nmsgs; + } + + if (TransactionIdIsValid(twophase_xid)) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_TWOPHASE; + xl_twophase.xid = twophase_xid; + Assert(twophase_gid != NULL); + + if (XLogLogicalInfoActive()) + xl_xinfo.xinfo |= XACT_XINFO_HAS_GID; + } + + /* dump transaction origin information */ + if (replorigin_session_origin != InvalidRepOriginId) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_ORIGIN; + + xl_origin.origin_lsn = replorigin_session_origin_lsn; + xl_origin.origin_timestamp = replorigin_session_origin_timestamp; + } + + if (xl_xinfo.xinfo != 0) + info |= XLOG_XACT_HAS_INFO; + + /* Then include all the collected data into the commit record. 
*/ + + XLogBeginInsert(); + + XLogRegisterData((char *) (&xlrec), sizeof(xl_xact_commit)); + + if (xl_xinfo.xinfo != 0) + XLogRegisterData((char *) (&xl_xinfo.xinfo), sizeof(xl_xinfo.xinfo)); + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_DBINFO) + XLogRegisterData((char *) (&xl_dbinfo), sizeof(xl_dbinfo)); + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_SUBXACTS) + { + XLogRegisterData((char *) (&xl_subxacts), + MinSizeOfXactSubxacts); + XLogRegisterData((char *) subxacts, + nsubxacts * sizeof(TransactionId)); + } + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_RELFILENODES) + { + XLogRegisterData((char *) (&xl_relfilenodes), + MinSizeOfXactRelfilenodes); + XLogRegisterData((char *) rels, + nrels * sizeof(RelFileNode)); + } + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_INVALS) + { + XLogRegisterData((char *) (&xl_invals), MinSizeOfXactInvals); + XLogRegisterData((char *) msgs, + nmsgs * sizeof(SharedInvalidationMessage)); + } + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_TWOPHASE) + { + XLogRegisterData((char *) (&xl_twophase), sizeof(xl_xact_twophase)); + if (xl_xinfo.xinfo & XACT_XINFO_HAS_GID) + XLogRegisterData(unconstify(char *, twophase_gid), strlen(twophase_gid) + 1); + } + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_ORIGIN) + XLogRegisterData((char *) (&xl_origin), sizeof(xl_xact_origin)); + + /* we allow filtering by xacts */ + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); + + return XLogInsert(RM_XACT_ID, info); +} + +/* + * Log the commit record for a plain or twophase transaction abort. + * + * A 2pc abort will be emitted when twophase_xid is valid, a plain one + * otherwise. + */ +XLogRecPtr +XactLogAbortRecord(TimestampTz abort_time, + int nsubxacts, TransactionId *subxacts, + int nrels, RelFileNode *rels, + int xactflags, TransactionId twophase_xid, + const char *twophase_gid) +{ + xl_xact_abort xlrec; + xl_xact_xinfo xl_xinfo; + xl_xact_subxacts xl_subxacts; + xl_xact_relfilenodes xl_relfilenodes; + xl_xact_twophase xl_twophase; + xl_xact_dbinfo xl_dbinfo; + xl_xact_origin xl_origin; + + uint8 info; + + Assert(CritSectionCount > 0); + + xl_xinfo.xinfo = 0; + + /* decide between a plain and 2pc abort */ + if (!TransactionIdIsValid(twophase_xid)) + info = XLOG_XACT_ABORT; + else + info = XLOG_XACT_ABORT_PREPARED; + + + /* First figure out and collect all the information needed */ + + xlrec.xact_time = abort_time; + + if ((xactflags & XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK)) + xl_xinfo.xinfo |= XACT_XINFO_HAS_AE_LOCKS; + + if (nsubxacts > 0) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_SUBXACTS; + xl_subxacts.nsubxacts = nsubxacts; + } + + if (nrels > 0) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_RELFILENODES; + xl_relfilenodes.nrels = nrels; + info |= XLR_SPECIAL_REL_UPDATE; + } + + if (TransactionIdIsValid(twophase_xid)) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_TWOPHASE; + xl_twophase.xid = twophase_xid; + Assert(twophase_gid != NULL); + + if (XLogLogicalInfoActive()) + xl_xinfo.xinfo |= XACT_XINFO_HAS_GID; + } + + if (TransactionIdIsValid(twophase_xid) && XLogLogicalInfoActive()) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_DBINFO; + xl_dbinfo.dbId = MyDatabaseId; + xl_dbinfo.tsId = MyDatabaseTableSpace; + } + + /* + * Dump transaction origin information only for abort prepared. We need + * this during recovery to update the replication origin progress. 
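The registration sequence used by XactLogCommitRecord() above, and repeated just below for the abort record, is the general shape of WAL record assembly. A stripped-down sketch, where RM_FOO_ID, myrec, items and info are placeholders rather than a real resource manager:

    XLogRecPtr  recptr;

    XLogBeginInsert();
    XLogRegisterData((char *) &myrec, sizeof(myrec));       /* fixed-size struct first */
    if (nitems > 0)
        XLogRegisterData((char *) items, nitems * sizeof(items[0]));   /* variable-length tail */
    recptr = XLogInsert(RM_FOO_ID, info);   /* returns end-of-record LSN */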
+ */ + if ((replorigin_session_origin != InvalidRepOriginId) && + TransactionIdIsValid(twophase_xid)) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_ORIGIN; + + xl_origin.origin_lsn = replorigin_session_origin_lsn; + xl_origin.origin_timestamp = replorigin_session_origin_timestamp; + } + + if (xl_xinfo.xinfo != 0) + info |= XLOG_XACT_HAS_INFO; + + /* Then include all the collected data into the abort record. */ + + XLogBeginInsert(); + + XLogRegisterData((char *) (&xlrec), MinSizeOfXactAbort); + + if (xl_xinfo.xinfo != 0) + XLogRegisterData((char *) (&xl_xinfo), sizeof(xl_xinfo)); + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_DBINFO) + XLogRegisterData((char *) (&xl_dbinfo), sizeof(xl_dbinfo)); + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_SUBXACTS) + { + XLogRegisterData((char *) (&xl_subxacts), + MinSizeOfXactSubxacts); + XLogRegisterData((char *) subxacts, + nsubxacts * sizeof(TransactionId)); + } + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_RELFILENODES) + { + XLogRegisterData((char *) (&xl_relfilenodes), + MinSizeOfXactRelfilenodes); + XLogRegisterData((char *) rels, + nrels * sizeof(RelFileNode)); + } + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_TWOPHASE) + { + XLogRegisterData((char *) (&xl_twophase), sizeof(xl_xact_twophase)); + if (xl_xinfo.xinfo & XACT_XINFO_HAS_GID) + XLogRegisterData(unconstify(char *, twophase_gid), strlen(twophase_gid) + 1); + } + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_ORIGIN) + XLogRegisterData((char *) (&xl_origin), sizeof(xl_xact_origin)); + + if (TransactionIdIsValid(twophase_xid)) + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); + + return XLogInsert(RM_XACT_ID, info); +} + +/* + * Before 9.0 this was a fairly short function, but now it performs many + * actions for which the order of execution is critical. + */ +static void +xact_redo_commit(xl_xact_parsed_commit *parsed, + TransactionId xid, + XLogRecPtr lsn, + RepOriginId origin_id) +{ + TransactionId max_xid; + TimestampTz commit_time; + + Assert(TransactionIdIsValid(xid)); + + max_xid = TransactionIdLatest(xid, parsed->nsubxacts, parsed->subxacts); + + /* Make sure nextXid is beyond any XID mentioned in the record. */ + AdvanceNextFullTransactionIdPastXid(max_xid); + + Assert(((parsed->xinfo & XACT_XINFO_HAS_ORIGIN) == 0) == + (origin_id == InvalidRepOriginId)); + + if (parsed->xinfo & XACT_XINFO_HAS_ORIGIN) + commit_time = parsed->origin_timestamp; + else + commit_time = parsed->xact_time; + + /* Set the transaction commit timestamp and metadata */ + TransactionTreeSetCommitTsData(xid, parsed->nsubxacts, parsed->subxacts, + commit_time, origin_id); + + if (standbyState == STANDBY_DISABLED) + { + /* + * Mark the transaction committed in pg_xact. + */ + TransactionIdCommitTree(xid, parsed->nsubxacts, parsed->subxacts); + } + else + { + /* + * If a transaction completion record arrives that has as-yet + * unobserved subtransactions then this will not have been fully + * handled by the call to RecordKnownAssignedTransactionIds() in the + * main recovery loop in xlog.c. So we need to do bookkeeping again to + * cover that case. This is confusing and it is easy to think this + * call is irrelevant, which has happened three times in development + * already. Leave it in. + */ + RecordKnownAssignedTransactionIds(max_xid); + + /* + * Mark the transaction committed in pg_xact. We use async commit + * protocol during recovery to provide information on database + * consistency for when users try to set hint bits. It is important + * that we do not set hint bits until the minRecoveryPoint is past + * this commit record. 
This ensures that if we crash we don't see hint + * bits set on changes made by transactions that haven't yet + * recovered. It's unlikely but it's good to be safe. + */ + TransactionIdAsyncCommitTree(xid, parsed->nsubxacts, parsed->subxacts, lsn); + + /* + * We must mark clog before we update the ProcArray. + */ + ExpireTreeKnownAssignedTransactionIds(xid, parsed->nsubxacts, parsed->subxacts, max_xid); + + /* + * Send any cache invalidations attached to the commit. We must + * maintain the same order of invalidation then release locks as + * occurs in CommitTransaction(). + */ + ProcessCommittedInvalidationMessages(parsed->msgs, parsed->nmsgs, + XactCompletionRelcacheInitFileInval(parsed->xinfo), + parsed->dbId, parsed->tsId); + + /* + * Release locks, if any. We do this for both two phase and normal one + * phase transactions. In effect we are ignoring the prepare phase and + * just going straight to lock release. + */ + if (parsed->xinfo & XACT_XINFO_HAS_AE_LOCKS) + StandbyReleaseLockTree(xid, parsed->nsubxacts, parsed->subxacts); + } + + if (parsed->xinfo & XACT_XINFO_HAS_ORIGIN) + { + /* recover apply progress */ + replorigin_advance(origin_id, parsed->origin_lsn, lsn, + false /* backward */ , false /* WAL */ ); + } + + /* Make sure files supposed to be dropped are dropped */ + if (parsed->nrels > 0) + { + /* + * First update minimum recovery point to cover this WAL record. Once + * a relation is deleted, there's no going back. The buffer manager + * enforces the WAL-first rule for normal updates to relation files, + * so that the minimum recovery point is always updated before the + * corresponding change in the data file is flushed to disk, but we + * have to do the same here since we're bypassing the buffer manager. + * + * Doing this before deleting the files means that if a deletion fails + * for some reason, you cannot start up the system even after restart, + * until you fix the underlying situation so that the deletion will + * succeed. Alternatively, we could update the minimum recovery point + * after deletion, but that would leave a small window where the + * WAL-first rule would be violated. + */ + XLogFlush(lsn); + + /* Make sure files supposed to be dropped are dropped */ + DropRelationFiles(parsed->xnodes, parsed->nrels, true); + } + + /* + * We issue an XLogFlush() for the same reason we emit ForceSyncCommit() + * in normal operation. For example, in CREATE DATABASE, we copy all files + * from the template database, and then commit the transaction. If we + * crash after all the files have been copied but before the commit, you + * have files in the data directory without an entry in pg_database. To + * minimize the window for that, we use ForceSyncCommit() to rush the + * commit record to disk as quick as possible. We have the same window + * during recovery, and forcing an XLogFlush() (which updates + * minRecoveryPoint during recovery) helps to reduce that problem window, + * for any user that requested ForceSyncCommit(). + */ + if (XactCompletionForceSyncCommit(parsed->xinfo)) + XLogFlush(lsn); + + /* + * If asked by the primary (because someone is waiting for a synchronous + * commit = remote_apply), we will need to ask walreceiver to send a reply + * immediately. + */ + if (XactCompletionApplyFeedback(parsed->xinfo)) + XLogRequestWalReceiverReply(); +} + +/* + * Be careful with the order of execution, as with xact_redo_commit(). + * The two functions are similar but differ in key places. 
+ * + * Note also that an abort can be for a subtransaction and its children, + * not just for a top level abort. That means we have to consider + * topxid != xid, whereas in commit we would find topxid == xid always + * because subtransaction commit is never WAL logged. + */ +static void +xact_redo_abort(xl_xact_parsed_abort *parsed, TransactionId xid, + XLogRecPtr lsn, RepOriginId origin_id) +{ + TransactionId max_xid; + + Assert(TransactionIdIsValid(xid)); + + /* Make sure nextXid is beyond any XID mentioned in the record. */ + max_xid = TransactionIdLatest(xid, + parsed->nsubxacts, + parsed->subxacts); + AdvanceNextFullTransactionIdPastXid(max_xid); + + if (standbyState == STANDBY_DISABLED) + { + /* Mark the transaction aborted in pg_xact, no need for async stuff */ + TransactionIdAbortTree(xid, parsed->nsubxacts, parsed->subxacts); + } + else + { + /* + * If a transaction completion record arrives that has as-yet + * unobserved subtransactions then this will not have been fully + * handled by the call to RecordKnownAssignedTransactionIds() in the + * main recovery loop in xlog.c. So we need to do bookkeeping again to + * cover that case. This is confusing and it is easy to think this + * call is irrelevant, which has happened three times in development + * already. Leave it in. + */ + RecordKnownAssignedTransactionIds(max_xid); + + /* Mark the transaction aborted in pg_xact, no need for async stuff */ + TransactionIdAbortTree(xid, parsed->nsubxacts, parsed->subxacts); + + /* + * We must update the ProcArray after we have marked clog. + */ + ExpireTreeKnownAssignedTransactionIds(xid, parsed->nsubxacts, parsed->subxacts, max_xid); + + /* + * There are no invalidation messages to send or undo. + */ + + /* + * Release locks, if any. There are no invalidations to send. + */ + if (parsed->xinfo & XACT_XINFO_HAS_AE_LOCKS) + StandbyReleaseLockTree(xid, parsed->nsubxacts, parsed->subxacts); + } + + if (parsed->xinfo & XACT_XINFO_HAS_ORIGIN) + { + /* recover apply progress */ + replorigin_advance(origin_id, parsed->origin_lsn, lsn, + false /* backward */ , false /* WAL */ ); + } + + /* Make sure files supposed to be dropped are dropped */ + if (parsed->nrels > 0) + { + /* + * See comments about update of minimum recovery point on truncation, + * in xact_redo_commit(). + */ + XLogFlush(lsn); + + DropRelationFiles(parsed->xnodes, parsed->nrels, true); + } +} + +void +xact_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK; + + /* Backup blocks are not used in xact records */ + Assert(!XLogRecHasAnyBlockRefs(record)); + + if (info == XLOG_XACT_COMMIT) + { + xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record); + xl_xact_parsed_commit parsed; + + ParseCommitRecord(XLogRecGetInfo(record), xlrec, &parsed); + xact_redo_commit(&parsed, XLogRecGetXid(record), + record->EndRecPtr, XLogRecGetOrigin(record)); + } + else if (info == XLOG_XACT_COMMIT_PREPARED) + { + xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record); + xl_xact_parsed_commit parsed; + + ParseCommitRecord(XLogRecGetInfo(record), xlrec, &parsed); + xact_redo_commit(&parsed, parsed.twophase_xid, + record->EndRecPtr, XLogRecGetOrigin(record)); + + /* Delete TwoPhaseState gxact entry and/or 2PC file. 
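For context, xact_redo() is not called directly; the startup process's main redo loop dispatches each record to its resource manager, roughly as in this simplified sketch (xlogreader stands for the XLogReaderState being replayed; timeline handling and error context are omitted):

    /* dispatch step in the startup process's main redo loop (simplified) */
    RmgrTable[XLogRecGetRmid(xlogreader)].rm_redo(xlogreader);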
*/ + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + PrepareRedoRemove(parsed.twophase_xid, false); + LWLockRelease(TwoPhaseStateLock); + } + else if (info == XLOG_XACT_ABORT) + { + xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record); + xl_xact_parsed_abort parsed; + + ParseAbortRecord(XLogRecGetInfo(record), xlrec, &parsed); + xact_redo_abort(&parsed, XLogRecGetXid(record), + record->EndRecPtr, XLogRecGetOrigin(record)); + } + else if (info == XLOG_XACT_ABORT_PREPARED) + { + xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record); + xl_xact_parsed_abort parsed; + + ParseAbortRecord(XLogRecGetInfo(record), xlrec, &parsed); + xact_redo_abort(&parsed, parsed.twophase_xid, + record->EndRecPtr, XLogRecGetOrigin(record)); + + /* Delete TwoPhaseState gxact entry and/or 2PC file. */ + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + PrepareRedoRemove(parsed.twophase_xid, false); + LWLockRelease(TwoPhaseStateLock); + } + else if (info == XLOG_XACT_PREPARE) + { + /* + * Store xid and start/end pointers of the WAL record in TwoPhaseState + * gxact entry. + */ + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + PrepareRedoAdd(XLogRecGetData(record), + record->ReadRecPtr, + record->EndRecPtr, + XLogRecGetOrigin(record)); + LWLockRelease(TwoPhaseStateLock); + } + else if (info == XLOG_XACT_ASSIGNMENT) + { + xl_xact_assignment *xlrec = (xl_xact_assignment *) XLogRecGetData(record); + + if (standbyState >= STANDBY_INITIALIZED) + ProcArrayApplyXidAssignment(xlrec->xtop, + xlrec->nsubxacts, xlrec->xsub); + } + else if (info == XLOG_XACT_INVALIDATIONS) + { + /* + * XXX we do ignore this for now, what matters are invalidations + * written into the commit record. + */ + } + else + elog(PANIC, "xact_redo: unknown op code %u", info); +} + +/* + * IsSubTransactionAssignmentPending + * + * This is used to decide whether we need to WAL log the top-level XID for + * operation in a subtransaction. We require that for logical decoding, see + * LogicalDecodingProcessRecord. + * + * This returns true if wal_level >= logical and we are inside a valid + * subtransaction, for which the assignment was not yet written to any WAL + * record. + */ +bool +IsSubTransactionAssignmentPending(void) +{ + /* wal_level has to be logical */ + if (!XLogLogicalInfoActive()) + return false; + + /* we need to be in a transaction state */ + if (!IsTransactionState()) + return false; + + /* it has to be a subtransaction */ + if (!IsSubTransaction()) + return false; + + /* the subtransaction has to have a XID assigned */ + if (!TransactionIdIsValid(GetCurrentTransactionIdIfAny())) + return false; + + /* and it should not be already 'assigned' */ + return !CurrentTransactionState->assigned; +} + +/* + * MarkSubTransactionAssigned + * + * Mark the subtransaction assignment as completed. 
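A sketch of how the pending/assigned pair is meant to be used together; include_top_xid_in_record() is a hypothetical helper standing in for the WAL-insertion code that actually carries the top-level XID in the record:

    if (IsSubTransactionAssignmentPending())
    {
        TransactionId   topxid = GetTopTransactionIdIfAny();

        include_top_xid_in_record(topxid);      /* hypothetical helper */
        MarkSubTransactionAssigned();
    }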
+ */ +void +MarkSubTransactionAssigned(void) +{ + Assert(IsSubTransactionAssignmentPending()); + + CurrentTransactionState->assigned = true; +} diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c new file mode 100644 index 0000000..ef72bde --- /dev/null +++ b/src/backend/access/transam/xlog.c @@ -0,0 +1,13209 @@ +/*------------------------------------------------------------------------- + * + * xlog.c + * PostgreSQL write-ahead log manager + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/xlog.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "access/clog.h" +#include "access/commit_ts.h" +#include "access/heaptoast.h" +#include "access/multixact.h" +#include "access/rewriteheap.h" +#include "access/subtrans.h" +#include "access/timeline.h" +#include "access/transam.h" +#include "access/twophase.h" +#include "access/xact.h" +#include "access/xlog_internal.h" +#include "access/xlogarchive.h" +#include "access/xloginsert.h" +#include "access/xlogreader.h" +#include "access/xlogutils.h" +#include "catalog/catversion.h" +#include "catalog/pg_control.h" +#include "catalog/pg_database.h" +#include "commands/progress.h" +#include "commands/tablespace.h" +#include "common/controldata_utils.h" +#include "common/file_utils.h" +#include "executor/instrument.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "pgstat.h" +#include "port/atomics.h" +#include "port/pg_iovec.h" +#include "postmaster/bgwriter.h" +#include "postmaster/startup.h" +#include "postmaster/walwriter.h" +#include "replication/basebackup.h" +#include "replication/logical.h" +#include "replication/origin.h" +#include "replication/slot.h" +#include "replication/snapbuild.h" +#include "replication/walreceiver.h" +#include "replication/walsender.h" +#include "storage/bufmgr.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/large_object.h" +#include "storage/latch.h" +#include "storage/pmsignal.h" +#include "storage/predicate.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/reinit.h" +#include "storage/smgr.h" +#include "storage/spin.h" +#include "storage/sync.h" +#include "utils/builtins.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/ps_status.h" +#include "utils/relmapper.h" +#include "utils/pg_rusage.h" +#include "utils/snapmgr.h" +#include "utils/timestamp.h" + +extern uint32 bootstrap_data_checksum_version; + +/* Unsupported old recovery command file names (relative to $PGDATA) */ +#define RECOVERY_COMMAND_FILE "recovery.conf" +#define RECOVERY_COMMAND_DONE "recovery.done" + +/* User-settable parameters */ +int max_wal_size_mb = 1024; /* 1 GB */ +int min_wal_size_mb = 80; /* 80 MB */ +int wal_keep_size_mb = 0; +int XLOGbuffers = -1; +int XLogArchiveTimeout = 0; +int XLogArchiveMode = ARCHIVE_MODE_OFF; +char *XLogArchiveCommand = NULL; +bool EnableHotStandby = false; +bool fullPageWrites = true; +bool wal_log_hints = false; +bool wal_compression = false; +char *wal_consistency_checking_string = NULL; +bool *wal_consistency_checking = NULL; +bool wal_init_zero = true; +bool wal_recycle = true; +bool log_checkpoints = false; +int sync_method = DEFAULT_SYNC_METHOD; +int wal_level = WAL_LEVEL_MINIMAL; +int CommitDelay = 0; /* precommit 
delay in microseconds */ +int CommitSiblings = 5; /* # concurrent xacts needed to sleep */ +int wal_retrieve_retry_interval = 5000; +int max_slot_wal_keep_size_mb = -1; +bool track_wal_io_timing = false; + +#ifdef WAL_DEBUG +bool XLOG_DEBUG = false; +#endif + +int wal_segment_size = DEFAULT_XLOG_SEG_SIZE; + +/* + * Number of WAL insertion locks to use. A higher value allows more insertions + * to happen concurrently, but adds some CPU overhead to flushing the WAL, + * which needs to iterate all the locks. + */ +#define NUM_XLOGINSERT_LOCKS 8 + +/* + * Max distance from last checkpoint, before triggering a new xlog-based + * checkpoint. + */ +int CheckPointSegments; + +/* Estimated distance between checkpoints, in bytes */ +static double CheckPointDistanceEstimate = 0; +static double PrevCheckPointDistance = 0; + +/* + * GUC support + */ +const struct config_enum_entry sync_method_options[] = { + {"fsync", SYNC_METHOD_FSYNC, false}, +#ifdef HAVE_FSYNC_WRITETHROUGH + {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false}, +#endif +#ifdef HAVE_FDATASYNC + {"fdatasync", SYNC_METHOD_FDATASYNC, false}, +#endif +#ifdef OPEN_SYNC_FLAG + {"open_sync", SYNC_METHOD_OPEN, false}, +#endif +#ifdef OPEN_DATASYNC_FLAG + {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false}, +#endif + {NULL, 0, false} +}; + + +/* + * Although only "on", "off", and "always" are documented, + * we accept all the likely variants of "on" and "off". + */ +const struct config_enum_entry archive_mode_options[] = { + {"always", ARCHIVE_MODE_ALWAYS, false}, + {"on", ARCHIVE_MODE_ON, false}, + {"off", ARCHIVE_MODE_OFF, false}, + {"true", ARCHIVE_MODE_ON, true}, + {"false", ARCHIVE_MODE_OFF, true}, + {"yes", ARCHIVE_MODE_ON, true}, + {"no", ARCHIVE_MODE_OFF, true}, + {"1", ARCHIVE_MODE_ON, true}, + {"0", ARCHIVE_MODE_OFF, true}, + {NULL, 0, false} +}; + +const struct config_enum_entry recovery_target_action_options[] = { + {"pause", RECOVERY_TARGET_ACTION_PAUSE, false}, + {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false}, + {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false}, + {NULL, 0, false} +}; + +/* + * Statistics for current checkpoint are collected in this global struct. + * Because only the checkpointer or a stand-alone backend can perform + * checkpoints, this will be unused in normal backends. + */ +CheckpointStatsData CheckpointStats; + +/* + * ThisTimeLineID will be same in all backends --- it identifies current + * WAL timeline for the database system. + */ +TimeLineID ThisTimeLineID = 0; + +/* + * Are we doing recovery from XLOG? + * + * This is only ever true in the startup process; it should be read as meaning + * "this process is replaying WAL records", rather than "the system is in + * recovery mode". It should be examined primarily by functions that need + * to act differently when called from a WAL redo function (e.g., to skip WAL + * logging). To check whether the system is in recovery regardless of which + * process you're running in, use RecoveryInProgress() but only after shared + * memory startup and lock initialization. + */ +bool InRecovery = false; + +/* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */ +HotStandbyState standbyState = STANDBY_DISABLED; + +static XLogRecPtr LastRec; + +/* Local copy of WalRcv->flushedUpto */ +static XLogRecPtr flushedUpto = 0; +static TimeLineID receiveTLI = 0; + +/* + * abortedRecPtr is the start pointer of a broken record at end of WAL when + * recovery completes; missingContrecPtr is the location of the first + * contrecord that went missing. 
See CreateOverwriteContrecordRecord for + * details. + */ +static XLogRecPtr abortedRecPtr; +static XLogRecPtr missingContrecPtr; + +/* + * During recovery, lastFullPageWrites keeps track of full_page_writes that + * the replayed WAL records indicate. It's initialized with full_page_writes + * that the recovery starting checkpoint record indicates, and then updated + * each time XLOG_FPW_CHANGE record is replayed. + */ +static bool lastFullPageWrites; + +/* + * Local copy of the state tracked by SharedRecoveryState in shared memory, + * It is false if SharedRecoveryState is RECOVERY_STATE_DONE. True actually + * means "not known, need to check the shared state". + */ +static bool LocalRecoveryInProgress = true; + +/* + * Local copy of SharedHotStandbyActive variable. False actually means "not + * known, need to check the shared state". + */ +static bool LocalHotStandbyActive = false; + +/* + * Local copy of SharedPromoteIsTriggered variable. False actually means "not + * known, need to check the shared state". + */ +static bool LocalPromoteIsTriggered = false; + +/* + * Local state for XLogInsertAllowed(): + * 1: unconditionally allowed to insert XLOG + * 0: unconditionally not allowed to insert XLOG + * -1: must check RecoveryInProgress(); disallow until it is false + * Most processes start with -1 and transition to 1 after seeing that recovery + * is not in progress. But we can also force the value for special cases. + * The coding in XLogInsertAllowed() depends on the first two of these states + * being numerically the same as bool true and false. + */ +static int LocalXLogInsertAllowed = -1; + +/* + * When ArchiveRecoveryRequested is set, archive recovery was requested, + * ie. signal files were present. When InArchiveRecovery is set, we are + * currently recovering using offline XLOG archives. These variables are only + * valid in the startup process. + * + * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're + * currently performing crash recovery using only XLOG files in pg_wal, but + * will switch to using offline XLOG archives as soon as we reach the end of + * WAL in pg_wal. +*/ +bool ArchiveRecoveryRequested = false; +bool InArchiveRecovery = false; + +static bool standby_signal_file_found = false; +static bool recovery_signal_file_found = false; + +/* Was the last xlog file restored from archive, or local? */ +static bool restoredFromArchive = false; + +/* Buffers dedicated to consistency checks of size BLCKSZ */ +static char *replay_image_masked = NULL; +static char *primary_image_masked = NULL; + +/* options formerly taken from recovery.conf for archive recovery */ +char *recoveryRestoreCommand = NULL; +char *recoveryEndCommand = NULL; +char *archiveCleanupCommand = NULL; +RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET; +bool recoveryTargetInclusive = true; +int recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE; +TransactionId recoveryTargetXid; +char *recovery_target_time_string; +static TimestampTz recoveryTargetTime; +const char *recoveryTargetName; +XLogRecPtr recoveryTargetLSN; +int recovery_min_apply_delay = 0; + +/* options formerly taken from recovery.conf for XLOG streaming */ +bool StandbyModeRequested = false; +char *PrimaryConnInfo = NULL; +char *PrimarySlotName = NULL; +char *PromoteTriggerFile = NULL; +bool wal_receiver_create_temp_slot = false; + +/* are we currently in standby mode? 
*/ +bool StandbyMode = false; + +/* + * if recoveryStopsBefore/After returns true, it saves information of the stop + * point here + */ +static TransactionId recoveryStopXid; +static TimestampTz recoveryStopTime; +static XLogRecPtr recoveryStopLSN; +static char recoveryStopName[MAXFNAMELEN]; +static bool recoveryStopAfter; + +/* + * During normal operation, the only timeline we care about is ThisTimeLineID. + * During recovery, however, things are more complicated. To simplify life + * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we + * scan through the WAL history (that is, it is the line that was active when + * the currently-scanned WAL record was generated). We also need these + * timeline values: + * + * recoveryTargetTimeLineGoal: what the user requested, if any + * + * recoveryTargetTLIRequested: numeric value of requested timeline, if constant + * + * recoveryTargetTLI: the currently understood target timeline; changes + * + * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of + * its known parents, newest first (so recoveryTargetTLI is always the + * first list member). Only these TLIs are expected to be seen in the WAL + * segments we read, and indeed only these TLIs will be considered as + * candidate WAL files to open at all. + * + * curFileTLI: the TLI appearing in the name of the current input WAL file. + * (This is not necessarily the same as ThisTimeLineID, because we could + * be scanning data that was copied from an ancestor timeline when the current + * file was created.) During a sequential scan we do not allow this value + * to decrease. + */ +RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST; +TimeLineID recoveryTargetTLIRequested = 0; +TimeLineID recoveryTargetTLI = 0; +static List *expectedTLEs; +static TimeLineID curFileTLI; + +/* + * ProcLastRecPtr points to the start of the last XLOG record inserted by the + * current backend. It is updated for all inserts. XactLastRecEnd points to + * end+1 of the last record, and is reset when we end a top-level transaction, + * or start a new one; so it can be used to tell if the current transaction has + * created any XLOG records. + * + * While in parallel mode, this may not be fully up to date. When committing, + * a transaction can assume this covers all xlog records written either by the + * user backend or by any parallel worker which was present at any point during + * the transaction. But when aborting, or when still in parallel mode, other + * parallel backends may have written WAL records at later LSNs than the value + * stored here. The parallel leader advances its own copy, when necessary, + * in WaitForParallelWorkersToFinish. + */ +XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr; +XLogRecPtr XactLastRecEnd = InvalidXLogRecPtr; +XLogRecPtr XactLastCommitEnd = InvalidXLogRecPtr; + +/* + * RedoRecPtr is this backend's local copy of the REDO record pointer + * (which is almost but not quite the same as a pointer to the most recent + * CHECKPOINT record). We update this from the shared-memory copy, + * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we + * hold an insertion lock). See XLogInsertRecord for details. We are also + * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck; + * see GetRedoRecPtr. A freshly spawned backend obtains the value during + * InitXLOGAccess. 
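As an illustration of how RedoRecPtr is consulted together with doPageWrites (declared just below), the full-page-image decision reduces to a comparison against the redo point; this is a simplified sketch, not the actual buffer-registration code:

    static bool
    page_needs_full_page_image(XLogRecPtr page_lsn)
    {
        /* no WAL has touched this page since the last redo point: back it up */
        return doPageWrites && page_lsn <= RedoRecPtr;
    }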
+ */ +static XLogRecPtr RedoRecPtr; + +/* + * doPageWrites is this backend's local copy of (forcePageWrites || + * fullPageWrites). It is used together with RedoRecPtr to decide whether + * a full-page image of a page need to be taken. + */ +static bool doPageWrites; + +/* Has the recovery code requested a walreceiver wakeup? */ +static bool doRequestWalReceiverReply; + +/* + * RedoStartLSN points to the checkpoint's REDO location which is specified + * in a backup label file, backup history file or control file. In standby + * mode, XLOG streaming usually starts from the position where an invalid + * record was found. But if we fail to read even the initial checkpoint + * record, we use the REDO location instead of the checkpoint location as + * the start position of XLOG streaming. Otherwise we would have to jump + * backwards to the REDO location after reading the checkpoint record, + * because the REDO record can precede the checkpoint record. + */ +static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr; + +/*---------- + * Shared-memory data structures for XLOG control + * + * LogwrtRqst indicates a byte position that we need to write and/or fsync + * the log up to (all records before that point must be written or fsynced). + * LogwrtResult indicates the byte positions we have already written/fsynced. + * These structs are identical but are declared separately to indicate their + * slightly different functions. + * + * To read XLogCtl->LogwrtResult, you must hold either info_lck or + * WALWriteLock. To update it, you need to hold both locks. The point of + * this arrangement is that the value can be examined by code that already + * holds WALWriteLock without needing to grab info_lck as well. In addition + * to the shared variable, each backend has a private copy of LogwrtResult, + * which is updated when convenient. + * + * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst + * (protected by info_lck), but we don't need to cache any copies of it. + * + * info_lck is only held long enough to read/update the protected variables, + * so it's a plain spinlock. The other locks are held longer (potentially + * over I/O operations), so we use LWLocks for them. These locks are: + * + * WALBufMappingLock: must be held to replace a page in the WAL buffer cache. + * It is only held while initializing and changing the mapping. If the + * contents of the buffer being replaced haven't been written yet, the mapping + * lock is released while the write is done, and reacquired afterwards. + * + * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or + * XLogFlush). + * + * ControlFileLock: must be held to read/update control file or create + * new log file. + * + *---------- + */ + +typedef struct XLogwrtRqst +{ + XLogRecPtr Write; /* last byte + 1 to write out */ + XLogRecPtr Flush; /* last byte + 1 to flush */ +} XLogwrtRqst; + +typedef struct XLogwrtResult +{ + XLogRecPtr Write; /* last byte + 1 written out */ + XLogRecPtr Flush; /* last byte + 1 flushed */ +} XLogwrtResult; + +/* + * Inserting to WAL is protected by a small fixed number of WAL insertion + * locks. To insert to the WAL, you must hold one of the locks - it doesn't + * matter which one. To lock out other concurrent insertions, you must hold + * of them. Each WAL insertion lock consists of a lightweight lock, plus an + * indicator of how far the insertion has progressed (insertingAt). 
+ * + * The insertingAt values are read when a process wants to flush WAL from + * the in-memory buffers to disk, to check that all the insertions to the + * region the process is about to write out have finished. You could simply + * wait for all currently in-progress insertions to finish, but the + * insertingAt indicator allows you to ignore insertions to later in the WAL, + * so that you only wait for the insertions that are modifying the buffers + * you're about to write out. + * + * This isn't just an optimization. If all the WAL buffers are dirty, an + * inserter that's holding a WAL insert lock might need to evict an old WAL + * buffer, which requires flushing the WAL. If it's possible for an inserter + * to block on another inserter unnecessarily, deadlock can arise when two + * inserters holding a WAL insert lock wait for each other to finish their + * insertion. + * + * Small WAL records that don't cross a page boundary never update the value, + * the WAL record is just copied to the page and the lock is released. But + * to avoid the deadlock-scenario explained above, the indicator is always + * updated before sleeping while holding an insertion lock. + * + * lastImportantAt contains the LSN of the last important WAL record inserted + * using a given lock. This value is used to detect if there has been + * important WAL activity since the last time some action, like a checkpoint, + * was performed - allowing to not repeat the action if not. The LSN is + * updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag was + * set. lastImportantAt is never cleared, only overwritten by the LSN of newer + * records. Tracking the WAL activity directly in WALInsertLock has the + * advantage of not needing any additional locks to update the value. + */ +typedef struct +{ + LWLock lock; + XLogRecPtr insertingAt; + XLogRecPtr lastImportantAt; +} WALInsertLock; + +/* + * All the WAL insertion locks are allocated as an array in shared memory. We + * force the array stride to be a power of 2, which saves a few cycles in + * indexing, but more importantly also ensures that individual slots don't + * cross cache line boundaries. (Of course, we have to also ensure that the + * array start address is suitably aligned.) + */ +typedef union WALInsertLockPadded +{ + WALInsertLock l; + char pad[PG_CACHE_LINE_SIZE]; +} WALInsertLockPadded; + +/* + * State of an exclusive backup, necessary to control concurrent activities + * across sessions when working on exclusive backups. + * + * EXCLUSIVE_BACKUP_NONE means that there is no exclusive backup actually + * running, to be more precise pg_start_backup() is not being executed for + * an exclusive backup and there is no exclusive backup in progress. + * EXCLUSIVE_BACKUP_STARTING means that pg_start_backup() is starting an + * exclusive backup. + * EXCLUSIVE_BACKUP_IN_PROGRESS means that pg_start_backup() has finished + * running and an exclusive backup is in progress. pg_stop_backup() is + * needed to finish it. + * EXCLUSIVE_BACKUP_STOPPING means that pg_stop_backup() is stopping an + * exclusive backup. + */ +typedef enum ExclusiveBackupState +{ + EXCLUSIVE_BACKUP_NONE = 0, + EXCLUSIVE_BACKUP_STARTING, + EXCLUSIVE_BACKUP_IN_PROGRESS, + EXCLUSIVE_BACKUP_STOPPING +} ExclusiveBackupState; + +/* + * Session status of running backup, used for sanity checks in SQL-callable + * functions to start and stop backups. + */ +static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE; + +/* + * Shared state data for WAL insertion. 
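The padding union above is what keeps each insertion lock on its own cache line; a compile-time check along these lines (illustrative, assuming StaticAssertDecl is available at file scope) would make that assumption explicit:

    StaticAssertDecl(sizeof(WALInsertLockPadded) == PG_CACHE_LINE_SIZE,
                     "WALInsertLockPadded must be exactly one cache line");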
+ */ +typedef struct XLogCtlInsert +{ + slock_t insertpos_lck; /* protects CurrBytePos and PrevBytePos */ + + /* + * CurrBytePos is the end of reserved WAL. The next record will be + * inserted at that position. PrevBytePos is the start position of the + * previously inserted (or rather, reserved) record - it is copied to the + * prev-link of the next record. These are stored as "usable byte + * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()). + */ + uint64 CurrBytePos; + uint64 PrevBytePos; + + /* + * Make sure the above heavily-contended spinlock and byte positions are + * on their own cache line. In particular, the RedoRecPtr and full page + * write variables below should be on a different cache line. They are + * read on every WAL insertion, but updated rarely, and we don't want + * those reads to steal the cache line containing Curr/PrevBytePos. + */ + char pad[PG_CACHE_LINE_SIZE]; + + /* + * fullPageWrites is the authoritative value used by all backends to + * determine whether to write full-page image to WAL. This shared value, + * instead of the process-local fullPageWrites, is required because, when + * full_page_writes is changed by SIGHUP, we must WAL-log it before it + * actually affects WAL-logging by backends. Checkpointer sets at startup + * or after SIGHUP. + * + * To read these fields, you must hold an insertion lock. To modify them, + * you must hold ALL the locks. + */ + XLogRecPtr RedoRecPtr; /* current redo point for insertions */ + bool forcePageWrites; /* forcing full-page writes for PITR? */ + bool fullPageWrites; + + /* + * exclusiveBackupState indicates the state of an exclusive backup (see + * comments of ExclusiveBackupState for more details). nonExclusiveBackups + * is a counter indicating the number of streaming base backups currently + * in progress. forcePageWrites is set to true when either of these is + * non-zero. lastBackupStart is the latest checkpoint redo location used + * as a starting point for an online backup. + */ + ExclusiveBackupState exclusiveBackupState; + int nonExclusiveBackups; + XLogRecPtr lastBackupStart; + + /* + * WAL insertion locks. + */ + WALInsertLockPadded *WALInsertLocks; +} XLogCtlInsert; + +/* + * Total shared-memory state for XLOG. + */ +typedef struct XLogCtlData +{ + XLogCtlInsert Insert; + + /* Protected by info_lck: */ + XLogwrtRqst LogwrtRqst; + XLogRecPtr RedoRecPtr; /* a recent copy of Insert->RedoRecPtr */ + FullTransactionId ckptFullXid; /* nextXid of latest checkpoint */ + XLogRecPtr asyncXactLSN; /* LSN of newest async commit/abort */ + XLogRecPtr replicationSlotMinLSN; /* oldest LSN needed by any slot */ + + XLogSegNo lastRemovedSegNo; /* latest removed/recycled XLOG segment */ + + /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */ + XLogRecPtr unloggedLSN; + slock_t ulsn_lck; + + /* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */ + pg_time_t lastSegSwitchTime; + XLogRecPtr lastSegSwitchLSN; + + /* + * Protected by info_lck and WALWriteLock (you must hold either lock to + * read it, but both to update) + */ + XLogwrtResult LogwrtResult; + + /* + * Latest initialized page in the cache (last byte position + 1). + * + * To change the identity of a buffer (and InitializedUpTo), you need to + * hold WALBufMappingLock. 
To change the identity of a buffer that's + * still dirty, the old page needs to be written out first, and for that + * you need WALWriteLock, and you need to ensure that there are no + * in-progress insertions to the page by calling + * WaitXLogInsertionsToFinish(). + */ + XLogRecPtr InitializedUpTo; + + /* + * These values do not change after startup, although the pointed-to pages + * and xlblocks values certainly do. xlblocks values are protected by + * WALBufMappingLock. + */ + char *pages; /* buffers for unwritten XLOG pages */ + XLogRecPtr *xlblocks; /* 1st byte ptr-s + XLOG_BLCKSZ */ + int XLogCacheBlck; /* highest allocated xlog buffer index */ + + /* + * Shared copy of ThisTimeLineID. Does not change after end-of-recovery. + * If we created a new timeline when the system was started up, + * PrevTimeLineID is the old timeline's ID that we forked off from. + * Otherwise it's equal to ThisTimeLineID. + */ + TimeLineID ThisTimeLineID; + TimeLineID PrevTimeLineID; + + /* + * SharedRecoveryState indicates if we're still in crash or archive + * recovery. Protected by info_lck. + */ + RecoveryState SharedRecoveryState; + + /* + * SharedHotStandbyActive indicates if we allow hot standby queries to be + * run. Protected by info_lck. + */ + bool SharedHotStandbyActive; + + /* + * SharedPromoteIsTriggered indicates if a standby promotion has been + * triggered. Protected by info_lck. + */ + bool SharedPromoteIsTriggered; + + /* + * WalWriterSleeping indicates whether the WAL writer is currently in + * low-power mode (and hence should be nudged if an async commit occurs). + * Protected by info_lck. + */ + bool WalWriterSleeping; + + /* + * recoveryWakeupLatch is used to wake up the startup process to continue + * WAL replay, if it is waiting for WAL to arrive or failover trigger file + * to appear. + * + * Note that the startup process also uses another latch, its procLatch, + * to wait for recovery conflict. If we get rid of recoveryWakeupLatch for + * signaling the startup process in favor of using its procLatch, which + * comports better with possible generic signal handlers using that latch. + * But we should not do that because the startup process doesn't assume + * that it's waken up by walreceiver process or SIGHUP signal handler + * while it's waiting for recovery conflict. The separate latches, + * recoveryWakeupLatch and procLatch, should be used for inter-process + * communication for WAL replay and recovery conflict, respectively. + */ + Latch recoveryWakeupLatch; + + /* + * During recovery, we keep a copy of the latest checkpoint record here. + * lastCheckPointRecPtr points to start of checkpoint record and + * lastCheckPointEndPtr points to end+1 of checkpoint record. Used by the + * checkpointer when it wants to create a restartpoint. + * + * Protected by info_lck. + */ + XLogRecPtr lastCheckPointRecPtr; + XLogRecPtr lastCheckPointEndPtr; + CheckPoint lastCheckPoint; + + /* + * lastReplayedEndRecPtr points to end+1 of the last record successfully + * replayed. When we're currently replaying a record, ie. in a redo + * function, replayEndRecPtr points to the end+1 of the record being + * replayed, otherwise it's equal to lastReplayedEndRecPtr. 
+ */ + XLogRecPtr lastReplayedEndRecPtr; + TimeLineID lastReplayedTLI; + XLogRecPtr replayEndRecPtr; + TimeLineID replayEndTLI; + /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */ + TimestampTz recoveryLastXTime; + + /* + * timestamp of when we started replaying the current chunk of WAL data, + * only relevant for replication or archive recovery + */ + TimestampTz currentChunkStartTime; + /* Recovery pause state */ + RecoveryPauseState recoveryPauseState; + ConditionVariable recoveryNotPausedCV; + + /* + * lastFpwDisableRecPtr points to the start of the last replayed + * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled. + */ + XLogRecPtr lastFpwDisableRecPtr; + + slock_t info_lck; /* locks shared variables shown above */ +} XLogCtlData; + +static XLogCtlData *XLogCtl = NULL; + +/* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */ +static WALInsertLockPadded *WALInsertLocks = NULL; + +/* + * We maintain an image of pg_control in shared memory. + */ +static ControlFileData *ControlFile = NULL; + +/* + * Calculate the amount of space left on the page after 'endptr'. Beware + * multiple evaluation! + */ +#define INSERT_FREESPACE(endptr) \ + (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ)) + +/* Macro to advance to next buffer index. */ +#define NextBufIdx(idx) \ + (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1)) + +/* + * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or + * would hold if it was in cache, the page containing 'recptr'. + */ +#define XLogRecPtrToBufIdx(recptr) \ + (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1)) + +/* + * These are the number of bytes in a WAL page usable for WAL data. + */ +#define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD) + +/* + * Convert values of GUCs measured in megabytes to equiv. segment count. + * Rounds down. + */ +#define ConvertToXSegs(x, segsize) XLogMBVarToSegs((x), (segsize)) + +/* The number of bytes in a WAL segment usable for WAL data. */ +static int UsableBytesInSegment; + +/* + * Private, possibly out-of-date copy of shared LogwrtResult. + * See discussion above. + */ +static XLogwrtResult LogwrtResult = {0, 0}; + +/* + * Codes indicating where we got a WAL file from during recovery, or where + * to attempt to get one. + */ +typedef enum +{ + XLOG_FROM_ANY = 0, /* request to read WAL from any source */ + XLOG_FROM_ARCHIVE, /* restored using restore_command */ + XLOG_FROM_PG_WAL, /* existing file in pg_wal */ + XLOG_FROM_STREAM /* streamed from primary */ +} XLogSource; + +/* human-readable names for XLogSources, for debugging output */ +static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"}; + +/* + * openLogFile is -1 or a kernel FD for an open log file segment. + * openLogSegNo identifies the segment. These variables are only used to + * write the XLOG, and so will normally refer to the active segment. + * Note: call Reserve/ReleaseExternalFD to track consumption of this FD. + */ +static int openLogFile = -1; +static XLogSegNo openLogSegNo = 0; + +/* + * These variables are used similarly to the ones above, but for reading + * the XLOG. readOff is the offset of the page just read, readLen + * indicates how much of it has been read into readBuf, and readSource + * indicates where we got the currently open file from. 
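To make the page-arithmetic macros above concrete, a small worked example assuming the default XLOG_BLCKSZ of 8192 (the function exists only for illustration):

    static void
    insert_freespace_example(void)
    {
        XLogRecPtr  endptr = 3 * 8192 + 100;    /* 100 bytes into the 4th WAL page */

        Assert(INSERT_FREESPACE(endptr) == 8192 - 100);         /* 8092 usable bytes left */
        Assert(INSERT_FREESPACE((XLogRecPtr) 3 * 8192) == 0);   /* exactly on a page boundary */

        /* XLogRecPtrToBufIdx(endptr) maps the same position to buffer slot 3,
         * modulo the number of configured WAL buffers. */
    }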
+ * Note: we could use Reserve/ReleaseExternalFD to track consumption of + * this FD too; but it doesn't currently seem worthwhile, since the XLOG is + * not read by general-purpose sessions. + */ +static int readFile = -1; +static XLogSegNo readSegNo = 0; +static uint32 readOff = 0; +static uint32 readLen = 0; +static XLogSource readSource = XLOG_FROM_ANY; + +/* + * Keeps track of which source we're currently reading from. This is + * different from readSource in that this is always set, even when we don't + * currently have a WAL file open. If lastSourceFailed is set, our last + * attempt to read from currentSource failed, and we should try another source + * next. + * + * pendingWalRcvRestart is set when a config change occurs that requires a + * walreceiver restart. This is only valid in XLOG_FROM_STREAM state. + */ +static XLogSource currentSource = XLOG_FROM_ANY; +static bool lastSourceFailed = false; +static bool pendingWalRcvRestart = false; + +typedef struct XLogPageReadPrivate +{ + int emode; + bool fetching_ckpt; /* are we fetching a checkpoint record? */ + bool randAccess; +} XLogPageReadPrivate; + +/* + * These variables track when we last obtained some WAL data to process, + * and where we got it from. (XLogReceiptSource is initially the same as + * readSource, but readSource gets reset to zero when we don't have data + * to process right now. It is also different from currentSource, which + * also changes when we try to read from a source and fail, while + * XLogReceiptSource tracks where we last successfully read some WAL.) + */ +static TimestampTz XLogReceiptTime = 0; +static XLogSource XLogReceiptSource = XLOG_FROM_ANY; + +/* State information for XLOG reading */ +static XLogRecPtr ReadRecPtr; /* start of last record read */ +static XLogRecPtr EndRecPtr; /* end+1 of last record read */ + +/* + * Local copies of equivalent fields in the control file. When running + * crash recovery, minRecoveryPoint is set to InvalidXLogRecPtr as we + * expect to replay all the WAL available, and updateMinRecoveryPoint is + * switched to false to prevent any updates while replaying records. + * Those values are kept consistent as long as crash recovery runs. + */ +static XLogRecPtr minRecoveryPoint; +static TimeLineID minRecoveryPointTLI; +static bool updateMinRecoveryPoint = true; + +/* + * Have we reached a consistent database state? In crash recovery, we have + * to replay all the WAL, so reachedConsistency is never set. During archive + * recovery, the database is consistent once minRecoveryPoint is reached. + */ +bool reachedConsistency = false; + +static bool InRedo = false; + +/* Have we launched bgwriter during recovery? 
*/ +static bool bgwriterLaunched = false; + +/* For WALInsertLockAcquire/Release functions */ +static int MyLockNo = 0; +static bool holdingAllLocks = false; + +#ifdef WAL_DEBUG +static MemoryContext walDebugCxt = NULL; +#endif + +static void readRecoverySignalFile(void); +static void validateRecoveryParameters(void); +static void exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog); +static bool recoveryStopsBefore(XLogReaderState *record); +static bool recoveryStopsAfter(XLogReaderState *record); +static void ConfirmRecoveryPaused(void); +static void recoveryPausesHere(bool endOfRecovery); +static bool recoveryApplyDelay(XLogReaderState *record); +static void SetLatestXTime(TimestampTz xtime); +static void SetCurrentChunkStartTime(TimestampTz xtime); +static void CheckRequiredParameterValues(void); +static void XLogReportParameters(void); +static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, + TimeLineID prevTLI); +static void VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec, + XLogReaderState *state); +static void LocalSetXLogInsertAllowed(void); +static void CreateEndOfRecoveryRecord(void); +static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn); +static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags); +static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo); +static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void); + +static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic); +static bool XLogCheckpointNeeded(XLogSegNo new_segno); +static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible); +static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, + bool find_free, XLogSegNo max_segno, + bool use_lock); +static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, + XLogSource source, bool notfoundOk); +static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source); +static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, + int reqLen, XLogRecPtr targetRecPtr, char *readBuf); +static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, + bool fetching_ckpt, XLogRecPtr tliRecPtr); +static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr); +static void XLogFileClose(void); +static void PreallocXlogFiles(XLogRecPtr endptr); +static void RemoveTempXlogFiles(void); +static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr); +static void RemoveXlogFile(const char *segname, XLogSegNo recycleSegNo, + XLogSegNo *endlogSegNo); +static void UpdateLastRemovedPtr(char *filename); +static void ValidateXLOGDirectoryStructure(void); +static void CleanupBackupHistory(void); +static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force); +static XLogRecord *ReadRecord(XLogReaderState *xlogreader, + int emode, bool fetching_ckpt); +static void CheckRecoveryConsistency(void); +static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader, + XLogRecPtr RecPtr, int whichChkpt, bool report); +static bool rescanLatestTimeLine(void); +static void InitControlFile(uint64 sysidentifier); +static void WriteControlFile(void); +static void ReadControlFile(void); +static char *str_time(pg_time_t tnow); +static void SetPromoteIsTriggered(void); +static bool CheckForStandbyTrigger(void); + +#ifdef WAL_DEBUG +static void xlog_outrec(StringInfo buf, XLogReaderState *record); +#endif +static void xlog_block_info(StringInfo buf, XLogReaderState *record); +static void xlog_outdesc(StringInfo buf, XLogReaderState *record); 
+static void pg_start_backup_callback(int code, Datum arg); +static void pg_stop_backup_callback(int code, Datum arg); +static bool read_backup_label(XLogRecPtr *checkPointLoc, + bool *backupEndRequired, bool *backupFromStandby); +static bool read_tablespace_map(List **tablespaces); + +static void rm_redo_error_callback(void *arg); +static int get_sync_bit(int method); + +static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch, + XLogRecData *rdata, + XLogRecPtr StartPos, XLogRecPtr EndPos); +static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, + XLogRecPtr *EndPos, XLogRecPtr *PrevPtr); +static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, + XLogRecPtr *PrevPtr); +static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto); +static char *GetXLogBuffer(XLogRecPtr ptr); +static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos); +static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos); +static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr); +static void checkXLogConsistency(XLogReaderState *record); + +static void WALInsertLockAcquire(void); +static void WALInsertLockAcquireExclusive(void); +static void WALInsertLockRelease(void); +static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt); + +/* + * Insert an XLOG record represented by an already-constructed chain of data + * chunks. This is a low-level routine; to construct the WAL record header + * and data, use the higher-level routines in xloginsert.c. + * + * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this + * WAL record applies to, that were not included in the record as full page + * images. If fpw_lsn <= RedoRecPtr, the function does not perform the + * insertion and returns InvalidXLogRecPtr. The caller can then recalculate + * which pages need a full-page image, and retry. If fpw_lsn is invalid, the + * record is always inserted. + * + * 'flags' gives more in-depth control on the record being inserted. See + * XLogSetRecordFlags() for details. + * + * The first XLogRecData in the chain must be for the record header, and its + * data must be MAXALIGNed. XLogInsertRecord fills in the xl_prev and + * xl_crc fields in the header, the rest of the header must already be filled + * by the caller. + * + * Returns XLOG pointer to end of record (beginning of next record). + * This can be used as LSN for data pages affected by the logged action. + * (LSN is the XLOG point up to which the XLOG must be flushed to disk + * before the data page can be written out. This implements the basic + * WAL rule "write the log before the data".) + */ +XLogRecPtr +XLogInsertRecord(XLogRecData *rdata, + XLogRecPtr fpw_lsn, + uint8 flags, + int num_fpi) +{ + XLogCtlInsert *Insert = &XLogCtl->Insert; + pg_crc32c rdata_crc; + bool inserted; + XLogRecord *rechdr = (XLogRecord *) rdata->data; + uint8 info = rechdr->xl_info & ~XLR_INFO_MASK; + bool isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID && + info == XLOG_SWITCH); + XLogRecPtr StartPos; + XLogRecPtr EndPos; + bool prevDoPageWrites = doPageWrites; + + /* we assume that all of the record header is in the first chunk */ + Assert(rdata->len >= SizeOfXLogRecord); + + /* cross-check on whether we should be here or not */ + if (!XLogInsertAllowed()) + elog(ERROR, "cannot make new WAL entries during recovery"); + + /*---------- + * + * We have now done all the preparatory work we can without holding a + * lock or modifying shared state. From here on, inserting the new WAL + * record to the shared WAL buffer cache is a two-step process: + * + * 1. 
Reserve the right amount of space from the WAL. The current head of + * reserved space is kept in Insert->CurrBytePos, and is protected by + * insertpos_lck. + * + * 2. Copy the record to the reserved WAL space. This involves finding the + * correct WAL buffer containing the reserved space, and copying the + * record in place. This can be done concurrently in multiple processes. + * + * To keep track of which insertions are still in-progress, each concurrent + * inserter acquires an insertion lock. In addition to just indicating that + * an insertion is in progress, the lock tells others how far the inserter + * has progressed. There is a small fixed number of insertion locks, + * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page + * boundary, it updates the value stored in the lock to the how far it has + * inserted, to allow the previous buffer to be flushed. + * + * Holding onto an insertion lock also protects RedoRecPtr and + * fullPageWrites from changing until the insertion is finished. + * + * Step 2 can usually be done completely in parallel. If the required WAL + * page is not initialized yet, you have to grab WALBufMappingLock to + * initialize it, but the WAL writer tries to do that ahead of insertions + * to avoid that from happening in the critical path. + * + *---------- + */ + START_CRIT_SECTION(); + if (isLogSwitch) + WALInsertLockAcquireExclusive(); + else + WALInsertLockAcquire(); + + /* + * Check to see if my copy of RedoRecPtr is out of date. If so, may have + * to go back and have the caller recompute everything. This can only + * happen just after a checkpoint, so it's better to be slow in this case + * and fast otherwise. + * + * Also check to see if fullPageWrites or forcePageWrites was just turned + * on; if we weren't already doing full-page writes then go back and + * recompute. + * + * If we aren't doing full-page writes then RedoRecPtr doesn't actually + * affect the contents of the XLOG record, so we'll update our local copy + * but not force a recomputation. (If doPageWrites was just turned off, + * we could recompute the record without full pages, but we choose not to + * bother.) + */ + if (RedoRecPtr != Insert->RedoRecPtr) + { + Assert(RedoRecPtr < Insert->RedoRecPtr); + RedoRecPtr = Insert->RedoRecPtr; + } + doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites); + + if (doPageWrites && + (!prevDoPageWrites || + (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr))) + { + /* + * Oops, some buffer now needs to be backed up that the caller didn't + * back up. Start over. + */ + WALInsertLockRelease(); + END_CRIT_SECTION(); + return InvalidXLogRecPtr; + } + + /* + * Reserve space for the record in the WAL. This also sets the xl_prev + * pointer. + */ + if (isLogSwitch) + inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev); + else + { + ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos, + &rechdr->xl_prev); + inserted = true; + } + + if (inserted) + { + /* + * Now that xl_prev has been filled in, calculate CRC of the record + * header. + */ + rdata_crc = rechdr->xl_crc; + COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc)); + FIN_CRC32C(rdata_crc); + rechdr->xl_crc = rdata_crc; + + /* + * All the record data, including the header, is now ready to be + * inserted. Copy the record in the space reserved. 
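+ * (Had the fpw_lsn check above failed instead, we would already have
+ * returned InvalidXLogRecPtr without copying anything; the caller in
+ * xloginsert.c then reassembles the record with the now-required
+ * full-page images and calls this function again.)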
+ */ + CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata, + StartPos, EndPos); + + /* + * Unless record is flagged as not important, update LSN of last + * important record in the current slot. When holding all locks, just + * update the first one. + */ + if ((flags & XLOG_MARK_UNIMPORTANT) == 0) + { + int lockno = holdingAllLocks ? 0 : MyLockNo; + + WALInsertLocks[lockno].l.lastImportantAt = StartPos; + } + } + else + { + /* + * This was an xlog-switch record, but the current insert location was + * already exactly at the beginning of a segment, so there was no need + * to do anything. + */ + } + + /* + * Done! Let others know that we're finished. + */ + WALInsertLockRelease(); + + MarkCurrentTransactionIdLoggedIfAny(); + + END_CRIT_SECTION(); + + /* + * Update shared LogwrtRqst.Write, if we crossed page boundary. + */ + if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ) + { + SpinLockAcquire(&XLogCtl->info_lck); + /* advance global request to include new block(s) */ + if (XLogCtl->LogwrtRqst.Write < EndPos) + XLogCtl->LogwrtRqst.Write = EndPos; + /* update local result copy while I have the chance */ + LogwrtResult = XLogCtl->LogwrtResult; + SpinLockRelease(&XLogCtl->info_lck); + } + + /* + * If this was an XLOG_SWITCH record, flush the record and the empty + * padding space that fills the rest of the segment, and perform + * end-of-segment actions (eg, notifying archiver). + */ + if (isLogSwitch) + { + TRACE_POSTGRESQL_WAL_SWITCH(); + XLogFlush(EndPos); + + /* + * Even though we reserved the rest of the segment for us, which is + * reflected in EndPos, we return a pointer to just the end of the + * xlog-switch record. + */ + if (inserted) + { + EndPos = StartPos + SizeOfXLogRecord; + if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ) + { + uint64 offset = XLogSegmentOffset(EndPos, wal_segment_size); + + if (offset == EndPos % XLOG_BLCKSZ) + EndPos += SizeOfXLogLongPHD; + else + EndPos += SizeOfXLogShortPHD; + } + } + } + +#ifdef WAL_DEBUG + if (XLOG_DEBUG) + { + static XLogReaderState *debug_reader = NULL; + StringInfoData buf; + StringInfoData recordBuf; + char *errormsg = NULL; + MemoryContext oldCxt; + + oldCxt = MemoryContextSwitchTo(walDebugCxt); + + initStringInfo(&buf); + appendStringInfo(&buf, "INSERT @ %X/%X: ", LSN_FORMAT_ARGS(EndPos)); + + /* + * We have to piece together the WAL record data from the XLogRecData + * entries, so that we can pass it to the rm_desc function as one + * contiguous chunk. + */ + initStringInfo(&recordBuf); + for (; rdata != NULL; rdata = rdata->next) + appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len); + + if (!debug_reader) + debug_reader = XLogReaderAllocate(wal_segment_size, NULL, + XL_ROUTINE(), NULL); + + if (!debug_reader) + { + appendStringInfoString(&buf, "error decoding record: out of memory"); + } + else if (!DecodeXLogRecord(debug_reader, (XLogRecord *) recordBuf.data, + &errormsg)) + { + appendStringInfo(&buf, "error decoding record: %s", + errormsg ? errormsg : "no error message"); + } + else + { + appendStringInfoString(&buf, " - "); + xlog_outdesc(&buf, debug_reader); + } + elog(LOG, "%s", buf.data); + + pfree(buf.data); + pfree(recordBuf.data); + MemoryContextSwitchTo(oldCxt); + } +#endif + + /* + * Update our global variables + */ + ProcLastRecPtr = StartPos; + XactLastRecEnd = EndPos; + + /* Report WAL traffic to the instrumentation. 
*/ + if (inserted) + { + pgWalUsage.wal_bytes += rechdr->xl_tot_len; + pgWalUsage.wal_records++; + pgWalUsage.wal_fpi += num_fpi; + } + + return EndPos; +} + +/* + * Reserves the right amount of space for a record of given size from the WAL. + * *StartPos is set to the beginning of the reserved section, *EndPos to + * its end+1. *PrevPtr is set to the beginning of the previous record; it is + * used to set the xl_prev of this record. + * + * This is the performance critical part of XLogInsert that must be serialized + * across backends. The rest can happen mostly in parallel. Try to keep this + * section as short as possible, insertpos_lck can be heavily contended on a + * busy system. + * + * NB: The space calculation here must match the code in CopyXLogRecordToWAL, + * where we actually copy the record to the reserved space. + */ +static void +ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos, + XLogRecPtr *PrevPtr) +{ + XLogCtlInsert *Insert = &XLogCtl->Insert; + uint64 startbytepos; + uint64 endbytepos; + uint64 prevbytepos; + + size = MAXALIGN(size); + + /* All (non xlog-switch) records should contain data. */ + Assert(size > SizeOfXLogRecord); + + /* + * The duration the spinlock needs to be held is minimized by minimizing + * the calculations that have to be done while holding the lock. The + * current tip of reserved WAL is kept in CurrBytePos, as a byte position + * that only counts "usable" bytes in WAL, that is, it excludes all WAL + * page headers. The mapping between "usable" byte positions and physical + * positions (XLogRecPtrs) can be done outside the locked region, and + * because the usable byte position doesn't include any headers, reserving + * X bytes from WAL is almost as simple as "CurrBytePos += X". + */ + SpinLockAcquire(&Insert->insertpos_lck); + + startbytepos = Insert->CurrBytePos; + endbytepos = startbytepos + size; + prevbytepos = Insert->PrevBytePos; + Insert->CurrBytePos = endbytepos; + Insert->PrevBytePos = startbytepos; + + SpinLockRelease(&Insert->insertpos_lck); + + *StartPos = XLogBytePosToRecPtr(startbytepos); + *EndPos = XLogBytePosToEndRecPtr(endbytepos); + *PrevPtr = XLogBytePosToRecPtr(prevbytepos); + + /* + * Check that the conversions between "usable byte positions" and + * XLogRecPtrs work consistently in both directions. + */ + Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos); + Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos); + Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos); +} + +/* + * Like ReserveXLogInsertLocation(), but for an xlog-switch record. + * + * A log-switch record is handled slightly differently. The rest of the + * segment will be reserved for this insertion, as indicated by the returned + * *EndPos value. However, if we are already at the beginning of the current + * segment, *StartPos and *EndPos are set to the current location without + * reserving any space, and the function returns false. +*/ +static bool +ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr) +{ + XLogCtlInsert *Insert = &XLogCtl->Insert; + uint64 startbytepos; + uint64 endbytepos; + uint64 prevbytepos; + uint32 size = MAXALIGN(SizeOfXLogRecord); + XLogRecPtr ptr; + uint32 segleft; + + /* + * These calculations are a bit heavy-weight to be done while holding a + * spinlock, but since we're holding all the WAL insertion locks, there + * are no other inserters competing for it. GetXLogInsertRecPtr() does + * compete for it, but that's not called very frequently. 
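+ * (By contrast, the critical section in ReserveXLogInsertLocation above
+ * is just two additions and two assignments. As an illustration, assuming
+ * 8-byte MAXALIGN: reserving a 100-byte record (rounded up to 104) while
+ * CurrBytePos = 1000 and PrevBytePos = 960 returns start/end/prev
+ * positions corresponding to usable bytes 1000, 1104 and 960, and leaves
+ * CurrBytePos = 1104 and PrevBytePos = 1000 for the next inserter.)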
+ */ + SpinLockAcquire(&Insert->insertpos_lck); + + startbytepos = Insert->CurrBytePos; + + ptr = XLogBytePosToEndRecPtr(startbytepos); + if (XLogSegmentOffset(ptr, wal_segment_size) == 0) + { + SpinLockRelease(&Insert->insertpos_lck); + *EndPos = *StartPos = ptr; + return false; + } + + endbytepos = startbytepos + size; + prevbytepos = Insert->PrevBytePos; + + *StartPos = XLogBytePosToRecPtr(startbytepos); + *EndPos = XLogBytePosToEndRecPtr(endbytepos); + + segleft = wal_segment_size - XLogSegmentOffset(*EndPos, wal_segment_size); + if (segleft != wal_segment_size) + { + /* consume the rest of the segment */ + *EndPos += segleft; + endbytepos = XLogRecPtrToBytePos(*EndPos); + } + Insert->CurrBytePos = endbytepos; + Insert->PrevBytePos = startbytepos; + + SpinLockRelease(&Insert->insertpos_lck); + + *PrevPtr = XLogBytePosToRecPtr(prevbytepos); + + Assert(XLogSegmentOffset(*EndPos, wal_segment_size) == 0); + Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos); + Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos); + Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos); + + return true; +} + +/* + * Checks whether the current buffer page and backup page stored in the + * WAL record are consistent or not. Before comparing the two pages, a + * masking can be applied to the pages to ignore certain areas like hint bits, + * unused space between pd_lower and pd_upper among other things. This + * function should be called once WAL replay has been completed for a + * given record. + */ +static void +checkXLogConsistency(XLogReaderState *record) +{ + RmgrId rmid = XLogRecGetRmid(record); + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; + int block_id; + + /* Records with no backup blocks have no need for consistency checks. */ + if (!XLogRecHasAnyBlockRefs(record)) + return; + + Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0); + + for (block_id = 0; block_id <= record->max_block_id; block_id++) + { + Buffer buf; + Page page; + + if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno)) + { + /* + * WAL record doesn't contain a block reference with the given id. + * Do nothing. + */ + continue; + } + + Assert(XLogRecHasBlockImage(record, block_id)); + + if (XLogRecBlockImageApply(record, block_id)) + { + /* + * WAL record has already applied the page, so bypass the + * consistency check as that would result in comparing the full + * page stored in the record with itself. + */ + continue; + } + + /* + * Read the contents from the current buffer and store it in a + * temporary page. + */ + buf = XLogReadBufferExtended(rnode, forknum, blkno, + RBM_NORMAL_NO_LOG); + if (!BufferIsValid(buf)) + continue; + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + page = BufferGetPage(buf); + + /* + * Take a copy of the local page where WAL has been applied to have a + * comparison base before masking it... + */ + memcpy(replay_image_masked, page, BLCKSZ); + + /* No need for this page anymore now that a copy is in. */ + UnlockReleaseBuffer(buf); + + /* + * If the block LSN is already ahead of this WAL record, we can't + * expect contents to match. This can happen if recovery is + * restarted. + */ + if (PageGetLSN(replay_image_masked) > record->EndRecPtr) + continue; + + /* + * Read the contents from the backup copy, stored in WAL record and + * store it in a temporary page. There is no need to allocate a new + * page here, a local buffer is fine to hold its contents and a mask + * can be directly applied on it. 
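+ * (The masking exists because some page contents may legitimately differ
+ * between the primary image and the replayed page; for example, the heap
+ * rmgr's mask callback clears tuple hint bits before the comparison.)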
+ */ + if (!RestoreBlockImage(record, block_id, primary_image_masked)) + elog(ERROR, "failed to restore block image"); + + /* + * If masking function is defined, mask both the primary and replay + * images + */ + if (RmgrTable[rmid].rm_mask != NULL) + { + RmgrTable[rmid].rm_mask(replay_image_masked, blkno); + RmgrTable[rmid].rm_mask(primary_image_masked, blkno); + } + + /* Time to compare the primary and replay images. */ + if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0) + { + elog(FATAL, + "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u", + rnode.spcNode, rnode.dbNode, rnode.relNode, + forknum, blkno); + } + } +} + +/* + * Subroutine of XLogInsertRecord. Copies a WAL record to an already-reserved + * area in the WAL. + */ +static void +CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata, + XLogRecPtr StartPos, XLogRecPtr EndPos) +{ + char *currpos; + int freespace; + int written; + XLogRecPtr CurrPos; + XLogPageHeader pagehdr; + + /* + * Get a pointer to the right place in the right WAL buffer to start + * inserting to. + */ + CurrPos = StartPos; + currpos = GetXLogBuffer(CurrPos); + freespace = INSERT_FREESPACE(CurrPos); + + /* + * there should be enough space for at least the first field (xl_tot_len) + * on this page. + */ + Assert(freespace >= sizeof(uint32)); + + /* Copy record data */ + written = 0; + while (rdata != NULL) + { + char *rdata_data = rdata->data; + int rdata_len = rdata->len; + + while (rdata_len > freespace) + { + /* + * Write what fits on this page, and continue on the next page. + */ + Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0); + memcpy(currpos, rdata_data, freespace); + rdata_data += freespace; + rdata_len -= freespace; + written += freespace; + CurrPos += freespace; + + /* + * Get pointer to beginning of next page, and set the xlp_rem_len + * in the page header. Set XLP_FIRST_IS_CONTRECORD. + * + * It's safe to set the contrecord flag and xlp_rem_len without a + * lock on the page. All the other flags were already set when the + * page was initialized, in AdvanceXLInsertBuffer, and we're the + * only backend that needs to set the contrecord flag. + */ + currpos = GetXLogBuffer(CurrPos); + pagehdr = (XLogPageHeader) currpos; + pagehdr->xlp_rem_len = write_len - written; + pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD; + + /* skip over the page header */ + if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0) + { + CurrPos += SizeOfXLogLongPHD; + currpos += SizeOfXLogLongPHD; + } + else + { + CurrPos += SizeOfXLogShortPHD; + currpos += SizeOfXLogShortPHD; + } + freespace = INSERT_FREESPACE(CurrPos); + } + + Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0); + memcpy(currpos, rdata_data, rdata_len); + currpos += rdata_len; + CurrPos += rdata_len; + freespace -= rdata_len; + written += rdata_len; + + rdata = rdata->next; + } + Assert(written == write_len); + + /* + * If this was an xlog-switch, it's not enough to write the switch record, + * we also have to consume all the remaining space in the WAL segment. We + * have already reserved that space, but we need to actually fill it. 
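+ * For a sense of scale: with the default 16 MB segments and 8 kB pages,
+ * a switch issued 1 MB into a segment zero-fills roughly 15 MB here,
+ * about 1920 pages, one page at a time.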
+ */ + if (isLogSwitch && XLogSegmentOffset(CurrPos, wal_segment_size) != 0) + { + /* An xlog-switch record doesn't contain any data besides the header */ + Assert(write_len == SizeOfXLogRecord); + + /* Assert that we did reserve the right amount of space */ + Assert(XLogSegmentOffset(EndPos, wal_segment_size) == 0); + + /* Use up all the remaining space on the current page */ + CurrPos += freespace; + + /* + * Cause all remaining pages in the segment to be flushed, leaving the + * XLog position where it should be, at the start of the next segment. + * We do this one page at a time, to make sure we don't deadlock + * against ourselves if wal_buffers < wal_segment_size. + */ + while (CurrPos < EndPos) + { + /* + * The minimal action to flush the page would be to call + * WALInsertLockUpdateInsertingAt(CurrPos) followed by + * AdvanceXLInsertBuffer(...). The page would be left initialized + * mostly to zeros, except for the page header (always the short + * variant, as this is never a segment's first page). + * + * The large vistas of zeros are good for compressibility, but the + * headers interrupting them every XLOG_BLCKSZ (with values that + * differ from page to page) are not. The effect varies with + * compression tool, but bzip2 for instance compresses about an + * order of magnitude worse if those headers are left in place. + * + * Rather than complicating AdvanceXLInsertBuffer itself (which is + * called in heavily-loaded circumstances as well as this lightly- + * loaded one) with variant behavior, we just use GetXLogBuffer + * (which itself calls the two methods we need) to get the pointer + * and zero most of the page. Then we just zero the page header. + */ + currpos = GetXLogBuffer(CurrPos); + MemSet(currpos, 0, SizeOfXLogShortPHD); + + CurrPos += XLOG_BLCKSZ; + } + } + else + { + /* Align the end position, so that the next record starts aligned */ + CurrPos = MAXALIGN64(CurrPos); + } + + if (CurrPos != EndPos) + elog(PANIC, "space reserved for WAL record does not match what was written"); +} + +/* + * Acquire a WAL insertion lock, for inserting to WAL. + */ +static void +WALInsertLockAcquire(void) +{ + bool immed; + + /* + * It doesn't matter which of the WAL insertion locks we acquire, so try + * the one we used last time. If the system isn't particularly busy, it's + * a good bet that it's still available, and it's good to have some + * affinity to a particular lock so that you don't unnecessarily bounce + * cache lines between processes when there's no contention. + * + * If this is the first time through in this backend, pick a lock + * (semi-)randomly. This allows the locks to be used evenly if you have a + * lot of very short connections. + */ + static int lockToTry = -1; + + if (lockToTry == -1) + lockToTry = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS; + MyLockNo = lockToTry; + + /* + * The insertingAt value is initially set to 0, as we don't know our + * insert location yet. + */ + immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE); + if (!immed) + { + /* + * If we couldn't get the lock immediately, try another lock next + * time. On a system with more insertion locks than concurrent + * inserters, this causes all the inserters to eventually migrate to a + * lock that no-one else is using. On a system with more inserters + * than locks, it still helps to distribute the inserters evenly + * across the locks. 
+ */ + lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS; + } +} + +/* + * Acquire all WAL insertion locks, to prevent other backends from inserting + * to WAL. + */ +static void +WALInsertLockAcquireExclusive(void) +{ + int i; + + /* + * When holding all the locks, all but the last lock's insertingAt + * indicator is set to 0xFFFFFFFFFFFFFFFF, which is higher than any real + * XLogRecPtr value, to make sure that no-one blocks waiting on those. + */ + for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++) + { + LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE); + LWLockUpdateVar(&WALInsertLocks[i].l.lock, + &WALInsertLocks[i].l.insertingAt, + PG_UINT64_MAX); + } + /* Variable value reset to 0 at release */ + LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE); + + holdingAllLocks = true; +} + +/* + * Release our insertion lock (or locks, if we're holding them all). + * + * NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the + * next time the lock is acquired. + */ +static void +WALInsertLockRelease(void) +{ + if (holdingAllLocks) + { + int i; + + for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++) + LWLockReleaseClearVar(&WALInsertLocks[i].l.lock, + &WALInsertLocks[i].l.insertingAt, + 0); + + holdingAllLocks = false; + } + else + { + LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock, + &WALInsertLocks[MyLockNo].l.insertingAt, + 0); + } +} + +/* + * Update our insertingAt value, to let others know that we've finished + * inserting up to that point. + */ +static void +WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt) +{ + if (holdingAllLocks) + { + /* + * We use the last lock to mark our actual position, see comments in + * WALInsertLockAcquireExclusive. + */ + LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock, + &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt, + insertingAt); + } + else + LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock, + &WALInsertLocks[MyLockNo].l.insertingAt, + insertingAt); +} + +/* + * Wait for any WAL insertions < upto to finish. + * + * Returns the location of the oldest insertion that is still in-progress. + * Any WAL prior to that point has been fully copied into WAL buffers, and + * can be flushed out to disk. Because this waits for any insertions older + * than 'upto' to finish, the return value is always >= 'upto'. + * + * Note: When you are about to write out WAL, you must call this function + * *before* acquiring WALWriteLock, to avoid deadlocks. This function might + * need to wait for an insertion to finish (or at least advance to next + * uninitialized page), and the inserter might need to evict an old WAL buffer + * to make room for a new one, which in turn requires WALWriteLock. + */ +static XLogRecPtr +WaitXLogInsertionsToFinish(XLogRecPtr upto) +{ + uint64 bytepos; + XLogRecPtr reservedUpto; + XLogRecPtr finishedUpto; + XLogCtlInsert *Insert = &XLogCtl->Insert; + int i; + + if (MyProc == NULL) + elog(PANIC, "cannot wait without a PGPROC structure"); + + /* Read the current insert position */ + SpinLockAcquire(&Insert->insertpos_lck); + bytepos = Insert->CurrBytePos; + SpinLockRelease(&Insert->insertpos_lck); + reservedUpto = XLogBytePosToEndRecPtr(bytepos); + + /* + * No-one should request to flush a piece of WAL that hasn't even been + * reserved yet. However, it can happen if there is a block with a bogus + * LSN on disk, for example. XLogFlush checks for that situation and + * complains, but only after the flush. 
Here we just assume that to mean + * that all WAL that has been reserved needs to be finished. In this + * corner-case, the return value can be smaller than 'upto' argument. + */ + if (upto > reservedUpto) + { + ereport(LOG, + (errmsg("request to flush past end of generated WAL; request %X/%X, current position %X/%X", + LSN_FORMAT_ARGS(upto), LSN_FORMAT_ARGS(reservedUpto)))); + upto = reservedUpto; + } + + /* + * Loop through all the locks, sleeping on any in-progress insert older + * than 'upto'. + * + * finishedUpto is our return value, indicating the point upto which all + * the WAL insertions have been finished. Initialize it to the head of + * reserved WAL, and as we iterate through the insertion locks, back it + * out for any insertion that's still in progress. + */ + finishedUpto = reservedUpto; + for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++) + { + XLogRecPtr insertingat = InvalidXLogRecPtr; + + do + { + /* + * See if this insertion is in progress. LWLockWaitForVar will + * wait for the lock to be released, or for the 'value' to be set + * by a LWLockUpdateVar call. When a lock is initially acquired, + * its value is 0 (InvalidXLogRecPtr), which means that we don't + * know where it's inserting yet. We will have to wait for it. If + * it's a small insertion, the record will most likely fit on the + * same page and the inserter will release the lock without ever + * calling LWLockUpdateVar. But if it has to sleep, it will + * advertise the insertion point with LWLockUpdateVar before + * sleeping. + */ + if (LWLockWaitForVar(&WALInsertLocks[i].l.lock, + &WALInsertLocks[i].l.insertingAt, + insertingat, &insertingat)) + { + /* the lock was free, so no insertion in progress */ + insertingat = InvalidXLogRecPtr; + break; + } + + /* + * This insertion is still in progress. Have to wait, unless the + * inserter has proceeded past 'upto'. + */ + } while (insertingat < upto); + + if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto) + finishedUpto = insertingat; + } + return finishedUpto; +} + +/* + * Get a pointer to the right location in the WAL buffer containing the + * given XLogRecPtr. + * + * If the page is not initialized yet, it is initialized. That might require + * evicting an old dirty buffer from the buffer cache, which means I/O. + * + * The caller must ensure that the page containing the requested location + * isn't evicted yet, and won't be evicted. The way to ensure that is to + * hold onto a WAL insertion lock with the insertingAt position set to + * something <= ptr. GetXLogBuffer() will update insertingAt if it needs + * to evict an old page from the buffer. (This means that once you call + * GetXLogBuffer() with a given 'ptr', you must not access anything before + * that point anymore, and must not call GetXLogBuffer() with an older 'ptr' + * later, because older buffers might be recycled already) + */ +static char * +GetXLogBuffer(XLogRecPtr ptr) +{ + int idx; + XLogRecPtr endptr; + static uint64 cachedPage = 0; + static char *cachedPos = NULL; + XLogRecPtr expectedEndPtr; + + /* + * Fast path for the common case that we need to access again the same + * page as last time. + */ + if (ptr / XLOG_BLCKSZ == cachedPage) + { + Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC); + Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ)); + return cachedPos + ptr % XLOG_BLCKSZ; + } + + /* + * The XLog buffer cache is organized so that a page is always loaded to a + * particular buffer. 
That way we can easily calculate the buffer a given + * page must be loaded into, from the XLogRecPtr alone. + */ + idx = XLogRecPtrToBufIdx(ptr); + + /* + * See what page is loaded in the buffer at the moment. It could be the + * page we're looking for, or something older. It can't be anything newer + * - that would imply the page we're looking for has already been written + * out to disk and evicted, and the caller is responsible for making sure + * that doesn't happen. + * + * However, we don't hold a lock while we read the value. If someone has + * just initialized the page, it's possible that we get a "torn read" of + * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In + * that case we will see a bogus value. That's ok, we'll grab the mapping + * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than + * the page we're looking for. But it means that when we do this unlocked + * read, we might see a value that appears to be ahead of the page we're + * looking for. Don't PANIC on that, until we've verified the value while + * holding the lock. + */ + expectedEndPtr = ptr; + expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ; + + endptr = XLogCtl->xlblocks[idx]; + if (expectedEndPtr != endptr) + { + XLogRecPtr initializedUpto; + + /* + * Before calling AdvanceXLInsertBuffer(), which can block, let others + * know how far we're finished with inserting the record. + * + * NB: If 'ptr' points to just after the page header, advertise a + * position at the beginning of the page rather than 'ptr' itself. If + * there are no other insertions running, someone might try to flush + * up to our advertised location. If we advertised a position after + * the page header, someone might try to flush the page header, even + * though page might actually not be initialized yet. As the first + * inserter on the page, we are effectively responsible for making + * sure that it's initialized, before we let insertingAt to move past + * the page header. + */ + if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD && + XLogSegmentOffset(ptr, wal_segment_size) > XLOG_BLCKSZ) + initializedUpto = ptr - SizeOfXLogShortPHD; + else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD && + XLogSegmentOffset(ptr, wal_segment_size) < XLOG_BLCKSZ) + initializedUpto = ptr - SizeOfXLogLongPHD; + else + initializedUpto = ptr; + + WALInsertLockUpdateInsertingAt(initializedUpto); + + AdvanceXLInsertBuffer(ptr, false); + endptr = XLogCtl->xlblocks[idx]; + + if (expectedEndPtr != endptr) + elog(PANIC, "could not find WAL buffer for %X/%X", + LSN_FORMAT_ARGS(ptr)); + } + else + { + /* + * Make sure the initialization of the page is visible to us, and + * won't arrive later to overwrite the WAL data we write on the page. + */ + pg_memory_barrier(); + } + + /* + * Found the buffer holding this page. Return a pointer to the right + * offset within the page. + */ + cachedPage = ptr / XLOG_BLCKSZ; + cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ; + + Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC); + Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ)); + + return cachedPos + ptr % XLOG_BLCKSZ; +} + +/* + * Converts a "usable byte position" to XLogRecPtr. A usable byte position + * is the position starting from the beginning of WAL, excluding all WAL + * page headers. 
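+ * As a worked example, assuming 8 kB pages with 24-byte short and 40-byte
+ * long page headers (the usual sizes on 64-bit platforms): usable byte
+ * position 10000 does not fit on the segment's first page, which holds
+ * 8192 - 40 = 8152 usable bytes, so it maps to segment offset
+ * 8192 + (10000 - 8152) + 24 = 10064, i.e. the original position plus
+ * one long and one short page header.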
+ */ +static XLogRecPtr +XLogBytePosToRecPtr(uint64 bytepos) +{ + uint64 fullsegs; + uint64 fullpages; + uint64 bytesleft; + uint32 seg_offset; + XLogRecPtr result; + + fullsegs = bytepos / UsableBytesInSegment; + bytesleft = bytepos % UsableBytesInSegment; + + if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD) + { + /* fits on first page of segment */ + seg_offset = bytesleft + SizeOfXLogLongPHD; + } + else + { + /* account for the first page on segment with long header */ + seg_offset = XLOG_BLCKSZ; + bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD; + + fullpages = bytesleft / UsableBytesInPage; + bytesleft = bytesleft % UsableBytesInPage; + + seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD; + } + + XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result); + + return result; +} + +/* + * Like XLogBytePosToRecPtr, but if the position is at a page boundary, + * returns a pointer to the beginning of the page (ie. before page header), + * not to where the first xlog record on that page would go to. This is used + * when converting a pointer to the end of a record. + */ +static XLogRecPtr +XLogBytePosToEndRecPtr(uint64 bytepos) +{ + uint64 fullsegs; + uint64 fullpages; + uint64 bytesleft; + uint32 seg_offset; + XLogRecPtr result; + + fullsegs = bytepos / UsableBytesInSegment; + bytesleft = bytepos % UsableBytesInSegment; + + if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD) + { + /* fits on first page of segment */ + if (bytesleft == 0) + seg_offset = 0; + else + seg_offset = bytesleft + SizeOfXLogLongPHD; + } + else + { + /* account for the first page on segment with long header */ + seg_offset = XLOG_BLCKSZ; + bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD; + + fullpages = bytesleft / UsableBytesInPage; + bytesleft = bytesleft % UsableBytesInPage; + + if (bytesleft == 0) + seg_offset += fullpages * XLOG_BLCKSZ + bytesleft; + else + seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD; + } + + XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result); + + return result; +} + +/* + * Convert an XLogRecPtr to a "usable byte position". + */ +static uint64 +XLogRecPtrToBytePos(XLogRecPtr ptr) +{ + uint64 fullsegs; + uint32 fullpages; + uint32 offset; + uint64 result; + + XLByteToSeg(ptr, fullsegs, wal_segment_size); + + fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ; + offset = ptr % XLOG_BLCKSZ; + + if (fullpages == 0) + { + result = fullsegs * UsableBytesInSegment; + if (offset > 0) + { + Assert(offset >= SizeOfXLogLongPHD); + result += offset - SizeOfXLogLongPHD; + } + } + else + { + result = fullsegs * UsableBytesInSegment + + (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */ + (fullpages - 1) * UsableBytesInPage; /* full pages */ + if (offset > 0) + { + Assert(offset >= SizeOfXLogShortPHD); + result += offset - SizeOfXLogShortPHD; + } + } + + return result; +} + +/* + * Initialize XLOG buffers, writing out old buffers if they still contain + * unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is + * true, initialize as many pages as we can without having to write out + * unwritten data. Any new pages are initialized to zeros, with pages headers + * initialized properly. 
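+ * The slot a given page occupies follows a simple ring mapping (see
+ * XLogRecPtrToBufIdx): page address divided by XLOG_BLCKSZ, modulo the
+ * number of WAL buffers. For instance, with 8 kB pages and a hypothetical
+ * wal_buffers = 512, the page starting at 16 MB + 8 kB maps to slot
+ * 2049 % 512 = 1.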
+ */ +static void +AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic) +{ + XLogCtlInsert *Insert = &XLogCtl->Insert; + int nextidx; + XLogRecPtr OldPageRqstPtr; + XLogwrtRqst WriteRqst; + XLogRecPtr NewPageEndPtr = InvalidXLogRecPtr; + XLogRecPtr NewPageBeginPtr; + XLogPageHeader NewPage; + int npages = 0; + + LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE); + + /* + * Now that we have the lock, check if someone initialized the page + * already. + */ + while (upto >= XLogCtl->InitializedUpTo || opportunistic) + { + nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo); + + /* + * Get ending-offset of the buffer page we need to replace (this may + * be zero if the buffer hasn't been used yet). Fall through if it's + * already written out. + */ + OldPageRqstPtr = XLogCtl->xlblocks[nextidx]; + if (LogwrtResult.Write < OldPageRqstPtr) + { + /* + * Nope, got work to do. If we just want to pre-initialize as much + * as we can without flushing, give up now. + */ + if (opportunistic) + break; + + /* Before waiting, get info_lck and update LogwrtResult */ + SpinLockAcquire(&XLogCtl->info_lck); + if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr) + XLogCtl->LogwrtRqst.Write = OldPageRqstPtr; + LogwrtResult = XLogCtl->LogwrtResult; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * Now that we have an up-to-date LogwrtResult value, see if we + * still need to write it or if someone else already did. + */ + if (LogwrtResult.Write < OldPageRqstPtr) + { + /* + * Must acquire write lock. Release WALBufMappingLock first, + * to make sure that all insertions that we need to wait for + * can finish (up to this same position). Otherwise we risk + * deadlock. + */ + LWLockRelease(WALBufMappingLock); + + WaitXLogInsertionsToFinish(OldPageRqstPtr); + + LWLockAcquire(WALWriteLock, LW_EXCLUSIVE); + + LogwrtResult = XLogCtl->LogwrtResult; + if (LogwrtResult.Write >= OldPageRqstPtr) + { + /* OK, someone wrote it already */ + LWLockRelease(WALWriteLock); + } + else + { + /* Have to write it ourselves */ + TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START(); + WriteRqst.Write = OldPageRqstPtr; + WriteRqst.Flush = 0; + XLogWrite(WriteRqst, false); + LWLockRelease(WALWriteLock); + WalStats.m_wal_buffers_full++; + TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE(); + } + /* Re-acquire WALBufMappingLock and retry */ + LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE); + continue; + } + } + + /* + * Now the next buffer slot is free and we can set it up to be the + * next output page. + */ + NewPageBeginPtr = XLogCtl->InitializedUpTo; + NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ; + + Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx); + + NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ); + + /* + * Be sure to re-zero the buffer so that bytes beyond what we've + * written will look like zeroes and not valid XLOG records... + */ + MemSet((char *) NewPage, 0, XLOG_BLCKSZ); + + /* + * Fill the new page's header + */ + NewPage->xlp_magic = XLOG_PAGE_MAGIC; + + /* NewPage->xlp_info = 0; */ /* done by memset */ + NewPage->xlp_tli = ThisTimeLineID; + NewPage->xlp_pageaddr = NewPageBeginPtr; + + /* NewPage->xlp_rem_len = 0; */ /* done by memset */ + + /* + * If online backup is not in progress, mark the header to indicate + * that WAL records beginning in this page have removable backup + * blocks. This allows the WAL archiver to know whether it is safe to + * compress archived WAL data by transforming full-block records into + * the non-full-block format. 
It is sufficient to record this at the + * page level because we force a page switch (in fact a segment + * switch) when starting a backup, so the flag will be off before any + * records can be written during the backup. At the end of a backup, + * the last page will be marked as all unsafe when perhaps only part + * is unsafe, but at worst the archiver would miss the opportunity to + * compress a few records. + */ + if (!Insert->forcePageWrites) + NewPage->xlp_info |= XLP_BKP_REMOVABLE; + + /* + * If a record was found to be broken at the end of recovery, and + * we're going to write on the page where its first contrecord was + * lost, set the XLP_FIRST_IS_OVERWRITE_CONTRECORD flag on the page + * header. See CreateOverwriteContrecordRecord(). + */ + if (missingContrecPtr == NewPageBeginPtr) + { + NewPage->xlp_info |= XLP_FIRST_IS_OVERWRITE_CONTRECORD; + missingContrecPtr = InvalidXLogRecPtr; + } + + /* + * If first page of an XLOG segment file, make it a long header. + */ + if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0) + { + XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage; + + NewLongPage->xlp_sysid = ControlFile->system_identifier; + NewLongPage->xlp_seg_size = wal_segment_size; + NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ; + NewPage->xlp_info |= XLP_LONG_HEADER; + } + + /* + * Make sure the initialization of the page becomes visible to others + * before the xlblocks update. GetXLogBuffer() reads xlblocks without + * holding a lock. + */ + pg_write_barrier(); + + *((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr; + + XLogCtl->InitializedUpTo = NewPageEndPtr; + + npages++; + } + LWLockRelease(WALBufMappingLock); + +#ifdef WAL_DEBUG + if (XLOG_DEBUG && npages > 0) + { + elog(DEBUG1, "initialized %d pages, up to %X/%X", + npages, LSN_FORMAT_ARGS(NewPageEndPtr)); + } +#endif +} + +/* + * Calculate CheckPointSegments based on max_wal_size_mb and + * checkpoint_completion_target. + */ +static void +CalculateCheckpointSegments(void) +{ + double target; + + /*------- + * Calculate the distance at which to trigger a checkpoint, to avoid + * exceeding max_wal_size_mb. This is based on two assumptions: + * + * a) we keep WAL for only one checkpoint cycle (prior to PG11 we kept + * WAL for two checkpoint cycles to allow us to recover from the + * secondary checkpoint if the first checkpoint failed, though we + * only did this on the primary anyway, not on standby. Keeping just + * one checkpoint simplifies processing and reduces disk space in + * many smaller databases.) + * b) during checkpoint, we consume checkpoint_completion_target * + * number of segments consumed between checkpoints. + *------- + */ + target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) / + (1.0 + CheckPointCompletionTarget); + + /* round down */ + CheckPointSegments = (int) target; + + if (CheckPointSegments < 1) + CheckPointSegments = 1; +} + +void +assign_max_wal_size(int newval, void *extra) +{ + max_wal_size_mb = newval; + CalculateCheckpointSegments(); +} + +void +assign_checkpoint_completion_target(double newval, void *extra) +{ + CheckPointCompletionTarget = newval; + CalculateCheckpointSegments(); +} + +/* + * At a checkpoint, how many WAL segments to recycle as preallocated future + * XLOG segments? Returns the highest segment that should be preallocated. 
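+ * For illustration, with 16 MB segments, min_wal_size = 80 MB,
+ * max_wal_size = 1 GB, checkpoint_completion_target = 0.9 and an
+ * estimated checkpoint distance of 256 MB, the bounds below are
+ * [redo segment + 4, redo segment + 63], and with the redo pointer at a
+ * segment boundary the recycle target is
+ * ceil(1.9 * 256 MB * 1.10 / 16 MB) = 34 segments past it.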
+ */ +static XLogSegNo +XLOGfileslop(XLogRecPtr lastredoptr) +{ + XLogSegNo minSegNo; + XLogSegNo maxSegNo; + double distance; + XLogSegNo recycleSegNo; + + /* + * Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb + * correspond to. Always recycle enough segments to meet the minimum, and + * remove enough segments to stay below the maximum. + */ + minSegNo = lastredoptr / wal_segment_size + + ConvertToXSegs(min_wal_size_mb, wal_segment_size) - 1; + maxSegNo = lastredoptr / wal_segment_size + + ConvertToXSegs(max_wal_size_mb, wal_segment_size) - 1; + + /* + * Between those limits, recycle enough segments to get us through to the + * estimated end of next checkpoint. + * + * To estimate where the next checkpoint will finish, assume that the + * system runs steadily consuming CheckPointDistanceEstimate bytes between + * every checkpoint. + */ + distance = (1.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate; + /* add 10% for good measure. */ + distance *= 1.10; + + recycleSegNo = (XLogSegNo) ceil(((double) lastredoptr + distance) / + wal_segment_size); + + if (recycleSegNo < minSegNo) + recycleSegNo = minSegNo; + if (recycleSegNo > maxSegNo) + recycleSegNo = maxSegNo; + + return recycleSegNo; +} + +/* + * Check whether we've consumed enough xlog space that a checkpoint is needed. + * + * new_segno indicates a log file that has just been filled up (or read + * during recovery). We measure the distance from RedoRecPtr to new_segno + * and see if that exceeds CheckPointSegments. + * + * Note: it is caller's responsibility that RedoRecPtr is up-to-date. + */ +static bool +XLogCheckpointNeeded(XLogSegNo new_segno) +{ + XLogSegNo old_segno; + + XLByteToSeg(RedoRecPtr, old_segno, wal_segment_size); + + if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1)) + return true; + return false; +} + +/* + * Write and/or fsync the log at least as far as WriteRqst indicates. + * + * If flexible == true, we don't have to write as far as WriteRqst, but + * may stop at any convenient boundary (such as a cache or logfile boundary). + * This option allows us to avoid uselessly issuing multiple writes when a + * single one would do. + * + * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst) + * must be called before grabbing the lock, to make sure the data is ready to + * write. + */ +static void +XLogWrite(XLogwrtRqst WriteRqst, bool flexible) +{ + bool ispartialpage; + bool last_iteration; + bool finishing_seg; + bool use_existent; + int curridx; + int npages; + int startidx; + uint32 startoffset; + + /* We should always be inside a critical section here */ + Assert(CritSectionCount > 0); + + /* + * Update local LogwrtResult (caller probably did this already, but...) + */ + LogwrtResult = XLogCtl->LogwrtResult; + + /* + * Since successive pages in the xlog cache are consecutively allocated, + * we can usually gather multiple pages together and issue just one + * write() call. npages is the number of pages we have determined can be + * written together; startidx is the cache block index of the first one, + * and startoffset is the file offset at which it should go. The latter + * two variables are only valid when npages > 0, but we must initialize + * all of them to keep the compiler quiet. + */ + npages = 0; + startidx = 0; + startoffset = 0; + + /* + * Within the loop, curridx is the cache block index of the page to + * consider writing. Begin at the buffer containing the next unwritten + * page, or last partially written page. 
+ */ + curridx = XLogRecPtrToBufIdx(LogwrtResult.Write); + + while (LogwrtResult.Write < WriteRqst.Write) + { + /* + * Make sure we're not ahead of the insert process. This could happen + * if we're passed a bogus WriteRqst.Write that is past the end of the + * last page that's been initialized by AdvanceXLInsertBuffer. + */ + XLogRecPtr EndPtr = XLogCtl->xlblocks[curridx]; + + if (LogwrtResult.Write >= EndPtr) + elog(PANIC, "xlog write request %X/%X is past end of log %X/%X", + LSN_FORMAT_ARGS(LogwrtResult.Write), + LSN_FORMAT_ARGS(EndPtr)); + + /* Advance LogwrtResult.Write to end of current buffer page */ + LogwrtResult.Write = EndPtr; + ispartialpage = WriteRqst.Write < LogwrtResult.Write; + + if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo, + wal_segment_size)) + { + /* + * Switch to new logfile segment. We cannot have any pending + * pages here (since we dump what we have at segment end). + */ + Assert(npages == 0); + if (openLogFile >= 0) + XLogFileClose(); + XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo, + wal_segment_size); + + /* create/use new log file */ + use_existent = true; + openLogFile = XLogFileInit(openLogSegNo, &use_existent, true); + ReserveExternalFD(); + } + + /* Make sure we have the current logfile open */ + if (openLogFile < 0) + { + XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo, + wal_segment_size); + openLogFile = XLogFileOpen(openLogSegNo); + ReserveExternalFD(); + } + + /* Add current page to the set of pending pages-to-dump */ + if (npages == 0) + { + /* first of group */ + startidx = curridx; + startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ, + wal_segment_size); + } + npages++; + + /* + * Dump the set if this will be the last loop iteration, or if we are + * at the last page of the cache area (since the next page won't be + * contiguous in memory), or if we are at the end of the logfile + * segment. + */ + last_iteration = WriteRqst.Write <= LogwrtResult.Write; + + finishing_seg = !ispartialpage && + (startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size; + + if (last_iteration || + curridx == XLogCtl->XLogCacheBlck || + finishing_seg) + { + char *from; + Size nbytes; + Size nleft; + int written; + instr_time start; + + /* OK to write the page(s) */ + from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ; + nbytes = npages * (Size) XLOG_BLCKSZ; + nleft = nbytes; + do + { + errno = 0; + + /* Measure I/O timing to write WAL data */ + if (track_wal_io_timing) + INSTR_TIME_SET_CURRENT(start); + + pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE); + written = pg_pwrite(openLogFile, from, nleft, startoffset); + pgstat_report_wait_end(); + + /* + * Increment the I/O timing and the number of times WAL data + * were written out to disk. 
+ */ + if (track_wal_io_timing) + { + instr_time duration; + + INSTR_TIME_SET_CURRENT(duration); + INSTR_TIME_SUBTRACT(duration, start); + WalStats.m_wal_write_time += INSTR_TIME_GET_MICROSEC(duration); + } + + WalStats.m_wal_write++; + + if (written <= 0) + { + char xlogfname[MAXFNAMELEN]; + int save_errno; + + if (errno == EINTR) + continue; + + save_errno = errno; + XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo, + wal_segment_size); + errno = save_errno; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write to log file %s " + "at offset %u, length %zu: %m", + xlogfname, startoffset, nleft))); + } + nleft -= written; + from += written; + startoffset += written; + } while (nleft > 0); + + npages = 0; + + /* + * If we just wrote the whole last page of a logfile segment, + * fsync the segment immediately. This avoids having to go back + * and re-open prior segments when an fsync request comes along + * later. Doing it here ensures that one and only one backend will + * perform this fsync. + * + * This is also the right place to notify the Archiver that the + * segment is ready to copy to archival storage, and to update the + * timer for archive_timeout, and to signal for a checkpoint if + * too many logfile segments have been used since the last + * checkpoint. + */ + if (finishing_seg) + { + issue_xlog_fsync(openLogFile, openLogSegNo); + + /* signal that we need to wakeup walsenders later */ + WalSndWakeupRequest(); + + LogwrtResult.Flush = LogwrtResult.Write; /* end of page */ + + if (XLogArchivingActive()) + XLogArchiveNotifySeg(openLogSegNo); + + XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL); + XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush; + + /* + * Request a checkpoint if we've consumed too much xlog since + * the last one. For speed, we first check using the local + * copy of RedoRecPtr, which might be out of date; if it looks + * like a checkpoint is needed, forcibly update RedoRecPtr and + * recheck. + */ + if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo)) + { + (void) GetRedoRecPtr(); + if (XLogCheckpointNeeded(openLogSegNo)) + RequestCheckpoint(CHECKPOINT_CAUSE_XLOG); + } + } + } + + if (ispartialpage) + { + /* Only asked to write a partial page */ + LogwrtResult.Write = WriteRqst.Write; + break; + } + curridx = NextBufIdx(curridx); + + /* If flexible, break out of loop as soon as we wrote something */ + if (flexible && npages == 0) + break; + } + + Assert(npages == 0); + + /* + * If asked to flush, do so + */ + if (LogwrtResult.Flush < WriteRqst.Flush && + LogwrtResult.Flush < LogwrtResult.Write) + + { + /* + * Could get here without iterating above loop, in which case we might + * have no open file or the wrong one. However, we do not need to + * fsync more than one file. + */ + if (sync_method != SYNC_METHOD_OPEN && + sync_method != SYNC_METHOD_OPEN_DSYNC) + { + if (openLogFile >= 0 && + !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo, + wal_segment_size)) + XLogFileClose(); + if (openLogFile < 0) + { + XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo, + wal_segment_size); + openLogFile = XLogFileOpen(openLogSegNo); + ReserveExternalFD(); + } + + issue_xlog_fsync(openLogFile, openLogSegNo); + } + + /* signal that we need to wakeup walsenders later */ + WalSndWakeupRequest(); + + LogwrtResult.Flush = LogwrtResult.Write; + } + + /* + * Update shared-memory status + * + * We make sure that the shared 'request' values do not fall behind the + * 'result' values. 
This is not absolutely essential, but it saves some + * code in a couple of places. + */ + { + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->LogwrtResult = LogwrtResult; + if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write) + XLogCtl->LogwrtRqst.Write = LogwrtResult.Write; + if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush) + XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush; + SpinLockRelease(&XLogCtl->info_lck); + } +} + +/* + * Record the LSN for an asynchronous transaction commit/abort + * and nudge the WALWriter if there is work for it to do. + * (This should not be called for synchronous commits.) + */ +void +XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN) +{ + XLogRecPtr WriteRqstPtr = asyncXactLSN; + bool sleeping; + + SpinLockAcquire(&XLogCtl->info_lck); + LogwrtResult = XLogCtl->LogwrtResult; + sleeping = XLogCtl->WalWriterSleeping; + if (XLogCtl->asyncXactLSN < asyncXactLSN) + XLogCtl->asyncXactLSN = asyncXactLSN; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * If the WALWriter is sleeping, we should kick it to make it come out of + * low-power mode. Otherwise, determine whether there's a full page of + * WAL available to write. + */ + if (!sleeping) + { + /* back off to last completed page boundary */ + WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ; + + /* if we have already flushed that far, we're done */ + if (WriteRqstPtr <= LogwrtResult.Flush) + return; + } + + /* + * Nudge the WALWriter: it has a full page of WAL to write, or we want it + * to come out of low-power mode so that this async commit will reach disk + * within the expected amount of time. + */ + if (ProcGlobal->walwriterLatch) + SetLatch(ProcGlobal->walwriterLatch); +} + +/* + * Record the LSN up to which we can remove WAL because it's not required by + * any replication slot. + */ +void +XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn) +{ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->replicationSlotMinLSN = lsn; + SpinLockRelease(&XLogCtl->info_lck); +} + + +/* + * Return the oldest LSN we must retain to satisfy the needs of some + * replication slot. + */ +static XLogRecPtr +XLogGetReplicationSlotMinimumLSN(void) +{ + XLogRecPtr retval; + + SpinLockAcquire(&XLogCtl->info_lck); + retval = XLogCtl->replicationSlotMinLSN; + SpinLockRelease(&XLogCtl->info_lck); + + return retval; +} + +/* + * Advance minRecoveryPoint in control file. + * + * If we crash during recovery, we must reach this point again before the + * database is consistent. + * + * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint + * is only updated if it's not already greater than or equal to 'lsn'. + */ +static void +UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) +{ + /* Quick check using our local copy of the variable */ + if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint)) + return; + + /* + * An invalid minRecoveryPoint means that we need to recover all the WAL, + * i.e., we're doing crash recovery. We never modify the control file's + * value in that case, so we can short-circuit future checks here too. The + * local values of minRecoveryPoint and minRecoveryPointTLI should not be + * updated until crash recovery finishes. We only do this for the startup + * process as it should not update its own reference of minRecoveryPoint + * until it has finished crash recovery to make sure that all WAL + * available is replayed in this case. This also saves from extra locks + * taken on the control file from the startup process. 
+ */ + if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery) + { + updateMinRecoveryPoint = false; + return; + } + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + + /* update local copy */ + minRecoveryPoint = ControlFile->minRecoveryPoint; + minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + + if (XLogRecPtrIsInvalid(minRecoveryPoint)) + updateMinRecoveryPoint = false; + else if (force || minRecoveryPoint < lsn) + { + XLogRecPtr newMinRecoveryPoint; + TimeLineID newMinRecoveryPointTLI; + + /* + * To avoid having to update the control file too often, we update it + * all the way to the last record being replayed, even though 'lsn' + * would suffice for correctness. This also allows the 'force' case + * to not need a valid 'lsn' value. + * + * Another important reason for doing it this way is that the passed + * 'lsn' value could be bogus, i.e., past the end of available WAL, if + * the caller got it from a corrupted heap page. Accepting such a + * value as the min recovery point would prevent us from coming up at + * all. Instead, we just log a warning and continue with recovery. + * (See also the comments about corrupt LSNs in XLogFlush.) + */ + SpinLockAcquire(&XLogCtl->info_lck); + newMinRecoveryPoint = XLogCtl->replayEndRecPtr; + newMinRecoveryPointTLI = XLogCtl->replayEndTLI; + SpinLockRelease(&XLogCtl->info_lck); + + if (!force && newMinRecoveryPoint < lsn) + elog(WARNING, + "xlog min recovery request %X/%X is past current point %X/%X", + LSN_FORMAT_ARGS(lsn), LSN_FORMAT_ARGS(newMinRecoveryPoint)); + + /* update control file */ + if (ControlFile->minRecoveryPoint < newMinRecoveryPoint) + { + ControlFile->minRecoveryPoint = newMinRecoveryPoint; + ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI; + UpdateControlFile(); + minRecoveryPoint = newMinRecoveryPoint; + minRecoveryPointTLI = newMinRecoveryPointTLI; + + ereport(DEBUG2, + (errmsg_internal("updated min recovery point to %X/%X on timeline %u", + LSN_FORMAT_ARGS(minRecoveryPoint), + newMinRecoveryPointTLI))); + } + } + LWLockRelease(ControlFileLock); +} + +/* + * Ensure that all XLOG data through the given position is flushed to disk. + * + * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not + * already held, and we try to avoid acquiring it if possible. + */ +void +XLogFlush(XLogRecPtr record) +{ + XLogRecPtr WriteRqstPtr; + XLogwrtRqst WriteRqst; + + /* + * During REDO, we are reading not writing WAL. Therefore, instead of + * trying to flush the WAL, we should update minRecoveryPoint instead. We + * test XLogInsertAllowed(), not InRecovery, because we need checkpointer + * to act this way too, and because when it tries to write the + * end-of-recovery checkpoint, it should indeed flush. + */ + if (!XLogInsertAllowed()) + { + UpdateMinRecoveryPoint(record, false); + return; + } + + /* Quick exit if already known flushed */ + if (record <= LogwrtResult.Flush) + return; + +#ifdef WAL_DEBUG + if (XLOG_DEBUG) + elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X", + LSN_FORMAT_ARGS(record), + LSN_FORMAT_ARGS(LogwrtResult.Write), + LSN_FORMAT_ARGS(LogwrtResult.Flush)); +#endif + + START_CRIT_SECTION(); + + /* + * Since fsync is usually a horribly expensive operation, we try to + * piggyback as much data as we can on each fsync: if we see any more data + * entered into the xlog buffer, we'll write and fsync that too, so that + * the final value of LogwrtResult.Flush is as large as possible. This + * gives us some chance of avoiding another fsync immediately after. 
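+ * For example, if other backends advance XLogCtl->LogwrtRqst.Write while
+ * we are waiting for WALWriteLock below, the loop picks up the larger
+ * request and a single fsync then covers their records as well as ours.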
+ */ + + /* initialize to given target; may increase below */ + WriteRqstPtr = record; + + /* + * Now wait until we get the write lock, or someone else does the flush + * for us. + */ + for (;;) + { + XLogRecPtr insertpos; + + /* read LogwrtResult and update local state */ + SpinLockAcquire(&XLogCtl->info_lck); + if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write) + WriteRqstPtr = XLogCtl->LogwrtRqst.Write; + LogwrtResult = XLogCtl->LogwrtResult; + SpinLockRelease(&XLogCtl->info_lck); + + /* done already? */ + if (record <= LogwrtResult.Flush) + break; + + /* + * Before actually performing the write, wait for all in-flight + * insertions to the pages we're about to write to finish. + */ + insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr); + + /* + * Try to get the write lock. If we can't get it immediately, wait + * until it's released, and recheck if we still need to do the flush + * or if the backend that held the lock did it for us already. This + * helps to maintain a good rate of group committing when the system + * is bottlenecked by the speed of fsyncing. + */ + if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE)) + { + /* + * The lock is now free, but we didn't acquire it yet. Before we + * do, loop back to check if someone else flushed the record for + * us already. + */ + continue; + } + + /* Got the lock; recheck whether request is satisfied */ + LogwrtResult = XLogCtl->LogwrtResult; + if (record <= LogwrtResult.Flush) + { + LWLockRelease(WALWriteLock); + break; + } + + /* + * Sleep before flush! By adding a delay here, we may give further + * backends the opportunity to join the backlog of group commit + * followers; this can significantly improve transaction throughput, + * at the risk of increasing transaction latency. + * + * We do not sleep if enableFsync is not turned on, nor if there are + * fewer than CommitSiblings other backends with active transactions. + */ + if (CommitDelay > 0 && enableFsync && + MinimumActiveBackends(CommitSiblings)) + { + pg_usleep(CommitDelay); + + /* + * Re-check how far we can now flush the WAL. It's generally not + * safe to call WaitXLogInsertionsToFinish while holding + * WALWriteLock, because an in-progress insertion might need to + * also grab WALWriteLock to make progress. But we know that all + * the insertions up to insertpos have already finished, because + * that's what the earlier WaitXLogInsertionsToFinish() returned. + * We're only calling it again to allow insertpos to be moved + * further forward, not to actually wait for anyone. + */ + insertpos = WaitXLogInsertionsToFinish(insertpos); + } + + /* try to write/flush later additions to XLOG as well */ + WriteRqst.Write = insertpos; + WriteRqst.Flush = insertpos; + + XLogWrite(WriteRqst, false); + + LWLockRelease(WALWriteLock); + /* done */ + break; + } + + END_CRIT_SECTION(); + + /* wake up walsenders now that we've released heavily contended locks */ + WalSndWakeupProcessRequests(); + + /* + * If we still haven't flushed to the request point then we have a + * problem; most likely, the requested flush point is past end of XLOG. + * This has been seen to occur when a disk page has a corrupted LSN. + * + * Formerly we treated this as a PANIC condition, but that hurts the + * system's robustness rather than helping it: we do not want to take down + * the whole system due to corruption on one data page. In particular, if + * the bad page is encountered again during recovery then we would be + * unable to restart the database at all! 
(This scenario actually + * happened in the field several times with 7.1 releases.) As of 8.4, bad + * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem; + * the only time we can reach here during recovery is while flushing the + * end-of-recovery checkpoint record, and we don't expect that to have a + * bad LSN. + * + * Note that for calls from xact.c, the ERROR will be promoted to PANIC + * since xact.c calls this routine inside a critical section. However, + * calls from bufmgr.c are not within critical sections and so we will not + * force a restart for a bad LSN on a data page. + */ + if (LogwrtResult.Flush < record) + elog(ERROR, + "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X", + LSN_FORMAT_ARGS(record), + LSN_FORMAT_ARGS(LogwrtResult.Flush)); +} + +/* + * Write & flush xlog, but without specifying exactly where to. + * + * We normally write only completed blocks; but if there is nothing to do on + * that basis, we check for unwritten async commits in the current incomplete + * block, and write through the latest one of those. Thus, if async commits + * are not being used, we will write complete blocks only. + * + * If, based on the above, there's anything to write we do so immediately. But + * to avoid calling fsync, fdatasync et. al. at a rate that'd impact + * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's + * more than wal_writer_flush_after unflushed blocks. + * + * We can guarantee that async commits reach disk after at most three + * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite + * to write "flexibly", meaning it can stop at the end of the buffer ring; + * this makes a difference only with very high load or long wal_writer_delay, + * but imposes one extra cycle for the worst case for async commits.) + * + * This routine is invoked periodically by the background walwriter process. + * + * Returns true if there was any work to do, even if we skipped flushing due + * to wal_writer_delay/wal_writer_flush_after. + */ +bool +XLogBackgroundFlush(void) +{ + XLogwrtRqst WriteRqst; + bool flexible = true; + static TimestampTz lastflush; + TimestampTz now; + int flushbytes; + + /* XLOG doesn't need flushing during recovery */ + if (RecoveryInProgress()) + return false; + + /* read LogwrtResult and update local state */ + SpinLockAcquire(&XLogCtl->info_lck); + LogwrtResult = XLogCtl->LogwrtResult; + WriteRqst = XLogCtl->LogwrtRqst; + SpinLockRelease(&XLogCtl->info_lck); + + /* back off to last completed page boundary */ + WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ; + + /* if we have already flushed that far, consider async commit records */ + if (WriteRqst.Write <= LogwrtResult.Flush) + { + SpinLockAcquire(&XLogCtl->info_lck); + WriteRqst.Write = XLogCtl->asyncXactLSN; + SpinLockRelease(&XLogCtl->info_lck); + flexible = false; /* ensure it all gets written */ + } + + /* + * If already known flushed, we're done. Just need to check if we are + * holding an open file handle to a logfile that's no longer in use, + * preventing the file from being deleted. + */ + if (WriteRqst.Write <= LogwrtResult.Flush) + { + if (openLogFile >= 0) + { + if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo, + wal_segment_size)) + { + XLogFileClose(); + } + } + return false; + } + + /* + * Determine how far to flush WAL, based on the wal_writer_delay and + * wal_writer_flush_after GUCs. 
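+ * For example, with the default wal_writer_delay of 200ms and
+ * wal_writer_flush_after of 1MB (128 blocks of 8kB), we flush as soon as
+ * either 200ms have passed since the last flush or at least 128
+ * completed blocks are unflushed; otherwise WriteRqst.Flush is left at 0
+ * and only the write is performed this cycle.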
+ */ + now = GetCurrentTimestamp(); + flushbytes = + WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ; + + if (WalWriterFlushAfter == 0 || lastflush == 0) + { + /* first call, or block based limits disabled */ + WriteRqst.Flush = WriteRqst.Write; + lastflush = now; + } + else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay)) + { + /* + * Flush the writes at least every WalWriterDelay ms. This is + * important to bound the amount of time it takes for an asynchronous + * commit to hit disk. + */ + WriteRqst.Flush = WriteRqst.Write; + lastflush = now; + } + else if (flushbytes >= WalWriterFlushAfter) + { + /* exceeded wal_writer_flush_after blocks, flush */ + WriteRqst.Flush = WriteRqst.Write; + lastflush = now; + } + else + { + /* no flushing, this time round */ + WriteRqst.Flush = 0; + } + +#ifdef WAL_DEBUG + if (XLOG_DEBUG) + elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X", + LSN_FORMAT_ARGS(WriteRqst.Write), + LSN_FORMAT_ARGS(WriteRqst.Flush), + LSN_FORMAT_ARGS(LogwrtResult.Write), + LSN_FORMAT_ARGS(LogwrtResult.Flush)); +#endif + + START_CRIT_SECTION(); + + /* now wait for any in-progress insertions to finish and get write lock */ + WaitXLogInsertionsToFinish(WriteRqst.Write); + LWLockAcquire(WALWriteLock, LW_EXCLUSIVE); + LogwrtResult = XLogCtl->LogwrtResult; + if (WriteRqst.Write > LogwrtResult.Write || + WriteRqst.Flush > LogwrtResult.Flush) + { + XLogWrite(WriteRqst, flexible); + } + LWLockRelease(WALWriteLock); + + END_CRIT_SECTION(); + + /* wake up walsenders now that we've released heavily contended locks */ + WalSndWakeupProcessRequests(); + + /* + * Great, done. To take some work off the critical path, try to initialize + * as many of the no-longer-needed WAL buffers for future use as we can. + */ + AdvanceXLInsertBuffer(InvalidXLogRecPtr, true); + + /* + * If we determined that we need to write data, but somebody else + * wrote/flushed already, it should be considered as being active, to + * avoid hibernating too early. + */ + return true; +} + +/* + * Test whether XLOG data has been flushed up to (at least) the given position. + * + * Returns true if a flush is still needed. (It may be that someone else + * is already in process of flushing that far, however.) + */ +bool +XLogNeedsFlush(XLogRecPtr record) +{ + /* + * During recovery, we don't flush WAL but update minRecoveryPoint + * instead. So "needs flush" is taken to mean whether minRecoveryPoint + * would need to be updated. + */ + if (RecoveryInProgress()) + { + /* + * An invalid minRecoveryPoint means that we need to recover all the + * WAL, i.e., we're doing crash recovery. We never modify the control + * file's value in that case, so we can short-circuit future checks + * here too. This triggers a quick exit path for the startup process, + * which cannot update its local copy of minRecoveryPoint as long as + * it has not replayed all WAL available when doing crash recovery. + */ + if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery) + updateMinRecoveryPoint = false; + + /* Quick exit if already known to be updated or cannot be updated */ + if (record <= minRecoveryPoint || !updateMinRecoveryPoint) + return false; + + /* + * Update local copy of minRecoveryPoint. But if the lock is busy, + * just return a conservative guess. 
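+ * (Answering "true" on a busy lock is the safe direction: the caller may
+ * do a little unnecessary work, whereas a false negative could skip a
+ * required minRecoveryPoint update.)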
+ */ + if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED)) + return true; + minRecoveryPoint = ControlFile->minRecoveryPoint; + minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + LWLockRelease(ControlFileLock); + + /* + * Check minRecoveryPoint for any other process than the startup + * process doing crash recovery, which should not update the control + * file value if crash recovery is still running. + */ + if (XLogRecPtrIsInvalid(minRecoveryPoint)) + updateMinRecoveryPoint = false; + + /* check again */ + if (record <= minRecoveryPoint || !updateMinRecoveryPoint) + return false; + else + return true; + } + + /* Quick exit if already known flushed */ + if (record <= LogwrtResult.Flush) + return false; + + /* read LogwrtResult and update local state */ + SpinLockAcquire(&XLogCtl->info_lck); + LogwrtResult = XLogCtl->LogwrtResult; + SpinLockRelease(&XLogCtl->info_lck); + + /* check again */ + if (record <= LogwrtResult.Flush) + return false; + + return true; +} + +/* + * Create a new XLOG file segment, or open a pre-existing one. + * + * logsegno: identify segment to be created/opened. + * + * *use_existent: if true, OK to use a pre-existing file (else, any + * pre-existing file will be deleted). On return, true if a pre-existing + * file was used. + * + * use_lock: if true, acquire ControlFileLock while moving file into + * place. This should be true except during bootstrap log creation. The + * caller must *not* hold the lock at call. + * + * Returns FD of opened file. + * + * Note: errors here are ERROR not PANIC because we might or might not be + * inside a critical section (eg, during checkpoint there is no reason to + * take down the system on failure). They will promote to PANIC if we are + * in a critical section. + */ +int +XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) +{ + char path[MAXPGPATH]; + char tmppath[MAXPGPATH]; + PGAlignedXLogBlock zbuffer; + XLogSegNo installed_segno; + XLogSegNo max_segno; + int fd; + int save_errno; + + XLogFilePath(path, ThisTimeLineID, logsegno, wal_segment_size); + + /* + * Try to use existent file (checkpoint maker may have created it already) + */ + if (*use_existent) + { + fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method)); + if (fd < 0) + { + if (errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + } + else + return fd; + } + + /* + * Initialize an empty (all zeroes) segment. NOTE: it is possible that + * another process is doing the same thing. If so, we will end up + * pre-creating an extra log segment. That seems OK, and better than + * holding the lock throughout this lengthy process. + */ + elog(DEBUG2, "creating and filling new WAL file"); + + snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid()); + + unlink(tmppath); + + /* do not use get_sync_bit() here --- want to fsync only at end of fill */ + fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", tmppath))); + + memset(zbuffer.data, 0, XLOG_BLCKSZ); + + pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE); + save_errno = 0; + if (wal_init_zero) + { + struct iovec iov[PG_IOV_MAX]; + int blocks; + + /* + * Zero-fill the file. With this setting, we do this the hard way to + * ensure that all the file space has really been allocated. 
On + * platforms that allow "holes" in files, just seeking to the end + * doesn't allocate intermediate space. This way, we know that we + * have all the space and (after the fsync below) that all the + * indirect blocks are down on disk. Therefore, fdatasync(2) or + * O_DSYNC will be sufficient to sync future writes to the log file. + */ + + /* Prepare to write out a lot of copies of our zero buffer at once. */ + for (int i = 0; i < lengthof(iov); ++i) + { + iov[i].iov_base = zbuffer.data; + iov[i].iov_len = XLOG_BLCKSZ; + } + + /* Loop, writing as many blocks as we can for each system call. */ + blocks = wal_segment_size / XLOG_BLCKSZ; + for (int i = 0; i < blocks;) + { + int iovcnt = Min(blocks - i, lengthof(iov)); + off_t offset = i * XLOG_BLCKSZ; + + if (pg_pwritev_with_retry(fd, iov, iovcnt, offset) < 0) + { + save_errno = errno; + break; + } + + i += iovcnt; + } + } + else + { + /* + * Otherwise, seeking to the end and writing a solitary byte is + * enough. + */ + errno = 0; + if (pg_pwrite(fd, zbuffer.data, 1, wal_segment_size - 1) != 1) + { + /* if write didn't set errno, assume no disk space */ + save_errno = errno ? errno : ENOSPC; + } + } + pgstat_report_wait_end(); + + if (save_errno) + { + /* + * If we fail to make the file, delete it to release disk space + */ + unlink(tmppath); + + close(fd); + + errno = save_errno; + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tmppath))); + } + + pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC); + if (pg_fsync(fd) != 0) + { + int save_errno = errno; + + close(fd); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", tmppath))); + } + pgstat_report_wait_end(); + + if (close(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", tmppath))); + + /* + * Now move the segment into place with its final name. + * + * If caller didn't want to use a pre-existing file, get rid of any + * pre-existing file. Otherwise, cope with possibility that someone else + * has created the file while we were filling ours: if so, use ours to + * pre-create a future log segment. + */ + installed_segno = logsegno; + + /* + * XXX: What should we use as max_segno? We used to use XLOGfileslop when + * that was a constant, but that was always a bit dubious: normally, at a + * checkpoint, XLOGfileslop was the offset from the checkpoint record, but + * here, it was the offset from the insert location. We can't do the + * normal XLOGfileslop calculation here because we don't have access to + * the prior checkpoint's redo location. So somewhat arbitrarily, just use + * CheckPointSegments. + */ + max_segno = logsegno + CheckPointSegments; + if (!InstallXLogFileSegment(&installed_segno, tmppath, + *use_existent, max_segno, + use_lock)) + { + /* + * No need for any more future segments, or InstallXLogFileSegment() + * failed to rename the file into place. If the rename failed, opening + * the file below will fail. + */ + unlink(tmppath); + } + + /* Set flag to tell caller there was no existent file */ + *use_existent = false; + + /* Now open original target segment (might not be file I just made) */ + fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method)); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path), + (AmCheckpointerProcess() ? 
+ errhint("This is known to fail occasionally during archive recovery, where it is harmless.") : + 0))); + + elog(DEBUG2, "done creating and filling new WAL file"); + + return fd; +} + +/* + * Create a new XLOG file segment by copying a pre-existing one. + * + * destsegno: identify segment to be created. + * + * srcTLI, srcsegno: identify segment to be copied (could be from + * a different timeline) + * + * upto: how much of the source file to copy (the rest is filled with + * zeros) + * + * Currently this is only used during recovery, and so there are no locking + * considerations. But we should be just as tense as XLogFileInit to avoid + * emplacing a bogus file. + */ +static void +XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno, + int upto) +{ + char path[MAXPGPATH]; + char tmppath[MAXPGPATH]; + PGAlignedXLogBlock buffer; + int srcfd; + int fd; + int nbytes; + + /* + * Open the source file + */ + XLogFilePath(path, srcTLI, srcsegno, wal_segment_size); + srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + if (srcfd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + + /* + * Copy into a temp file name. + */ + snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid()); + + unlink(tmppath); + + /* do not use get_sync_bit() here --- want to fsync only at end of fill */ + fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", tmppath))); + + /* + * Do the data copying. + */ + for (nbytes = 0; nbytes < wal_segment_size; nbytes += sizeof(buffer)) + { + int nread; + + nread = upto - nbytes; + + /* + * The part that is not read from the source file is filled with + * zeros. + */ + if (nread < sizeof(buffer)) + memset(buffer.data, 0, sizeof(buffer)); + + if (nread > 0) + { + int r; + + if (nread > sizeof(buffer)) + nread = sizeof(buffer); + pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ); + r = read(srcfd, buffer.data, nread); + if (r != nread) + { + if (r < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + path))); + else + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read file \"%s\": read %d of %zu", + path, r, (Size) nread))); + } + pgstat_report_wait_end(); + } + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE); + if ((int) write(fd, buffer.data, sizeof(buffer)) != (int) sizeof(buffer)) + { + int save_errno = errno; + + /* + * If we fail to make the file, delete it to release disk space + */ + unlink(tmppath); + /* if write didn't set errno, assume problem is no disk space */ + errno = save_errno ? save_errno : ENOSPC; + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tmppath))); + } + pgstat_report_wait_end(); + } + + pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC); + if (pg_fsync(fd) != 0) + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", tmppath))); + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", tmppath))); + + if (CloseTransientFile(srcfd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); + + /* + * Now move the segment into place with its final name. 
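+ * (find_free is passed as false here: recovery wants exactly destsegno,
+ * so InstallXLogFileSegment will durable_unlink any file already sitting
+ * at that name instead of hunting for a free slot.)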
+ */ + if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, false)) + elog(ERROR, "InstallXLogFileSegment should not have failed"); +} + +/* + * Install a new XLOG segment file as a current or future log segment. + * + * This is used both to install a newly-created segment (which has a temp + * filename while it's being created) and to recycle an old segment. + * + * *segno: identify segment to install as (or first possible target). + * When find_free is true, this is modified on return to indicate the + * actual installation location or last segment searched. + * + * tmppath: initial name of file to install. It will be renamed into place. + * + * find_free: if true, install the new segment at the first empty segno + * number at or after the passed numbers. If false, install the new segment + * exactly where specified, deleting any existing segment file there. + * + * max_segno: maximum segment number to install the new file as. Fail if no + * free slot is found between *segno and max_segno. (Ignored when find_free + * is false.) + * + * use_lock: if true, acquire ControlFileLock while moving file into + * place. This should be true except during bootstrap log creation. The + * caller must *not* hold the lock at call. + * + * Returns true if the file was installed successfully. false indicates that + * max_segno limit was exceeded, or an error occurred while renaming the + * file into place. + */ +static bool +InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, + bool find_free, XLogSegNo max_segno, + bool use_lock) +{ + char path[MAXPGPATH]; + struct stat stat_buf; + + XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size); + + /* + * We want to be sure that only one process does this at a time. + */ + if (use_lock) + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + + if (!find_free) + { + /* Force installation: get rid of any pre-existing segment file */ + durable_unlink(path, DEBUG1); + } + else + { + /* Find a free slot to put it in */ + while (stat(path, &stat_buf) == 0) + { + if ((*segno) >= max_segno) + { + /* Failed to find a free slot within specified range */ + if (use_lock) + LWLockRelease(ControlFileLock); + return false; + } + (*segno)++; + XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size); + } + } + + /* + * Perform the rename using link if available, paranoidly trying to avoid + * overwriting an existing file (there shouldn't be one). + */ + if (durable_rename_excl(tmppath, path, LOG) != 0) + { + if (use_lock) + LWLockRelease(ControlFileLock); + /* durable_rename_excl already emitted log message */ + return false; + } + + if (use_lock) + LWLockRelease(ControlFileLock); + + return true; +} + +/* + * Open a pre-existing logfile segment for writing. + */ +int +XLogFileOpen(XLogSegNo segno) +{ + char path[MAXPGPATH]; + int fd; + + XLogFilePath(path, ThisTimeLineID, segno, wal_segment_size); + + fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method)); + if (fd < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + + return fd; +} + +/* + * Open a logfile segment for reading (during recovery). + * + * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive. + * Otherwise, it's assumed to be already available in pg_wal. 
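+ * XLOG_FROM_PG_WAL and XLOG_FROM_STREAM are handled identically here,
+ * since streamed WAL has already been written into pg_wal by walreceiver
+ * by the time we try to read it.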
+ */ +static int +XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, + XLogSource source, bool notfoundOk) +{ + char xlogfname[MAXFNAMELEN]; + char activitymsg[MAXFNAMELEN + 16]; + char path[MAXPGPATH]; + int fd; + + XLogFileName(xlogfname, tli, segno, wal_segment_size); + + switch (source) + { + case XLOG_FROM_ARCHIVE: + /* Report recovery progress in PS display */ + snprintf(activitymsg, sizeof(activitymsg), "waiting for %s", + xlogfname); + set_ps_display(activitymsg); + + restoredFromArchive = RestoreArchivedFile(path, xlogfname, + "RECOVERYXLOG", + wal_segment_size, + InRedo); + if (!restoredFromArchive) + return -1; + break; + + case XLOG_FROM_PG_WAL: + case XLOG_FROM_STREAM: + XLogFilePath(path, tli, segno, wal_segment_size); + restoredFromArchive = false; + break; + + default: + elog(ERROR, "invalid XLogFileRead source %d", source); + } + + /* + * If the segment was fetched from archival storage, replace the existing + * xlog segment (if any) with the archival version. + */ + if (source == XLOG_FROM_ARCHIVE) + { + KeepFileRestoredFromArchive(path, xlogfname); + + /* + * Set path to point at the new file in pg_wal. + */ + snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname); + } + + fd = BasicOpenFile(path, O_RDONLY | PG_BINARY); + if (fd >= 0) + { + /* Success! */ + curFileTLI = tli; + + /* Report recovery progress in PS display */ + snprintf(activitymsg, sizeof(activitymsg), "recovering %s", + xlogfname); + set_ps_display(activitymsg); + + /* Track source of data in assorted state variables */ + readSource = source; + XLogReceiptSource = source; + /* In FROM_STREAM case, caller tracks receipt time, not me */ + if (source != XLOG_FROM_STREAM) + XLogReceiptTime = GetCurrentTimestamp(); + + return fd; + } + if (errno != ENOENT || !notfoundOk) /* unexpected failure? */ + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + return -1; +} + +/* + * Open a logfile segment for reading (during recovery). + * + * This version searches for the segment with any TLI listed in expectedTLEs. + */ +static int +XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source) +{ + char path[MAXPGPATH]; + ListCell *cell; + int fd; + List *tles; + + /* + * Loop looking for a suitable timeline ID: we might need to read any of + * the timelines listed in expectedTLEs. + * + * We expect curFileTLI on entry to be the TLI of the preceding file in + * sequence, or 0 if there was no predecessor. We do not allow curFileTLI + * to go backwards; this prevents us from picking up the wrong file when a + * parent timeline extends to higher segment numbers than the child we + * want to read. + * + * If we haven't read the timeline history file yet, read it now, so that + * we know which TLIs to scan. We don't save the list in expectedTLEs, + * however, unless we actually find a valid segment. That way if there is + * neither a timeline history file nor a WAL segment in the archive, and + * streaming replication is set up, we'll read the timeline history file + * streamed from the primary when we start streaming, instead of + * recovering with a dummy history generated here. 
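+ * For example, if the recovery target timeline is 3 with history
+ * 1 -> 2 -> 3, the loop below tries each segment on TLI 3 first, then 2,
+ * then 1, breaking out once a candidate TLI is older than curFileTLI and
+ * skipping timelines that only began after the segment we want.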
+ */ + if (expectedTLEs) + tles = expectedTLEs; + else + tles = readTimeLineHistory(recoveryTargetTLI); + + foreach(cell, tles) + { + TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell); + TimeLineID tli = hent->tli; + + if (tli < curFileTLI) + break; /* don't bother looking at too-old TLIs */ + + /* + * Skip scanning the timeline ID that the logfile segment to read + * doesn't belong to + */ + if (hent->begin != InvalidXLogRecPtr) + { + XLogSegNo beginseg = 0; + + XLByteToSeg(hent->begin, beginseg, wal_segment_size); + + /* + * The logfile segment that doesn't belong to the timeline is + * older or newer than the segment that the timeline started or + * ended at, respectively. It's sufficient to check only the + * starting segment of the timeline here. Since the timelines are + * scanned in descending order in this loop, any segments newer + * than the ending segment should belong to newer timeline and + * have already been read before. So it's not necessary to check + * the ending segment of the timeline here. + */ + if (segno < beginseg) + continue; + } + + if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE) + { + fd = XLogFileRead(segno, emode, tli, + XLOG_FROM_ARCHIVE, true); + if (fd != -1) + { + elog(DEBUG1, "got WAL segment from archive"); + if (!expectedTLEs) + expectedTLEs = tles; + return fd; + } + } + + if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL) + { + fd = XLogFileRead(segno, emode, tli, + XLOG_FROM_PG_WAL, true); + if (fd != -1) + { + if (!expectedTLEs) + expectedTLEs = tles; + return fd; + } + } + } + + /* Couldn't find it. For simplicity, complain about front timeline */ + XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size); + errno = ENOENT; + ereport(emode, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + return -1; +} + +/* + * Close the current logfile segment for writing. + */ +static void +XLogFileClose(void) +{ + Assert(openLogFile >= 0); + + /* + * WAL segment files will not be re-read in normal operation, so we advise + * the OS to release any cached pages. But do not do so if WAL archiving + * or streaming is active, because archiver and walsender process could + * use the cache to read the WAL segment. + */ +#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) + if (!XLogIsNeeded()) + (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED); +#endif + + if (close(openLogFile) != 0) + { + char xlogfname[MAXFNAMELEN]; + int save_errno = errno; + + XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo, wal_segment_size); + errno = save_errno; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", xlogfname))); + } + + openLogFile = -1; + ReleaseExternalFD(); +} + +/* + * Preallocate log files beyond the specified log endpoint. + * + * XXX this is currently extremely conservative, since it forces only one + * future log segment to exist, and even that only if we are 75% done with + * the current one. This is only appropriate for very low-WAL-volume systems. + * High-volume systems will be OK once they've built up a sufficient set of + * recycled log segments, but the startup transient is likely to include + * a lot of segment creations by foreground processes, which is not so good. 
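+ * For example, with the default 16MB segment size, the next segment is
+ * pre-created (unless it already exists) once the passed endpoint is at
+ * least 12MB into its segment, i.e. 75% of the way through.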
+ */ +static void +PreallocXlogFiles(XLogRecPtr endptr) +{ + XLogSegNo _logSegNo; + int lf; + bool use_existent; + uint64 offset; + + XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size); + offset = XLogSegmentOffset(endptr - 1, wal_segment_size); + if (offset >= (uint32) (0.75 * wal_segment_size)) + { + _logSegNo++; + use_existent = true; + lf = XLogFileInit(_logSegNo, &use_existent, true); + close(lf); + if (!use_existent) + CheckpointStats.ckpt_segs_added++; + } +} + +/* + * Throws an error if the given log segment has already been removed or + * recycled. The caller should only pass a segment that it knows to have + * existed while the server has been running, as this function always + * succeeds if no WAL segments have been removed since startup. + * 'tli' is only used in the error message. + * + * Note: this function guarantees to keep errno unchanged on return. + * This supports callers that use this to possibly deliver a better + * error message about a missing file, while still being able to throw + * a normal file-access error afterwards, if this does return. + */ +void +CheckXLogRemoved(XLogSegNo segno, TimeLineID tli) +{ + int save_errno = errno; + XLogSegNo lastRemovedSegNo; + + SpinLockAcquire(&XLogCtl->info_lck); + lastRemovedSegNo = XLogCtl->lastRemovedSegNo; + SpinLockRelease(&XLogCtl->info_lck); + + if (segno <= lastRemovedSegNo) + { + char filename[MAXFNAMELEN]; + + XLogFileName(filename, tli, segno, wal_segment_size); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("requested WAL segment %s has already been removed", + filename))); + } + errno = save_errno; +} + +/* + * Return the last WAL segment removed, or 0 if no segment has been removed + * since startup. + * + * NB: the result can be out of date arbitrarily fast, the caller has to deal + * with that. + */ +XLogSegNo +XLogGetLastRemovedSegno(void) +{ + XLogSegNo lastRemovedSegNo; + + SpinLockAcquire(&XLogCtl->info_lck); + lastRemovedSegNo = XLogCtl->lastRemovedSegNo; + SpinLockRelease(&XLogCtl->info_lck); + + return lastRemovedSegNo; +} + + +/* + * Update the last removed segno pointer in shared memory, to reflect that the + * given XLOG file has been removed. + */ +static void +UpdateLastRemovedPtr(char *filename) +{ + uint32 tli; + XLogSegNo segno; + + XLogFromFileName(filename, &tli, &segno, wal_segment_size); + + SpinLockAcquire(&XLogCtl->info_lck); + if (segno > XLogCtl->lastRemovedSegNo) + XLogCtl->lastRemovedSegNo = segno; + SpinLockRelease(&XLogCtl->info_lck); +} + +/* + * Remove all temporary log files in pg_wal + * + * This is called at the beginning of recovery after a previous crash, + * at a point where no other processes write fresh WAL data. + */ +static void +RemoveTempXlogFiles(void) +{ + DIR *xldir; + struct dirent *xlde; + + elog(DEBUG2, "removing all temporary WAL segments"); + + xldir = AllocateDir(XLOGDIR); + while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL) + { + char path[MAXPGPATH]; + + if (strncmp(xlde->d_name, "xlogtemp.", 9) != 0) + continue; + + snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name); + unlink(path); + elog(DEBUG2, "removed temporary WAL segment \"%s\"", path); + } + FreeDir(xldir); +} + +/* + * Recycle or remove all log files older or equal to passed segno. + * + * endptr is current (or recent) end of xlog, and lastredoptr is the + * redo pointer of the last checkpoint. These are used to determine + * whether we want to recycle rather than delete no-longer-wanted log files. 
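+ * Recycling renames an old segment (via InstallXLogFileSegment) into a
+ * future slot up to recycleSegNo, saving the cost of creating and
+ * zero-filling a fresh file later; segments beyond that point are simply
+ * unlinked.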
+ */ +static void +RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr) +{ + DIR *xldir; + struct dirent *xlde; + char lastoff[MAXFNAMELEN]; + XLogSegNo endlogSegNo; + XLogSegNo recycleSegNo; + + /* Initialize info about where to try to recycle to */ + XLByteToSeg(endptr, endlogSegNo, wal_segment_size); + recycleSegNo = XLOGfileslop(lastredoptr); + + /* + * Construct a filename of the last segment to be kept. The timeline ID + * doesn't matter, we ignore that in the comparison. (During recovery, + * ThisTimeLineID isn't set, so we can't use that.) + */ + XLogFileName(lastoff, 0, segno, wal_segment_size); + + elog(DEBUG2, "attempting to remove WAL segments older than log file %s", + lastoff); + + xldir = AllocateDir(XLOGDIR); + + while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL) + { + /* Ignore files that are not XLOG segments */ + if (!IsXLogFileName(xlde->d_name) && + !IsPartialXLogFileName(xlde->d_name)) + continue; + + /* + * We ignore the timeline part of the XLOG segment identifiers in + * deciding whether a segment is still needed. This ensures that we + * won't prematurely remove a segment from a parent timeline. We could + * probably be a little more proactive about removing segments of + * non-parent timelines, but that would be a whole lot more + * complicated. + * + * We use the alphanumeric sorting property of the filenames to decide + * which ones are earlier than the lastoff segment. + */ + if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0) + { + if (XLogArchiveCheckDone(xlde->d_name)) + { + /* Update the last removed location in shared memory first */ + UpdateLastRemovedPtr(xlde->d_name); + + RemoveXlogFile(xlde->d_name, recycleSegNo, &endlogSegNo); + } + } + } + + FreeDir(xldir); +} + +/* + * Remove WAL files that are not part of the given timeline's history. + * + * This is called during recovery, whenever we switch to follow a new + * timeline, and at the end of recovery when we create a new timeline. We + * wouldn't otherwise care about extra WAL files lying in pg_wal, but they + * might be leftover pre-allocated or recycled WAL segments on the old timeline + * that we haven't used yet, and contain garbage. If we just leave them in + * pg_wal, they will eventually be archived, and we can't let that happen. + * Files that belong to our timeline history are valid, because we have + * successfully replayed them, but from others we can't be sure. + * + * 'switchpoint' is the current point in WAL where we switch to new timeline, + * and 'newTLI' is the new timeline we switch to. + */ +static void +RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI) +{ + DIR *xldir; + struct dirent *xlde; + char switchseg[MAXFNAMELEN]; + XLogSegNo endLogSegNo; + XLogSegNo switchLogSegNo; + XLogSegNo recycleSegNo; + + /* + * Initialize info about where to begin the work. This will recycle, + * somewhat arbitrarily, 10 future segments. + */ + XLByteToPrevSeg(switchpoint, switchLogSegNo, wal_segment_size); + XLByteToSeg(switchpoint, endLogSegNo, wal_segment_size); + recycleSegNo = endLogSegNo + 10; + + /* + * Construct a filename of the last segment to be kept. 
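+ * The checks below compare the two parts of each file name separately:
+ * the first 8 hex digits (the timeline) must sort before newTLI, and the
+ * remaining digits (the segment number) must sort after the switch
+ * segment, for the file to be removed.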
+ */ + XLogFileName(switchseg, newTLI, switchLogSegNo, wal_segment_size); + + elog(DEBUG2, "attempting to remove WAL segments newer than log file %s", + switchseg); + + xldir = AllocateDir(XLOGDIR); + + while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL) + { + /* Ignore files that are not XLOG segments */ + if (!IsXLogFileName(xlde->d_name)) + continue; + + /* + * Remove files that are on a timeline older than the new one we're + * switching to, but with a segment number >= the first segment on the + * new timeline. + */ + if (strncmp(xlde->d_name, switchseg, 8) < 0 && + strcmp(xlde->d_name + 8, switchseg + 8) > 0) + { + /* + * If the file has already been marked as .ready, however, don't + * remove it yet. It should be OK to remove it - files that are + * not part of our timeline history are not required for recovery + * - but seems safer to let them be archived and removed later. + */ + if (!XLogArchiveIsReady(xlde->d_name)) + RemoveXlogFile(xlde->d_name, recycleSegNo, &endLogSegNo); + } + } + + FreeDir(xldir); +} + +/* + * Recycle or remove a log file that's no longer needed. + * + * segname is the name of the segment to recycle or remove. recycleSegNo + * is the segment number to recycle up to. endlogSegNo is the segment + * number of the current (or recent) end of WAL. + * + * endlogSegNo gets incremented if the segment is recycled so as it is not + * checked again with future callers of this function. + */ +static void +RemoveXlogFile(const char *segname, XLogSegNo recycleSegNo, + XLogSegNo *endlogSegNo) +{ + char path[MAXPGPATH]; +#ifdef WIN32 + char newpath[MAXPGPATH]; +#endif + struct stat statbuf; + + snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname); + + /* + * Before deleting the file, see if it can be recycled as a future log + * segment. Only recycle normal files, because we don't want to recycle + * symbolic links pointing to a separate archive directory. + */ + if (wal_recycle && + *endlogSegNo <= recycleSegNo && + lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) && + InstallXLogFileSegment(endlogSegNo, path, + true, recycleSegNo, true)) + { + ereport(DEBUG2, + (errmsg_internal("recycled write-ahead log file \"%s\"", + segname))); + CheckpointStats.ckpt_segs_recycled++; + /* Needn't recheck that slot on future iterations */ + (*endlogSegNo)++; + } + else + { + /* No need for any more future segments... */ + int rc; + + ereport(DEBUG2, + (errmsg_internal("removing write-ahead log file \"%s\"", + segname))); + +#ifdef WIN32 + + /* + * On Windows, if another process (e.g another backend) holds the file + * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file + * will still show up in directory listing until the last handle is + * closed. To avoid confusing the lingering deleted file for a live + * WAL file that needs to be archived, rename it before deleting it. + * + * If another process holds the file open without FILE_SHARE_DELETE + * flag, rename will fail. We'll try again at the next checkpoint. + */ + snprintf(newpath, MAXPGPATH, "%s.deleted", path); + if (rename(path, newpath) != 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not rename file \"%s\": %m", + path))); + return; + } + rc = durable_unlink(newpath, LOG); +#else + rc = durable_unlink(path, LOG); +#endif + if (rc != 0) + { + /* Message already logged by durable_unlink() */ + return; + } + CheckpointStats.ckpt_segs_removed++; + } + + XLogArchiveCleanup(segname); +} + +/* + * Verify whether pg_wal and pg_wal/archive_status exist. + * If the latter does not exist, recreate it. 
+ * + * It is not the goal of this function to verify the contents of these + * directories, but to help in cases where someone has performed a cluster + * copy for PITR purposes but omitted pg_wal from the copy. + * + * We could also recreate pg_wal if it doesn't exist, but a deliberate + * policy decision was made not to. It is fairly common for pg_wal to be + * a symlink, and if that was the DBA's intent then automatically making a + * plain directory would result in degraded performance with no notice. + */ +static void +ValidateXLOGDirectoryStructure(void) +{ + char path[MAXPGPATH]; + struct stat stat_buf; + + /* Check for pg_wal; if it doesn't exist, error out */ + if (stat(XLOGDIR, &stat_buf) != 0 || + !S_ISDIR(stat_buf.st_mode)) + ereport(FATAL, + (errmsg("required WAL directory \"%s\" does not exist", + XLOGDIR))); + + /* Check for archive_status */ + snprintf(path, MAXPGPATH, XLOGDIR "/archive_status"); + if (stat(path, &stat_buf) == 0) + { + /* Check for weird cases where it exists but isn't a directory */ + if (!S_ISDIR(stat_buf.st_mode)) + ereport(FATAL, + (errmsg("required WAL directory \"%s\" does not exist", + path))); + } + else + { + ereport(LOG, + (errmsg("creating missing WAL directory \"%s\"", path))); + if (MakePGDirectory(path) < 0) + ereport(FATAL, + (errmsg("could not create missing directory \"%s\": %m", + path))); + } +} + +/* + * Remove previous backup history files. This also retries creation of + * .ready files for any backup history files for which XLogArchiveNotify + * failed earlier. + */ +static void +CleanupBackupHistory(void) +{ + DIR *xldir; + struct dirent *xlde; + char path[MAXPGPATH + sizeof(XLOGDIR)]; + + xldir = AllocateDir(XLOGDIR); + + while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL) + { + if (IsBackupHistoryFileName(xlde->d_name)) + { + if (XLogArchiveCheckDone(xlde->d_name)) + { + elog(DEBUG2, "removing WAL backup history file \"%s\"", + xlde->d_name); + snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name); + unlink(path); + XLogArchiveCleanup(xlde->d_name); + } + } + } + + FreeDir(xldir); +} + +/* + * Attempt to read the next XLOG record. + * + * Before first call, the reader needs to be positioned to the first record + * by calling XLogBeginRead(). + * + * If no valid record is available, returns NULL, or fails if emode is PANIC. + * (emode must be either PANIC, LOG). In standby mode, retries until a valid + * record is available. + */ +static XLogRecord * +ReadRecord(XLogReaderState *xlogreader, int emode, + bool fetching_ckpt) +{ + XLogRecord *record; + XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data; + + /* Pass through parameters to XLogPageRead */ + private->fetching_ckpt = fetching_ckpt; + private->emode = emode; + private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr); + + /* This is the first attempt to read this page. */ + lastSourceFailed = false; + + for (;;) + { + char *errormsg; + + record = XLogReadRecord(xlogreader, &errormsg); + ReadRecPtr = xlogreader->ReadRecPtr; + EndRecPtr = xlogreader->EndRecPtr; + if (record == NULL) + { + /* + * When not in standby mode we find that WAL ends in an incomplete + * record, keep track of that record. After recovery is done, + * we'll write a record to indicate downstream WAL readers that + * that portion is to be ignored. 
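+ * (The locations are remembered in abortedRecPtr and missingContrecPtr
+ * just below, and are consumed once recovery is done, when that record
+ * is written.)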
+ */ + if (!StandbyMode && + !XLogRecPtrIsInvalid(xlogreader->abortedRecPtr)) + { + abortedRecPtr = xlogreader->abortedRecPtr; + missingContrecPtr = xlogreader->missingContrecPtr; + } + + if (readFile >= 0) + { + close(readFile); + readFile = -1; + } + + /* + * We only end up here without a message when XLogPageRead() + * failed - in that case we already logged something. In + * StandbyMode that only happens if we have been triggered, so we + * shouldn't loop anymore in that case. + */ + if (errormsg) + ereport(emode_for_corrupt_record(emode, EndRecPtr), + (errmsg_internal("%s", errormsg) /* already translated */ )); + } + + /* + * Check page TLI is one of the expected values. + */ + else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs)) + { + char fname[MAXFNAMELEN]; + XLogSegNo segno; + int32 offset; + + XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size); + offset = XLogSegmentOffset(xlogreader->latestPagePtr, + wal_segment_size); + XLogFileName(fname, xlogreader->seg.ws_tli, segno, + wal_segment_size); + ereport(emode_for_corrupt_record(emode, EndRecPtr), + (errmsg("unexpected timeline ID %u in log segment %s, offset %u", + xlogreader->latestPageTLI, + fname, + offset))); + record = NULL; + } + + if (record) + { + /* Great, got a record */ + return record; + } + else + { + /* No valid record available from this source */ + lastSourceFailed = true; + + /* + * If archive recovery was requested, but we were still doing + * crash recovery, switch to archive recovery and retry using the + * offline archive. We have now replayed all the valid WAL in + * pg_wal, so we are presumably now consistent. + * + * We require that there's at least some valid WAL present in + * pg_wal, however (!fetching_ckpt). We could recover using the + * WAL from the archive, even if pg_wal is completely empty, but + * we'd have no idea how far we'd have to replay to reach + * consistency. So err on the safe side and give up. + */ + if (!InArchiveRecovery && ArchiveRecoveryRequested && + !fetching_ckpt) + { + ereport(DEBUG1, + (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery"))); + InArchiveRecovery = true; + if (StandbyModeRequested) + StandbyMode = true; + + /* initialize minRecoveryPoint to this record */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->state = DB_IN_ARCHIVE_RECOVERY; + if (ControlFile->minRecoveryPoint < EndRecPtr) + { + ControlFile->minRecoveryPoint = EndRecPtr; + ControlFile->minRecoveryPointTLI = ThisTimeLineID; + } + /* update local copy */ + minRecoveryPoint = ControlFile->minRecoveryPoint; + minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + + /* + * The startup process can update its local copy of + * minRecoveryPoint from this point. + */ + updateMinRecoveryPoint = true; + + UpdateControlFile(); + + /* + * We update SharedRecoveryState while holding the lock on + * ControlFileLock so both states are consistent in shared + * memory. + */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE; + SpinLockRelease(&XLogCtl->info_lck); + + LWLockRelease(ControlFileLock); + + CheckRecoveryConsistency(); + + /* + * Before we retry, reset lastSourceFailed and currentSource + * so that we will check the archive next. + */ + lastSourceFailed = false; + currentSource = XLOG_FROM_ANY; + + continue; + } + + /* In standby mode, loop back to retry. Otherwise, give up. 
*/ + if (StandbyMode && !CheckForStandbyTrigger()) + continue; + else + return NULL; + } + } +} + +/* + * Scan for new timelines that might have appeared in the archive since we + * started recovery. + * + * If there are any, the function changes recovery target TLI to the latest + * one and returns 'true'. + */ +static bool +rescanLatestTimeLine(void) +{ + List *newExpectedTLEs; + bool found; + ListCell *cell; + TimeLineID newtarget; + TimeLineID oldtarget = recoveryTargetTLI; + TimeLineHistoryEntry *currentTle = NULL; + + newtarget = findNewestTimeLine(recoveryTargetTLI); + if (newtarget == recoveryTargetTLI) + { + /* No new timelines found */ + return false; + } + + /* + * Determine the list of expected TLIs for the new TLI + */ + + newExpectedTLEs = readTimeLineHistory(newtarget); + + /* + * If the current timeline is not part of the history of the new timeline, + * we cannot proceed to it. + */ + found = false; + foreach(cell, newExpectedTLEs) + { + currentTle = (TimeLineHistoryEntry *) lfirst(cell); + + if (currentTle->tli == recoveryTargetTLI) + { + found = true; + break; + } + } + if (!found) + { + ereport(LOG, + (errmsg("new timeline %u is not a child of database system timeline %u", + newtarget, + ThisTimeLineID))); + return false; + } + + /* + * The current timeline was found in the history file, but check that the + * next timeline was forked off from it *after* the current recovery + * location. + */ + if (currentTle->end < EndRecPtr) + { + ereport(LOG, + (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X", + newtarget, + ThisTimeLineID, + LSN_FORMAT_ARGS(EndRecPtr)))); + return false; + } + + /* The new timeline history seems valid. Switch target */ + recoveryTargetTLI = newtarget; + list_free_deep(expectedTLEs); + expectedTLEs = newExpectedTLEs; + + /* + * As in StartupXLOG(), try to ensure we have all the history files + * between the old target and new target in pg_wal. + */ + restoreTimeLineHistoryFiles(oldtarget + 1, newtarget); + + ereport(LOG, + (errmsg("new target timeline is %u", + recoveryTargetTLI))); + + return true; +} + +/* + * I/O routines for pg_control + * + * *ControlFile is a buffer in shared memory that holds an image of the + * contents of pg_control. WriteControlFile() initializes pg_control + * given a preloaded buffer, ReadControlFile() loads the buffer from + * the pg_control file (during postmaster or standalone-backend startup), + * and UpdateControlFile() rewrites pg_control after we modify xlog state. + * InitControlFile() fills the buffer with initial values. + * + * For simplicity, WriteControlFile() initializes the fields of pg_control + * that are related to checking backend/database compatibility, and + * ReadControlFile() verifies they are correct. We could split out the + * I/O and compatibility-check functions, but there seems no need currently. + */ + +static void +InitControlFile(uint64 sysidentifier) +{ + char mock_auth_nonce[MOCK_AUTH_NONCE_LEN]; + + /* + * Generate a random nonce. This is used for authentication requests that + * will fail because the user does not exist. The nonce is used to create + * a genuine-looking password challenge for the non-existent user, in lieu + * of an actual stored password. 
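+ * (Because the nonce is random per cluster but stable across connection
+ * attempts, the same fake challenge is produced for a given user name
+ * every time, so repeated probes cannot tell a missing role apart from a
+ * wrong password.)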
+ */ + if (!pg_strong_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN)) + ereport(PANIC, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("could not generate secret authorization token"))); + + memset(ControlFile, 0, sizeof(ControlFileData)); + /* Initialize pg_control status fields */ + ControlFile->system_identifier = sysidentifier; + memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN); + ControlFile->state = DB_SHUTDOWNED; + ControlFile->unloggedLSN = FirstNormalUnloggedLSN; + + /* Set important parameter values for use when replaying WAL */ + ControlFile->MaxConnections = MaxConnections; + ControlFile->max_worker_processes = max_worker_processes; + ControlFile->max_wal_senders = max_wal_senders; + ControlFile->max_prepared_xacts = max_prepared_xacts; + ControlFile->max_locks_per_xact = max_locks_per_xact; + ControlFile->wal_level = wal_level; + ControlFile->wal_log_hints = wal_log_hints; + ControlFile->track_commit_timestamp = track_commit_timestamp; + ControlFile->data_checksum_version = bootstrap_data_checksum_version; +} + +static void +WriteControlFile(void) +{ + int fd; + char buffer[PG_CONTROL_FILE_SIZE]; /* need not be aligned */ + + /* + * Ensure that the size of the pg_control data structure is sane. See the + * comments for these symbols in pg_control.h. + */ + StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_MAX_SAFE_SIZE, + "pg_control is too large for atomic disk writes"); + StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_FILE_SIZE, + "sizeof(ControlFileData) exceeds PG_CONTROL_FILE_SIZE"); + + /* + * Initialize version and compatibility-check fields + */ + ControlFile->pg_control_version = PG_CONTROL_VERSION; + ControlFile->catalog_version_no = CATALOG_VERSION_NO; + + ControlFile->maxAlign = MAXIMUM_ALIGNOF; + ControlFile->floatFormat = FLOATFORMAT_VALUE; + + ControlFile->blcksz = BLCKSZ; + ControlFile->relseg_size = RELSEG_SIZE; + ControlFile->xlog_blcksz = XLOG_BLCKSZ; + ControlFile->xlog_seg_size = wal_segment_size; + + ControlFile->nameDataLen = NAMEDATALEN; + ControlFile->indexMaxKeys = INDEX_MAX_KEYS; + + ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE; + ControlFile->loblksize = LOBLKSIZE; + + ControlFile->float8ByVal = FLOAT8PASSBYVAL; + + /* Contents are protected with a CRC */ + INIT_CRC32C(ControlFile->crc); + COMP_CRC32C(ControlFile->crc, + (char *) ControlFile, + offsetof(ControlFileData, crc)); + FIN_CRC32C(ControlFile->crc); + + /* + * We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding + * the excess over sizeof(ControlFileData). This reduces the odds of + * premature-EOF errors when reading pg_control. We'll still fail when we + * check the contents of the file, but hopefully with a more specific + * error than "couldn't read pg_control". 
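+ * (The meaningful fields still fit within PG_CONTROL_MAX_SAFE_SIZE, one
+ * 512-byte disk sector, per the assertion above, so a torn write of the
+ * padded file cannot split the real data across sectors.)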
+ */ + memset(buffer, 0, PG_CONTROL_FILE_SIZE); + memcpy(buffer, ControlFile, sizeof(ControlFileData)); + + fd = BasicOpenFile(XLOG_CONTROL_FILE, + O_RDWR | O_CREAT | O_EXCL | PG_BINARY); + if (fd < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", + XLOG_CONTROL_FILE))); + + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE); + if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", + XLOG_CONTROL_FILE))); + } + pgstat_report_wait_end(); + + pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC); + if (pg_fsync(fd) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", + XLOG_CONTROL_FILE))); + pgstat_report_wait_end(); + + if (close(fd) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", + XLOG_CONTROL_FILE))); +} + +static void +ReadControlFile(void) +{ + pg_crc32c crc; + int fd; + static char wal_segsz_str[20]; + int r; + + /* + * Read data... + */ + fd = BasicOpenFile(XLOG_CONTROL_FILE, + O_RDWR | PG_BINARY); + if (fd < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + XLOG_CONTROL_FILE))); + + pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ); + r = read(fd, ControlFile, sizeof(ControlFileData)); + if (r != sizeof(ControlFileData)) + { + if (r < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + XLOG_CONTROL_FILE))); + else + ereport(PANIC, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read file \"%s\": read %d of %zu", + XLOG_CONTROL_FILE, r, sizeof(ControlFileData)))); + } + pgstat_report_wait_end(); + + close(fd); + + /* + * Check for expected pg_control format version. If this is wrong, the + * CRC check will likely fail because we'll be checking the wrong number + * of bytes. Complaining about wrong version will probably be more + * enlightening than complaining about wrong CRC. + */ + + if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x)," + " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).", + ControlFile->pg_control_version, ControlFile->pg_control_version, + PG_CONTROL_VERSION, PG_CONTROL_VERSION), + errhint("This could be a problem of mismatched byte ordering. It looks like you need to initdb."))); + + if (ControlFile->pg_control_version != PG_CONTROL_VERSION) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d," + " but the server was compiled with PG_CONTROL_VERSION %d.", + ControlFile->pg_control_version, PG_CONTROL_VERSION), + errhint("It looks like you need to initdb."))); + + /* Now check the CRC. */ + INIT_CRC32C(crc); + COMP_CRC32C(crc, + (char *) ControlFile, + offsetof(ControlFileData, crc)); + FIN_CRC32C(crc); + + if (!EQ_CRC32C(crc, ControlFile->crc)) + ereport(FATAL, + (errmsg("incorrect checksum in control file"))); + + /* + * Do compatibility checking immediately. 
If the database isn't + * compatible with the backend executable, we want to abort before we can + * possibly do any damage. + */ + if (ControlFile->catalog_version_no != CATALOG_VERSION_NO) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d," + " but the server was compiled with CATALOG_VERSION_NO %d.", + ControlFile->catalog_version_no, CATALOG_VERSION_NO), + errhint("It looks like you need to initdb."))); + if (ControlFile->maxAlign != MAXIMUM_ALIGNOF) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with MAXALIGN %d," + " but the server was compiled with MAXALIGN %d.", + ControlFile->maxAlign, MAXIMUM_ALIGNOF), + errhint("It looks like you need to initdb."))); + if (ControlFile->floatFormat != FLOATFORMAT_VALUE) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster appears to use a different floating-point number format than the server executable."), + errhint("It looks like you need to initdb."))); + if (ControlFile->blcksz != BLCKSZ) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with BLCKSZ %d," + " but the server was compiled with BLCKSZ %d.", + ControlFile->blcksz, BLCKSZ), + errhint("It looks like you need to recompile or initdb."))); + if (ControlFile->relseg_size != RELSEG_SIZE) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with RELSEG_SIZE %d," + " but the server was compiled with RELSEG_SIZE %d.", + ControlFile->relseg_size, RELSEG_SIZE), + errhint("It looks like you need to recompile or initdb."))); + if (ControlFile->xlog_blcksz != XLOG_BLCKSZ) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with XLOG_BLCKSZ %d," + " but the server was compiled with XLOG_BLCKSZ %d.", + ControlFile->xlog_blcksz, XLOG_BLCKSZ), + errhint("It looks like you need to recompile or initdb."))); + if (ControlFile->nameDataLen != NAMEDATALEN) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with NAMEDATALEN %d," + " but the server was compiled with NAMEDATALEN %d.", + ControlFile->nameDataLen, NAMEDATALEN), + errhint("It looks like you need to recompile or initdb."))); + if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d," + " but the server was compiled with INDEX_MAX_KEYS %d.", + ControlFile->indexMaxKeys, INDEX_MAX_KEYS), + errhint("It looks like you need to recompile or initdb."))); + if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d," + " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.", + ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE), + errhint("It looks like you need to recompile or initdb."))); + if (ControlFile->loblksize != LOBLKSIZE) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with LOBLKSIZE %d," + " but the server was compiled with LOBLKSIZE 
%d.", + ControlFile->loblksize, (int) LOBLKSIZE), + errhint("It looks like you need to recompile or initdb."))); + +#ifdef USE_FLOAT8_BYVAL + if (ControlFile->float8ByVal != true) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL" + " but the server was compiled with USE_FLOAT8_BYVAL."), + errhint("It looks like you need to recompile or initdb."))); +#else + if (ControlFile->float8ByVal != false) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL" + " but the server was compiled without USE_FLOAT8_BYVAL."), + errhint("It looks like you need to recompile or initdb."))); +#endif + + wal_segment_size = ControlFile->xlog_seg_size; + + if (!IsValidWalSegSize(wal_segment_size)) + ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg_plural("WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d byte", + "WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d bytes", + wal_segment_size, + wal_segment_size))); + + snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size); + SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL, + PGC_S_OVERRIDE); + + /* check and update variables dependent on wal_segment_size */ + if (ConvertToXSegs(min_wal_size_mb, wal_segment_size) < 2) + ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("\"min_wal_size\" must be at least twice \"wal_segment_size\""))); + + if (ConvertToXSegs(max_wal_size_mb, wal_segment_size) < 2) + ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("\"max_wal_size\" must be at least twice \"wal_segment_size\""))); + + UsableBytesInSegment = + (wal_segment_size / XLOG_BLCKSZ * UsableBytesInPage) - + (SizeOfXLogLongPHD - SizeOfXLogShortPHD); + + CalculateCheckpointSegments(); + + /* Make the initdb settings visible as GUC variables, too */ + SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no", + PGC_INTERNAL, PGC_S_OVERRIDE); +} + +/* + * Utility wrapper to update the control file. Note that the control + * file gets flushed. + */ +void +UpdateControlFile(void) +{ + update_controlfile(DataDir, ControlFile, true); +} + +/* + * Returns the unique system identifier from control file. + */ +uint64 +GetSystemIdentifier(void) +{ + Assert(ControlFile != NULL); + return ControlFile->system_identifier; +} + +/* + * Returns the random nonce from control file. + */ +char * +GetMockAuthenticationNonce(void) +{ + Assert(ControlFile != NULL); + return ControlFile->mock_authentication_nonce; +} + +/* + * Are checksums enabled for data pages? + */ +bool +DataChecksumsEnabled(void) +{ + Assert(ControlFile != NULL); + return (ControlFile->data_checksum_version > 0); +} + +/* + * Returns a fake LSN for unlogged relations. + * + * Each call generates an LSN that is greater than any previous value + * returned. The current counter value is saved and restored across clean + * shutdowns, but like unlogged relations, does not survive a crash. This can + * be used in lieu of real LSN values returned by XLogInsert, if you need an + * LSN-like increasing sequence of numbers without writing any WAL. 
+ */ +XLogRecPtr +GetFakeLSNForUnloggedRel(void) +{ + XLogRecPtr nextUnloggedLSN; + + /* increment the unloggedLSN counter, need SpinLock */ + SpinLockAcquire(&XLogCtl->ulsn_lck); + nextUnloggedLSN = XLogCtl->unloggedLSN++; + SpinLockRelease(&XLogCtl->ulsn_lck); + + return nextUnloggedLSN; +} + +/* + * Auto-tune the number of XLOG buffers. + * + * The preferred setting for wal_buffers is about 3% of shared_buffers, with + * a maximum of one XLOG segment (there is little reason to think that more + * is helpful, at least so long as we force an fsync when switching log files) + * and a minimum of 8 blocks (which was the default value prior to PostgreSQL + * 9.1, when auto-tuning was added). + * + * This should not be called until NBuffers has received its final value. + */ +static int +XLOGChooseNumBuffers(void) +{ + int xbuffers; + + xbuffers = NBuffers / 32; + if (xbuffers > (wal_segment_size / XLOG_BLCKSZ)) + xbuffers = (wal_segment_size / XLOG_BLCKSZ); + if (xbuffers < 8) + xbuffers = 8; + return xbuffers; +} + +/* + * GUC check_hook for wal_buffers + */ +bool +check_wal_buffers(int *newval, void **extra, GucSource source) +{ + /* + * -1 indicates a request for auto-tune. + */ + if (*newval == -1) + { + /* + * If we haven't yet changed the boot_val default of -1, just let it + * be. We'll fix it when XLOGShmemSize is called. + */ + if (XLOGbuffers == -1) + return true; + + /* Otherwise, substitute the auto-tune value */ + *newval = XLOGChooseNumBuffers(); + } + + /* + * We clamp manually-set values to at least 4 blocks. Prior to PostgreSQL + * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer + * the case, we just silently treat such values as a request for the + * minimum. (We could throw an error instead, but that doesn't seem very + * helpful.) + */ + if (*newval < 4) + *newval = 4; + + return true; +} + +/* + * Read the control file, set respective GUCs. + * + * This is to be called during startup, including a crash recovery cycle, + * unless in bootstrap mode, where no control file yet exists. As there's no + * usable shared memory yet (its sizing can depend on the contents of the + * control file!), first store the contents in local memory. XLOGShmemInit() + * will then copy it to shared memory later. + * + * reset just controls whether previous contents are to be expected (in the + * reset case, there's a dangling pointer into old shared memory), or not. + */ +void +LocalProcessControlFile(bool reset) +{ + Assert(reset || ControlFile == NULL); + ControlFile = palloc(sizeof(ControlFileData)); + ReadControlFile(); +} + +/* + * Initialization of shared memory for XLOG + */ +Size +XLOGShmemSize(void) +{ + Size size; + + /* + * If the value of wal_buffers is -1, use the preferred auto-tune value. + * This isn't an amazingly clean place to do this, but we must wait till + * NBuffers has received its final value, and must do it before using the + * value of XLOGbuffers to do anything important. 
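+ *
+ * To make the auto-tune rule concrete (example numbers only, assuming 8 kB
+ * pages and the default 16 MB wal_segment_size): shared_buffers = 128 MB
+ * means NBuffers = 16384, so XLOGChooseNumBuffers() returns 16384 / 32 = 512
+ * pages, i.e. 4 MB of WAL buffers, comfortably below the one-segment cap of
+ * 16 MB / 8 kB = 2048 pages and above the floor of 8 pages.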
+ */ + if (XLOGbuffers == -1) + { + char buf[32]; + + snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers()); + SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE); + } + Assert(XLOGbuffers > 0); + + /* XLogCtl */ + size = sizeof(XLogCtlData); + + /* WAL insertion locks, plus alignment */ + size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1)); + /* xlblocks array */ + size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers)); + /* extra alignment padding for XLOG I/O buffers */ + size = add_size(size, XLOG_BLCKSZ); + /* and the buffers themselves */ + size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers)); + + /* + * Note: we don't count ControlFileData, it comes out of the "slop factor" + * added by CreateSharedMemoryAndSemaphores. This lets us use this + * routine again below to compute the actual allocation size. + */ + + return size; +} + +void +XLOGShmemInit(void) +{ + bool foundCFile, + foundXLog; + char *allocptr; + int i; + ControlFileData *localControlFile; + +#ifdef WAL_DEBUG + + /* + * Create a memory context for WAL debugging that's exempt from the normal + * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if + * an allocation fails, but wal_debug is not for production use anyway. + */ + if (walDebugCxt == NULL) + { + walDebugCxt = AllocSetContextCreate(TopMemoryContext, + "WAL Debug", + ALLOCSET_DEFAULT_SIZES); + MemoryContextAllowInCriticalSection(walDebugCxt, true); + } +#endif + + + XLogCtl = (XLogCtlData *) + ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog); + + localControlFile = ControlFile; + ControlFile = (ControlFileData *) + ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile); + + if (foundCFile || foundXLog) + { + /* both should be present or neither */ + Assert(foundCFile && foundXLog); + + /* Initialize local copy of WALInsertLocks */ + WALInsertLocks = XLogCtl->Insert.WALInsertLocks; + + if (localControlFile) + pfree(localControlFile); + return; + } + memset(XLogCtl, 0, sizeof(XLogCtlData)); + + /* + * Already have read control file locally, unless in bootstrap mode. Move + * contents into shared memory. + */ + if (localControlFile) + { + memcpy(ControlFile, localControlFile, sizeof(ControlFileData)); + pfree(localControlFile); + } + + /* + * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a + * multiple of the alignment for same, so no extra alignment padding is + * needed here. + */ + allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData); + XLogCtl->xlblocks = (XLogRecPtr *) allocptr; + memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers); + allocptr += sizeof(XLogRecPtr) * XLOGbuffers; + + + /* WAL insertion locks. Ensure they're aligned to the full padded size */ + allocptr += sizeof(WALInsertLockPadded) - + ((uintptr_t) allocptr) % sizeof(WALInsertLockPadded); + WALInsertLocks = XLogCtl->Insert.WALInsertLocks = + (WALInsertLockPadded *) allocptr; + allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS; + + for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++) + { + LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT); + WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr; + WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr; + } + + /* + * Align the start of the page buffers to a full xlog block size boundary. + * This simplifies some calculations in XLOG insertion. It is also + * required for O_DIRECT. 
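+ *
+ * (TYPEALIGN below simply rounds the pointer up to the next XLOG_BLCKSZ
+ * boundary; with the default 8 kB block size that is, roughly,
+ *     allocptr = (allocptr + 8191) & ~(uintptr_t) 8191;
+ * which is why XLOGShmemSize() reserved an extra XLOG_BLCKSZ of slack.)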
+ */ + allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr); + XLogCtl->pages = allocptr; + memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers); + + /* + * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill + * in additional info.) + */ + XLogCtl->XLogCacheBlck = XLOGbuffers - 1; + XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH; + XLogCtl->SharedHotStandbyActive = false; + XLogCtl->SharedPromoteIsTriggered = false; + XLogCtl->WalWriterSleeping = false; + + SpinLockInit(&XLogCtl->Insert.insertpos_lck); + SpinLockInit(&XLogCtl->info_lck); + SpinLockInit(&XLogCtl->ulsn_lck); + InitSharedLatch(&XLogCtl->recoveryWakeupLatch); + ConditionVariableInit(&XLogCtl->recoveryNotPausedCV); +} + +/* + * This func must be called ONCE on system install. It creates pg_control + * and the initial XLOG segment. + */ +void +BootStrapXLOG(void) +{ + CheckPoint checkPoint; + char *buffer; + XLogPageHeader page; + XLogLongPageHeader longpage; + XLogRecord *record; + char *recptr; + bool use_existent; + uint64 sysidentifier; + struct timeval tv; + pg_crc32c crc; + + /* + * Select a hopefully-unique system identifier code for this installation. + * We use the result of gettimeofday(), including the fractional seconds + * field, as being about as unique as we can easily get. (Think not to + * use random(), since it hasn't been seeded and there's no portable way + * to seed it other than the system clock value...) The upper half of the + * uint64 value is just the tv_sec part, while the lower half contains the + * tv_usec part (which must fit in 20 bits), plus 12 bits from our current + * PID for a little extra uniqueness. A person knowing this encoding can + * determine the initialization time of the installation, which could + * perhaps be useful sometimes. + */ + gettimeofday(&tv, NULL); + sysidentifier = ((uint64) tv.tv_sec) << 32; + sysidentifier |= ((uint64) tv.tv_usec) << 12; + sysidentifier |= getpid() & 0xFFF; + + /* First timeline ID is always 1 */ + ThisTimeLineID = 1; + + /* page buffer must be aligned suitably for O_DIRECT */ + buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ); + page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer); + memset(page, 0, XLOG_BLCKSZ); + + /* + * Set up information for the initial checkpoint record + * + * The initial checkpoint record is written to the beginning of the WAL + * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not + * used, so that we can use 0/0 to mean "before any valid WAL segment". 
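+ *
+ * For illustration, with the default 16 MB segments and MAXALIGN 8 (the
+ * actual numbers depend on --wal-segsize and the platform): the first used
+ * segment is 000000010000000000000001, and checkPoint.redo below comes out
+ * as 0x1000000 + SizeOfXLogLongPHD, i.e. LSN 0/01000028, the very first
+ * record of the cluster's WAL.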
+ */ + checkPoint.redo = wal_segment_size + SizeOfXLogLongPHD; + checkPoint.ThisTimeLineID = ThisTimeLineID; + checkPoint.PrevTimeLineID = ThisTimeLineID; + checkPoint.fullPageWrites = fullPageWrites; + checkPoint.nextXid = + FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId); + checkPoint.nextOid = FirstBootstrapObjectId; + checkPoint.nextMulti = FirstMultiXactId; + checkPoint.nextMultiOffset = 0; + checkPoint.oldestXid = FirstNormalTransactionId; + checkPoint.oldestXidDB = TemplateDbOid; + checkPoint.oldestMulti = FirstMultiXactId; + checkPoint.oldestMultiDB = TemplateDbOid; + checkPoint.oldestCommitTsXid = InvalidTransactionId; + checkPoint.newestCommitTsXid = InvalidTransactionId; + checkPoint.time = (pg_time_t) time(NULL); + checkPoint.oldestActiveXid = InvalidTransactionId; + + ShmemVariableCache->nextXid = checkPoint.nextXid; + ShmemVariableCache->nextOid = checkPoint.nextOid; + ShmemVariableCache->oidCount = 0; + MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); + AdvanceOldestClogXid(checkPoint.oldestXid); + SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); + SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true); + SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId); + + /* Set up the XLOG page header */ + page->xlp_magic = XLOG_PAGE_MAGIC; + page->xlp_info = XLP_LONG_HEADER; + page->xlp_tli = ThisTimeLineID; + page->xlp_pageaddr = wal_segment_size; + longpage = (XLogLongPageHeader) page; + longpage->xlp_sysid = sysidentifier; + longpage->xlp_seg_size = wal_segment_size; + longpage->xlp_xlog_blcksz = XLOG_BLCKSZ; + + /* Insert the initial checkpoint record */ + recptr = ((char *) page + SizeOfXLogLongPHD); + record = (XLogRecord *) recptr; + record->xl_prev = 0; + record->xl_xid = InvalidTransactionId; + record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint); + record->xl_info = XLOG_CHECKPOINT_SHUTDOWN; + record->xl_rmid = RM_XLOG_ID; + recptr += SizeOfXLogRecord; + /* fill the XLogRecordDataHeaderShort struct */ + *(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT; + *(recptr++) = sizeof(checkPoint); + memcpy(recptr, &checkPoint, sizeof(checkPoint)); + recptr += sizeof(checkPoint); + Assert(recptr - (char *) record == record->xl_tot_len); + + INIT_CRC32C(crc); + COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord); + COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc)); + FIN_CRC32C(crc); + record->xl_crc = crc; + + /* Create first XLOG segment file */ + use_existent = false; + openLogFile = XLogFileInit(1, &use_existent, false); + + /* + * We needn't bother with Reserve/ReleaseExternalFD here, since we'll + * close the file again in a moment. 
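+ *
+ * (Reserve/ReleaseExternalFD are the fd.c hooks that account for kernel file
+ * descriptors held outside its virtual-FD machinery; they matter for
+ * descriptors kept open for a long time, which is not the case here since
+ * the file is closed again just below.)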
+ */ + + /* Write the first page with the initial record */ + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE); + if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write bootstrap write-ahead log file: %m"))); + } + pgstat_report_wait_end(); + + pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC); + if (pg_fsync(openLogFile) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not fsync bootstrap write-ahead log file: %m"))); + pgstat_report_wait_end(); + + if (close(openLogFile) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close bootstrap write-ahead log file: %m"))); + + openLogFile = -1; + + /* Now create pg_control */ + InitControlFile(sysidentifier); + ControlFile->time = checkPoint.time; + ControlFile->checkPoint = checkPoint.redo; + ControlFile->checkPointCopy = checkPoint; + + /* some additional ControlFile fields are set in WriteControlFile() */ + WriteControlFile(); + + /* Bootstrap the commit log, too */ + BootStrapCLOG(); + BootStrapCommitTs(); + BootStrapSUBTRANS(); + BootStrapMultiXact(); + + pfree(buffer); + + /* + * Force control file to be read - in contrast to normal processing we'd + * otherwise never run the checks and GUC related initializations therein. + */ + ReadControlFile(); +} + +static char * +str_time(pg_time_t tnow) +{ + static char buf[128]; + + pg_strftime(buf, sizeof(buf), + "%Y-%m-%d %H:%M:%S %Z", + pg_localtime(&tnow, log_timezone)); + + return buf; +} + +/* + * See if there are any recovery signal files and if so, set state for + * recovery. + * + * See if there is a recovery command file (recovery.conf), and if so + * throw an ERROR since as of PG12 we no longer recognize that. + */ +static void +readRecoverySignalFile(void) +{ + struct stat stat_buf; + + if (IsBootstrapProcessingMode()) + return; + + /* + * Check for old recovery API file: recovery.conf + */ + if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("using recovery command file \"%s\" is not supported", + RECOVERY_COMMAND_FILE))); + + /* + * Remove unused .done file, if present. Ignore if absent. + */ + unlink(RECOVERY_COMMAND_DONE); + + /* + * Check for recovery signal files and if found, fsync them since they + * represent server state information. We don't sweat too much about the + * possibility of fsync failure, however. + * + * If present, standby signal file takes precedence. If neither is present + * then we won't enter archive recovery. 
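+ *
+ * Summarizing the cases handled below: standby.signal (alone or together
+ * with recovery.signal) selects standby mode plus archive recovery;
+ * recovery.signal alone selects archive recovery without standby mode; with
+ * neither file present the server performs plain crash recovery.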
+ */ + if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0) + { + int fd; + + fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY | get_sync_bit(sync_method), + S_IRUSR | S_IWUSR); + if (fd >= 0) + { + (void) pg_fsync(fd); + close(fd); + } + standby_signal_file_found = true; + } + else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0) + { + int fd; + + fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY | get_sync_bit(sync_method), + S_IRUSR | S_IWUSR); + if (fd >= 0) + { + (void) pg_fsync(fd); + close(fd); + } + recovery_signal_file_found = true; + } + + StandbyModeRequested = false; + ArchiveRecoveryRequested = false; + if (standby_signal_file_found) + { + StandbyModeRequested = true; + ArchiveRecoveryRequested = true; + } + else if (recovery_signal_file_found) + { + StandbyModeRequested = false; + ArchiveRecoveryRequested = true; + } + else + return; + + /* + * We don't support standby mode in standalone backends; that requires + * other processes such as the WAL receiver to be alive. + */ + if (StandbyModeRequested && !IsUnderPostmaster) + ereport(FATAL, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("standby mode is not supported by single-user servers"))); +} + +static void +validateRecoveryParameters(void) +{ + if (!ArchiveRecoveryRequested) + return; + + /* + * Check for compulsory parameters + */ + if (StandbyModeRequested) + { + if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) && + (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0)) + ereport(WARNING, + (errmsg("specified neither primary_conninfo nor restore_command"), + errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there."))); + } + else + { + if (recoveryRestoreCommand == NULL || + strcmp(recoveryRestoreCommand, "") == 0) + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("must specify restore_command when standby mode is not enabled"))); + } + + /* + * Override any inconsistent requests. Note that this is a change of + * behaviour in 9.5; prior to this we simply ignored a request to pause if + * hot_standby = off, which was surprising behaviour. + */ + if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE && + !EnableHotStandby) + recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN; + + /* + * Final parsing of recovery_target_time string; see also + * check_recovery_target_time(). + */ + if (recoveryTarget == RECOVERY_TARGET_TIME) + { + recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in, + CStringGetDatum(recovery_target_time_string), + ObjectIdGetDatum(InvalidOid), + Int32GetDatum(-1))); + } + + /* + * If user specified recovery_target_timeline, validate it or compute the + * "latest" value. We can't do this until after we've gotten the restore + * command and set InArchiveRecovery, because we need to fetch timeline + * history files from the archive. 
+ */ + if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC) + { + TimeLineID rtli = recoveryTargetTLIRequested; + + /* Timeline 1 does not have a history file, all else should */ + if (rtli != 1 && !existsTimeLineHistory(rtli)) + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("recovery target timeline %u does not exist", + rtli))); + recoveryTargetTLI = rtli; + } + else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST) + { + /* We start the "latest" search from pg_control's timeline */ + recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI); + } + else + { + /* + * else we just use the recoveryTargetTLI as already read from + * ControlFile + */ + Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE); + } +} + +/* + * Exit archive-recovery state + */ +static void +exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog) +{ + char xlogfname[MAXFNAMELEN]; + XLogSegNo endLogSegNo; + XLogSegNo startLogSegNo; + + /* we always switch to a new timeline after archive recovery */ + Assert(endTLI != ThisTimeLineID); + + /* + * We are no longer in archive recovery state. + */ + InArchiveRecovery = false; + + /* + * Update min recovery point one last time. + */ + UpdateMinRecoveryPoint(InvalidXLogRecPtr, true); + + /* + * If the ending log segment is still open, close it (to avoid problems on + * Windows with trying to rename or delete an open file). + */ + if (readFile >= 0) + { + close(readFile); + readFile = -1; + } + + /* + * Calculate the last segment on the old timeline, and the first segment + * on the new timeline. If the switch happens in the middle of a segment, + * they are the same, but if the switch happens exactly at a segment + * boundary, startLogSegNo will be endLogSegNo + 1. + */ + XLByteToPrevSeg(endOfLog, endLogSegNo, wal_segment_size); + XLByteToSeg(endOfLog, startLogSegNo, wal_segment_size); + + /* + * Initialize the starting WAL segment for the new timeline. If the switch + * happens in the middle of a segment, copy data from the last WAL segment + * of the old timeline up to the switch point, to the starting WAL segment + * on the new timeline. + */ + if (endLogSegNo == startLogSegNo) + { + /* + * Make a copy of the file on the new timeline. + * + * Writing WAL isn't allowed yet, so there are no locking + * considerations. But we should be just as tense as XLogFileInit to + * avoid emplacing a bogus file. + */ + XLogFileCopy(endLogSegNo, endTLI, endLogSegNo, + XLogSegmentOffset(endOfLog, wal_segment_size)); + } + else + { + /* + * The switch happened at a segment boundary, so just create the next + * segment on the new timeline. + */ + bool use_existent = true; + int fd; + + fd = XLogFileInit(startLogSegNo, &use_existent, true); + + if (close(fd) != 0) + { + char xlogfname[MAXFNAMELEN]; + int save_errno = errno; + + XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo, + wal_segment_size); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", xlogfname))); + } + } + + /* + * Let's just make real sure there are not .ready or .done flags posted + * for the new segment. + */ + XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo, wal_segment_size); + XLogArchiveCleanup(xlogfname); + + /* + * Remove the signal files out of the way, so that we don't accidentally + * re-enter archive recovery mode in a subsequent crash. 
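+ *
+ * For example, after a standby has been promoted, leaving standby.signal in
+ * place would make a later crash restart try to re-enter standby mode on the
+ * new timeline; durably unlinking both files below avoids that surprise.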
+ */ + if (standby_signal_file_found) + durable_unlink(STANDBY_SIGNAL_FILE, FATAL); + + if (recovery_signal_file_found) + durable_unlink(RECOVERY_SIGNAL_FILE, FATAL); + + ereport(LOG, + (errmsg("archive recovery complete"))); +} + +/* + * Extract timestamp from WAL record. + * + * If the record contains a timestamp, returns true, and saves the timestamp + * in *recordXtime. If the record type has no timestamp, returns false. + * Currently, only transaction commit/abort records and restore points contain + * timestamps. + */ +static bool +getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + uint8 xact_info = info & XLOG_XACT_OPMASK; + uint8 rmid = XLogRecGetRmid(record); + + if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT) + { + *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time; + return true; + } + if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT || + xact_info == XLOG_XACT_COMMIT_PREPARED)) + { + *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time; + return true; + } + if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT || + xact_info == XLOG_XACT_ABORT_PREPARED)) + { + *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time; + return true; + } + return false; +} + +/* + * For point-in-time recovery, this function decides whether we want to + * stop applying the XLOG before the current record. + * + * Returns true if we are stopping, false otherwise. If stopping, some + * information is saved in recoveryStopXid et al for use in annotating the + * new timeline's history file. + */ +static bool +recoveryStopsBefore(XLogReaderState *record) +{ + bool stopsHere = false; + uint8 xact_info; + bool isCommit; + TimestampTz recordXtime = 0; + TransactionId recordXid; + + /* + * Ignore recovery target settings when not in archive recovery (meaning + * we are in crash recovery). + */ + if (!ArchiveRecoveryRequested) + return false; + + /* Check if we should stop as soon as reaching consistency */ + if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency) + { + ereport(LOG, + (errmsg("recovery stopping after reaching consistency"))); + + recoveryStopAfter = false; + recoveryStopXid = InvalidTransactionId; + recoveryStopLSN = InvalidXLogRecPtr; + recoveryStopTime = 0; + recoveryStopName[0] = '\0'; + return true; + } + + /* Check if target LSN has been reached */ + if (recoveryTarget == RECOVERY_TARGET_LSN && + !recoveryTargetInclusive && + record->ReadRecPtr >= recoveryTargetLSN) + { + recoveryStopAfter = false; + recoveryStopXid = InvalidTransactionId; + recoveryStopLSN = record->ReadRecPtr; + recoveryStopTime = 0; + recoveryStopName[0] = '\0'; + ereport(LOG, + (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"", + LSN_FORMAT_ARGS(recoveryStopLSN)))); + return true; + } + + /* Otherwise we only consider stopping before COMMIT or ABORT records. 
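+ *
+ * (To illustrate the LSN case above with made-up numbers: with
+ * recovery_target_lsn = '0/3023F08' and recovery_target_inclusive = off,
+ * replay stops before the first record that starts at or past that LSN;
+ * with inclusive = on, the stop happens in recoveryStopsAfter() instead.)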
*/ + if (XLogRecGetRmid(record) != RM_XACT_ID) + return false; + + xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK; + + if (xact_info == XLOG_XACT_COMMIT) + { + isCommit = true; + recordXid = XLogRecGetXid(record); + } + else if (xact_info == XLOG_XACT_COMMIT_PREPARED) + { + xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record); + xl_xact_parsed_commit parsed; + + isCommit = true; + ParseCommitRecord(XLogRecGetInfo(record), + xlrec, + &parsed); + recordXid = parsed.twophase_xid; + } + else if (xact_info == XLOG_XACT_ABORT) + { + isCommit = false; + recordXid = XLogRecGetXid(record); + } + else if (xact_info == XLOG_XACT_ABORT_PREPARED) + { + xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record); + xl_xact_parsed_abort parsed; + + isCommit = false; + ParseAbortRecord(XLogRecGetInfo(record), + xlrec, + &parsed); + recordXid = parsed.twophase_xid; + } + else + return false; + + if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive) + { + /* + * There can be only one transaction end record with this exact + * transactionid + * + * when testing for an xid, we MUST test for equality only, since + * transactions are numbered in the order they start, not the order + * they complete. A higher numbered xid will complete before you about + * 50% of the time... + */ + stopsHere = (recordXid == recoveryTargetXid); + } + + if (recoveryTarget == RECOVERY_TARGET_TIME && + getRecordTimestamp(record, &recordXtime)) + { + /* + * There can be many transactions that share the same commit time, so + * we stop after the last one, if we are inclusive, or stop at the + * first one if we are exclusive + */ + if (recoveryTargetInclusive) + stopsHere = (recordXtime > recoveryTargetTime); + else + stopsHere = (recordXtime >= recoveryTargetTime); + } + + if (stopsHere) + { + recoveryStopAfter = false; + recoveryStopXid = recordXid; + recoveryStopTime = recordXtime; + recoveryStopLSN = InvalidXLogRecPtr; + recoveryStopName[0] = '\0'; + + if (isCommit) + { + ereport(LOG, + (errmsg("recovery stopping before commit of transaction %u, time %s", + recoveryStopXid, + timestamptz_to_str(recoveryStopTime)))); + } + else + { + ereport(LOG, + (errmsg("recovery stopping before abort of transaction %u, time %s", + recoveryStopXid, + timestamptz_to_str(recoveryStopTime)))); + } + } + + return stopsHere; +} + +/* + * Same as recoveryStopsBefore, but called after applying the record. + * + * We also track the timestamp of the latest applied COMMIT/ABORT + * record in XLogCtl->recoveryLastXTime. + */ +static bool +recoveryStopsAfter(XLogReaderState *record) +{ + uint8 info; + uint8 xact_info; + uint8 rmid; + TimestampTz recordXtime; + + /* + * Ignore recovery target settings when not in archive recovery (meaning + * we are in crash recovery). + */ + if (!ArchiveRecoveryRequested) + return false; + + info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + rmid = XLogRecGetRmid(record); + + /* + * There can be many restore points that share the same name; we stop at + * the first one. 
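+ *
+ * For example (hypothetical name): if pg_create_restore_point('before_upgrade')
+ * was executed twice on the primary, recovery_target_name = 'before_upgrade'
+ * ends replay at the first XLOG_RESTORE_POINT record carrying that name.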
+ */ + if (recoveryTarget == RECOVERY_TARGET_NAME && + rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT) + { + xl_restore_point *recordRestorePointData; + + recordRestorePointData = (xl_restore_point *) XLogRecGetData(record); + + if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0) + { + recoveryStopAfter = true; + recoveryStopXid = InvalidTransactionId; + recoveryStopLSN = InvalidXLogRecPtr; + (void) getRecordTimestamp(record, &recoveryStopTime); + strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN); + + ereport(LOG, + (errmsg("recovery stopping at restore point \"%s\", time %s", + recoveryStopName, + timestamptz_to_str(recoveryStopTime)))); + return true; + } + } + + /* Check if the target LSN has been reached */ + if (recoveryTarget == RECOVERY_TARGET_LSN && + recoveryTargetInclusive && + record->ReadRecPtr >= recoveryTargetLSN) + { + recoveryStopAfter = true; + recoveryStopXid = InvalidTransactionId; + recoveryStopLSN = record->ReadRecPtr; + recoveryStopTime = 0; + recoveryStopName[0] = '\0'; + ereport(LOG, + (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"", + LSN_FORMAT_ARGS(recoveryStopLSN)))); + return true; + } + + if (rmid != RM_XACT_ID) + return false; + + xact_info = info & XLOG_XACT_OPMASK; + + if (xact_info == XLOG_XACT_COMMIT || + xact_info == XLOG_XACT_COMMIT_PREPARED || + xact_info == XLOG_XACT_ABORT || + xact_info == XLOG_XACT_ABORT_PREPARED) + { + TransactionId recordXid; + + /* Update the last applied transaction timestamp */ + if (getRecordTimestamp(record, &recordXtime)) + SetLatestXTime(recordXtime); + + /* Extract the XID of the committed/aborted transaction */ + if (xact_info == XLOG_XACT_COMMIT_PREPARED) + { + xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record); + xl_xact_parsed_commit parsed; + + ParseCommitRecord(XLogRecGetInfo(record), + xlrec, + &parsed); + recordXid = parsed.twophase_xid; + } + else if (xact_info == XLOG_XACT_ABORT_PREPARED) + { + xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record); + xl_xact_parsed_abort parsed; + + ParseAbortRecord(XLogRecGetInfo(record), + xlrec, + &parsed); + recordXid = parsed.twophase_xid; + } + else + recordXid = XLogRecGetXid(record); + + /* + * There can be only one transaction end record with this exact + * transactionid + * + * when testing for an xid, we MUST test for equality only, since + * transactions are numbered in the order they start, not the order + * they complete. A higher numbered xid will complete before you about + * 50% of the time... 
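+ *
+ * Concretely (made-up xids): transaction 1000 may well commit after
+ * transaction 1005 has committed, so a test such as
+ * recordXid >= recoveryTargetXid could fire on an unrelated transaction;
+ * only the equality test is safe.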
+ */ + if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive && + recordXid == recoveryTargetXid) + { + recoveryStopAfter = true; + recoveryStopXid = recordXid; + recoveryStopTime = recordXtime; + recoveryStopLSN = InvalidXLogRecPtr; + recoveryStopName[0] = '\0'; + + if (xact_info == XLOG_XACT_COMMIT || + xact_info == XLOG_XACT_COMMIT_PREPARED) + { + ereport(LOG, + (errmsg("recovery stopping after commit of transaction %u, time %s", + recoveryStopXid, + timestamptz_to_str(recoveryStopTime)))); + } + else if (xact_info == XLOG_XACT_ABORT || + xact_info == XLOG_XACT_ABORT_PREPARED) + { + ereport(LOG, + (errmsg("recovery stopping after abort of transaction %u, time %s", + recoveryStopXid, + timestamptz_to_str(recoveryStopTime)))); + } + return true; + } + } + + /* Check if we should stop as soon as reaching consistency */ + if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency) + { + ereport(LOG, + (errmsg("recovery stopping after reaching consistency"))); + + recoveryStopAfter = true; + recoveryStopXid = InvalidTransactionId; + recoveryStopTime = 0; + recoveryStopLSN = InvalidXLogRecPtr; + recoveryStopName[0] = '\0'; + return true; + } + + return false; +} + +/* + * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED. + * + * endOfRecovery is true if the recovery target is reached and + * the paused state starts at the end of recovery because of + * recovery_target_action=pause, and false otherwise. + */ +static void +recoveryPausesHere(bool endOfRecovery) +{ + /* Don't pause unless users can connect! */ + if (!LocalHotStandbyActive) + return; + + /* Don't pause after standby promotion has been triggered */ + if (LocalPromoteIsTriggered) + return; + + if (endOfRecovery) + ereport(LOG, + (errmsg("pausing at the end of recovery"), + errhint("Execute pg_wal_replay_resume() to promote."))); + else + ereport(LOG, + (errmsg("recovery has paused"), + errhint("Execute pg_wal_replay_resume() to continue."))); + + /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */ + while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED) + { + HandleStartupProcInterrupts(); + if (CheckForStandbyTrigger()) + return; + + /* + * If recovery pause is requested then set it paused. While we are in + * the loop, user might resume and pause again so set this every time. + */ + ConfirmRecoveryPaused(); + + /* + * We wait on a condition variable that will wake us as soon as the + * pause ends, but we use a timeout so we can check the above exit + * condition periodically too. + */ + ConditionVariableTimedSleep(&XLogCtl->recoveryNotPausedCV, 1000, + WAIT_EVENT_RECOVERY_PAUSE); + } + ConditionVariableCancelSleep(); +} + +/* + * Get the current state of the recovery pause request. + */ +RecoveryPauseState +GetRecoveryPauseState(void) +{ + RecoveryPauseState state; + + SpinLockAcquire(&XLogCtl->info_lck); + state = XLogCtl->recoveryPauseState; + SpinLockRelease(&XLogCtl->info_lck); + + return state; +} + +/* + * Set the recovery pause state. + * + * If recovery pause is requested then sets the recovery pause state to + * 'pause requested' if it is not already 'paused'. Otherwise, sets it + * to 'not paused' to resume the recovery. The recovery pause will be + * confirmed by the ConfirmRecoveryPaused. 
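+ *
+ * Sketch of the resulting state machine (as implemented here and in
+ * ConfirmRecoveryPaused() below): RECOVERY_NOT_PAUSED -> RECOVERY_PAUSE_REQUESTED
+ * on a pause request, RECOVERY_PAUSE_REQUESTED -> RECOVERY_PAUSED once the
+ * startup process confirms it, and any state -> RECOVERY_NOT_PAUSED on
+ * resume, at which point the condition variable is broadcast to wake up the
+ * waiters.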
+ */ +void +SetRecoveryPause(bool recoveryPause) +{ + SpinLockAcquire(&XLogCtl->info_lck); + + if (!recoveryPause) + XLogCtl->recoveryPauseState = RECOVERY_NOT_PAUSED; + else if (XLogCtl->recoveryPauseState == RECOVERY_NOT_PAUSED) + XLogCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED; + + SpinLockRelease(&XLogCtl->info_lck); + + if (!recoveryPause) + ConditionVariableBroadcast(&XLogCtl->recoveryNotPausedCV); +} + +/* + * Confirm the recovery pause by setting the recovery pause state to + * RECOVERY_PAUSED. + */ +static void +ConfirmRecoveryPaused(void) +{ + /* If recovery pause is requested then set it paused */ + SpinLockAcquire(&XLogCtl->info_lck); + if (XLogCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED) + XLogCtl->recoveryPauseState = RECOVERY_PAUSED; + SpinLockRelease(&XLogCtl->info_lck); +} + +/* + * When recovery_min_apply_delay is set, we wait long enough to make sure + * certain record types are applied at least that interval behind the primary. + * + * Returns true if we waited. + * + * Note that the delay is calculated between the WAL record log time and + * the current time on standby. We would prefer to keep track of when this + * standby received each WAL record, which would allow a more consistent + * approach and one not affected by time synchronisation issues, but that + * is significantly more effort and complexity for little actual gain in + * usability. + */ +static bool +recoveryApplyDelay(XLogReaderState *record) +{ + uint8 xact_info; + TimestampTz xtime; + TimestampTz delayUntil; + long msecs; + + /* nothing to do if no delay configured */ + if (recovery_min_apply_delay <= 0) + return false; + + /* no delay is applied on a database not yet consistent */ + if (!reachedConsistency) + return false; + + /* nothing to do if crash recovery is requested */ + if (!ArchiveRecoveryRequested) + return false; + + /* + * Is it a COMMIT record? + * + * We deliberately choose not to delay aborts since they have no effect on + * MVCC. We already allow replay of records that don't have a timestamp, + * so there is already opportunity for issues caused by early conflicts on + * standbys. + */ + if (XLogRecGetRmid(record) != RM_XACT_ID) + return false; + + xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK; + + if (xact_info != XLOG_XACT_COMMIT && + xact_info != XLOG_XACT_COMMIT_PREPARED) + return false; + + if (!getRecordTimestamp(record, &xtime)) + return false; + + delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay); + + /* + * Exit without arming the latch if it's already past time to apply this + * record + */ + msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil); + if (msecs <= 0) + return false; + + while (true) + { + ResetLatch(&XLogCtl->recoveryWakeupLatch); + + /* + * This might change recovery_min_apply_delay or the trigger file's + * location. + */ + HandleStartupProcInterrupts(); + + if (CheckForStandbyTrigger()) + break; + + /* + * Recalculate delayUntil as recovery_min_apply_delay could have + * changed while waiting in this loop. + */ + delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay); + + /* + * Wait for difference between GetCurrentTimestamp() and delayUntil. 
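+ *
+ * (Worked example with made-up settings: a commit record carrying timestamp
+ * 12:00:00 and recovery_min_apply_delay = '5min' gives a delayUntil of
+ * 12:05:00; the loop then sleeps on the wakeup latch for the remaining
+ * milliseconds, re-checking promotion triggers and any changed value of
+ * recovery_min_apply_delay each time it wakes up.)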
+ */ + msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), + delayUntil); + + if (msecs <= 0) + break; + + elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs); + + (void) WaitLatch(&XLogCtl->recoveryWakeupLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + msecs, + WAIT_EVENT_RECOVERY_APPLY_DELAY); + } + return true; +} + +/* + * Save timestamp of latest processed commit/abort record. + * + * We keep this in XLogCtl, not a simple static variable, so that it can be + * seen by processes other than the startup process. Note in particular + * that CreateRestartPoint is executed in the checkpointer. + */ +static void +SetLatestXTime(TimestampTz xtime) +{ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->recoveryLastXTime = xtime; + SpinLockRelease(&XLogCtl->info_lck); +} + +/* + * Fetch timestamp of latest processed commit/abort record. + */ +TimestampTz +GetLatestXTime(void) +{ + TimestampTz xtime; + + SpinLockAcquire(&XLogCtl->info_lck); + xtime = XLogCtl->recoveryLastXTime; + SpinLockRelease(&XLogCtl->info_lck); + + return xtime; +} + +/* + * Save timestamp of the next chunk of WAL records to apply. + * + * We keep this in XLogCtl, not a simple static variable, so that it can be + * seen by all backends. + */ +static void +SetCurrentChunkStartTime(TimestampTz xtime) +{ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->currentChunkStartTime = xtime; + SpinLockRelease(&XLogCtl->info_lck); +} + +/* + * Fetch timestamp of latest processed commit/abort record. + * Startup process maintains an accurate local copy in XLogReceiptTime + */ +TimestampTz +GetCurrentChunkReplayStartTime(void) +{ + TimestampTz xtime; + + SpinLockAcquire(&XLogCtl->info_lck); + xtime = XLogCtl->currentChunkStartTime; + SpinLockRelease(&XLogCtl->info_lck); + + return xtime; +} + +/* + * Returns time of receipt of current chunk of XLOG data, as well as + * whether it was received from streaming replication or from archives. + */ +void +GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream) +{ + /* + * This must be executed in the startup process, since we don't export the + * relevant state to shared memory. + */ + Assert(InRecovery); + + *rtime = XLogReceiptTime; + *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM); +} + +/* + * Note that text field supplied is a parameter name and does not require + * translation + */ +static void +RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue) +{ + if (currValue < minValue) + { + if (LocalHotStandbyActive) + { + bool warned_for_promote = false; + + ereport(WARNING, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("hot standby is not possible because of insufficient parameter settings"), + errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.", + param_name, + currValue, + minValue))); + + SetRecoveryPause(true); + + ereport(LOG, + (errmsg("recovery has paused"), + errdetail("If recovery is unpaused, the server will shut down."), + errhint("You can then restart the server after making the necessary configuration changes."))); + + while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED) + { + HandleStartupProcInterrupts(); + + if (CheckForStandbyTrigger()) + { + if (!warned_for_promote) + ereport(WARNING, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("promotion is not possible because of insufficient parameter settings"), + + /* + * Repeat the detail from above so it's easy to find + * in the log. 
+ */ + errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.", + param_name, + currValue, + minValue), + errhint("Restart the server after making the necessary configuration changes."))); + warned_for_promote = true; + } + + /* + * If recovery pause is requested then set it paused. While + * we are in the loop, user might resume and pause again so + * set this every time. + */ + ConfirmRecoveryPaused(); + + /* + * We wait on a condition variable that will wake us as soon + * as the pause ends, but we use a timeout so we can check the + * above conditions periodically too. + */ + ConditionVariableTimedSleep(&XLogCtl->recoveryNotPausedCV, 1000, + WAIT_EVENT_RECOVERY_PAUSE); + } + ConditionVariableCancelSleep(); + } + + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("recovery aborted because of insufficient parameter settings"), + /* Repeat the detail from above so it's easy to find in the log. */ + errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.", + param_name, + currValue, + minValue), + errhint("You can restart the server after making the necessary configuration changes."))); + } +} + +/* + * Check to see if required parameters are set high enough on this server + * for various aspects of recovery operation. + * + * Note that all the parameters which this function tests need to be + * listed in Administrator's Overview section in high-availability.sgml. + * If you change them, don't forget to update the list. + */ +static void +CheckRequiredParameterValues(void) +{ + /* + * For archive recovery, the WAL must be generated with at least 'replica' + * wal_level. + */ + if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL) + { + ereport(FATAL, + (errmsg("WAL was generated with wal_level=minimal, cannot continue recovering"), + errdetail("This happens if you temporarily set wal_level=minimal on the server."), + errhint("Use a backup taken after setting wal_level to higher than minimal."))); + } + + /* + * For Hot Standby, the WAL must be generated with 'replica' mode, and we + * must have at least as many backend slots as the primary. + */ + if (ArchiveRecoveryRequested && EnableHotStandby) + { + /* We ignore autovacuum_max_workers when we make this test. 
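+ *
+ * (Example of a failing case: if the primary ran with
+ * max_prepared_transactions = 10 and this standby is configured with 0, the
+ * RecoveryRequiresIntParameter() call below reports the mismatch and
+ * ultimately aborts recovery with FATAL, after an optional pause when hot
+ * standby is already active.)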
*/ + RecoveryRequiresIntParameter("max_connections", + MaxConnections, + ControlFile->MaxConnections); + RecoveryRequiresIntParameter("max_worker_processes", + max_worker_processes, + ControlFile->max_worker_processes); + RecoveryRequiresIntParameter("max_wal_senders", + max_wal_senders, + ControlFile->max_wal_senders); + RecoveryRequiresIntParameter("max_prepared_transactions", + max_prepared_xacts, + ControlFile->max_prepared_xacts); + RecoveryRequiresIntParameter("max_locks_per_transaction", + max_locks_per_xact, + ControlFile->max_locks_per_xact); + } +} + +/* + * This must be called ONCE during postmaster or standalone-backend startup + */ +void +StartupXLOG(void) +{ + XLogCtlInsert *Insert; + CheckPoint checkPoint; + bool wasShutdown; + bool reachedRecoveryTarget = false; + bool haveBackupLabel = false; + bool haveTblspcMap = false; + XLogRecPtr RecPtr, + checkPointLoc, + EndOfLog; + TimeLineID EndOfLogTLI; + TimeLineID PrevTimeLineID; + XLogRecord *record; + TransactionId oldestActiveXID; + bool backupEndRequired = false; + bool backupFromStandby = false; + DBState dbstate_at_startup; + XLogReaderState *xlogreader; + XLogPageReadPrivate private; + bool promoted = false; + struct stat st; + + /* + * We should have an aux process resource owner to use, and we should not + * be in a transaction that's installed some other resowner. + */ + Assert(AuxProcessResourceOwner != NULL); + Assert(CurrentResourceOwner == NULL || + CurrentResourceOwner == AuxProcessResourceOwner); + CurrentResourceOwner = AuxProcessResourceOwner; + + /* + * Check that contents look valid. + */ + if (!XRecOffIsValid(ControlFile->checkPoint)) + ereport(FATAL, + (errmsg("control file contains invalid checkpoint location"))); + + switch (ControlFile->state) + { + case DB_SHUTDOWNED: + + /* + * This is the expected case, so don't be chatty in standalone + * mode + */ + ereport(IsPostmasterEnvironment ? LOG : NOTICE, + (errmsg("database system was shut down at %s", + str_time(ControlFile->time)))); + break; + + case DB_SHUTDOWNED_IN_RECOVERY: + ereport(LOG, + (errmsg("database system was shut down in recovery at %s", + str_time(ControlFile->time)))); + break; + + case DB_SHUTDOWNING: + ereport(LOG, + (errmsg("database system shutdown was interrupted; last known up at %s", + str_time(ControlFile->time)))); + break; + + case DB_IN_CRASH_RECOVERY: + ereport(LOG, + (errmsg("database system was interrupted while in recovery at %s", + str_time(ControlFile->time)), + errhint("This probably means that some data is corrupted and" + " you will have to use the last backup for recovery."))); + break; + + case DB_IN_ARCHIVE_RECOVERY: + ereport(LOG, + (errmsg("database system was interrupted while in recovery at log time %s", + str_time(ControlFile->checkPointCopy.time)), + errhint("If this has occurred more than once some data might be corrupted" + " and you might need to choose an earlier recovery target."))); + break; + + case DB_IN_PRODUCTION: + ereport(LOG, + (errmsg("database system was interrupted; last known up at %s", + str_time(ControlFile->time)))); + break; + + default: + ereport(FATAL, + (errmsg("control file contains invalid database cluster state"))); + } + + /* This is just to allow attaching to startup process with a debugger */ +#ifdef XLOG_REPLAY_DELAY + if (ControlFile->state != DB_SHUTDOWNED) + pg_usleep(60000000L); +#endif + + /* + * Verify that pg_wal and pg_wal/archive_status exist. 
In cases where + * someone has performed a copy for PITR, these directories may have been + * excluded and need to be re-created. + */ + ValidateXLOGDirectoryStructure(); + + /*---------- + * If we previously crashed, perform a couple of actions: + * + * - The pg_wal directory may still include some temporary WAL segments + * used when creating a new segment, so perform some clean up to not + * bloat this path. This is done first as there is no point to sync + * this temporary data. + * + * - There might be data which we had written, intending to fsync it, but + * which we had not actually fsync'd yet. Therefore, a power failure in + * the near future might cause earlier unflushed writes to be lost, even + * though more recent data written to disk from here on would be + * persisted. To avoid that, fsync the entire data directory. + */ + if (ControlFile->state != DB_SHUTDOWNED && + ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY) + { + RemoveTempXlogFiles(); + SyncDataDirectory(); + } + + /* + * Initialize on the assumption we want to recover to the latest timeline + * that's active according to pg_control. + */ + if (ControlFile->minRecoveryPointTLI > + ControlFile->checkPointCopy.ThisTimeLineID) + recoveryTargetTLI = ControlFile->minRecoveryPointTLI; + else + recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID; + + /* + * Check for signal files, and if so set up state for offline recovery + */ + readRecoverySignalFile(); + validateRecoveryParameters(); + + if (ArchiveRecoveryRequested) + { + if (StandbyModeRequested) + ereport(LOG, + (errmsg("entering standby mode"))); + else if (recoveryTarget == RECOVERY_TARGET_XID) + ereport(LOG, + (errmsg("starting point-in-time recovery to XID %u", + recoveryTargetXid))); + else if (recoveryTarget == RECOVERY_TARGET_TIME) + ereport(LOG, + (errmsg("starting point-in-time recovery to %s", + timestamptz_to_str(recoveryTargetTime)))); + else if (recoveryTarget == RECOVERY_TARGET_NAME) + ereport(LOG, + (errmsg("starting point-in-time recovery to \"%s\"", + recoveryTargetName))); + else if (recoveryTarget == RECOVERY_TARGET_LSN) + ereport(LOG, + (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"", + LSN_FORMAT_ARGS(recoveryTargetLSN)))); + else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE) + ereport(LOG, + (errmsg("starting point-in-time recovery to earliest consistent point"))); + else + ereport(LOG, + (errmsg("starting archive recovery"))); + } + + /* + * Take ownership of the wakeup latch if we're going to sleep during + * recovery. + */ + if (ArchiveRecoveryRequested) + OwnLatch(&XLogCtl->recoveryWakeupLatch); + + /* Set up XLOG reader facility */ + MemSet(&private, 0, sizeof(XLogPageReadPrivate)); + xlogreader = + XLogReaderAllocate(wal_segment_size, NULL, + XL_ROUTINE(.page_read = &XLogPageRead, + .segment_open = NULL, + .segment_close = wal_segment_close), + &private); + if (!xlogreader) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed while allocating a WAL reading processor."))); + xlogreader->system_identifier = ControlFile->system_identifier; + + /* + * Allocate two page buffers dedicated to WAL consistency checks. We do + * it this way, rather than just making static arrays, for two reasons: + * (1) no need to waste the storage in most instantiations of the backend; + * (2) a static char array isn't guaranteed to have any particular + * alignment, whereas palloc() will provide MAXALIGN'd storage. 
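+ *
+ * (These buffers are only exercised when wal_consistency_checking is enabled
+ * for a resource manager, in which case the redo loop compares a masked copy
+ * of the page just replayed against the masked full-page image carried by
+ * the WAL record; with the default 8 kB BLCKSZ this costs 16 kB per startup
+ * process.)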
+ */ + replay_image_masked = (char *) palloc(BLCKSZ); + primary_image_masked = (char *) palloc(BLCKSZ); + + if (read_backup_label(&checkPointLoc, &backupEndRequired, + &backupFromStandby)) + { + List *tablespaces = NIL; + + /* + * Archive recovery was requested, and thanks to the backup label + * file, we know how far we need to replay to reach consistency. Enter + * archive recovery directly. + */ + InArchiveRecovery = true; + if (StandbyModeRequested) + StandbyMode = true; + + /* + * When a backup_label file is present, we want to roll forward from + * the checkpoint it identifies, rather than using pg_control. + */ + record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true); + if (record != NULL) + { + memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); + wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN); + ereport(DEBUG1, + (errmsg_internal("checkpoint record is at %X/%X", + LSN_FORMAT_ARGS(checkPointLoc)))); + InRecovery = true; /* force recovery even if SHUTDOWNED */ + + /* + * Make sure that REDO location exists. This may not be the case + * if there was a crash during an online backup, which left a + * backup_label around that references a WAL segment that's + * already been archived. + */ + if (checkPoint.redo < checkPointLoc) + { + XLogBeginRead(xlogreader, checkPoint.redo); + if (!ReadRecord(xlogreader, LOG, false)) + ereport(FATAL, + (errmsg("could not find redo location referenced by checkpoint record"), + errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n" + "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" + "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", + DataDir, DataDir, DataDir))); + } + } + else + { + ereport(FATAL, + (errmsg("could not locate required checkpoint record"), + errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n" + "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" + "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", + DataDir, DataDir, DataDir))); + wasShutdown = false; /* keep compiler quiet */ + } + + /* read the tablespace_map file if present and create symlinks. */ + if (read_tablespace_map(&tablespaces)) + { + ListCell *lc; + + foreach(lc, tablespaces) + { + tablespaceinfo *ti = lfirst(lc); + char *linkloc; + + linkloc = psprintf("pg_tblspc/%s", ti->oid); + + /* + * Remove the existing symlink if any and Create the symlink + * under PGDATA. + */ + remove_tablespace_symlink(linkloc); + + if (symlink(ti->path, linkloc) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create symbolic link \"%s\": %m", + linkloc))); + + pfree(ti->oid); + pfree(ti->path); + pfree(ti); + } + + /* set flag to delete it later */ + haveTblspcMap = true; + } + + /* set flag to delete it later */ + haveBackupLabel = true; + } + else + { + /* + * If tablespace_map file is present without backup_label file, there + * is no use of such file. There is no harm in retaining it, but it + * is better to get rid of the map file so that we don't have any + * redundant file in data directory and it will avoid any sort of + * confusion. 
It seems prudent though to just rename the file out of + * the way rather than delete it completely, also we ignore any error + * that occurs in rename operation as even if map file is present + * without backup_label file, it is harmless. + */ + if (stat(TABLESPACE_MAP, &st) == 0) + { + unlink(TABLESPACE_MAP_OLD); + if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0) + ereport(LOG, + (errmsg("ignoring file \"%s\" because no file \"%s\" exists", + TABLESPACE_MAP, BACKUP_LABEL_FILE), + errdetail("File \"%s\" was renamed to \"%s\".", + TABLESPACE_MAP, TABLESPACE_MAP_OLD))); + else + ereport(LOG, + (errmsg("ignoring file \"%s\" because no file \"%s\" exists", + TABLESPACE_MAP, BACKUP_LABEL_FILE), + errdetail("Could not rename file \"%s\" to \"%s\": %m.", + TABLESPACE_MAP, TABLESPACE_MAP_OLD))); + } + + /* + * It's possible that archive recovery was requested, but we don't + * know how far we need to replay the WAL before we reach consistency. + * This can happen for example if a base backup is taken from a + * running server using an atomic filesystem snapshot, without calling + * pg_start/stop_backup. Or if you just kill a running primary server + * and put it into archive recovery by creating a recovery signal + * file. + * + * Our strategy in that case is to perform crash recovery first, + * replaying all the WAL present in pg_wal, and only enter archive + * recovery after that. + * + * But usually we already know how far we need to replay the WAL (up + * to minRecoveryPoint, up to backupEndPoint, or until we see an + * end-of-backup record), and we can enter archive recovery directly. + */ + if (ArchiveRecoveryRequested && + (ControlFile->minRecoveryPoint != InvalidXLogRecPtr || + ControlFile->backupEndRequired || + ControlFile->backupEndPoint != InvalidXLogRecPtr || + ControlFile->state == DB_SHUTDOWNED)) + { + InArchiveRecovery = true; + if (StandbyModeRequested) + StandbyMode = true; + } + + /* Get the last valid checkpoint record. */ + checkPointLoc = ControlFile->checkPoint; + RedoStartLSN = ControlFile->checkPointCopy.redo; + record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true); + if (record != NULL) + { + ereport(DEBUG1, + (errmsg_internal("checkpoint record is at %X/%X", + LSN_FORMAT_ARGS(checkPointLoc)))); + } + else + { + /* + * We used to attempt to go back to a secondary checkpoint record + * here, but only when not in standby mode. We now just fail if we + * can't read the last checkpoint because this allows us to + * simplify processing around checkpoints. + */ + ereport(PANIC, + (errmsg("could not locate a valid checkpoint record"))); + } + memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); + wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN); + } + + /* + * Clear out any old relcache cache files. This is *necessary* if we do + * any WAL replay, since that would probably result in the cache files + * being out of sync with database reality. In theory we could leave them + * in place if the database had been cleanly shut down, but it seems + * safest to just remove them always and let them be rebuilt during the + * first backend startup. These files needs to be removed from all + * directories including pg_tblspc, however the symlinks are created only + * after reading tablespace_map file in case of archive recovery from + * backup, so needs to clear old relcache files here after creating + * symlinks. 
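+ *
+ * (Concretely, the files removed are the "pg_internal.init" relcache init
+ * files: the shared one under global/ plus one at the top of each database
+ * directory, including database directories reached via pg_tblspc symlinks.)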
+ */ + RelationCacheInitFileRemove(); + + /* + * If the location of the checkpoint record is not on the expected + * timeline in the history of the requested timeline, we cannot proceed: + * the backup is not part of the history of the requested timeline. + */ + Assert(expectedTLEs); /* was initialized by reading checkpoint + * record */ + if (tliOfPointInHistory(checkPointLoc, expectedTLEs) != + checkPoint.ThisTimeLineID) + { + XLogRecPtr switchpoint; + + /* + * tliSwitchPoint will throw an error if the checkpoint's timeline is + * not in expectedTLEs at all. + */ + switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL); + ereport(FATAL, + (errmsg("requested timeline %u is not a child of this server's history", + recoveryTargetTLI), + errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.", + LSN_FORMAT_ARGS(ControlFile->checkPoint), + ControlFile->checkPointCopy.ThisTimeLineID, + LSN_FORMAT_ARGS(switchpoint)))); + } + + /* + * The min recovery point should be part of the requested timeline's + * history, too. + */ + if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) && + tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) != + ControlFile->minRecoveryPointTLI) + ereport(FATAL, + (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u", + recoveryTargetTLI, + LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint), + ControlFile->minRecoveryPointTLI))); + + LastRec = RecPtr = checkPointLoc; + + ereport(DEBUG1, + (errmsg_internal("redo record is at %X/%X; shutdown %s", + LSN_FORMAT_ARGS(checkPoint.redo), + wasShutdown ? "true" : "false"))); + ereport(DEBUG1, + (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u", + U64FromFullTransactionId(checkPoint.nextXid), + checkPoint.nextOid))); + ereport(DEBUG1, + (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u", + checkPoint.nextMulti, checkPoint.nextMultiOffset))); + ereport(DEBUG1, + (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u", + checkPoint.oldestXid, checkPoint.oldestXidDB))); + ereport(DEBUG1, + (errmsg_internal("oldest MultiXactId: %u, in database %u", + checkPoint.oldestMulti, checkPoint.oldestMultiDB))); + ereport(DEBUG1, + (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u", + checkPoint.oldestCommitTsXid, + checkPoint.newestCommitTsXid))); + if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid))) + ereport(PANIC, + (errmsg("invalid next transaction ID"))); + + /* initialize shared memory variables from the checkpoint record */ + ShmemVariableCache->nextXid = checkPoint.nextXid; + ShmemVariableCache->nextOid = checkPoint.nextOid; + ShmemVariableCache->oidCount = 0; + MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); + AdvanceOldestClogXid(checkPoint.oldestXid); + SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); + SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true); + SetCommitTsLimit(checkPoint.oldestCommitTsXid, + checkPoint.newestCommitTsXid); + XLogCtl->ckptFullXid = checkPoint.nextXid; + + /* + * Initialize replication slots, before there's a chance to remove + * required resources. + */ + StartupReplicationSlots(); + + /* + * Startup logical state, needs to be setup now so we have proper data + * during crash recovery. + */ + StartupReorderBuffer(); + + /* + * Startup CLOG. 
This must be done after ShmemVariableCache->nextXid has + * been initialized and before we accept connections or begin WAL replay. + */ + StartupCLOG(); + + /* + * Startup MultiXact. We need to do this early to be able to replay + * truncations. + */ + StartupMultiXact(); + + /* + * Ditto for commit timestamps. Activate the facility if the setting is + * enabled in the control file, as there should be no tracking of commit + * timestamps done when the setting was disabled. This facility can be + * started or stopped when replaying a XLOG_PARAMETER_CHANGE record. + */ + if (ControlFile->track_commit_timestamp) + StartupCommitTs(); + + /* + * Recover knowledge about replay progress of known replication partners. + */ + StartupReplicationOrigin(); + + /* + * Initialize unlogged LSN. On a clean shutdown, it's restored from the + * control file. On recovery, all unlogged relations are blown away, so + * the unlogged LSN counter can be reset too. + */ + if (ControlFile->state == DB_SHUTDOWNED) + XLogCtl->unloggedLSN = ControlFile->unloggedLSN; + else + XLogCtl->unloggedLSN = FirstNormalUnloggedLSN; + + /* + * We must replay WAL entries using the same TimeLineID they were created + * under, so temporarily adopt the TLI indicated by the checkpoint (see + * also xlog_redo()). + */ + ThisTimeLineID = checkPoint.ThisTimeLineID; + + /* + * Copy any missing timeline history files between 'now' and the recovery + * target timeline from archive to pg_wal. While we don't need those files + * ourselves - the history file of the recovery target timeline covers all + * the previous timelines in the history too - a cascading standby server + * might be interested in them. Or, if you archive the WAL from this + * server to a different archive than the primary, it'd be good for all + * the history files to get archived there after failover, so that you can + * use one of the old timelines as a PITR target. Timeline history files + * are small, so it's better to copy them unnecessarily than not copy them + * and regret later. + */ + restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI); + + /* + * Before running in recovery, scan pg_twophase and fill in its status to + * be able to work on entries generated by redo. Doing a scan before + * taking any recovery action has the merit to discard any 2PC files that + * are newer than the first record to replay, saving from any conflicts at + * replay. This avoids as well any subsequent scans when doing recovery + * of the on-disk two-phase data. + */ + restoreTwoPhaseData(); + + lastFullPageWrites = checkPoint.fullPageWrites; + + RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo; + doPageWrites = lastFullPageWrites; + + if (RecPtr < checkPoint.redo) + ereport(PANIC, + (errmsg("invalid redo in checkpoint record"))); + + /* + * Check whether we need to force recovery from WAL. If it appears to + * have been a clean shutdown and we did not have a recovery signal file, + * then assume no recovery needed. + */ + if (checkPoint.redo < RecPtr) + { + if (wasShutdown) + ereport(PANIC, + (errmsg("invalid redo record in shutdown checkpoint"))); + InRecovery = true; + } + else if (ControlFile->state != DB_SHUTDOWNED) + InRecovery = true; + else if (ArchiveRecoveryRequested) + { + /* force recovery due to presence of recovery signal file */ + InRecovery = true; + } + + /* + * Start recovery assuming that the final record isn't lost. 
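+ *
+ * If ReadRecord() later determines that the final record's continuation
+ * never made it to disk, the xlogreader reports where the broken record
+ * started and where its missing part would have begun, roughly:
+ *
+ *     abortedRecPtr = xlogreader->abortedRecPtr;
+ *     missingContrecPtr = xlogreader->missingContrecPtr;
+ *
+ * Once WAL writing is re-enabled we then emit an OVERWRITE_CONTRECORD record
+ * at abortedRecPtr (see CreateOverwriteContrecordRecord() below) so that
+ * future replay knows the leftover bytes after that point are to be ignored.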
+ */ + abortedRecPtr = InvalidXLogRecPtr; + missingContrecPtr = InvalidXLogRecPtr; + + /* REDO */ + if (InRecovery) + { + int rmid; + + /* + * Update pg_control to show that we are recovering and to show the + * selected checkpoint as the place we are starting from. We also mark + * pg_control with any minimum recovery stop point obtained from a + * backup history file. + */ + dbstate_at_startup = ControlFile->state; + if (InArchiveRecovery) + { + ControlFile->state = DB_IN_ARCHIVE_RECOVERY; + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE; + SpinLockRelease(&XLogCtl->info_lck); + } + else + { + ereport(LOG, + (errmsg("database system was not properly shut down; " + "automatic recovery in progress"))); + if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID) + ereport(LOG, + (errmsg("crash recovery starts in timeline %u " + "and has target timeline %u", + ControlFile->checkPointCopy.ThisTimeLineID, + recoveryTargetTLI))); + ControlFile->state = DB_IN_CRASH_RECOVERY; + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH; + SpinLockRelease(&XLogCtl->info_lck); + } + ControlFile->checkPoint = checkPointLoc; + ControlFile->checkPointCopy = checkPoint; + if (InArchiveRecovery) + { + /* initialize minRecoveryPoint if not set yet */ + if (ControlFile->minRecoveryPoint < checkPoint.redo) + { + ControlFile->minRecoveryPoint = checkPoint.redo; + ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID; + } + } + + /* + * Set backupStartPoint if we're starting recovery from a base backup. + * + * Also set backupEndPoint and use minRecoveryPoint as the backup end + * location if we're starting recovery from a base backup which was + * taken from a standby. In this case, the database system status in + * pg_control must indicate that the database was already in recovery. + * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be + * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted + * before reaching this point; e.g. because restore_command or + * primary_conninfo were faulty. + * + * Any other state indicates that the backup somehow became corrupted + * and we can't sensibly continue with recovery. + */ + if (haveBackupLabel) + { + ControlFile->backupStartPoint = checkPoint.redo; + ControlFile->backupEndRequired = backupEndRequired; + + if (backupFromStandby) + { + if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY && + dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY) + ereport(FATAL, + (errmsg("backup_label contains data inconsistent with control file"), + errhint("This means that the backup is corrupted and you will " + "have to use another backup for recovery."))); + ControlFile->backupEndPoint = ControlFile->minRecoveryPoint; + } + } + ControlFile->time = (pg_time_t) time(NULL); + /* No need to hold ControlFileLock yet, we aren't up far enough */ + UpdateControlFile(); + + /* + * Initialize our local copy of minRecoveryPoint. When doing crash + * recovery we want to replay up to the end of WAL. Particularly, in + * the case of a promoted standby minRecoveryPoint value in the + * control file is only updated after the first checkpoint. However, + * if the instance crashes before the first post-recovery checkpoint + * is completed then recovery will use a stale location causing the + * startup process to think that there are still invalid page + * references when checking for data consistency. 
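+ *
+ * CheckRecoveryConsistency() relies on this convention: during crash
+ * recovery the local minRecoveryPoint stays invalid, its early-exit test
+ *
+ *     if (XLogRecPtrIsInvalid(minRecoveryPoint))
+ *         return;
+ *
+ * fires on every call, and no consistent state is reported until all
+ * available WAL has been replayed.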
+ */ + if (InArchiveRecovery) + { + minRecoveryPoint = ControlFile->minRecoveryPoint; + minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + } + else + { + minRecoveryPoint = InvalidXLogRecPtr; + minRecoveryPointTLI = 0; + } + + /* + * Reset pgstat data, because it may be invalid after recovery. + */ + pgstat_reset_all(); + + /* + * If there was a backup label file, it's done its job and the info + * has now been propagated into pg_control. We must get rid of the + * label file so that if we crash during recovery, we'll pick up at + * the latest recovery restartpoint instead of going all the way back + * to the backup start point. It seems prudent though to just rename + * the file out of the way rather than delete it completely. + */ + if (haveBackupLabel) + { + unlink(BACKUP_LABEL_OLD); + durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL); + } + + /* + * If there was a tablespace_map file, it's done its job and the + * symlinks have been created. We must get rid of the map file so + * that if we crash during recovery, we don't create symlinks again. + * It seems prudent though to just rename the file out of the way + * rather than delete it completely. + */ + if (haveTblspcMap) + { + unlink(TABLESPACE_MAP_OLD); + durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL); + } + + /* Check that the GUCs used to generate the WAL allow recovery */ + CheckRequiredParameterValues(); + + /* + * We're in recovery, so unlogged relations may be trashed and must be + * reset. This should be done BEFORE allowing Hot Standby + * connections, so that read-only backends don't try to read whatever + * garbage is left over from before. + */ + ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP); + + /* + * Likewise, delete any saved transaction snapshot files that got left + * behind by crashed backends. + */ + DeleteAllExportedSnapshotFiles(); + + /* + * Initialize for Hot Standby, if enabled. We won't let backends in + * yet, not until we've reached the min recovery point specified in + * control file and we've established a recovery snapshot from a + * running-xacts WAL record. + */ + if (ArchiveRecoveryRequested && EnableHotStandby) + { + TransactionId *xids; + int nxids; + + ereport(DEBUG1, + (errmsg_internal("initializing for hot standby"))); + + InitRecoveryTransactionEnvironment(); + + if (wasShutdown) + oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids); + else + oldestActiveXID = checkPoint.oldestActiveXid; + Assert(TransactionIdIsValid(oldestActiveXID)); + + /* Tell procarray about the range of xids it has to deal with */ + ProcArrayInitRecovery(XidFromFullTransactionId(ShmemVariableCache->nextXid)); + + /* + * Startup subtrans only. CLOG, MultiXact and commit timestamp + * have already been started up and other SLRUs are not maintained + * during recovery and need not be started yet. + */ + StartupSUBTRANS(oldestActiveXID); + + /* + * If we're beginning at a shutdown checkpoint, we know that + * nothing was running on the primary at this point. So fake-up an + * empty running-xacts record and use that here and now. Recover + * additional standby state for prepared transactions. + */ + if (wasShutdown) + { + RunningTransactionsData running; + TransactionId latestCompletedXid; + + /* + * Construct a RunningTransactions snapshot representing a + * shut down server, with only prepared transactions still + * alive. We're never overflowed at this point because all + * subxids are listed with their parent prepared transactions. 
+ */ + running.xcnt = nxids; + running.subxcnt = 0; + running.subxid_overflow = false; + running.nextXid = XidFromFullTransactionId(checkPoint.nextXid); + running.oldestRunningXid = oldestActiveXID; + latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid); + TransactionIdRetreat(latestCompletedXid); + Assert(TransactionIdIsNormal(latestCompletedXid)); + running.latestCompletedXid = latestCompletedXid; + running.xids = xids; + + ProcArrayApplyRecoveryInfo(&running); + + StandbyRecoverPreparedTransactions(); + } + } + + /* Initialize resource managers */ + for (rmid = 0; rmid <= RM_MAX_ID; rmid++) + { + if (RmgrTable[rmid].rm_startup != NULL) + RmgrTable[rmid].rm_startup(); + } + + /* + * Initialize shared variables for tracking progress of WAL replay, as + * if we had just replayed the record before the REDO location (or the + * checkpoint record itself, if it's a shutdown checkpoint). + */ + SpinLockAcquire(&XLogCtl->info_lck); + if (checkPoint.redo < RecPtr) + XLogCtl->replayEndRecPtr = checkPoint.redo; + else + XLogCtl->replayEndRecPtr = EndRecPtr; + XLogCtl->replayEndTLI = ThisTimeLineID; + XLogCtl->lastReplayedEndRecPtr = XLogCtl->replayEndRecPtr; + XLogCtl->lastReplayedTLI = XLogCtl->replayEndTLI; + XLogCtl->recoveryLastXTime = 0; + XLogCtl->currentChunkStartTime = 0; + XLogCtl->recoveryPauseState = RECOVERY_NOT_PAUSED; + SpinLockRelease(&XLogCtl->info_lck); + + /* Also ensure XLogReceiptTime has a sane value */ + XLogReceiptTime = GetCurrentTimestamp(); + + /* + * Let postmaster know we've started redo now, so that it can launch + * checkpointer to perform restartpoints. We don't bother during + * crash recovery as restartpoints can only be performed during + * archive recovery. And we'd like to keep crash recovery simple, to + * avoid introducing bugs that could affect you when recovering after + * crash. + * + * After this point, we can no longer assume that we're the only + * process in addition to postmaster! Also, fsync requests are + * subsequently to be handled by the checkpointer, not locally. + */ + if (ArchiveRecoveryRequested && IsUnderPostmaster) + { + PublishStartupProcessInformation(); + EnableSyncRequestForwarding(); + SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED); + bgwriterLaunched = true; + } + + /* + * Allow read-only connections immediately if we're consistent + * already. + */ + CheckRecoveryConsistency(); + + /* + * Find the first record that logically follows the checkpoint --- it + * might physically precede it, though. 
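+ *
+ * (The redo pointer is fixed before the checkpoint record itself is
+ * written, so on a system with concurrent WAL activity the usual layout is
+ * checkPoint.redo < checkPointLoc; in that case we back up to the redo
+ * pointer so that the WAL generated while the checkpoint was in progress
+ * gets replayed too.)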
+ */ + if (checkPoint.redo < RecPtr) + { + /* back up to find the record */ + XLogBeginRead(xlogreader, checkPoint.redo); + record = ReadRecord(xlogreader, PANIC, false); + } + else + { + /* just have to read next record after CheckPoint */ + record = ReadRecord(xlogreader, LOG, false); + } + + if (record != NULL) + { + ErrorContextCallback errcallback; + TimestampTz xtime; + PGRUsage ru0; + + pg_rusage_init(&ru0); + + InRedo = true; + + ereport(LOG, + (errmsg("redo starts at %X/%X", + LSN_FORMAT_ARGS(ReadRecPtr)))); + + /* + * main redo apply loop + */ + do + { + bool switchedTLI = false; + +#ifdef WAL_DEBUG + if (XLOG_DEBUG || + (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) || + (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3)) + { + StringInfoData buf; + + initStringInfo(&buf); + appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ", + LSN_FORMAT_ARGS(ReadRecPtr), + LSN_FORMAT_ARGS(EndRecPtr)); + xlog_outrec(&buf, xlogreader); + appendStringInfoString(&buf, " - "); + xlog_outdesc(&buf, xlogreader); + elog(LOG, "%s", buf.data); + pfree(buf.data); + } +#endif + + /* Handle interrupt signals of startup process */ + HandleStartupProcInterrupts(); + + /* + * Pause WAL replay, if requested by a hot-standby session via + * SetRecoveryPause(). + * + * Note that we intentionally don't take the info_lck spinlock + * here. We might therefore read a slightly stale value of + * the recoveryPause flag, but it can't be very stale (no + * worse than the last spinlock we did acquire). Since a + * pause request is a pretty asynchronous thing anyway, + * possibly responding to it one WAL record later than we + * otherwise would is a minor issue, so it doesn't seem worth + * adding another spinlock cycle to prevent that. + */ + if (((volatile XLogCtlData *) XLogCtl)->recoveryPauseState != + RECOVERY_NOT_PAUSED) + recoveryPausesHere(false); + + /* + * Have we reached our recovery target? + */ + if (recoveryStopsBefore(xlogreader)) + { + reachedRecoveryTarget = true; + break; + } + + /* + * If we've been asked to lag the primary, wait on latch until + * enough time has passed. + */ + if (recoveryApplyDelay(xlogreader)) + { + /* + * We test for paused recovery again here. If user sets + * delayed apply, it may be because they expect to pause + * recovery in case of problems, so we must test again + * here otherwise pausing during the delay-wait wouldn't + * work. + */ + if (((volatile XLogCtlData *) XLogCtl)->recoveryPauseState != + RECOVERY_NOT_PAUSED) + recoveryPausesHere(false); + } + + /* Setup error traceback support for ereport() */ + errcallback.callback = rm_redo_error_callback; + errcallback.arg = (void *) xlogreader; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* + * ShmemVariableCache->nextXid must be beyond record's xid. + */ + AdvanceNextFullTransactionIdPastXid(record->xl_xid); + + /* + * Before replaying this record, check if this record causes + * the current timeline to change. The record is already + * considered to be part of the new timeline, so we update + * ThisTimeLineID before replaying it. That's important so + * that replayEndTLI, which is recorded as the minimum + * recovery point's TLI if recovery stops after this record, + * is set correctly. 
+ */ + if (record->xl_rmid == RM_XLOG_ID) + { + TimeLineID newTLI = ThisTimeLineID; + TimeLineID prevTLI = ThisTimeLineID; + uint8 info = record->xl_info & ~XLR_INFO_MASK; + + if (info == XLOG_CHECKPOINT_SHUTDOWN) + { + CheckPoint checkPoint; + + memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); + newTLI = checkPoint.ThisTimeLineID; + prevTLI = checkPoint.PrevTimeLineID; + } + else if (info == XLOG_END_OF_RECOVERY) + { + xl_end_of_recovery xlrec; + + memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery)); + newTLI = xlrec.ThisTimeLineID; + prevTLI = xlrec.PrevTimeLineID; + } + + if (newTLI != ThisTimeLineID) + { + /* Check that it's OK to switch to this TLI */ + checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI); + + /* Following WAL records should be run with new TLI */ + ThisTimeLineID = newTLI; + switchedTLI = true; + } + } + + /* + * Update shared replayEndRecPtr before replaying this record, + * so that XLogFlush will update minRecoveryPoint correctly. + */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->replayEndRecPtr = EndRecPtr; + XLogCtl->replayEndTLI = ThisTimeLineID; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * If we are attempting to enter Hot Standby mode, process + * XIDs we see + */ + if (standbyState >= STANDBY_INITIALIZED && + TransactionIdIsValid(record->xl_xid)) + RecordKnownAssignedTransactionIds(record->xl_xid); + + /* Now apply the WAL record itself */ + RmgrTable[record->xl_rmid].rm_redo(xlogreader); + + /* + * After redo, check whether the backup pages associated with + * the WAL record are consistent with the existing pages. This + * check is done only if consistency check is enabled for this + * record. + */ + if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0) + checkXLogConsistency(xlogreader); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; + + /* + * Update lastReplayedEndRecPtr after this record has been + * successfully replayed. + */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->lastReplayedEndRecPtr = EndRecPtr; + XLogCtl->lastReplayedTLI = ThisTimeLineID; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * If rm_redo called XLogRequestWalReceiverReply, then we wake + * up the receiver so that it notices the updated + * lastReplayedEndRecPtr and sends a reply to the primary. + */ + if (doRequestWalReceiverReply) + { + doRequestWalReceiverReply = false; + WalRcvForceReply(); + } + + /* Remember this record as the last-applied one */ + LastRec = ReadRecPtr; + + /* Allow read-only connections if we're consistent now */ + CheckRecoveryConsistency(); + + /* Is this a timeline switch? */ + if (switchedTLI) + { + /* + * Before we continue on the new timeline, clean up any + * (possibly bogus) future WAL segments on the old + * timeline. + */ + RemoveNonParentXlogFiles(EndRecPtr, ThisTimeLineID); + + /* + * Wake up any walsenders to notice that we are on a new + * timeline. 
+ */ + if (AllowCascadeReplication()) + WalSndWakeup(); + } + + /* Exit loop if we reached inclusive recovery target */ + if (recoveryStopsAfter(xlogreader)) + { + reachedRecoveryTarget = true; + break; + } + + /* Else, try to fetch the next WAL record */ + record = ReadRecord(xlogreader, LOG, false); + } while (record != NULL); + + /* + * end of main redo apply loop + */ + + if (reachedRecoveryTarget) + { + if (!reachedConsistency) + ereport(FATAL, + (errmsg("requested recovery stop point is before consistent recovery point"))); + + /* + * This is the last point where we can restart recovery with a + * new recovery target, if we shutdown and begin again. After + * this, Resource Managers may choose to do permanent + * corrective actions at end of recovery. + */ + switch (recoveryTargetAction) + { + case RECOVERY_TARGET_ACTION_SHUTDOWN: + + /* + * exit with special return code to request shutdown + * of postmaster. Log messages issued from + * postmaster. + */ + proc_exit(3); + + case RECOVERY_TARGET_ACTION_PAUSE: + SetRecoveryPause(true); + recoveryPausesHere(true); + + /* drop into promote */ + + case RECOVERY_TARGET_ACTION_PROMOTE: + break; + } + } + + /* Allow resource managers to do any required cleanup. */ + for (rmid = 0; rmid <= RM_MAX_ID; rmid++) + { + if (RmgrTable[rmid].rm_cleanup != NULL) + RmgrTable[rmid].rm_cleanup(); + } + + ereport(LOG, + (errmsg("redo done at %X/%X system usage: %s", + LSN_FORMAT_ARGS(ReadRecPtr), + pg_rusage_show(&ru0)))); + xtime = GetLatestXTime(); + if (xtime) + ereport(LOG, + (errmsg("last completed transaction was at log time %s", + timestamptz_to_str(xtime)))); + + InRedo = false; + } + else + { + /* there are no WAL records following the checkpoint */ + ereport(LOG, + (errmsg("redo is not required"))); + + } + + /* + * This check is intentionally after the above log messages that + * indicate how far recovery went. + */ + if (ArchiveRecoveryRequested && + recoveryTarget != RECOVERY_TARGET_UNSET && + !reachedRecoveryTarget) + ereport(FATAL, + (errmsg("recovery ended before configured recovery target was reached"))); + } + + /* + * Kill WAL receiver, if it's still running, before we continue to write + * the startup checkpoint and aborted-contrecord records. It will trump + * over these records and subsequent ones if it's still alive when we + * start writing WAL. + */ + ShutdownWalRcv(); + + /* + * Reset unlogged relations to the contents of their INIT fork. This is + * done AFTER recovery is complete so as to include any unlogged relations + * created during recovery, but BEFORE recovery is marked as having + * completed successfully. Otherwise we'd not retry if any of the post + * end-of-recovery steps fail. + */ + if (InRecovery) + ResetUnloggedRelations(UNLOGGED_RELATION_INIT); + + /* + * We don't need the latch anymore. It's not strictly necessary to disown + * it, but let's do it for the sake of tidiness. + */ + if (ArchiveRecoveryRequested) + DisownLatch(&XLogCtl->recoveryWakeupLatch); + + /* + * We are now done reading the xlog from stream. Turn off streaming + * recovery to force fetching the files (which would be required at end of + * recovery, e.g., timeline history file) from archive or pg_wal. + * + * Note that standby mode must be turned off after killing WAL receiver, + * i.e., calling ShutdownWalRcv(). + */ + Assert(!WalRcvStreaming()); + StandbyMode = false; + + /* + * Determine where to start writing WAL next. + * + * When recovery ended in an incomplete record, write a WAL record about + * that and continue after it. 
In all other cases, re-fetch the last + * valid or last applied record, so we can identify the exact endpoint of + * what we consider the valid portion of WAL. + */ + XLogBeginRead(xlogreader, LastRec); + record = ReadRecord(xlogreader, PANIC, false); + EndOfLog = EndRecPtr; + + /* + * EndOfLogTLI is the TLI in the filename of the XLOG segment containing + * the end-of-log. It could be different from the timeline that EndOfLog + * nominally belongs to, if there was a timeline switch in that segment, + * and we were reading the old WAL from a segment belonging to a higher + * timeline. + */ + EndOfLogTLI = xlogreader->seg.ws_tli; + + /* + * Complain if we did not roll forward far enough to render the backup + * dump consistent. Note: it is indeed okay to look at the local variable + * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might + * be further ahead --- ControlFile->minRecoveryPoint cannot have been + * advanced beyond the WAL we processed. + */ + if (InRecovery && + (EndOfLog < minRecoveryPoint || + !XLogRecPtrIsInvalid(ControlFile->backupStartPoint))) + { + /* + * Ran off end of WAL before reaching end-of-backup WAL record, or + * minRecoveryPoint. That's usually a bad sign, indicating that you + * tried to recover from an online backup but never called + * pg_stop_backup(), or you didn't archive all the WAL up to that + * point. However, this also happens in crash recovery, if the system + * crashes while an online backup is in progress. We must not treat + * that as an error, or the database will refuse to start up. + */ + if (ArchiveRecoveryRequested || ControlFile->backupEndRequired) + { + if (ControlFile->backupEndRequired) + ereport(FATAL, + (errmsg("WAL ends before end of online backup"), + errhint("All WAL generated while online backup was taken must be available at recovery."))); + else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint)) + ereport(FATAL, + (errmsg("WAL ends before end of online backup"), + errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery."))); + else + ereport(FATAL, + (errmsg("WAL ends before consistent recovery point"))); + } + } + + /* + * Pre-scan prepared transactions to find out the range of XIDs present. + * This information is not quite needed yet, but it is positioned here so + * as potential problems are detected before any on-disk change is done. + */ + oldestActiveXID = PrescanPreparedTransactions(NULL, NULL); + + /* + * Consider whether we need to assign a new timeline ID. + * + * If we are doing an archive recovery, we always assign a new ID. This + * handles a couple of issues. If we stopped short of the end of WAL + * during recovery, then we are clearly generating a new timeline and must + * assign it a unique new ID. Even if we ran to the end, modifying the + * current last segment is problematic because it may result in trying to + * overwrite an already-archived copy of that segment, and we encourage + * DBAs to make their archive_commands reject that. We can dodge the + * problem by making the new active segment have a new timeline ID. + * + * In a normal crash recovery, we can just extend the timeline we were in. 
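+ *
+ * (For illustration, an archive_command of the commonly documented form,
+ * something like
+ *
+ *     test ! -f /mnt/server/archivedir/%f && cp %p /mnt/server/archivedir/%f
+ *
+ * refuses to overwrite a segment that has already been archived; switching
+ * to a fresh timeline ID is what keeps us from tripping over that refusal.)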
+ */ + PrevTimeLineID = ThisTimeLineID; + if (ArchiveRecoveryRequested) + { + char reason[200]; + char recoveryPath[MAXPGPATH]; + + Assert(InArchiveRecovery); + + ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1; + ereport(LOG, + (errmsg("selected new timeline ID: %u", ThisTimeLineID))); + + /* + * Create a comment for the history file to explain why and where + * timeline changed. + */ + if (recoveryTarget == RECOVERY_TARGET_XID) + snprintf(reason, sizeof(reason), + "%s transaction %u", + recoveryStopAfter ? "after" : "before", + recoveryStopXid); + else if (recoveryTarget == RECOVERY_TARGET_TIME) + snprintf(reason, sizeof(reason), + "%s %s\n", + recoveryStopAfter ? "after" : "before", + timestamptz_to_str(recoveryStopTime)); + else if (recoveryTarget == RECOVERY_TARGET_LSN) + snprintf(reason, sizeof(reason), + "%s LSN %X/%X\n", + recoveryStopAfter ? "after" : "before", + LSN_FORMAT_ARGS(recoveryStopLSN)); + else if (recoveryTarget == RECOVERY_TARGET_NAME) + snprintf(reason, sizeof(reason), + "at restore point \"%s\"", + recoveryStopName); + else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE) + snprintf(reason, sizeof(reason), "reached consistency"); + else + snprintf(reason, sizeof(reason), "no recovery target specified"); + + /* + * We are now done reading the old WAL. Turn off archive fetching if + * it was active, and make a writable copy of the last WAL segment. + * (Note that we also have a copy of the last block of the old WAL in + * readBuf; we will use that below.) + */ + exitArchiveRecovery(EndOfLogTLI, EndOfLog); + + /* + * Write the timeline history file, and have it archived. After this + * point (or rather, as soon as the file is archived), the timeline + * will appear as "taken" in the WAL archive and to any standby + * servers. If we crash before actually switching to the new + * timeline, standby servers will nevertheless think that we switched + * to the new timeline, and will try to connect to the new timeline. + * To minimize the window for that, try to do as little as possible + * between here and writing the end-of-recovery record. + */ + writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI, + EndRecPtr, reason); + + /* + * Since there might be a partial WAL segment named RECOVERYXLOG, get + * rid of it. + */ + snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG"); + unlink(recoveryPath); /* ignore any error */ + + /* Get rid of any remaining recovered timeline-history file, too */ + snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY"); + unlink(recoveryPath); /* ignore any error */ + } + + /* Save the selected TimeLineID in shared memory, too */ + XLogCtl->ThisTimeLineID = ThisTimeLineID; + XLogCtl->PrevTimeLineID = PrevTimeLineID; + + /* + * Actually, if WAL ended in an incomplete record, skip the parts that + * made it through and start writing after the portion that persisted. + * (It's critical to first write an OVERWRITE_CONTRECORD message, which + * we'll do as soon as we're open for writing new WAL.) + */ + if (!XLogRecPtrIsInvalid(missingContrecPtr)) + { + Assert(!XLogRecPtrIsInvalid(abortedRecPtr)); + EndOfLog = missingContrecPtr; + } + + /* + * Prepare to write WAL starting at EndOfLog location, and init xlog + * buffer cache using the block containing the last record from the + * previous incarnation. 
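+ *
+ * As a worked example with the default XLOG_BLCKSZ of 8192: if EndOfLog
+ * were 0/16D8F58, the partial last page would begin at 0/16D8000, its first
+ * 0xF58 bytes would be copied from xlogreader->readBuf into the shared WAL
+ * buffer selected by XLogRecPtrToBufIdx(), and the rest of the page would be
+ * zeroed so that new records can be appended starting at EndOfLog.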
+ */ + Insert = &XLogCtl->Insert; + Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec); + Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog); + + /* + * Tricky point here: readBuf contains the *last* block that the LastRec + * record spans, not the one it starts in. The last block is indeed the + * one we want to use. + */ + if (EndOfLog % XLOG_BLCKSZ != 0) + { + char *page; + int len; + int firstIdx; + XLogRecPtr pageBeginPtr; + + pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ); + Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size)); + + firstIdx = XLogRecPtrToBufIdx(EndOfLog); + + /* Copy the valid part of the last block, and zero the rest */ + page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ]; + len = EndOfLog % XLOG_BLCKSZ; + memcpy(page, xlogreader->readBuf, len); + memset(page + len, 0, XLOG_BLCKSZ - len); + + XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ; + XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ; + } + else + { + /* + * There is no partial block to copy. Just set InitializedUpTo, and + * let the first attempt to insert a log record to initialize the next + * buffer. + */ + XLogCtl->InitializedUpTo = EndOfLog; + } + + LogwrtResult.Write = LogwrtResult.Flush = EndOfLog; + + XLogCtl->LogwrtResult = LogwrtResult; + + XLogCtl->LogwrtRqst.Write = EndOfLog; + XLogCtl->LogwrtRqst.Flush = EndOfLog; + + LocalSetXLogInsertAllowed(); + + /* If necessary, write overwrite-contrecord before doing anything else */ + if (!XLogRecPtrIsInvalid(abortedRecPtr)) + { + Assert(!XLogRecPtrIsInvalid(missingContrecPtr)); + CreateOverwriteContrecordRecord(abortedRecPtr); + abortedRecPtr = InvalidXLogRecPtr; + missingContrecPtr = InvalidXLogRecPtr; + } + + /* + * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE + * record before resource manager writes cleanup WAL records or checkpoint + * record is written. + */ + Insert->fullPageWrites = lastFullPageWrites; + UpdateFullPageWrites(); + LocalXLogInsertAllowed = -1; + + if (InRecovery) + { + /* + * Perform a checkpoint to update all our recovery activity to disk. + * + * Note that we write a shutdown checkpoint rather than an on-line + * one. This is not particularly critical, but since we may be + * assigning a new TLI, using a shutdown checkpoint allows us to have + * the rule that TLI only changes in shutdown checkpoints, which + * allows some extra error checking in xlog_redo. + * + * In promotion, only create a lightweight end-of-recovery record + * instead of a full checkpoint. A checkpoint is requested later, + * after we're fully out of recovery mode and already accepting + * queries. + */ + if (bgwriterLaunched) + { + if (LocalPromoteIsTriggered) + { + checkPointLoc = ControlFile->checkPoint; + + /* + * Confirm the last checkpoint is available for us to recover + * from if we fail. + */ + record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false); + if (record != NULL) + { + promoted = true; + + /* + * Insert a special WAL record to mark the end of + * recovery, since we aren't doing a checkpoint. That + * means that the checkpointer process may likely be in + * the middle of a time-smoothed restartpoint and could + * continue to be for minutes after this. That sounds + * strange, but the effect is roughly the same and it + * would be stranger to try to come out of the + * restartpoint and then checkpoint. We request a + * checkpoint later anyway, just for safety. 
+ */ + CreateEndOfRecoveryRecord(); + } + } + + if (!promoted) + RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY | + CHECKPOINT_IMMEDIATE | + CHECKPOINT_WAIT); + } + else + CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE); + } + + if (ArchiveRecoveryRequested) + { + /* + * And finally, execute the recovery_end_command, if any. + */ + if (recoveryEndCommand && strcmp(recoveryEndCommand, "") != 0) + ExecuteRecoveryCommand(recoveryEndCommand, + "recovery_end_command", + true); + + /* + * We switched to a new timeline. Clean up segments on the old + * timeline. + * + * If there are any higher-numbered segments on the old timeline, + * remove them. They might contain valid WAL, but they might also be + * pre-allocated files containing garbage. In any case, they are not + * part of the new timeline's history so we don't need them. + */ + RemoveNonParentXlogFiles(EndOfLog, ThisTimeLineID); + + /* + * If the switch happened in the middle of a segment, what to do with + * the last, partial segment on the old timeline? If we don't archive + * it, and the server that created the WAL never archives it either + * (e.g. because it was hit by a meteor), it will never make it to the + * archive. That's OK from our point of view, because the new segment + * that we created with the new TLI contains all the WAL from the old + * timeline up to the switch point. But if you later try to do PITR to + * the "missing" WAL on the old timeline, recovery won't find it in + * the archive. It's physically present in the new file with new TLI, + * but recovery won't look there when it's recovering to the older + * timeline. On the other hand, if we archive the partial segment, and + * the original server on that timeline is still running and archives + * the completed version of the same segment later, it will fail. (We + * used to do that in 9.4 and below, and it caused such problems). + * + * As a compromise, we rename the last segment with the .partial + * suffix, and archive it. Archive recovery will never try to read + * .partial segments, so they will normally go unused. But in the odd + * PITR case, the administrator can copy them manually to the pg_wal + * directory (removing the suffix). They can be useful in debugging, + * too. + * + * If a .done or .ready file already exists for the old timeline, + * however, we had already determined that the segment is complete, so + * we can let it be archived normally. (In particular, if it was + * restored from the archive to begin with, it's expected to have a + * .done file). + */ + if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 && + XLogArchivingActive()) + { + char origfname[MAXFNAMELEN]; + XLogSegNo endLogSegNo; + + XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size); + XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size); + + if (!XLogArchiveIsReadyOrDone(origfname)) + { + char origpath[MAXPGPATH]; + char partialfname[MAXFNAMELEN]; + char partialpath[MAXPGPATH]; + + XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size); + snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname); + snprintf(partialpath, MAXPGPATH, "%s.partial", origpath); + + /* + * Make sure there's no .done or .ready file for the .partial + * file. + */ + XLogArchiveCleanup(partialfname); + + durable_rename(origpath, partialpath, ERROR); + XLogArchiveNotify(partialfname); + } + } + } + + /* + * Preallocate additional log files, if wanted. + */ + PreallocXlogFiles(EndOfLog); + + /* + * Okay, we're officially UP. 
+ */ + InRecovery = false; + + /* start the archive_timeout timer and LSN running */ + XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL); + XLogCtl->lastSegSwitchLSN = EndOfLog; + + /* also initialize latestCompletedXid, to nextXid - 1 */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid; + FullTransactionIdRetreat(&ShmemVariableCache->latestCompletedXid); + LWLockRelease(ProcArrayLock); + + /* + * Start up subtrans, if not already done for hot standby. (commit + * timestamps are started below, if necessary.) + */ + if (standbyState == STANDBY_DISABLED) + StartupSUBTRANS(oldestActiveXID); + + /* + * Perform end of recovery actions for any SLRUs that need it. + */ + TrimCLOG(); + TrimMultiXact(); + + /* Reload shared-memory state for prepared transactions */ + RecoverPreparedTransactions(); + + /* Shut down xlogreader */ + if (readFile >= 0) + { + close(readFile); + readFile = -1; + } + XLogReaderFree(xlogreader); + + /* + * If any of the critical GUCs have changed, log them before we allow + * backends to write WAL. + */ + LocalSetXLogInsertAllowed(); + XLogReportParameters(); + + /* + * Local WAL inserts enabled, so it's time to finish initialization of + * commit timestamp. + */ + CompleteCommitTsInitialization(); + + /* + * All done with end-of-recovery actions. + * + * Now allow backends to write WAL and update the control file status in + * consequence. SharedRecoveryState, that controls if backends can write + * WAL, is updated while holding ControlFileLock to prevent other backends + * to look at an inconsistent state of the control file in shared memory. + * There is still a small window during which backends can write WAL and + * the control file is still referring to a system not in DB_IN_PRODUCTION + * state while looking at the on-disk control file. + * + * Also, we use info_lck to update SharedRecoveryState to ensure that + * there are no race conditions concerning visibility of other recent + * updates to shared memory. + */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->state = DB_IN_PRODUCTION; + ControlFile->time = (pg_time_t) time(NULL); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE; + SpinLockRelease(&XLogCtl->info_lck); + + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + /* + * Shutdown the recovery environment. This must occur after + * RecoverPreparedTransactions() (see notes in lock_twophase_recover()) + * and after switching SharedRecoveryState to RECOVERY_STATE_DONE so as + * any session building a snapshot will not rely on KnownAssignedXids as + * RecoveryInProgress() would return false at this stage. This is + * particularly critical for prepared 2PC transactions, that would still + * need to be included in snapshots once recovery has ended. + */ + if (standbyState != STANDBY_DISABLED) + ShutdownRecoveryTransactionEnvironment(); + + /* + * If there were cascading standby servers connected to us, nudge any wal + * sender processes to notice that we've been promoted. + */ + WalSndWakeup(); + + /* + * If this was a promotion, request an (online) checkpoint now. This isn't + * required for consistency, but the last restartpoint might be far back, + * and in case of a crash, recovering from it might take a longer than is + * appropriate now that we're not in standby mode anymore. 
+ */ + if (promoted) + RequestCheckpoint(CHECKPOINT_FORCE); +} + +/* + * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real + * directories. + * + * Replay of database creation XLOG records for databases that were later + * dropped can create fake directories in pg_tblspc. By the time consistency + * is reached these directories should have been removed; here we verify + * that this did indeed happen. This is to be called at the point where + * consistent state is reached. + * + * allow_in_place_tablespaces turns the PANIC into a WARNING, which is + * useful for testing purposes, and also allows for an escape hatch in case + * things go south. + */ +static void +CheckTablespaceDirectory(void) +{ + DIR *dir; + struct dirent *de; + + dir = AllocateDir("pg_tblspc"); + while ((de = ReadDir(dir, "pg_tblspc")) != NULL) + { + char path[MAXPGPATH + 10]; + + /* Skip entries of non-oid names */ + if (strspn(de->d_name, "0123456789") != strlen(de->d_name)) + continue; + + snprintf(path, sizeof(path), "pg_tblspc/%s", de->d_name); + + if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK) + ereport(allow_in_place_tablespaces ? WARNING : PANIC, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("unexpected directory entry \"%s\" found in %s", + de->d_name, "pg_tblspc/"), + errdetail("All directory entries in pg_tblspc/ should be symbolic links."), + errhint("Remove those directories, or set allow_in_place_tablespaces to ON transiently to let recovery complete."))); + } +} + +/* + * Checks if recovery has reached a consistent state. When consistency is + * reached and we have a valid starting standby snapshot, tell postmaster + * that it can start accepting read-only connections. + */ +static void +CheckRecoveryConsistency(void) +{ + XLogRecPtr lastReplayedEndRecPtr; + + /* + * During crash recovery, we don't reach a consistent state until we've + * replayed all the WAL. + */ + if (XLogRecPtrIsInvalid(minRecoveryPoint)) + return; + + Assert(InArchiveRecovery); + + /* + * assume that we are called in the startup process, and hence don't need + * a lock to read lastReplayedEndRecPtr + */ + lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr; + + /* + * Have we reached the point where our base backup was completed? + */ + if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) && + ControlFile->backupEndPoint <= lastReplayedEndRecPtr) + { + /* + * We have reached the end of base backup, as indicated by pg_control. + * The data on disk is now consistent. Reset backupStartPoint and + * backupEndPoint, and update minRecoveryPoint to make sure we don't + * allow starting up at an earlier point even if recovery is stopped + * and restarted soon after this. + */ + elog(DEBUG1, "end of backup reached"); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + + if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr) + ControlFile->minRecoveryPoint = lastReplayedEndRecPtr; + + ControlFile->backupStartPoint = InvalidXLogRecPtr; + ControlFile->backupEndPoint = InvalidXLogRecPtr; + ControlFile->backupEndRequired = false; + UpdateControlFile(); + + LWLockRelease(ControlFileLock); + } + + /* + * Have we passed our safe starting point? Note that minRecoveryPoint is + * known to be incorrectly set if ControlFile->backupEndRequired, until + * the XLOG_BACKUP_END arrives to advise us of the correct + * minRecoveryPoint. All we know prior to that is that we're not + * consistent yet. 
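+ *
+ * (It is the XLOG_BACKUP_END redo routine that resolves this: on replaying
+ * the end-of-backup record it advances ControlFile->minRecoveryPoint and
+ * clears backupStartPoint and backupEndRequired, after which the test below
+ * can pass.)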
+ */ + if (!reachedConsistency && !ControlFile->backupEndRequired && + minRecoveryPoint <= lastReplayedEndRecPtr && + XLogRecPtrIsInvalid(ControlFile->backupStartPoint)) + { + /* + * Check to see if the XLOG sequence contained any unresolved + * references to uninitialized pages. + */ + XLogCheckInvalidPages(); + + /* + * Check that pg_tblspc doesn't contain any real directories. Replay + * of Database/CREATE_* records may have created ficticious tablespace + * directories that should have been removed by the time consistency + * was reached. + */ + CheckTablespaceDirectory(); + + reachedConsistency = true; + ereport(LOG, + (errmsg("consistent recovery state reached at %X/%X", + LSN_FORMAT_ARGS(lastReplayedEndRecPtr)))); + } + + /* + * Have we got a valid starting snapshot that will allow queries to be + * run? If so, we can tell postmaster that the database is consistent now, + * enabling connections. + */ + if (standbyState == STANDBY_SNAPSHOT_READY && + !LocalHotStandbyActive && + reachedConsistency && + IsUnderPostmaster) + { + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->SharedHotStandbyActive = true; + SpinLockRelease(&XLogCtl->info_lck); + + LocalHotStandbyActive = true; + + SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY); + } +} + +/* + * Is the system still in recovery? + * + * Unlike testing InRecovery, this works in any process that's connected to + * shared memory. + * + * As a side-effect, we initialize the local TimeLineID and RedoRecPtr + * variables the first time we see that recovery is finished. + */ +bool +RecoveryInProgress(void) +{ + /* + * We check shared state each time only until we leave recovery mode. We + * can't re-enter recovery, so there's no need to keep checking after the + * shared variable has once been seen false. + */ + if (!LocalRecoveryInProgress) + return false; + else + { + /* + * use volatile pointer to make sure we make a fresh read of the + * shared variable. + */ + volatile XLogCtlData *xlogctl = XLogCtl; + + LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_DONE); + + /* + * Initialize TimeLineID and RedoRecPtr when we discover that recovery + * is finished. InitPostgres() relies upon this behaviour to ensure + * that InitXLOGAccess() is called at backend startup. (If you change + * this, see also LocalSetXLogInsertAllowed.) + */ + if (!LocalRecoveryInProgress) + { + /* + * If we just exited recovery, make sure we read TimeLineID and + * RedoRecPtr after SharedRecoveryState (for machines with weak + * memory ordering). + */ + pg_memory_barrier(); + InitXLOGAccess(); + } + + /* + * Note: We don't need a memory barrier when we're still in recovery. + * We might exit recovery immediately after return, so the caller + * can't rely on 'true' meaning that we're still in recovery anyway. + */ + + return LocalRecoveryInProgress; + } +} + +/* + * Returns current recovery state from shared memory. + * + * This returned state is kept consistent with the contents of the control + * file. See details about the possible values of RecoveryState in xlog.h. + */ +RecoveryState +GetRecoveryState(void) +{ + RecoveryState retval; + + SpinLockAcquire(&XLogCtl->info_lck); + retval = XLogCtl->SharedRecoveryState; + SpinLockRelease(&XLogCtl->info_lck); + + return retval; +} + +/* + * Is HotStandby active yet? This is only important in special backends + * since normal backends won't ever be able to connect until this returns + * true. Postmaster knows this by way of signal, not via shared memory. 
+ * + * Unlike testing standbyState, this works in any process that's connected to + * shared memory. (And note that standbyState alone doesn't tell the truth + * anyway.) + */ +bool +HotStandbyActive(void) +{ + /* + * We check shared state each time only until Hot Standby is active. We + * can't de-activate Hot Standby, so there's no need to keep checking + * after the shared variable has once been seen true. + */ + if (LocalHotStandbyActive) + return true; + else + { + /* spinlock is essential on machines with weak memory ordering! */ + SpinLockAcquire(&XLogCtl->info_lck); + LocalHotStandbyActive = XLogCtl->SharedHotStandbyActive; + SpinLockRelease(&XLogCtl->info_lck); + + return LocalHotStandbyActive; + } +} + +/* + * Like HotStandbyActive(), but to be used only in WAL replay code, + * where we don't need to ask any other process what the state is. + */ +bool +HotStandbyActiveInReplay(void) +{ + Assert(AmStartupProcess() || !IsPostmasterEnvironment); + return LocalHotStandbyActive; +} + +/* + * Is this process allowed to insert new WAL records? + * + * Ordinarily this is essentially equivalent to !RecoveryInProgress(). + * But we also have provisions for forcing the result "true" or "false" + * within specific processes regardless of the global state. + */ +bool +XLogInsertAllowed(void) +{ + /* + * If value is "unconditionally true" or "unconditionally false", just + * return it. This provides the normal fast path once recovery is known + * done. + */ + if (LocalXLogInsertAllowed >= 0) + return (bool) LocalXLogInsertAllowed; + + /* + * Else, must check to see if we're still in recovery. + */ + if (RecoveryInProgress()) + return false; + + /* + * On exit from recovery, reset to "unconditionally true", since there is + * no need to keep checking. + */ + LocalXLogInsertAllowed = 1; + return true; +} + +/* + * Make XLogInsertAllowed() return true in the current process only. + * + * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later, + * and even call LocalSetXLogInsertAllowed() again after that. + */ +static void +LocalSetXLogInsertAllowed(void) +{ + Assert(LocalXLogInsertAllowed == -1); + LocalXLogInsertAllowed = 1; + + /* Initialize as RecoveryInProgress() would do when switching state */ + InitXLOGAccess(); +} + +/* + * Subroutine to try to fetch and validate a prior checkpoint record. + * + * whichChkpt identifies the checkpoint (merely for reporting purposes). 
+ * 1 for "primary", 0 for "other" (backup_label) + */ +static XLogRecord * +ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, + int whichChkpt, bool report) +{ + XLogRecord *record; + uint8 info; + + if (!XRecOffIsValid(RecPtr)) + { + if (!report) + return NULL; + + switch (whichChkpt) + { + case 1: + ereport(LOG, + (errmsg("invalid primary checkpoint link in control file"))); + break; + default: + ereport(LOG, + (errmsg("invalid checkpoint link in backup_label file"))); + break; + } + return NULL; + } + + XLogBeginRead(xlogreader, RecPtr); + record = ReadRecord(xlogreader, LOG, true); + + if (record == NULL) + { + if (!report) + return NULL; + + switch (whichChkpt) + { + case 1: + ereport(LOG, + (errmsg("invalid primary checkpoint record"))); + break; + default: + ereport(LOG, + (errmsg("invalid checkpoint record"))); + break; + } + return NULL; + } + if (record->xl_rmid != RM_XLOG_ID) + { + switch (whichChkpt) + { + case 1: + ereport(LOG, + (errmsg("invalid resource manager ID in primary checkpoint record"))); + break; + default: + ereport(LOG, + (errmsg("invalid resource manager ID in checkpoint record"))); + break; + } + return NULL; + } + info = record->xl_info & ~XLR_INFO_MASK; + if (info != XLOG_CHECKPOINT_SHUTDOWN && + info != XLOG_CHECKPOINT_ONLINE) + { + switch (whichChkpt) + { + case 1: + ereport(LOG, + (errmsg("invalid xl_info in primary checkpoint record"))); + break; + default: + ereport(LOG, + (errmsg("invalid xl_info in checkpoint record"))); + break; + } + return NULL; + } + if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint)) + { + switch (whichChkpt) + { + case 1: + ereport(LOG, + (errmsg("invalid length of primary checkpoint record"))); + break; + default: + ereport(LOG, + (errmsg("invalid length of checkpoint record"))); + break; + } + return NULL; + } + return record; +} + +/* + * This must be called in a backend process before creating WAL records + * (except in a standalone backend, which does StartupXLOG instead). We need + * to initialize the local copies of ThisTimeLineID and RedoRecPtr. + * + * Note: before Postgres 8.0, we went to some effort to keep the postmaster + * process's copies of ThisTimeLineID and RedoRecPtr valid too. This was + * unnecessary however, since the postmaster itself never touches XLOG anyway. + */ +void +InitXLOGAccess(void) +{ + XLogCtlInsert *Insert = &XLogCtl->Insert; + + /* ThisTimeLineID doesn't change so we need no lock to copy it */ + ThisTimeLineID = XLogCtl->ThisTimeLineID; + Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode()); + + /* set wal_segment_size */ + wal_segment_size = ControlFile->xlog_seg_size; + + /* Use GetRedoRecPtr to copy the RedoRecPtr safely */ + (void) GetRedoRecPtr(); + /* Also update our copy of doPageWrites. */ + doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites); + + /* Also initialize the working areas for constructing WAL records */ + InitXLogInsert(); +} + +/* + * Return the current Redo pointer from shared memory. + * + * As a side-effect, the local RedoRecPtr copy is updated. + */ +XLogRecPtr +GetRedoRecPtr(void) +{ + XLogRecPtr ptr; + + /* + * The possibly not up-to-date copy in XlogCtl is enough. Even if we + * grabbed a WAL insertion lock to read the authoritative value in + * Insert->RedoRecPtr, someone might update it just after we've released + * the lock. 
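ReadCheckpointRecord() accepts a record only if its resource manager, info bits, and total length all match what a checkpoint record must look like. The toy sketch below mirrors the shape of those checks; the ToyRecord struct, the expected length of 88 bytes, and the helper name are invented for illustration and are not the real XLogRecord layout.

/* Toy validation sketch mirroring the shape of ReadCheckpointRecord()'s
 * field checks; struct layout and sizes here are illustrative only. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RM_XLOG_ID               0
#define XLOG_CHECKPOINT_SHUTDOWN 0x00
#define XLOG_CHECKPOINT_ONLINE   0x10
#define XLR_INFO_MASK            0x0F

typedef struct ToyRecord
{
    uint32_t    xl_tot_len;
    uint8_t     xl_rmid;
    uint8_t     xl_info;
} ToyRecord;

static bool
looks_like_checkpoint(const ToyRecord *rec, uint32_t expected_len)
{
    uint8_t     info = rec->xl_info & ~XLR_INFO_MASK;

    if (rec->xl_rmid != RM_XLOG_ID)
        return false;           /* wrong resource manager */
    if (info != XLOG_CHECKPOINT_SHUTDOWN && info != XLOG_CHECKPOINT_ONLINE)
        return false;           /* not a checkpoint record at all */
    if (rec->xl_tot_len != expected_len)
        return false;           /* payload size mismatch */
    return true;
}

int
main(void)
{
    ToyRecord   rec = {.xl_tot_len = 88, .xl_rmid = RM_XLOG_ID,
                       .xl_info = XLOG_CHECKPOINT_ONLINE};

    printf("valid: %d\n", looks_like_checkpoint(&rec, 88));
    return 0;
}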
+ */ + SpinLockAcquire(&XLogCtl->info_lck); + ptr = XLogCtl->RedoRecPtr; + SpinLockRelease(&XLogCtl->info_lck); + + if (RedoRecPtr < ptr) + RedoRecPtr = ptr; + + return RedoRecPtr; +} + +/* + * Return information needed to decide whether a modified block needs a + * full-page image to be included in the WAL record. + * + * The returned values are cached copies from backend-private memory, and + * possibly out-of-date. XLogInsertRecord will re-check them against + * up-to-date values, while holding the WAL insert lock. + */ +void +GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p) +{ + *RedoRecPtr_p = RedoRecPtr; + *doPageWrites_p = doPageWrites; +} + +/* + * GetInsertRecPtr -- Returns the current insert position. + * + * NOTE: The value *actually* returned is the position of the last full + * xlog page. It lags behind the real insert position by at most 1 page. + * For that, we don't need to scan through WAL insertion locks, and an + * approximation is enough for the current usage of this function. + */ +XLogRecPtr +GetInsertRecPtr(void) +{ + XLogRecPtr recptr; + + SpinLockAcquire(&XLogCtl->info_lck); + recptr = XLogCtl->LogwrtRqst.Write; + SpinLockRelease(&XLogCtl->info_lck); + + return recptr; +} + +/* + * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL + * position known to be fsync'd to disk. + */ +XLogRecPtr +GetFlushRecPtr(void) +{ + SpinLockAcquire(&XLogCtl->info_lck); + LogwrtResult = XLogCtl->LogwrtResult; + SpinLockRelease(&XLogCtl->info_lck); + + return LogwrtResult.Flush; +} + +/* + * GetLastImportantRecPtr -- Returns the LSN of the last important record + * inserted. All records not explicitly marked as unimportant are considered + * important. + * + * The LSN is determined by computing the maximum of + * WALInsertLocks[i].lastImportantAt. + */ +XLogRecPtr +GetLastImportantRecPtr(void) +{ + XLogRecPtr res = InvalidXLogRecPtr; + int i; + + for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++) + { + XLogRecPtr last_important; + + /* + * Need to take a lock to prevent torn reads of the LSN, which are + * possible on some of the supported platforms. WAL insert locks only + * support exclusive mode, so we have to use that. + */ + LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE); + last_important = WALInsertLocks[i].l.lastImportantAt; + LWLockRelease(&WALInsertLocks[i].l.lock); + + if (res < last_important) + res = last_important; + } + + return res; +} + +/* + * Get the time and LSN of the last xlog segment switch + */ +pg_time_t +GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN) +{ + pg_time_t result; + + /* Need WALWriteLock, but shared lock is sufficient */ + LWLockAcquire(WALWriteLock, LW_SHARED); + result = XLogCtl->lastSegSwitchTime; + *lastSwitchLSN = XLogCtl->lastSegSwitchLSN; + LWLockRelease(WALWriteLock); + + return result; +} + +/* + * This must be called ONCE during postmaster or standalone-backend shutdown + */ +void +ShutdownXLOG(int code, Datum arg) +{ + /* + * We should have an aux process resource owner to use, and we should not + * be in a transaction that's installed some other resowner. + */ + Assert(AuxProcessResourceOwner != NULL); + Assert(CurrentResourceOwner == NULL || + CurrentResourceOwner == AuxProcessResourceOwner); + CurrentResourceOwner = AuxProcessResourceOwner; + + /* Don't be chatty in standalone mode */ + ereport(IsPostmasterEnvironment ? LOG : NOTICE, + (errmsg("shutting down"))); + + /* + * Signal walsenders to move to stopping state. 
+ */ + WalSndInitStopping(); + + /* + * Wait for WAL senders to be in stopping state. This prevents commands + * from writing new WAL. + */ + WalSndWaitStopping(); + + if (RecoveryInProgress()) + CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); + else + { + /* + * If archiving is enabled, rotate the last XLOG file so that all the + * remaining records are archived (postmaster wakes up the archiver + * process one more time at the end of shutdown). The checkpoint + * record will go to the next XLOG file and won't be archived (yet). + */ + if (XLogArchivingActive() && XLogArchiveCommandSet()) + RequestXLogSwitch(false); + + CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); + } +} + +/* + * Log start of a checkpoint. + */ +static void +LogCheckpointStart(int flags, bool restartpoint) +{ + if (restartpoint) + ereport(LOG, + /* translator: the placeholders show checkpoint options */ + (errmsg("restartpoint starting:%s%s%s%s%s%s%s%s", + (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "", + (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "", + (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "", + (flags & CHECKPOINT_FORCE) ? " force" : "", + (flags & CHECKPOINT_WAIT) ? " wait" : "", + (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "", + (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "", + (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : ""))); + else + ereport(LOG, + /* translator: the placeholders show checkpoint options */ + (errmsg("checkpoint starting:%s%s%s%s%s%s%s%s", + (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "", + (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "", + (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "", + (flags & CHECKPOINT_FORCE) ? " force" : "", + (flags & CHECKPOINT_WAIT) ? " wait" : "", + (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "", + (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "", + (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : ""))); +} + +/* + * Log end of a checkpoint. + */ +static void +LogCheckpointEnd(bool restartpoint) +{ + long write_msecs, + sync_msecs, + total_msecs, + longest_msecs, + average_msecs; + uint64 average_sync_time; + + CheckpointStats.ckpt_end_t = GetCurrentTimestamp(); + + write_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_write_t, + CheckpointStats.ckpt_sync_t); + + sync_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_sync_t, + CheckpointStats.ckpt_sync_end_t); + + /* Accumulate checkpoint timing summary data, in milliseconds. */ + BgWriterStats.m_checkpoint_write_time += write_msecs; + BgWriterStats.m_checkpoint_sync_time += sync_msecs; + + /* + * All of the published timing statistics are accounted for. Only + * continue if a log message is to be written. + */ + if (!log_checkpoints) + return; + + total_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_start_t, + CheckpointStats.ckpt_end_t); + + /* + * Timing values returned from CheckpointStats are in microseconds. + * Convert to milliseconds for consistent printing. 
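The "checkpoint starting: ..." message above is assembled purely by testing flag bits and appending one word per set bit. A reduced sketch of the same idea, using made-up CKPT_* flag values rather than the real CHECKPOINT_* masks:

/* Sketch of how LogCheckpointStart()'s message is built from flag bits;
 * the flag values are illustrative placeholders. */
#include <stdio.h>

#define CKPT_IS_SHUTDOWN  (1 << 0)
#define CKPT_IMMEDIATE    (1 << 1)
#define CKPT_FORCE        (1 << 2)
#define CKPT_CAUSE_TIME   (1 << 3)

static void
log_checkpoint_start(int flags)
{
    printf("checkpoint starting:%s%s%s%s\n",
           (flags & CKPT_IS_SHUTDOWN) ? " shutdown" : "",
           (flags & CKPT_IMMEDIATE) ? " immediate" : "",
           (flags & CKPT_FORCE) ? " force" : "",
           (flags & CKPT_CAUSE_TIME) ? " time" : "");
}

int
main(void)
{
    log_checkpoint_start(CKPT_IMMEDIATE | CKPT_FORCE);
    return 0;
}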
+ */ + longest_msecs = (long) ((CheckpointStats.ckpt_longest_sync + 999) / 1000); + + average_sync_time = 0; + if (CheckpointStats.ckpt_sync_rels > 0) + average_sync_time = CheckpointStats.ckpt_agg_sync_time / + CheckpointStats.ckpt_sync_rels; + average_msecs = (long) ((average_sync_time + 999) / 1000); + + if (restartpoint) + ereport(LOG, + (errmsg("restartpoint complete: wrote %d buffers (%.1f%%); " + "%d WAL file(s) added, %d removed, %d recycled; " + "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; " + "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; " + "distance=%d kB, estimate=%d kB", + CheckpointStats.ckpt_bufs_written, + (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, + CheckpointStats.ckpt_segs_added, + CheckpointStats.ckpt_segs_removed, + CheckpointStats.ckpt_segs_recycled, + write_msecs / 1000, (int) (write_msecs % 1000), + sync_msecs / 1000, (int) (sync_msecs % 1000), + total_msecs / 1000, (int) (total_msecs % 1000), + CheckpointStats.ckpt_sync_rels, + longest_msecs / 1000, (int) (longest_msecs % 1000), + average_msecs / 1000, (int) (average_msecs % 1000), + (int) (PrevCheckPointDistance / 1024.0), + (int) (CheckPointDistanceEstimate / 1024.0)))); + else + ereport(LOG, + (errmsg("checkpoint complete: wrote %d buffers (%.1f%%); " + "%d WAL file(s) added, %d removed, %d recycled; " + "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; " + "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; " + "distance=%d kB, estimate=%d kB", + CheckpointStats.ckpt_bufs_written, + (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, + CheckpointStats.ckpt_segs_added, + CheckpointStats.ckpt_segs_removed, + CheckpointStats.ckpt_segs_recycled, + write_msecs / 1000, (int) (write_msecs % 1000), + sync_msecs / 1000, (int) (sync_msecs % 1000), + total_msecs / 1000, (int) (total_msecs % 1000), + CheckpointStats.ckpt_sync_rels, + longest_msecs / 1000, (int) (longest_msecs % 1000), + average_msecs / 1000, (int) (average_msecs % 1000), + (int) (PrevCheckPointDistance / 1024.0), + (int) (CheckPointDistanceEstimate / 1024.0)))); +} + +/* + * Update the estimate of distance between checkpoints. + * + * The estimate is used to calculate the number of WAL segments to keep + * preallocated, see XLOGfileslop(). + */ +static void +UpdateCheckPointDistanceEstimate(uint64 nbytes) +{ + /* + * To estimate the number of segments consumed between checkpoints, keep a + * moving average of the amount of WAL generated in previous checkpoint + * cycles. However, if the load is bursty, with quiet periods and busy + * periods, we want to cater for the peak load. So instead of a plain + * moving average, let the average decline slowly if the previous cycle + * used less WAL than estimated, but bump it up immediately if it used + * more. + * + * When checkpoints are triggered by max_wal_size, this should converge to + * CheckpointSegments * wal_segment_size, + * + * Note: This doesn't pay any attention to what caused the checkpoint. + * Checkpoints triggered manually with CHECKPOINT command, or by e.g. + * starting a base backup, are counted the same as those created + * automatically. The slow-decline will largely mask them out, if they are + * not frequent. If they are frequent, it seems reasonable to count them + * in as any others; if you issue a manual checkpoint every 5 minutes and + * never let a timed checkpoint happen, it makes sense to base the + * preallocation on that 5 minute interval rather than whatever + * checkpoint_timeout is set to. 
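LogCheckpointEnd() keeps sync times in microseconds internally and converts to milliseconds, rounding up, only for display in "seconds.milliseconds" form. A small self-contained example of that conversion and formatting, with an arbitrarily chosen sample value:

/* Sketch of the round-up microsecond-to-millisecond conversion and the
 * "seconds.milliseconds" formatting used when reporting checkpoint timings. */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint64_t    longest_sync_usecs = 1234567;    /* sample value */
    long        longest_msecs = (long) ((longest_sync_usecs + 999) / 1000);

    /* 1234567 us rounds up to 1235 ms, printed as "1.235 s" */
    printf("longest=%ld.%03d s\n",
           longest_msecs / 1000, (int) (longest_msecs % 1000));
    return 0;
}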
+ */ + PrevCheckPointDistance = nbytes; + if (CheckPointDistanceEstimate < nbytes) + CheckPointDistanceEstimate = nbytes; + else + CheckPointDistanceEstimate = + (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes); +} + +/* + * Update the ps display for a process running a checkpoint. Note that + * this routine should not do any allocations so as it can be called + * from a critical section. + */ +static void +update_checkpoint_display(int flags, bool restartpoint, bool reset) +{ + /* + * The status is reported only for end-of-recovery and shutdown + * checkpoints or shutdown restartpoints. Updating the ps display is + * useful in those situations as it may not be possible to rely on + * pg_stat_activity to see the status of the checkpointer or the startup + * process. + */ + if ((flags & (CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IS_SHUTDOWN)) == 0) + return; + + if (reset) + set_ps_display(""); + else + { + char activitymsg[128]; + + snprintf(activitymsg, sizeof(activitymsg), "performing %s%s%s", + (flags & CHECKPOINT_END_OF_RECOVERY) ? "end-of-recovery " : "", + (flags & CHECKPOINT_IS_SHUTDOWN) ? "shutdown " : "", + restartpoint ? "restartpoint" : "checkpoint"); + set_ps_display(activitymsg); + } +} + + +/* + * Perform a checkpoint --- either during shutdown, or on-the-fly + * + * flags is a bitwise OR of the following: + * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown. + * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery. + * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP, + * ignoring checkpoint_completion_target parameter. + * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred + * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or + * CHECKPOINT_END_OF_RECOVERY). + * CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables. + * + * Note: flags contains other bits, of interest here only for logging purposes. + * In particular note that this routine is synchronous and does not pay + * attention to CHECKPOINT_WAIT. + * + * If !shutdown then we are writing an online checkpoint. This is a very special + * kind of operation and WAL record because the checkpoint action occurs over + * a period of time yet logically occurs at just a single LSN. The logical + * position of the WAL record (redo ptr) is the same or earlier than the + * physical position. When we replay WAL we locate the checkpoint via its + * physical position then read the redo ptr and actually start replay at the + * earlier logical position. Note that we don't write *anything* to WAL at + * the logical position, so that location could be any other kind of WAL record. + * All of this mechanism allows us to continue working while we checkpoint. + * As a result, timing of actions is critical here and be careful to note that + * this function will likely take minutes to execute on a busy system. + */ +void +CreateCheckPoint(int flags) +{ + bool shutdown; + CheckPoint checkPoint; + XLogRecPtr recptr; + XLogSegNo _logSegNo; + XLogCtlInsert *Insert = &XLogCtl->Insert; + uint32 freespace; + XLogRecPtr PriorRedoPtr; + XLogRecPtr curInsert; + XLogRecPtr last_important_lsn; + VirtualTransactionId *vxids; + int nvxids; + + /* + * An end-of-recovery checkpoint is really a shutdown checkpoint, just + * issued at a different time. 
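The smoothing described above is deliberately asymmetric: the estimate jumps to any new peak immediately but decays only slowly afterwards. A standalone sketch of UpdateCheckPointDistanceEstimate()'s arithmetic, fed with invented per-cycle byte counts:

/* Standalone sketch of the asymmetric smoothing: jump up immediately when a
 * cycle used more WAL than estimated, decay with a 90/10 mix when it used less. */
#include <stdio.h>

static double estimate = 0.0;

static void
update_distance_estimate(double nbytes)
{
    if (estimate < nbytes)
        estimate = nbytes;                          /* bursty peak: follow it at once */
    else
        estimate = 0.90 * estimate + 0.10 * nbytes; /* quiet cycle: decline gently */
}

int
main(void)
{
    double      cycles[] = {16e6, 64e6, 8e6, 8e6, 8e6};

    for (int i = 0; i < 5; i++)
    {
        update_distance_estimate(cycles[i]);
        printf("cycle %d: estimate = %.0f bytes\n", i + 1, estimate);
    }
    return 0;
}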
+ */ + if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY)) + shutdown = true; + else + shutdown = false; + + /* sanity check */ + if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0) + elog(ERROR, "can't create a checkpoint during recovery"); + + /* + * Initialize InitXLogInsert working areas before entering the critical + * section. Normally, this is done by the first call to + * RecoveryInProgress() or LocalSetXLogInsertAllowed(), but when creating + * an end-of-recovery checkpoint, the LocalSetXLogInsertAllowed call is + * done below in a critical section, and InitXLogInsert cannot be called + * in a critical section. + */ + InitXLogInsert(); + + /* + * Prepare to accumulate statistics. + * + * Note: because it is possible for log_checkpoints to change while a + * checkpoint proceeds, we always accumulate stats, even if + * log_checkpoints is currently off. + */ + MemSet(&CheckpointStats, 0, sizeof(CheckpointStats)); + CheckpointStats.ckpt_start_t = GetCurrentTimestamp(); + + /* + * Let smgr prepare for checkpoint; this has to happen outside the + * critical section and before we determine the REDO pointer. Note that + * smgr must not do anything that'd have to be undone if we decide no + * checkpoint is needed. + */ + SyncPreCheckpoint(); + + /* + * Use a critical section to force system panic if we have trouble. + */ + START_CRIT_SECTION(); + + if (shutdown) + { + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->state = DB_SHUTDOWNING; + ControlFile->time = (pg_time_t) time(NULL); + UpdateControlFile(); + LWLockRelease(ControlFileLock); + } + + /* Begin filling in the checkpoint WAL record */ + MemSet(&checkPoint, 0, sizeof(checkPoint)); + checkPoint.time = (pg_time_t) time(NULL); + + /* + * For Hot Standby, derive the oldestActiveXid before we fix the redo + * pointer. This allows us to begin accumulating changes to assemble our + * starting snapshot of locks and transactions. + */ + if (!shutdown && XLogStandbyInfoActive()) + checkPoint.oldestActiveXid = GetOldestActiveTransactionId(); + else + checkPoint.oldestActiveXid = InvalidTransactionId; + + /* + * Get location of last important record before acquiring insert locks (as + * GetLastImportantRecPtr() also locks WAL locks). + */ + last_important_lsn = GetLastImportantRecPtr(); + + /* + * We must block concurrent insertions while examining insert state to + * determine the checkpoint REDO pointer. + */ + WALInsertLockAcquireExclusive(); + curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos); + + /* + * If this isn't a shutdown or forced checkpoint, and if there has been no + * WAL activity requiring a checkpoint, skip it. The idea here is to + * avoid inserting duplicate checkpoints when the system is idle. + */ + if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY | + CHECKPOINT_FORCE)) == 0) + { + if (last_important_lsn == ControlFile->checkPoint) + { + WALInsertLockRelease(); + END_CRIT_SECTION(); + ereport(DEBUG1, + (errmsg_internal("checkpoint skipped because system is idle"))); + return; + } + } + + /* + * An end-of-recovery checkpoint is created before anyone is allowed to + * write WAL. To allow us to write the checkpoint record, temporarily + * enable XLogInsertAllowed. (This also ensures ThisTimeLineID is + * initialized, which we need here and in AdvanceXLInsertBuffer.) 
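The skip-if-idle test near the end of the block above reduces to comparing the last important LSN with the location of the previous checkpoint record; only shutdown, end-of-recovery, or forced checkpoints bypass it. A simplified sketch with LSNs modeled as plain integers:

/* Sketch of the "skip checkpoint when idle" decision: if nothing important
 * has been written since the last checkpoint record, don't create another one. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
checkpoint_needed(bool forced, uint64_t last_important_lsn,
                  uint64_t prev_checkpoint_lsn)
{
    if (forced)
        return true;            /* shutdown / end-of-recovery / forced: always do it */
    /* idle system: the only record since last time is the checkpoint itself */
    return last_important_lsn != prev_checkpoint_lsn;
}

int
main(void)
{
    printf("%d\n", checkpoint_needed(false, 0x1000, 0x1000));  /* 0: skip */
    printf("%d\n", checkpoint_needed(false, 0x2000, 0x1000));  /* 1: do it */
    return 0;
}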
+ */ + if (flags & CHECKPOINT_END_OF_RECOVERY) + LocalSetXLogInsertAllowed(); + + checkPoint.ThisTimeLineID = ThisTimeLineID; + if (flags & CHECKPOINT_END_OF_RECOVERY) + checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID; + else + checkPoint.PrevTimeLineID = ThisTimeLineID; + + checkPoint.fullPageWrites = Insert->fullPageWrites; + + /* + * Compute new REDO record ptr = location of next XLOG record. + * + * NB: this is NOT necessarily where the checkpoint record itself will be, + * since other backends may insert more XLOG records while we're off doing + * the buffer flush work. Those XLOG records are logically after the + * checkpoint, even though physically before it. Got that? + */ + freespace = INSERT_FREESPACE(curInsert); + if (freespace == 0) + { + if (XLogSegmentOffset(curInsert, wal_segment_size) == 0) + curInsert += SizeOfXLogLongPHD; + else + curInsert += SizeOfXLogShortPHD; + } + checkPoint.redo = curInsert; + + /* + * Here we update the shared RedoRecPtr for future XLogInsert calls; this + * must be done while holding all the insertion locks. + * + * Note: if we fail to complete the checkpoint, RedoRecPtr will be left + * pointing past where it really needs to point. This is okay; the only + * consequence is that XLogInsert might back up whole buffers that it + * didn't really need to. We can't postpone advancing RedoRecPtr because + * XLogInserts that happen while we are dumping buffers must assume that + * their buffer changes are not included in the checkpoint. + */ + RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo; + + /* + * Now we can release the WAL insertion locks, allowing other xacts to + * proceed while we are flushing disk buffers. + */ + WALInsertLockRelease(); + + /* Update the info_lck-protected copy of RedoRecPtr as well */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->RedoRecPtr = checkPoint.redo; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * If enabled, log checkpoint start. We postpone this until now so as not + * to log anything if we decided to skip the checkpoint. + */ + if (log_checkpoints) + LogCheckpointStart(flags, false); + + /* Update the process title */ + update_checkpoint_display(flags, false, false); + + TRACE_POSTGRESQL_CHECKPOINT_START(flags); + + /* + * Get the other info we need for the checkpoint record. + * + * We don't need to save oldestClogXid in the checkpoint, it only matters + * for the short period in which clog is being truncated, and if we crash + * during that we'll redo the clog truncation and fix up oldestClogXid + * there. + */ + LWLockAcquire(XidGenLock, LW_SHARED); + checkPoint.nextXid = ShmemVariableCache->nextXid; + checkPoint.oldestXid = ShmemVariableCache->oldestXid; + checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB; + LWLockRelease(XidGenLock); + + LWLockAcquire(CommitTsLock, LW_SHARED); + checkPoint.oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid; + checkPoint.newestCommitTsXid = ShmemVariableCache->newestCommitTsXid; + LWLockRelease(CommitTsLock); + + LWLockAcquire(OidGenLock, LW_SHARED); + checkPoint.nextOid = ShmemVariableCache->nextOid; + if (!shutdown) + checkPoint.nextOid += ShmemVariableCache->oidCount; + LWLockRelease(OidGenLock); + + MultiXactGetCheckptMulti(shutdown, + &checkPoint.nextMulti, + &checkPoint.nextMultiOffset, + &checkPoint.oldestMulti, + &checkPoint.oldestMultiDB); + + /* + * Having constructed the checkpoint record, ensure all shmem disk buffers + * and commit-log buffers are flushed to disk. + * + * This I/O could fail for various reasons. 
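Computing the REDO pointer above involves one subtlety: if the next insert position falls exactly on a page boundary, the next record really begins after that page's header, which is longer on the first page of a segment. The sketch below reproduces that adjustment with stand-in sizes; the real SizeOfXLogShortPHD/SizeOfXLogLongPHD values are platform-dependent and only assumed here.

/* Sketch of the REDO-pointer adjustment past a page header; all sizes are
 * illustrative stand-ins for the real XLOG constants. */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE        8192
#define SHORT_PAGE_HDR   24                  /* placeholder for SizeOfXLogShortPHD */
#define LONG_PAGE_HDR    40                  /* placeholder for SizeOfXLogLongPHD */
#define SEGMENT_SIZE     (16 * 1024 * 1024)

static uint64_t
redo_pointer(uint64_t cur_insert)
{
    if (cur_insert % PAGE_SIZE == 0)         /* no free space on this page */
    {
        if (cur_insert % SEGMENT_SIZE == 0)  /* first page of a segment */
            cur_insert += LONG_PAGE_HDR;
        else
            cur_insert += SHORT_PAGE_HDR;
    }
    return cur_insert;
}

int
main(void)
{
    printf("%llu\n", (unsigned long long) redo_pointer(3 * PAGE_SIZE));
    printf("%llu\n", (unsigned long long) redo_pointer(SEGMENT_SIZE));
    printf("%llu\n", (unsigned long long) redo_pointer(12345));
    return 0;
}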
If so, we will fail to + * complete the checkpoint, but there is no reason to force a system + * panic. Accordingly, exit critical section while doing it. + */ + END_CRIT_SECTION(); + + /* + * In some cases there are groups of actions that must all occur on one + * side or the other of a checkpoint record. Before flushing the + * checkpoint record we must explicitly wait for any backend currently + * performing those groups of actions. + * + * One example is end of transaction, so we must wait for any transactions + * that are currently in commit critical sections. If an xact inserted + * its commit record into XLOG just before the REDO point, then a crash + * restart from the REDO point would not replay that record, which means + * that our flushing had better include the xact's update of pg_xact. So + * we wait till he's out of his commit critical section before proceeding. + * See notes in RecordTransactionCommit(). + * + * Because we've already released the insertion locks, this test is a bit + * fuzzy: it is possible that we will wait for xacts we didn't really need + * to wait for. But the delay should be short and it seems better to make + * checkpoint take a bit longer than to hold off insertions longer than + * necessary. (In fact, the whole reason we have this issue is that xact.c + * does commit record XLOG insertion and clog update as two separate steps + * protected by different locks, but again that seems best on grounds of + * minimizing lock contention.) + * + * A transaction that has not yet set delayChkpt when we look cannot be at + * risk, since he's not inserted his commit record yet; and one that's + * already cleared it is not at risk either, since he's done fixing clog + * and we will correctly flush the update below. So we cannot miss any + * xacts we need to wait for. + */ + vxids = GetVirtualXIDsDelayingChkpt(&nvxids); + if (nvxids > 0) + { + do + { + pg_usleep(10000L); /* wait for 10 msec */ + } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids)); + } + pfree(vxids); + + CheckPointGuts(checkPoint.redo, flags); + + vxids = GetVirtualXIDsDelayingChkptEnd(&nvxids); + if (nvxids > 0) + { + do + { + pg_usleep(10000L); /* wait for 10 msec */ + } while (HaveVirtualXIDsDelayingChkptEnd(vxids, nvxids)); + } + pfree(vxids); + + /* + * Take a snapshot of running transactions and write this to WAL. This + * allows us to reconstruct the state of running transactions during + * archive recovery, if required. Skip, if this info disabled. + * + * If we are shutting down, or Startup process is completing crash + * recovery we don't need to write running xact data. + */ + if (!shutdown && XLogStandbyInfoActive()) + LogStandbySnapshot(); + + START_CRIT_SECTION(); + + /* + * Now insert the checkpoint record into XLOG. + */ + XLogBeginInsert(); + XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint)); + recptr = XLogInsert(RM_XLOG_ID, + shutdown ? XLOG_CHECKPOINT_SHUTDOWN : + XLOG_CHECKPOINT_ONLINE); + + XLogFlush(recptr); + + /* + * We mustn't write any new WAL after a shutdown checkpoint, or it will be + * overwritten at next startup. No-one should even try, this just allows + * sanity-checking. In the case of an end-of-recovery checkpoint, we want + * to just temporarily disable writing until the system has exited + * recovery. 
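The two wait loops above simply poll the set of blocking backends every 10 ms until it drains. A toy version of that poll-and-sleep pattern, with a plain counter standing in for GetVirtualXIDsDelayingChkpt()/HaveVirtualXIDsDelayingChkpt() and usleep() standing in for pg_usleep():

/* Toy poll-and-sleep loop: keep sleeping 10 ms while any simulated backend
 * is still inside its commit critical section. */
#include <stdio.h>
#include <unistd.h>

static int  backends_delaying = 3;      /* pretend three commits are in flight */

static int
have_backends_delaying(void)
{
    /* the real code re-checks each blocker's shared-memory flag instead */
    if (backends_delaying > 0)
        backends_delaying--;
    return backends_delaying > 0;
}

int
main(void)
{
    while (have_backends_delaying())
    {
        usleep(10000);                  /* wait 10 ms, like pg_usleep(10000L) */
        printf("still waiting...\n");
    }
    printf("all commit critical sections drained\n");
    return 0;
}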
+ */ + if (shutdown) + { + if (flags & CHECKPOINT_END_OF_RECOVERY) + LocalXLogInsertAllowed = -1; /* return to "check" state */ + else + LocalXLogInsertAllowed = 0; /* never again write WAL */ + } + + /* + * We now have ProcLastRecPtr = start of actual checkpoint record, recptr + * = end of actual checkpoint record. + */ + if (shutdown && checkPoint.redo != ProcLastRecPtr) + ereport(PANIC, + (errmsg("concurrent write-ahead log activity while database system is shutting down"))); + + /* + * Remember the prior checkpoint's redo ptr for + * UpdateCheckPointDistanceEstimate() + */ + PriorRedoPtr = ControlFile->checkPointCopy.redo; + + /* + * Update the control file. + */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + if (shutdown) + ControlFile->state = DB_SHUTDOWNED; + ControlFile->checkPoint = ProcLastRecPtr; + ControlFile->checkPointCopy = checkPoint; + ControlFile->time = (pg_time_t) time(NULL); + /* crash recovery should always recover to the end of WAL */ + ControlFile->minRecoveryPoint = InvalidXLogRecPtr; + ControlFile->minRecoveryPointTLI = 0; + + /* + * Persist unloggedLSN value. It's reset on crash recovery, so this goes + * unused on non-shutdown checkpoints, but seems useful to store it always + * for debugging purposes. + */ + SpinLockAcquire(&XLogCtl->ulsn_lck); + ControlFile->unloggedLSN = XLogCtl->unloggedLSN; + SpinLockRelease(&XLogCtl->ulsn_lck); + + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + /* Update shared-memory copy of checkpoint XID/epoch */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->ckptFullXid = checkPoint.nextXid; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * We are now done with critical updates; no need for system panic if we + * have trouble while fooling with old log segments. + */ + END_CRIT_SECTION(); + + /* + * Let smgr do post-checkpoint cleanup (eg, deleting old files). + */ + SyncPostCheckpoint(); + + /* + * Update the average distance between checkpoints if the prior checkpoint + * exists. + */ + if (PriorRedoPtr != InvalidXLogRecPtr) + UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr); + + /* + * Delete old log files, those no longer needed for last checkpoint to + * prevent the disk holding the xlog from growing full. + */ + XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size); + KeepLogSeg(recptr, &_logSegNo); + if (InvalidateObsoleteReplicationSlots(_logSegNo)) + { + /* + * Some slots have been invalidated; recalculate the old-segment + * horizon, starting again from RedoRecPtr. + */ + XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size); + KeepLogSeg(recptr, &_logSegNo); + } + _logSegNo--; + RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr); + + /* + * Make more log segments if needed. (Do this after recycling old log + * segments, since that may supply some of the needed files.) + */ + if (!shutdown) + PreallocXlogFiles(recptr); + + /* + * Truncate pg_subtrans if possible. We can throw away all data before + * the oldest XMIN of any running transaction. No future transaction will + * attempt to reference any pg_subtrans entry older than that (see Asserts + * in subtrans.c). During recovery, though, we mustn't do this because + * StartupSUBTRANS hasn't been called yet. + */ + if (!RecoveryInProgress()) + TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning()); + + /* Real work is done; log and update stats. 
*/ + LogCheckpointEnd(false); + + /* Reset the process title */ + update_checkpoint_display(flags, false, true); + + TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written, + NBuffers, + CheckpointStats.ckpt_segs_added, + CheckpointStats.ckpt_segs_removed, + CheckpointStats.ckpt_segs_recycled); +} + +/* + * Mark the end of recovery in WAL though without running a full checkpoint. + * We can expect that a restartpoint is likely to be in progress as we + * do this, though we are unwilling to wait for it to complete. + * + * CreateRestartPoint() allows for the case where recovery may end before + * the restartpoint completes so there is no concern of concurrent behaviour. + */ +static void +CreateEndOfRecoveryRecord(void) +{ + xl_end_of_recovery xlrec; + XLogRecPtr recptr; + + /* sanity check */ + if (!RecoveryInProgress()) + elog(ERROR, "can only be used to end recovery"); + + xlrec.end_time = GetCurrentTimestamp(); + + WALInsertLockAcquireExclusive(); + xlrec.ThisTimeLineID = ThisTimeLineID; + xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID; + WALInsertLockRelease(); + + LocalSetXLogInsertAllowed(); + + START_CRIT_SECTION(); + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery)); + recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY); + + XLogFlush(recptr); + + /* + * Update the control file so that crash recovery can follow the timeline + * changes to this point. + */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->time = (pg_time_t) time(NULL); + ControlFile->minRecoveryPoint = recptr; + ControlFile->minRecoveryPointTLI = ThisTimeLineID; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + END_CRIT_SECTION(); + + LocalXLogInsertAllowed = -1; /* return to "check" state */ +} + +/* + * Write an OVERWRITE_CONTRECORD message. + * + * When on WAL replay we expect a continuation record at the start of a page + * that is not there, recovery ends and WAL writing resumes at that point. + * But it's wrong to resume writing new WAL back at the start of the record + * that was broken, because downstream consumers of that WAL (physical + * replicas) are not prepared to "rewind". So the first action after + * finishing replay of all valid WAL must be to write a record of this type + * at the point where the contrecord was missing; to support xlogreader + * detecting the special case, XLP_FIRST_IS_OVERWRITE_CONTRECORD is also added + * to the page header where the record occurs. xlogreader has an ad-hoc + * mechanism to report metadata about the broken record, which is what we + * use here. + * + * At replay time, XLP_FIRST_IS_OVERWRITE_CONTRECORD instructs xlogreader to + * skip the record it was reading, and pass back the LSN of the skipped + * record, so that its caller can verify (on "replay" of that record) that the + * XLOG_OVERWRITE_CONTRECORD matches what was effectively overwritten. 
+ */ +static XLogRecPtr +CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn) +{ + xl_overwrite_contrecord xlrec; + XLogRecPtr recptr; + + /* sanity check */ + if (!RecoveryInProgress()) + elog(ERROR, "can only be used at end of recovery"); + + xlrec.overwritten_lsn = aborted_lsn; + xlrec.overwrite_time = GetCurrentTimestamp(); + + START_CRIT_SECTION(); + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xl_overwrite_contrecord)); + + recptr = XLogInsert(RM_XLOG_ID, XLOG_OVERWRITE_CONTRECORD); + + XLogFlush(recptr); + + END_CRIT_SECTION(); + + return recptr; +} + +/* + * Flush all data in shared memory to disk, and fsync + * + * This is the common code shared between regular checkpoints and + * recovery restartpoints. + */ +static void +CheckPointGuts(XLogRecPtr checkPointRedo, int flags) +{ + CheckPointRelationMap(); + CheckPointReplicationSlots(); + CheckPointSnapBuild(); + CheckPointLogicalRewriteHeap(); + CheckPointReplicationOrigin(); + + /* Write out all dirty data in SLRUs and the main buffer pool */ + TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags); + CheckpointStats.ckpt_write_t = GetCurrentTimestamp(); + CheckPointCLOG(); + CheckPointCommitTs(); + CheckPointSUBTRANS(); + CheckPointMultiXact(); + CheckPointPredicate(); + CheckPointBuffers(flags); + + /* Perform all queued up fsyncs */ + TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START(); + CheckpointStats.ckpt_sync_t = GetCurrentTimestamp(); + ProcessSyncRequests(); + CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp(); + TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE(); + + /* We deliberately delay 2PC checkpointing as long as possible */ + CheckPointTwoPhase(checkPointRedo); +} + +/* + * Save a checkpoint for recovery restart if appropriate + * + * This function is called each time a checkpoint record is read from XLOG. + * It must determine whether the checkpoint represents a safe restartpoint or + * not. If so, the checkpoint record is stashed in shared memory so that + * CreateRestartPoint can consult it. (Note that the latter function is + * executed by the checkpointer, while this one will be executed by the + * startup process.) + */ +static void +RecoveryRestartPoint(const CheckPoint *checkPoint) +{ + /* + * Also refrain from creating a restartpoint if we have seen any + * references to non-existent pages. Restarting recovery from the + * restartpoint would not see the references, so we would lose the + * cross-check that the pages belonged to a relation that was dropped + * later. + */ + if (XLogHaveInvalidPages()) + { + elog(trace_recovery(DEBUG2), + "could not record restart point at %X/%X because there " + "are unresolved references to invalid pages", + LSN_FORMAT_ARGS(checkPoint->redo)); + return; + } + + /* + * Copy the checkpoint record to shared memory, so that checkpointer can + * work out the next time it wants to perform a restartpoint. + */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->lastCheckPointRecPtr = ReadRecPtr; + XLogCtl->lastCheckPointEndPtr = EndRecPtr; + XLogCtl->lastCheckPoint = *checkPoint; + SpinLockRelease(&XLogCtl->info_lck); +} + +/* + * Establish a restartpoint if possible. + * + * This is similar to CreateCheckPoint, but is used during WAL recovery + * to establish a point from which recovery can roll forward without + * replaying the entire recovery log. + * + * Returns true if a new restartpoint was established. We can only establish + * a restartpoint if we have replayed a safe checkpoint record since last + * restartpoint. 
+ */ +bool +CreateRestartPoint(int flags) +{ + XLogRecPtr lastCheckPointRecPtr; + XLogRecPtr lastCheckPointEndPtr; + CheckPoint lastCheckPoint; + XLogRecPtr PriorRedoPtr; + XLogRecPtr receivePtr; + XLogRecPtr replayPtr; + TimeLineID replayTLI; + XLogRecPtr endptr; + XLogSegNo _logSegNo; + TimestampTz xtime; + + /* Get a local copy of the last safe checkpoint record. */ + SpinLockAcquire(&XLogCtl->info_lck); + lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr; + lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr; + lastCheckPoint = XLogCtl->lastCheckPoint; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * Check that we're still in recovery mode. It's ok if we exit recovery + * mode after this check, the restart point is valid anyway. + */ + if (!RecoveryInProgress()) + { + ereport(DEBUG2, + (errmsg_internal("skipping restartpoint, recovery has already ended"))); + return false; + } + + /* + * If the last checkpoint record we've replayed is already our last + * restartpoint, we can't perform a new restart point. We still update + * minRecoveryPoint in that case, so that if this is a shutdown restart + * point, we won't start up earlier than before. That's not strictly + * necessary, but when hot standby is enabled, it would be rather weird if + * the database opened up for read-only connections at a point-in-time + * before the last shutdown. Such time travel is still possible in case of + * immediate shutdown, though. + * + * We don't explicitly advance minRecoveryPoint when we do create a + * restartpoint. It's assumed that flushing the buffers will do that as a + * side-effect. + */ + if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) || + lastCheckPoint.redo <= ControlFile->checkPointCopy.redo) + { + ereport(DEBUG2, + (errmsg_internal("skipping restartpoint, already performed at %X/%X", + LSN_FORMAT_ARGS(lastCheckPoint.redo)))); + + UpdateMinRecoveryPoint(InvalidXLogRecPtr, true); + if (flags & CHECKPOINT_IS_SHUTDOWN) + { + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY; + ControlFile->time = (pg_time_t) time(NULL); + UpdateControlFile(); + LWLockRelease(ControlFileLock); + } + return false; + } + + /* + * Update the shared RedoRecPtr so that the startup process can calculate + * the number of segments replayed since last restartpoint, and request a + * restartpoint if it exceeds CheckPointSegments. + * + * Like in CreateCheckPoint(), hold off insertions to update it, although + * during recovery this is just pro forma, because no WAL insertions are + * happening. + */ + WALInsertLockAcquireExclusive(); + RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo; + WALInsertLockRelease(); + + /* Also update the info_lck-protected copy */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->RedoRecPtr = lastCheckPoint.redo; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * Prepare to accumulate statistics. + * + * Note: because it is possible for log_checkpoints to change while a + * checkpoint proceeds, we always accumulate stats, even if + * log_checkpoints is currently off. 
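CreateRestartPoint() bails out early unless the startup process has stashed a checkpoint record that is strictly newer than the one pg_control already points at. That decision can be sketched as a small predicate over LSNs, modeled here as 64-bit integers with 0 playing the role of InvalidXLogRecPtr:

/* Sketch of the "is a new restartpoint possible?" test. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
restartpoint_possible(uint64_t last_replayed_ckpt_rec,
                      uint64_t last_replayed_ckpt_redo,
                      uint64_t controlfile_ckpt_redo)
{
    if (last_replayed_ckpt_rec == 0)            /* no safe checkpoint seen yet */
        return false;
    if (last_replayed_ckpt_redo <= controlfile_ckpt_redo)
        return false;                           /* already performed this one */
    return true;
}

int
main(void)
{
    printf("%d\n", restartpoint_possible(0, 0, 0x5000));            /* 0 */
    printf("%d\n", restartpoint_possible(0x9000, 0x5000, 0x5000));  /* 0 */
    printf("%d\n", restartpoint_possible(0x9000, 0x8000, 0x5000));  /* 1 */
    return 0;
}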
+ */ + MemSet(&CheckpointStats, 0, sizeof(CheckpointStats)); + CheckpointStats.ckpt_start_t = GetCurrentTimestamp(); + + if (log_checkpoints) + LogCheckpointStart(flags, true); + + /* Update the process title */ + update_checkpoint_display(flags, true, false); + + CheckPointGuts(lastCheckPoint.redo, flags); + + /* + * Remember the prior checkpoint's redo ptr for + * UpdateCheckPointDistanceEstimate() + */ + PriorRedoPtr = ControlFile->checkPointCopy.redo; + + /* + * Update pg_control, using current time. Check that it still shows an + * older checkpoint, else do nothing; this is a quick hack to make sure + * nothing really bad happens if somehow we get here after the + * end-of-recovery checkpoint. + */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + if (ControlFile->checkPointCopy.redo < lastCheckPoint.redo) + { + /* + * Update the checkpoint information. We do this even if the cluster + * does not show DB_IN_ARCHIVE_RECOVERY to match with the set of WAL + * segments recycled below. + */ + ControlFile->checkPoint = lastCheckPointRecPtr; + ControlFile->checkPointCopy = lastCheckPoint; + ControlFile->time = (pg_time_t) time(NULL); + + /* + * Ensure minRecoveryPoint is past the checkpoint record and update it + * if the control file still shows DB_IN_ARCHIVE_RECOVERY. Normally, + * this will have happened already while writing out dirty buffers, + * but not necessarily - e.g. because no buffers were dirtied. We do + * this because a non-exclusive base backup uses minRecoveryPoint to + * determine which WAL files must be included in the backup, and the + * file (or files) containing the checkpoint record must be included, + * at a minimum. Note that for an ordinary restart of recovery there's + * no value in having the minimum recovery point any earlier than this + * anyway, because redo will begin just after the checkpoint record. + */ + if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY) + { + if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr) + { + ControlFile->minRecoveryPoint = lastCheckPointEndPtr; + ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID; + + /* update local copy */ + minRecoveryPoint = ControlFile->minRecoveryPoint; + minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + } + if (flags & CHECKPOINT_IS_SHUTDOWN) + ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY; + } + UpdateControlFile(); + } + LWLockRelease(ControlFileLock); + + /* + * Update the average distance between checkpoints/restartpoints if the + * prior checkpoint exists. + */ + if (PriorRedoPtr != InvalidXLogRecPtr) + UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr); + + /* + * Delete old log files, those no longer needed for last restartpoint to + * prevent the disk holding the xlog from growing full. + */ + XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size); + + /* + * Retreat _logSegNo using the current end of xlog replayed or received, + * whichever is later. + */ + receivePtr = GetWalRcvFlushRecPtr(NULL, NULL); + replayPtr = GetXLogReplayRecPtr(&replayTLI); + endptr = (receivePtr < replayPtr) ?
replayPtr : receivePtr; + KeepLogSeg(endptr, &_logSegNo); + if (InvalidateObsoleteReplicationSlots(_logSegNo)) + { + /* + * Some slots have been invalidated; recalculate the old-segment + * horizon, starting again from RedoRecPtr. + */ + XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size); + KeepLogSeg(endptr, &_logSegNo); + } + _logSegNo--; + + /* + * Try to recycle segments on a useful timeline. If we've been promoted + * since the beginning of this restartpoint, use the new timeline chosen + * at end of recovery (RecoveryInProgress() sets ThisTimeLineID in that + * case). If we're still in recovery, use the timeline we're currently + * replaying. + * + * There is no guarantee that the WAL segments will be useful on the + * current timeline; if recovery proceeds to a new timeline right after + * this, the pre-allocated WAL segments on this timeline will not be used, + * and will go wasted until recycled on the next restartpoint. We'll live + * with that. + */ + if (RecoveryInProgress()) + ThisTimeLineID = replayTLI; + + RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr); + + /* + * Make more log segments if needed. (Do this after recycling old log + * segments, since that may supply some of the needed files.) + */ + PreallocXlogFiles(endptr); + + /* + * ThisTimeLineID is normally not set when we're still in recovery. + * However, recycling/preallocating segments above needed ThisTimeLineID + * to determine which timeline to install the segments on. Reset it now, + * to restore the normal state of affairs for debugging purposes. + */ + if (RecoveryInProgress()) + ThisTimeLineID = 0; + + /* + * Truncate pg_subtrans if possible. We can throw away all data before + * the oldest XMIN of any running transaction. No future transaction will + * attempt to reference any pg_subtrans entry older than that (see Asserts + * in subtrans.c). When hot standby is disabled, though, we mustn't do + * this because StartupSUBTRANS hasn't been called yet. + */ + if (EnableHotStandby) + TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning()); + + /* Real work is done; log and update stats. */ + LogCheckpointEnd(true); + + /* Reset the process title */ + update_checkpoint_display(flags, true, true); + + xtime = GetLatestXTime(); + ereport((log_checkpoints ? LOG : DEBUG2), + (errmsg("recovery restart point at %X/%X", + LSN_FORMAT_ARGS(lastCheckPoint.redo)), + xtime ? errdetail("Last completed transaction was at log time %s.", + timestamptz_to_str(xtime)) : 0)); + + /* + * Finally, execute archive_cleanup_command, if any. + */ + if (archiveCleanupCommand && strcmp(archiveCleanupCommand, "") != 0) + ExecuteRecoveryCommand(archiveCleanupCommand, + "archive_cleanup_command", + false); + + return true; +} + +/* + * Report availability of WAL for the given target LSN + * (typically a slot's restart_lsn) + * + * Returns one of the following enum values: + * + * * WALAVAIL_RESERVED means targetLSN is available and it is in the range of + * max_wal_size. + * + * * WALAVAIL_EXTENDED means it is still available by preserving extra + * segments beyond max_wal_size. If max_slot_wal_keep_size is smaller + * than max_wal_size, this state is not returned. + * + * * WALAVAIL_UNRESERVED means it is being lost and the next checkpoint will + * remove reserved segments. The walsender using this slot may return to the + * above. + * + * * WALAVAIL_REMOVED means it has been removed. A replication stream on + * a slot with this LSN cannot continue after a restart. 
+ * + * * WALAVAIL_INVALID_LSN means the slot hasn't been set to reserve WAL. + */ +WALAvailability +GetWALAvailability(XLogRecPtr targetLSN) +{ + XLogRecPtr currpos; /* current write LSN */ + XLogSegNo currSeg; /* segid of currpos */ + XLogSegNo targetSeg; /* segid of targetLSN */ + XLogSegNo oldestSeg; /* actual oldest segid */ + XLogSegNo oldestSegMaxWalSize; /* oldest segid kept by max_wal_size */ + XLogSegNo oldestSlotSeg; /* oldest segid kept by slot */ + uint64 keepSegs; + + /* + * slot does not reserve WAL. Either deactivated, or has never been active + */ + if (XLogRecPtrIsInvalid(targetLSN)) + return WALAVAIL_INVALID_LSN; + + /* + * Calculate the oldest segment currently reserved by all slots, + * considering wal_keep_size and max_slot_wal_keep_size. Initialize + * oldestSlotSeg to the current segment. + */ + currpos = GetXLogWriteRecPtr(); + XLByteToSeg(currpos, oldestSlotSeg, wal_segment_size); + KeepLogSeg(currpos, &oldestSlotSeg); + + /* + * Find the oldest extant segment file. We get 1 until checkpoint removes + * the first WAL segment file since startup, which causes the status being + * wrong under certain abnormal conditions but that doesn't actually harm. + */ + oldestSeg = XLogGetLastRemovedSegno() + 1; + + /* calculate oldest segment by max_wal_size */ + XLByteToSeg(currpos, currSeg, wal_segment_size); + keepSegs = ConvertToXSegs(max_wal_size_mb, wal_segment_size) + 1; + + if (currSeg > keepSegs) + oldestSegMaxWalSize = currSeg - keepSegs; + else + oldestSegMaxWalSize = 1; + + /* the segment we care about */ + XLByteToSeg(targetLSN, targetSeg, wal_segment_size); + + /* + * No point in returning reserved or extended status values if the + * targetSeg is known to be lost. + */ + if (targetSeg >= oldestSlotSeg) + { + /* show "reserved" when targetSeg is within max_wal_size */ + if (targetSeg >= oldestSegMaxWalSize) + return WALAVAIL_RESERVED; + + /* being retained by slots exceeding max_wal_size */ + return WALAVAIL_EXTENDED; + } + + /* WAL segments are no longer retained but haven't been removed yet */ + if (targetSeg >= oldestSeg) + return WALAVAIL_UNRESERVED; + + /* Definitely lost */ + return WALAVAIL_REMOVED; +} + + +/* + * Retreat *logSegNo to the last segment that we need to retain because of + * either wal_keep_size or replication slots. + * + * This is calculated by subtracting wal_keep_size from the given xlog + * location, recptr and by making sure that that result is below the + * requirement of replication slots. For the latter criterion we do consider + * the effects of max_slot_wal_keep_size: reserve at most that much space back + * from recptr. + * + * Note about replication slots: if this function calculates a value + * that's further ahead than what slots need reserved, then affected + * slots need to be invalidated and this function invoked again. + * XXX it might be a good idea to rewrite this function so that + * invalidation is optionally done here, instead. + */ +static void +KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo) +{ + XLogSegNo currSegNo; + XLogSegNo segno; + XLogRecPtr keep; + + XLByteToSeg(recptr, currSegNo, wal_segment_size); + segno = currSegNo; + + /* + * Calculate how many segments are kept by slots first, adjusting for + * max_slot_wal_keep_size. + */ + keep = XLogGetReplicationSlotMinimumLSN(); + if (keep != InvalidXLogRecPtr) + { + XLByteToSeg(keep, segno, wal_segment_size); + + /* Cap by max_slot_wal_keep_size ... 
*/ + if (max_slot_wal_keep_size_mb >= 0) + { + uint64 slot_keep_segs; + + slot_keep_segs = + ConvertToXSegs(max_slot_wal_keep_size_mb, wal_segment_size); + + if (currSegNo - segno > slot_keep_segs) + segno = currSegNo - slot_keep_segs; + } + } + + /* but, keep at least wal_keep_size if that's set */ + if (wal_keep_size_mb > 0) + { + uint64 keep_segs; + + keep_segs = ConvertToXSegs(wal_keep_size_mb, wal_segment_size); + if (currSegNo - segno < keep_segs) + { + /* avoid underflow, don't go below 1 */ + if (currSegNo <= keep_segs) + segno = 1; + else + segno = currSegNo - keep_segs; + } + } + + /* don't delete WAL segments newer than the calculated segment */ + if (segno < *logSegNo) + *logSegNo = segno; +} + +/* + * Write a NEXTOID log record + */ +void +XLogPutNextOid(Oid nextOid) +{ + XLogBeginInsert(); + XLogRegisterData((char *) (&nextOid), sizeof(Oid)); + (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID); + + /* + * We need not flush the NEXTOID record immediately, because any of the + * just-allocated OIDs could only reach disk as part of a tuple insert or + * update that would have its own XLOG record that must follow the NEXTOID + * record. Therefore, the standard buffer LSN interlock applied to those + * records will ensure no such OID reaches disk before the NEXTOID record + * does. + * + * Note, however, that the above statement only covers state "within" the + * database. When we use a generated OID as a file or directory name, we + * are in a sense violating the basic WAL rule, because that filesystem + * change may reach disk before the NEXTOID WAL record does. The impact + * of this is that if a database crash occurs immediately afterward, we + * might after restart re-generate the same OID and find that it conflicts + * with the leftover file or directory. But since for safety's sake we + * always loop until finding a nonconflicting filename, this poses no real + * problem in practice. See pgsql-hackers discussion 27-Sep-2006. + */ +} + +/* + * Write an XLOG SWITCH record. + * + * Here we just blindly issue an XLogInsert request for the record. + * All the magic happens inside XLogInsert. + * + * The return value is either the end+1 address of the switch record, + * or the end+1 address of the prior segment if we did not need to + * write a switch record because we are already at segment start. + */ +XLogRecPtr +RequestXLogSwitch(bool mark_unimportant) +{ + XLogRecPtr RecPtr; + + /* XLOG SWITCH has no data */ + XLogBeginInsert(); + + if (mark_unimportant) + XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT); + RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH); + + return RecPtr; +} + +/* + * Write a RESTORE POINT record + */ +XLogRecPtr +XLogRestorePoint(const char *rpName) +{ + XLogRecPtr RecPtr; + xl_restore_point xlrec; + + xlrec.rp_time = GetCurrentTimestamp(); + strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN); + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point)); + + RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT); + + ereport(LOG, + (errmsg("restore point \"%s\" created at %X/%X", + rpName, LSN_FORMAT_ARGS(RecPtr)))); + + return RecPtr; +} + +/* + * Check if any of the GUC parameters that are critical for hot standby + * have changed, and update the value in pg_control file if necessary. 
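KeepLogSeg(), shown just above, retreats the deletion horizon to the oldest segment a slot still needs, but never by more than max_slot_wal_keep_size, and then widens retention again so that at least wal_keep_size worth of segments survive. A standalone sketch of that arithmetic in whole segments; it ignores the no-slot case and the megabyte-to-segment conversion, which are simplifying assumptions.

/* Sketch of the KeepLogSeg() retention arithmetic, all quantities in segments. */
#include <stdio.h>

static unsigned long
keep_log_seg(unsigned long cur_seg,        /* segment containing recptr */
             unsigned long slot_min_seg,   /* oldest segment a slot needs */
             unsigned long max_slot_keep,  /* cap from max_slot_wal_keep_size */
             unsigned long wal_keep)       /* floor from wal_keep_size */
{
    unsigned long segno = slot_min_seg;

    /* don't let slots hold back more than max_slot_wal_keep_size */
    if (cur_seg - segno > max_slot_keep)
        segno = cur_seg - max_slot_keep;

    /* but always keep at least wal_keep_size behind the current position */
    if (cur_seg - segno < wal_keep)
        segno = (cur_seg <= wal_keep) ? 1 : cur_seg - wal_keep;

    return segno;               /* oldest segment that must be retained */
}

int
main(void)
{
    /* a lagging slot wants segment 100, but the cap pulls retention up to 180 */
    printf("%lu\n", keep_log_seg(200, 100, 20, 10));   /* 180 */
    /* hardly any lag: wal_keep_size dominates */
    printf("%lu\n", keep_log_seg(200, 199, 20, 10));   /* 190 */
    return 0;
}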
+ */ +static void +XLogReportParameters(void) +{ + if (wal_level != ControlFile->wal_level || + wal_log_hints != ControlFile->wal_log_hints || + MaxConnections != ControlFile->MaxConnections || + max_worker_processes != ControlFile->max_worker_processes || + max_wal_senders != ControlFile->max_wal_senders || + max_prepared_xacts != ControlFile->max_prepared_xacts || + max_locks_per_xact != ControlFile->max_locks_per_xact || + track_commit_timestamp != ControlFile->track_commit_timestamp) + { + /* + * The change in number of backend slots doesn't need to be WAL-logged + * if archiving is not enabled, as you can't start archive recovery + * with wal_level=minimal anyway. We don't really care about the + * values in pg_control either if wal_level=minimal, but seems better + * to keep them up-to-date to avoid confusion. + */ + if (wal_level != ControlFile->wal_level || XLogIsNeeded()) + { + xl_parameter_change xlrec; + XLogRecPtr recptr; + + xlrec.MaxConnections = MaxConnections; + xlrec.max_worker_processes = max_worker_processes; + xlrec.max_wal_senders = max_wal_senders; + xlrec.max_prepared_xacts = max_prepared_xacts; + xlrec.max_locks_per_xact = max_locks_per_xact; + xlrec.wal_level = wal_level; + xlrec.wal_log_hints = wal_log_hints; + xlrec.track_commit_timestamp = track_commit_timestamp; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + + recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE); + XLogFlush(recptr); + } + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + + ControlFile->MaxConnections = MaxConnections; + ControlFile->max_worker_processes = max_worker_processes; + ControlFile->max_wal_senders = max_wal_senders; + ControlFile->max_prepared_xacts = max_prepared_xacts; + ControlFile->max_locks_per_xact = max_locks_per_xact; + ControlFile->wal_level = wal_level; + ControlFile->wal_log_hints = wal_log_hints; + ControlFile->track_commit_timestamp = track_commit_timestamp; + UpdateControlFile(); + + LWLockRelease(ControlFileLock); + } +} + +/* + * Update full_page_writes in shared memory, and write an + * XLOG_FPW_CHANGE record if necessary. + * + * Note: this function assumes there is no other process running + * concurrently that could update it. + */ +void +UpdateFullPageWrites(void) +{ + XLogCtlInsert *Insert = &XLogCtl->Insert; + bool recoveryInProgress; + + /* + * Do nothing if full_page_writes has not been changed. + * + * It's safe to check the shared full_page_writes without the lock, + * because we assume that there is no concurrently running process which + * can update it. + */ + if (fullPageWrites == Insert->fullPageWrites) + return; + + /* + * Perform this outside critical section so that the WAL insert + * initialization done by RecoveryInProgress() doesn't trigger an + * assertion failure. + */ + recoveryInProgress = RecoveryInProgress(); + + START_CRIT_SECTION(); + + /* + * It's always safe to take full page images, even when not strictly + * required, but not the other round. So if we're setting full_page_writes + * to true, first set it true and then write the WAL record. If we're + * setting it to false, first write the WAL record and then set the global + * flag. + */ + if (fullPageWrites) + { + WALInsertLockAcquireExclusive(); + Insert->fullPageWrites = true; + WALInsertLockRelease(); + } + + /* + * Write an XLOG_FPW_CHANGE record. This allows us to keep track of + * full_page_writes during archive recovery, if required. 
+ */ + if (XLogStandbyInfoActive() && !recoveryInProgress) + { + XLogBeginInsert(); + XLogRegisterData((char *) (&fullPageWrites), sizeof(bool)); + + XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE); + } + + if (!fullPageWrites) + { + WALInsertLockAcquireExclusive(); + Insert->fullPageWrites = false; + WALInsertLockRelease(); + } + END_CRIT_SECTION(); +} + +/* + * Check that it's OK to switch to new timeline during recovery. + * + * 'lsn' is the address of the shutdown checkpoint record we're about to + * replay. (Currently, timeline can only change at a shutdown checkpoint). + */ +static void +checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI) +{ + /* Check that the record agrees on what the current (old) timeline is */ + if (prevTLI != ThisTimeLineID) + ereport(PANIC, + (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record", + prevTLI, ThisTimeLineID))); + + /* + * The new timeline better be in the list of timelines we expect to see, + * according to the timeline history. It should also not decrease. + */ + if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs)) + ereport(PANIC, + (errmsg("unexpected timeline ID %u (after %u) in checkpoint record", + newTLI, ThisTimeLineID))); + + /* + * If we have not yet reached min recovery point, and we're about to + * switch to a timeline greater than the timeline of the min recovery + * point: trouble. After switching to the new timeline, we could not + * possibly visit the min recovery point on the correct timeline anymore. + * This can happen if there is a newer timeline in the archive that + * branched before the timeline the min recovery point is on, and you + * attempt to do PITR to the new timeline. + */ + if (!XLogRecPtrIsInvalid(minRecoveryPoint) && + lsn < minRecoveryPoint && + newTLI > minRecoveryPointTLI) + ereport(PANIC, + (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u", + newTLI, + LSN_FORMAT_ARGS(minRecoveryPoint), + minRecoveryPointTLI))); + + /* Looks good */ +} + +/* + * XLOG resource manager's routines + * + * Definitions of info values are in include/catalog/pg_control.h, though + * not all record types are related to control file updates. + */ +void +xlog_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + XLogRecPtr lsn = record->EndRecPtr; + + /* in XLOG rmgr, backup blocks are only used by XLOG_FPI records */ + Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT || + !XLogRecHasAnyBlockRefs(record)); + + if (info == XLOG_NEXTOID) + { + Oid nextOid; + + /* + * We used to try to take the maximum of ShmemVariableCache->nextOid + * and the recorded nextOid, but that fails if the OID counter wraps + * around. Since no OID allocation should be happening during replay + * anyway, better to just believe the record exactly. We still take + * OidGenLock while setting the variable, just in case. 
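+		 *
+		 * (Note that resetting oidCount to zero below also means the first
+		 * GetNewObjectId() call once normal operation resumes will allocate
+		 * and WAL-log a fresh batch of OIDs rather than trusting any cached
+		 * range.)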
+ */ + memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid)); + LWLockAcquire(OidGenLock, LW_EXCLUSIVE); + ShmemVariableCache->nextOid = nextOid; + ShmemVariableCache->oidCount = 0; + LWLockRelease(OidGenLock); + } + else if (info == XLOG_CHECKPOINT_SHUTDOWN) + { + CheckPoint checkPoint; + + memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint)); + /* In a SHUTDOWN checkpoint, believe the counters exactly */ + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + ShmemVariableCache->nextXid = checkPoint.nextXid; + LWLockRelease(XidGenLock); + LWLockAcquire(OidGenLock, LW_EXCLUSIVE); + ShmemVariableCache->nextOid = checkPoint.nextOid; + ShmemVariableCache->oidCount = 0; + LWLockRelease(OidGenLock); + MultiXactSetNextMXact(checkPoint.nextMulti, + checkPoint.nextMultiOffset); + + MultiXactAdvanceOldest(checkPoint.oldestMulti, + checkPoint.oldestMultiDB); + + /* + * No need to set oldestClogXid here as well; it'll be set when we + * redo an xl_clog_truncate if it changed since initialization. + */ + SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); + + /* + * If we see a shutdown checkpoint while waiting for an end-of-backup + * record, the backup was canceled and the end-of-backup record will + * never arrive. + */ + if (ArchiveRecoveryRequested && + !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) && + XLogRecPtrIsInvalid(ControlFile->backupEndPoint)) + ereport(PANIC, + (errmsg("online backup was canceled, recovery cannot continue"))); + + /* + * If we see a shutdown checkpoint, we know that nothing was running + * on the primary at this point. So fake-up an empty running-xacts + * record and use that here and now. Recover additional standby state + * for prepared transactions. + */ + if (standbyState >= STANDBY_INITIALIZED) + { + TransactionId *xids; + int nxids; + TransactionId oldestActiveXID; + TransactionId latestCompletedXid; + RunningTransactionsData running; + + oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids); + + /* + * Construct a RunningTransactions snapshot representing a shut + * down server, with only prepared transactions still alive. We're + * never overflowed at this point because all subxids are listed + * with their parent prepared transactions. + */ + running.xcnt = nxids; + running.subxcnt = 0; + running.subxid_overflow = false; + running.nextXid = XidFromFullTransactionId(checkPoint.nextXid); + running.oldestRunningXid = oldestActiveXID; + latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid); + TransactionIdRetreat(latestCompletedXid); + Assert(TransactionIdIsNormal(latestCompletedXid)); + running.latestCompletedXid = latestCompletedXid; + running.xids = xids; + + ProcArrayApplyRecoveryInfo(&running); + + StandbyRecoverPreparedTransactions(); + } + + /* ControlFile->checkPointCopy always tracks the latest ckpt XID */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; + LWLockRelease(ControlFileLock); + + /* Update shared-memory copy of checkpoint XID/epoch */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->ckptFullXid = checkPoint.nextXid; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * We should've already switched to the new TLI before replaying this + * record. 
+ */ + if (checkPoint.ThisTimeLineID != ThisTimeLineID) + ereport(PANIC, + (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record", + checkPoint.ThisTimeLineID, ThisTimeLineID))); + + RecoveryRestartPoint(&checkPoint); + } + else if (info == XLOG_CHECKPOINT_ONLINE) + { + CheckPoint checkPoint; + + memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint)); + /* In an ONLINE checkpoint, treat the XID counter as a minimum */ + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + if (FullTransactionIdPrecedes(ShmemVariableCache->nextXid, + checkPoint.nextXid)) + ShmemVariableCache->nextXid = checkPoint.nextXid; + LWLockRelease(XidGenLock); + + /* + * We ignore the nextOid counter in an ONLINE checkpoint, preferring + * to track OID assignment through XLOG_NEXTOID records. The nextOid + * counter is from the start of the checkpoint and might well be stale + * compared to later XLOG_NEXTOID records. We could try to take the + * maximum of the nextOid counter and our latest value, but since + * there's no particular guarantee about the speed with which the OID + * counter wraps around, that's a risky thing to do. In any case, + * users of the nextOid counter are required to avoid assignment of + * duplicates, so that a somewhat out-of-date value should be safe. + */ + + /* Handle multixact */ + MultiXactAdvanceNextMXact(checkPoint.nextMulti, + checkPoint.nextMultiOffset); + + /* + * NB: This may perform multixact truncation when replaying WAL + * generated by an older primary. + */ + MultiXactAdvanceOldest(checkPoint.oldestMulti, + checkPoint.oldestMultiDB); + if (TransactionIdPrecedes(ShmemVariableCache->oldestXid, + checkPoint.oldestXid)) + SetTransactionIdLimit(checkPoint.oldestXid, + checkPoint.oldestXidDB); + /* ControlFile->checkPointCopy always tracks the latest ckpt XID */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; + LWLockRelease(ControlFileLock); + + /* Update shared-memory copy of checkpoint XID/epoch */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->ckptFullXid = checkPoint.nextXid; + SpinLockRelease(&XLogCtl->info_lck); + + /* TLI should not change in an on-line checkpoint */ + if (checkPoint.ThisTimeLineID != ThisTimeLineID) + ereport(PANIC, + (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record", + checkPoint.ThisTimeLineID, ThisTimeLineID))); + + RecoveryRestartPoint(&checkPoint); + } + else if (info == XLOG_OVERWRITE_CONTRECORD) + { + xl_overwrite_contrecord xlrec; + + memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord)); + VerifyOverwriteContrecord(&xlrec, record); + } + else if (info == XLOG_END_OF_RECOVERY) + { + xl_end_of_recovery xlrec; + + memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery)); + + /* + * For Hot Standby, we could treat this like a Shutdown Checkpoint, + * but this case is rarer and harder to test, so the benefit doesn't + * outweigh the potential extra cost of maintenance. + */ + + /* + * We should've already switched to the new TLI before replaying this + * record. 
+ */ + if (xlrec.ThisTimeLineID != ThisTimeLineID) + ereport(PANIC, + (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record", + xlrec.ThisTimeLineID, ThisTimeLineID))); + } + else if (info == XLOG_NOOP) + { + /* nothing to do here */ + } + else if (info == XLOG_SWITCH) + { + /* nothing to do here */ + } + else if (info == XLOG_RESTORE_POINT) + { + /* nothing to do here */ + } + else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT) + { + /* + * Full-page image (FPI) records contain nothing else but a backup + * block (or multiple backup blocks). Every block reference must + * include a full-page image - otherwise there would be no point in + * this record. + * + * No recovery conflicts are generated by these generic records - if a + * resource manager needs to generate conflicts, it has to define a + * separate WAL record type and redo routine. + * + * XLOG_FPI_FOR_HINT records are generated when a page needs to be + * WAL- logged because of a hint bit update. They are only generated + * when checksums are enabled. There is no difference in handling + * XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info + * code just to distinguish them for statistics purposes. + */ + for (uint8 block_id = 0; block_id <= record->max_block_id; block_id++) + { + Buffer buffer; + + if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED) + elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block"); + UnlockReleaseBuffer(buffer); + } + } + else if (info == XLOG_BACKUP_END) + { + XLogRecPtr startpoint; + + memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint)); + + if (ControlFile->backupStartPoint == startpoint) + { + /* + * We have reached the end of base backup, the point where + * pg_stop_backup() was done. The data on disk is now consistent. + * Reset backupStartPoint, and update minRecoveryPoint to make + * sure we don't allow starting up at an earlier point even if + * recovery is stopped and restarted soon after this. + */ + elog(DEBUG1, "end of backup reached"); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + + if (ControlFile->minRecoveryPoint < lsn) + { + ControlFile->minRecoveryPoint = lsn; + ControlFile->minRecoveryPointTLI = ThisTimeLineID; + } + ControlFile->backupStartPoint = InvalidXLogRecPtr; + ControlFile->backupEndRequired = false; + UpdateControlFile(); + + LWLockRelease(ControlFileLock); + } + } + else if (info == XLOG_PARAMETER_CHANGE) + { + xl_parameter_change xlrec; + + /* Update our copy of the parameters in pg_control */ + memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change)); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->MaxConnections = xlrec.MaxConnections; + ControlFile->max_worker_processes = xlrec.max_worker_processes; + ControlFile->max_wal_senders = xlrec.max_wal_senders; + ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts; + ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact; + ControlFile->wal_level = xlrec.wal_level; + ControlFile->wal_log_hints = xlrec.wal_log_hints; + + /* + * Update minRecoveryPoint to ensure that if recovery is aborted, we + * recover back up to this point before allowing hot standby again. + * This is important if the max_* settings are decreased, to ensure + * you don't run queries against the WAL preceding the change. The + * local copies cannot be updated as long as crash recovery is + * happening and we expect all the WAL to be replayed. 
+ */ + if (InArchiveRecovery) + { + minRecoveryPoint = ControlFile->minRecoveryPoint; + minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + } + if (minRecoveryPoint != InvalidXLogRecPtr && minRecoveryPoint < lsn) + { + ControlFile->minRecoveryPoint = lsn; + ControlFile->minRecoveryPointTLI = ThisTimeLineID; + } + + CommitTsParameterChange(xlrec.track_commit_timestamp, + ControlFile->track_commit_timestamp); + ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp; + + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + /* Check to see if any parameter change gives a problem on recovery */ + CheckRequiredParameterValues(); + } + else if (info == XLOG_FPW_CHANGE) + { + bool fpw; + + memcpy(&fpw, XLogRecGetData(record), sizeof(bool)); + + /* + * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that + * do_pg_start_backup() and do_pg_stop_backup() can check whether + * full_page_writes has been disabled during online backup. + */ + if (!fpw) + { + SpinLockAcquire(&XLogCtl->info_lck); + if (XLogCtl->lastFpwDisableRecPtr < ReadRecPtr) + XLogCtl->lastFpwDisableRecPtr = ReadRecPtr; + SpinLockRelease(&XLogCtl->info_lck); + } + + /* Keep track of full_page_writes */ + lastFullPageWrites = fpw; + } +} + +/* + * Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. + */ +static void +VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec, XLogReaderState *state) +{ + if (xlrec->overwritten_lsn != state->overwrittenRecPtr) + elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X", + LSN_FORMAT_ARGS(xlrec->overwritten_lsn), + LSN_FORMAT_ARGS(state->overwrittenRecPtr)); + + /* We have safely skipped the aborted record */ + abortedRecPtr = InvalidXLogRecPtr; + missingContrecPtr = InvalidXLogRecPtr; + + ereport(LOG, + (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s", + LSN_FORMAT_ARGS(xlrec->overwritten_lsn), + timestamptz_to_str(xlrec->overwrite_time)))); + + /* Verifying the record should only happen once */ + state->overwrittenRecPtr = InvalidXLogRecPtr; +} + +#ifdef WAL_DEBUG + +static void +xlog_outrec(StringInfo buf, XLogReaderState *record) +{ + appendStringInfo(buf, "prev %X/%X; xid %u", + LSN_FORMAT_ARGS(XLogRecGetPrev(record)), + XLogRecGetXid(record)); + + appendStringInfo(buf, "; len %u", + XLogRecGetDataLen(record)); + + xlog_block_info(buf, record); +} +#endif /* WAL_DEBUG */ + +/* + * Returns a string giving information about all the blocks in an + * XLogRecord. + */ +static void +xlog_block_info(StringInfo buf, XLogReaderState *record) +{ + int block_id; + + /* decode block references */ + for (block_id = 0; block_id <= record->max_block_id; block_id++) + { + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blk; + + if (!XLogRecHasBlockRef(record, block_id)) + continue; + + XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk); + if (forknum != MAIN_FORKNUM) + appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, fork %u, blk %u", + block_id, + rnode.spcNode, rnode.dbNode, rnode.relNode, + forknum, + blk); + else + appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, blk %u", + block_id, + rnode.spcNode, rnode.dbNode, rnode.relNode, + blk); + if (XLogRecHasBlockImage(record, block_id)) + appendStringInfoString(buf, " FPW"); + } +} + +/* + * Returns a string describing an XLogRecord, consisting of its identity + * optionally followed by a colon, a space, and a further description. 
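+ *
+ * For example (values purely illustrative), a restore point record created
+ * by XLogRestorePoint() above would be rendered roughly as
+ * "XLOG/RESTORE_POINT: my_restore_point".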
+ */ +static void +xlog_outdesc(StringInfo buf, XLogReaderState *record) +{ + RmgrId rmid = XLogRecGetRmid(record); + uint8 info = XLogRecGetInfo(record); + const char *id; + + appendStringInfoString(buf, RmgrTable[rmid].rm_name); + appendStringInfoChar(buf, '/'); + + id = RmgrTable[rmid].rm_identify(info); + if (id == NULL) + appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK); + else + appendStringInfo(buf, "%s: ", id); + + RmgrTable[rmid].rm_desc(buf, record); +} + + +/* + * Return the (possible) sync flag used for opening a file, depending on the + * value of the GUC wal_sync_method. + */ +static int +get_sync_bit(int method) +{ + int o_direct_flag = 0; + + /* If fsync is disabled, never open in sync mode */ + if (!enableFsync) + return 0; + + /* + * Optimize writes by bypassing kernel cache with O_DIRECT when using + * O_SYNC/O_FSYNC and O_DSYNC. But only if archiving and streaming are + * disabled, otherwise the archive command or walsender process will read + * the WAL soon after writing it, which is guaranteed to cause a physical + * read if we bypassed the kernel cache. We also skip the + * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same + * reason. + * + * Never use O_DIRECT in walreceiver process for similar reasons; the WAL + * written by walreceiver is normally read by the startup process soon + * after it's written. Also, walreceiver performs unaligned writes, which + * don't work with O_DIRECT, so it is required for correctness too. + */ + if (!XLogIsNeeded() && !AmWalReceiverProcess()) + o_direct_flag = PG_O_DIRECT; + + switch (method) + { + /* + * enum values for all sync options are defined even if they are + * not supported on the current platform. But if not, they are + * not included in the enum option array, and therefore will never + * be seen here. + */ + case SYNC_METHOD_FSYNC: + case SYNC_METHOD_FSYNC_WRITETHROUGH: + case SYNC_METHOD_FDATASYNC: + return 0; +#ifdef OPEN_SYNC_FLAG + case SYNC_METHOD_OPEN: + return OPEN_SYNC_FLAG | o_direct_flag; +#endif +#ifdef OPEN_DATASYNC_FLAG + case SYNC_METHOD_OPEN_DSYNC: + return OPEN_DATASYNC_FLAG | o_direct_flag; +#endif + default: + /* can't happen (unless we are out of sync with option array) */ + elog(ERROR, "unrecognized wal_sync_method: %d", method); + return 0; /* silence warning */ + } +} + +/* + * GUC support + */ +void +assign_xlog_sync_method(int new_sync_method, void *extra) +{ + if (sync_method != new_sync_method) + { + /* + * To ensure that no blocks escape unsynced, force an fsync on the + * currently open log segment (if any). Also, if the open flag is + * changing, close the log file so it will be reopened (with new flag + * bit) at next use. + */ + if (openLogFile >= 0) + { + pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN); + if (pg_fsync(openLogFile) != 0) + { + char xlogfname[MAXFNAMELEN]; + int save_errno; + + save_errno = errno; + XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo, + wal_segment_size); + errno = save_errno; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", xlogfname))); + } + + pgstat_report_wait_end(); + if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method)) + XLogFileClose(); + } + } +} + + +/* + * Issue appropriate kind of fsync (if any) for an XLOG output file. + * + * 'fd' is a file descriptor for the XLOG file to be fsync'd. + * 'segno' is for error reporting purposes. 
+ */ +void +issue_xlog_fsync(int fd, XLogSegNo segno) +{ + char *msg = NULL; + instr_time start; + + /* + * Quick exit if fsync is disabled or write() has already synced the WAL + * file. + */ + if (!enableFsync || + sync_method == SYNC_METHOD_OPEN || + sync_method == SYNC_METHOD_OPEN_DSYNC) + return; + + /* Measure I/O timing to sync the WAL file */ + if (track_wal_io_timing) + INSTR_TIME_SET_CURRENT(start); + + pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC); + switch (sync_method) + { + case SYNC_METHOD_FSYNC: + if (pg_fsync_no_writethrough(fd) != 0) + msg = _("could not fsync file \"%s\": %m"); + break; +#ifdef HAVE_FSYNC_WRITETHROUGH + case SYNC_METHOD_FSYNC_WRITETHROUGH: + if (pg_fsync_writethrough(fd) != 0) + msg = _("could not fsync write-through file \"%s\": %m"); + break; +#endif +#ifdef HAVE_FDATASYNC + case SYNC_METHOD_FDATASYNC: + if (pg_fdatasync(fd) != 0) + msg = _("could not fdatasync file \"%s\": %m"); + break; +#endif + case SYNC_METHOD_OPEN: + case SYNC_METHOD_OPEN_DSYNC: + /* not reachable */ + Assert(false); + break; + default: + elog(PANIC, "unrecognized wal_sync_method: %d", sync_method); + break; + } + + /* PANIC if failed to fsync */ + if (msg) + { + char xlogfname[MAXFNAMELEN]; + int save_errno = errno; + + XLogFileName(xlogfname, ThisTimeLineID, segno, + wal_segment_size); + errno = save_errno; + ereport(PANIC, + (errcode_for_file_access(), + errmsg(msg, xlogfname))); + } + + pgstat_report_wait_end(); + + /* + * Increment the I/O timing and the number of times WAL files were synced. + */ + if (track_wal_io_timing) + { + instr_time duration; + + INSTR_TIME_SET_CURRENT(duration); + INSTR_TIME_SUBTRACT(duration, start); + WalStats.m_wal_sync_time += INSTR_TIME_GET_MICROSEC(duration); + } + + WalStats.m_wal_sync++; +} + +/* + * do_pg_start_backup + * + * Utility function called at the start of an online backup. It creates the + * necessary starting checkpoint and constructs the backup label file. + * + * There are two kind of backups: exclusive and non-exclusive. An exclusive + * backup is started with pg_start_backup(), and there can be only one active + * at a time. The backup and tablespace map files of an exclusive backup are + * written to $PGDATA/backup_label and $PGDATA/tablespace_map, and they are + * removed by pg_stop_backup(). + * + * A non-exclusive backup is used for the streaming base backups (see + * src/backend/replication/basebackup.c). The difference to exclusive backups + * is that the backup label and tablespace map files are not written to disk. + * Instead, their would-be contents are returned in *labelfile and *tblspcmapfile, + * and the caller is responsible for including them in the backup archive as + * 'backup_label' and 'tablespace_map'. There can be many non-exclusive backups + * active at the same time, and they don't conflict with an exclusive backup + * either. + * + * labelfile and tblspcmapfile must be passed as NULL when starting an + * exclusive backup, and as initially-empty StringInfos for a non-exclusive + * backup. + * + * If "tablespaces" isn't NULL, it receives a list of tablespaceinfo structs + * describing the cluster's tablespaces. + * + * tblspcmapfile is required mainly for tar format in windows as native windows + * utilities are not able to create symlinks while extracting files from tar. + * However for consistency, the same is used for all platforms. + * + * Returns the minimum WAL location that must be present to restore from this + * backup, and the corresponding timeline ID in *starttli_p. 
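+ *
+ * (Purely illustrative sketch of how the SQL-level wrappers drive this for
+ * a non-exclusive backup, held open on one session:
+ *     SELECT pg_start_backup('nightly', false, false);
+ *     -- copy the data directory with an external tool
+ *     SELECT * FROM pg_stop_backup(false);
+ * The label 'nightly' and the copy step are examples, not prescriptions.)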
+ * + * Every successfully started non-exclusive backup must be stopped by calling + * do_pg_stop_backup() or do_pg_abort_backup(). + * + * It is the responsibility of the caller of this function to verify the + * permissions of the calling user! + */ +XLogRecPtr +do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p, + StringInfo labelfile, List **tablespaces, + StringInfo tblspcmapfile) +{ + bool exclusive = (labelfile == NULL); + bool backup_started_in_recovery = false; + XLogRecPtr checkpointloc; + XLogRecPtr startpoint; + TimeLineID starttli; + pg_time_t stamp_time; + char strfbuf[128]; + char xlogfilename[MAXFNAMELEN]; + XLogSegNo _logSegNo; + struct stat stat_buf; + FILE *fp; + + backup_started_in_recovery = RecoveryInProgress(); + + /* + * Currently only non-exclusive backup can be taken during recovery. + */ + if (backup_started_in_recovery && exclusive) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("WAL control functions cannot be executed during recovery."))); + + /* + * During recovery, we don't need to check WAL level. Because, if WAL + * level is not sufficient, it's impossible to get here during recovery. + */ + if (!backup_started_in_recovery && !XLogIsNeeded()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("WAL level not sufficient for making an online backup"), + errhint("wal_level must be set to \"replica\" or \"logical\" at server start."))); + + if (strlen(backupidstr) > MAXPGPATH) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("backup label too long (max %d bytes)", + MAXPGPATH))); + + /* + * Mark backup active in shared memory. We must do full-page WAL writes + * during an on-line backup even if not doing so at other times, because + * it's quite possible for the backup dump to obtain a "torn" (partially + * written) copy of a database page if it reads the page concurrently with + * our write to the same page. This can be fixed as long as the first + * write to the page in the WAL sequence is a full-page write. Hence, we + * turn on forcePageWrites and then force a CHECKPOINT, to ensure there + * are no dirty pages in shared memory that might get dumped while the + * backup is in progress without having a corresponding WAL record. (Once + * the backup is complete, we need not force full-page writes anymore, + * since we expect that any pages not modified during the backup interval + * must have been correctly captured by the backup.) + * + * Note that forcePageWrites has no effect during an online backup from + * the standby. + * + * We must hold all the insertion locks to change the value of + * forcePageWrites, to ensure adequate interlocking against + * XLogInsertRecord(). + */ + WALInsertLockAcquireExclusive(); + if (exclusive) + { + /* + * At first, mark that we're now starting an exclusive backup, to + * ensure that there are no other sessions currently running + * pg_start_backup() or pg_stop_backup(). 
+ */ + if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_NONE) + { + WALInsertLockRelease(); + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("a backup is already in progress"), + errhint("Run pg_stop_backup() and try again."))); + } + XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STARTING; + } + else + XLogCtl->Insert.nonExclusiveBackups++; + XLogCtl->Insert.forcePageWrites = true; + WALInsertLockRelease(); + + /* Ensure we release forcePageWrites if fail below */ + PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive)); + { + bool gotUniqueStartpoint = false; + DIR *tblspcdir; + struct dirent *de; + tablespaceinfo *ti; + int datadirpathlen; + + /* + * Force an XLOG file switch before the checkpoint, to ensure that the + * WAL segment the checkpoint is written to doesn't contain pages with + * old timeline IDs. That would otherwise happen if you called + * pg_start_backup() right after restoring from a PITR archive: the + * first WAL segment containing the startup checkpoint has pages in + * the beginning with the old timeline ID. That can cause trouble at + * recovery: we won't have a history file covering the old timeline if + * pg_wal directory was not included in the base backup and the WAL + * archive was cleared too before starting the backup. + * + * This also ensures that we have emitted a WAL page header that has + * XLP_BKP_REMOVABLE off before we emit the checkpoint record. + * Therefore, if a WAL archiver (such as pglesslog) is trying to + * compress out removable backup blocks, it won't remove any that + * occur after this point. + * + * During recovery, we skip forcing XLOG file switch, which means that + * the backup taken during recovery is not available for the special + * recovery case described above. + */ + if (!backup_started_in_recovery) + RequestXLogSwitch(false); + + do + { + bool checkpointfpw; + + /* + * Force a CHECKPOINT. Aside from being necessary to prevent torn + * page problems, this guarantees that two successive backup runs + * will have different checkpoint positions and hence different + * history file names, even if nothing happened in between. + * + * During recovery, establish a restartpoint if possible. We use + * the last restartpoint as the backup starting checkpoint. This + * means that two successive backup runs can have same checkpoint + * positions. + * + * Since the fact that we are executing do_pg_start_backup() + * during recovery means that checkpointer is running, we can use + * RequestCheckpoint() to establish a restartpoint. + * + * We use CHECKPOINT_IMMEDIATE only if requested by user (via + * passing fast = true). Otherwise this can take awhile. + */ + RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | + (fast ? CHECKPOINT_IMMEDIATE : 0)); + + /* + * Now we need to fetch the checkpoint record location, and also + * its REDO pointer. The oldest point in WAL that would be needed + * to restore starting from the checkpoint is precisely the REDO + * pointer. 
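+			 *
+			 * (Both values are read from pg_control under ControlFileLock just
+			 * below; startpoint, i.e. the REDO pointer, is also what is later
+			 * written out as START WAL LOCATION in the backup label.)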
+ */ + LWLockAcquire(ControlFileLock, LW_SHARED); + checkpointloc = ControlFile->checkPoint; + startpoint = ControlFile->checkPointCopy.redo; + starttli = ControlFile->checkPointCopy.ThisTimeLineID; + checkpointfpw = ControlFile->checkPointCopy.fullPageWrites; + LWLockRelease(ControlFileLock); + + if (backup_started_in_recovery) + { + XLogRecPtr recptr; + + /* + * Check to see if all WAL replayed during online backup + * (i.e., since last restartpoint used as backup starting + * checkpoint) contain full-page writes. + */ + SpinLockAcquire(&XLogCtl->info_lck); + recptr = XLogCtl->lastFpwDisableRecPtr; + SpinLockRelease(&XLogCtl->info_lck); + + if (!checkpointfpw || startpoint <= recptr) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("WAL generated with full_page_writes=off was replayed " + "since last restartpoint"), + errhint("This means that the backup being taken on the standby " + "is corrupt and should not be used. " + "Enable full_page_writes and run CHECKPOINT on the primary, " + "and then try an online backup again."))); + + /* + * During recovery, since we don't use the end-of-backup WAL + * record and don't write the backup history file, the + * starting WAL location doesn't need to be unique. This means + * that two base backups started at the same time might use + * the same checkpoint as starting locations. + */ + gotUniqueStartpoint = true; + } + + /* + * If two base backups are started at the same time (in WAL sender + * processes), we need to make sure that they use different + * checkpoints as starting locations, because we use the starting + * WAL location as a unique identifier for the base backup in the + * end-of-backup WAL record and when we write the backup history + * file. Perhaps it would be better generate a separate unique ID + * for each backup instead of forcing another checkpoint, but + * taking a checkpoint right after another is not that expensive + * either because only few buffers have been dirtied yet. + */ + WALInsertLockAcquireExclusive(); + if (XLogCtl->Insert.lastBackupStart < startpoint) + { + XLogCtl->Insert.lastBackupStart = startpoint; + gotUniqueStartpoint = true; + } + WALInsertLockRelease(); + } while (!gotUniqueStartpoint); + + XLByteToSeg(startpoint, _logSegNo, wal_segment_size); + XLogFileName(xlogfilename, starttli, _logSegNo, wal_segment_size); + + /* + * Construct tablespace_map file. If caller isn't interested in this, + * we make a local StringInfo. + */ + if (tblspcmapfile == NULL) + tblspcmapfile = makeStringInfo(); + + datadirpathlen = strlen(DataDir); + + /* Collect information about all tablespaces */ + tblspcdir = AllocateDir("pg_tblspc"); + while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL) + { + char fullpath[MAXPGPATH + 10]; + char linkpath[MAXPGPATH]; + char *relpath = NULL; + int rllen; + StringInfoData escapedpath; + char *s; + + /* Skip anything that doesn't look like a tablespace */ + if (strspn(de->d_name, "0123456789") != strlen(de->d_name)) + continue; + + snprintf(fullpath, sizeof(fullpath), "pg_tblspc/%s", de->d_name); + + /* + * Skip anything that isn't a symlink/junction. For testing only, + * we sometimes use allow_in_place_tablespaces to create + * directories directly under pg_tblspc, which would fail below. 
+ */ + if (get_dirent_type(fullpath, de, false, ERROR) != PGFILETYPE_LNK) + continue; + +#if defined(HAVE_READLINK) || defined(WIN32) + rllen = readlink(fullpath, linkpath, sizeof(linkpath)); + if (rllen < 0) + { + ereport(WARNING, + (errmsg("could not read symbolic link \"%s\": %m", + fullpath))); + continue; + } + else if (rllen >= sizeof(linkpath)) + { + ereport(WARNING, + (errmsg("symbolic link \"%s\" target is too long", + fullpath))); + continue; + } + linkpath[rllen] = '\0'; + + /* + * Build a backslash-escaped version of the link path to include + * in the tablespace map file. + */ + initStringInfo(&escapedpath); + for (s = linkpath; *s; s++) + { + if (*s == '\n' || *s == '\r' || *s == '\\') + appendStringInfoChar(&escapedpath, '\\'); + appendStringInfoChar(&escapedpath, *s); + } + + /* + * Relpath holds the relative path of the tablespace directory + * when it's located within PGDATA, or NULL if it's located + * elsewhere. + */ + if (rllen > datadirpathlen && + strncmp(linkpath, DataDir, datadirpathlen) == 0 && + IS_DIR_SEP(linkpath[datadirpathlen])) + relpath = linkpath + datadirpathlen + 1; + + ti = palloc(sizeof(tablespaceinfo)); + ti->oid = pstrdup(de->d_name); + ti->path = pstrdup(linkpath); + ti->rpath = relpath ? pstrdup(relpath) : NULL; + ti->size = -1; + + if (tablespaces) + *tablespaces = lappend(*tablespaces, ti); + + appendStringInfo(tblspcmapfile, "%s %s\n", + ti->oid, escapedpath.data); + + pfree(escapedpath.data); +#else + + /* + * If the platform does not have symbolic links, it should not be + * possible to have tablespaces - clearly somebody else created + * them. Warn about it and ignore. + */ + ereport(WARNING, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("tablespaces are not supported on this platform"))); +#endif + } + FreeDir(tblspcdir); + + /* + * Construct backup label file. If caller isn't interested in this, + * we make a local StringInfo. + */ + if (labelfile == NULL) + labelfile = makeStringInfo(); + + /* Use the log timezone here, not the session timezone */ + stamp_time = (pg_time_t) time(NULL); + pg_strftime(strfbuf, sizeof(strfbuf), + "%Y-%m-%d %H:%M:%S %Z", + pg_localtime(&stamp_time, log_timezone)); + appendStringInfo(labelfile, "START WAL LOCATION: %X/%X (file %s)\n", + LSN_FORMAT_ARGS(startpoint), xlogfilename); + appendStringInfo(labelfile, "CHECKPOINT LOCATION: %X/%X\n", + LSN_FORMAT_ARGS(checkpointloc)); + appendStringInfo(labelfile, "BACKUP METHOD: %s\n", + exclusive ? "pg_start_backup" : "streamed"); + appendStringInfo(labelfile, "BACKUP FROM: %s\n", + backup_started_in_recovery ? "standby" : "primary"); + appendStringInfo(labelfile, "START TIME: %s\n", strfbuf); + appendStringInfo(labelfile, "LABEL: %s\n", backupidstr); + appendStringInfo(labelfile, "START TIMELINE: %u\n", starttli); + + /* + * Okay, write the file, or return its contents to caller. + */ + if (exclusive) + { + /* + * Check for existing backup label --- implies a backup is already + * running. (XXX given that we checked exclusiveBackupState + * above, maybe it would be OK to just unlink any such label + * file?) 
+ */ + if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0) + { + if (errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + BACKUP_LABEL_FILE))); + } + else + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("a backup is already in progress"), + errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.", + BACKUP_LABEL_FILE))); + + fp = AllocateFile(BACKUP_LABEL_FILE, "w"); + + if (!fp) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", + BACKUP_LABEL_FILE))); + if (fwrite(labelfile->data, labelfile->len, 1, fp) != 1 || + fflush(fp) != 0 || + pg_fsync(fileno(fp)) != 0 || + ferror(fp) || + FreeFile(fp)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", + BACKUP_LABEL_FILE))); + /* Allocated locally for exclusive backups, so free separately */ + pfree(labelfile->data); + pfree(labelfile); + + /* Write backup tablespace_map file. */ + if (tblspcmapfile->len > 0) + { + if (stat(TABLESPACE_MAP, &stat_buf) != 0) + { + if (errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + TABLESPACE_MAP))); + } + else + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("a backup is already in progress"), + errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.", + TABLESPACE_MAP))); + + fp = AllocateFile(TABLESPACE_MAP, "w"); + + if (!fp) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", + TABLESPACE_MAP))); + if (fwrite(tblspcmapfile->data, tblspcmapfile->len, 1, fp) != 1 || + fflush(fp) != 0 || + pg_fsync(fileno(fp)) != 0 || + ferror(fp) || + FreeFile(fp)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", + TABLESPACE_MAP))); + } + + /* Allocated locally for exclusive backups, so free separately */ + pfree(tblspcmapfile->data); + pfree(tblspcmapfile); + } + } + PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive)); + + /* + * Mark that start phase has correctly finished for an exclusive backup. + * Session-level locks are updated as well to reflect that state. + * + * Note that CHECK_FOR_INTERRUPTS() must not occur while updating backup + * counters and session-level lock. Otherwise they can be updated + * inconsistently, and which might cause do_pg_abort_backup() to fail. + */ + if (exclusive) + { + WALInsertLockAcquireExclusive(); + XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS; + + /* Set session-level lock */ + sessionBackupState = SESSION_BACKUP_EXCLUSIVE; + WALInsertLockRelease(); + } + else + sessionBackupState = SESSION_BACKUP_NON_EXCLUSIVE; + + /* + * We're done. As a convenience, return the starting WAL location. 
+ */ + if (starttli_p) + *starttli_p = starttli; + return startpoint; +} + +/* Error cleanup callback for pg_start_backup */ +static void +pg_start_backup_callback(int code, Datum arg) +{ + bool exclusive = DatumGetBool(arg); + + /* Update backup counters and forcePageWrites on failure */ + WALInsertLockAcquireExclusive(); + if (exclusive) + { + Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STARTING); + XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE; + } + else + { + Assert(XLogCtl->Insert.nonExclusiveBackups > 0); + XLogCtl->Insert.nonExclusiveBackups--; + } + + if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE && + XLogCtl->Insert.nonExclusiveBackups == 0) + { + XLogCtl->Insert.forcePageWrites = false; + } + WALInsertLockRelease(); +} + +/* + * Error cleanup callback for pg_stop_backup + */ +static void +pg_stop_backup_callback(int code, Datum arg) +{ + bool exclusive = DatumGetBool(arg); + + /* Update backup status on failure */ + WALInsertLockAcquireExclusive(); + if (exclusive) + { + Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STOPPING); + XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS; + } + WALInsertLockRelease(); +} + +/* + * Utility routine to fetch the session-level status of a backup running. + */ +SessionBackupState +get_backup_status(void) +{ + return sessionBackupState; +} + +/* + * do_pg_stop_backup + * + * Utility function called at the end of an online backup. It cleans up the + * backup state and can optionally wait for WAL segments to be archived. + * + * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops + * the non-exclusive backup specified by 'labelfile'. + * + * Returns the last WAL location that must be present to restore from this + * backup, and the corresponding timeline ID in *stoptli_p. + * + * It is the responsibility of the caller of this function to verify the + * permissions of the calling user! + */ +XLogRecPtr +do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p) +{ + bool exclusive = (labelfile == NULL); + bool backup_started_in_recovery = false; + XLogRecPtr startpoint; + XLogRecPtr stoppoint; + TimeLineID stoptli; + pg_time_t stamp_time; + char strfbuf[128]; + char histfilepath[MAXPGPATH]; + char startxlogfilename[MAXFNAMELEN]; + char stopxlogfilename[MAXFNAMELEN]; + char lastxlogfilename[MAXFNAMELEN]; + char histfilename[MAXFNAMELEN]; + char backupfrom[20]; + XLogSegNo _logSegNo; + FILE *lfp; + FILE *fp; + char ch; + int seconds_before_warning; + int waits = 0; + bool reported_waiting = false; + char *remaining; + char *ptr; + uint32 hi, + lo; + + backup_started_in_recovery = RecoveryInProgress(); + + /* + * Currently only non-exclusive backup can be taken during recovery. + */ + if (backup_started_in_recovery && exclusive) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("WAL control functions cannot be executed during recovery."))); + + /* + * During recovery, we don't need to check WAL level. Because, if WAL + * level is not sufficient, it's impossible to get here during recovery. 
+ */ + if (!backup_started_in_recovery && !XLogIsNeeded()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("WAL level not sufficient for making an online backup"), + errhint("wal_level must be set to \"replica\" or \"logical\" at server start."))); + + if (exclusive) + { + /* + * At first, mark that we're now stopping an exclusive backup, to + * ensure that there are no other sessions currently running + * pg_start_backup() or pg_stop_backup(). + */ + WALInsertLockAcquireExclusive(); + if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_IN_PROGRESS) + { + WALInsertLockRelease(); + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("exclusive backup not in progress"))); + } + XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STOPPING; + WALInsertLockRelease(); + + /* + * Remove backup_label. In case of failure, the state for an exclusive + * backup is switched back to in-progress. + */ + PG_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive)); + { + /* + * Read the existing label file into memory. + */ + struct stat statbuf; + int r; + + if (stat(BACKUP_LABEL_FILE, &statbuf)) + { + /* should not happen per the upper checks */ + if (errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + BACKUP_LABEL_FILE))); + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("a backup is not in progress"))); + } + + lfp = AllocateFile(BACKUP_LABEL_FILE, "r"); + if (!lfp) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + BACKUP_LABEL_FILE))); + } + labelfile = palloc(statbuf.st_size + 1); + r = fread(labelfile, statbuf.st_size, 1, lfp); + labelfile[statbuf.st_size] = '\0'; + + /* + * Close and remove the backup label file + */ + if (r != 1 || ferror(lfp) || FreeFile(lfp)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + BACKUP_LABEL_FILE))); + durable_unlink(BACKUP_LABEL_FILE, ERROR); + + /* + * Remove tablespace_map file if present, it is created only if + * there are tablespaces. + */ + durable_unlink(TABLESPACE_MAP, DEBUG1); + } + PG_END_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive)); + } + + /* + * OK to update backup counters, forcePageWrites and session-level lock. + * + * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them. + * Otherwise they can be updated inconsistently, and which might cause + * do_pg_abort_backup() to fail. + */ + WALInsertLockAcquireExclusive(); + if (exclusive) + { + XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE; + } + else + { + /* + * The user-visible pg_start/stop_backup() functions that operate on + * exclusive backups can be called at any time, but for non-exclusive + * backups, it is expected that each do_pg_start_backup() call is + * matched by exactly one do_pg_stop_backup() call. + */ + Assert(XLogCtl->Insert.nonExclusiveBackups > 0); + XLogCtl->Insert.nonExclusiveBackups--; + } + + if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE && + XLogCtl->Insert.nonExclusiveBackups == 0) + { + XLogCtl->Insert.forcePageWrites = false; + } + + /* + * Clean up session-level lock. + * + * You might think that WALInsertLockRelease() can be called before + * cleaning up session-level lock because session-level lock doesn't need + * to be protected with WAL insertion lock. 
But since + * CHECK_FOR_INTERRUPTS() can occur in it, session-level lock must be + * cleaned up before it. + */ + sessionBackupState = SESSION_BACKUP_NONE; + + WALInsertLockRelease(); + + /* + * Read and parse the START WAL LOCATION line (this code is pretty crude, + * but we are not expecting any variability in the file format). + */ + if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c", + &hi, &lo, startxlogfilename, + &ch) != 4 || ch != '\n') + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE))); + startpoint = ((uint64) hi) << 32 | lo; + remaining = strchr(labelfile, '\n') + 1; /* %n is not portable enough */ + + /* + * Parse the BACKUP FROM line. If we are taking an online backup from the + * standby, we confirm that the standby has not been promoted during the + * backup. + */ + ptr = strstr(remaining, "BACKUP FROM:"); + if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE))); + if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("the standby was promoted during online backup"), + errhint("This means that the backup being taken is corrupt " + "and should not be used. " + "Try taking another online backup."))); + + /* + * During recovery, we don't write an end-of-backup record. We assume that + * pg_control was backed up last and its minimum recovery point can be + * available as the backup end location. Since we don't have an + * end-of-backup record, we use the pg_control value to check whether + * we've reached the end of backup when starting recovery from this + * backup. We have no way of checking if pg_control wasn't backed up last + * however. + * + * We don't force a switch to new WAL file but it is still possible to + * wait for all the required files to be archived if waitforarchive is + * true. This is okay if we use the backup to start a standby and fetch + * the missing WAL using streaming replication. But in the case of an + * archive recovery, a user should set waitforarchive to true and wait for + * them to be archived to ensure that all the required files are + * available. + * + * We return the current minimum recovery point as the backup end + * location. Note that it can be greater than the exact backup end + * location if the minimum recovery point is updated after the backup of + * pg_control. This is harmless for current uses. + * + * XXX currently a backup history file is for informational and debug + * purposes only. It's not essential for an online backup. Furthermore, + * even if it's created, it will not be archived during recovery because + * an archiver is not invoked. So it doesn't seem worthwhile to write a + * backup history file during recovery. + */ + if (backup_started_in_recovery) + { + XLogRecPtr recptr; + + /* + * Check to see if all WAL replayed during online backup contain + * full-page writes. + */ + SpinLockAcquire(&XLogCtl->info_lck); + recptr = XLogCtl->lastFpwDisableRecPtr; + SpinLockRelease(&XLogCtl->info_lck); + + if (startpoint <= recptr) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("WAL generated with full_page_writes=off was replayed " + "during online backup"), + errhint("This means that the backup being taken on the standby " + "is corrupt and should not be used. 
" + "Enable full_page_writes and run CHECKPOINT on the primary, " + "and then try an online backup again."))); + + + LWLockAcquire(ControlFileLock, LW_SHARED); + stoppoint = ControlFile->minRecoveryPoint; + stoptli = ControlFile->minRecoveryPointTLI; + LWLockRelease(ControlFileLock); + } + else + { + /* + * Write the backup-end xlog record + */ + XLogBeginInsert(); + XLogRegisterData((char *) (&startpoint), sizeof(startpoint)); + stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END); + stoptli = ThisTimeLineID; + + /* + * Force a switch to a new xlog segment file, so that the backup is + * valid as soon as archiver moves out the current segment file. + */ + RequestXLogSwitch(false); + + XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size); + XLogFileName(stopxlogfilename, stoptli, _logSegNo, wal_segment_size); + + /* Use the log timezone here, not the session timezone */ + stamp_time = (pg_time_t) time(NULL); + pg_strftime(strfbuf, sizeof(strfbuf), + "%Y-%m-%d %H:%M:%S %Z", + pg_localtime(&stamp_time, log_timezone)); + + /* + * Write the backup history file + */ + XLByteToSeg(startpoint, _logSegNo, wal_segment_size); + BackupHistoryFilePath(histfilepath, stoptli, _logSegNo, + startpoint, wal_segment_size); + fp = AllocateFile(histfilepath, "w"); + if (!fp) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", + histfilepath))); + fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n", + LSN_FORMAT_ARGS(startpoint), startxlogfilename); + fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n", + LSN_FORMAT_ARGS(stoppoint), stopxlogfilename); + + /* + * Transfer remaining lines including label and start timeline to + * history file. + */ + fprintf(fp, "%s", remaining); + fprintf(fp, "STOP TIME: %s\n", strfbuf); + fprintf(fp, "STOP TIMELINE: %u\n", stoptli); + if (fflush(fp) || ferror(fp) || FreeFile(fp)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", + histfilepath))); + + /* + * Clean out any no-longer-needed history files. As a side effect, + * this will post a .ready file for the newly created history file, + * notifying the archiver that history file may be archived + * immediately. + */ + CleanupBackupHistory(); + } + + /* + * If archiving is enabled, wait for all the required WAL files to be + * archived before returning. If archiving isn't enabled, the required WAL + * needs to be transported via streaming replication (hopefully with + * wal_keep_size set high enough), or some more exotic mechanism like + * polling and copying files from pg_wal with script. We have no knowledge + * of those mechanisms, so it's up to the user to ensure that he gets all + * the required WAL. + * + * We wait until both the last WAL file filled during backup and the + * history file have been archived, and assume that the alphabetic sorting + * property of the WAL files ensures any earlier WAL files are safely + * archived as well. + * + * We wait forever, since archive_command is supposed to work and we + * assume the admin wanted his backup to work completely. If you don't + * wish to wait, then either waitforarchive should be passed in as false, + * or you can set statement_timeout. Also, some notices are issued to + * clue in anyone who might be doing this interactively. 
+ */ + + if (waitforarchive && + ((!backup_started_in_recovery && XLogArchivingActive()) || + (backup_started_in_recovery && XLogArchivingAlways()))) + { + XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size); + XLogFileName(lastxlogfilename, stoptli, _logSegNo, wal_segment_size); + + XLByteToSeg(startpoint, _logSegNo, wal_segment_size); + BackupHistoryFileName(histfilename, stoptli, _logSegNo, + startpoint, wal_segment_size); + + seconds_before_warning = 60; + waits = 0; + + while (XLogArchiveIsBusy(lastxlogfilename) || + XLogArchiveIsBusy(histfilename)) + { + CHECK_FOR_INTERRUPTS(); + + if (!reported_waiting && waits > 5) + { + ereport(NOTICE, + (errmsg("base backup done, waiting for required WAL segments to be archived"))); + reported_waiting = true; + } + + pgstat_report_wait_start(WAIT_EVENT_BACKUP_WAIT_WAL_ARCHIVE); + pg_usleep(1000000L); + pgstat_report_wait_end(); + + if (++waits >= seconds_before_warning) + { + seconds_before_warning *= 2; /* This wraps in >10 years... */ + ereport(WARNING, + (errmsg("still waiting for all required WAL segments to be archived (%d seconds elapsed)", + waits), + errhint("Check that your archive_command is executing properly. " + "You can safely cancel this backup, " + "but the database backup will not be usable without all the WAL segments."))); + } + } + + ereport(NOTICE, + (errmsg("all required WAL segments have been archived"))); + } + else if (waitforarchive) + ereport(NOTICE, + (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup"))); + + /* + * We're done. As a convenience, return the ending WAL location. + */ + if (stoptli_p) + *stoptli_p = stoptli; + return stoppoint; +} + + +/* + * do_pg_abort_backup: abort a running backup + * + * This does just the most basic steps of do_pg_stop_backup(), by taking the + * system out of backup mode, thus making it a lot more safe to call from + * an error handler. + * + * The caller can pass 'arg' as 'true' or 'false' to control whether a warning + * is emitted. + * + * NB: This is only for aborting a non-exclusive backup that doesn't write + * backup_label. A backup started with pg_start_backup() needs to be finished + * with pg_stop_backup(). + * + * NB: This gets used as a before_shmem_exit handler, hence the odd-looking + * signature. + */ +void +do_pg_abort_backup(int code, Datum arg) +{ + bool emit_warning = DatumGetBool(arg); + + /* + * Quick exit if session is not keeping around a non-exclusive backup + * already started. + */ + if (sessionBackupState != SESSION_BACKUP_NON_EXCLUSIVE) + return; + + WALInsertLockAcquireExclusive(); + Assert(XLogCtl->Insert.nonExclusiveBackups > 0); + XLogCtl->Insert.nonExclusiveBackups--; + + if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE && + XLogCtl->Insert.nonExclusiveBackups == 0) + { + XLogCtl->Insert.forcePageWrites = false; + } + + sessionBackupState = SESSION_BACKUP_NONE; + WALInsertLockRelease(); + + if (emit_warning) + ereport(WARNING, + (errmsg("aborting backup due to backend exiting before pg_stop_backup was called"))); +} + +/* + * Register a handler that will warn about unterminated backups at end of + * session, unless this has already been done. + */ +void +register_persistent_abort_backup_handler(void) +{ + static bool already_done = false; + + if (already_done) + return; + before_shmem_exit(do_pg_abort_backup, DatumGetBool(true)); + already_done = true; +} + +/* + * Get latest redo apply position. 
+ * + * Exported to allow WALReceiver to read the pointer directly. + */ +XLogRecPtr +GetXLogReplayRecPtr(TimeLineID *replayTLI) +{ + XLogRecPtr recptr; + TimeLineID tli; + + SpinLockAcquire(&XLogCtl->info_lck); + recptr = XLogCtl->lastReplayedEndRecPtr; + tli = XLogCtl->lastReplayedTLI; + SpinLockRelease(&XLogCtl->info_lck); + + if (replayTLI) + *replayTLI = tli; + return recptr; +} + +/* + * Get latest WAL insert pointer + */ +XLogRecPtr +GetXLogInsertRecPtr(void) +{ + XLogCtlInsert *Insert = &XLogCtl->Insert; + uint64 current_bytepos; + + SpinLockAcquire(&Insert->insertpos_lck); + current_bytepos = Insert->CurrBytePos; + SpinLockRelease(&Insert->insertpos_lck); + + return XLogBytePosToRecPtr(current_bytepos); +} + +/* + * Get latest WAL write pointer + */ +XLogRecPtr +GetXLogWriteRecPtr(void) +{ + SpinLockAcquire(&XLogCtl->info_lck); + LogwrtResult = XLogCtl->LogwrtResult; + SpinLockRelease(&XLogCtl->info_lck); + + return LogwrtResult.Write; +} + +/* + * Returns the redo pointer of the last checkpoint or restartpoint. This is + * the oldest point in WAL that we still need, if we have to restart recovery. + */ +void +GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli) +{ + LWLockAcquire(ControlFileLock, LW_SHARED); + *oldrecptr = ControlFile->checkPointCopy.redo; + *oldtli = ControlFile->checkPointCopy.ThisTimeLineID; + LWLockRelease(ControlFileLock); +} + +/* + * read_backup_label: check to see if a backup_label file is present + * + * If we see a backup_label during recovery, we assume that we are recovering + * from a backup dump file, and we therefore roll forward from the checkpoint + * identified by the label file, NOT what pg_control says. This avoids the + * problem that pg_control might have been archived one or more checkpoints + * later than the start of the dump, and so if we rely on it as the start + * point, we will fail to restore a consistent database state. + * + * Returns true if a backup_label was found (and fills the checkpoint + * location and its REDO location into *checkPointLoc and RedoStartLSN, + * respectively); returns false if not. If this backup_label came from a + * streamed backup, *backupEndRequired is set to true. If this backup_label + * was created during recovery, *backupFromStandby is set to true. + */ +static bool +read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired, + bool *backupFromStandby) +{ + char startxlogfilename[MAXFNAMELEN]; + TimeLineID tli_from_walseg, + tli_from_file; + FILE *lfp; + char ch; + char backuptype[20]; + char backupfrom[20]; + char backuplabel[MAXPGPATH]; + char backuptime[128]; + uint32 hi, + lo; + + *backupEndRequired = false; + *backupFromStandby = false; + + /* + * See if label file is present + */ + lfp = AllocateFile(BACKUP_LABEL_FILE, "r"); + if (!lfp) + { + if (errno != ENOENT) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + BACKUP_LABEL_FILE))); + return false; /* it's not there, all is fine */ + } + + /* + * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code + * is pretty crude, but we are not expecting any variability in the file + * format). 
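+	 *
+	 * For reference, the two lines normally look like this (values are
+	 * illustrative only), matching what do_pg_start_backup() writes:
+	 *   START WAL LOCATION: 0/9000028 (file 000000010000000000000009)
+	 *   CHECKPOINT LOCATION: 0/9000060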
+ */ + if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c", + &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n') + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE))); + RedoStartLSN = ((uint64) hi) << 32 | lo; + if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c", + &hi, &lo, &ch) != 3 || ch != '\n') + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE))); + *checkPointLoc = ((uint64) hi) << 32 | lo; + + /* + * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore + * from an older backup anyway, but since the information on it is not + * strictly required, don't error out if it's missing for some reason. + */ + if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1) + { + if (strcmp(backuptype, "streamed") == 0) + *backupEndRequired = true; + } + + if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1) + { + if (strcmp(backupfrom, "standby") == 0) + *backupFromStandby = true; + } + + /* + * Parse START TIME and LABEL. Those are not mandatory fields for recovery + * but checking for their presence is useful for debugging and the next + * sanity checks. Cope also with the fact that the result buffers have a + * pre-allocated size, hence if the backup_label file has been generated + * with strings longer than the maximum assumed here an incorrect parsing + * happens. That's fine as only minor consistency checks are done + * afterwards. + */ + if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1) + ereport(DEBUG1, + (errmsg_internal("backup time %s in file \"%s\"", + backuptime, BACKUP_LABEL_FILE))); + + if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1) + ereport(DEBUG1, + (errmsg_internal("backup label %s in file \"%s\"", + backuplabel, BACKUP_LABEL_FILE))); + + /* + * START TIMELINE is new as of 11. Its parsing is not mandatory, still use + * it as a sanity check if present. + */ + if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1) + { + if (tli_from_walseg != tli_from_file) + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE), + errdetail("Timeline ID parsed is %u, but expected %u.", + tli_from_file, tli_from_walseg))); + + ereport(DEBUG1, + (errmsg_internal("backup timeline %u in file \"%s\"", + tli_from_file, BACKUP_LABEL_FILE))); + } + + if (ferror(lfp) || FreeFile(lfp)) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + BACKUP_LABEL_FILE))); + + return true; +} + +/* + * read_tablespace_map: check to see if a tablespace_map file is present + * + * If we see a tablespace_map file during recovery, we assume that we are + * recovering from a backup dump file, and we therefore need to create symlinks + * as per the information present in tablespace_map file. + * + * Returns true if a tablespace_map file was found (and fills *tablespaces + * with a tablespaceinfo struct for each tablespace listed in the file); + * returns false if not. 
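The fscanf()-based parsing above reconstructs each 64-bit LSN from the two 32-bit halves that backup_label prints in %X/%X notation. Below is a minimal standalone sketch of that reassembly, assuming a made-up input line; it is illustration only, not PostgreSQL code.

/* Standalone illustration of the %X/%X -> 64-bit LSN arithmetic used above. */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int
main(void)
{
    const char *line = "CHECKPOINT LOCATION: 0/2000060";   /* made-up sample */
    unsigned int hi, lo;

    if (sscanf(line, "CHECKPOINT LOCATION: %X/%X", &hi, &lo) == 2)
    {
        uint64_t    lsn = ((uint64_t) hi << 32) | lo;

        /* same reassembly as read_backup_label() performs for RedoStartLSN */
        printf("parsed %X/%X into 64-bit LSN %" PRIX64 "\n", hi, lo, lsn);
    }
    return 0;
}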
+ */ +static bool +read_tablespace_map(List **tablespaces) +{ + tablespaceinfo *ti; + FILE *lfp; + char str[MAXPGPATH]; + int ch, + i, + n; + bool was_backslash; + + /* + * See if tablespace_map file is present + */ + lfp = AllocateFile(TABLESPACE_MAP, "r"); + if (!lfp) + { + if (errno != ENOENT) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + TABLESPACE_MAP))); + return false; /* it's not there, all is fine */ + } + + /* + * Read and parse the link name and path lines from tablespace_map file + * (this code is pretty crude, but we are not expecting any variability in + * the file format). De-escape any backslashes that were inserted. + */ + i = 0; + was_backslash = false; + while ((ch = fgetc(lfp)) != EOF) + { + if (!was_backslash && (ch == '\n' || ch == '\r')) + { + if (i == 0) + continue; /* \r immediately followed by \n */ + + /* + * The de-escaped line should contain an OID followed by exactly + * one space followed by a path. The path might start with + * spaces, so don't be too liberal about parsing. + */ + str[i] = '\0'; + n = 0; + while (str[n] && str[n] != ' ') + n++; + if (n < 1 || n >= i - 1) + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", TABLESPACE_MAP))); + str[n++] = '\0'; + + ti = palloc0(sizeof(tablespaceinfo)); + ti->oid = pstrdup(str); + ti->path = pstrdup(str + n); + *tablespaces = lappend(*tablespaces, ti); + + i = 0; + continue; + } + else if (!was_backslash && ch == '\\') + was_backslash = true; + else + { + if (i < sizeof(str) - 1) + str[i++] = ch; + was_backslash = false; + } + } + + if (i != 0 || was_backslash) /* last line not terminated? */ + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", TABLESPACE_MAP))); + + if (ferror(lfp) || FreeFile(lfp)) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + TABLESPACE_MAP))); + + return true; +} + +/* + * Error context callback for errors occurring during rm_redo(). + */ +static void +rm_redo_error_callback(void *arg) +{ + XLogReaderState *record = (XLogReaderState *) arg; + StringInfoData buf; + + initStringInfo(&buf); + xlog_outdesc(&buf, record); + xlog_block_info(&buf, record); + + /* translator: %s is a WAL record description */ + errcontext("WAL redo at %X/%X for %s", + LSN_FORMAT_ARGS(record->ReadRecPtr), + buf.data); + + pfree(buf.data); +} + +/* + * BackupInProgress: check if online backup mode is active + * + * This is done by checking for existence of the "backup_label" file. + */ +bool +BackupInProgress(void) +{ + struct stat stat_buf; + + return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0); +} + +/* + * CancelBackup: rename the "backup_label" and "tablespace_map" + * files to cancel backup mode + * + * If the "backup_label" file exists, it will be renamed to "backup_label.old". + * Similarly, if the "tablespace_map" file exists, it will be renamed to + * "tablespace_map.old". + * + * Note that this will render an online backup in progress + * useless. To correctly finish an online backup, pg_stop_backup must be + * called. 
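read_tablespace_map() above expects each de-escaped line to consist of an OID, exactly one space, and a path. A standalone sketch of that split, using an invented sample line (not PostgreSQL code):

/* Illustration of the "<oid> <path>" split performed by read_tablespace_map(). */
#include <stdio.h>
#include <string.h>

int
main(void)
{
    char        line[] = "16385 /mnt/fast/ts1";     /* made-up sample line */
    char       *space = strchr(line, ' ');

    if (space == NULL || space == line || space[1] == '\0')
    {
        fprintf(stderr, "invalid data in tablespace_map line\n");
        return 1;
    }
    *space = '\0';                  /* terminate the OID, path follows */
    printf("oid = %s, path = %s\n", line, space + 1);
    return 0;
}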
+ */ +void +CancelBackup(void) +{ + struct stat stat_buf; + + /* if the backup_label file is not there, return */ + if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0) + return; + + /* remove leftover file from previously canceled backup if it exists */ + unlink(BACKUP_LABEL_OLD); + + if (durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, DEBUG1) != 0) + { + ereport(WARNING, + (errcode_for_file_access(), + errmsg("online backup mode was not canceled"), + errdetail("File \"%s\" could not be renamed to \"%s\": %m.", + BACKUP_LABEL_FILE, BACKUP_LABEL_OLD))); + return; + } + + /* if the tablespace_map file is not there, return */ + if (stat(TABLESPACE_MAP, &stat_buf) < 0) + { + ereport(LOG, + (errmsg("online backup mode canceled"), + errdetail("File \"%s\" was renamed to \"%s\".", + BACKUP_LABEL_FILE, BACKUP_LABEL_OLD))); + return; + } + + /* remove leftover file from previously canceled backup if it exists */ + unlink(TABLESPACE_MAP_OLD); + + if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0) + { + ereport(LOG, + (errmsg("online backup mode canceled"), + errdetail("Files \"%s\" and \"%s\" were renamed to " + "\"%s\" and \"%s\", respectively.", + BACKUP_LABEL_FILE, TABLESPACE_MAP, + BACKUP_LABEL_OLD, TABLESPACE_MAP_OLD))); + } + else + { + ereport(WARNING, + (errcode_for_file_access(), + errmsg("online backup mode canceled"), + errdetail("File \"%s\" was renamed to \"%s\", but " + "file \"%s\" could not be renamed to \"%s\": %m.", + BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, + TABLESPACE_MAP, TABLESPACE_MAP_OLD))); + } +} + +/* + * Read the XLOG page containing RecPtr into readBuf (if not read already). + * Returns number of bytes read, if the page is read successfully, or -1 + * in case of errors. When errors occur, they are ereport'ed, but only + * if they have not been previously reported. + * + * This is responsible for restoring files from archive as needed, as well + * as for waiting for the requested WAL record to arrive in standby mode. + * + * 'emode' specifies the log level used for reporting "file not found" or + * "end of WAL" situations in archive recovery, or in standby mode when a + * trigger file is found. If set to WARNING or below, XLogPageRead() returns + * false in those situations, on higher log levels the ereport() won't + * return. + * + * In standby mode, if after a successful return of XLogPageRead() the + * caller finds the record it's interested in to be broken, it should + * ereport the error with the level determined by + * emode_for_corrupt_record(), and then set lastSourceFailed + * and call XLogPageRead() again with the same arguments. This lets + * XLogPageRead() to try fetching the record from another source, or to + * sleep and retry. + */ +static int +XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, + XLogRecPtr targetRecPtr, char *readBuf) +{ + XLogPageReadPrivate *private = + (XLogPageReadPrivate *) xlogreader->private_data; + int emode = private->emode; + uint32 targetPageOff; + XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY; + int r; + + XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size); + targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size); + + /* + * See if we need to switch to a new segment because the requested record + * is not in the currently open one. + */ + if (readFile >= 0 && + !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size)) + { + /* + * Request a restartpoint if we've replayed too much xlog since the + * last one. 
+ */ + if (bgwriterLaunched) + { + if (XLogCheckpointNeeded(readSegNo)) + { + (void) GetRedoRecPtr(); + if (XLogCheckpointNeeded(readSegNo)) + RequestCheckpoint(CHECKPOINT_CAUSE_XLOG); + } + } + + close(readFile); + readFile = -1; + readSource = XLOG_FROM_ANY; + } + + XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size); + +retry: + /* See if we need to retrieve more data */ + if (readFile < 0 || + (readSource == XLOG_FROM_STREAM && + flushedUpto < targetPagePtr + reqLen)) + { + if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen, + private->randAccess, + private->fetching_ckpt, + targetRecPtr)) + { + if (readFile >= 0) + close(readFile); + readFile = -1; + readLen = 0; + readSource = XLOG_FROM_ANY; + + return -1; + } + } + + /* + * At this point, we have the right segment open and if we're streaming we + * know the requested record is in it. + */ + Assert(readFile != -1); + + /* + * If the current segment is being streamed from the primary, calculate + * how much of the current page we have received already. We know the + * requested record has been received, but this is for the benefit of + * future calls, to allow quick exit at the top of this function. + */ + if (readSource == XLOG_FROM_STREAM) + { + if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ)) + readLen = XLOG_BLCKSZ; + else + readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) - + targetPageOff; + } + else + readLen = XLOG_BLCKSZ; + + /* Read the requested page */ + readOff = targetPageOff; + + pgstat_report_wait_start(WAIT_EVENT_WAL_READ); + r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff); + if (r != XLOG_BLCKSZ) + { + char fname[MAXFNAMELEN]; + int save_errno = errno; + + pgstat_report_wait_end(); + XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size); + if (r < 0) + { + errno = save_errno; + ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen), + (errcode_for_file_access(), + errmsg("could not read from log segment %s, offset %u: %m", + fname, readOff))); + } + else + ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen), + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read from log segment %s, offset %u: read %d of %zu", + fname, readOff, r, (Size) XLOG_BLCKSZ))); + goto next_record_is_invalid; + } + pgstat_report_wait_end(); + + Assert(targetSegNo == readSegNo); + Assert(targetPageOff == readOff); + Assert(reqLen <= readLen); + + xlogreader->seg.ws_tli = curFileTLI; + + /* + * Check the page header immediately, so that we can retry immediately if + * it's not valid. This may seem unnecessary, because XLogReadRecord() + * validates the page header anyway, and would propagate the failure up to + * ReadRecord(), which would retry. However, there's a corner case with + * continuation records, if a record is split across two pages such that + * we would need to read the two pages from different sources. For + * example, imagine a scenario where a streaming replica is started up, + * and replay reaches a record that's split across two WAL segments. The + * first page is only available locally, in pg_wal, because it's already + * been recycled on the primary. The second page, however, is not present + * in pg_wal, and we should stream it from the primary. There is a + * recycled WAL segment present in pg_wal, with garbage contents, however. + * We would read the first page from the local WAL segment, but when + * reading the second page, we would read the bogus, recycled, WAL + * segment. 
If we didn't catch that case here, we would never recover, + * because ReadRecord() would retry reading the whole record from the + * beginning. + * + * Of course, this only catches errors in the page header, which is what + * happens in the case of a recycled WAL segment. Other kinds of errors or + * corruption still has the same problem. But this at least fixes the + * common case, which can happen as part of normal operation. + * + * Validating the page header is cheap enough that doing it twice + * shouldn't be a big deal from a performance point of view. + */ + if (!XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf)) + { + /* reset any error XLogReaderValidatePageHeader() might have set */ + xlogreader->errormsg_buf[0] = '\0'; + goto next_record_is_invalid; + } + + return readLen; + +next_record_is_invalid: + lastSourceFailed = true; + + if (readFile >= 0) + close(readFile); + readFile = -1; + readLen = 0; + readSource = XLOG_FROM_ANY; + + /* In standby-mode, keep trying */ + if (StandbyMode) + goto retry; + else + return -1; +} + +/* + * Open the WAL segment containing WAL location 'RecPtr'. + * + * The segment can be fetched via restore_command, or via walreceiver having + * streamed the record, or it can already be present in pg_wal. Checking + * pg_wal is mainly for crash recovery, but it will be polled in standby mode + * too, in case someone copies a new segment directly to pg_wal. That is not + * documented or recommended, though. + * + * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should + * prepare to read WAL starting from RedoStartLSN after this. + * + * 'RecPtr' might not point to the beginning of the record we're interested + * in, it might also point to the page or segment header. In that case, + * 'tliRecPtr' is the position of the WAL record we're interested in. It is + * used to decide which timeline to stream the requested WAL from. + * + * If the record is not immediately available, the function returns false + * if we're not in standby mode. In standby mode, waits for it to become + * available. + * + * When the requested record becomes available, the function opens the file + * containing it (if not open already), and returns true. When end of standby + * mode is triggered by the user, and there is no more WAL available, returns + * false. + */ +static bool +WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, + bool fetching_ckpt, XLogRecPtr tliRecPtr) +{ + static TimestampTz last_fail_time = 0; + TimestampTz now; + bool streaming_reply_sent = false; + + /*------- + * Standby mode is implemented by a state machine: + * + * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just + * pg_wal (XLOG_FROM_PG_WAL) + * 2. Check trigger file + * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM) + * 4. Rescan timelines + * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1. + * + * Failure to read from the current source advances the state machine to + * the next state. + * + * 'currentSource' indicates the current state. There are no currentSource + * values for "check trigger", "rescan timelines", and "sleep" states, + * those actions are taken when reading from the previous source fails, as + * part of advancing to the next state. 
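A highly simplified standalone model of the source-selection loop described in the state-machine comment above. The try_source() and trigger_file_exists() stand-ins and their hard-coded outcomes are invented for illustration; they do not correspond to real PostgreSQL functions, and the real loop additionally rescans timelines, sleeps, and manages the walreceiver.

/* Toy model of the WAL-source state machine: archive/pg_wal -> stream -> archive. */
#include <stdbool.h>
#include <stdio.h>

typedef enum {FROM_ARCHIVE, FROM_PG_WAL, FROM_STREAM} WalSource;

static const char *const source_names[] = {"archive", "pg_wal", "stream"};

/* stand-in for the real reads; here only streaming ever succeeds */
static bool
try_source(WalSource s)
{
    return s == FROM_STREAM;
}

/* stand-in for CheckForStandbyTrigger(); never triggered in this toy run */
static bool
trigger_file_exists(void)
{
    return false;
}

int
main(void)
{
    WalSource   source = FROM_ARCHIVE;

    for (;;)
    {
        if (try_source(source))
        {
            printf("got WAL from %s\n", source_names[source]);
            break;
        }

        /* failure: advance the state machine, as in the comment above */
        if (source == FROM_ARCHIVE || source == FROM_PG_WAL)
        {
            if (trigger_file_exists())
            {
                printf("promotion triggered, stop retrying\n");
                break;
            }
            source = FROM_STREAM;   /* fall over to streaming */
        }
        else
        {
            /* streaming failed: (rescan timelines, sleep,) back to archive */
            source = FROM_ARCHIVE;
        }
    }
    return 0;
}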
+ * + * If standby mode is turned off while reading WAL from stream, we move + * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching + * the files (which would be required at end of recovery, e.g., timeline + * history file) from archive or pg_wal. We don't need to kill WAL receiver + * here because it's already stopped when standby mode is turned off at + * the end of recovery. + *------- + */ + if (!InArchiveRecovery) + currentSource = XLOG_FROM_PG_WAL; + else if (currentSource == XLOG_FROM_ANY || + (!StandbyMode && currentSource == XLOG_FROM_STREAM)) + { + lastSourceFailed = false; + currentSource = XLOG_FROM_ARCHIVE; + } + + for (;;) + { + XLogSource oldSource = currentSource; + bool startWalReceiver = false; + + /* + * First check if we failed to read from the current source, and + * advance the state machine if so. The failure to read might've + * happened outside this function, e.g when a CRC check fails on a + * record, or within this loop. + */ + if (lastSourceFailed) + { + switch (currentSource) + { + case XLOG_FROM_ARCHIVE: + case XLOG_FROM_PG_WAL: + + /* + * Check to see if the trigger file exists. Note that we + * do this only after failure, so when you create the + * trigger file, we still finish replaying as much as we + * can from archive and pg_wal before failover. + */ + if (StandbyMode && CheckForStandbyTrigger()) + { + ShutdownWalRcv(); + return false; + } + + /* + * Not in standby mode, and we've now tried the archive + * and pg_wal. + */ + if (!StandbyMode) + return false; + + /* + * Move to XLOG_FROM_STREAM state, and set to start a + * walreceiver if necessary. + */ + currentSource = XLOG_FROM_STREAM; + startWalReceiver = true; + break; + + case XLOG_FROM_STREAM: + + /* + * Failure while streaming. Most likely, we got here + * because streaming replication was terminated, or + * promotion was triggered. But we also get here if we + * find an invalid record in the WAL streamed from the + * primary, in which case something is seriously wrong. + * There's little chance that the problem will just go + * away, but PANIC is not good for availability either, + * especially in hot standby mode. So, we treat that the + * same as disconnection, and retry from archive/pg_wal + * again. The WAL in the archive should be identical to + * what was streamed, so it's unlikely that it helps, but + * one can hope... + */ + + /* + * We should be able to move to XLOG_FROM_STREAM only in + * standby mode. + */ + Assert(StandbyMode); + + /* + * Before we leave XLOG_FROM_STREAM state, make sure that + * walreceiver is not active, so that it won't overwrite + * WAL that we restore from archive. + */ + if (WalRcvStreaming()) + ShutdownWalRcv(); + + /* + * Before we sleep, re-scan for possible new timelines if + * we were requested to recover to the latest timeline. + */ + if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST) + { + if (rescanLatestTimeLine()) + { + currentSource = XLOG_FROM_ARCHIVE; + break; + } + } + + /* + * XLOG_FROM_STREAM is the last state in our state + * machine, so we've exhausted all the options for + * obtaining the requested WAL. We're going to loop back + * and retry from the archive, but if it hasn't been long + * since last attempt, sleep wal_retrieve_retry_interval + * milliseconds to avoid busy-waiting. 
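The throttling arithmetic referred to above, as a standalone sketch with made-up timestamps. PostgreSQL itself works in TimestampTz microseconds via TimestampDifferenceMilliseconds(); the millisecond values here are illustrative only.

/* Illustration of "sleep only for what is left of the retry interval". */
#include <stdio.h>

int
main(void)
{
    long        retry_interval_ms = 5000;   /* wal_retrieve_retry_interval */
    long        last_fail_ms = 120000;      /* made-up time of last failure */
    long        now_ms = 123500;            /* made-up current time */
    long        elapsed = now_ms - last_fail_ms;

    if (elapsed < retry_interval_ms)
    {
        long        wait_time = retry_interval_ms - elapsed;

        printf("sleeping %ld ms before retrying the archive\n", wait_time);
    }
    else
        printf("interval already elapsed, retry immediately\n");
    return 0;
}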
+ */ + now = GetCurrentTimestamp(); + if (!TimestampDifferenceExceeds(last_fail_time, now, + wal_retrieve_retry_interval)) + { + long wait_time; + + wait_time = wal_retrieve_retry_interval - + TimestampDifferenceMilliseconds(last_fail_time, now); + + (void) WaitLatch(&XLogCtl->recoveryWakeupLatch, + WL_LATCH_SET | WL_TIMEOUT | + WL_EXIT_ON_PM_DEATH, + wait_time, + WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL); + ResetLatch(&XLogCtl->recoveryWakeupLatch); + now = GetCurrentTimestamp(); + + /* Handle interrupt signals of startup process */ + HandleStartupProcInterrupts(); + } + last_fail_time = now; + currentSource = XLOG_FROM_ARCHIVE; + break; + + default: + elog(ERROR, "unexpected WAL source %d", currentSource); + } + } + else if (currentSource == XLOG_FROM_PG_WAL) + { + /* + * We just successfully read a file in pg_wal. We prefer files in + * the archive over ones in pg_wal, so try the next file again + * from the archive first. + */ + if (InArchiveRecovery) + currentSource = XLOG_FROM_ARCHIVE; + } + + if (currentSource != oldSource) + elog(DEBUG2, "switched WAL source from %s to %s after %s", + xlogSourceNames[oldSource], xlogSourceNames[currentSource], + lastSourceFailed ? "failure" : "success"); + + /* + * We've now handled possible failure. Try to read from the chosen + * source. + */ + lastSourceFailed = false; + + switch (currentSource) + { + case XLOG_FROM_ARCHIVE: + case XLOG_FROM_PG_WAL: + + /* + * WAL receiver must not be running when reading WAL from + * archive or pg_wal. + */ + Assert(!WalRcvStreaming()); + + /* Close any old file we might have open. */ + if (readFile >= 0) + { + close(readFile); + readFile = -1; + } + /* Reset curFileTLI if random fetch. */ + if (randAccess) + curFileTLI = 0; + + /* + * Try to restore the file from archive, or read an existing + * file from pg_wal. + */ + readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2, + currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY : + currentSource); + if (readFile >= 0) + return true; /* success! */ + + /* + * Nope, not found in archive or pg_wal. + */ + lastSourceFailed = true; + break; + + case XLOG_FROM_STREAM: + { + bool havedata; + + /* + * We should be able to move to XLOG_FROM_STREAM only in + * standby mode. + */ + Assert(StandbyMode); + + /* + * First, shutdown walreceiver if its restart has been + * requested -- but no point if we're already slated for + * starting it. + */ + if (pendingWalRcvRestart && !startWalReceiver) + { + ShutdownWalRcv(); + + /* + * Re-scan for possible new timelines if we were + * requested to recover to the latest timeline. + */ + if (recoveryTargetTimeLineGoal == + RECOVERY_TARGET_TIMELINE_LATEST) + rescanLatestTimeLine(); + + startWalReceiver = true; + } + pendingWalRcvRestart = false; + + /* + * Launch walreceiver if needed. + * + * If fetching_ckpt is true, RecPtr points to the initial + * checkpoint location. In that case, we use RedoStartLSN + * as the streaming start position instead of RecPtr, so + * that when we later jump backwards to start redo at + * RedoStartLSN, we will have the logs streamed already. + */ + if (startWalReceiver && + PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0) + { + XLogRecPtr ptr; + TimeLineID tli; + + if (fetching_ckpt) + { + ptr = RedoStartLSN; + tli = ControlFile->checkPointCopy.ThisTimeLineID; + } + else + { + ptr = RecPtr; + + /* + * Use the record begin position to determine the + * TLI, rather than the position we're reading. 
+ */ + tli = tliOfPointInHistory(tliRecPtr, expectedTLEs); + + if (curFileTLI > 0 && tli < curFileTLI) + elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u", + LSN_FORMAT_ARGS(tliRecPtr), + tli, curFileTLI); + } + curFileTLI = tli; + RequestXLogStreaming(tli, ptr, PrimaryConnInfo, + PrimarySlotName, + wal_receiver_create_temp_slot); + flushedUpto = 0; + } + + /* + * Check if WAL receiver is active or wait to start up. + */ + if (!WalRcvStreaming()) + { + lastSourceFailed = true; + break; + } + + /* + * Walreceiver is active, so see if new data has arrived. + * + * We only advance XLogReceiptTime when we obtain fresh + * WAL from walreceiver and observe that we had already + * processed everything before the most recent "chunk" + * that it flushed to disk. In steady state where we are + * keeping up with the incoming data, XLogReceiptTime will + * be updated on each cycle. When we are behind, + * XLogReceiptTime will not advance, so the grace time + * allotted to conflicting queries will decrease. + */ + if (RecPtr < flushedUpto) + havedata = true; + else + { + XLogRecPtr latestChunkStart; + + flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI); + if (RecPtr < flushedUpto && receiveTLI == curFileTLI) + { + havedata = true; + if (latestChunkStart <= RecPtr) + { + XLogReceiptTime = GetCurrentTimestamp(); + SetCurrentChunkStartTime(XLogReceiptTime); + } + } + else + havedata = false; + } + if (havedata) + { + /* + * Great, streamed far enough. Open the file if it's + * not open already. Also read the timeline history + * file if we haven't initialized timeline history + * yet; it should be streamed over and present in + * pg_wal by now. Use XLOG_FROM_STREAM so that source + * info is set correctly and XLogReceiptTime isn't + * changed. + * + * NB: We must set readTimeLineHistory based on + * recoveryTargetTLI, not receiveTLI. Normally they'll + * be the same, but if recovery_target_timeline is + * 'latest' and archiving is configured, then it's + * possible that we managed to retrieve one or more + * new timeline history files from the archive, + * updating recoveryTargetTLI. + */ + if (readFile < 0) + { + if (!expectedTLEs) + expectedTLEs = readTimeLineHistory(recoveryTargetTLI); + readFile = XLogFileRead(readSegNo, PANIC, + receiveTLI, + XLOG_FROM_STREAM, false); + Assert(readFile >= 0); + } + else + { + /* just make sure source info is correct... */ + readSource = XLOG_FROM_STREAM; + XLogReceiptSource = XLOG_FROM_STREAM; + return true; + } + break; + } + + /* + * Data not here yet. Check for trigger, then wait for + * walreceiver to wake us up when new WAL arrives. + */ + if (CheckForStandbyTrigger()) + { + /* + * Note that we don't "return false" immediately here. + * After being triggered, we still want to replay all + * the WAL that was already streamed. It's in pg_wal + * now, so we just treat this as a failure, and the + * state machine will move on to replay the streamed + * WAL from pg_wal, and then recheck the trigger and + * exit replay. + */ + lastSourceFailed = true; + break; + } + + /* + * Since we have replayed everything we have received so + * far and are about to start waiting for more WAL, let's + * tell the upstream server our replay location now so + * that pg_stat_replication doesn't show stale + * information. + */ + if (!streaming_reply_sent) + { + WalRcvForceReply(); + streaming_reply_sent = true; + } + + /* + * Wait for more WAL to arrive. 
Time out after 5 seconds + * to react to a trigger file promptly and to check if the + * WAL receiver is still active. + */ + (void) WaitLatch(&XLogCtl->recoveryWakeupLatch, + WL_LATCH_SET | WL_TIMEOUT | + WL_EXIT_ON_PM_DEATH, + 5000L, WAIT_EVENT_RECOVERY_WAL_STREAM); + ResetLatch(&XLogCtl->recoveryWakeupLatch); + break; + } + + default: + elog(ERROR, "unexpected WAL source %d", currentSource); + } + + /* + * Check for recovery pause here so that we can confirm more quickly + * that a requested pause has actually taken effect. + */ + if (((volatile XLogCtlData *) XLogCtl)->recoveryPauseState != + RECOVERY_NOT_PAUSED) + recoveryPausesHere(false); + + /* + * This possibly-long loop needs to handle interrupts of startup + * process. + */ + HandleStartupProcInterrupts(); + } + + return false; /* not reached */ +} + +/* + * Set flag to signal the walreceiver to restart. (The startup process calls + * this on noticing a relevant configuration change.) + */ +void +StartupRequestWalReceiverRestart(void) +{ + if (currentSource == XLOG_FROM_STREAM && WalRcvRunning()) + { + ereport(LOG, + (errmsg("WAL receiver process shutdown requested"))); + + pendingWalRcvRestart = true; + } +} + +/* + * Determine what log level should be used to report a corrupt WAL record + * in the current WAL page, previously read by XLogPageRead(). + * + * 'emode' is the error mode that would be used to report a file-not-found + * or legitimate end-of-WAL situation. Generally, we use it as-is, but if + * we're retrying the exact same record that we've tried previously, only + * complain the first time to keep the noise down. However, we only do when + * reading from pg_wal, because we don't expect any invalid records in archive + * or in records streamed from the primary. Files in the archive should be complete, + * and we should never hit the end of WAL because we stop and wait for more WAL + * to arrive before replaying it. + * + * NOTE: This function remembers the RecPtr value it was last called with, + * to suppress repeated messages about the same record. Only call this when + * you are about to ereport(), or you might cause a later message to be + * erroneously suppressed. + */ +static int +emode_for_corrupt_record(int emode, XLogRecPtr RecPtr) +{ + static XLogRecPtr lastComplaint = 0; + + if (readSource == XLOG_FROM_PG_WAL && emode == LOG) + { + if (RecPtr == lastComplaint) + emode = DEBUG1; + else + lastComplaint = RecPtr; + } + return emode; +} + +/* + * Has a standby promotion already been triggered? + * + * Unlike CheckForStandbyTrigger(), this works in any process + * that's connected to shared memory. + */ +bool +PromoteIsTriggered(void) +{ + /* + * We check shared state each time only until a standby promotion is + * triggered. We can't trigger a promotion again, so there's no need to + * keep checking after the shared variable has once been seen true. + */ + if (LocalPromoteIsTriggered) + return true; + + SpinLockAcquire(&XLogCtl->info_lck); + LocalPromoteIsTriggered = XLogCtl->SharedPromoteIsTriggered; + SpinLockRelease(&XLogCtl->info_lck); + + return LocalPromoteIsTriggered; +} + +static void +SetPromoteIsTriggered(void) +{ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->SharedPromoteIsTriggered = true; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * Mark the recovery pause state as 'not paused' because the paused state + * ends and promotion continues if a promotion is triggered while recovery + * is paused. 
Otherwise pg_get_wal_replay_pause_state() can mistakenly + * return 'paused' while a promotion is ongoing. + */ + SetRecoveryPause(false); + + LocalPromoteIsTriggered = true; +} + +/* + * Check to see whether the user-specified trigger file exists and whether a + * promote request has arrived. If either condition holds, return true. + */ +static bool +CheckForStandbyTrigger(void) +{ + struct stat stat_buf; + + if (LocalPromoteIsTriggered) + return true; + + if (IsPromoteSignaled() && CheckPromoteSignal()) + { + ereport(LOG, (errmsg("received promote request"))); + RemovePromoteSignalFiles(); + ResetPromoteSignaled(); + SetPromoteIsTriggered(); + return true; + } + + if (PromoteTriggerFile == NULL || strcmp(PromoteTriggerFile, "") == 0) + return false; + + if (stat(PromoteTriggerFile, &stat_buf) == 0) + { + ereport(LOG, + (errmsg("promote trigger file found: %s", PromoteTriggerFile))); + unlink(PromoteTriggerFile); + SetPromoteIsTriggered(); + return true; + } + else if (errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat promote trigger file \"%s\": %m", + PromoteTriggerFile))); + + return false; +} + +/* + * Remove the files signaling a standby promotion request. + */ +void +RemovePromoteSignalFiles(void) +{ + unlink(PROMOTE_SIGNAL_FILE); +} + +/* + * Check to see if a promote request has arrived. + */ +bool +CheckPromoteSignal(void) +{ + struct stat stat_buf; + + if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0) + return true; + + return false; +} + +/* + * Wake up startup process to replay newly arrived WAL, or to notice that + * failover has been requested. + */ +void +WakeupRecovery(void) +{ + SetLatch(&XLogCtl->recoveryWakeupLatch); +} + +/* + * Update the WalWriterSleeping flag. + */ +void +SetWalWriterSleeping(bool sleeping) +{ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->WalWriterSleeping = sleeping; + SpinLockRelease(&XLogCtl->info_lck); +} + +/* + * Schedule a walreceiver wakeup in the main recovery loop. + */ +void +XLogRequestWalReceiverReply(void) +{ + doRequestWalReceiverReply = true; +} diff --git a/src/backend/access/transam/xlogarchive.c b/src/backend/access/transam/xlogarchive.c new file mode 100644 index 0000000..26b023e --- /dev/null +++ b/src/backend/access/transam/xlogarchive.c @@ -0,0 +1,732 @@ +/*------------------------------------------------------------------------- + * + * xlogarchive.c + * Functions for archiving WAL files and restoring from the archive. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/xlogarchive.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include +#include +#include +#include + +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "access/xlogarchive.h" +#include "common/archive.h" +#include "miscadmin.h" +#include "postmaster/startup.h" +#include "postmaster/pgarch.h" +#include "replication/walsender.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" + +/* + * Attempt to retrieve the specified file from off-line archival storage. + * If successful, fill "path" with its complete path (note that this will be + * a temp file name that doesn't follow the normal naming convention), and + * return true. 
+ * + * If not successful, fill "path" with the name of the normal on-line file + * (which may or may not actually exist, but we'll try to use it), and return + * false. + * + * For fixed-size files, the caller may pass the expected size as an + * additional crosscheck on successful recovery. If the file size is not + * known, set expectedSize = 0. + * + * When 'cleanupEnabled' is false, refrain from deleting any old WAL segments + * in the archive. This is used when fetching the initial checkpoint record, + * when we are not yet sure how far back we need the WAL. + */ +bool +RestoreArchivedFile(char *path, const char *xlogfname, + const char *recovername, off_t expectedSize, + bool cleanupEnabled) +{ + char xlogpath[MAXPGPATH]; + char *xlogRestoreCmd; + char lastRestartPointFname[MAXPGPATH]; + int rc; + struct stat stat_buf; + XLogSegNo restartSegNo; + XLogRecPtr restartRedoPtr; + TimeLineID restartTli; + + /* + * Ignore restore_command when not in archive recovery (meaning we are in + * crash recovery). + */ + if (!ArchiveRecoveryRequested) + goto not_available; + + /* In standby mode, restore_command might not be supplied */ + if (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0) + goto not_available; + + /* + * When doing archive recovery, we always prefer an archived log file even + * if a file of the same name exists in XLOGDIR. The reason is that the + * file in XLOGDIR could be an old, un-filled or partly-filled version + * that was copied and restored as part of backing up $PGDATA. + * + * We could try to optimize this slightly by checking the local copy + * lastchange timestamp against the archived copy, but we have no API to + * do this, nor can we guarantee that the lastchange timestamp was + * preserved correctly when we copied to archive. Our aim is robustness, + * so we elect not to do this. + * + * If we cannot obtain the log file from the archive, however, we will try + * to use the XLOGDIR file if it exists. This is so that we can make use + * of log segments that weren't yet transferred to the archive. + * + * Notice that we don't actually overwrite any files when we copy back + * from archive because the restore_command may inadvertently restore + * inappropriate xlogs, or they may be corrupt, so we may wish to fallback + * to the segments remaining in current XLOGDIR later. The + * copy-from-archive filename is always the same, ensuring that we don't + * run out of disk space on long recoveries. + */ + snprintf(xlogpath, MAXPGPATH, XLOGDIR "/%s", recovername); + + /* + * Make sure there is no existing file named recovername. + */ + if (stat(xlogpath, &stat_buf) != 0) + { + if (errno != ENOENT) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + xlogpath))); + } + else + { + if (unlink(xlogpath) != 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", + xlogpath))); + } + + /* + * Calculate the archive file cutoff point for use during log shipping + * replication. All files earlier than this point can be deleted from the + * archive, though there is no requirement to do so. + * + * If cleanup is not enabled, initialise this with the filename of + * InvalidXLogRecPtr, which will prevent the deletion of any WAL files + * from the archive because of the alphabetic sorting property of WAL + * filenames. 
+ * + * Once we have successfully located the redo pointer of the checkpoint + * from which we start recovery we never request a file prior to the redo + * pointer of the last restartpoint. When redo begins we know that we have + * successfully located it, so there is no need for additional status + * flags to signify the point when we can begin deleting WAL files from + * the archive. + */ + if (cleanupEnabled) + { + GetOldestRestartPoint(&restartRedoPtr, &restartTli); + XLByteToSeg(restartRedoPtr, restartSegNo, wal_segment_size); + XLogFileName(lastRestartPointFname, restartTli, restartSegNo, + wal_segment_size); + /* we shouldn't need anything earlier than last restart point */ + Assert(strcmp(lastRestartPointFname, xlogfname) <= 0); + } + else + XLogFileName(lastRestartPointFname, 0, 0L, wal_segment_size); + + /* Build the restore command to execute */ + xlogRestoreCmd = BuildRestoreCommand(recoveryRestoreCommand, + xlogpath, xlogfname, + lastRestartPointFname); + if (xlogRestoreCmd == NULL) + elog(ERROR, "could not build restore command \"%s\"", + recoveryRestoreCommand); + + ereport(DEBUG3, + (errmsg_internal("executing restore command \"%s\"", + xlogRestoreCmd))); + + /* + * Check signals before restore command and reset afterwards. + */ + PreRestoreCommand(); + + /* + * Copy xlog from archival storage to XLOGDIR + */ + rc = system(xlogRestoreCmd); + + PostRestoreCommand(); + pfree(xlogRestoreCmd); + + if (rc == 0) + { + /* + * command apparently succeeded, but let's make sure the file is + * really there now and has the correct size. + */ + if (stat(xlogpath, &stat_buf) == 0) + { + if (expectedSize > 0 && stat_buf.st_size != expectedSize) + { + int elevel; + + /* + * If we find a partial file in standby mode, we assume it's + * because it's just being copied to the archive, and keep + * trying. + * + * Otherwise treat a wrong-sized file as FATAL to ensure the + * DBA would notice it, but is that too strong? We could try + * to plow ahead with a local copy of the file ... but the + * problem is that there probably isn't one, and we'd + * incorrectly conclude we've reached the end of WAL and we're + * done recovering ... + */ + if (StandbyMode && stat_buf.st_size < expectedSize) + elevel = DEBUG1; + else + elevel = FATAL; + ereport(elevel, + (errmsg("archive file \"%s\" has wrong size: %lld instead of %lld", + xlogfname, + (long long int) stat_buf.st_size, + (long long int) expectedSize))); + return false; + } + else + { + ereport(LOG, + (errmsg("restored log file \"%s\" from archive", + xlogfname))); + strcpy(path, xlogpath); + return true; + } + } + else + { + /* stat failed */ + int elevel = (errno == ENOENT) ? LOG : FATAL; + + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", xlogpath), + errdetail("restore_command returned a zero exit status, but stat() failed."))); + } + } + + /* + * Remember, we rollforward UNTIL the restore fails so failure here is + * just part of the process... that makes it difficult to determine + * whether the restore failed because there isn't an archive to restore, + * or because the administrator has specified the restore program + * incorrectly. We have to assume the former. + * + * However, if the failure was due to any sort of signal, it's best to + * punt and abort recovery. (If we "return false" here, upper levels will + * assume that recovery is complete and start up the database!) 
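The signal handling discussed here hinges on decoding the status returned by system(). A standalone POSIX sketch of that decoding follows; PostgreSQL wraps similar checks in wait_result_is_signal() and related helpers, and the sample command below is arbitrary.

/* Distinguishing "command failed" from "command was killed by a signal". */
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>

int
main(void)
{
    int         rc = system("exit 3");      /* arbitrary sample command */

    if (rc == -1)
        printf("could not launch shell\n");
    else if (WIFEXITED(rc))
        printf("command exited with status %d\n", WEXITSTATUS(rc));
    else if (WIFSIGNALED(rc))
        printf("command was terminated by signal %d\n", WTERMSIG(rc));
    return 0;
}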
It's + * essential to abort on child SIGINT and SIGQUIT, because per spec + * system() ignores SIGINT and SIGQUIT while waiting; if we see one of + * those it's a good bet we should have gotten it too. + * + * On SIGTERM, assume we have received a fast shutdown request, and exit + * cleanly. It's pure chance whether we receive the SIGTERM first, or the + * child process. If we receive it first, the signal handler will call + * proc_exit, otherwise we do it here. If we or the child process received + * SIGTERM for any other reason than a fast shutdown request, postmaster + * will perform an immediate shutdown when it sees us exiting + * unexpectedly. + * + * We treat hard shell errors such as "command not found" as fatal, too. + */ + if (wait_result_is_signal(rc, SIGTERM)) + proc_exit(1); + + ereport(wait_result_is_any_signal(rc, true) ? FATAL : DEBUG2, + (errmsg("could not restore file \"%s\" from archive: %s", + xlogfname, wait_result_to_str(rc)))); + +not_available: + + /* + * if an archived file is not available, there might still be a version of + * this file in XLOGDIR, so return that as the filename to open. + * + * In many recovery scenarios we expect this to fail also, but if so that + * just means we've reached the end of WAL. + */ + snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname); + return false; +} + +/* + * Attempt to execute an external shell command during recovery. + * + * 'command' is the shell command to be executed, 'commandName' is a + * human-readable name describing the command emitted in the logs. If + * 'failOnSignal' is true and the command is killed by a signal, a FATAL + * error is thrown. Otherwise a WARNING is emitted. + * + * This is currently used for recovery_end_command and archive_cleanup_command. + */ +void +ExecuteRecoveryCommand(const char *command, const char *commandName, bool failOnSignal) +{ + char xlogRecoveryCmd[MAXPGPATH]; + char lastRestartPointFname[MAXPGPATH]; + char *dp; + char *endp; + const char *sp; + int rc; + XLogSegNo restartSegNo; + XLogRecPtr restartRedoPtr; + TimeLineID restartTli; + + Assert(command && commandName); + + /* + * Calculate the archive file cutoff point for use during log shipping + * replication. All files earlier than this point can be deleted from the + * archive, though there is no requirement to do so. + */ + GetOldestRestartPoint(&restartRedoPtr, &restartTli); + XLByteToSeg(restartRedoPtr, restartSegNo, wal_segment_size); + XLogFileName(lastRestartPointFname, restartTli, restartSegNo, + wal_segment_size); + + /* + * construct the command to be executed + */ + dp = xlogRecoveryCmd; + endp = xlogRecoveryCmd + MAXPGPATH - 1; + *endp = '\0'; + + for (sp = command; *sp; sp++) + { + if (*sp == '%') + { + switch (sp[1]) + { + case 'r': + /* %r: filename of last restartpoint */ + sp++; + strlcpy(dp, lastRestartPointFname, endp - dp); + dp += strlen(dp); + break; + case '%': + /* convert %% to a single % */ + sp++; + if (dp < endp) + *dp++ = *sp; + break; + default: + /* otherwise treat the % as not special */ + if (dp < endp) + *dp++ = *sp; + break; + } + } + else + { + if (dp < endp) + *dp++ = *sp; + } + } + *dp = '\0'; + + ereport(DEBUG3, + (errmsg_internal("executing %s \"%s\"", commandName, command))); + + /* + * execute the constructed command + */ + rc = system(xlogRecoveryCmd); + if (rc != 0) + { + /* + * If the failure was due to any sort of signal, it's best to punt and + * abort recovery. See comments in RestoreArchivedFile(). + */ + ereport((failOnSignal && wait_result_is_any_signal(rc, true)) ? 
FATAL : WARNING, + /*------ + translator: First %s represents a postgresql.conf parameter name like + "recovery_end_command", the 2nd is the value of that parameter, the + third an already translated error message. */ + (errmsg("%s \"%s\": %s", commandName, + command, wait_result_to_str(rc)))); + } +} + + +/* + * A file was restored from the archive under a temporary filename (path), + * and now we want to keep it. Rename it under the permanent filename in + * pg_wal (xlogfname), replacing any existing file with the same name. + */ +void +KeepFileRestoredFromArchive(const char *path, const char *xlogfname) +{ + char xlogfpath[MAXPGPATH]; + bool reload = false; + struct stat statbuf; + + snprintf(xlogfpath, MAXPGPATH, XLOGDIR "/%s", xlogfname); + + if (stat(xlogfpath, &statbuf) == 0) + { + char oldpath[MAXPGPATH]; + +#ifdef WIN32 + static unsigned int deletedcounter = 1; + + /* + * On Windows, if another process (e.g a walsender process) holds the + * file open in FILE_SHARE_DELETE mode, unlink will succeed, but the + * file will still show up in directory listing until the last handle + * is closed, and we cannot rename the new file in its place until + * that. To avoid that problem, rename the old file to a temporary + * name first. Use a counter to create a unique filename, because the + * same file might be restored from the archive multiple times, and a + * walsender could still be holding onto an old deleted version of it. + */ + snprintf(oldpath, MAXPGPATH, "%s.deleted%u", + xlogfpath, deletedcounter++); + if (rename(xlogfpath, oldpath) != 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not rename file \"%s\" to \"%s\": %m", + xlogfpath, oldpath))); + } +#else + /* same-size buffers, so this never truncates */ + strlcpy(oldpath, xlogfpath, MAXPGPATH); +#endif + if (unlink(oldpath) != 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", + xlogfpath))); + reload = true; + } + + durable_rename(path, xlogfpath, ERROR); + + /* + * Create .done file forcibly to prevent the restored segment from being + * archived again later. + */ + if (XLogArchiveMode != ARCHIVE_MODE_ALWAYS) + XLogArchiveForceDone(xlogfname); + else + XLogArchiveNotify(xlogfname); + + /* + * If the existing file was replaced, since walsenders might have it open, + * request them to reload a currently-open segment. This is only required + * for WAL segments, walsenders don't hold other files open, but there's + * no harm in doing this too often, and we don't know what kind of a file + * we're dealing with here. + */ + if (reload) + WalSndRqstFileReload(); + + /* + * Signal walsender that new WAL has arrived. Again, this isn't necessary + * if we restored something other than a WAL segment, but it does no harm + * either. + */ + WalSndWakeup(); +} + +/* + * XLogArchiveNotify + * + * Create an archive notification file + * + * The name of the notification file is the message that will be picked up + * by the archiver, e.g. 
we write 0000000100000001000000C6.ready + * and the archiver then knows to archive XLOGDIR/0000000100000001000000C6, + * then when complete, rename it to 0000000100000001000000C6.done + */ +void +XLogArchiveNotify(const char *xlog) +{ + char archiveStatusPath[MAXPGPATH]; + FILE *fd; + + /* insert an otherwise empty file called .ready */ + StatusFilePath(archiveStatusPath, xlog, ".ready"); + fd = AllocateFile(archiveStatusPath, "w"); + if (fd == NULL) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not create archive status file \"%s\": %m", + archiveStatusPath))); + return; + } + if (FreeFile(fd)) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write archive status file \"%s\": %m", + archiveStatusPath))); + return; + } + + /* Notify archiver that it's got something to do */ + if (IsUnderPostmaster) + PgArchWakeup(); +} + +/* + * Convenience routine to notify using segment number representation of filename + */ +void +XLogArchiveNotifySeg(XLogSegNo segno) +{ + char xlog[MAXFNAMELEN]; + + XLogFileName(xlog, ThisTimeLineID, segno, wal_segment_size); + XLogArchiveNotify(xlog); +} + +/* + * XLogArchiveForceDone + * + * Emit notification forcibly that an XLOG segment file has been successfully + * archived, by creating .done regardless of whether .ready + * exists or not. + */ +void +XLogArchiveForceDone(const char *xlog) +{ + char archiveReady[MAXPGPATH]; + char archiveDone[MAXPGPATH]; + struct stat stat_buf; + FILE *fd; + + /* Exit if already known done */ + StatusFilePath(archiveDone, xlog, ".done"); + if (stat(archiveDone, &stat_buf) == 0) + return; + + /* If .ready exists, rename it to .done */ + StatusFilePath(archiveReady, xlog, ".ready"); + if (stat(archiveReady, &stat_buf) == 0) + { + (void) durable_rename(archiveReady, archiveDone, WARNING); + return; + } + + /* insert an otherwise empty file called .done */ + fd = AllocateFile(archiveDone, "w"); + if (fd == NULL) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not create archive status file \"%s\": %m", + archiveDone))); + return; + } + if (FreeFile(fd)) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write archive status file \"%s\": %m", + archiveDone))); + return; + } +} + +/* + * XLogArchiveCheckDone + * + * This is called when we are ready to delete or recycle an old XLOG segment + * file or backup history file. If it is okay to delete it then return true. + * If it is not time to delete it, make sure a .ready file exists, and return + * false. + * + * If .done exists, then return true; else if .ready exists, + * then return false; else create .ready and return false. + * + * The reason we do things this way is so that if the original attempt to + * create .ready fails, we'll retry during subsequent checkpoints. + */ +bool +XLogArchiveCheckDone(const char *xlog) +{ + char archiveStatusPath[MAXPGPATH]; + struct stat stat_buf; + + /* The file is always deletable if archive_mode is "off". */ + if (!XLogArchivingActive()) + return true; + + /* + * During archive recovery, the file is deletable if archive_mode is not + * "always". + */ + if (!XLogArchivingAlways() && + GetRecoveryState() == RECOVERY_STATE_ARCHIVE) + return true; + + /* + * At this point of the logic, note that we are either a primary with + * archive_mode set to "on" or "always", or a standby with archive_mode + * set to "always". 
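A minimal standalone sketch of the .ready/.done handshake used by the archive status functions in this file: the backend drops an empty "<segment>.ready" marker, the archiver archives the segment and renames the marker to "<segment>.done", and later checks look for the markers in the order used by XLogArchiveCheckDone(). The segment name and archive_status path below are illustrative; this is not PostgreSQL code.

/* Toy version of the archive-status handshake. */
#include <stdio.h>
#include <stdbool.h>
#include <sys/stat.h>

static bool
file_exists(const char *path)
{
    struct stat st;

    return stat(path, &st) == 0;
}

int
main(void)
{
    const char *ready = "archive_status/000000010000000000000001.ready";
    const char *done = "archive_status/000000010000000000000001.done";
    FILE       *fd;

    /* backend side: signal the archiver by creating an empty .ready file */
    fd = fopen(ready, "w");
    if (fd)
        fclose(fd);

    /* checkpoint side: the segment is recyclable only once .done exists */
    if (file_exists(done))
        printf("segment archived, safe to recycle\n");
    else if (file_exists(ready))
        printf("archiver still busy, keep the segment\n");
    else
        printf("no status file: re-create .ready and check again later\n");
    return 0;
}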
+ */ + + /* First check for .done --- this means archiver is done with it */ + StatusFilePath(archiveStatusPath, xlog, ".done"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return true; + + /* check for .ready --- this means archiver is still busy with it */ + StatusFilePath(archiveStatusPath, xlog, ".ready"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return false; + + /* Race condition --- maybe archiver just finished, so recheck */ + StatusFilePath(archiveStatusPath, xlog, ".done"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return true; + + /* Retry creation of the .ready file */ + XLogArchiveNotify(xlog); + return false; +} + +/* + * XLogArchiveIsBusy + * + * Check to see if an XLOG segment file is still unarchived. + * This is almost but not quite the inverse of XLogArchiveCheckDone: in + * the first place we aren't chartered to recreate the .ready file, and + * in the second place we should consider that if the file is already gone + * then it's not busy. (This check is needed to handle the race condition + * that a checkpoint already deleted the no-longer-needed file.) + */ +bool +XLogArchiveIsBusy(const char *xlog) +{ + char archiveStatusPath[MAXPGPATH]; + struct stat stat_buf; + + /* First check for .done --- this means archiver is done with it */ + StatusFilePath(archiveStatusPath, xlog, ".done"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return false; + + /* check for .ready --- this means archiver is still busy with it */ + StatusFilePath(archiveStatusPath, xlog, ".ready"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return true; + + /* Race condition --- maybe archiver just finished, so recheck */ + StatusFilePath(archiveStatusPath, xlog, ".done"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return false; + + /* + * Check to see if the WAL file has been removed by checkpoint, which + * implies it has already been archived, and explains why we can't see a + * status file for it. + */ + snprintf(archiveStatusPath, MAXPGPATH, XLOGDIR "/%s", xlog); + if (stat(archiveStatusPath, &stat_buf) != 0 && + errno == ENOENT) + return false; + + return true; +} + +/* + * XLogArchiveIsReadyOrDone + * + * Check to see if an XLOG segment file has a .ready or .done file. + * This is similar to XLogArchiveIsBusy(), but returns true if the file + * is already archived or is about to be archived. + * + * This is currently only used at recovery. During normal operation this + * would be racy: the file might get removed or marked with .ready as we're + * checking it, or immediately after we return. + */ +bool +XLogArchiveIsReadyOrDone(const char *xlog) +{ + char archiveStatusPath[MAXPGPATH]; + struct stat stat_buf; + + /* First check for .done --- this means archiver is done with it */ + StatusFilePath(archiveStatusPath, xlog, ".done"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return true; + + /* check for .ready --- this means archiver is still busy with it */ + StatusFilePath(archiveStatusPath, xlog, ".ready"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return true; + + /* Race condition --- maybe archiver just finished, so recheck */ + StatusFilePath(archiveStatusPath, xlog, ".done"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return true; + + return false; +} + +/* + * XLogArchiveIsReady + * + * Check to see if an XLOG segment file has an archive notification (.ready) + * file. 
+ */ +bool +XLogArchiveIsReady(const char *xlog) +{ + char archiveStatusPath[MAXPGPATH]; + struct stat stat_buf; + + StatusFilePath(archiveStatusPath, xlog, ".ready"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return true; + + return false; +} + +/* + * XLogArchiveCleanup + * + * Cleanup archive notification file(s) for a particular xlog segment + */ +void +XLogArchiveCleanup(const char *xlog) +{ + char archiveStatusPath[MAXPGPATH]; + + /* Remove the .done file */ + StatusFilePath(archiveStatusPath, xlog, ".done"); + unlink(archiveStatusPath); + /* should we complain about failure? */ + + /* Remove the .ready file if present --- normally it shouldn't be */ + StatusFilePath(archiveStatusPath, xlog, ".ready"); + unlink(archiveStatusPath); + /* should we complain about failure? */ +} diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c new file mode 100644 index 0000000..b98deb7 --- /dev/null +++ b/src/backend/access/transam/xlogfuncs.c @@ -0,0 +1,830 @@ +/*------------------------------------------------------------------------- + * + * xlogfuncs.c + * + * PostgreSQL write-ahead log manager user interface functions + * + * This file contains WAL control and information functions. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/xlogfuncs.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include + +#include "access/htup_details.h" +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "access/xlogutils.h" +#include "catalog/pg_type.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "replication/walreceiver.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/smgr.h" +#include "utils/builtins.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/numeric.h" +#include "utils/pg_lsn.h" +#include "utils/timestamp.h" +#include "utils/tuplestore.h" + +/* + * Store label file and tablespace map during non-exclusive backups. + */ +static StringInfo label_file; +static StringInfo tblspc_map_file; + +/* + * pg_start_backup: set up for taking an on-line backup dump + * + * Essentially what this does is to create a backup label file in $PGDATA, + * where it will be archived as part of the backup dump. The label file + * contains the user-supplied label string (typically this would be used + * to tell where the backup dump will be stored) and the starting time and + * starting WAL location for the dump. + * + * Permission checking for this function is managed through the normal + * GRANT system. + */ +Datum +pg_start_backup(PG_FUNCTION_ARGS) +{ + text *backupid = PG_GETARG_TEXT_PP(0); + bool fast = PG_GETARG_BOOL(1); + bool exclusive = PG_GETARG_BOOL(2); + char *backupidstr; + XLogRecPtr startpoint; + SessionBackupState status = get_backup_status(); + + backupidstr = text_to_cstring(backupid); + + if (status == SESSION_BACKUP_NON_EXCLUSIVE) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("a backup is already in progress in this session"))); + + if (exclusive) + { + startpoint = do_pg_start_backup(backupidstr, fast, NULL, NULL, + NULL, NULL); + } + else + { + MemoryContext oldcontext; + + /* + * Label file and tablespace map file need to be long-lived, since + * they are read in pg_stop_backup. 
+ */ + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + label_file = makeStringInfo(); + tblspc_map_file = makeStringInfo(); + MemoryContextSwitchTo(oldcontext); + + register_persistent_abort_backup_handler(); + + startpoint = do_pg_start_backup(backupidstr, fast, NULL, label_file, + NULL, tblspc_map_file); + } + + PG_RETURN_LSN(startpoint); +} + +/* + * pg_stop_backup: finish taking an on-line backup dump + * + * We write an end-of-backup WAL record, and remove the backup label file + * created by pg_start_backup, creating a backup history file in pg_wal + * instead (whence it will immediately be archived). The backup history file + * contains the same info found in the label file, plus the backup-end time + * and WAL location. Before 9.0, the backup-end time was read from the backup + * history file at the beginning of archive recovery, but we now use the WAL + * record for that and the file is for informational and debug purposes only. + * + * Note: different from CancelBackup which just cancels online backup mode. + * + * Note: this version is only called to stop an exclusive backup. The function + * pg_stop_backup_v2 (overloaded as pg_stop_backup in SQL) is called to + * stop non-exclusive backups. + * + * Permission checking for this function is managed through the normal + * GRANT system. + */ +Datum +pg_stop_backup(PG_FUNCTION_ARGS) +{ + XLogRecPtr stoppoint; + SessionBackupState status = get_backup_status(); + + if (status == SESSION_BACKUP_NON_EXCLUSIVE) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("non-exclusive backup in progress"), + errhint("Did you mean to use pg_stop_backup('f')?"))); + + /* + * Exclusive backups were typically started in a different connection, so + * don't try to verify that status of backup is set to + * SESSION_BACKUP_EXCLUSIVE in this function. Actual verification that an + * exclusive backup is in fact running is handled inside + * do_pg_stop_backup. + */ + stoppoint = do_pg_stop_backup(NULL, true, NULL); + + PG_RETURN_LSN(stoppoint); +} + + +/* + * pg_stop_backup_v2: finish taking exclusive or nonexclusive on-line backup. + * + * Works the same as pg_stop_backup, except for non-exclusive backups it returns + * the backup label and tablespace map files as text fields in as part of the + * resultset. + * + * The first parameter (variable 'exclusive') allows the user to tell us if + * this is an exclusive or a non-exclusive backup. + * + * The second parameter (variable 'waitforarchive'), which is optional, + * allows the user to choose if they want to wait for the WAL to be archived + * or if we should just return as soon as the WAL record is written. + * + * Permission checking for this function is managed through the normal + * GRANT system. 
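A minimal libpq sketch of the non-exclusive flow described above; conn, the label text, and the surrounding program are placeholders. The essential constraint is that both calls must run in the same session, because the label and tablespace-map contents live in that backend's memory until pg_stop_backup hands them back.

    #include <stdio.h>
    #include "libpq-fe.h"

    /* Sketch only: error handling trimmed to the minimum. */
    static void
    run_nonexclusive_backup(PGconn *conn)
    {
        PGresult   *res;

        /* label, fast = false, exclusive = false */
        res = PQexec(conn, "SELECT pg_start_backup('nightly', false, false)");
        PQclear(res);

        /* ... copy the data directory while this session stays open ... */

        /* exclusive = false, wait_for_archive = true */
        res = PQexec(conn,
                     "SELECT lsn, labelfile, spcmapfile FROM pg_stop_backup(false, true)");
        if (PQresultStatus(res) == PGRES_TUPLES_OK)
            printf("backup stopped at %s\n", PQgetvalue(res, 0, 0));
        /* labelfile must be written out as backup_label in the copied directory */
        PQclear(res);
    }

In the exclusive variant the server writes backup_label into the live data directory itself, which is why the label and map columns come back NULL in that branch of the function below.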
+ */ +Datum +pg_stop_backup_v2(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + TupleDesc tupdesc; + Tuplestorestate *tupstore; + MemoryContext per_query_ctx; + MemoryContext oldcontext; + Datum values[3]; + bool nulls[3]; + + bool exclusive = PG_GETARG_BOOL(0); + bool waitforarchive = PG_GETARG_BOOL(1); + XLogRecPtr stoppoint; + SessionBackupState status = get_backup_status(); + + /* check to see if caller supports us returning a tuplestore */ + if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"))); + if (!(rsinfo->allowedModes & SFRM_Materialize)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("materialize mode required, but it is not allowed in this context"))); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + oldcontext = MemoryContextSwitchTo(per_query_ctx); + + tupstore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->returnMode = SFRM_Materialize; + rsinfo->setResult = tupstore; + rsinfo->setDesc = tupdesc; + + MemoryContextSwitchTo(oldcontext); + + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + if (exclusive) + { + if (status == SESSION_BACKUP_NON_EXCLUSIVE) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("non-exclusive backup in progress"), + errhint("Did you mean to use pg_stop_backup('f')?"))); + + /* + * Stop the exclusive backup, and since we're in an exclusive backup + * return NULL for both backup_label and tablespace_map. + */ + stoppoint = do_pg_stop_backup(NULL, waitforarchive, NULL); + + nulls[1] = true; + nulls[2] = true; + } + else + { + if (status != SESSION_BACKUP_NON_EXCLUSIVE) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("non-exclusive backup is not in progress"), + errhint("Did you mean to use pg_stop_backup('t')?"))); + + /* + * Stop the non-exclusive backup. Return a copy of the backup label + * and tablespace map so they can be written to disk by the caller. + */ + stoppoint = do_pg_stop_backup(label_file->data, waitforarchive, NULL); + + values[1] = CStringGetTextDatum(label_file->data); + values[2] = CStringGetTextDatum(tblspc_map_file->data); + + /* Free structures allocated in TopMemoryContext */ + pfree(label_file->data); + pfree(label_file); + label_file = NULL; + pfree(tblspc_map_file->data); + pfree(tblspc_map_file); + tblspc_map_file = NULL; + } + + /* Stoppoint is included on both exclusive and nonexclusive backups */ + values[0] = LSNGetDatum(stoppoint); + + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + tuplestore_donestoring(tupstore); + + return (Datum) 0; +} + +/* + * pg_switch_wal: switch to next xlog file + * + * Permission checking for this function is managed through the normal + * GRANT system. 
+ */ +Datum +pg_switch_wal(PG_FUNCTION_ARGS) +{ + XLogRecPtr switchpoint; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("WAL control functions cannot be executed during recovery."))); + + switchpoint = RequestXLogSwitch(false); + + /* + * As a convenience, return the WAL location of the switch record + */ + PG_RETURN_LSN(switchpoint); +} + +/* + * pg_create_restore_point: a named point for restore + * + * Permission checking for this function is managed through the normal + * GRANT system. + */ +Datum +pg_create_restore_point(PG_FUNCTION_ARGS) +{ + text *restore_name = PG_GETARG_TEXT_PP(0); + char *restore_name_str; + XLogRecPtr restorepoint; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("WAL control functions cannot be executed during recovery."))); + + if (!XLogIsNeeded()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("WAL level not sufficient for creating a restore point"), + errhint("wal_level must be set to \"replica\" or \"logical\" at server start."))); + + restore_name_str = text_to_cstring(restore_name); + + if (strlen(restore_name_str) >= MAXFNAMELEN) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("value too long for restore point (maximum %d characters)", MAXFNAMELEN - 1))); + + restorepoint = XLogRestorePoint(restore_name_str); + + /* + * As a convenience, return the WAL location of the restore point record + */ + PG_RETURN_LSN(restorepoint); +} + +/* + * Report the current WAL write location (same format as pg_start_backup etc) + * + * This is useful for determining how much of WAL is visible to an external + * archiving process. Note that the data before this point is written out + * to the kernel, but is not necessarily synced to disk. + */ +Datum +pg_current_wal_lsn(PG_FUNCTION_ARGS) +{ + XLogRecPtr current_recptr; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("WAL control functions cannot be executed during recovery."))); + + current_recptr = GetXLogWriteRecPtr(); + + PG_RETURN_LSN(current_recptr); +} + +/* + * Report the current WAL insert location (same format as pg_start_backup etc) + * + * This function is mostly for debugging purposes. + */ +Datum +pg_current_wal_insert_lsn(PG_FUNCTION_ARGS) +{ + XLogRecPtr current_recptr; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("WAL control functions cannot be executed during recovery."))); + + current_recptr = GetXLogInsertRecPtr(); + + PG_RETURN_LSN(current_recptr); +} + +/* + * Report the current WAL flush location (same format as pg_start_backup etc) + * + * This function is mostly for debugging purposes. 
+ */ +Datum +pg_current_wal_flush_lsn(PG_FUNCTION_ARGS) +{ + XLogRecPtr current_recptr; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("WAL control functions cannot be executed during recovery."))); + + current_recptr = GetFlushRecPtr(); + + PG_RETURN_LSN(current_recptr); +} + +/* + * Report the last WAL receive location (same format as pg_start_backup etc) + * + * This is useful for determining how much of WAL is guaranteed to be received + * and synced to disk by walreceiver. + */ +Datum +pg_last_wal_receive_lsn(PG_FUNCTION_ARGS) +{ + XLogRecPtr recptr; + + recptr = GetWalRcvFlushRecPtr(NULL, NULL); + + if (recptr == 0) + PG_RETURN_NULL(); + + PG_RETURN_LSN(recptr); +} + +/* + * Report the last WAL replay location (same format as pg_start_backup etc) + * + * This is useful for determining how much of WAL is visible to read-only + * connections during recovery. + */ +Datum +pg_last_wal_replay_lsn(PG_FUNCTION_ARGS) +{ + XLogRecPtr recptr; + + recptr = GetXLogReplayRecPtr(NULL); + + if (recptr == 0) + PG_RETURN_NULL(); + + PG_RETURN_LSN(recptr); +} + +/* + * Compute an xlog file name and decimal byte offset given a WAL location, + * such as is returned by pg_stop_backup() or pg_switch_wal(). + * + * Note that a location exactly at a segment boundary is taken to be in + * the previous segment. This is usually the right thing, since the + * expected usage is to determine which xlog file(s) are ready to archive. + */ +Datum +pg_walfile_name_offset(PG_FUNCTION_ARGS) +{ + XLogSegNo xlogsegno; + uint32 xrecoff; + XLogRecPtr locationpoint = PG_GETARG_LSN(0); + char xlogfilename[MAXFNAMELEN]; + Datum values[2]; + bool isnull[2]; + TupleDesc resultTupleDesc; + HeapTuple resultHeapTuple; + Datum result; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("%s cannot be executed during recovery.", + "pg_walfile_name_offset()"))); + + /* + * Construct a tuple descriptor for the result row. This must match this + * function's pg_proc entry! + */ + resultTupleDesc = CreateTemplateTupleDesc(2); + TupleDescInitEntry(resultTupleDesc, (AttrNumber) 1, "file_name", + TEXTOID, -1, 0); + TupleDescInitEntry(resultTupleDesc, (AttrNumber) 2, "file_offset", + INT4OID, -1, 0); + + resultTupleDesc = BlessTupleDesc(resultTupleDesc); + + /* + * xlogfilename + */ + XLByteToPrevSeg(locationpoint, xlogsegno, wal_segment_size); + XLogFileName(xlogfilename, ThisTimeLineID, xlogsegno, wal_segment_size); + + values[0] = CStringGetTextDatum(xlogfilename); + isnull[0] = false; + + /* + * offset + */ + xrecoff = XLogSegmentOffset(locationpoint, wal_segment_size); + + values[1] = UInt32GetDatum(xrecoff); + isnull[1] = false; + + /* + * Tuple jam: Having first prepared your Datums, then squash together + */ + resultHeapTuple = heap_form_tuple(resultTupleDesc, values, isnull); + + result = HeapTupleGetDatum(resultHeapTuple); + + PG_RETURN_DATUM(result); +} + +/* + * Compute an xlog file name given a WAL location, + * such as is returned by pg_stop_backup() or pg_switch_wal(). 
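A worked example of the mapping these two functions perform, assuming the default 16 MB wal_segment_size and timeline 1 (both chosen only for illustration):

    location  = 0/5000428
    segno     = 0x05000428 / 0x01000000 = 5
    offset    = 0x05000428 % 0x01000000 = 0x428 = 1064
    file_name = 000000010000000000000005

A location exactly on a segment boundary, say 0/6000000, is reported as the previous file (...000005), matching the note above that such a location is taken to belong to the segment that has just been filled.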
+ */ +Datum +pg_walfile_name(PG_FUNCTION_ARGS) +{ + XLogSegNo xlogsegno; + XLogRecPtr locationpoint = PG_GETARG_LSN(0); + char xlogfilename[MAXFNAMELEN]; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("%s cannot be executed during recovery.", + "pg_walfile_name()"))); + + XLByteToPrevSeg(locationpoint, xlogsegno, wal_segment_size); + XLogFileName(xlogfilename, ThisTimeLineID, xlogsegno, wal_segment_size); + + PG_RETURN_TEXT_P(cstring_to_text(xlogfilename)); +} + +/* + * pg_wal_replay_pause - Request to pause recovery + * + * Permission checking for this function is managed through the normal + * GRANT system. + */ +Datum +pg_wal_replay_pause(PG_FUNCTION_ARGS) +{ + if (!RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is not in progress"), + errhint("Recovery control functions can only be executed during recovery."))); + + if (PromoteIsTriggered()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("standby promotion is ongoing"), + errhint("%s cannot be executed after promotion is triggered.", + "pg_wal_replay_pause()"))); + + SetRecoveryPause(true); + + /* wake up the recovery process so that it can process the pause request */ + WakeupRecovery(); + + PG_RETURN_VOID(); +} + +/* + * pg_wal_replay_resume - resume recovery now + * + * Permission checking for this function is managed through the normal + * GRANT system. + */ +Datum +pg_wal_replay_resume(PG_FUNCTION_ARGS) +{ + if (!RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is not in progress"), + errhint("Recovery control functions can only be executed during recovery."))); + + if (PromoteIsTriggered()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("standby promotion is ongoing"), + errhint("%s cannot be executed after promotion is triggered.", + "pg_wal_replay_resume()"))); + + SetRecoveryPause(false); + + PG_RETURN_VOID(); +} + +/* + * pg_is_wal_replay_paused + */ +Datum +pg_is_wal_replay_paused(PG_FUNCTION_ARGS) +{ + if (!RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is not in progress"), + errhint("Recovery control functions can only be executed during recovery."))); + + PG_RETURN_BOOL(GetRecoveryPauseState() != RECOVERY_NOT_PAUSED); +} + +/* + * pg_get_wal_replay_pause_state - Returns the recovery pause state. + * + * Returned values: + * + * 'not paused' - if pause is not requested + * 'pause requested' - if pause is requested but recovery is not yet paused + * 'paused' - if recovery is paused + */ +Datum +pg_get_wal_replay_pause_state(PG_FUNCTION_ARGS) +{ + char *statestr = NULL; + + if (!RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is not in progress"), + errhint("Recovery control functions can only be executed during recovery."))); + + /* get the recovery pause state */ + switch (GetRecoveryPauseState()) + { + case RECOVERY_NOT_PAUSED: + statestr = "not paused"; + break; + case RECOVERY_PAUSE_REQUESTED: + statestr = "pause requested"; + break; + case RECOVERY_PAUSED: + statestr = "paused"; + break; + } + + Assert(statestr != NULL); + PG_RETURN_TEXT_P(cstring_to_text(statestr)); +} + +/* + * Returns timestamp of latest processed commit/abort record. 
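Because pg_wal_replay_pause() only requests a pause, a client that needs replay to have actually stopped must poll pg_get_wal_replay_pause_state() until it reports 'paused'. A rough libpq sketch, with connection handling assumed to happen elsewhere:

    #include <string.h>
    #include <unistd.h>
    #include "libpq-fe.h"

    static void
    pause_and_wait(PGconn *conn)
    {
        PGresult   *res;

        PQclear(PQexec(conn, "SELECT pg_wal_replay_pause()"));

        for (;;)
        {
            res = PQexec(conn, "SELECT pg_get_wal_replay_pause_state()");
            if (PQresultStatus(res) == PGRES_TUPLES_OK &&
                strcmp(PQgetvalue(res, 0, 0), "paused") == 0)
            {
                PQclear(res);
                break;          /* replay has really stopped */
            }
            PQclear(res);
            sleep(1);           /* recovery pauses only at the next safe point */
        }
    }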
+ * + * When the server has been started normally without recovery the function + * returns NULL. + */ +Datum +pg_last_xact_replay_timestamp(PG_FUNCTION_ARGS) +{ + TimestampTz xtime; + + xtime = GetLatestXTime(); + if (xtime == 0) + PG_RETURN_NULL(); + + PG_RETURN_TIMESTAMPTZ(xtime); +} + +/* + * Returns bool with current recovery mode, a global state. + */ +Datum +pg_is_in_recovery(PG_FUNCTION_ARGS) +{ + PG_RETURN_BOOL(RecoveryInProgress()); +} + +/* + * Compute the difference in bytes between two WAL locations. + */ +Datum +pg_wal_lsn_diff(PG_FUNCTION_ARGS) +{ + Datum result; + + result = DirectFunctionCall2(pg_lsn_mi, + PG_GETARG_DATUM(0), + PG_GETARG_DATUM(1)); + + PG_RETURN_NUMERIC(result); +} + +/* + * Returns bool with current on-line backup mode, a global state. + */ +Datum +pg_is_in_backup(PG_FUNCTION_ARGS) +{ + PG_RETURN_BOOL(BackupInProgress()); +} + +/* + * Returns start time of an online exclusive backup. + * + * When there's no exclusive backup in progress, the function + * returns NULL. + */ +Datum +pg_backup_start_time(PG_FUNCTION_ARGS) +{ + Datum xtime; + FILE *lfp; + char fline[MAXPGPATH]; + char backup_start_time[30]; + + /* + * See if label file is present + */ + lfp = AllocateFile(BACKUP_LABEL_FILE, "r"); + if (lfp == NULL) + { + if (errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + BACKUP_LABEL_FILE))); + PG_RETURN_NULL(); + } + + /* + * Parse the file to find the START TIME line. + */ + backup_start_time[0] = '\0'; + while (fgets(fline, sizeof(fline), lfp) != NULL) + { + if (sscanf(fline, "START TIME: %25[^\n]\n", backup_start_time) == 1) + break; + } + + /* Check for a read error. */ + if (ferror(lfp)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", BACKUP_LABEL_FILE))); + + /* Close the backup label file. */ + if (FreeFile(lfp)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", BACKUP_LABEL_FILE))); + + if (strlen(backup_start_time) == 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE))); + + /* + * Convert the time string read from file to TimestampTz form. + */ + xtime = DirectFunctionCall3(timestamptz_in, + CStringGetDatum(backup_start_time), + ObjectIdGetDatum(InvalidOid), + Int32GetDatum(-1)); + + PG_RETURN_DATUM(xtime); +} + +/* + * Promotes a standby server. + * + * A result of "true" means that promotion has been completed if "wait" is + * "true", or initiated if "wait" is false. 
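A short libpq sketch of driving the promotion function defined below, with conn assumed to be a connection to the standby:

    #include <stdio.h>
    #include <string.h>
    #include "libpq-fe.h"

    static void
    promote_standby(PGconn *conn)
    {
        /* wait = true, wait_seconds = 60 */
        PGresult   *res = PQexec(conn, "SELECT pg_promote(true, 60)");

        if (PQresultStatus(res) == PGRES_TUPLES_OK &&
            strcmp(PQgetvalue(res, 0, 0), "t") == 0)
            printf("standby promoted\n");
        else
            printf("promotion did not complete within 60 seconds\n");
        PQclear(res);
    }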
+ */ +Datum +pg_promote(PG_FUNCTION_ARGS) +{ + bool wait = PG_GETARG_BOOL(0); + int wait_seconds = PG_GETARG_INT32(1); + FILE *promote_file; + int i; + + if (!RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is not in progress"), + errhint("Recovery control functions can only be executed during recovery."))); + + if (wait_seconds <= 0) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("\"wait_seconds\" must not be negative or zero"))); + + /* create the promote signal file */ + promote_file = AllocateFile(PROMOTE_SIGNAL_FILE, "w"); + if (!promote_file) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", + PROMOTE_SIGNAL_FILE))); + + if (FreeFile(promote_file)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", + PROMOTE_SIGNAL_FILE))); + + /* signal the postmaster */ + if (kill(PostmasterPid, SIGUSR1) != 0) + { + ereport(WARNING, + (errmsg("failed to send signal to postmaster: %m"))); + (void) unlink(PROMOTE_SIGNAL_FILE); + PG_RETURN_BOOL(false); + } + + /* return immediately if waiting was not requested */ + if (!wait) + PG_RETURN_BOOL(true); + + /* wait for the amount of time wanted until promotion */ +#define WAITS_PER_SECOND 10 + for (i = 0; i < WAITS_PER_SECOND * wait_seconds; i++) + { + int rc; + + ResetLatch(MyLatch); + + if (!RecoveryInProgress()) + PG_RETURN_BOOL(true); + + CHECK_FOR_INTERRUPTS(); + + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, + 1000L / WAITS_PER_SECOND, + WAIT_EVENT_PROMOTE); + + /* + * Emergency bailout if postmaster has died. This is to avoid the + * necessity for manual cleanup of all postmaster children. + */ + if (rc & WL_POSTMASTER_DEATH) + PG_RETURN_BOOL(false); + } + + ereport(WARNING, + (errmsg_plural("server did not promote within %d second", + "server did not promote within %d seconds", + wait_seconds, + wait_seconds))); + PG_RETURN_BOOL(false); +} diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c new file mode 100644 index 0000000..b153fad --- /dev/null +++ b/src/backend/access/transam/xloginsert.c @@ -0,0 +1,1229 @@ +/*------------------------------------------------------------------------- + * + * xloginsert.c + * Functions for constructing WAL records + * + * Constructing a WAL record begins with a call to XLogBeginInsert, + * followed by a number of XLogRegister* calls. The registered data is + * collected in private working memory, and finally assembled into a chain + * of XLogRecData structs by a call to XLogRecordAssemble(). See + * access/transam/README for details. 
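A minimal sketch of the calling pattern just described, the way a hypothetical resource manager might log a change to one buffer. xl_foo, RM_FOO_ID and XLOG_FOO_OP are placeholders, buffer is assumed to be pinned and exclusively locked, and access/xloginsert.h plus storage/bufmgr.h supply the declarations used here:

    xl_foo      xlrec;          /* rmgr-specific redo data (placeholder type) */
    XLogRecPtr  recptr;

    /* fill in xlrec, then apply and log the change atomically */
    START_CRIT_SECTION();

    /* ... modify the page in 'buffer' ... */
    MarkBufferDirty(buffer);

    XLogBeginInsert();
    XLogRegisterData((char *) &xlrec, sizeof(xlrec));   /* the "main data" */
    XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);     /* block reference 0 */

    recptr = XLogInsert(RM_FOO_ID, XLOG_FOO_OP);
    PageSetLSN(BufferGetPage(buffer), recptr);

    END_CRIT_SECTION();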
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/xloginsert.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "access/xloginsert.h" +#include "catalog/pg_control.h" +#include "common/pg_lzcompress.h" +#include "executor/instrument.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "replication/origin.h" +#include "storage/bufmgr.h" +#include "storage/proc.h" +#include "utils/memutils.h" + +/* Buffer size required to store a compressed version of backup block image */ +#define PGLZ_MAX_BLCKSZ PGLZ_MAX_OUTPUT(BLCKSZ) + +/* + * For each block reference registered with XLogRegisterBuffer, we fill in + * a registered_buffer struct. + */ +typedef struct +{ + bool in_use; /* is this slot in use? */ + uint8 flags; /* REGBUF_* flags */ + RelFileNode rnode; /* identifies the relation and block */ + ForkNumber forkno; + BlockNumber block; + Page page; /* page content */ + uint32 rdata_len; /* total length of data in rdata chain */ + XLogRecData *rdata_head; /* head of the chain of data registered with + * this block */ + XLogRecData *rdata_tail; /* last entry in the chain, or &rdata_head if + * empty */ + + XLogRecData bkp_rdatas[2]; /* temporary rdatas used to hold references to + * backup block data in XLogRecordAssemble() */ + + /* buffer to store a compressed version of backup block image */ + char compressed_page[PGLZ_MAX_BLCKSZ]; +} registered_buffer; + +static registered_buffer *registered_buffers; +static int max_registered_buffers; /* allocated size */ +static int max_registered_block_id = 0; /* highest block_id + 1 currently + * registered */ + +/* + * A chain of XLogRecDatas to hold the "main data" of a WAL record, registered + * with XLogRegisterData(...). + */ +static XLogRecData *mainrdata_head; +static XLogRecData *mainrdata_last = (XLogRecData *) &mainrdata_head; +static uint32 mainrdata_len; /* total # of bytes in chain */ + +/* flags for the in-progress insertion */ +static uint8 curinsert_flags = 0; + +/* + * These are used to hold the record header while constructing a record. + * 'hdr_scratch' is not a plain variable, but is palloc'd at initialization, + * because we want it to be MAXALIGNed and padding bytes zeroed. + * + * For simplicity, it's allocated large enough to hold the headers for any + * WAL record. + */ +static XLogRecData hdr_rdt; +static char *hdr_scratch = NULL; + +#define SizeOfXlogOrigin (sizeof(RepOriginId) + sizeof(char)) +#define SizeOfXLogTransactionId (sizeof(TransactionId) + sizeof(char)) + +#define HEADER_SCRATCH_SIZE \ + (SizeOfXLogRecord + \ + MaxSizeOfXLogRecordBlockHeader * (XLR_MAX_BLOCK_ID + 1) + \ + SizeOfXLogRecordDataHeaderLong + SizeOfXlogOrigin + \ + SizeOfXLogTransactionId) + +/* + * An array of XLogRecData structs, to hold registered data. + */ +static XLogRecData *rdatas; +static int num_rdatas; /* entries currently used */ +static int max_rdatas; /* allocated size */ + +static bool begininsert_called = false; + +/* Memory context to hold the registered buffer and data references. 
*/ +static MemoryContext xloginsert_cxt; + +static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, + XLogRecPtr RedoRecPtr, bool doPageWrites, + XLogRecPtr *fpw_lsn, int *num_fpi); +static bool XLogCompressBackupBlock(char *page, uint16 hole_offset, + uint16 hole_length, char *dest, uint16 *dlen); + +/* + * Begin constructing a WAL record. This must be called before the + * XLogRegister* functions and XLogInsert(). + */ +void +XLogBeginInsert(void) +{ + Assert(max_registered_block_id == 0); + Assert(mainrdata_last == (XLogRecData *) &mainrdata_head); + Assert(mainrdata_len == 0); + + /* cross-check on whether we should be here or not */ + if (!XLogInsertAllowed()) + elog(ERROR, "cannot make new WAL entries during recovery"); + + if (begininsert_called) + elog(ERROR, "XLogBeginInsert was already called"); + + begininsert_called = true; +} + +/* + * Ensure that there are enough buffer and data slots in the working area, + * for subsequent XLogRegisterBuffer, XLogRegisterData and XLogRegisterBufData + * calls. + * + * There is always space for a small number of buffers and data chunks, enough + * for most record types. This function is for the exceptional cases that need + * more. + */ +void +XLogEnsureRecordSpace(int max_block_id, int ndatas) +{ + int nbuffers; + + /* + * This must be called before entering a critical section, because + * allocating memory inside a critical section can fail. repalloc() will + * check the same, but better to check it here too so that we fail + * consistently even if the arrays happen to be large enough already. + */ + Assert(CritSectionCount == 0); + + /* the minimum values can't be decreased */ + if (max_block_id < XLR_NORMAL_MAX_BLOCK_ID) + max_block_id = XLR_NORMAL_MAX_BLOCK_ID; + if (ndatas < XLR_NORMAL_RDATAS) + ndatas = XLR_NORMAL_RDATAS; + + if (max_block_id > XLR_MAX_BLOCK_ID) + elog(ERROR, "maximum number of WAL record block references exceeded"); + nbuffers = max_block_id + 1; + + if (nbuffers > max_registered_buffers) + { + registered_buffers = (registered_buffer *) + repalloc(registered_buffers, sizeof(registered_buffer) * nbuffers); + + /* + * At least the padding bytes in the structs must be zeroed, because + * they are included in WAL data, but initialize it all for tidiness. + */ + MemSet(®istered_buffers[max_registered_buffers], 0, + (nbuffers - max_registered_buffers) * sizeof(registered_buffer)); + max_registered_buffers = nbuffers; + } + + if (ndatas > max_rdatas) + { + rdatas = (XLogRecData *) repalloc(rdatas, sizeof(XLogRecData) * ndatas); + max_rdatas = ndatas; + } +} + +/* + * Reset WAL record construction buffers. + */ +void +XLogResetInsertion(void) +{ + int i; + + /* reset the subxact assignment flag (if needed) */ + if (curinsert_flags & XLOG_INCLUDE_XID) + MarkSubTransactionAssigned(); + + for (i = 0; i < max_registered_block_id; i++) + registered_buffers[i].in_use = false; + + num_rdatas = 0; + max_registered_block_id = 0; + mainrdata_len = 0; + mainrdata_last = (XLogRecData *) &mainrdata_head; + curinsert_flags = 0; + begininsert_called = false; +} + +/* + * Register a reference to a buffer with the WAL record being constructed. + * This must be called for every page that the WAL-logged operation modifies. 
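When an operation registers more block references than the defaults allow for, the extra slots have to be reserved before entering the critical section, since growing the arrays may allocate memory. A rough sketch, in which nblocks, buffers[] and the rmgr symbols are placeholders:

    XLogRecPtr  recptr;

    /* block ids will run 0 .. nblocks-1, so pass the highest id */
    XLogEnsureRecordSpace(nblocks - 1, 0);

    START_CRIT_SECTION();
    XLogBeginInsert();
    for (int i = 0; i < nblocks; i++)
    {
        MarkBufferDirty(buffers[i]);
        XLogRegisterBuffer(i, buffers[i], REGBUF_STANDARD);
    }
    recptr = XLogInsert(RM_FOO_ID, XLOG_FOO_MULTI);
    for (int i = 0; i < nblocks; i++)
        PageSetLSN(BufferGetPage(buffers[i]), recptr);
    END_CRIT_SECTION();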
+ */ +void +XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags) +{ + registered_buffer *regbuf; + + /* NO_IMAGE doesn't make sense with FORCE_IMAGE */ + Assert(!((flags & REGBUF_FORCE_IMAGE) && (flags & (REGBUF_NO_IMAGE)))); + Assert(begininsert_called); + + if (block_id >= max_registered_block_id) + { + if (block_id >= max_registered_buffers) + elog(ERROR, "too many registered buffers"); + max_registered_block_id = block_id + 1; + } + + regbuf = ®istered_buffers[block_id]; + + BufferGetTag(buffer, ®buf->rnode, ®buf->forkno, ®buf->block); + regbuf->page = BufferGetPage(buffer); + regbuf->flags = flags; + regbuf->rdata_tail = (XLogRecData *) ®buf->rdata_head; + regbuf->rdata_len = 0; + + /* + * Check that this page hasn't already been registered with some other + * block_id. + */ +#ifdef USE_ASSERT_CHECKING + { + int i; + + for (i = 0; i < max_registered_block_id; i++) + { + registered_buffer *regbuf_old = ®istered_buffers[i]; + + if (i == block_id || !regbuf_old->in_use) + continue; + + Assert(!RelFileNodeEquals(regbuf_old->rnode, regbuf->rnode) || + regbuf_old->forkno != regbuf->forkno || + regbuf_old->block != regbuf->block); + } + } +#endif + + regbuf->in_use = true; +} + +/* + * Like XLogRegisterBuffer, but for registering a block that's not in the + * shared buffer pool (i.e. when you don't have a Buffer for it). + */ +void +XLogRegisterBlock(uint8 block_id, RelFileNode *rnode, ForkNumber forknum, + BlockNumber blknum, Page page, uint8 flags) +{ + registered_buffer *regbuf; + + /* This is currently only used to WAL-log a full-page image of a page */ + Assert(flags & REGBUF_FORCE_IMAGE); + Assert(begininsert_called); + + if (block_id >= max_registered_block_id) + max_registered_block_id = block_id + 1; + + if (block_id >= max_registered_buffers) + elog(ERROR, "too many registered buffers"); + + regbuf = ®istered_buffers[block_id]; + + regbuf->rnode = *rnode; + regbuf->forkno = forknum; + regbuf->block = blknum; + regbuf->page = page; + regbuf->flags = flags; + regbuf->rdata_tail = (XLogRecData *) ®buf->rdata_head; + regbuf->rdata_len = 0; + + /* + * Check that this page hasn't already been registered with some other + * block_id. + */ +#ifdef USE_ASSERT_CHECKING + { + int i; + + for (i = 0; i < max_registered_block_id; i++) + { + registered_buffer *regbuf_old = ®istered_buffers[i]; + + if (i == block_id || !regbuf_old->in_use) + continue; + + Assert(!RelFileNodeEquals(regbuf_old->rnode, regbuf->rnode) || + regbuf_old->forkno != regbuf->forkno || + regbuf_old->block != regbuf->block); + } + } +#endif + + regbuf->in_use = true; +} + +/* + * Add data to the WAL record that's being constructed. + * + * The data is appended to the "main chunk", available at replay with + * XLogRecGetData(). + */ +void +XLogRegisterData(char *data, int len) +{ + XLogRecData *rdata; + + Assert(begininsert_called); + + if (num_rdatas >= max_rdatas) + elog(ERROR, "too much WAL data"); + rdata = &rdatas[num_rdatas++]; + + rdata->data = data; + rdata->len = len; + + /* + * we use the mainrdata_last pointer to track the end of the chain, so no + * need to clear 'next' here. + */ + + mainrdata_last->next = rdata; + mainrdata_last = rdata; + + mainrdata_len += len; +} + +/* + * Add buffer-specific data to the WAL record that's being constructed. + * + * Block_id must reference a block previously registered with + * XLogRegisterBuffer(). If this is called more than once for the same + * block_id, the data is appended. + * + * The maximum amount of data that can be registered per block is 65535 + * bytes. 
That should be plenty; if you need more than BLCKSZ bytes to + * reconstruct the changes to the page, you might as well just log a full + * copy of it. (the "main data" that's not associated with a block is not + * limited) + */ +void +XLogRegisterBufData(uint8 block_id, char *data, int len) +{ + registered_buffer *regbuf; + XLogRecData *rdata; + + Assert(begininsert_called); + + /* find the registered buffer struct */ + regbuf = ®istered_buffers[block_id]; + if (!regbuf->in_use) + elog(ERROR, "no block with id %d registered with WAL insertion", + block_id); + + if (num_rdatas >= max_rdatas) + elog(ERROR, "too much WAL data"); + rdata = &rdatas[num_rdatas++]; + + rdata->data = data; + rdata->len = len; + + regbuf->rdata_tail->next = rdata; + regbuf->rdata_tail = rdata; + regbuf->rdata_len += len; +} + +/* + * Set insert status flags for the upcoming WAL record. + * + * The flags that can be used here are: + * - XLOG_INCLUDE_ORIGIN, to determine if the replication origin should be + * included in the record. + * - XLOG_MARK_UNIMPORTANT, to signal that the record is not important for + * durability, which allows to avoid triggering WAL archiving and other + * background activity. + * - XLOG_INCLUDE_XID, a message-passing hack between XLogRecordAssemble + * and XLogResetInsertion. + */ +void +XLogSetRecordFlags(uint8 flags) +{ + Assert(begininsert_called); + curinsert_flags |= flags; +} + +/* + * Insert an XLOG record having the specified RMID and info bytes, with the + * body of the record being the data and buffer references registered earlier + * with XLogRegister* calls. + * + * Returns XLOG pointer to end of record (beginning of next record). + * This can be used as LSN for data pages affected by the logged action. + * (LSN is the XLOG point up to which the XLOG must be flushed to disk + * before the data page can be written out. This implements the basic + * WAL rule "write the log before the data".) + */ +XLogRecPtr +XLogInsert(RmgrId rmid, uint8 info) +{ + XLogRecPtr EndPos; + + /* XLogBeginInsert() must have been called. */ + if (!begininsert_called) + elog(ERROR, "XLogBeginInsert was not called"); + + /* + * The caller can set rmgr bits, XLR_SPECIAL_REL_UPDATE and + * XLR_CHECK_CONSISTENCY; the rest are reserved for use by me. + */ + if ((info & ~(XLR_RMGR_INFO_MASK | + XLR_SPECIAL_REL_UPDATE | + XLR_CHECK_CONSISTENCY)) != 0) + elog(PANIC, "invalid xlog info mask %02X", info); + + TRACE_POSTGRESQL_WAL_INSERT(rmid, info); + + /* + * In bootstrap mode, we don't actually log anything but XLOG resources; + * return a phony record pointer. + */ + if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID) + { + XLogResetInsertion(); + EndPos = SizeOfXLogLongPHD; /* start of 1st chkpt record */ + return EndPos; + } + + do + { + XLogRecPtr RedoRecPtr; + bool doPageWrites; + XLogRecPtr fpw_lsn; + XLogRecData *rdt; + int num_fpi = 0; + + /* + * Get values needed to decide whether to do full-page writes. Since + * we don't yet have an insertion lock, these could change under us, + * but XLogInsertRecord will recheck them once it has a lock. + */ + GetFullPageWriteInfo(&RedoRecPtr, &doPageWrites); + + rdt = XLogRecordAssemble(rmid, info, RedoRecPtr, doPageWrites, + &fpw_lsn, &num_fpi); + + EndPos = XLogInsertRecord(rdt, fpw_lsn, curinsert_flags, num_fpi); + } while (EndPos == InvalidXLogRecPtr); + + XLogResetInsertion(); + + return EndPos; +} + +/* + * Assemble a WAL record from the registered data and buffers into an + * XLogRecData chain, ready for insertion with XLogInsertRecord(). 
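Tying the registration calls in this file together, a sketch of attaching a small piece of per-buffer data to a registered block; offnum, buffer and the rmgr symbols are placeholders, and at redo time the same bytes would normally be fetched back with XLogRecGetBlockData():

    XLogRecPtr  recptr;

    /* offnum identifies the line pointer being changed (however the caller obtained it) */
    XLogBeginInsert();
    XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
    XLogRegisterBufData(0, (char *) &offnum, sizeof(OffsetNumber));  /* appended to block 0 */
    recptr = XLogInsert(RM_FOO_ID, XLOG_FOO_SET_FLAG);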
+ * + * The record header fields are filled in, except for the xl_prev field. The + * calculated CRC does not include the record header yet. + * + * If there are any registered buffers, and a full-page image was not taken + * of all of them, *fpw_lsn is set to the lowest LSN among such pages. This + * signals that the assembled record is only good for insertion on the + * assumption that the RedoRecPtr and doPageWrites values were up-to-date. + */ +static XLogRecData * +XLogRecordAssemble(RmgrId rmid, uint8 info, + XLogRecPtr RedoRecPtr, bool doPageWrites, + XLogRecPtr *fpw_lsn, int *num_fpi) +{ + XLogRecData *rdt; + uint32 total_len = 0; + int block_id; + pg_crc32c rdata_crc; + registered_buffer *prev_regbuf = NULL; + XLogRecData *rdt_datas_last; + XLogRecord *rechdr; + char *scratch = hdr_scratch; + + /* + * Note: this function can be called multiple times for the same record. + * All the modifications we do to the rdata chains below must handle that. + */ + + /* The record begins with the fixed-size header */ + rechdr = (XLogRecord *) scratch; + scratch += SizeOfXLogRecord; + + hdr_rdt.next = NULL; + rdt_datas_last = &hdr_rdt; + hdr_rdt.data = hdr_scratch; + + /* + * Enforce consistency checks for this record if user is looking for it. + * Do this before at the beginning of this routine to give the possibility + * for callers of XLogInsert() to pass XLR_CHECK_CONSISTENCY directly for + * a record. + */ + if (wal_consistency_checking[rmid]) + info |= XLR_CHECK_CONSISTENCY; + + /* + * Make an rdata chain containing all the data portions of all block + * references. This includes the data for full-page images. Also append + * the headers for the block references in the scratch buffer. + */ + *fpw_lsn = InvalidXLogRecPtr; + for (block_id = 0; block_id < max_registered_block_id; block_id++) + { + registered_buffer *regbuf = ®istered_buffers[block_id]; + bool needs_backup; + bool needs_data; + XLogRecordBlockHeader bkpb; + XLogRecordBlockImageHeader bimg; + XLogRecordBlockCompressHeader cbimg = {0}; + bool samerel; + bool is_compressed = false; + bool include_image; + + if (!regbuf->in_use) + continue; + + /* Determine if this block needs to be backed up */ + if (regbuf->flags & REGBUF_FORCE_IMAGE) + needs_backup = true; + else if (regbuf->flags & REGBUF_NO_IMAGE) + needs_backup = false; + else if (!doPageWrites) + needs_backup = false; + else + { + /* + * We assume page LSN is first data on *every* page that can be + * passed to XLogInsert, whether it has the standard page layout + * or not. + */ + XLogRecPtr page_lsn = PageGetLSN(regbuf->page); + + needs_backup = (page_lsn <= RedoRecPtr); + if (!needs_backup) + { + if (*fpw_lsn == InvalidXLogRecPtr || page_lsn < *fpw_lsn) + *fpw_lsn = page_lsn; + } + } + + /* Determine if the buffer data needs to included */ + if (regbuf->rdata_len == 0) + needs_data = false; + else if ((regbuf->flags & REGBUF_KEEP_DATA) != 0) + needs_data = true; + else + needs_data = !needs_backup; + + bkpb.id = block_id; + bkpb.fork_flags = regbuf->forkno; + bkpb.data_length = 0; + + if ((regbuf->flags & REGBUF_WILL_INIT) == REGBUF_WILL_INIT) + bkpb.fork_flags |= BKPBLOCK_WILL_INIT; + + /* + * If needs_backup is true or WAL checking is enabled for current + * resource manager, log a full-page write for the current block. + */ + include_image = needs_backup || (info & XLR_CHECK_CONSISTENCY) != 0; + + if (include_image) + { + Page page = regbuf->page; + uint16 compressed_len = 0; + + /* + * The page needs to be backed up, so calculate its hole length + * and offset. 
+ */ + if (regbuf->flags & REGBUF_STANDARD) + { + /* Assume we can omit data between pd_lower and pd_upper */ + uint16 lower = ((PageHeader) page)->pd_lower; + uint16 upper = ((PageHeader) page)->pd_upper; + + if (lower >= SizeOfPageHeaderData && + upper > lower && + upper <= BLCKSZ) + { + bimg.hole_offset = lower; + cbimg.hole_length = upper - lower; + } + else + { + /* No "hole" to remove */ + bimg.hole_offset = 0; + cbimg.hole_length = 0; + } + } + else + { + /* Not a standard page header, don't try to eliminate "hole" */ + bimg.hole_offset = 0; + cbimg.hole_length = 0; + } + + /* + * Try to compress a block image if wal_compression is enabled + */ + if (wal_compression) + { + is_compressed = + XLogCompressBackupBlock(page, bimg.hole_offset, + cbimg.hole_length, + regbuf->compressed_page, + &compressed_len); + } + + /* + * Fill in the remaining fields in the XLogRecordBlockHeader + * struct + */ + bkpb.fork_flags |= BKPBLOCK_HAS_IMAGE; + + /* Report a full page image constructed for the WAL record */ + *num_fpi += 1; + + /* + * Construct XLogRecData entries for the page content. + */ + rdt_datas_last->next = ®buf->bkp_rdatas[0]; + rdt_datas_last = rdt_datas_last->next; + + bimg.bimg_info = (cbimg.hole_length == 0) ? 0 : BKPIMAGE_HAS_HOLE; + + /* + * If WAL consistency checking is enabled for the resource manager + * of this WAL record, a full-page image is included in the record + * for the block modified. During redo, the full-page is replayed + * only if BKPIMAGE_APPLY is set. + */ + if (needs_backup) + bimg.bimg_info |= BKPIMAGE_APPLY; + + if (is_compressed) + { + bimg.length = compressed_len; + bimg.bimg_info |= BKPIMAGE_IS_COMPRESSED; + + rdt_datas_last->data = regbuf->compressed_page; + rdt_datas_last->len = compressed_len; + } + else + { + bimg.length = BLCKSZ - cbimg.hole_length; + + if (cbimg.hole_length == 0) + { + rdt_datas_last->data = page; + rdt_datas_last->len = BLCKSZ; + } + else + { + /* must skip the hole */ + rdt_datas_last->data = page; + rdt_datas_last->len = bimg.hole_offset; + + rdt_datas_last->next = ®buf->bkp_rdatas[1]; + rdt_datas_last = rdt_datas_last->next; + + rdt_datas_last->data = + page + (bimg.hole_offset + cbimg.hole_length); + rdt_datas_last->len = + BLCKSZ - (bimg.hole_offset + cbimg.hole_length); + } + } + + total_len += bimg.length; + } + + if (needs_data) + { + /* + * Link the caller-supplied rdata chain for this buffer to the + * overall list. 
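A worked example of the hole bookkeeping above, for a standard page and the default 8192-byte BLCKSZ (the pd_lower/pd_upper values are made up):

    pd_lower    = 100
    pd_upper    = 7000
    hole_offset = pd_lower             = 100
    hole_length = pd_upper - pd_lower  = 6900
    bimg.length = BLCKSZ - hole_length = 8192 - 6900 = 1292

So the uncompressed image is emitted as two chunks, bytes 0..99 and bytes 7000..8191, which is exactly what the pair of bkp_rdatas entries above carries.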
+ */ + bkpb.fork_flags |= BKPBLOCK_HAS_DATA; + bkpb.data_length = regbuf->rdata_len; + total_len += regbuf->rdata_len; + + rdt_datas_last->next = regbuf->rdata_head; + rdt_datas_last = regbuf->rdata_tail; + } + + if (prev_regbuf && RelFileNodeEquals(regbuf->rnode, prev_regbuf->rnode)) + { + samerel = true; + bkpb.fork_flags |= BKPBLOCK_SAME_REL; + } + else + samerel = false; + prev_regbuf = regbuf; + + /* Ok, copy the header to the scratch buffer */ + memcpy(scratch, &bkpb, SizeOfXLogRecordBlockHeader); + scratch += SizeOfXLogRecordBlockHeader; + if (include_image) + { + memcpy(scratch, &bimg, SizeOfXLogRecordBlockImageHeader); + scratch += SizeOfXLogRecordBlockImageHeader; + if (cbimg.hole_length != 0 && is_compressed) + { + memcpy(scratch, &cbimg, + SizeOfXLogRecordBlockCompressHeader); + scratch += SizeOfXLogRecordBlockCompressHeader; + } + } + if (!samerel) + { + memcpy(scratch, ®buf->rnode, sizeof(RelFileNode)); + scratch += sizeof(RelFileNode); + } + memcpy(scratch, ®buf->block, sizeof(BlockNumber)); + scratch += sizeof(BlockNumber); + } + + /* followed by the record's origin, if any */ + if ((curinsert_flags & XLOG_INCLUDE_ORIGIN) && + replorigin_session_origin != InvalidRepOriginId) + { + *(scratch++) = (char) XLR_BLOCK_ID_ORIGIN; + memcpy(scratch, &replorigin_session_origin, sizeof(replorigin_session_origin)); + scratch += sizeof(replorigin_session_origin); + } + + /* followed by toplevel XID, if not already included in previous record */ + if (IsSubTransactionAssignmentPending()) + { + TransactionId xid = GetTopTransactionIdIfAny(); + + /* update the flag (later used by XLogResetInsertion) */ + XLogSetRecordFlags(XLOG_INCLUDE_XID); + + *(scratch++) = (char) XLR_BLOCK_ID_TOPLEVEL_XID; + memcpy(scratch, &xid, sizeof(TransactionId)); + scratch += sizeof(TransactionId); + } + + /* followed by main data, if any */ + if (mainrdata_len > 0) + { + if (mainrdata_len > 255) + { + *(scratch++) = (char) XLR_BLOCK_ID_DATA_LONG; + memcpy(scratch, &mainrdata_len, sizeof(uint32)); + scratch += sizeof(uint32); + } + else + { + *(scratch++) = (char) XLR_BLOCK_ID_DATA_SHORT; + *(scratch++) = (uint8) mainrdata_len; + } + rdt_datas_last->next = mainrdata_head; + rdt_datas_last = mainrdata_last; + total_len += mainrdata_len; + } + rdt_datas_last->next = NULL; + + hdr_rdt.len = (scratch - hdr_scratch); + total_len += hdr_rdt.len; + + /* + * Calculate CRC of the data + * + * Note that the record header isn't added into the CRC initially since we + * don't know the prev-link yet. Thus, the CRC will represent the CRC of + * the whole record in the order: rdata, then backup blocks, then record + * header. + */ + INIT_CRC32C(rdata_crc); + COMP_CRC32C(rdata_crc, hdr_scratch + SizeOfXLogRecord, hdr_rdt.len - SizeOfXLogRecord); + for (rdt = hdr_rdt.next; rdt != NULL; rdt = rdt->next) + COMP_CRC32C(rdata_crc, rdt->data, rdt->len); + + /* + * Fill in the fields in the record header. Prev-link is filled in later, + * once we know where in the WAL the record will be inserted. The CRC does + * not include the record header yet. + */ + rechdr->xl_xid = GetCurrentTransactionIdIfAny(); + rechdr->xl_tot_len = total_len; + rechdr->xl_info = info; + rechdr->xl_rmid = rmid; + rechdr->xl_prev = InvalidXLogRecPtr; + rechdr->xl_crc = rdata_crc; + + return &hdr_rdt; +} + +/* + * Create a compressed version of a backup block image. + * + * Returns false if compression fails (i.e., compressed result is actually + * bigger than original). Otherwise, returns true and sets 'dlen' to + * the length of compressed block image. 
+ */ +static bool +XLogCompressBackupBlock(char *page, uint16 hole_offset, uint16 hole_length, + char *dest, uint16 *dlen) +{ + int32 orig_len = BLCKSZ - hole_length; + int32 len; + int32 extra_bytes = 0; + char *source; + PGAlignedBlock tmp; + + if (hole_length != 0) + { + /* must skip the hole */ + source = tmp.data; + memcpy(source, page, hole_offset); + memcpy(source + hole_offset, + page + (hole_offset + hole_length), + BLCKSZ - (hole_length + hole_offset)); + + /* + * Extra data needs to be stored in WAL record for the compressed + * version of block image if the hole exists. + */ + extra_bytes = SizeOfXLogRecordBlockCompressHeader; + } + else + source = page; + + /* + * We recheck the actual size even if pglz_compress() reports success and + * see if the number of bytes saved by compression is larger than the + * length of extra data needed for the compressed version of block image. + */ + len = pglz_compress(source, orig_len, dest, PGLZ_strategy_default); + if (len >= 0 && + len + extra_bytes < orig_len) + { + *dlen = (uint16) len; /* successful compression */ + return true; + } + return false; +} + +/* + * Determine whether the buffer referenced has to be backed up. + * + * Since we don't yet have the insert lock, fullPageWrites and forcePageWrites + * could change later, so the result should be used for optimization purposes + * only. + */ +bool +XLogCheckBufferNeedsBackup(Buffer buffer) +{ + XLogRecPtr RedoRecPtr; + bool doPageWrites; + Page page; + + GetFullPageWriteInfo(&RedoRecPtr, &doPageWrites); + + page = BufferGetPage(buffer); + + if (doPageWrites && PageGetLSN(page) <= RedoRecPtr) + return true; /* buffer requires backup */ + + return false; /* buffer does not need to be backed up */ +} + +/* + * Write a backup block if needed when we are setting a hint. Note that + * this may be called for a variety of page types, not just heaps. + * + * Callable while holding just share lock on the buffer content. + * + * We can't use the plain backup block mechanism since that relies on the + * Buffer being exclusively locked. Since some modifications (setting LSN, hint + * bits) are allowed in a sharelocked buffer that can lead to wal checksum + * failures. So instead we copy the page and insert the copied data as normal + * record data. + * + * We only need to do something if page has not yet been full page written in + * this checkpoint round. The LSN of the inserted wal record is returned if we + * had to write, InvalidXLogRecPtr otherwise. + * + * It is possible that multiple concurrent backends could attempt to write WAL + * records. In that case, multiple copies of the same block would be recorded + * in separate WAL records by different backends, though that is still OK from + * a correctness perspective. + */ +XLogRecPtr +XLogSaveBufferForHint(Buffer buffer, bool buffer_std) +{ + XLogRecPtr recptr = InvalidXLogRecPtr; + XLogRecPtr lsn; + XLogRecPtr RedoRecPtr; + + /* + * Ensure no checkpoint can change our view of RedoRecPtr. + */ + Assert(MyProc->delayChkpt); + + /* + * Update RedoRecPtr so that we can make the right decision + */ + RedoRecPtr = GetRedoRecPtr(); + + /* + * We assume page LSN is first data on *every* page that can be passed to + * XLogInsert, whether it has the standard page layout or not. Since we're + * only holding a share-lock on the page, we must take the buffer header + * lock when we look at the LSN. 
+ */ + lsn = BufferGetLSNAtomic(buffer); + + if (lsn <= RedoRecPtr) + { + int flags; + PGAlignedBlock copied_buffer; + char *origdata = (char *) BufferGetBlock(buffer); + RelFileNode rnode; + ForkNumber forkno; + BlockNumber blkno; + + /* + * Copy buffer so we don't have to worry about concurrent hint bit or + * lsn updates. We assume pd_lower/upper cannot be changed without an + * exclusive lock, so the contents bkp are not racy. + */ + if (buffer_std) + { + /* Assume we can omit data between pd_lower and pd_upper */ + Page page = BufferGetPage(buffer); + uint16 lower = ((PageHeader) page)->pd_lower; + uint16 upper = ((PageHeader) page)->pd_upper; + + memcpy(copied_buffer.data, origdata, lower); + memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper); + } + else + memcpy(copied_buffer.data, origdata, BLCKSZ); + + XLogBeginInsert(); + + flags = REGBUF_FORCE_IMAGE; + if (buffer_std) + flags |= REGBUF_STANDARD; + + BufferGetTag(buffer, &rnode, &forkno, &blkno); + XLogRegisterBlock(0, &rnode, forkno, blkno, copied_buffer.data, flags); + + recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI_FOR_HINT); + } + + return recptr; +} + +/* + * Write a WAL record containing a full image of a page. Caller is responsible + * for writing the page to disk after calling this routine. + * + * Note: If you're using this function, you should be building pages in private + * memory and writing them directly to smgr. If you're using buffers, call + * log_newpage_buffer instead. + * + * If the page follows the standard page layout, with a PageHeader and unused + * space between pd_lower and pd_upper, set 'page_std' to true. That allows + * the unused space to be left out from the WAL record, making it smaller. + */ +XLogRecPtr +log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, + Page page, bool page_std) +{ + int flags; + XLogRecPtr recptr; + + flags = REGBUF_FORCE_IMAGE; + if (page_std) + flags |= REGBUF_STANDARD; + + XLogBeginInsert(); + XLogRegisterBlock(0, rnode, forkNum, blkno, page, flags); + recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI); + + /* + * The page may be uninitialized. If so, we can't set the LSN because that + * would corrupt the page. + */ + if (!PageIsNew(page)) + { + PageSetLSN(page, recptr); + } + + return recptr; +} + +/* + * Like log_newpage(), but allows logging multiple pages in one operation. + * It is more efficient than calling log_newpage() for each page separately, + * because we can write multiple pages in a single WAL record. + */ +void +log_newpages(RelFileNode *rnode, ForkNumber forkNum, int num_pages, + BlockNumber *blknos, Page *pages, bool page_std) +{ + int flags; + XLogRecPtr recptr; + int i; + int j; + + flags = REGBUF_FORCE_IMAGE; + if (page_std) + flags |= REGBUF_STANDARD; + + /* + * Iterate over all the pages. They are collected into batches of + * XLR_MAX_BLOCK_ID pages, and a single WAL-record is written for each + * batch. + */ + XLogEnsureRecordSpace(XLR_MAX_BLOCK_ID - 1, 0); + + i = 0; + while (i < num_pages) + { + int batch_start = i; + int nbatch; + + XLogBeginInsert(); + + nbatch = 0; + while (nbatch < XLR_MAX_BLOCK_ID && i < num_pages) + { + XLogRegisterBlock(nbatch, rnode, forkNum, blknos[i], pages[i], flags); + i++; + nbatch++; + } + + recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI); + + for (j = batch_start; j < i; j++) + { + /* + * The page may be uninitialized. If so, we can't set the LSN + * because that would corrupt the page. 
+ */ + if (!PageIsNew(pages[j])) + { + PageSetLSN(pages[j], recptr); + } + } + } +} + +/* + * Write a WAL record containing a full image of a page. + * + * Caller should initialize the buffer and mark it dirty before calling this + * function. This function will set the page LSN. + * + * If the page follows the standard page layout, with a PageHeader and unused + * space between pd_lower and pd_upper, set 'page_std' to true. That allows + * the unused space to be left out from the WAL record, making it smaller. + */ +XLogRecPtr +log_newpage_buffer(Buffer buffer, bool page_std) +{ + Page page = BufferGetPage(buffer); + RelFileNode rnode; + ForkNumber forkNum; + BlockNumber blkno; + + /* Shared buffers should be modified in a critical section. */ + Assert(CritSectionCount > 0); + + BufferGetTag(buffer, &rnode, &forkNum, &blkno); + + return log_newpage(&rnode, forkNum, blkno, page, page_std); +} + +/* + * WAL-log a range of blocks in a relation. + * + * An image of all pages with block numbers 'startblk' <= X < 'endblk' is + * written to the WAL. If the range is large, this is done in multiple WAL + * records. + * + * If all page follows the standard page layout, with a PageHeader and unused + * space between pd_lower and pd_upper, set 'page_std' to true. That allows + * the unused space to be left out from the WAL records, making them smaller. + * + * NOTE: This function acquires exclusive-locks on the pages. Typically, this + * is used on a newly-built relation, and the caller is holding a + * AccessExclusiveLock on it, so no other backend can be accessing it at the + * same time. If that's not the case, you must ensure that this does not + * cause a deadlock through some other means. + */ +void +log_newpage_range(Relation rel, ForkNumber forkNum, + BlockNumber startblk, BlockNumber endblk, + bool page_std) +{ + int flags; + BlockNumber blkno; + + flags = REGBUF_FORCE_IMAGE; + if (page_std) + flags |= REGBUF_STANDARD; + + /* + * Iterate over all the pages in the range. They are collected into + * batches of XLR_MAX_BLOCK_ID pages, and a single WAL-record is written + * for each batch. + */ + XLogEnsureRecordSpace(XLR_MAX_BLOCK_ID - 1, 0); + + blkno = startblk; + while (blkno < endblk) + { + Buffer bufpack[XLR_MAX_BLOCK_ID]; + XLogRecPtr recptr; + int nbufs; + int i; + + CHECK_FOR_INTERRUPTS(); + + /* Collect a batch of blocks. */ + nbufs = 0; + while (nbufs < XLR_MAX_BLOCK_ID && blkno < endblk) + { + Buffer buf = ReadBufferExtended(rel, forkNum, blkno, + RBM_NORMAL, NULL); + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* + * Completely empty pages are not WAL-logged. Writing a WAL record + * would change the LSN, and we don't want that. We want the page + * to stay empty. + */ + if (!PageIsNew(BufferGetPage(buf))) + bufpack[nbufs++] = buf; + else + UnlockReleaseBuffer(buf); + blkno++; + } + + /* Write WAL record for this batch. */ + XLogBeginInsert(); + + START_CRIT_SECTION(); + for (i = 0; i < nbufs; i++) + { + XLogRegisterBuffer(i, bufpack[i], flags); + MarkBufferDirty(bufpack[i]); + } + + recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI); + + for (i = 0; i < nbufs; i++) + { + PageSetLSN(BufferGetPage(bufpack[i]), recptr); + UnlockReleaseBuffer(bufpack[i]); + } + END_CRIT_SECTION(); + } +} + +/* + * Allocate working buffers needed for WAL record construction. 
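A small sketch of the single-buffer variant described above, following the rules its comments spell out (initialize, mark dirty, call inside a critical section); obtaining and exclusively locking the buffer, and the real page contents, are left to the caller:

    Buffer      buf = my_target_buffer;     /* placeholder: pinned and exclusively locked */
    Page        page;

    START_CRIT_SECTION();

    page = BufferGetPage(buf);
    PageInit(page, BufferGetPageSize(buf), 0);
    /* ... fill in the new page ... */

    MarkBufferDirty(buf);
    (void) log_newpage_buffer(buf, true);   /* also sets the page LSN */

    END_CRIT_SECTION();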
+ */ +void +InitXLogInsert(void) +{ + /* Initialize the working areas */ + if (xloginsert_cxt == NULL) + { + xloginsert_cxt = AllocSetContextCreate(TopMemoryContext, + "WAL record construction", + ALLOCSET_DEFAULT_SIZES); + } + + if (registered_buffers == NULL) + { + registered_buffers = (registered_buffer *) + MemoryContextAllocZero(xloginsert_cxt, + sizeof(registered_buffer) * (XLR_NORMAL_MAX_BLOCK_ID + 1)); + max_registered_buffers = XLR_NORMAL_MAX_BLOCK_ID + 1; + } + if (rdatas == NULL) + { + rdatas = MemoryContextAlloc(xloginsert_cxt, + sizeof(XLogRecData) * XLR_NORMAL_RDATAS); + max_rdatas = XLR_NORMAL_RDATAS; + } + + /* + * Allocate a buffer to hold the header information for a WAL record. + */ + if (hdr_scratch == NULL) + hdr_scratch = MemoryContextAllocZero(xloginsert_cxt, + HEADER_SCRATCH_SIZE); +} diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c new file mode 100644 index 0000000..d797d9d --- /dev/null +++ b/src/backend/access/transam/xlogreader.c @@ -0,0 +1,1660 @@ +/*------------------------------------------------------------------------- + * + * xlogreader.c + * Generic XLog reading facility + * + * Portions Copyright (c) 2013-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/access/transam/xlogreader.c + * + * NOTES + * See xlogreader.h for more notes on this facility. + * + * This file is compiled as both front-end and backend code, so it + * may not use ereport, server-defined static variables, etc. + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include + +#include "access/transam.h" +#include "access/xlog_internal.h" +#include "access/xlogreader.h" +#include "access/xlogrecord.h" +#include "catalog/pg_control.h" +#include "common/pg_lzcompress.h" +#include "replication/origin.h" + +#ifndef FRONTEND +#include "miscadmin.h" +#include "pgstat.h" +#include "utils/memutils.h" +#endif + +static void report_invalid_record(XLogReaderState *state, const char *fmt,...) + pg_attribute_printf(2, 3); +static bool allocate_recordbuf(XLogReaderState *state, uint32 reclength); +static int ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, + int reqLen); +static void XLogReaderInvalReadState(XLogReaderState *state); +static bool ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, + XLogRecPtr PrevRecPtr, XLogRecord *record, bool randAccess); +static bool ValidXLogRecord(XLogReaderState *state, XLogRecord *record, + XLogRecPtr recptr); +static void ResetDecoder(XLogReaderState *state); +static void WALOpenSegmentInit(WALOpenSegment *seg, WALSegmentContext *segcxt, + int segsize, const char *waldir); + +/* size of the buffer allocated for error message. */ +#define MAX_ERRORMSG_LEN 1000 + +/* + * Construct a string in state->errormsg_buf explaining what's wrong with + * the current record being read. + */ +static void +report_invalid_record(XLogReaderState *state, const char *fmt,...) +{ + va_list args; + + fmt = _(fmt); + + va_start(args, fmt); + vsnprintf(state->errormsg_buf, MAX_ERRORMSG_LEN, fmt, args); + va_end(args); +} + +/* + * Allocate and initialize a new XLogReader. + * + * Returns NULL if the xlogreader couldn't be allocated. 
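Putting the reader API together, a rough sketch of the allocate/read loop as a backend caller might write it; the three callbacks, start_lsn and the wal_segment_size value are placeholders the caller must supply, and XL_ROUTINE is assumed to be the usual XLogReaderRoutine initializer macro from xlogreader.h:

    XLogReaderState *reader;
    XLogRecord *record;
    char       *errormsg = NULL;

    reader = XLogReaderAllocate(wal_segment_size, NULL,
                                XL_ROUTINE(.page_read = my_page_read,
                                           .segment_open = my_segment_open,
                                           .segment_close = my_segment_close),
                                NULL /* private_data */ );
    if (reader == NULL)
        elog(ERROR, "out of memory while allocating a WAL reader");

    XLogBeginRead(reader, start_lsn);
    while ((record = XLogReadRecord(reader, &errormsg)) != NULL)
    {
        /* inspect the record: XLogRecGetInfo(reader), XLogRecGetData(reader), ... */
    }
    if (errormsg)
        elog(ERROR, "could not read WAL: %s", errormsg);

    XLogReaderFree(reader);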
+ */ +XLogReaderState * +XLogReaderAllocate(int wal_segment_size, const char *waldir, + XLogReaderRoutine *routine, void *private_data) +{ + XLogReaderState *state; + + state = (XLogReaderState *) + palloc_extended(sizeof(XLogReaderState), + MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO); + if (!state) + return NULL; + + /* initialize caller-provided support functions */ + state->routine = *routine; + + state->max_block_id = -1; + + /* + * Permanently allocate readBuf. We do it this way, rather than just + * making a static array, for two reasons: (1) no need to waste the + * storage in most instantiations of the backend; (2) a static char array + * isn't guaranteed to have any particular alignment, whereas + * palloc_extended() will provide MAXALIGN'd storage. + */ + state->readBuf = (char *) palloc_extended(XLOG_BLCKSZ, + MCXT_ALLOC_NO_OOM); + if (!state->readBuf) + { + pfree(state); + return NULL; + } + + /* Initialize segment info. */ + WALOpenSegmentInit(&state->seg, &state->segcxt, wal_segment_size, + waldir); + + /* system_identifier initialized to zeroes above */ + state->private_data = private_data; + /* ReadRecPtr, EndRecPtr and readLen initialized to zeroes above */ + state->errormsg_buf = palloc_extended(MAX_ERRORMSG_LEN + 1, + MCXT_ALLOC_NO_OOM); + if (!state->errormsg_buf) + { + pfree(state->readBuf); + pfree(state); + return NULL; + } + state->errormsg_buf[0] = '\0'; + + /* + * Allocate an initial readRecordBuf of minimal size, which can later be + * enlarged if necessary. + */ + if (!allocate_recordbuf(state, 0)) + { + pfree(state->errormsg_buf); + pfree(state->readBuf); + pfree(state); + return NULL; + } + + return state; +} + +void +XLogReaderFree(XLogReaderState *state) +{ + int block_id; + + if (state->seg.ws_file != -1) + state->routine.segment_close(state); + + for (block_id = 0; block_id <= XLR_MAX_BLOCK_ID; block_id++) + { + if (state->blocks[block_id].data) + pfree(state->blocks[block_id].data); + } + if (state->main_data) + pfree(state->main_data); + + pfree(state->errormsg_buf); + if (state->readRecordBuf) + pfree(state->readRecordBuf); + pfree(state->readBuf); + pfree(state); +} + +/* + * Allocate readRecordBuf to fit a record of at least the given length. + * Returns true if successful, false if out of memory. + * + * readRecordBufSize is set to the new buffer size. + * + * To avoid useless small increases, round its size to a multiple of + * XLOG_BLCKSZ, and make sure it's at least 5*Max(BLCKSZ, XLOG_BLCKSZ) to start + * with. (That is enough for all "normal" records, but very large commit or + * abort records might need more space.) + */ +static bool +allocate_recordbuf(XLogReaderState *state, uint32 reclength) +{ + uint32 newSize = reclength; + + newSize += XLOG_BLCKSZ - (newSize % XLOG_BLCKSZ); + newSize = Max(newSize, 5 * Max(BLCKSZ, XLOG_BLCKSZ)); + +#ifndef FRONTEND + + /* + * Note that in much unlucky circumstances, the random data read from a + * recycled segment can cause this routine to be called with a size + * causing a hard failure at allocation. For a standby, this would cause + * the instance to stop suddenly with a hard failure, preventing it to + * retry fetching WAL from one of its sources which could allow it to move + * on with replay without a manual restart. If the data comes from a past + * recycled segment and is still valid, then the allocation may succeed + * but record checks are going to fail so this would be short-lived. 
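The sizing rule used for readRecordBuf (round the request up to a multiple of the WAL block size, then enforce a five-block floor) is easy to verify in isolation. Block sizes below are illustrative compile-time constants, not values taken from a real build.

/* Sketch of the record-buffer sizing rule used by allocate_recordbuf. */
#include <stdint.h>
#include <stdio.h>

#define BLCKSZ       8192
#define XLOG_BLCKSZ  8192
#define MAXOF(a, b)  ((a) > (b) ? (a) : (b))

static uint32_t
record_buf_size(uint32_t reclength)
{
    uint32_t newSize = reclength;

    newSize += XLOG_BLCKSZ - (newSize % XLOG_BLCKSZ);          /* round up */
    newSize = MAXOF(newSize, 5 * MAXOF(BLCKSZ, XLOG_BLCKSZ));  /* floor */
    return newSize;
}

int main(void)
{
    printf("%u\n", record_buf_size(0));       /* 40960: the 5-block floor */
    printf("%u\n", record_buf_size(50000));   /* 57344: rounded to 7 blocks */
    return 0;
}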
If + * the allocation fails because of a memory shortage, then this is not a + * hard failure either per the guarantee given by MCXT_ALLOC_NO_OOM. + */ + if (!AllocSizeIsValid(newSize)) + return false; + +#endif + + if (state->readRecordBuf) + pfree(state->readRecordBuf); + state->readRecordBuf = + (char *) palloc_extended(newSize, MCXT_ALLOC_NO_OOM); + if (state->readRecordBuf == NULL) + { + state->readRecordBufSize = 0; + return false; + } + state->readRecordBufSize = newSize; + return true; +} + +/* + * Initialize the passed segment structs. + */ +static void +WALOpenSegmentInit(WALOpenSegment *seg, WALSegmentContext *segcxt, + int segsize, const char *waldir) +{ + seg->ws_file = -1; + seg->ws_segno = 0; + seg->ws_tli = 0; + + segcxt->ws_segsize = segsize; + if (waldir) + snprintf(segcxt->ws_dir, MAXPGPATH, "%s", waldir); +} + +/* + * Begin reading WAL at 'RecPtr'. + * + * 'RecPtr' should point to the beginnning of a valid WAL record. Pointing at + * the beginning of a page is also OK, if there is a new record right after + * the page header, i.e. not a continuation. + * + * This does not make any attempt to read the WAL yet, and hence cannot fail. + * If the starting address is not correct, the first call to XLogReadRecord() + * will error out. + */ +void +XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr) +{ + Assert(!XLogRecPtrIsInvalid(RecPtr)); + + ResetDecoder(state); + + /* Begin at the passed-in record pointer. */ + state->EndRecPtr = RecPtr; + state->ReadRecPtr = InvalidXLogRecPtr; +} + +/* + * Attempt to read an XLOG record. + * + * XLogBeginRead() or XLogFindNextRecord() must be called before the first call + * to XLogReadRecord(). + * + * If the page_read callback fails to read the requested data, NULL is + * returned. The callback is expected to have reported the error; errormsg + * is set to NULL. + * + * If the reading fails for some other reason, NULL is also returned, and + * *errormsg is set to a string with details of the failure. + * + * The returned pointer (or *errormsg) points to an internal buffer that's + * valid until the next call to XLogReadRecord. + */ +XLogRecord * +XLogReadRecord(XLogReaderState *state, char **errormsg) +{ + XLogRecPtr RecPtr; + XLogRecord *record; + XLogRecPtr targetPagePtr; + bool randAccess; + uint32 len, + total_len; + uint32 targetRecOff; + uint32 pageHeaderSize; + bool assembled; + bool gotheader; + int readOff; + + /* + * randAccess indicates whether to verify the previous-record pointer of + * the record we're reading. We only do this if we're reading + * sequentially, which is what we initially assume. + */ + randAccess = false; + + /* reset error state */ + *errormsg = NULL; + state->errormsg_buf[0] = '\0'; + + ResetDecoder(state); + state->abortedRecPtr = InvalidXLogRecPtr; + state->missingContrecPtr = InvalidXLogRecPtr; + + RecPtr = state->EndRecPtr; + + if (state->ReadRecPtr != InvalidXLogRecPtr) + { + /* read the record after the one we just read */ + + /* + * EndRecPtr is pointing to end+1 of the previous WAL record. If + * we're at a page boundary, no more records can fit on the current + * page. We must skip over the page header, but we can't do that until + * we've read in the page, since the header size is variable. + */ + } + else + { + /* + * Caller supplied a position to start at. + * + * In this case, EndRecPtr should already be pointing to a valid + * record starting position. 
+ */ + Assert(XRecOffIsValid(RecPtr)); + randAccess = true; + } + +restart: + state->currRecPtr = RecPtr; + assembled = false; + + targetPagePtr = RecPtr - (RecPtr % XLOG_BLCKSZ); + targetRecOff = RecPtr % XLOG_BLCKSZ; + + /* + * Read the page containing the record into state->readBuf. Request enough + * byte to cover the whole record header, or at least the part of it that + * fits on the same page. + */ + readOff = ReadPageInternal(state, targetPagePtr, + Min(targetRecOff + SizeOfXLogRecord, XLOG_BLCKSZ)); + if (readOff < 0) + goto err; + + /* + * ReadPageInternal always returns at least the page header, so we can + * examine it now. + */ + pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) state->readBuf); + if (targetRecOff == 0) + { + /* + * At page start, so skip over page header. + */ + RecPtr += pageHeaderSize; + targetRecOff = pageHeaderSize; + } + else if (targetRecOff < pageHeaderSize) + { + report_invalid_record(state, "invalid record offset at %X/%X", + LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + if ((((XLogPageHeader) state->readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) && + targetRecOff == pageHeaderSize) + { + report_invalid_record(state, "contrecord is requested by %X/%X", + LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + /* ReadPageInternal has verified the page header */ + Assert(pageHeaderSize <= readOff); + + /* + * Read the record length. + * + * NB: Even though we use an XLogRecord pointer here, the whole record + * header might not fit on this page. xl_tot_len is the first field of the + * struct, so it must be on this page (the records are MAXALIGNed), but we + * cannot access any other fields until we've verified that we got the + * whole header. + */ + record = (XLogRecord *) (state->readBuf + RecPtr % XLOG_BLCKSZ); + total_len = record->xl_tot_len; + + /* + * If the whole record header is on this page, validate it immediately. + * Otherwise do just a basic sanity check on xl_tot_len, and validate the + * rest of the header after reading it from the next page. The xl_tot_len + * check is necessary here to ensure that we enter the "Need to reassemble + * record" code path below; otherwise we might fail to apply + * ValidXLogRecordHeader at all. + */ + if (targetRecOff <= XLOG_BLCKSZ - SizeOfXLogRecord) + { + if (!ValidXLogRecordHeader(state, RecPtr, state->ReadRecPtr, record, + randAccess)) + goto err; + gotheader = true; + } + else + { + /* XXX: more validation should be done here */ + if (total_len < SizeOfXLogRecord) + { + report_invalid_record(state, + "invalid record length at %X/%X: wanted %u, got %u", + LSN_FORMAT_ARGS(RecPtr), + (uint32) SizeOfXLogRecord, total_len); + goto err; + } + gotheader = false; + } + + len = XLOG_BLCKSZ - RecPtr % XLOG_BLCKSZ; + if (total_len > len) + { + /* Need to reassemble record */ + char *contdata; + XLogPageHeader pageHeader; + char *buffer; + uint32 gotlen; + + assembled = true; + + /* + * Enlarge readRecordBuf as needed. + */ + if (total_len > state->readRecordBufSize && + !allocate_recordbuf(state, total_len)) + { + /* We treat this as a "bogus data" condition */ + report_invalid_record(state, "record length %u at %X/%X too long", + total_len, LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + /* Copy the first fragment of the record from the first page. 
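The page arithmetic at the top of the read loop splits an LSN into the start of its WAL page and the offset within that page, and then decides whether the fixed-size record header fits before the page boundary. A minimal standalone sketch, with illustrative sizes (SIZE_OF_RECORD_HDR stands in for SizeOfXLogRecord):

/* Sketch: LSN -> (page start, in-page offset), plus the header-fits test. */
#include <stdint.h>
#include <stdio.h>

#define XLOG_BLCKSZ        8192
#define SIZE_OF_RECORD_HDR 24     /* illustrative stand-in for SizeOfXLogRecord */

int main(void)
{
    uint64_t RecPtr = 0x2F3D1FF0;  /* some LSN near the end of a page */
    uint64_t targetPagePtr = RecPtr - (RecPtr % XLOG_BLCKSZ);
    uint32_t targetRecOff = (uint32_t) (RecPtr % XLOG_BLCKSZ);

    printf("page start %llX, offset in page %u\n",
           (unsigned long long) targetPagePtr, targetRecOff);

    if (targetRecOff <= XLOG_BLCKSZ - SIZE_OF_RECORD_HDR)
        printf("whole record header is on this page\n");
    else
        printf("header is split across the page boundary\n");
    return 0;
}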
*/ + memcpy(state->readRecordBuf, + state->readBuf + RecPtr % XLOG_BLCKSZ, len); + buffer = state->readRecordBuf + len; + gotlen = len; + + do + { + /* Calculate pointer to beginning of next page */ + targetPagePtr += XLOG_BLCKSZ; + + /* Wait for the next page to become available */ + readOff = ReadPageInternal(state, targetPagePtr, + Min(total_len - gotlen + SizeOfXLogShortPHD, + XLOG_BLCKSZ)); + + if (readOff < 0) + goto err; + + Assert(SizeOfXLogShortPHD <= readOff); + + pageHeader = (XLogPageHeader) state->readBuf; + + /* + * If we were expecting a continuation record and got an + * "overwrite contrecord" flag, that means the continuation record + * was overwritten with a different record. Restart the read by + * assuming the address to read is the location where we found + * this flag; but keep track of the LSN of the record we were + * reading, for later verification. + */ + if (pageHeader->xlp_info & XLP_FIRST_IS_OVERWRITE_CONTRECORD) + { + state->overwrittenRecPtr = RecPtr; + ResetDecoder(state); + RecPtr = targetPagePtr; + goto restart; + } + + /* Check that the continuation on next page looks valid */ + if (!(pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD)) + { + report_invalid_record(state, + "there is no contrecord flag at %X/%X", + LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + /* + * Cross-check that xlp_rem_len agrees with how much of the record + * we expect there to be left. + */ + if (pageHeader->xlp_rem_len == 0 || + total_len != (pageHeader->xlp_rem_len + gotlen)) + { + report_invalid_record(state, + "invalid contrecord length %u (expected %lld) at %X/%X", + pageHeader->xlp_rem_len, + ((long long) total_len) - gotlen, + LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + /* Append the continuation from this page to the buffer */ + pageHeaderSize = XLogPageHeaderSize(pageHeader); + + if (readOff < pageHeaderSize) + readOff = ReadPageInternal(state, targetPagePtr, + pageHeaderSize); + + Assert(pageHeaderSize <= readOff); + + contdata = (char *) state->readBuf + pageHeaderSize; + len = XLOG_BLCKSZ - pageHeaderSize; + if (pageHeader->xlp_rem_len < len) + len = pageHeader->xlp_rem_len; + + if (readOff < pageHeaderSize + len) + readOff = ReadPageInternal(state, targetPagePtr, + pageHeaderSize + len); + + memcpy(buffer, (char *) contdata, len); + buffer += len; + gotlen += len; + + /* If we just reassembled the record header, validate it. 
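The reassembly loop can be mimicked with a toy payload split across fixed-size pages, where each continuation "page" carries the number of payload bytes still outstanding (the role xlp_rem_len plays above). Everything here is a simplified model: the page size, the 4-byte header, and the absence of a header on the first page are all illustrative choices, not the WAL format.

/* Sketch: split a payload across toy pages, then stitch it back together,
 * cross-checking the remaining-length field on each continuation page. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SZ 16
#define HDR_SZ   4              /* toy page header: uint32 rem_len */

int main(void)
{
    const char *payload = "the quick brown fox jumps over it";
    uint32_t total_len = (uint32_t) strlen(payload);
    char pages[8][PAGE_SZ];
    char out[64];
    uint32_t written = 0, npages = 0;

    /* Split: continuations carry how much payload is still missing. */
    while (written < total_len)
    {
        char *dst = pages[npages];
        uint32_t room = PAGE_SZ, off = 0;

        if (npages > 0)
        {
            uint32_t rem = total_len - written;
            memcpy(dst, &rem, HDR_SZ);
            off = HDR_SZ;
            room -= HDR_SZ;
        }
        uint32_t chunk = total_len - written < room ? total_len - written : room;
        memcpy(dst + off, payload + written, chunk);
        written += chunk;
        npages++;
    }

    /* Reassemble, cross-checking rem_len like the contrecord check above. */
    uint32_t gotlen = 0, p = 0;
    while (gotlen < total_len)
    {
        const char *src = pages[p];
        uint32_t off = 0, room = PAGE_SZ;

        if (p > 0)
        {
            uint32_t rem;
            memcpy(&rem, src, HDR_SZ);
            if (rem != total_len - gotlen)
            {
                fprintf(stderr, "invalid contrecord length\n");
                return 1;
            }
            off = HDR_SZ;
            room -= HDR_SZ;
        }
        uint32_t chunk = total_len - gotlen < room ? total_len - gotlen : room;
        memcpy(out + gotlen, src + off, chunk);
        gotlen += chunk;
        p++;
    }
    out[gotlen] = '\0';
    puts(out);
    return 0;
}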
*/ + if (!gotheader) + { + record = (XLogRecord *) state->readRecordBuf; + if (!ValidXLogRecordHeader(state, RecPtr, state->ReadRecPtr, + record, randAccess)) + goto err; + gotheader = true; + } + } while (gotlen < total_len); + + Assert(gotheader); + + record = (XLogRecord *) state->readRecordBuf; + if (!ValidXLogRecord(state, record, RecPtr)) + goto err; + + pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) state->readBuf); + state->ReadRecPtr = RecPtr; + state->EndRecPtr = targetPagePtr + pageHeaderSize + + MAXALIGN(pageHeader->xlp_rem_len); + } + else + { + /* Wait for the record data to become available */ + readOff = ReadPageInternal(state, targetPagePtr, + Min(targetRecOff + total_len, XLOG_BLCKSZ)); + if (readOff < 0) + goto err; + + /* Record does not cross a page boundary */ + if (!ValidXLogRecord(state, record, RecPtr)) + goto err; + + state->EndRecPtr = RecPtr + MAXALIGN(total_len); + + state->ReadRecPtr = RecPtr; + } + + /* + * Special processing if it's an XLOG SWITCH record + */ + if (record->xl_rmid == RM_XLOG_ID && + (record->xl_info & ~XLR_INFO_MASK) == XLOG_SWITCH) + { + /* Pretend it extends to end of segment */ + state->EndRecPtr += state->segcxt.ws_segsize - 1; + state->EndRecPtr -= XLogSegmentOffset(state->EndRecPtr, state->segcxt.ws_segsize); + } + + if (DecodeXLogRecord(state, record, errormsg)) + return record; + else + return NULL; + +err: + if (assembled) + { + /* + * We get here when a record that spans multiple pages needs to be + * assembled, but something went wrong -- perhaps a contrecord piece + * was lost. If caller is WAL replay, it will know where the aborted + * record was and where to direct followup WAL to be written, marking + * the next piece with XLP_FIRST_IS_OVERWRITE_CONTRECORD, which will + * in turn signal downstream WAL consumers that the broken WAL record + * is to be ignored. + */ + state->abortedRecPtr = RecPtr; + state->missingContrecPtr = targetPagePtr; + } + + /* + * Invalidate the read state. We might read from a different source after + * failure. + */ + XLogReaderInvalReadState(state); + + if (state->errormsg_buf[0] != '\0') + *errormsg = state->errormsg_buf; + + return NULL; +} + +/* + * Read a single xlog page including at least [pageptr, reqLen] of valid data + * via the page_read() callback. + * + * Returns -1 if the required page cannot be read for some reason; errormsg_buf + * is set in that case (unless the error occurs in the page_read callback). + * + * We fetch the page from a reader-local cache if we know we have the required + * data and if there hasn't been any error since caching the data. + */ +static int +ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen) +{ + int readLen; + uint32 targetPageOff; + XLogSegNo targetSegNo; + XLogPageHeader hdr; + + Assert((pageptr % XLOG_BLCKSZ) == 0); + + XLByteToSeg(pageptr, targetSegNo, state->segcxt.ws_segsize); + targetPageOff = XLogSegmentOffset(pageptr, state->segcxt.ws_segsize); + + /* check whether we have all the requested data already */ + if (targetSegNo == state->seg.ws_segno && + targetPageOff == state->segoff && reqLen <= state->readLen) + return state->readLen; + + /* + * Data is not in our buffer. + * + * Every time we actually read the segment, even if we looked at parts of + * it before, we need to do verification as the page_read callback might + * now be rereading data from a different source. 
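The short-circuit at the top of ReadPageInternal boils down to remembering which segment/page was read last and how many bytes of it are valid. A minimal sketch of that cache-hit test, with a simplified state struct and an illustrative segment size:

/* Sketch: reader-local page cache test, in the style of ReadPageInternal. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SEG_SIZE (16u * 1024 * 1024)    /* illustrative WAL segment size */

struct read_state
{
    uint64_t seg_no;     /* segment of the cached page */
    uint32_t seg_off;    /* offset of the cached page within the segment */
    int      read_len;   /* valid bytes cached from that page */
};

static bool
cache_covers(const struct read_state *st, uint64_t pageptr, int req_len)
{
    uint64_t seg_no = pageptr / SEG_SIZE;
    uint32_t seg_off = (uint32_t) (pageptr % SEG_SIZE);

    return seg_no == st->seg_no && seg_off == st->seg_off &&
           req_len <= st->read_len;
}

int main(void)
{
    struct read_state st = { .seg_no = 3, .seg_off = 40960, .read_len = 512 };
    uint64_t pageptr = 3ULL * SEG_SIZE + 40960;

    printf("%s\n", cache_covers(&st, pageptr, 200) ? "hit" : "miss");
    printf("%s\n", cache_covers(&st, pageptr, 4096) ? "hit" : "miss");
    return 0;
}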
+ * + * Whenever switching to a new WAL segment, we read the first page of the + * file and validate its header, even if that's not where the target + * record is. This is so that we can check the additional identification + * info that is present in the first page's "long" header. + */ + if (targetSegNo != state->seg.ws_segno && targetPageOff != 0) + { + XLogRecPtr targetSegmentPtr = pageptr - targetPageOff; + + readLen = state->routine.page_read(state, targetSegmentPtr, XLOG_BLCKSZ, + state->currRecPtr, + state->readBuf); + if (readLen < 0) + goto err; + + /* we can be sure to have enough WAL available, we scrolled back */ + Assert(readLen == XLOG_BLCKSZ); + + if (!XLogReaderValidatePageHeader(state, targetSegmentPtr, + state->readBuf)) + goto err; + } + + /* + * First, read the requested data length, but at least a short page header + * so that we can validate it. + */ + readLen = state->routine.page_read(state, pageptr, Max(reqLen, SizeOfXLogShortPHD), + state->currRecPtr, + state->readBuf); + if (readLen < 0) + goto err; + + Assert(readLen <= XLOG_BLCKSZ); + + /* Do we have enough data to check the header length? */ + if (readLen <= SizeOfXLogShortPHD) + goto err; + + Assert(readLen >= reqLen); + + hdr = (XLogPageHeader) state->readBuf; + + /* still not enough */ + if (readLen < XLogPageHeaderSize(hdr)) + { + readLen = state->routine.page_read(state, pageptr, XLogPageHeaderSize(hdr), + state->currRecPtr, + state->readBuf); + if (readLen < 0) + goto err; + } + + /* + * Now that we know we have the full header, validate it. + */ + if (!XLogReaderValidatePageHeader(state, pageptr, (char *) hdr)) + goto err; + + /* update read state information */ + state->seg.ws_segno = targetSegNo; + state->segoff = targetPageOff; + state->readLen = readLen; + + return readLen; + +err: + XLogReaderInvalReadState(state); + return -1; +} + +/* + * Invalidate the xlogreader's read state to force a re-read. + */ +static void +XLogReaderInvalReadState(XLogReaderState *state) +{ + state->seg.ws_segno = 0; + state->segoff = 0; + state->readLen = 0; +} + +/* + * Validate an XLOG record header. + * + * This is just a convenience subroutine to avoid duplicated code in + * XLogReadRecord. It's not intended for use from anywhere else. + */ +static bool +ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, + XLogRecPtr PrevRecPtr, XLogRecord *record, + bool randAccess) +{ + if (record->xl_tot_len < SizeOfXLogRecord) + { + report_invalid_record(state, + "invalid record length at %X/%X: wanted %u, got %u", + LSN_FORMAT_ARGS(RecPtr), + (uint32) SizeOfXLogRecord, record->xl_tot_len); + return false; + } + if (record->xl_rmid > RM_MAX_ID) + { + report_invalid_record(state, + "invalid resource manager ID %u at %X/%X", + record->xl_rmid, LSN_FORMAT_ARGS(RecPtr)); + return false; + } + if (randAccess) + { + /* + * We can't exactly verify the prev-link, but surely it should be less + * than the record's own address. + */ + if (!(record->xl_prev < RecPtr)) + { + report_invalid_record(state, + "record with incorrect prev-link %X/%X at %X/%X", + LSN_FORMAT_ARGS(record->xl_prev), + LSN_FORMAT_ARGS(RecPtr)); + return false; + } + } + else + { + /* + * Record's prev-link should exactly match our previous location. This + * check guards against torn WAL pages where a stale but valid-looking + * WAL record starts on a sector boundary. 
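The two prev-link checks in ValidXLogRecordHeader differ only in strictness: on a sequential read the previous-record pointer must match the record we just read exactly, while on a random access we can only demand that it points somewhere earlier. A tiny sketch, with LSNs as plain integers:

/* Sketch: strict vs. weak prev-link validation. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
check_prev_link(uint64_t xl_prev, uint64_t rec_ptr,
                uint64_t prev_rec_ptr, bool rand_access)
{
    if (rand_access)
        return xl_prev < rec_ptr;       /* weak check only */
    return xl_prev == prev_rec_ptr;     /* must match exactly */
}

int main(void)
{
    /* Sequential read: a stale record's prev-link gives it away. */
    printf("%d\n", check_prev_link(0x2000, 0x2800, 0x2400, false)); /* 0: reject */
    /* Random access: we only know the prev-link must be earlier. */
    printf("%d\n", check_prev_link(0x2000, 0x2800, 0, true));       /* 1: accept */
    return 0;
}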
+ */ + if (record->xl_prev != PrevRecPtr) + { + report_invalid_record(state, + "record with incorrect prev-link %X/%X at %X/%X", + LSN_FORMAT_ARGS(record->xl_prev), + LSN_FORMAT_ARGS(RecPtr)); + return false; + } + } + + return true; +} + + +/* + * CRC-check an XLOG record. We do not believe the contents of an XLOG + * record (other than to the minimal extent of computing the amount of + * data to read in) until we've checked the CRCs. + * + * We assume all of the record (that is, xl_tot_len bytes) has been read + * into memory at *record. Also, ValidXLogRecordHeader() has accepted the + * record's header, which means in particular that xl_tot_len is at least + * SizeOfXLogRecord. + */ +static bool +ValidXLogRecord(XLogReaderState *state, XLogRecord *record, XLogRecPtr recptr) +{ + pg_crc32c crc; + + /* Calculate the CRC */ + INIT_CRC32C(crc); + COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord); + /* include the record header last */ + COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc)); + FIN_CRC32C(crc); + + if (!EQ_CRC32C(record->xl_crc, crc)) + { + report_invalid_record(state, + "incorrect resource manager data checksum in record at %X/%X", + LSN_FORMAT_ARGS(recptr)); + return false; + } + + return true; +} + +/* + * Validate a page header. + * + * Check if 'phdr' is valid as the header of the XLog page at position + * 'recptr'. + */ +bool +XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, + char *phdr) +{ + XLogRecPtr recaddr; + XLogSegNo segno; + int32 offset; + XLogPageHeader hdr = (XLogPageHeader) phdr; + + Assert((recptr % XLOG_BLCKSZ) == 0); + + XLByteToSeg(recptr, segno, state->segcxt.ws_segsize); + offset = XLogSegmentOffset(recptr, state->segcxt.ws_segsize); + + XLogSegNoOffsetToRecPtr(segno, offset, state->segcxt.ws_segsize, recaddr); + + if (hdr->xlp_magic != XLOG_PAGE_MAGIC) + { + char fname[MAXFNAMELEN]; + + XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); + + report_invalid_record(state, + "invalid magic number %04X in log segment %s, offset %u", + hdr->xlp_magic, + fname, + offset); + return false; + } + + if ((hdr->xlp_info & ~XLP_ALL_FLAGS) != 0) + { + char fname[MAXFNAMELEN]; + + XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); + + report_invalid_record(state, + "invalid info bits %04X in log segment %s, offset %u", + hdr->xlp_info, + fname, + offset); + return false; + } + + if (hdr->xlp_info & XLP_LONG_HEADER) + { + XLogLongPageHeader longhdr = (XLogLongPageHeader) hdr; + + if (state->system_identifier && + longhdr->xlp_sysid != state->system_identifier) + { + report_invalid_record(state, + "WAL file is from different database system: WAL file database system identifier is %llu, pg_control database system identifier is %llu", + (unsigned long long) longhdr->xlp_sysid, + (unsigned long long) state->system_identifier); + return false; + } + else if (longhdr->xlp_seg_size != state->segcxt.ws_segsize) + { + report_invalid_record(state, + "WAL file is from different database system: incorrect segment size in page header"); + return false; + } + else if (longhdr->xlp_xlog_blcksz != XLOG_BLCKSZ) + { + report_invalid_record(state, + "WAL file is from different database system: incorrect XLOG_BLCKSZ in page header"); + return false; + } + } + else if (offset == 0) + { + char fname[MAXFNAMELEN]; + + XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); + + /* hmm, first page of file doesn't have a long header? 
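The CRC check order matters: the checksum covers the record payload first and then the header up to, but not including, the stored CRC field. The sketch below mirrors that order with a toy record layout; a plain bitwise CRC-32 stands in for PostgreSQL's CRC-32C, so the values are not comparable to real WAL checksums.

/* Sketch: compute a checksum over payload, then the header prefix before
 * the CRC field, and compare against the stored value. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CRC_INIT(c)  ((c) = 0xFFFFFFFFu)
#define CRC_FIN(c)   ((c) ^= 0xFFFFFFFFu)

static void
crc_comp(uint32_t *crc, const void *data, size_t len)
{
    const unsigned char *p = data;

    while (len--)
    {
        *crc ^= *p++;
        for (int k = 0; k < 8; k++)
            *crc = (*crc >> 1) ^ (0xEDB88320u & (0u - (*crc & 1)));
    }
}

struct toy_record
{
    uint32_t tot_len;
    uint32_t crc;               /* plays the role of xl_crc */
    char     payload[32];
};

int main(void)
{
    struct toy_record rec = { 0 };
    uint32_t crc;

    strcpy(rec.payload, "some record payload");
    rec.tot_len = sizeof(rec);

    /* Writer side: payload first, then the header prefix before the CRC. */
    CRC_INIT(crc);
    crc_comp(&crc, rec.payload, sizeof(rec.payload));
    crc_comp(&crc, &rec, offsetof(struct toy_record, crc));
    CRC_FIN(crc);
    rec.crc = crc;

    /* Reader side: recompute in the same order and compare. */
    CRC_INIT(crc);
    crc_comp(&crc, rec.payload, sizeof(rec.payload));
    crc_comp(&crc, &rec, offsetof(struct toy_record, crc));
    CRC_FIN(crc);

    printf("%s\n", crc == rec.crc ? "CRC ok" : "CRC mismatch");
    return 0;
}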
*/ + report_invalid_record(state, + "invalid info bits %04X in log segment %s, offset %u", + hdr->xlp_info, + fname, + offset); + return false; + } + + /* + * Check that the address on the page agrees with what we expected. This + * check typically fails when an old WAL segment is recycled, and hasn't + * yet been overwritten with new data yet. + */ + if (hdr->xlp_pageaddr != recaddr) + { + char fname[MAXFNAMELEN]; + + XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); + + report_invalid_record(state, + "unexpected pageaddr %X/%X in log segment %s, offset %u", + LSN_FORMAT_ARGS(hdr->xlp_pageaddr), + fname, + offset); + return false; + } + + /* + * Since child timelines are always assigned a TLI greater than their + * immediate parent's TLI, we should never see TLI go backwards across + * successive pages of a consistent WAL sequence. + * + * Sometimes we re-read a segment that's already been (partially) read. So + * we only verify TLIs for pages that are later than the last remembered + * LSN. + */ + if (recptr > state->latestPagePtr) + { + if (hdr->xlp_tli < state->latestPageTLI) + { + char fname[MAXFNAMELEN]; + + XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); + + report_invalid_record(state, + "out-of-sequence timeline ID %u (after %u) in log segment %s, offset %u", + hdr->xlp_tli, + state->latestPageTLI, + fname, + offset); + return false; + } + } + state->latestPagePtr = recptr; + state->latestPageTLI = hdr->xlp_tli; + + return true; +} + +#ifdef FRONTEND +/* + * Functions that are currently not needed in the backend, but are better + * implemented inside xlogreader.c because of the internal facilities available + * here. + */ + +/* + * Find the first record with an lsn >= RecPtr. + * + * This is different from XLogBeginRead() in that RecPtr doesn't need to point + * to a valid record boundary. Useful for checking whether RecPtr is a valid + * xlog address for reading, and to find the first valid address after some + * address when dumping records for debugging purposes. + * + * This positions the reader, like XLogBeginRead(), so that the next call to + * XLogReadRecord() will read the next valid record. + */ +XLogRecPtr +XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr) +{ + XLogRecPtr tmpRecPtr; + XLogRecPtr found = InvalidXLogRecPtr; + XLogPageHeader header; + char *errormsg; + + Assert(!XLogRecPtrIsInvalid(RecPtr)); + + /* + * skip over potential continuation data, keeping in mind that it may span + * multiple pages + */ + tmpRecPtr = RecPtr; + while (true) + { + XLogRecPtr targetPagePtr; + int targetRecOff; + uint32 pageHeaderSize; + int readLen; + + /* + * Compute targetRecOff. It should typically be equal or greater than + * short page-header since a valid record can't start anywhere before + * that, except when caller has explicitly specified the offset that + * falls somewhere there or when we are skipping multi-page + * continuation record. 
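The page-address cross-check above catches recycled segments: from the segment number and the offset within the segment we can compute the address the page ought to claim, and a recycled-but-not-yet-overwritten page still carries its old address. A small sketch with an illustrative segment size:

/* Sketch: expected page address vs. the address stored on the page. */
#include <stdint.h>
#include <stdio.h>

#define SEG_SIZE (16u * 1024 * 1024)

int main(void)
{
    uint64_t recptr = 5ULL * SEG_SIZE + 8192;        /* where we are reading */
    uint64_t segno = recptr / SEG_SIZE;
    uint32_t offset = (uint32_t) (recptr % SEG_SIZE);
    uint64_t expected_pageaddr = segno * SEG_SIZE + offset;

    uint64_t xlp_pageaddr = 2ULL * SEG_SIZE + 8192;  /* stale page from an old segment */

    if (xlp_pageaddr != expected_pageaddr)
        printf("unexpected pageaddr: page is from a recycled segment\n");
    else
        printf("page address matches\n");
    return 0;
}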
It doesn't matter though because + * ReadPageInternal() is prepared to handle that and will read at + * least short page-header worth of data + */ + targetRecOff = tmpRecPtr % XLOG_BLCKSZ; + + /* scroll back to page boundary */ + targetPagePtr = tmpRecPtr - targetRecOff; + + /* Read the page containing the record */ + readLen = ReadPageInternal(state, targetPagePtr, targetRecOff); + if (readLen < 0) + goto err; + + header = (XLogPageHeader) state->readBuf; + + pageHeaderSize = XLogPageHeaderSize(header); + + /* make sure we have enough data for the page header */ + readLen = ReadPageInternal(state, targetPagePtr, pageHeaderSize); + if (readLen < 0) + goto err; + + /* skip over potential continuation data */ + if (header->xlp_info & XLP_FIRST_IS_CONTRECORD) + { + /* + * If the length of the remaining continuation data is more than + * what can fit in this page, the continuation record crosses over + * this page. Read the next page and try again. xlp_rem_len in the + * next page header will contain the remaining length of the + * continuation data + * + * Note that record headers are MAXALIGN'ed + */ + if (MAXALIGN(header->xlp_rem_len) >= (XLOG_BLCKSZ - pageHeaderSize)) + tmpRecPtr = targetPagePtr + XLOG_BLCKSZ; + else + { + /* + * The previous continuation record ends in this page. Set + * tmpRecPtr to point to the first valid record + */ + tmpRecPtr = targetPagePtr + pageHeaderSize + + MAXALIGN(header->xlp_rem_len); + break; + } + } + else + { + tmpRecPtr = targetPagePtr + pageHeaderSize; + break; + } + } + + /* + * we know now that tmpRecPtr is an address pointing to a valid XLogRecord + * because either we're at the first record after the beginning of a page + * or we just jumped over the remaining data of a continuation. + */ + XLogBeginRead(state, tmpRecPtr); + while (XLogReadRecord(state, &errormsg) != NULL) + { + /* past the record we've found, break out */ + if (RecPtr <= state->ReadRecPtr) + { + /* Rewind the reader to the beginning of the last record. */ + found = state->ReadRecPtr; + XLogBeginRead(state, found); + return found; + } + } + +err: + XLogReaderInvalReadState(state); + + return InvalidXLogRecPtr; +} + +#endif /* FRONTEND */ + +/* + * Helper function to ease writing of XLogRoutine->page_read callbacks. + * If this function is used, caller must supply a segment_open callback in + * 'state', as that is used here. + * + * Read 'count' bytes into 'buf', starting at location 'startptr', from WAL + * fetched from timeline 'tli'. + * + * Returns true if succeeded, false if an error occurs, in which case + * 'errinfo' receives error details. + * + * XXX probably this should be improved to suck data directly from the + * WAL buffers when possible. + */ +bool +WALRead(XLogReaderState *state, + char *buf, XLogRecPtr startptr, Size count, TimeLineID tli, + WALReadError *errinfo) +{ + char *p; + XLogRecPtr recptr; + Size nbytes; + + p = buf; + recptr = startptr; + nbytes = count; + + while (nbytes > 0) + { + uint32 startoff; + int segbytes; + int readbytes; + + startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize); + + /* + * If the data we want is not in a segment we have open, close what we + * have (if anything) and open the next one, using the caller's + * provided openSegment callback. 
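The read loop in WALRead splits one logical (startptr, count) request into per-segment chunks so that no single read crosses a segment boundary. The sketch below only does the arithmetic and prints the (segment, offset, length) pieces a real pread loop would issue; the segment size is an illustrative constant and no I/O is performed.

/* Sketch: split a WAL read across segment boundaries. */
#include <stdint.h>
#include <stdio.h>

#define SEG_SIZE (16u * 1024 * 1024)

int main(void)
{
    uint64_t startptr = (uint64_t) SEG_SIZE * 7 + SEG_SIZE - 5000; /* near a boundary */
    uint64_t recptr = startptr;
    uint64_t nbytes = 12000;            /* spans into the next segment */

    while (nbytes > 0)
    {
        uint32_t startoff = (uint32_t) (recptr % SEG_SIZE);
        uint64_t segno = recptr / SEG_SIZE;
        uint64_t segbytes = SEG_SIZE - startoff;

        if (segbytes > nbytes)
            segbytes = nbytes;

        printf("segment %llu: read %llu bytes at offset %u\n",
               (unsigned long long) segno,
               (unsigned long long) segbytes, startoff);

        recptr += segbytes;
        nbytes -= segbytes;
    }
    return 0;
}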
+ */ + if (state->seg.ws_file < 0 || + !XLByteInSeg(recptr, state->seg.ws_segno, state->segcxt.ws_segsize) || + tli != state->seg.ws_tli) + { + XLogSegNo nextSegNo; + + if (state->seg.ws_file >= 0) + state->routine.segment_close(state); + + XLByteToSeg(recptr, nextSegNo, state->segcxt.ws_segsize); + state->routine.segment_open(state, nextSegNo, &tli); + + /* This shouldn't happen -- indicates a bug in segment_open */ + Assert(state->seg.ws_file >= 0); + + /* Update the current segment info. */ + state->seg.ws_tli = tli; + state->seg.ws_segno = nextSegNo; + } + + /* How many bytes are within this segment? */ + if (nbytes > (state->segcxt.ws_segsize - startoff)) + segbytes = state->segcxt.ws_segsize - startoff; + else + segbytes = nbytes; + +#ifndef FRONTEND + pgstat_report_wait_start(WAIT_EVENT_WAL_READ); +#endif + + /* Reset errno first; eases reporting non-errno-affecting errors */ + errno = 0; + readbytes = pg_pread(state->seg.ws_file, p, segbytes, (off_t) startoff); + +#ifndef FRONTEND + pgstat_report_wait_end(); +#endif + + if (readbytes <= 0) + { + errinfo->wre_errno = errno; + errinfo->wre_req = segbytes; + errinfo->wre_read = readbytes; + errinfo->wre_off = startoff; + errinfo->wre_seg = state->seg; + return false; + } + + /* Update state for read */ + recptr += readbytes; + nbytes -= readbytes; + p += readbytes; + } + + return true; +} + +/* ---------------------------------------- + * Functions for decoding the data and block references in a record. + * ---------------------------------------- + */ + +/* private function to reset the state between records */ +static void +ResetDecoder(XLogReaderState *state) +{ + int block_id; + + state->decoded_record = NULL; + + state->main_data_len = 0; + + for (block_id = 0; block_id <= state->max_block_id; block_id++) + { + state->blocks[block_id].in_use = false; + state->blocks[block_id].has_image = false; + state->blocks[block_id].has_data = false; + state->blocks[block_id].apply_image = false; + } + state->max_block_id = -1; +} + +/* + * Decode the previously read record. + * + * On error, a human-readable error message is returned in *errormsg, and + * the return value is false. + */ +bool +DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg) +{ + /* + * read next _size bytes from record buffer, but check for overrun first. 
+ */ +#define COPY_HEADER_FIELD(_dst, _size) \ + do { \ + if (remaining < _size) \ + goto shortdata_err; \ + memcpy(_dst, ptr, _size); \ + ptr += _size; \ + remaining -= _size; \ + } while(0) + + char *ptr; + uint32 remaining; + uint32 datatotal; + RelFileNode *rnode = NULL; + uint8 block_id; + + ResetDecoder(state); + + state->decoded_record = record; + state->record_origin = InvalidRepOriginId; + state->toplevel_xid = InvalidTransactionId; + + ptr = (char *) record; + ptr += SizeOfXLogRecord; + remaining = record->xl_tot_len - SizeOfXLogRecord; + + /* Decode the headers */ + datatotal = 0; + while (remaining > datatotal) + { + COPY_HEADER_FIELD(&block_id, sizeof(uint8)); + + if (block_id == XLR_BLOCK_ID_DATA_SHORT) + { + /* XLogRecordDataHeaderShort */ + uint8 main_data_len; + + COPY_HEADER_FIELD(&main_data_len, sizeof(uint8)); + + state->main_data_len = main_data_len; + datatotal += main_data_len; + break; /* by convention, the main data fragment is + * always last */ + } + else if (block_id == XLR_BLOCK_ID_DATA_LONG) + { + /* XLogRecordDataHeaderLong */ + uint32 main_data_len; + + COPY_HEADER_FIELD(&main_data_len, sizeof(uint32)); + state->main_data_len = main_data_len; + datatotal += main_data_len; + break; /* by convention, the main data fragment is + * always last */ + } + else if (block_id == XLR_BLOCK_ID_ORIGIN) + { + COPY_HEADER_FIELD(&state->record_origin, sizeof(RepOriginId)); + } + else if (block_id == XLR_BLOCK_ID_TOPLEVEL_XID) + { + COPY_HEADER_FIELD(&state->toplevel_xid, sizeof(TransactionId)); + } + else if (block_id <= XLR_MAX_BLOCK_ID) + { + /* XLogRecordBlockHeader */ + DecodedBkpBlock *blk; + uint8 fork_flags; + + if (block_id <= state->max_block_id) + { + report_invalid_record(state, + "out-of-order block_id %u at %X/%X", + block_id, + LSN_FORMAT_ARGS(state->ReadRecPtr)); + goto err; + } + state->max_block_id = block_id; + + blk = &state->blocks[block_id]; + blk->in_use = true; + blk->apply_image = false; + + COPY_HEADER_FIELD(&fork_flags, sizeof(uint8)); + blk->forknum = fork_flags & BKPBLOCK_FORK_MASK; + blk->flags = fork_flags; + blk->has_image = ((fork_flags & BKPBLOCK_HAS_IMAGE) != 0); + blk->has_data = ((fork_flags & BKPBLOCK_HAS_DATA) != 0); + + COPY_HEADER_FIELD(&blk->data_len, sizeof(uint16)); + /* cross-check that the HAS_DATA flag is set iff data_length > 0 */ + if (blk->has_data && blk->data_len == 0) + { + report_invalid_record(state, + "BKPBLOCK_HAS_DATA set, but no data included at %X/%X", + LSN_FORMAT_ARGS(state->ReadRecPtr)); + goto err; + } + if (!blk->has_data && blk->data_len != 0) + { + report_invalid_record(state, + "BKPBLOCK_HAS_DATA not set, but data length is %u at %X/%X", + (unsigned int) blk->data_len, + LSN_FORMAT_ARGS(state->ReadRecPtr)); + goto err; + } + datatotal += blk->data_len; + + if (blk->has_image) + { + COPY_HEADER_FIELD(&blk->bimg_len, sizeof(uint16)); + COPY_HEADER_FIELD(&blk->hole_offset, sizeof(uint16)); + COPY_HEADER_FIELD(&blk->bimg_info, sizeof(uint8)); + + blk->apply_image = ((blk->bimg_info & BKPIMAGE_APPLY) != 0); + + if (blk->bimg_info & BKPIMAGE_IS_COMPRESSED) + { + if (blk->bimg_info & BKPIMAGE_HAS_HOLE) + COPY_HEADER_FIELD(&blk->hole_length, sizeof(uint16)); + else + blk->hole_length = 0; + } + else + blk->hole_length = BLCKSZ - blk->bimg_len; + datatotal += blk->bimg_len; + + /* + * cross-check that hole_offset > 0, hole_length > 0 and + * bimg_len < BLCKSZ if the HAS_HOLE flag is set. 
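The per-block header decoding above packs the fork number and several flags into a single byte, and then cross-checks that the has-data flag agrees with the data length in both directions. A standalone sketch of that pattern; the bit assignments below are illustrative, not copied from xlogrecord.h.

/* Sketch: decode a packed fork/flags byte and cross-check it against the
 * declared data length. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FORK_MASK  0x0F
#define HAS_IMAGE  0x10
#define HAS_DATA   0x20

static bool
decode_block_header(uint8_t fork_flags, uint16_t data_len)
{
    uint8_t forknum = fork_flags & FORK_MASK;
    bool has_image = (fork_flags & HAS_IMAGE) != 0;
    bool has_data = (fork_flags & HAS_DATA) != 0;

    /* Cross-check: the flag and the length must agree in both directions. */
    if (has_data && data_len == 0)
    {
        fprintf(stderr, "has-data flag set, but no data included\n");
        return false;
    }
    if (!has_data && data_len != 0)
    {
        fprintf(stderr, "has-data flag not set, but data length is %u\n", data_len);
        return false;
    }

    printf("fork %u, image=%d, data=%d (%u bytes)\n",
           forknum, has_image, has_data, data_len);
    return true;
}

int main(void)
{
    decode_block_header(HAS_DATA | 0, 128);   /* valid: main fork with data */
    decode_block_header(HAS_IMAGE | 0, 16);   /* invalid: length without flag */
    return 0;
}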
+ */ + if ((blk->bimg_info & BKPIMAGE_HAS_HOLE) && + (blk->hole_offset == 0 || + blk->hole_length == 0 || + blk->bimg_len == BLCKSZ)) + { + report_invalid_record(state, + "BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X", + (unsigned int) blk->hole_offset, + (unsigned int) blk->hole_length, + (unsigned int) blk->bimg_len, + LSN_FORMAT_ARGS(state->ReadRecPtr)); + goto err; + } + + /* + * cross-check that hole_offset == 0 and hole_length == 0 if + * the HAS_HOLE flag is not set. + */ + if (!(blk->bimg_info & BKPIMAGE_HAS_HOLE) && + (blk->hole_offset != 0 || blk->hole_length != 0)) + { + report_invalid_record(state, + "BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X", + (unsigned int) blk->hole_offset, + (unsigned int) blk->hole_length, + LSN_FORMAT_ARGS(state->ReadRecPtr)); + goto err; + } + + /* + * cross-check that bimg_len < BLCKSZ if the IS_COMPRESSED + * flag is set. + */ + if ((blk->bimg_info & BKPIMAGE_IS_COMPRESSED) && + blk->bimg_len == BLCKSZ) + { + report_invalid_record(state, + "BKPIMAGE_IS_COMPRESSED set, but block image length %u at %X/%X", + (unsigned int) blk->bimg_len, + LSN_FORMAT_ARGS(state->ReadRecPtr)); + goto err; + } + + /* + * cross-check that bimg_len = BLCKSZ if neither HAS_HOLE nor + * IS_COMPRESSED flag is set. + */ + if (!(blk->bimg_info & BKPIMAGE_HAS_HOLE) && + !(blk->bimg_info & BKPIMAGE_IS_COMPRESSED) && + blk->bimg_len != BLCKSZ) + { + report_invalid_record(state, + "neither BKPIMAGE_HAS_HOLE nor BKPIMAGE_IS_COMPRESSED set, but block image length is %u at %X/%X", + (unsigned int) blk->data_len, + LSN_FORMAT_ARGS(state->ReadRecPtr)); + goto err; + } + } + if (!(fork_flags & BKPBLOCK_SAME_REL)) + { + COPY_HEADER_FIELD(&blk->rnode, sizeof(RelFileNode)); + rnode = &blk->rnode; + } + else + { + if (rnode == NULL) + { + report_invalid_record(state, + "BKPBLOCK_SAME_REL set but no previous rel at %X/%X", + LSN_FORMAT_ARGS(state->ReadRecPtr)); + goto err; + } + + blk->rnode = *rnode; + } + COPY_HEADER_FIELD(&blk->blkno, sizeof(BlockNumber)); + } + else + { + report_invalid_record(state, + "invalid block_id %u at %X/%X", + block_id, LSN_FORMAT_ARGS(state->ReadRecPtr)); + goto err; + } + } + + if (remaining != datatotal) + goto shortdata_err; + + /* + * Ok, we've parsed the fragment headers, and verified that the total + * length of the payload in the fragments is equal to the amount of data + * left. Copy the data of each fragment to a separate buffer. + * + * We could just set up pointers into readRecordBuf, but we want to align + * the data for the convenience of the callers. Backup images are not + * copied, however; they don't need alignment. + */ + + /* block data first */ + for (block_id = 0; block_id <= state->max_block_id; block_id++) + { + DecodedBkpBlock *blk = &state->blocks[block_id]; + + if (!blk->in_use) + continue; + + Assert(blk->has_image || !blk->apply_image); + + if (blk->has_image) + { + blk->bkp_image = ptr; + ptr += blk->bimg_len; + } + if (blk->has_data) + { + if (!blk->data || blk->data_len > blk->data_bufsz) + { + if (blk->data) + pfree(blk->data); + + /* + * Force the initial request to be BLCKSZ so that we don't + * waste time with lots of trips through this stanza as a + * result of WAL compression. 
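The image-header sanity checks reduce to a pair of implications: if a hole is claimed, the hole must be non-empty and the stored image must be shorter than a full block; if no hole is claimed, the hole fields must be zero. A compact sketch, with an illustrative flag bit and block size:

/* Sketch: hole-related cross-checks on a full-page image header. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BLCKSZ    8192
#define HAS_HOLE  0x01

static bool
check_image_header(uint8_t bimg_info, uint16_t hole_offset,
                   uint16_t hole_length, uint16_t bimg_len)
{
    if ((bimg_info & HAS_HOLE) &&
        (hole_offset == 0 || hole_length == 0 || bimg_len == BLCKSZ))
        return false;               /* hole claimed but fields disagree */
    if (!(bimg_info & HAS_HOLE) &&
        (hole_offset != 0 || hole_length != 0))
        return false;               /* no hole, yet hole fields are set */
    return true;
}

int main(void)
{
    printf("%d\n", check_image_header(HAS_HOLE, 100, 500, BLCKSZ - 500)); /* 1 */
    printf("%d\n", check_image_header(HAS_HOLE, 0, 0, BLCKSZ));           /* 0 */
    printf("%d\n", check_image_header(0, 0, 0, BLCKSZ));                  /* 1 */
    return 0;
}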
+ */ + blk->data_bufsz = MAXALIGN(Max(blk->data_len, BLCKSZ)); + blk->data = palloc(blk->data_bufsz); + } + memcpy(blk->data, ptr, blk->data_len); + ptr += blk->data_len; + } + } + + /* and finally, the main data */ + if (state->main_data_len > 0) + { + if (!state->main_data || state->main_data_len > state->main_data_bufsz) + { + if (state->main_data) + pfree(state->main_data); + + /* + * main_data_bufsz must be MAXALIGN'ed. In many xlog record + * types, we omit trailing struct padding on-disk to save a few + * bytes; but compilers may generate accesses to the xlog struct + * that assume that padding bytes are present. If the palloc + * request is not large enough to include such padding bytes then + * we'll get valgrind complaints due to otherwise-harmless fetches + * of the padding bytes. + * + * In addition, force the initial request to be reasonably large + * so that we don't waste time with lots of trips through this + * stanza. BLCKSZ / 2 seems like a good compromise choice. + */ + state->main_data_bufsz = MAXALIGN(Max(state->main_data_len, + BLCKSZ / 2)); + state->main_data = palloc(state->main_data_bufsz); + } + memcpy(state->main_data, ptr, state->main_data_len); + ptr += state->main_data_len; + } + + return true; + +shortdata_err: + report_invalid_record(state, + "record with invalid length at %X/%X", + LSN_FORMAT_ARGS(state->ReadRecPtr)); +err: + *errormsg = state->errormsg_buf; + + return false; +} + +/* + * Returns information about the block that a block reference refers to. + * + * If the WAL record contains a block reference with the given ID, *rnode, + * *forknum, and *blknum are filled in (if not NULL), and returns true. + * Otherwise returns false. + */ +bool +XLogRecGetBlockTag(XLogReaderState *record, uint8 block_id, + RelFileNode *rnode, ForkNumber *forknum, BlockNumber *blknum) +{ + DecodedBkpBlock *bkpb; + + if (!record->blocks[block_id].in_use) + return false; + + bkpb = &record->blocks[block_id]; + if (rnode) + *rnode = bkpb->rnode; + if (forknum) + *forknum = bkpb->forknum; + if (blknum) + *blknum = bkpb->blkno; + return true; +} + +/* + * Returns the data associated with a block reference, or NULL if there is + * no data (e.g. because a full-page image was taken instead). The returned + * pointer points to a MAXALIGNed buffer. + */ +char * +XLogRecGetBlockData(XLogReaderState *record, uint8 block_id, Size *len) +{ + DecodedBkpBlock *bkpb; + + if (!record->blocks[block_id].in_use) + return NULL; + + bkpb = &record->blocks[block_id]; + + if (!bkpb->has_data) + { + if (len) + *len = 0; + return NULL; + } + else + { + if (len) + *len = bkpb->data_len; + return bkpb->data; + } +} + +/* + * Restore a full-page image from a backup block attached to an XLOG record. + * + * Returns true if a full-page image is restored. 
+ */ +bool +RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) +{ + DecodedBkpBlock *bkpb; + char *ptr; + PGAlignedBlock tmp; + + if (!record->blocks[block_id].in_use) + return false; + if (!record->blocks[block_id].has_image) + return false; + + bkpb = &record->blocks[block_id]; + ptr = bkpb->bkp_image; + + if (bkpb->bimg_info & BKPIMAGE_IS_COMPRESSED) + { + /* If a backup block image is compressed, decompress it */ + if (pglz_decompress(ptr, bkpb->bimg_len, tmp.data, + BLCKSZ - bkpb->hole_length, true) < 0) + { + report_invalid_record(record, "invalid compressed image at %X/%X, block %d", + LSN_FORMAT_ARGS(record->ReadRecPtr), + block_id); + return false; + } + ptr = tmp.data; + } + + /* generate page, taking into account hole if necessary */ + if (bkpb->hole_length == 0) + { + memcpy(page, ptr, BLCKSZ); + } + else + { + memcpy(page, ptr, bkpb->hole_offset); + /* must zero-fill the hole */ + MemSet(page + bkpb->hole_offset, 0, bkpb->hole_length); + memcpy(page + (bkpb->hole_offset + bkpb->hole_length), + ptr + bkpb->hole_offset, + BLCKSZ - (bkpb->hole_offset + bkpb->hole_length)); + } + + return true; +} + +#ifndef FRONTEND + +/* + * Extract the FullTransactionId from a WAL record. + */ +FullTransactionId +XLogRecGetFullXid(XLogReaderState *record) +{ + TransactionId xid, + next_xid; + uint32 epoch; + + /* + * This function is only safe during replay, because it depends on the + * replay state. See AdvanceNextFullTransactionIdPastXid() for more. + */ + Assert(AmStartupProcess() || !IsUnderPostmaster); + + xid = XLogRecGetXid(record); + next_xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + epoch = EpochFromFullTransactionId(ShmemVariableCache->nextXid); + + /* + * If xid is numerically greater than next_xid, it has to be from the last + * epoch. + */ + if (unlikely(xid > next_xid)) + --epoch; + + return FullTransactionIdFromEpochAndXid(epoch, xid); +} + +#endif diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c new file mode 100644 index 0000000..d17d660 --- /dev/null +++ b/src/backend/access/transam/xlogutils.c @@ -0,0 +1,978 @@ +/*------------------------------------------------------------------------- + * + * xlogutils.c + * + * PostgreSQL write-ahead log manager utility routines + * + * This file contains support routines that are used by XLOG replay functions. + * None of this code is used during normal system operation. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/xlogutils.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include + +#include "access/timeline.h" +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "access/xlogutils.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/smgr.h" +#include "utils/guc.h" +#include "utils/hsearch.h" +#include "utils/rel.h" + + +/* GUC variable */ +bool ignore_invalid_pages = false; + +/* + * During XLOG replay, we may see XLOG records for incremental updates of + * pages that no longer exist, because their relation was later dropped or + * truncated. (Note: this is only possible when full_page_writes = OFF, + * since when it's ON, the first reference we see to a page should always + * be a full-page rewrite not an incremental update.) 
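Restoring a page from a "holey" image, as RestoreBlockImage does above, is three copies: the bytes before the hole, a zero-filled hole, and the bytes after it. A tiny standalone sketch with a deliberately small page size so the result is readable:

/* Sketch: reconstruct a page from an image with the hole squeezed out. */
#include <stdio.h>
#include <string.h>

#define PAGE_SZ 32

int main(void)
{
    char image[PAGE_SZ];      /* stored image: page content minus the hole */
    char page[PAGE_SZ];       /* reconstructed page */
    unsigned hole_offset = 10, hole_length = 12;

    memset(image, 'x', sizeof(image));           /* pretend non-hole content */

    memcpy(page, image, hole_offset);                        /* before the hole */
    memset(page + hole_offset, 0, hole_length);              /* zero the hole */
    memcpy(page + hole_offset + hole_length,                 /* after the hole */
           image + hole_offset,
           PAGE_SZ - (hole_offset + hole_length));

    for (unsigned i = 0; i < PAGE_SZ; i++)
        putchar(page[i] ? page[i] : '.');
    putchar('\n');
    return 0;
}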
Rather than simply + * ignoring such records, we make a note of the referenced page, and then + * complain if we don't actually see a drop or truncate covering the page + * later in replay. + */ +typedef struct xl_invalid_page_key +{ + RelFileNode node; /* the relation */ + ForkNumber forkno; /* the fork number */ + BlockNumber blkno; /* the page */ +} xl_invalid_page_key; + +typedef struct xl_invalid_page +{ + xl_invalid_page_key key; /* hash key ... must be first */ + bool present; /* page existed but contained zeroes */ +} xl_invalid_page; + +static HTAB *invalid_page_tab = NULL; + + +/* Report a reference to an invalid page */ +static void +report_invalid_page(int elevel, RelFileNode node, ForkNumber forkno, + BlockNumber blkno, bool present) +{ + char *path = relpathperm(node, forkno); + + if (present) + elog(elevel, "page %u of relation %s is uninitialized", + blkno, path); + else + elog(elevel, "page %u of relation %s does not exist", + blkno, path); + pfree(path); +} + +/* Log a reference to an invalid page */ +static void +log_invalid_page(RelFileNode node, ForkNumber forkno, BlockNumber blkno, + bool present) +{ + xl_invalid_page_key key; + xl_invalid_page *hentry; + bool found; + + /* + * Once recovery has reached a consistent state, the invalid-page table + * should be empty and remain so. If a reference to an invalid page is + * found after consistency is reached, PANIC immediately. This might seem + * aggressive, but it's better than letting the invalid reference linger + * in the hash table until the end of recovery and PANIC there, which + * might come only much later if this is a standby server. + */ + if (reachedConsistency) + { + report_invalid_page(WARNING, node, forkno, blkno, present); + elog(ignore_invalid_pages ? WARNING : PANIC, + "WAL contains references to invalid pages"); + } + + /* + * Log references to invalid pages at DEBUG1 level. This allows some + * tracing of the cause (note the elog context mechanism will tell us + * something about the XLOG record that generated the reference). + */ + if (message_level_is_interesting(DEBUG1)) + report_invalid_page(DEBUG1, node, forkno, blkno, present); + + if (invalid_page_tab == NULL) + { + /* create hash table when first needed */ + HASHCTL ctl; + + ctl.keysize = sizeof(xl_invalid_page_key); + ctl.entrysize = sizeof(xl_invalid_page); + + invalid_page_tab = hash_create("XLOG invalid-page table", + 100, + &ctl, + HASH_ELEM | HASH_BLOBS); + } + + /* we currently assume xl_invalid_page_key contains no padding */ + key.node = node; + key.forkno = forkno; + key.blkno = blkno; + hentry = (xl_invalid_page *) + hash_search(invalid_page_tab, (void *) &key, HASH_ENTER, &found); + + if (!found) + { + /* hash_search already filled in the key */ + hentry->present = present; + } + else + { + /* repeat reference ... 
leave "present" as it was */ + } +} + +/* Forget any invalid pages >= minblkno, because they've been dropped */ +static void +forget_invalid_pages(RelFileNode node, ForkNumber forkno, BlockNumber minblkno) +{ + HASH_SEQ_STATUS status; + xl_invalid_page *hentry; + + if (invalid_page_tab == NULL) + return; /* nothing to do */ + + hash_seq_init(&status, invalid_page_tab); + + while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL) + { + if (RelFileNodeEquals(hentry->key.node, node) && + hentry->key.forkno == forkno && + hentry->key.blkno >= minblkno) + { + if (message_level_is_interesting(DEBUG2)) + { + char *path = relpathperm(hentry->key.node, forkno); + + elog(DEBUG2, "page %u of relation %s has been dropped", + hentry->key.blkno, path); + pfree(path); + } + + if (hash_search(invalid_page_tab, + (void *) &hentry->key, + HASH_REMOVE, NULL) == NULL) + elog(ERROR, "hash table corrupted"); + } + } +} + +/* Forget any invalid pages in a whole database */ +static void +forget_invalid_pages_db(Oid dbid) +{ + HASH_SEQ_STATUS status; + xl_invalid_page *hentry; + + if (invalid_page_tab == NULL) + return; /* nothing to do */ + + hash_seq_init(&status, invalid_page_tab); + + while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL) + { + if (hentry->key.node.dbNode == dbid) + { + if (message_level_is_interesting(DEBUG2)) + { + char *path = relpathperm(hentry->key.node, hentry->key.forkno); + + elog(DEBUG2, "page %u of relation %s has been dropped", + hentry->key.blkno, path); + pfree(path); + } + + if (hash_search(invalid_page_tab, + (void *) &hentry->key, + HASH_REMOVE, NULL) == NULL) + elog(ERROR, "hash table corrupted"); + } + } +} + +/* Are there any unresolved references to invalid pages? */ +bool +XLogHaveInvalidPages(void) +{ + if (invalid_page_tab != NULL && + hash_get_num_entries(invalid_page_tab) > 0) + return true; + return false; +} + +/* Complain about any remaining invalid-page entries */ +void +XLogCheckInvalidPages(void) +{ + HASH_SEQ_STATUS status; + xl_invalid_page *hentry; + bool foundone = false; + + if (invalid_page_tab == NULL) + return; /* nothing to do */ + + hash_seq_init(&status, invalid_page_tab); + + /* + * Our strategy is to emit WARNING messages for all remaining entries and + * only PANIC after we've dumped all the available info. + */ + while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL) + { + report_invalid_page(WARNING, hentry->key.node, hentry->key.forkno, + hentry->key.blkno, hentry->present); + foundone = true; + } + + if (foundone) + elog(ignore_invalid_pages ? WARNING : PANIC, + "WAL contains references to invalid pages"); + + hash_destroy(invalid_page_tab); + invalid_page_tab = NULL; +} + + +/* + * XLogReadBufferForRedo + * Read a page during XLOG replay + * + * Reads a block referenced by a WAL record into shared buffer cache, and + * determines what needs to be done to redo the changes to it. If the WAL + * record includes a full-page image of the page, it is restored. + * + * 'record.EndRecPtr' is compared to the page's LSN to determine if the record + * has already been replayed. 'block_id' is the ID number the block was + * registered with, when the WAL record was created. 
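The invalid-page bookkeeping follows a simple lifecycle: remember references to missing pages during replay, forget them when a covering drop or truncate is replayed, and complain about anything left at the end. The sketch below models that lifecycle with a fixed array standing in for the backend's hash table; the relation is identified by a bare number rather than a RelFileNode, and the final check merely returns a status where the real code would PANIC.

/* Sketch: minimal invalid-page tracking. */
#include <stdbool.h>
#include <stdio.h>

struct invalid_page { unsigned rel, blkno; bool in_use; };

static struct invalid_page tab[64];

static void
log_invalid(unsigned rel, unsigned blkno)
{
    for (int i = 0; i < 64; i++)
        if (!tab[i].in_use)
        {
            tab[i] = (struct invalid_page) { rel, blkno, true };
            return;
        }
}

static void
forget_invalid(unsigned rel, unsigned minblkno)   /* drop/truncate replayed */
{
    for (int i = 0; i < 64; i++)
        if (tab[i].in_use && tab[i].rel == rel && tab[i].blkno >= minblkno)
            tab[i].in_use = false;
}

static bool
check_invalid(void)
{
    bool found = false;

    for (int i = 0; i < 64; i++)
        if (tab[i].in_use)
        {
            fprintf(stderr, "page %u of relation %u does not exist\n",
                    tab[i].blkno, tab[i].rel);
            found = true;
        }
    return found;
}

int main(void)
{
    log_invalid(1001, 7);           /* a record touched a missing page */
    log_invalid(1002, 3);
    forget_invalid(1001, 0);        /* later WAL dropped relation 1001 */
    return check_invalid() ? 1 : 0; /* relation 1002 is still unresolved */
}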
+ * + * Returns one of the following: + * + * BLK_NEEDS_REDO - changes from the WAL record need to be applied + * BLK_DONE - block doesn't need replaying + * BLK_RESTORED - block was restored from a full-page image included in + * the record + * BLK_NOTFOUND - block was not found (because it was truncated away by + * an operation later in the WAL stream) + * + * On return, the buffer is locked in exclusive-mode, and returned in *buf. + * Note that the buffer is locked and returned even if it doesn't need + * replaying. (Getting the buffer lock is not really necessary during + * single-process crash recovery, but some subroutines such as MarkBufferDirty + * will complain if we don't have the lock. In hot standby mode it's + * definitely necessary.) + * + * Note: when a backup block is available in XLOG with the BKPIMAGE_APPLY flag + * set, we restore it, even if the page in the database appears newer. This + * is to protect ourselves against database pages that were partially or + * incorrectly written during a crash. We assume that the XLOG data must be + * good because it has passed a CRC check, while the database page might not + * be. This will force us to replay all subsequent modifications of the page + * that appear in XLOG, rather than possibly ignoring them as already + * applied, but that's not a huge drawback. + */ +XLogRedoAction +XLogReadBufferForRedo(XLogReaderState *record, uint8 block_id, + Buffer *buf) +{ + return XLogReadBufferForRedoExtended(record, block_id, RBM_NORMAL, + false, buf); +} + +/* + * Pin and lock a buffer referenced by a WAL record, for the purpose of + * re-initializing it. + */ +Buffer +XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id) +{ + Buffer buf; + + XLogReadBufferForRedoExtended(record, block_id, RBM_ZERO_AND_LOCK, false, + &buf); + return buf; +} + +/* + * XLogReadBufferForRedoExtended + * Like XLogReadBufferForRedo, but with extra options. + * + * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended + * with all-zeroes pages up to the referenced block number. In + * RBM_ZERO_AND_LOCK and RBM_ZERO_AND_CLEANUP_LOCK modes, the return value + * is always BLK_NEEDS_REDO. + * + * (The RBM_ZERO_AND_CLEANUP_LOCK mode is redundant with the get_cleanup_lock + * parameter. Do not use an inconsistent combination!) + * + * If 'get_cleanup_lock' is true, a "cleanup lock" is acquired on the buffer + * using LockBufferForCleanup(), instead of a regular exclusive lock. + */ +XLogRedoAction +XLogReadBufferForRedoExtended(XLogReaderState *record, + uint8 block_id, + ReadBufferMode mode, bool get_cleanup_lock, + Buffer *buf) +{ + XLogRecPtr lsn = record->EndRecPtr; + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; + Page page; + bool zeromode; + bool willinit; + + if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno)) + { + /* Caller specified a bogus block_id */ + elog(PANIC, "failed to locate backup block with ID %d", block_id); + } + + /* + * Make sure that if the block is marked with WILL_INIT, the caller is + * going to initialize it. And vice versa. 
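The BLK_NEEDS_REDO / BLK_DONE decision is ultimately a single LSN comparison: a record only needs to be applied when its end LSN is newer than the LSN already stamped on the page; otherwise the change is already reflected there. A minimal sketch of that rule, with LSNs as plain integers:

/* Sketch: the idempotent-replay LSN check. */
#include <stdint.h>
#include <stdio.h>

enum redo_action { BLK_NEEDS_REDO, BLK_DONE };

static enum redo_action
redo_decision(uint64_t record_end_lsn, uint64_t page_lsn)
{
    if (record_end_lsn <= page_lsn)
        return BLK_DONE;        /* change already reflected on the page */
    return BLK_NEEDS_REDO;      /* apply it, then set the page LSN */
}

int main(void)
{
    printf("%d\n", redo_decision(0x3000, 0x3000));   /* BLK_DONE */
    printf("%d\n", redo_decision(0x3400, 0x3000));   /* BLK_NEEDS_REDO */
    return 0;
}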
+ */ + zeromode = (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK); + willinit = (record->blocks[block_id].flags & BKPBLOCK_WILL_INIT) != 0; + if (willinit && !zeromode) + elog(PANIC, "block with WILL_INIT flag in WAL record must be zeroed by redo routine"); + if (!willinit && zeromode) + elog(PANIC, "block to be initialized in redo routine must be marked with WILL_INIT flag in the WAL record"); + + /* If it has a full-page image and it should be restored, do it. */ + if (XLogRecBlockImageApply(record, block_id)) + { + Assert(XLogRecHasBlockImage(record, block_id)); + *buf = XLogReadBufferExtended(rnode, forknum, blkno, + get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK); + page = BufferGetPage(*buf); + if (!RestoreBlockImage(record, block_id, page)) + elog(ERROR, "failed to restore block image"); + + /* + * The page may be uninitialized. If so, we can't set the LSN because + * that would corrupt the page. + */ + if (!PageIsNew(page)) + { + PageSetLSN(page, lsn); + } + + MarkBufferDirty(*buf); + + /* + * At the end of crash recovery the init forks of unlogged relations + * are copied, without going through shared buffers. So we need to + * force the on-disk state of init forks to always be in sync with the + * state in shared buffers. + */ + if (forknum == INIT_FORKNUM) + FlushOneBuffer(*buf); + + return BLK_RESTORED; + } + else + { + *buf = XLogReadBufferExtended(rnode, forknum, blkno, mode); + if (BufferIsValid(*buf)) + { + if (mode != RBM_ZERO_AND_LOCK && mode != RBM_ZERO_AND_CLEANUP_LOCK) + { + if (get_cleanup_lock) + LockBufferForCleanup(*buf); + else + LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE); + } + if (lsn <= PageGetLSN(BufferGetPage(*buf))) + return BLK_DONE; + else + return BLK_NEEDS_REDO; + } + else + return BLK_NOTFOUND; + } +} + +/* + * XLogReadBufferExtended + * Read a page during XLOG replay + * + * This is functionally comparable to ReadBufferExtended. There's some + * differences in the behavior wrt. the "mode" argument: + * + * In RBM_NORMAL mode, if the page doesn't exist, or contains all-zeroes, we + * return InvalidBuffer. In this case the caller should silently skip the + * update on this page. (In this situation, we expect that the page was later + * dropped or truncated. If we don't see evidence of that later in the WAL + * sequence, we'll complain at the end of WAL replay.) + * + * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended + * with all-zeroes pages up to the given block number. + * + * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't + * exist, and we don't check for all-zeroes. Thus, no log entry is made + * to imply that the page should be dropped or truncated later. + * + * NB: A redo function should normally not call this directly. To get a page + * to modify, use XLogReadBufferForRedoExtended instead. It is important that + * all pages modified by a WAL record are registered in the WAL records, or + * they will be invisible to tools that need to know which pages are modified. + */ +Buffer +XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, + BlockNumber blkno, ReadBufferMode mode) +{ + BlockNumber lastblock; + Buffer buffer; + SMgrRelation smgr; + + Assert(blkno != P_NEW); + + /* Open the relation at smgr level */ + smgr = smgropen(rnode, InvalidBackendId); + + /* + * Create the target file if it doesn't already exist. This lets us cope + * if the replay sequence contains writes to a relation that is later + * deleted. 
+
+/*
+ * XLogReadBufferExtended
+ *		Read a page during XLOG replay
+ *
+ * This is functionally comparable to ReadBufferExtended. There are some
+ * differences in behavior with respect to the "mode" argument:
+ *
+ * In RBM_NORMAL mode, if the page doesn't exist, or contains all-zeroes, we
+ * return InvalidBuffer. In this case the caller should silently skip the
+ * update on this page. (In this situation, we expect that the page was later
+ * dropped or truncated. If we don't see evidence of that later in the WAL
+ * sequence, we'll complain at the end of WAL replay.)
+ *
+ * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended
+ * with all-zeroes pages up to the given block number.
+ *
+ * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
+ * exist, and we don't check for all-zeroes. Thus, no log entry is made
+ * to imply that the page should be dropped or truncated later.
+ *
+ * NB: A redo function should normally not call this directly. To get a page
+ * to modify, use XLogReadBufferForRedoExtended instead. It is important that
+ * all pages modified by a WAL record are registered in the WAL records, or
+ * they will be invisible to tools that need to know which pages are modified.
+ */
+Buffer
+XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
+					   BlockNumber blkno, ReadBufferMode mode)
+{
+	BlockNumber lastblock;
+	Buffer		buffer;
+	SMgrRelation smgr;
+
+	Assert(blkno != P_NEW);
+
+	/* Open the relation at smgr level */
+	smgr = smgropen(rnode, InvalidBackendId);
+
+	/*
+	 * Create the target file if it doesn't already exist. This lets us cope
+	 * if the replay sequence contains writes to a relation that is later
+	 * deleted. (The original coding of this routine would instead suppress
+	 * the writes, but that seems like it risks losing valuable data if the
+	 * filesystem loses an inode during a crash. Better to write the data
+	 * until we are actually told to delete the file.)
+	 */
+	smgrcreate(smgr, forknum, true);
+
+	lastblock = smgrnblocks(smgr, forknum);
+
+	if (blkno < lastblock)
+	{
+		/* page exists in file */
+		buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno,
+										   mode, NULL);
+	}
+	else
+	{
+		/* hm, page doesn't exist in file */
+		if (mode == RBM_NORMAL)
+		{
+			log_invalid_page(rnode, forknum, blkno, false);
+			return InvalidBuffer;
+		}
+		if (mode == RBM_NORMAL_NO_LOG)
+			return InvalidBuffer;
+		/* OK to extend the file */
+		/* we do this in recovery only - no rel-extension lock needed */
+		Assert(InRecovery);
+		buffer = InvalidBuffer;
+		do
+		{
+			if (buffer != InvalidBuffer)
+			{
+				if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
+					LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+				ReleaseBuffer(buffer);
+			}
+			buffer = ReadBufferWithoutRelcache(rnode, forknum,
+											   P_NEW, mode, NULL);
+		}
+		while (BufferGetBlockNumber(buffer) < blkno);
+		/* Handle the corner case that P_NEW returns non-consecutive pages */
+		if (BufferGetBlockNumber(buffer) != blkno)
+		{
+			if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
+				LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+			ReleaseBuffer(buffer);
+			buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno,
+											   mode, NULL);
+		}
+	}
+
+	if (mode == RBM_NORMAL)
+	{
+		/* check that page has been initialized */
+		Page		page = (Page) BufferGetPage(buffer);
+
+		/*
+		 * We assume that PageIsNew is safe without a lock. During recovery,
+		 * there should be no other backends that could modify the buffer at
+		 * the same time.
+		 */
+		if (PageIsNew(page))
+		{
+			ReleaseBuffer(buffer);
+			log_invalid_page(rnode, forknum, blkno, true);
+			return InvalidBuffer;
+		}
+	}
+
+	return buffer;
+}
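To make the RBM_NORMAL contract concrete, a sketch of a caller that tolerates InvalidBuffer and performs its own LSN interlock; this is essentially what XLogReadBufferForRedoExtended() does internally, and ordinary redo code should keep using that wrapper as the note above says. The function name and arguments are illustrative only:

	#include "postgres.h"
	#include "access/xlogutils.h"
	#include "storage/bufmgr.h"
	#include "storage/bufpage.h"
	#include "storage/relfilenode.h"

	static void
	foo_apply_to_page(RelFileNode rnode, BlockNumber blkno, XLogRecPtr lsn)
	{
		Buffer		buffer;

		buffer = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL);
		if (!BufferIsValid(buffer))
			return;				/* page was dropped or truncated later in WAL */

		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		if (lsn > PageGetLSN(BufferGetPage(buffer)))
		{
			/* ... the change has not been applied yet; apply it here ... */
		}
		UnlockReleaseBuffer(buffer);
	}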
+
+/*
+ * Struct actually returned by CreateFakeRelcacheEntry, though the declared
+ * return type is Relation.
+ */
+typedef struct
+{
+	RelationData reldata;		/* Note: this must be first */
+	FormData_pg_class pgc;
+} FakeRelCacheEntryData;
+
+typedef FakeRelCacheEntryData *FakeRelCacheEntry;
+
+/*
+ * Create a fake relation cache entry for a physical relation
+ *
+ * It's often convenient to use the same functions in XLOG replay as in the
+ * main codepath, but those functions typically work with a relcache entry.
+ * We don't have a working relation cache during XLOG replay, but this
+ * function can be used to create a fake relcache entry instead. Only the
+ * fields related to physical storage, like rd_rel, are initialized, so the
+ * fake entry is only usable in low-level operations like ReadBuffer().
+ *
+ * This is also used for syncing WAL-skipped files.
+ *
+ * Caller must free the returned entry with FreeFakeRelcacheEntry().
+ */
+Relation
+CreateFakeRelcacheEntry(RelFileNode rnode)
+{
+	FakeRelCacheEntry fakeentry;
+	Relation	rel;
+
+	/* Allocate the Relation struct and all related space in one block. */
+	fakeentry = palloc0(sizeof(FakeRelCacheEntryData));
+	rel = (Relation) fakeentry;
+
+	rel->rd_rel = &fakeentry->pgc;
+	rel->rd_node = rnode;
+
+	/*
+	 * We will never be working with temp rels during recovery or while
+	 * syncing WAL-skipped files.
+	 */
+	rel->rd_backend = InvalidBackendId;
+
+	/* It must be a permanent table here */
+	rel->rd_rel->relpersistence = RELPERSISTENCE_PERMANENT;
+
+	/* We don't know the name of the relation; use relfilenode instead */
+	sprintf(RelationGetRelationName(rel), "%u", rnode.relNode);
+
+	/*
+	 * We set up the lockRelId in case anything tries to lock the dummy
+	 * relation. Note that this is fairly bogus since relNode may be
+	 * different from the relation's OID. It shouldn't really matter though.
+	 * In recovery, we are running by ourselves and can't have any lock
+	 * conflicts. While syncing, we already hold AccessExclusiveLock.
+	 */
+	rel->rd_lockInfo.lockRelId.dbId = rnode.dbNode;
+	rel->rd_lockInfo.lockRelId.relId = rnode.relNode;
+
+	rel->rd_smgr = NULL;
+
+	return rel;
+}
+
+/*
+ * Free a fake relation cache entry.
+ */
+void
+FreeFakeRelcacheEntry(Relation fakerel)
+{
+	/* make sure the fakerel is not referenced by the SmgrRelation anymore */
+	if (fakerel->rd_smgr != NULL)
+		smgrclearowner(&fakerel->rd_smgr, fakerel->rd_smgr);
+	pfree(fakerel);
+}
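As an illustration of the fake-relcache machinery, a sketch modeled on how the heap redo routines pin and clear a visibility map page during replay; the surrounding record handling is omitted and the routine name is hypothetical:

	#include "postgres.h"
	#include "access/visibilitymap.h"
	#include "access/xlogutils.h"
	#include "storage/bufmgr.h"

	static void
	foo_redo_clear_vm_bit(RelFileNode rnode, BlockNumber heapBlk)
	{
		Relation	reln = CreateFakeRelcacheEntry(rnode);
		Buffer		vmbuffer = InvalidBuffer;

		/* the fake entry is good enough for low-level, physical access */
		visibilitymap_pin(reln, heapBlk, &vmbuffer);
		visibilitymap_clear(reln, heapBlk, vmbuffer, VISIBILITYMAP_VALID_BITS);

		if (BufferIsValid(vmbuffer))
			ReleaseBuffer(vmbuffer);
		FreeFakeRelcacheEntry(reln);
	}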
+
+/*
+ * Drop a relation during XLOG replay
+ *
+ * This is called when the relation is about to be deleted; we need to remove
+ * any open "invalid-page" records for the relation.
+ */
+void
+XLogDropRelation(RelFileNode rnode, ForkNumber forknum)
+{
+	forget_invalid_pages(rnode, forknum, 0);
+}
+
+/*
+ * Drop a whole database during XLOG replay
+ *
+ * As above, but for DROP DATABASE instead of dropping a single rel
+ */
+void
+XLogDropDatabase(Oid dbid)
+{
+	/*
+	 * This is unnecessarily heavy-handed, as it will close SMgrRelation
+	 * objects for other databases as well. DROP DATABASE occurs seldom enough
+	 * that it's not worth introducing a variant of smgrclose for just this
+	 * purpose. XXX: Or should we rather leave the smgr entries dangling?
+	 */
+	smgrcloseall();
+
+	forget_invalid_pages_db(dbid);
+}
+
+/*
+ * Truncate a relation during XLOG replay
+ *
+ * We need to clean up any open "invalid-page" records for the dropped pages.
+ */
+void
+XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum,
+					 BlockNumber nblocks)
+{
+	forget_invalid_pages(rnode, forkNum, nblocks);
+}
+
+/*
+ * Determine which timeline to read an xlog page from and set the
+ * XLogReaderState's currTLI to that timeline ID.
+ *
+ * We care about timelines in xlogreader when we might be reading xlog
+ * generated prior to a promotion, either if we're currently a standby in
+ * recovery or if we're a promoted primary reading xlogs generated by the old
+ * primary before our promotion.
+ *
+ * wantPage must be set to the start address of the page to read and
+ * wantLength to the amount of the page that will be read, up to
+ * XLOG_BLCKSZ. If the amount to be read isn't known, pass XLOG_BLCKSZ.
+ *
+ * We switch to an xlog segment from the new timeline eagerly when on a
+ * historical timeline, as soon as we reach the start of the xlog segment
+ * containing the timeline switch. The server copied the segment to the new
+ * timeline so all the data up to the switch point is the same, but there's no
+ * guarantee the old segment will still exist. It may have been deleted or
+ * renamed with a .partial suffix so we can't necessarily keep reading from
+ * the old TLI even though tliSwitchPoint says it's OK.
+ *
+ * We can't just check the timeline when we read a page on a different segment
+ * to the last page. We could've received a timeline switch from a cascading
+ * upstream, so the current segment ends abruptly (possibly getting renamed to
+ * .partial) and we have to switch to a new one. Even in the middle of reading
+ * a page we could have to dump the cached page and switch to a new TLI.
+ *
+ * Because of this, callers MAY NOT assume that currTLI is the timeline that
+ * will be in a page's xlp_tli; the page may begin on an older timeline or we
+ * might be reading from historical timeline data on a segment that's been
+ * copied to a new timeline.
+ *
+ * The caller must also make sure it doesn't read past the current replay
+ * position (using GetXLogReplayRecPtr) if executing in recovery, so it
+ * doesn't fail to notice that the current timeline became historical. The
+ * caller must also update ThisTimeLineID with the result of
+ * GetXLogReplayRecPtr and must check RecoveryInProgress().
+ */
+void
+XLogReadDetermineTimeline(XLogReaderState *state, XLogRecPtr wantPage, uint32 wantLength)
+{
+	const XLogRecPtr lastReadPage = (state->seg.ws_segno *
+									 state->segcxt.ws_segsize + state->segoff);
+
+	Assert(wantPage != InvalidXLogRecPtr && wantPage % XLOG_BLCKSZ == 0);
+	Assert(wantLength <= XLOG_BLCKSZ);
+	Assert(state->readLen == 0 || state->readLen <= XLOG_BLCKSZ);
+
+	/*
+	 * If the desired page is currently read in and valid, we have nothing to
+	 * do.
+	 *
+	 * The caller should've ensured that it didn't previously advance readOff
+	 * past the valid limit of this timeline, so it doesn't matter if the
+	 * current TLI has since become historical.
+	 */
+	if (lastReadPage == wantPage &&
+		state->readLen != 0 &&
+		lastReadPage + state->readLen >= wantPage + Min(wantLength, XLOG_BLCKSZ - 1))
+		return;
+
+	/*
+	 * If we're reading from the current timeline, it hasn't become historical
+	 * and the page we're reading is after the last page read, we can again
+	 * just carry on. (Seeking backwards requires a check to make sure the
+	 * older page isn't on a prior timeline).
+	 *
+	 * ThisTimeLineID might've become historical since we last looked, but the
+	 * caller is required not to read past the flush limit it saw at the time
+	 * it looked up the timeline. There's nothing we can do about it if
+	 * StartupXLOG() renames it to .partial concurrently.
+	 */
+	if (state->currTLI == ThisTimeLineID && wantPage >= lastReadPage)
+	{
+		Assert(state->currTLIValidUntil == InvalidXLogRecPtr);
+		return;
+	}
+
+	/*
+	 * If we're just reading pages from a previously validated historical
+	 * timeline and the timeline we're reading from is valid until the end of
+	 * the current segment we can just keep reading.
+	 */
+	if (state->currTLIValidUntil != InvalidXLogRecPtr &&
+		state->currTLI != ThisTimeLineID &&
+		state->currTLI != 0 &&
+		((wantPage + wantLength) / state->segcxt.ws_segsize) <
+		(state->currTLIValidUntil / state->segcxt.ws_segsize))
+		return;
+
+	/*
+	 * If we reach this point we're either looking up a page for random
+	 * access, the current timeline just became historical, or we're reading
+	 * from a new segment containing a timeline switch. In all cases we need
+	 * to determine the newest timeline on the segment.
+	 *
+	 * If it's the current timeline we can just keep reading from here unless
+	 * we detect a timeline switch that makes the current timeline historical.
+	 * If it's a historical timeline we can read all the segment on the newest
+	 * timeline because it contains all the old timelines' data too. So only
+	 * one switch check is required.
+	 */
+	{
+		/*
+		 * We need to re-read the timeline history in case it's been changed
+		 * by a promotion or replay from a cascaded replica.
+		 */
+		List	   *timelineHistory = readTimeLineHistory(ThisTimeLineID);
+		XLogRecPtr	endOfSegment;
+
+		endOfSegment = ((wantPage / state->segcxt.ws_segsize) + 1) *
+			state->segcxt.ws_segsize - 1;
+		Assert(wantPage / state->segcxt.ws_segsize ==
+			   endOfSegment / state->segcxt.ws_segsize);
+
+		/*
+		 * Find the timeline of the last LSN on the segment containing
+		 * wantPage.
+		 */
+		state->currTLI = tliOfPointInHistory(endOfSegment, timelineHistory);
+		state->currTLIValidUntil = tliSwitchPoint(state->currTLI, timelineHistory,
+												  &state->nextTLI);
+
+		Assert(state->currTLIValidUntil == InvalidXLogRecPtr ||
+			   wantPage + wantLength < state->currTLIValidUntil);
+
+		list_free_deep(timelineHistory);
+
+		elog(DEBUG3, "switched to timeline %u valid until %X/%X",
+			 state->currTLI,
+			 LSN_FORMAT_ARGS(state->currTLIValidUntil));
+	}
+}
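A worked example of the endOfSegment arithmetic used above (illustrative values only, assuming the default 16 MB wal_segment_size); the helper name is hypothetical:

	#include "postgres.h"
	#include "access/xlogdefs.h"

	static XLogRecPtr
	last_lsn_of_containing_segment(XLogRecPtr wantPage, uint64 segsize)
	{
		/*
		 * With segsize = 16 * 1024 * 1024 and wantPage = 0x1700000 (a page in
		 * segment 1), this returns 0x1FFFFFF, the last byte of segment 1;
		 * that is the LSN whose owning timeline is looked up in the history.
		 */
		return ((wantPage / segsize) + 1) * segsize - 1;
	}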
+
+/* XLogReaderRoutine->segment_open callback for local pg_wal files */
+void
+wal_segment_open(XLogReaderState *state, XLogSegNo nextSegNo,
+				 TimeLineID *tli_p)
+{
+	TimeLineID	tli = *tli_p;
+	char		path[MAXPGPATH];
+
+	XLogFilePath(path, tli, nextSegNo, state->segcxt.ws_segsize);
+	state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY);
+	if (state->seg.ws_file >= 0)
+		return;
+
+	if (errno == ENOENT)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("requested WAL segment %s has already been removed",
+						path)));
+	else
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\": %m",
+						path)));
+}
+
+/* stock XLogReaderRoutine->segment_close callback */
+void
+wal_segment_close(XLogReaderState *state)
+{
+	close(state->seg.ws_file);
+	/* need to check errno? */
+	state->seg.ws_file = -1;
+}
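These two callbacks, together with read_local_xlog_page below, are what a backend plugs into an XLogReaderState. A sketch of that wiring, similar to how the in-core logical decoding code sets up its reader; the wrapper name and error wording are illustrative only:

	#include "postgres.h"
	#include "access/xlog.h"
	#include "access/xlogreader.h"
	#include "access/xlogutils.h"

	static XLogReaderState *
	foo_create_local_reader(void)
	{
		XLogReaderState *reader;

		reader = XLogReaderAllocate(wal_segment_size, NULL,
									XL_ROUTINE(.page_read = &read_local_xlog_page,
											   .segment_open = &wal_segment_open,
											   .segment_close = &wal_segment_close),
									NULL);	/* no private data */
		if (reader == NULL)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));

		return reader;
	}

The caller would then position the reader with XLogBeginRead() and fetch records with XLogReadRecord().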
+
+/*
+ * XLogReaderRoutine->page_read callback for reading local xlog files
+ *
+ * Public because it would likely be very helpful for someone writing another
+ * output method outside walsender, e.g. in a bgworker.
+ *
+ * TODO: The walsender has its own version of this, but it relies on the
+ * walsender's latch being set whenever WAL is flushed. No such infrastructure
+ * exists for normal backends, so we have to do a check/sleep/repeat style of
+ * loop for now.
+ */
+int
+read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr,
+					 int reqLen, XLogRecPtr targetRecPtr, char *cur_page)
+{
+	XLogRecPtr	read_upto,
+				loc;
+	TimeLineID	tli;
+	int			count;
+	WALReadError errinfo;
+
+	loc = targetPagePtr + reqLen;
+
+	/* Loop waiting for xlog to be available if necessary */
+	while (1)
+	{
+		/*
+		 * Determine the limit of xlog we can currently read to, and what the
+		 * most recent timeline is.
+		 *
+		 * RecoveryInProgress() will update ThisTimeLineID when it first
+		 * notices recovery finishes, so we only have to maintain it for the
+		 * local process until recovery ends.
+		 */
+		if (!RecoveryInProgress())
+			read_upto = GetFlushRecPtr();
+		else
+			read_upto = GetXLogReplayRecPtr(&ThisTimeLineID);
+		tli = ThisTimeLineID;
+
+		/*
+		 * Check which timeline to get the record from.
+		 *
+		 * We have to do it each time through the loop because if we're in
+		 * recovery as a cascading standby, the current timeline might've
+		 * become historical. We can't rely on RecoveryInProgress() because in
+		 * a standby configuration like
+		 *
+		 * A => B => C
+		 *
+		 * if we're a logical decoding session on C, and B gets promoted, our
+		 * timeline will change while we remain in recovery.
+		 *
+		 * We can't just keep reading from the old timeline as the last WAL
+		 * archive in the timeline will get renamed to .partial by
+		 * StartupXLOG().
+		 *
+		 * If that happens after our caller updated ThisTimeLineID but before
+		 * we actually read the xlog page, we might still try to read from the
+		 * old (now renamed) segment and fail. There's not much we can do
+		 * about this, but it can only happen when we're a leaf of a cascading
+		 * standby whose primary gets promoted while we're decoding, so a
+		 * one-off ERROR isn't too bad.
+		 */
+		XLogReadDetermineTimeline(state, targetPagePtr, reqLen);
+
+		if (state->currTLI == ThisTimeLineID)
+		{
+
+			if (loc <= read_upto)
+				break;
+
+			CHECK_FOR_INTERRUPTS();
+			pg_usleep(1000L);
+		}
+		else
+		{
+			/*
+			 * We're on a historical timeline, so limit reading to the switch
+			 * point where we moved to the next timeline.
+			 *
+			 * We don't need to GetFlushRecPtr or GetXLogReplayRecPtr. We know
+			 * about the new timeline, so we must've received past the end of
+			 * it.
+			 */
+			read_upto = state->currTLIValidUntil;
+
+			/*
+			 * Setting tli to our wanted record's TLI is slightly wrong; the
+			 * page might begin on an older timeline if it contains a timeline
+			 * switch, since its xlog segment will have been copied from the
+			 * prior timeline. This is pretty harmless though, as nothing
+			 * cares so long as the timeline doesn't go backwards. We should
+			 * read the page header instead; FIXME someday.
+			 */
+			tli = state->currTLI;
+
+			/* No need to wait on a historical timeline */
+			break;
+		}
+	}
+
+	if (targetPagePtr + XLOG_BLCKSZ <= read_upto)
+	{
+		/*
+		 * at least one full block is available; read only that block, have
+		 * caller come back if they need more.
+		 */
+		count = XLOG_BLCKSZ;
+	}
+	else if (targetPagePtr + reqLen > read_upto)
+	{
+		/* not enough data there */
+		return -1;
+	}
+	else
+	{
+		/* enough bytes available to satisfy the request */
+		count = read_upto - targetPagePtr;
+	}
+
+	/*
+	 * Even though we just determined how much of the page can be validly read
+	 * as 'count', read the whole page anyway. It's guaranteed to be
+	 * zero-padded up to the page boundary if it's incomplete.
+	 */
+	if (!WALRead(state, cur_page, targetPagePtr, XLOG_BLCKSZ, tli,
+				 &errinfo))
+		WALReadRaiseError(&errinfo);
+
+	/* number of valid bytes in the buffer */
+	return count;
+}
+
+/*
+ * Backend-specific convenience code to handle read errors encountered by
+ * WALRead().
+ */
+void
+WALReadRaiseError(WALReadError *errinfo)
+{
+	WALOpenSegment *seg = &errinfo->wre_seg;
+	char		fname[MAXFNAMELEN];
+
+	XLogFileName(fname, seg->ws_tli, seg->ws_segno, wal_segment_size);
+
+	if (errinfo->wre_read < 0)
+	{
+		errno = errinfo->wre_errno;
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not read from log segment %s, offset %u: %m",
+						fname, errinfo->wre_off)));
+	}
+	else if (errinfo->wre_read == 0)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("could not read from log segment %s, offset %u: read %d of %zu",
+						fname, errinfo->wre_off, errinfo->wre_read,
+						(Size) errinfo->wre_req)));
+	}
+}
-- 
cgit v1.2.3